From e3600f59e76c3dc08aedfd77629b7c5c48df86af Mon Sep 17 00:00:00 2001
From: Nick Foster
Date: Thu, 20 Jan 2011 16:30:09 -0800
Subject: volk: renamed all files. added all tests. some test things are still
 broken.

---
 volk/include/volk/Makefile.am                      | 128 +++---
 volk/include/volk/make_c.py                        |   2 +-
 volk/include/volk/volk_16i_branch_4_state_8_a16.h  | 194 +++++++++
 volk/include/volk/volk_16i_convert_8i_a16.h        |  69 +++
 volk/include/volk/volk_16i_convert_8i_u.h          |  71 ++++
 volk/include/volk/volk_16i_max_star_16i_a16.h      | 108 +++++
 .../volk/volk_16i_max_star_horizontal_16i_a16.h    | 130 ++++++
 .../volk/volk_16i_permute_and_scalar_add_a16.h     | 139 ++++++
 volk/include/volk/volk_16i_s32f_convert_32f_a16.h  | 119 ++++++
 volk/include/volk/volk_16i_s32f_convert_32f_u.h    | 122 ++++++
 .../volk/volk_16i_x4_quad_max_star_16i_a16.h       | 191 +++++++++
 .../include/volk/volk_16i_x5_add_quad_16i_x4_a16.h | 136 ++++++
 .../volk/volk_16ic_deinterleave_16i_x2_a16.h       | 158 +++++++
 .../volk/volk_16ic_deinterleave_real_16i_a16.h     | 120 ++++++
 .../volk/volk_16ic_deinterleave_real_8i_a16.h      |  94 +++++
 volk/include/volk/volk_16ic_magnitude_16i_a16.h    | 190 +++++++++
 .../volk/volk_16ic_s32f_deinterleave_32f_x2_a16.h  | 108 +++++
 .../volk_16ic_s32f_deinterleave_real_32f_a16.h     | 125 ++++++
 .../volk/volk_16ic_s32f_magnitude_32f_a16.h        | 179 ++++++++
 volk/include/volk/volk_16s_add_quad_a16.h          | 136 ------
 volk/include/volk/volk_16s_branch_4_state_8_a16.h  | 194 ---------
 volk/include/volk/volk_16s_convert_8s_a16.h        |  69 ---
 volk/include/volk/volk_16s_convert_8s_ua16.h       |  71 ----
 volk/include/volk/volk_16s_max_star_16s_a16.h      | 108 -----
 .../volk/volk_16s_max_star_horizontal_16s_a16.h    | 130 ------
 .../volk/volk_16s_permute_and_scalar_add_a16.h     | 139 ------
 volk/include/volk/volk_16s_quad_max_star_16s_a16.h | 191 ---------
 volk/include/volk/volk_16s_s32f_convert_32f_a16.h  | 119 ------
 volk/include/volk/volk_16s_s32f_convert_32f_ua16.h | 122 ------
 .../volk/volk_16sc_deinterleave_16s_16s_a16.h      | 158 -------
 .../volk/volk_16sc_deinterleave_real_16s_a16.h     | 120 ------
 .../volk/volk_16sc_deinterleave_real_8s_a16.h      |  94 -----
 volk/include/volk/volk_16sc_magnitude_16s_a16.h    | 190 ---------
 .../volk/volk_16sc_s32f_deinterleave_32f_32f_a16.h | 108 -----
 .../volk_16sc_s32f_deinterleave_real_32f_a16.h     | 125 ------
 .../volk/volk_16sc_s32f_magnitude_32f_a16.h        | 179 --------
 .../volk/volk_32f_32f_32f_sum_of_poly_32f_a16.h    | 151 -------
 volk/include/volk/volk_32f_32f_add_32f_a16.h       |  81 ----
 volk/include/volk/volk_32f_32f_divide_32f_a16.h    |  82 ----
 volk/include/volk/volk_32f_32f_dot_prod_32f_a16.h  | 184 --------
 volk/include/volk/volk_32f_32f_dot_prod_32f_ua16.h | 184 --------
 .../volk/volk_32f_32f_interleave_32fc_a16.h        |  75 ----
 volk/include/volk/volk_32f_32f_max_32f_a16.h       |  85 ----
 volk/include/volk/volk_32f_32f_min_32f_a16.h       |  85 ----
 volk/include/volk/volk_32f_32f_multiply_32f_a16.h  |  81 ----
 .../volk/volk_32f_32f_s32f_interleave_16sc_a16.h   | 155 -------
 volk/include/volk/volk_32f_32f_subtract_32f_a16.h  |  81 ----
 volk/include/volk/volk_32f_convert_64f_u.h         |  70 +++
 volk/include/volk/volk_32f_convert_64f_ua16.h      |  70 ---
 volk/include/volk/volk_32f_s32f_convert_16i_a16.h  | 110 +++++
 volk/include/volk/volk_32f_s32f_convert_16i_u.h    | 113 +++++
 volk/include/volk/volk_32f_s32f_convert_16s_a16.h  | 110 -----
 volk/include/volk/volk_32f_s32f_convert_16s_ua16.h | 113 -----
 volk/include/volk/volk_32f_s32f_convert_32i_a16.h  | 106 +++++
 volk/include/volk/volk_32f_s32f_convert_32i_u.h    | 109 +++++
 volk/include/volk/volk_32f_s32f_convert_32s_a16.h  | 106 -----
 volk/include/volk/volk_32f_s32f_convert_32s_ua16.h | 109 -----
 volk/include/volk/volk_32f_s32f_convert_8i_a16.h   | 117 ++++++
 volk/include/volk/volk_32f_s32f_convert_8i_u.h     | 120 ++++++
 volk/include/volk/volk_32f_s32f_convert_8s_a16.h   | 117 ------
 volk/include/volk/volk_32f_s32f_convert_8s_ua16.h  | 120 ------
 .../volk/volk_32f_stddev_and_mean_32f_32f_a16.h    | 169 --------
 .../volk/volk_32f_stddev_and_mean_32f_x2_a16.h     | 169 ++++++++
 volk/include/volk/volk_32f_x2_add_32f_a16.h        |  81 ++++
 volk/include/volk/volk_32f_x2_divide_32f_a16.h     |  82 ++++
 volk/include/volk/volk_32f_x2_dot_prod_32f_a16.h   | 184 ++++++++
 volk/include/volk/volk_32f_x2_dot_prod_32f_u.h     | 184 ++++++++
 .../include/volk/volk_32f_x2_interleave_32fc_a16.h |  75 ++++
 volk/include/volk/volk_32f_x2_max_32f_a16.h        |  85 ++++
 volk/include/volk/volk_32f_x2_min_32f_a16.h        |  85 ++++
 volk/include/volk/volk_32f_x2_multiply_32f_a16.h   |  81 ++++
 .../volk/volk_32f_x2_s32f_interleave_16ic_a16.h    | 155 +++++++
 volk/include/volk/volk_32f_x2_subtract_32f_a16.h   |  81 ++++
 .../include/volk/volk_32f_x3_sum_of_poly_32f_a16.h | 151 +++++++
 .../volk_32fc_32fc_conjugate_dot_prod_32fc_a16.h   | 344 ---------------
 .../volk/volk_32fc_32fc_dot_prod_32fc_a16.h        | 468 ---------------------
 .../volk/volk_32fc_32fc_multiply_32fc_a16.h        |  95 -----
 ...2fc_32fc_s32f_square_dist_scalar_mult_32f_a16.h | 126 ------
 .../volk/volk_32fc_32fc_square_dist_32f_a16.h      | 112 -----
 .../volk/volk_32fc_deinterleave_32f_32f_a16.h      |  75 ----
 .../volk/volk_32fc_deinterleave_32f_x2_a16.h       |  75 ++++
 .../volk/volk_32fc_deinterleave_64f_64f_a16.h      |  78 ----
 .../volk/volk_32fc_deinterleave_64f_x2_a16.h       |  78 ++++
 .../volk/volk_32fc_deinterleave_real_16i_a16.h     |  80 ++++
 .../volk/volk_32fc_deinterleave_real_16s_a16.h     |  80 ----
 .../volk/volk_32fc_s32f_magnitude_16i_a16.h        | 158 +++++++
 .../volk/volk_32fc_s32f_magnitude_16s_a16.h        | 158 -------
 ...32fc_s32f_s32f_power_spectral_density_32f_a16.h | 134 ------
 ...k_32fc_s32f_x2_power_spectral_density_32f_a16.h | 134 ++++++
 .../volk_32fc_x2_conjugate_dot_prod_32fc_a16.h     | 344 +++++++++++++++
 volk/include/volk/volk_32fc_x2_dot_prod_32fc_a16.h | 468 +++++++++++++++++++++
 volk/include/volk/volk_32fc_x2_multiply_32fc_a16.h |  95 +++++
 ..._32fc_x2_s32f_square_dist_scalar_mult_32f_a16.h | 126 ++++++
 .../volk/volk_32fc_x2_square_dist_32f_a16.h        | 112 +++++
 volk/include/volk/volk_32i_s32f_convert_32f_a16.h  |  73 ++++
 volk/include/volk/volk_32i_s32f_convert_32f_u.h    |  75 ++++
 volk/include/volk/volk_32i_x2_and_32i_a16.h        |  81 ++++
 volk/include/volk/volk_32i_x2_or_32i_a16.h         |  81 ++++
 volk/include/volk/volk_32s_32s_and_32s_a16.h       |  81 ----
 volk/include/volk/volk_32s_32s_or_32s_a16.h        |  81 ----
 volk/include/volk/volk_32s_s32f_convert_32f_a16.h  |  73 ----
 volk/include/volk/volk_32s_s32f_convert_32f_ua16.h |  75 ----
 volk/include/volk/volk_64f_64f_max_64f_a16.h       |  71 ----
 volk/include/volk/volk_64f_64f_min_64f_a16.h       |  71 ----
 volk/include/volk/volk_64f_convert_32f_u.h         |  67 +++
 volk/include/volk/volk_64f_convert_32f_ua16.h      |  67 ---
 volk/include/volk/volk_64f_x2_max_64f_a16.h        |  71 ++++
 volk/include/volk/volk_64f_x2_min_64f_a16.h        |  71 ++++
 volk/include/volk/volk_8i_convert_16i_a16.h        |  83 ++++
 volk/include/volk/volk_8i_convert_16i_u.h          |  73 ++++
 volk/include/volk/volk_8i_s32f_convert_32f_a16.h   | 105 +++++
 volk/include/volk/volk_8i_s32f_convert_32f_u.h     |  94 +++++
 .../volk/volk_8ic_deinterleave_16i_x2_a16.h        |  77 ++++
 .../volk/volk_8ic_deinterleave_real_16i_a16.h      |  66 +++
 .../volk/volk_8ic_deinterleave_real_8i_a16.h       |  67 +++
 .../volk/volk_8ic_s32f_deinterleave_32f_x2_a16.h   | 164 ++++++++
 .../volk/volk_8ic_s32f_deinterleave_real_32f_a16.h | 133 ++++++
 .../volk/volk_8ic_x2_multiply_conjugate_16ic_a16.h | 102 +++++
 .../volk_8ic_x2_s32f_multiply_conjugate_32fc_a16.h | 122 ++++++
 volk/include/volk/volk_8s_convert_16s_a16.h        |  83 ----
 volk/include/volk/volk_8s_convert_16s_ua16.h       |  73 ----
 volk/include/volk/volk_8s_s32f_convert_32f_a16.h   | 105 -----
 volk/include/volk/volk_8s_s32f_convert_32f_ua16.h  |  94 -----
 .../volk_8sc_8sc_multiply_conjugate_16sc_a16.h     | 102 -----
 ...volk_8sc_8sc_s32f_multiply_conjugate_32fc_a16.h | 122 ------
 .../volk/volk_8sc_deinterleave_16s_16s_a16.h       |  77 ----
 .../volk/volk_8sc_deinterleave_real_16s_a16.h      |  66 ---
 .../volk/volk_8sc_deinterleave_real_8s_a16.h       |  67 ---
 .../volk/volk_8sc_s32f_deinterleave_32f_32f_a16.h  | 164 --------
 .../volk/volk_8sc_s32f_deinterleave_real_32f_a16.h | 133 ------
 volk/include/volk/volk_register.py                 |   4 +-
 volk/lib/qa_utils.cc                               | 101 +++--
 volk/lib/qa_utils.h                                |   1 +
 volk/orc/Makefile.am                               |  33 +-
 ...k_16i_s32f_deinterleave_32f_x2_a16_orc_impl.orc |  12 +
 .../volk_16ic_deinterleave_16i_x2_a16_orc_impl.orc |   5 +
 ...volk_16ic_deinterleave_real_8i_a16_orc_impl.orc |   6 +
 volk/orc/volk_16ic_magnitude_16i_a16_orc_impl.orc  |  23 +
 ...volk_16sc_deinterleave_16s_16s_a16_orc_impl.orc |   5 -
 ...volk_16sc_deinterleave_real_8s_a16_orc_impl.orc |   6 -
 volk/orc/volk_16sc_magnitude_16s_a16_orc_impl.orc  |  23 -
 .../volk_16sc_magnitude_32f_aligned16_orc_impl.orc |   2 +-
 ...16sc_s32f_deinterleave_32f_32f_a16_orc_impl.orc |  12 -
 volk/orc/volk_32f_32f_add_32f_a16_orc_impl.orc     |   5 -
 volk/orc/volk_32f_32f_divide_32f_a16_orc_impl.orc  |   5 -
 volk/orc/volk_32f_32f_max_32f_a16_orc_impl.orc     |   5 -
 volk/orc/volk_32f_32f_min_32f_a16_orc_impl.orc     |   5 -
 .../orc/volk_32f_32f_multiply_32f_a16_orc_impl.orc |   5 -
 .../orc/volk_32f_32f_subtract_32f_a16_orc_impl.orc |   5 -
 volk/orc/volk_32f_x2_add_32f_a16_orc_impl.orc      |   5 +
 volk/orc/volk_32f_x2_divide_32f_a16_orc_impl.orc   |   5 +
 volk/orc/volk_32f_x2_max_32f_a16_orc_impl.orc      |   5 +
 volk/orc/volk_32f_x2_min_32f_a16_orc_impl.orc      |   5 +
 volk/orc/volk_32f_x2_multiply_32f_a16_orc_impl.orc |   5 +
 volk/orc/volk_32f_x2_subtract_32f_a16_orc_impl.orc |   5 +
 .../volk_32fc_32fc_multiply_32fc_a16_orc_impl.orc  |   6 -
 .../volk_32fc_s32f_magnitude_16i_a16_orc_impl.orc  |  23 +
 .../volk_32fc_s32f_magnitude_16s_a16_orc_impl.orc  |  23 -
 .../volk_32fc_x2_multiply_32fc_a16_orc_impl.orc    |   6 +
 volk/orc/volk_32i_x2_and_32i_a16_orc_impl.orc      |   5 +
 volk/orc/volk_32i_x2_or_32i_a16_orc_impl.orc       |   5 +
 volk/orc/volk_32s_32s_and_32s_a16_orc_impl.orc     |   5 -
 volk/orc/volk_32s_32s_or_32s_a16_orc_impl.orc      |   5 -
 volk/orc/volk_8i_convert_16i_a16_orc_impl.orc      |   5 +
 volk/orc/volk_8i_s32f_convert_32f_a16_orc_impl.orc |   9 +
 volk/orc/volk_8s_convert_16s_a16_orc_impl.orc      |   5 -
 volk/orc/volk_8s_s32f_convert_32f_a16_orc_impl.orc |   9 -
 167 files changed, 7962 insertions(+), 7939 deletions(-)
 create mode 100644 volk/include/volk/volk_16i_branch_4_state_8_a16.h
 create mode 100644 volk/include/volk/volk_16i_convert_8i_a16.h
 create mode 100644 volk/include/volk/volk_16i_convert_8i_u.h
 create mode 100644 volk/include/volk/volk_16i_max_star_16i_a16.h
 create mode 100644 volk/include/volk/volk_16i_max_star_horizontal_16i_a16.h
 create mode 100644 volk/include/volk/volk_16i_permute_and_scalar_add_a16.h
 create mode 100644 volk/include/volk/volk_16i_s32f_convert_32f_a16.h
 create mode 100644 volk/include/volk/volk_16i_s32f_convert_32f_u.h
 create mode 100644 volk/include/volk/volk_16i_x4_quad_max_star_16i_a16.h
 create mode 100644 volk/include/volk/volk_16i_x5_add_quad_16i_x4_a16.h
 create mode 100644 volk/include/volk/volk_16ic_deinterleave_16i_x2_a16.h
 create mode 100644 volk/include/volk/volk_16ic_deinterleave_real_16i_a16.h
 create mode 100644 volk/include/volk/volk_16ic_deinterleave_real_8i_a16.h
 create mode 100644 volk/include/volk/volk_16ic_magnitude_16i_a16.h
 create mode 100644 volk/include/volk/volk_16ic_s32f_deinterleave_32f_x2_a16.h
 create mode 100644 volk/include/volk/volk_16ic_s32f_deinterleave_real_32f_a16.h
 create mode 100644 volk/include/volk/volk_16ic_s32f_magnitude_32f_a16.h
 delete mode 100644 volk/include/volk/volk_16s_add_quad_a16.h
 delete mode 100644 volk/include/volk/volk_16s_branch_4_state_8_a16.h
 delete mode 100644 volk/include/volk/volk_16s_convert_8s_a16.h
 delete mode 100644 volk/include/volk/volk_16s_convert_8s_ua16.h
 delete mode 100644 volk/include/volk/volk_16s_max_star_16s_a16.h
 delete mode 100644 volk/include/volk/volk_16s_max_star_horizontal_16s_a16.h
 delete mode 100644 volk/include/volk/volk_16s_permute_and_scalar_add_a16.h
 delete mode 100644 volk/include/volk/volk_16s_quad_max_star_16s_a16.h
 delete mode 100644 volk/include/volk/volk_16s_s32f_convert_32f_a16.h
 delete mode 100644 volk/include/volk/volk_16s_s32f_convert_32f_ua16.h
 delete mode 100644 volk/include/volk/volk_16sc_deinterleave_16s_16s_a16.h
 delete mode 100644 volk/include/volk/volk_16sc_deinterleave_real_16s_a16.h
 delete mode 100644 volk/include/volk/volk_16sc_deinterleave_real_8s_a16.h
 delete mode 100644 volk/include/volk/volk_16sc_magnitude_16s_a16.h
 delete mode 100644 volk/include/volk/volk_16sc_s32f_deinterleave_32f_32f_a16.h
 delete mode 100644 volk/include/volk/volk_16sc_s32f_deinterleave_real_32f_a16.h
 delete mode 100644 volk/include/volk/volk_16sc_s32f_magnitude_32f_a16.h
 delete mode 100644 volk/include/volk/volk_32f_32f_32f_sum_of_poly_32f_a16.h
 delete mode 100644 volk/include/volk/volk_32f_32f_add_32f_a16.h
 delete mode 100644 volk/include/volk/volk_32f_32f_divide_32f_a16.h
 delete mode 100644 volk/include/volk/volk_32f_32f_dot_prod_32f_a16.h
 delete mode 100644 volk/include/volk/volk_32f_32f_dot_prod_32f_ua16.h
 delete mode 100644 volk/include/volk/volk_32f_32f_interleave_32fc_a16.h
 delete mode 100644 volk/include/volk/volk_32f_32f_max_32f_a16.h
 delete mode 100644 volk/include/volk/volk_32f_32f_min_32f_a16.h
 delete mode 100644 volk/include/volk/volk_32f_32f_multiply_32f_a16.h
 delete mode 100644 volk/include/volk/volk_32f_32f_s32f_interleave_16sc_a16.h
 delete mode 100644 volk/include/volk/volk_32f_32f_subtract_32f_a16.h
 create mode 100644 volk/include/volk/volk_32f_convert_64f_u.h
 delete mode 100644 volk/include/volk/volk_32f_convert_64f_ua16.h
 create mode 100644 volk/include/volk/volk_32f_s32f_convert_16i_a16.h
 create mode 100644 volk/include/volk/volk_32f_s32f_convert_16i_u.h
 delete mode 100644 volk/include/volk/volk_32f_s32f_convert_16s_a16.h
 delete mode 100644 volk/include/volk/volk_32f_s32f_convert_16s_ua16.h
 create mode 100644 volk/include/volk/volk_32f_s32f_convert_32i_a16.h
 create mode 100644 volk/include/volk/volk_32f_s32f_convert_32i_u.h
 delete mode 100644 volk/include/volk/volk_32f_s32f_convert_32s_a16.h
 delete mode 100644 volk/include/volk/volk_32f_s32f_convert_32s_ua16.h
 create mode 100644 volk/include/volk/volk_32f_s32f_convert_8i_a16.h
 create mode 100644 volk/include/volk/volk_32f_s32f_convert_8i_u.h
 delete mode 100644 volk/include/volk/volk_32f_s32f_convert_8s_a16.h
 delete mode 100644 volk/include/volk/volk_32f_s32f_convert_8s_ua16.h
 delete mode 100644 volk/include/volk/volk_32f_stddev_and_mean_32f_32f_a16.h
 create mode 100644 volk/include/volk/volk_32f_stddev_and_mean_32f_x2_a16.h
 create mode 100644 volk/include/volk/volk_32f_x2_add_32f_a16.h
 create mode 100644 volk/include/volk/volk_32f_x2_divide_32f_a16.h
 create mode 100644 volk/include/volk/volk_32f_x2_dot_prod_32f_a16.h
 create mode 100644 volk/include/volk/volk_32f_x2_dot_prod_32f_u.h
 create mode 100644 volk/include/volk/volk_32f_x2_interleave_32fc_a16.h
 create mode 100644 volk/include/volk/volk_32f_x2_max_32f_a16.h
 create mode 100644 volk/include/volk/volk_32f_x2_min_32f_a16.h
 create mode 100644 volk/include/volk/volk_32f_x2_multiply_32f_a16.h
 create mode 100644 volk/include/volk/volk_32f_x2_s32f_interleave_16ic_a16.h
 create mode 100644 volk/include/volk/volk_32f_x2_subtract_32f_a16.h
 create mode 100644 volk/include/volk/volk_32f_x3_sum_of_poly_32f_a16.h
 delete mode 100644 volk/include/volk/volk_32fc_32fc_conjugate_dot_prod_32fc_a16.h
 delete mode 100644 volk/include/volk/volk_32fc_32fc_dot_prod_32fc_a16.h
 delete mode 100644 volk/include/volk/volk_32fc_32fc_multiply_32fc_a16.h
 delete mode 100644 volk/include/volk/volk_32fc_32fc_s32f_square_dist_scalar_mult_32f_a16.h
 delete mode 100644 volk/include/volk/volk_32fc_32fc_square_dist_32f_a16.h
 delete mode 100644 volk/include/volk/volk_32fc_deinterleave_32f_32f_a16.h
 create mode 100644 volk/include/volk/volk_32fc_deinterleave_32f_x2_a16.h
 delete mode 100644 volk/include/volk/volk_32fc_deinterleave_64f_64f_a16.h
 create mode 100644 volk/include/volk/volk_32fc_deinterleave_64f_x2_a16.h
 create mode 100644 volk/include/volk/volk_32fc_deinterleave_real_16i_a16.h
 delete mode 100644 volk/include/volk/volk_32fc_deinterleave_real_16s_a16.h
 create mode 100644 volk/include/volk/volk_32fc_s32f_magnitude_16i_a16.h
 delete mode 100644 volk/include/volk/volk_32fc_s32f_magnitude_16s_a16.h
 delete mode 100644 volk/include/volk/volk_32fc_s32f_s32f_power_spectral_density_32f_a16.h
 create mode 100644 volk/include/volk/volk_32fc_s32f_x2_power_spectral_density_32f_a16.h
 create mode 100644 volk/include/volk/volk_32fc_x2_conjugate_dot_prod_32fc_a16.h
 create mode 100644 volk/include/volk/volk_32fc_x2_dot_prod_32fc_a16.h
 create mode 100644 volk/include/volk/volk_32fc_x2_multiply_32fc_a16.h
 create mode 100644 volk/include/volk/volk_32fc_x2_s32f_square_dist_scalar_mult_32f_a16.h
 create mode 100644 volk/include/volk/volk_32fc_x2_square_dist_32f_a16.h
 create mode 100644 volk/include/volk/volk_32i_s32f_convert_32f_a16.h
 create mode 100644 volk/include/volk/volk_32i_s32f_convert_32f_u.h
 create mode 100644 volk/include/volk/volk_32i_x2_and_32i_a16.h
 create mode 100644 volk/include/volk/volk_32i_x2_or_32i_a16.h
 delete mode 100644 volk/include/volk/volk_32s_32s_and_32s_a16.h
 delete mode 100644 volk/include/volk/volk_32s_32s_or_32s_a16.h
 delete mode 100644 volk/include/volk/volk_32s_s32f_convert_32f_a16.h
 delete mode 100644 volk/include/volk/volk_32s_s32f_convert_32f_ua16.h
 delete mode 100644 volk/include/volk/volk_64f_64f_max_64f_a16.h
 delete mode 100644 volk/include/volk/volk_64f_64f_min_64f_a16.h
 create mode 100644 volk/include/volk/volk_64f_convert_32f_u.h
 delete mode 100644 volk/include/volk/volk_64f_convert_32f_ua16.h
 create mode 100644 volk/include/volk/volk_64f_x2_max_64f_a16.h
 create mode 100644 volk/include/volk/volk_64f_x2_min_64f_a16.h
 create mode 100644 volk/include/volk/volk_8i_convert_16i_a16.h
 create mode 100644 volk/include/volk/volk_8i_convert_16i_u.h
 create mode 100644 volk/include/volk/volk_8i_s32f_convert_32f_a16.h
 create mode 100644 volk/include/volk/volk_8i_s32f_convert_32f_u.h
 create mode 100644 volk/include/volk/volk_8ic_deinterleave_16i_x2_a16.h
 create mode 100644 volk/include/volk/volk_8ic_deinterleave_real_16i_a16.h
 create mode 100644 volk/include/volk/volk_8ic_deinterleave_real_8i_a16.h
 create mode 100644 volk/include/volk/volk_8ic_s32f_deinterleave_32f_x2_a16.h
 create mode 100644 volk/include/volk/volk_8ic_s32f_deinterleave_real_32f_a16.h
 create mode 100644 volk/include/volk/volk_8ic_x2_multiply_conjugate_16ic_a16.h
 create mode 100644 volk/include/volk/volk_8ic_x2_s32f_multiply_conjugate_32fc_a16.h
 delete mode 100644 volk/include/volk/volk_8s_convert_16s_a16.h
 delete mode 100644 volk/include/volk/volk_8s_convert_16s_ua16.h
 delete mode 100644 volk/include/volk/volk_8s_s32f_convert_32f_a16.h
 delete mode 100644 volk/include/volk/volk_8s_s32f_convert_32f_ua16.h
 delete mode 100644 volk/include/volk/volk_8sc_8sc_multiply_conjugate_16sc_a16.h
 delete mode 100644 volk/include/volk/volk_8sc_8sc_s32f_multiply_conjugate_32fc_a16.h
 delete mode 100644 volk/include/volk/volk_8sc_deinterleave_16s_16s_a16.h
 delete mode 100644 volk/include/volk/volk_8sc_deinterleave_real_16s_a16.h
 delete mode 100644 volk/include/volk/volk_8sc_deinterleave_real_8s_a16.h
 delete mode 100644 volk/include/volk/volk_8sc_s32f_deinterleave_32f_32f_a16.h
 delete mode 100644 volk/include/volk/volk_8sc_s32f_deinterleave_real_32f_a16.h
 create mode 100644 volk/orc/volk_16i_s32f_deinterleave_32f_x2_a16_orc_impl.orc
 create mode 100644 volk/orc/volk_16ic_deinterleave_16i_x2_a16_orc_impl.orc
 create mode 100644 volk/orc/volk_16ic_deinterleave_real_8i_a16_orc_impl.orc
 create mode 100644 volk/orc/volk_16ic_magnitude_16i_a16_orc_impl.orc
 delete mode 100644 volk/orc/volk_16sc_deinterleave_16s_16s_a16_orc_impl.orc
 delete mode 100644 volk/orc/volk_16sc_deinterleave_real_8s_a16_orc_impl.orc
 delete mode 100644 volk/orc/volk_16sc_magnitude_16s_a16_orc_impl.orc
 delete mode 100644 volk/orc/volk_16sc_s32f_deinterleave_32f_32f_a16_orc_impl.orc
 delete mode 100644 volk/orc/volk_32f_32f_add_32f_a16_orc_impl.orc
 delete mode 100644 volk/orc/volk_32f_32f_divide_32f_a16_orc_impl.orc
 delete mode 100644 volk/orc/volk_32f_32f_max_32f_a16_orc_impl.orc
 delete mode 100644 volk/orc/volk_32f_32f_min_32f_a16_orc_impl.orc
 delete mode 100644 volk/orc/volk_32f_32f_multiply_32f_a16_orc_impl.orc
 delete mode 100644 volk/orc/volk_32f_32f_subtract_32f_a16_orc_impl.orc
 create mode 100644 volk/orc/volk_32f_x2_add_32f_a16_orc_impl.orc
 create mode 100644 volk/orc/volk_32f_x2_divide_32f_a16_orc_impl.orc
 create mode 100644 volk/orc/volk_32f_x2_max_32f_a16_orc_impl.orc
 create mode 100644 volk/orc/volk_32f_x2_min_32f_a16_orc_impl.orc
 create mode 100644 volk/orc/volk_32f_x2_multiply_32f_a16_orc_impl.orc
 create mode 100644 volk/orc/volk_32f_x2_subtract_32f_a16_orc_impl.orc
 delete mode 100644 volk/orc/volk_32fc_32fc_multiply_32fc_a16_orc_impl.orc
 create mode 100644 volk/orc/volk_32fc_s32f_magnitude_16i_a16_orc_impl.orc
 delete mode 100644 volk/orc/volk_32fc_s32f_magnitude_16s_a16_orc_impl.orc
 create mode 100644 volk/orc/volk_32fc_x2_multiply_32fc_a16_orc_impl.orc
 create mode 100644 volk/orc/volk_32i_x2_and_32i_a16_orc_impl.orc
 create mode 100644 volk/orc/volk_32i_x2_or_32i_a16_orc_impl.orc
 delete mode 100644 volk/orc/volk_32s_32s_and_32s_a16_orc_impl.orc
 delete mode 100644 volk/orc/volk_32s_32s_or_32s_a16_orc_impl.orc
 create mode 100644 volk/orc/volk_8i_convert_16i_a16_orc_impl.orc
 create mode 100644 volk/orc/volk_8i_s32f_convert_32f_a16_orc_impl.orc
 delete mode 100644 volk/orc/volk_8s_convert_16s_a16_orc_impl.orc
 delete mode 100644 volk/orc/volk_8s_s32f_convert_32f_a16_orc_impl.orc

diff --git a/volk/include/volk/Makefile.am b/volk/include/volk/Makefile.am
index aef1d7ba8..43c8ae9df 100644
--- a/volk/include/volk/Makefile.am
+++ b/volk/include/volk/Makefile.am
@@ -41,93 +41,93 @@ volkinclude_HEADERS = \
 	volk.h \
 	volk_cpu.h \
 	volk_environment_init.h \
-	volk_16s_add_quad_a16.h \
-	volk_16s_branch_4_state_8_a16.h \
-	volk_16sc_deinterleave_16s_16s_a16.h \
-	volk_16sc_s32f_deinterleave_32f_32f_a16.h \
-	volk_16sc_deinterleave_real_16s_a16.h \
-	volk_16sc_s32f_deinterleave_real_32f_a16.h \
-	volk_16sc_deinterleave_real_8s_a16.h \
-	volk_16sc_magnitude_16s_a16.h \
-	volk_16sc_s32f_magnitude_32f_a16.h \
-	volk_16s_s32f_convert_32f_a16.h \
-	volk_16s_s32f_convert_32f_ua16.h \
-	volk_16s_convert_8s_a16.h \
-	volk_16s_convert_8s_ua16.h \
-	volk_16s_max_star_16s_a16.h \
-	volk_16s_max_star_horizontal_16s_a16.h \
-	volk_16s_permute_and_scalar_add_a16.h \
-	volk_16s_quad_max_star_16s_a16.h \
+	volk_16i_x5_add_quad_16i_x4_a16.h \
+	volk_16i_branch_4_state_8_a16.h \
+	volk_16ic_deinterleave_16i_x2_a16.h \
+	volk_16ic_s32f_deinterleave_32f_x2_a16.h \
+	volk_16ic_deinterleave_real_16i_a16.h \
+	volk_16ic_s32f_deinterleave_real_32f_a16.h \
+	volk_16ic_deinterleave_real_8i_a16.h \
+	volk_16ic_magnitude_16i_a16.h \
+	volk_16ic_s32f_magnitude_32f_a16.h \
+	volk_16i_s32f_convert_32f_a16.h \
+	volk_16i_s32f_convert_32f_u.h \
+	volk_16i_convert_8i_a16.h \
+	volk_16i_convert_8i_u.h \
+	volk_16i_max_star_16i_a16.h \
+	volk_16i_max_star_horizontal_16i_a16.h \
+	volk_16i_permute_and_scalar_add_a16.h \
+	volk_16i_x4_quad_max_star_16i_a16.h \
 	volk_16u_byteswap_a16.h \
 	volk_32f_accumulator_s32f_a16.h \
-	volk_32f_32f_add_32f_a16.h \
+	volk_32f_x2_add_32f_a16.h \
 	volk_32fc_32f_multiply_32fc_a16.h \
 	volk_32fc_32f_power_32fc_a16.h \
 	volk_32f_calc_spectral_noise_floor_a16.h \
 	volk_32fc_s32f_atan2_32f_a16.h \
-	volk_32fc_32fc_conjugate_dot_prod_32fc_a16.h \
-	volk_32fc_deinterleave_32f_32f_a16.h \
-	volk_32fc_deinterleave_64f_64f_a16.h \
-	volk_32fc_deinterleave_real_16s_a16.h \
+	volk_32fc_x2_conjugate_dot_prod_32fc_a16.h \
+	volk_32fc_deinterleave_32f_x2_a16.h \
+	volk_32fc_deinterleave_64f_x2_a16.h \
+	volk_32fc_deinterleave_real_16i_a16.h \
 	volk_32fc_deinterleave_real_32f_a16.h \
 	volk_32fc_deinterleave_real_64f_a16.h \
-	volk_32fc_32fc_dot_prod_32fc_a16.h \
+	volk_32fc_x2_dot_prod_32fc_a16.h \
 	volk_32fc_index_max_16u_a16.h \
-	volk_32fc_s32f_magnitude_16s_a16.h \
+	volk_32fc_s32f_magnitude_16i_a16.h \
 	volk_32fc_magnitude_32f_a16.h \
-	volk_32fc_32fc_multiply_32fc_a16.h \
-	volk_32f_s32f_convert_16s_a16.h \
-	volk_32f_s32f_convert_16s_ua16.h \
-	volk_32f_s32f_convert_32s_a16.h \
-	volk_32f_s32f_convert_32s_ua16.h \
+	volk_32fc_x2_multiply_32fc_a16.h \
+	volk_32f_s32f_convert_16i_a16.h \
+	volk_32f_s32f_convert_16i_u.h \
+	volk_32f_s32f_convert_32i_a16.h \
+	volk_32f_s32f_convert_32i_u.h \
 	volk_32f_convert_64f_a16.h \
-	volk_32f_convert_64f_ua16.h \
-	volk_32f_s32f_convert_8s_a16.h \
-	volk_32f_s32f_convert_8s_ua16.h \
-	volk_32fc_s32f_s32f_power_spectral_density_32f_a16.h \
+	volk_32f_convert_64f_u.h \
+	volk_32f_s32f_convert_8i_a16.h \
+	volk_32f_s32f_convert_8i_u.h \
+	volk_32fc_s32f_x2_power_spectral_density_32f_a16.h \
 	volk_32fc_s32f_power_spectrum_32f_a16.h \
-	volk_32fc_32fc_square_dist_32f_a16.h \
-	volk_32fc_32fc_s32f_square_dist_scalar_mult_32f_a16.h \
-	volk_32f_32f_divide_32f_a16.h \
-	volk_32f_32f_dot_prod_32f_a16.h \
-	volk_32f_32f_dot_prod_32f_ua16.h \
+	volk_32fc_x2_square_dist_32f_a16.h \
+	volk_32fc_x2_s32f_square_dist_scalar_mult_32f_a16.h \
+	volk_32f_x2_divide_32f_a16.h \
+	volk_32f_x2_dot_prod_32f_a16.h \
+	volk_32f_x2_dot_prod_32f_u.h \
 	volk_32f_s32f_32f_fm_detect_32f_a16.h \
 	volk_32f_index_max_16u_a16.h \
-	volk_32f_32f_s32f_interleave_16sc_a16.h \
-	volk_32f_32f_interleave_32fc_a16.h \
-	volk_32f_32f_max_32f_a16.h \
-	volk_32f_32f_min_32f_a16.h \
-	volk_32f_32f_multiply_32f_a16.h \
+	volk_32f_x2_s32f_interleave_16ic_a16.h \
+	volk_32f_x2_interleave_32fc_a16.h \
+	volk_32f_x2_max_32f_a16.h \
+	volk_32f_x2_min_32f_a16.h \
+	volk_32f_x2_multiply_32f_a16.h \
 	volk_32f_s32f_normalize_a16.h \
 	volk_32f_s32f_power_32f_a16.h \
 	volk_32f_sqrt_32f_a16.h \
 	volk_32f_s32f_stddev_32f_a16.h \
-	volk_32f_stddev_and_mean_32f_32f_a16.h \
-	volk_32f_32f_subtract_32f_a16.h \
-	volk_32f_32f_32f_sum_of_poly_32f_a16.h \
-	volk_32s_32s_and_32s_a16.h \
-	volk_32s_s32f_convert_32f_a16.h \
-	volk_32s_s32f_convert_32f_ua16.h \
-	volk_32s_32s_or_32s_a16.h \
+	volk_32f_stddev_and_mean_32f_x2_a16.h \
+	volk_32f_x2_subtract_32f_a16.h \
+	volk_32f_x3_sum_of_poly_32f_a16.h \
+	volk_32i_x2_and_32i_a16.h \
+	volk_32i_s32f_convert_32f_a16.h \
+	volk_32i_s32f_convert_32f_u.h \
+	volk_32i_x2_or_32i_a16.h \
 	volk_32u_byteswap_a16.h \
 	volk_32u_popcnt_a16.h \
 	volk_64f_convert_32f_a16.h \
-	volk_64f_convert_32f_ua16.h \
-	volk_64f_64f_max_64f_a16.h \
-	volk_64f_64f_min_64f_a16.h \
+	volk_64f_convert_32f_u.h \
+	volk_64f_x2_max_64f_a16.h \
+	volk_64f_x2_min_64f_a16.h \
 	volk_64u_byteswap_a16.h \
 	volk_64u_popcnt_a16.h \
-	volk_8sc_deinterleave_16s_16s_a16.h \
-	volk_8sc_s32f_deinterleave_32f_32f_a16.h \
-	volk_8sc_deinterleave_real_16s_a16.h \
-	volk_8sc_s32f_deinterleave_real_32f_a16.h \
-	volk_8sc_deinterleave_real_8s_a16.h \
-	volk_8sc_8sc_multiply_conjugate_16sc_a16.h \
-	volk_8sc_8sc_s32f_multiply_conjugate_32fc_a16.h \
-	volk_8s_convert_16s_a16.h \
-	volk_8s_convert_16s_ua16.h \
-	volk_8s_s32f_convert_32f_a16.h \
-	volk_8s_s32f_convert_32f_ua16.h 
+	volk_8ic_deinterleave_16i_x2_a16.h \
+	volk_8ic_s32f_deinterleave_32f_x2_a16.h \
+	volk_8ic_deinterleave_real_16i_a16.h \
+	volk_8ic_s32f_deinterleave_real_32f_a16.h \
+	volk_8ic_deinterleave_real_8i_a16.h \
+	volk_8ic_x2_multiply_conjugate_16ic_a16.h \
+	volk_8ic_x2_s32f_multiply_conjugate_32fc_a16.h \
+	volk_8i_convert_16i_a16.h \
+	volk_8i_convert_16i_u.h \
+	volk_8i_s32f_convert_32f_a16.h \
+	volk_8i_s32f_convert_32f_u.h 
 
 VOLK_MKTABLES_SOURCES = \
 	$(top_srcdir)/lib/volk_rank_archs.c \
diff --git a/volk/include/volk/make_c.py b/volk/include/volk/make_c.py
index f708ba7d0..6e75067d0 100644
--- a/volk/include/volk/make_c.py
+++ b/volk/include/volk/make_c.py
@@ -24,7 +24,7 @@ def make_c(funclist, taglist, arched_arglist, retlist, my_arglist, fcountlist) :
     tempstring = tempstring + "    }\n"
     tempstring = tempstring + "    return 0;\n"
     tempstring = tempstring + "}\n"
-    
+
     for i in range(len(funclist)): 
         tempstring = tempstring + "static const " + replace_volk.sub("p", funclist[i]) + " " + funclist[i] + "_archs[] = {\n";
         
diff --git a/volk/include/volk/volk_16i_branch_4_state_8_a16.h b/volk/include/volk/volk_16i_branch_4_state_8_a16.h
new file mode 100644
index 000000000..3437c1a6b
--- /dev/null
+++ b/volk/include/volk/volk_16i_branch_4_state_8_a16.h
@@ -0,0 +1,194 @@
+#ifndef INCLUDED_volk_16i_branch_4_state_8_a16_H
+#define INCLUDED_volk_16i_branch_4_state_8_a16_H
+
+
+#include<inttypes.h>
+#include<stdio.h>	
+
+
+
+
+#if LV_HAVE_SSSE3
+
+#include<xmmintrin.h>
+#include<emmintrin.h>
+#include<tmmintrin.h>
+
+static inline  void volk_16i_branch_4_state_8_a16_ssse3(short* target,  short* src0, char** permuters, short* cntl2, short* cntl3, short* scalars) {
+  /* Builds four 8-lane int16 vectors in target from four byte-shuffles of one src0 vector, */
+  /* adding broadcast scalars and masked cntl2/cntl3 terms. Uses aligned 16-byte loads/stores. */
+  __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11;
+
+  __m128i *p_target, *p_src0, *p_cntl2, *p_cntl3, *p_scalars;
+
+  
+  
+  p_target = (__m128i*)target;
+  p_src0 = (__m128i*)src0;
+  p_cntl2 = (__m128i*)cntl2;
+  p_cntl3 = (__m128i*)cntl3;
+  p_scalars = (__m128i*)scalars;
+  
+  int i = 0;
+  
+  int bound = 1;
+  /* bound is fixed at 1, so the loop below runs exactly once. */
+  /* Load eight scalars, then broadcast lanes 0..3 into xmm1..xmm4 in two shuffle steps. */
+  xmm0 = _mm_load_si128(p_scalars);
+  
+  xmm1 = _mm_shufflelo_epi16(xmm0, 0);
+  xmm2 = _mm_shufflelo_epi16(xmm0, 0x55);
+  xmm3 = _mm_shufflelo_epi16(xmm0, 0xaa);
+  xmm4 = _mm_shufflelo_epi16(xmm0, 0xff);
+  /* Replicate the low dword so every 16-bit lane holds the selected scalar. */
+  xmm1 = _mm_shuffle_epi32(xmm1, 0x00);
+  xmm2 = _mm_shuffle_epi32(xmm2, 0x00);
+  xmm3 = _mm_shuffle_epi32(xmm3, 0x00);
+  xmm4 = _mm_shuffle_epi32(xmm4, 0x00);
+  /* Four 16-byte shuffle masks, one per output vector. */
+  xmm0 = _mm_load_si128((__m128i*)permuters[0]);
+  xmm6 = _mm_load_si128((__m128i*)permuters[1]);
+  xmm8 = _mm_load_si128((__m128i*)permuters[2]);
+  xmm10 = _mm_load_si128((__m128i*)permuters[3]);
+
+  for(; i < bound; ++i) {
+    
+    xmm5 = _mm_load_si128(p_src0);
+    
+    
+    
+    
+    
+    
+    
+
+    /* Shuffle src0 by each mask; the result overwrites the register that held the mask. */
+    xmm0 = _mm_shuffle_epi8(xmm5, xmm0);
+    xmm6 = _mm_shuffle_epi8(xmm5, xmm6);
+    xmm8 = _mm_shuffle_epi8(xmm5, xmm8);
+    xmm10 = _mm_shuffle_epi8(xmm5, xmm10);
+    
+    p_src0 += 4;
+   
+    /* xmm5 = scalars[0] + scalars[1]; xmm6 and xmm8 each receive one of the two scalars. */
+    xmm5 = _mm_add_epi16(xmm1, xmm2);
+    
+    xmm6 = _mm_add_epi16(xmm2, xmm6);
+    xmm8 = _mm_add_epi16(xmm1, xmm8);
+   
+    /* Control vectors for output 0. */
+    xmm7 = _mm_load_si128(p_cntl2);
+    xmm9 = _mm_load_si128(p_cntl3);
+    
+    xmm0 = _mm_add_epi16(xmm5, xmm0);
+    
+    /* Mask cntl2 with broadcast scalars[2] and cntl3 with broadcast scalars[3]. */
+    xmm7 = _mm_and_si128(xmm7, xmm3);
+    xmm9 = _mm_and_si128(xmm9, xmm4);
+    
+    xmm5 = _mm_load_si128(&p_cntl2[1]);
+    xmm11 = _mm_load_si128(&p_cntl3[1]);
+
+    xmm7 = _mm_add_epi16(xmm7, xmm9);
+
+    xmm5 = _mm_and_si128(xmm5, xmm3);
+    xmm11 = _mm_and_si128(xmm11, xmm4);
+
+    xmm0 = _mm_add_epi16(xmm0, xmm7);
+   
+   
+    /* Control vectors for output 2 (output 1's are already in xmm5/xmm11). */
+    xmm7 = _mm_load_si128(&p_cntl2[2]);
+    xmm9 = _mm_load_si128(&p_cntl3[2]);
+    
+    xmm5 = _mm_add_epi16(xmm5, xmm11);
+    
+    xmm7 = _mm_and_si128(xmm7, xmm3);
+    xmm9 = _mm_and_si128(xmm9, xmm4);
+    
+    xmm6 = _mm_add_epi16(xmm6, xmm5);
+   
+    /* Control vectors for output 3. */
+    xmm5 = _mm_load_si128(&p_cntl2[3]);
+    xmm11 = _mm_load_si128(&p_cntl3[3]);
+    
+    xmm7 = _mm_add_epi16(xmm7, xmm9);
+    
+    xmm5 = _mm_and_si128(xmm5, xmm3);
+    xmm11 = _mm_and_si128(xmm11, xmm4);
+    
+    xmm8 = _mm_add_epi16(xmm8, xmm7);
+    
+    xmm5 = _mm_add_epi16(xmm5, xmm11);
+    /* Store the four finished vectors; xmm10 is completed between the stores. */
+    _mm_store_si128(p_target, xmm0);
+    _mm_store_si128(&p_target[1], xmm6);
+
+    xmm10 = _mm_add_epi16(xmm5, xmm10);
+    
+    _mm_store_si128(&p_target[2], xmm8);
+    
+    _mm_store_si128(&p_target[3], xmm10);
+    /* NOTE(review): src0 advances by 4 vectors but target by 3; harmless while bound == 1. */
+    p_target += 3;   
+  }
+}
+	
+	
+#endif /*LV_HAVE_SSSE3*/
+
+#if LV_HAVE_GENERIC
+static inline  void volk_16i_branch_4_state_8_a16_generic(short* target,  short* src0, char** permuters, short* cntl2, short* cntl3, short* scalars) {
+	int i = 0;
+	/* Scalar version: target[i*8 + k] for rows i = 0..3, k = 0..7, reading src0[permuters[i][2*k] / 2]. */
+	int bound = 4;
+	/* Adds scalars[0] on even rows, scalars[1] on rows 0-1, plus (cntl2 & scalars[2]) + (cntl3 & scalars[3]). */
+	for(; i < bound; ++i) {
+	  target[i* 8] = src0[((char)permuters[i][0])/2] 
+	    + ((i + 1)%2  * scalars[0])
+	    + (((i >> 1)^1) * scalars[1])
+	    + (cntl2[i * 8] & scalars[2])
+	    + (cntl3[i * 8] & scalars[3]);
+	  target[i* 8 + 1] = src0[((char)permuters[i][1 * 2])/2] 
+	    + ((i + 1)%2  * scalars[0])
+	    + (((i >> 1)^1) * scalars[1])
+	    + (cntl2[i * 8 + 1] & scalars[2])
+	    + (cntl3[i * 8 + 1] & scalars[3]);
+	  target[i* 8 + 2] = src0[((char)permuters[i][2 * 2])/2] 
+	    + ((i + 1)%2  * scalars[0])
+	    + (((i >> 1)^1) * scalars[1])
+	    + (cntl2[i * 8 + 2] & scalars[2])
+	    + (cntl3[i * 8 + 2] & scalars[3]);
+	  target[i* 8 + 3] = src0[((char)permuters[i][3 * 2])/2] 
+	    + ((i + 1)%2  * scalars[0])
+	    + (((i >> 1)^1) * scalars[1])
+	    + (cntl2[i * 8 + 3] & scalars[2])
+	    + (cntl3[i * 8 + 3] & scalars[3]);
+	  target[i* 8 + 4] = src0[((char)permuters[i][4 * 2])/2] 
+	    + ((i + 1)%2  * scalars[0])
+	    + (((i >> 1)^1) * scalars[1])
+	    + (cntl2[i * 8 + 4] & scalars[2])
+	    + (cntl3[i * 8 + 4] & scalars[3]);
+	  target[i* 8 + 5] = src0[((char)permuters[i][5 * 2])/2] 
+	    + ((i + 1)%2  * scalars[0])
+	    + (((i >> 1)^1) * scalars[1])
+	    + (cntl2[i * 8 + 5] & scalars[2])
+	    + (cntl3[i * 8 + 5] & scalars[3]);
+	  target[i* 8 + 6] = src0[((char)permuters[i][6 * 2])/2] 
+	    + ((i + 1)%2  * scalars[0])
+	    + (((i >> 1)^1) * scalars[1])
+	    + (cntl2[i * 8 + 6] & scalars[2])
+	    + (cntl3[i * 8 + 6] & scalars[3]);
+	  target[i* 8 + 7] = src0[((char)permuters[i][7 * 2])/2] 
+	    + ((i + 1)%2  * scalars[0])
+	    + (((i >> 1)^1) * scalars[1])
+	    + (cntl2[i * 8 + 7] & scalars[2])
+	    + (cntl3[i * 8 + 7] & scalars[3]);
+	  
+	}
+}
+
+#endif /*LV_HAVE_GENERIC*/
+
+
+#endif /*INCLUDED_volk_16i_branch_4_state_8_a16_H*/
diff --git a/volk/include/volk/volk_16i_convert_8i_a16.h b/volk/include/volk/volk_16i_convert_8i_a16.h
new file mode 100644
index 000000000..73e45ad63
--- /dev/null
+++ b/volk/include/volk/volk_16i_convert_8i_a16.h
@@ -0,0 +1,69 @@
+#ifndef INCLUDED_volk_16i_convert_8i_a16_H
+#define INCLUDED_volk_16i_convert_8i_a16_H
+
+#include <inttypes.h>
+#include <stdio.h>
+
+#if LV_HAVE_SSE2
+#include <emmintrin.h>
+/*!
+  \brief Narrows 16 bit integers to 8 bit integers by keeping only the upper byte of each value
+  \param outputVector The 8 bit output data buffer (written with aligned 16 byte stores)
+  \param inputVector The 16 bit input data buffer (read with aligned 16 byte loads)
+  \param num_points The number of data values to be converted
+*/
+static inline void volk_16i_convert_8i_a16_sse2(int8_t* outputVector, const int16_t* inputVector, unsigned int num_points){
+    const unsigned int numBlocks = num_points / 16;
+    const __m128i* src = (const __m128i*)inputVector;
+    __m128i* dst = (__m128i*)outputVector;
+    unsigned int block;
+    unsigned int idx;
+
+    // Each pass consumes 16 inputs (two registers) and emits 16 output bytes
+    for(block = 0; block < numBlocks; block++){
+      __m128i firstEight;
+      __m128i secondEight;
+
+      // Arithmetic shift moves the upper byte down while preserving the sign
+      firstEight = _mm_srai_epi16(_mm_load_si128(src), 8);
+      secondEight = _mm_srai_epi16(_mm_load_si128(src + 1), 8);
+
+      // After the shift every lane fits in 8 bits, so the signed pack
+      // only narrows the lanes and its saturation never triggers
+      _mm_store_si128(dst, _mm_packs_epi16(firstEight, secondEight));
+
+      src += 2;
+      dst += 1;
+    }
+
+    // Scalar tail: handle the points that do not fill a whole 16 value block
+    idx = numBlocks * 16;
+    while(idx < num_points){
+      outputVector[idx] = (int8_t)(inputVector[idx] >> 8);
+      idx++;
+    }
+}
+#endif /* LV_HAVE_SSE2 */
+
+#ifdef LV_HAVE_GENERIC
+/*!
+  \brief Narrows 16 bit integers to 8 bit integers by keeping only the upper byte of each value
+  \param outputVector The 8 bit output data buffer
+  \param inputVector The 16 bit input data buffer
+  \param num_points The number of data values to be converted
+*/
+static inline void volk_16i_convert_8i_a16_generic(int8_t* outputVector, const int16_t* inputVector, unsigned int num_points){
+  unsigned int idx;
+
+  // Plain C reference: shift the upper byte down, then truncate to 8 bits
+  for(idx = 0; idx < num_points; idx++){
+    const int16_t sample = inputVector[idx];
+    outputVector[idx] = (int8_t)(sample >> 8);
+  }
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+
+
+#endif /* INCLUDED_volk_16i_convert_8i_a16_H */
diff --git a/volk/include/volk/volk_16i_convert_8i_u.h b/volk/include/volk/volk_16i_convert_8i_u.h
new file mode 100644
index 000000000..5fc792b56
--- /dev/null
+++ b/volk/include/volk/volk_16i_convert_8i_u.h
@@ -0,0 +1,71 @@
+#ifndef INCLUDED_volk_16i_convert_8i_u_H
+#define INCLUDED_volk_16i_convert_8i_u_H
+
+#include <inttypes.h>
+#include <stdio.h>
+
+#if LV_HAVE_SSE2
+#include <emmintrin.h>
+/*!
+  \brief Narrows 16 bit integers to 8 bit integers by keeping only the upper byte of each value
+  \param outputVector The 8 bit output data buffer (written with unaligned 16 byte stores)
+  \param inputVector The 16 bit input data buffer (read with unaligned 16 byte loads)
+  \param num_points The number of data values to be converted
+  \note Input and output buffers do NOT need to be properly aligned
+*/
+static inline void volk_16i_convert_8i_u_sse2(int8_t* outputVector, const int16_t* inputVector, unsigned int num_points){
+    const unsigned int numBlocks = num_points / 16;
+    const __m128i* src = (const __m128i*)inputVector;
+    __m128i* dst = (__m128i*)outputVector;
+    unsigned int block;
+    unsigned int idx;
+
+    // Each pass consumes 16 inputs (two registers) and emits 16 output bytes
+    for(block = 0; block < numBlocks; block++){
+      __m128i firstEight;
+      __m128i secondEight;
+
+      // Arithmetic shift moves the upper byte down while preserving the sign
+      firstEight = _mm_srai_epi16(_mm_loadu_si128(src), 8);
+      secondEight = _mm_srai_epi16(_mm_loadu_si128(src + 1), 8);
+
+      // After the shift every lane fits in 8 bits, so the signed pack
+      // only narrows the lanes and its saturation never triggers
+      _mm_storeu_si128(dst, _mm_packs_epi16(firstEight, secondEight));
+
+      src += 2;
+      dst += 1;
+    }
+
+    // Scalar tail: handle the points that do not fill a whole 16 value block
+    idx = numBlocks * 16;
+    while(idx < num_points){
+      outputVector[idx] = (int8_t)(inputVector[idx] >> 8);
+      idx++;
+    }
+}
+#endif /* LV_HAVE_SSE2 */
+
+#ifdef LV_HAVE_GENERIC
+/*!
+  \brief Narrows 16 bit integers to 8 bit integers by keeping only the upper byte of each value
+  \param outputVector The 8 bit output data buffer
+  \param inputVector The 16 bit input data buffer
+  \param num_points The number of data values to be converted
+  \note Input and output buffers do NOT need to be properly aligned
+*/
+static inline void volk_16i_convert_8i_u_generic(int8_t* outputVector, const int16_t* inputVector, unsigned int num_points){
+  unsigned int idx;
+
+  // Plain C reference: shift the upper byte down, then truncate to 8 bits
+  for(idx = 0; idx < num_points; idx++){
+    const int16_t sample = inputVector[idx];
+    outputVector[idx] = (int8_t)(sample >> 8);
+  }
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+
+
+#endif /* INCLUDED_volk_16i_convert_8i_u_H */
diff --git a/volk/include/volk/volk_16i_max_star_16i_a16.h b/volk/include/volk/volk_16i_max_star_16i_a16.h
new file mode 100644
index 000000000..ff57bd2a1
--- /dev/null
+++ b/volk/include/volk/volk_16i_max_star_16i_a16.h
@@ -0,0 +1,108 @@
+#ifndef INCLUDED_volk_16i_max_star_16i_a16_H
+#define INCLUDED_volk_16i_max_star_16i_a16_H
+
+
+#include<inttypes.h>
+#include<stdio.h>	
+
+
+#if LV_HAVE_SSSE3
+
+#include<xmmintrin.h>
+#include<emmintrin.h>
+#include<tmmintrin.h>
+
+/*!
+  \brief Reduces a buffer of 16 bit integers to a single "max star" value
+  \param target Receives the result in target[0]
+  \param src0 The 16 bit input data buffer (read with aligned 16 byte loads)
+  \param num_bytes The size of the input buffer in bytes (two bytes per value)
+  \note src0[0] is read unconditionally, so the buffer must hold at least one value
+  \note The vector loop takes a true signed maximum, while the scalar folding
+        steps compare through a difference that is truncated to 16 bits
+*/
+static inline  void volk_16i_max_star_16i_a16_ssse3(short* target, short* src0, unsigned int num_bytes) {
+
+
+  // Running result, seeded with the first input value
+  short candidate = src0[0];
+
+  // Scratch space used to read the vector lanes back one by one
+  short cands[8];
+
+  __m128i lane_max;
+  __m128i block;
+
+  // Read cursor that steps through the input one register at a time
+  __m128i *p_src0;
+
+  p_src0 = (__m128i*)src0;
+
+  // Number of whole 8 value registers in the input
+  int bound = num_bytes >> 4;
+
+  // Number of values left over after the last whole register
+  int leftovers = (num_bytes >> 1) & 7;
+
+  int i = 0;
+
+  // Seed every lane with the first input value.  Seeding the lanes with zero
+  // would make zero compete as a candidate and yield a wrong result whenever
+  // all of the inputs are negative.
+  lane_max = _mm_set1_epi16(candidate);
+
+  // Main loop: one aligned register of eight values per iteration
+  for(i = 0; i < bound; ++i) {
+    block = _mm_load_si128(p_src0);
+    p_src0 += 1;
+
+    // Per lane signed maximum of the running value and the new data
+    lane_max = _mm_max_epi16(lane_max, block);
+  }
+
+  // cands is an ordinary stack array with no 16 byte alignment guarantee,
+  // so it has to be written with an unaligned store; an aligned store
+  // could fault here
+  _mm_storeu_si128((__m128i*)cands, lane_max);
+
+  // Fold the eight lane results into the scalar candidate
+  for(i = 0; i < 8; ++i) {
+    candidate = ((short)(candidate - cands[i]) > 0) ? candidate : cands[i];
+  }
+
+  // Fold in the values that did not fill a whole register
+  for(i = 0; i < leftovers; ++i) {
+    short value = src0[(bound << 3) + i];
+
+    candidate = ((short)(candidate - value) > 0) ? candidate : value;
+  }
+
+  // Publish the final result
+  target[0] = candidate;
+
+
+}
+ 
+#endif /*LV_HAVE_SSSE3*/
+
+#if LV_HAVE_GENERIC
+
+static inline void volk_16i_max_star_16i_a16_generic(short* target, short* src0, unsigned int num_bytes) {
+  // num_bytes counts bytes; every value occupies two of them
+  const int num_values = num_bytes >> 1;
+  short best = src0[0];
+  int idx;
+  for(idx = 1; idx < num_values; ++idx) {
+    // Keep the running value only when the 16 bit truncated difference is positive
+    if((short)(best - src0[idx]) <= 0) {
+      best = src0[idx];
+    }
+  }
+  target[0] = best;
+}
+
+
+#endif /*LV_HAVE_GENERIC*/
+
+
+#endif /*INCLUDED_volk_16i_max_star_16i_a16_H*/
diff --git a/volk/include/volk/volk_16i_max_star_horizontal_16i_a16.h b/volk/include/volk/volk_16i_max_star_horizontal_16i_a16.h
new file mode 100644
index 000000000..695e08dbf
--- /dev/null
+++ b/volk/include/volk/volk_16i_max_star_horizontal_16i_a16.h
@@ -0,0 +1,130 @@
+#ifndef INCLUDED_volk_16i_max_star_horizontal_16i_a16_H
+#define INCLUDED_volk_16i_max_star_horizontal_16i_a16_H
+
+
+#include<inttypes.h>
+#include<stdio.h>	
+
+
+#if LV_HAVE_SSSE3
+
+#include<xmmintrin.h>
+#include<emmintrin.h>
+#include<tmmintrin.h>
+
+/*!
+  \brief Reduces each adjacent pair of 16 bit inputs to the "max star" of the pair
+  \param target Output buffer receiving one value per input pair (the main loop uses aligned 16 byte stores)
+  \param src0 The 16 bit input data buffer (read with aligned 16 byte loads)
+  \param num_bytes The size of the input buffer in bytes (two bytes per value)
+*/
+static inline  void volk_16i_max_star_horizontal_16i_a16_ssse3(int16_t* target, int16_t* src0, unsigned int num_bytes) {
+
+  const static uint8_t shufmask0[16] = {0x00, 0x01, 0x04, 0x05, 0x08, 0x09, 0x0c, 0x0d, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff};
+  const static uint8_t shufmask1[16] = {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x01, 0x04, 0x05, 0x08, 0x09, 0x0c, 0x0d};
+  const static uint8_t andmask0[16] = {0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02,0x02, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00};
+  const static uint8_t andmask1[16] = {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02};
+
+  __m128i pairs_lo, pairs_hi;
+  __m128i diff, second_wins;
+  __m128i pick_lo, pick_hi;
+  __m128i shuf_lo, shuf_hi, step_lo, step_hi;
+  __m128i zero;
+
+  // The tables above are plain byte arrays without any alignment guarantee,
+  // so they have to be fetched with unaligned loads
+  shuf_lo = _mm_loadu_si128((const __m128i*)shufmask0);
+  shuf_hi = _mm_loadu_si128((const __m128i*)shufmask1);
+  step_lo = _mm_loadu_si128((const __m128i*)andmask0);
+  step_hi = _mm_loadu_si128((const __m128i*)andmask1);
+  zero = _mm_setzero_si128();
+
+  __m128i *p_target, *p_src0;
+
+  p_target = (__m128i*)target;
+  p_src0 = (__m128i*)src0;
+
+  // Blocks of 16 values, then at most one block of 8 values, then single pairs
+  int bound = num_bytes >> 5;
+  int intermediate = (num_bytes >> 4) & 1;
+  int leftovers = (num_bytes >> 1) & 7;
+
+  int i = 0;
+
+  for(i = 0; i < bound; ++i) {
+
+    pairs_lo = _mm_load_si128(p_src0);
+    pairs_hi = _mm_load_si128(&p_src0[1]);
+    p_src0 += 2;
+
+    // One lane per pair: first value minus second value
+    diff = _mm_hsub_epi16(pairs_lo, pairs_hi);
+
+    // All ones in the lanes where the 16 bit difference is negative
+    second_wins = _mm_cmpgt_epi16(zero, diff);
+
+    // Advance the shuffle index by two bytes (one value) wherever the
+    // second value of the pair has to be selected
+    pick_lo = _mm_add_epi8(_mm_and_si128(second_wins, step_lo), shuf_lo);
+    pick_hi = _mm_add_epi8(_mm_and_si128(second_wins, step_hi), shuf_hi);
+
+    // Winners of the first register land in the lower half, winners of the
+    // second register in the upper half; the 0xff indexes zero the rest
+    pairs_lo = _mm_shuffle_epi8(pairs_lo, pick_lo);
+    pairs_hi = _mm_shuffle_epi8(pairs_hi, pick_hi);
+
+    // The halves do not overlap, so the addition simply merges them
+    _mm_store_si128(p_target, _mm_add_epi16(pairs_lo, pairs_hi));
+
+    p_target += 1;
+
+  }
+
+  for(i = 0; i < intermediate; ++i) {
+
+    pairs_lo = _mm_load_si128(p_src0);
+    p_src0 += 1;
+
+    // Only the lower four lanes are needed here, so the same register
+    // serves as both operands of the horizontal subtraction
+    diff = _mm_hsub_epi16(pairs_lo, pairs_lo);
+    second_wins = _mm_cmpgt_epi16(zero, diff);
+
+    pick_lo = _mm_add_epi8(_mm_and_si128(second_wins, step_lo), shuf_lo);
+    pairs_lo = _mm_shuffle_epi8(pairs_lo, pick_lo);
+
+    // Store just the four results held in the lower 64 bits
+    _mm_storel_epi64(p_target, pairs_lo);
+
+    p_target = (__m128i*)((int8_t*)p_target + 8);
+
+  }
+
+  // Remaining pairs are handled one at a time in plain C
+  for(i = (bound << 4) + (intermediate << 3); i < (bound << 4) + (intermediate << 3) + leftovers ; i += 2) {
+    target[i>>1] = ((int16_t)(src0[i] - src0[i + 1]) > 0) ? src0[i] : src0[i + 1];
+  }
+}
+ 
+#endif /*LV_HAVE_SSSE3*/
+
+
+#if LV_HAVE_GENERIC
+static inline void volk_16i_max_star_horizontal_16i_a16_generic(int16_t* target, int16_t* src0, unsigned int num_bytes) {
+  // num_bytes counts bytes; every value occupies two of them
+  const int num_values = num_bytes >> 1;
+  int first;
+  // Walk the input one pair at a time; output slot k belongs to inputs 2k and 2k+1
+  for(first = 0; first < num_values; first += 2) {
+    const int16_t lhs = src0[first];
+    const int16_t rhs = src0[first + 1];
+    // The difference is truncated to 16 bits before its sign is tested
+    target[first >> 1] = ((int16_t)(lhs - rhs) > 0) ? lhs : rhs;
+  }
+}
+
+
+
+#endif /*LV_HAVE_GENERIC*/
+
+#endif /*INCLUDED_volk_16i_max_star_horizontal_16i_a16_H*/
diff --git a/volk/include/volk/volk_16i_permute_and_scalar_add_a16.h b/volk/include/volk/volk_16i_permute_and_scalar_add_a16.h
new file mode 100644
index 000000000..e52a949fb
--- /dev/null
+++ b/volk/include/volk/volk_16i_permute_and_scalar_add_a16.h
@@ -0,0 +1,139 @@
+#ifndef INCLUDED_volk_16i_permute_and_scalar_add_a16_H
+#define INCLUDED_volk_16i_permute_and_scalar_add_a16_H
+
+
+#include<inttypes.h>
+#include<stdio.h>	
+
+
+
+
+#if LV_HAVE_SSE2
+
+#include<xmmintrin.h>
+#include<emmintrin.h>
+
+/*!
+  \brief Gathers values from src0 through an index table and adds four masked scalars to each of them
+  \param target Output buffer (written with aligned 16 byte stores)
+  \param src0 Table the permute indexes select from
+  \param permute_indexes For every output value, the element index to read from src0
+  \param cntl0 Mask ANDed with scalars[0] before the addition (read with aligned loads)
+  \param cntl1 Mask ANDed with scalars[1] before the addition (read with aligned loads)
+  \param cntl2 Mask ANDed with scalars[2] before the addition (read with aligned loads)
+  \param cntl3 Mask ANDed with scalars[3] before the addition (read with aligned loads)
+  \param scalars The four scalar addends; only scalars[0] through scalars[3] are read
+  \param num_bytes The size of the output buffer in bytes (two bytes per value)
+*/
+static inline  void volk_16i_permute_and_scalar_add_a16_sse2(short* target,  short* src0, short* permute_indexes, short* cntl0, short* cntl1, short* cntl2, short* cntl3, short* scalars, unsigned int num_bytes) {
+
+
+  // Gathered source values and the running sum
+  __m128i gathered, sum;
+  // Control masked scalar terms
+  __m128i term0, term1, term2, term3;
+  // The four scalars, each broadcast to all eight lanes
+  __m128i splat0, splat1, splat2, splat3;
+
+  // Cursors that step through the output and the four control buffers
+  __m128i *p_target, *p_cntl0, *p_cntl1, *p_cntl2, *p_cntl3;
+
+  // Cursor into the index table, advanced by eight per iteration
+  short* p_permute_indexes = permute_indexes;
+
+  p_target = (__m128i*)target;
+  p_cntl0 = (__m128i*)cntl0;
+  p_cntl1 = (__m128i*)cntl1;
+  p_cntl2 = (__m128i*)cntl2;
+  p_cntl3 = (__m128i*)cntl3;
+
+  // Loop counter shared by the vector loop and the scalar tail
+  int i = 0;
+
+  // Whole 8 value registers, then the values that do not fill a register
+  int bound = (num_bytes >> 4);
+  int leftovers = (num_bytes >> 1) & 7;
+
+  // Broadcast each scalar to all eight lanes.  Reading the four values one
+  // by one touches only scalars[0..3]; a full 16 byte vector load would
+  // read past the end of a four element array and demand its alignment.
+  splat0 = _mm_set1_epi16(scalars[0]);
+  splat1 = _mm_set1_epi16(scalars[1]);
+  splat2 = _mm_set1_epi16(scalars[2]);
+  splat3 = _mm_set1_epi16(scalars[3]);
+
+  for(; i < bound; ++i) {
+
+    // Gather the eight permuted source values.  _mm_set_epi16 lists its
+    // arguments from the highest lane down to the lowest.
+    gathered = _mm_set_epi16(src0[p_permute_indexes[7]],
+                             src0[p_permute_indexes[6]],
+                             src0[p_permute_indexes[5]],
+                             src0[p_permute_indexes[4]],
+                             src0[p_permute_indexes[3]],
+                             src0[p_permute_indexes[2]],
+                             src0[p_permute_indexes[1]],
+                             src0[p_permute_indexes[0]]);
+
+    // Move on to the next eight indexes
+    p_permute_indexes += 8;
+
+    // Mask every scalar with its control word
+    term0 = _mm_and_si128(_mm_load_si128(p_cntl0), splat0);
+    term1 = _mm_and_si128(_mm_load_si128(p_cntl1), splat1);
+    term2 = _mm_and_si128(_mm_load_si128(p_cntl2), splat2);
+    term3 = _mm_and_si128(_mm_load_si128(p_cntl3), splat3);
+
+    // Advance the control cursors to the next register
+    p_cntl0 += 1;
+    p_cntl1 += 1;
+    p_cntl2 += 1;
+    p_cntl3 += 1;
+
+    // 16 bit additions wrap, so the order of the summation does not matter
+    sum = _mm_add_epi16(gathered, term0);
+    sum = _mm_add_epi16(sum, term1);
+    sum = _mm_add_epi16(sum, term2);
+    sum = _mm_add_epi16(sum, term3);
+
+    // Aligned store of the eight finished values
+    _mm_store_si128(p_target, sum);
+
+    p_target += 1;
+
+  }
+
+  // Scalar tail for the values that did not fill a whole register
+  for(i = bound * 8; i < (bound * 8) + leftovers; ++i) {
+    target[i] = src0[permute_indexes[i]]
+      + (cntl0[i] & scalars[0])
+      + (cntl1[i] & scalars[1])
+      + (cntl2[i] & scalars[2])
+      + (cntl3[i] & scalars[3]);
+  }
+
+}
+#endif /*LV_HAVE_SSE2*/
+
+
+#if LV_HAVE_GENERIC
+static inline void volk_16i_permute_and_scalar_add_a16_generic(short* target, short* src0, short* permute_indexes, short* cntl0, short* cntl1, short* cntl2, short* cntl3, short* scalars, unsigned int num_bytes) {
+  // num_bytes counts bytes; every value occupies two of them
+  const int num_values = num_bytes >> 1;
+  int idx;
+
+  for(idx = 0; idx < num_values; ++idx) {
+    // Start from the permuted source value, then add each control masked scalar
+    int sum = src0[permute_indexes[idx]];
+    sum += cntl0[idx] & scalars[0];
+    sum += cntl1[idx] & scalars[1];
+    sum += cntl2[idx] & scalars[2];
+    sum += cntl3[idx] & scalars[3];
+    target[idx] = sum;
+  }
+}
+
+#endif /*LV_HAVE_GENERIC*/
+
+
+#endif /*INCLUDED_volk_16i_permute_and_scalar_add_a16_H*/
diff --git a/volk/include/volk/volk_16i_s32f_convert_32f_a16.h b/volk/include/volk/volk_16i_s32f_convert_32f_a16.h
new file mode 100644
index 000000000..83fd26ff9
--- /dev/null
+++ b/volk/include/volk/volk_16i_s32f_convert_32f_a16.h
@@ -0,0 +1,119 @@
+#ifndef INCLUDED_volk_16i_s32f_convert_32f_a16_H
+#define INCLUDED_volk_16i_s32f_convert_32f_a16_H
+
+#include <inttypes.h>
+#include <stdio.h>
+
+#if LV_HAVE_SSE4_1
+#include <smmintrin.h>
+
+  /*!
+    \brief Converts 16 bit integers to floating point and divides every converted value by the scalar
+    \param outputVector The floating point output data buffer
+    \param inputVector The 16 bit input data buffer
+    \param scalar The value every converted point is divided by
+    \param num_points The number of data values to be converted
+  */
+static inline void volk_16i_s32f_convert_32f_a16_sse4_1(float* outputVector, const int16_t* inputVector, const float scalar, unsigned int num_points){
+    // Number of complete 8 value blocks handled by the vector loop
+    const unsigned int numBlocks = num_points / 8;
+    // 1/scalar broadcast to all four float lanes
+    const __m128 reciprocal = _mm_set_ps1(1.0/scalar);
+    // Read and write cursors for the vector loop
+    const int16_t* src = inputVector;
+    float* dst = outputVector;
+    unsigned int block;
+    unsigned int idx;
+
+    // Each pass converts eight inputs and writes two registers of four floats
+    for(block = 0; block < numBlocks; block++){
+      __m128i packed;
+      __m128i lowInts;
+      __m128i highInts;
+
+      // Fetch eight 16 bit values with one unaligned load
+      packed = _mm_loadu_si128((const __m128i*)src);
+
+      // Sign extend the lower four values to 32 bits
+      lowInts = _mm_cvtepi16_epi32(packed);
+
+      // Bring the upper four values down by 8 bytes, then sign extend them too
+      highInts = _mm_cvtepi16_epi32(_mm_srli_si128(packed, 8));
+
+      // Convert to float and scale by the reciprocal instead of dividing
+      _mm_storeu_ps(dst, _mm_mul_ps(_mm_cvtepi32_ps(lowInts), reciprocal));
+      _mm_storeu_ps(dst + 4, _mm_mul_ps(_mm_cvtepi32_ps(highInts), reciprocal));
+
+      // Step both cursors past the eight values just handled
+      src += 8;
+      dst += 8;
+    }
+
+    // Scalar tail: the remaining points are divided by the scalar directly
+    idx = numBlocks * 8;
+    while(idx < num_points){
+      outputVector[idx] = ((float)(inputVector[idx])) / scalar;
+      idx++;
+    }
+}
+#endif /* LV_HAVE_SSE4_1 */
+
+#if LV_HAVE_SSE
+#include <xmmintrin.h>
+
+  /*!
+    \brief Converts 16 bit integers to floating point and divides every converted value by the scalar
+    \param outputVector The floating point output data buffer
+    \param inputVector The 16 bit input data buffer
+    \param scalar The value every converted point is divided by
+    \param num_points The number of data values to be converted
+  */
+static inline void volk_16i_s32f_convert_32f_a16_sse(float* outputVector, const int16_t* inputVector, const float scalar, unsigned int num_points){
+    const unsigned int numBlocks = num_points / 4;
+    const __m128 reciprocal = _mm_set_ps1(1.0/scalar);
+    const int16_t* src = inputVector;
+    float* dst = outputVector;
+    unsigned int block;
+    unsigned int idx;
+
+    for(block = 0; block < numBlocks; block++){
+      // _mm_set_ps takes its arguments from the highest lane down to the lowest
+      const __m128 asFloats = _mm_set_ps((float)(src[3]), (float)(src[2]), (float)(src[1]), (float)(src[0]));
+      // Multiply by the reciprocal instead of dividing each lane
+      _mm_storeu_ps(dst, _mm_mul_ps(asFloats, reciprocal));
+      src += 4;
+      dst += 4;
+    }
+
+    // Scalar tail: the remaining points are divided by the scalar directly
+    idx = numBlocks * 4;
+    while(idx < num_points){
+      outputVector[idx] = (float)(inputVector[idx]) / scalar;
+      idx++;
+    }
+}
+#endif /* LV_HAVE_SSE */
+
+#if LV_HAVE_GENERIC
+  /*!
+    \brief Converts 16 bit integers to floating point and divides every converted value by the scalar
+    \param outputVector The floating point output data buffer
+    \param inputVector The 16 bit input data buffer
+    \param scalar The value every converted point is divided by
+    \param num_points The number of data values to be converted
+  */
+static inline void volk_16i_s32f_convert_32f_a16_generic(float* outputVector, const int16_t* inputVector, const float scalar, unsigned int num_points){
+  unsigned int idx;
+
+  // Plain C reference: convert each point to float, then divide by the scalar
+  for(idx = 0; idx < num_points; idx++){
+    const float asFloat = (float)(inputVector[idx]);
+    outputVector[idx] = asFloat / scalar;
+  }
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+
+
+#endif /* INCLUDED_volk_16i_s32f_convert_32f_a16_H */
diff --git a/volk/include/volk/volk_16i_s32f_convert_32f_u.h b/volk/include/volk/volk_16i_s32f_convert_32f_u.h
new file mode 100644
index 000000000..8f0dd0083
--- /dev/null
+++ b/volk/include/volk/volk_16i_s32f_convert_32f_u.h
@@ -0,0 +1,122 @@
+#ifndef INCLUDED_volk_16i_s32f_convert_32f_u_H
+#define INCLUDED_volk_16i_s32f_convert_32f_u_H
+
+#include <inttypes.h>
+#include <stdio.h>
+
+#if LV_HAVE_SSE4_1
+#include <smmintrin.h>
+
+  /*!
+    \brief Converts 16 bit integers to floating point and divides every converted value by the scalar
+    \param outputVector The floating point output data buffer
+    \param inputVector The 16 bit input data buffer
+    \param scalar The value every converted point is divided by
+    \param num_points The number of data values to be converted
+    \note Output buffer does NOT need to be properly aligned
+  */
+static inline void volk_16i_s32f_convert_32f_u_sse4_1(float* outputVector, const int16_t* inputVector, const float scalar, unsigned int num_points){
+    // Number of complete 8 value blocks handled by the vector loop
+    const unsigned int numBlocks = num_points / 8;
+    // 1/scalar broadcast to all four float lanes
+    const __m128 reciprocal = _mm_set_ps1(1.0/scalar);
+    // Read and write cursors for the vector loop
+    const int16_t* src = inputVector;
+    float* dst = outputVector;
+    unsigned int block;
+    unsigned int idx;
+
+    // Each pass converts eight inputs and writes two registers of four floats
+    for(block = 0; block < numBlocks; block++){
+      __m128i packed;
+      __m128i lowInts;
+      __m128i highInts;
+
+      // Fetch eight 16 bit values with one unaligned load
+      packed = _mm_loadu_si128((const __m128i*)src);
+
+      // Sign extend the lower four values to 32 bits
+      lowInts = _mm_cvtepi16_epi32(packed);
+
+      // Bring the upper four values down by 8 bytes, then sign extend them too
+      highInts = _mm_cvtepi16_epi32(_mm_srli_si128(packed, 8));
+
+      // Convert to float and scale by the reciprocal instead of dividing
+      _mm_storeu_ps(dst, _mm_mul_ps(_mm_cvtepi32_ps(lowInts), reciprocal));
+      _mm_storeu_ps(dst + 4, _mm_mul_ps(_mm_cvtepi32_ps(highInts), reciprocal));
+
+      // Step both cursors past the eight values just handled
+      src += 8;
+      dst += 8;
+    }
+
+    // Scalar tail: the remaining points are divided by the scalar directly
+    idx = numBlocks * 8;
+    while(idx < num_points){
+      outputVector[idx] = ((float)(inputVector[idx])) / scalar;
+      idx++;
+    }
+}
+#endif /* LV_HAVE_SSE4_1 */
+
+#if LV_HAVE_SSE
+#include <xmmintrin.h>
+
+  /*!
+    \brief Converts 16 bit integers to floating point and divides every converted value by the scalar
+    \param outputVector The floating point output data buffer
+    \param inputVector The 16 bit input data buffer
+    \param scalar The value every converted point is divided by
+    \param num_points The number of data values to be converted
+    \note Output buffer does NOT need to be properly aligned
+  */
+static inline void volk_16i_s32f_convert_32f_u_sse(float* outputVector, const int16_t* inputVector, const float scalar, unsigned int num_points){
+    const unsigned int numBlocks = num_points / 4;
+    const __m128 reciprocal = _mm_set_ps1(1.0/scalar);
+    const int16_t* src = inputVector;
+    float* dst = outputVector;
+    unsigned int block;
+    unsigned int idx;
+
+    for(block = 0; block < numBlocks; block++){
+      // _mm_set_ps takes its arguments from the highest lane down to the lowest
+      const __m128 asFloats = _mm_set_ps((float)(src[3]), (float)(src[2]), (float)(src[1]), (float)(src[0]));
+      // Multiply by the reciprocal instead of dividing each lane
+      _mm_storeu_ps(dst, _mm_mul_ps(asFloats, reciprocal));
+      src += 4;
+      dst += 4;
+    }
+
+    // Scalar tail: the remaining points are divided by the scalar directly
+    idx = numBlocks * 4;
+    while(idx < num_points){
+      outputVector[idx] = (float)(inputVector[idx]) / scalar;
+      idx++;
+    }
+}
+#endif /* LV_HAVE_SSE */
+
+#if LV_HAVE_GENERIC
+  /*!
+    \brief Converts 16 bit integers to floating point and divides every converted value by the scalar
+    \param outputVector The floating point output data buffer
+    \param inputVector The 16 bit input data buffer
+    \param scalar The value every converted point is divided by
+    \param num_points The number of data values to be converted
+    \note Output buffer does NOT need to be properly aligned
+  */
+static inline void volk_16i_s32f_convert_32f_u_generic(float* outputVector, const int16_t* inputVector, const float scalar, unsigned int num_points){
+  unsigned int idx;
+
+  // Plain C reference: convert each point to float, then divide by the scalar
+  for(idx = 0; idx < num_points; idx++){
+    const float asFloat = (float)(inputVector[idx]);
+    outputVector[idx] = asFloat / scalar;
+  }
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+
+
+#endif /* INCLUDED_volk_16i_s32f_convert_32f_u_H */
diff --git a/volk/include/volk/volk_16i_x4_quad_max_star_16i_a16.h b/volk/include/volk/volk_16i_x4_quad_max_star_16i_a16.h
new file mode 100644
index 000000000..e4ec5ab4e
--- /dev/null
+++ b/volk/include/volk/volk_16i_x4_quad_max_star_16i_a16.h
@@ -0,0 +1,191 @@
+#ifndef INCLUDED_volk_16i_x4_quad_max_star_16i_a16_H
+#define INCLUDED_volk_16i_x4_quad_max_star_16i_a16_H
+
+
+#include<inttypes.h>
+#include<stdio.h>	
+
+
+
+
+
+#if LV_HAVE_SSE2
+
+#include<emmintrin.h>
+
+static inline  void volk_16i_x4_quad_max_star_16i_a16_sse2(short* target, short* src0, short* src1, short* src2, short* src3, unsigned int num_bytes) {
+	// Element-wise "max star" of four 16 bit buffers: the winner of (src0, src1)
+	// is compared against the winner of (src2, src3) and written to target.
+	// num_bytes is the buffer size in bytes (two bytes per value); the vector loop
+	// accesses all five buffers with aligned 16 byte loads and stores.
+	int i = 0;
+	// bound: number of whole 8 value registers; leftovers: values left for the scalar tail
+	int bound = (num_bytes >> 4);
+	int bound_copy = bound;
+	int leftovers = (num_bytes >> 1) & 7;
+	
+	__m128i *p_target, *p_src0, *p_src1, *p_src2, *p_src3;
+	p_target = (__m128i*) target;
+	p_src0 =  (__m128i*)src0;
+	p_src1 =  (__m128i*)src1;
+	p_src2 =  (__m128i*)src2;
+	p_src3 =  (__m128i*)src3;
+	
+	
+
+	__m128i xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
+
+	while(bound_copy > 0) {
+	  // Load one register from each of the four sources
+	  xmm1 = _mm_load_si128(p_src0);
+	  xmm2 = _mm_load_si128(p_src1);
+	  xmm3 = _mm_load_si128(p_src2);
+	  xmm4 = _mm_load_si128(p_src3);
+	  // xmm5/xmm6 become the zero operands of the sign tests; xmm7/xmm8 keep src0/src2
+	  xmm5 = _mm_setzero_si128();
+	  xmm6 = _mm_setzero_si128();
+	  xmm7 = xmm1;
+	  xmm8 = xmm3;
+	  
+	  // xmm1 = src1 - src0
+	  xmm1 = _mm_sub_epi16(xmm2, xmm1);
+
+	  // xmm3 = src3 - src2
+
+	  xmm3 = _mm_sub_epi16(xmm4, xmm3);
+	  // All ones in the lanes where the difference is greater than zero
+	  xmm5 = _mm_cmpgt_epi16(xmm1, xmm5);
+	  xmm6 = _mm_cmpgt_epi16(xmm3, xmm6);
+
+	  // Blend: src1/src3 where the mask is set, src0/src2 where it is clear
+
+	  xmm2 = _mm_and_si128(xmm5, xmm2);
+	  xmm4 = _mm_and_si128(xmm6, xmm4);
+	  xmm5 = _mm_andnot_si128(xmm5, xmm7);
+	  xmm6 = _mm_andnot_si128(xmm6, xmm8);
+	  // At most one operand is non-zero in each lane, so the addition merges them
+	  xmm5 = _mm_add_epi16(xmm2, xmm5);
+	  xmm6 = _mm_add_epi16(xmm4, xmm6);
+
+	  // Second round: the same compare and blend between the two winners
+	  xmm1 = _mm_xor_si128(xmm1, xmm1);
+	  xmm2 = xmm5;
+	  xmm5 = _mm_sub_epi16(xmm6, xmm5);
+	  p_src0 += 1;
+	  bound_copy -= 1;
+
+	  xmm1 = _mm_cmpgt_epi16(xmm5, xmm1);
+	  p_src1 += 1;
+	  
+	  xmm6 = _mm_and_si128(xmm1, xmm6);
+	  	  
+	  xmm1 = _mm_andnot_si128(xmm1, xmm2);
+	  p_src2 += 1;
+
+
+	  
+	  xmm1 = _mm_add_epi16(xmm6, xmm1);
+	  p_src3 += 1;
+
+	  
+	  _mm_store_si128(p_target, xmm1);
+	  p_target += 1;
+      
+	}
+	
+	// Disabled inline assembly variant of the vector loop
+	/*asm volatile
+		(
+		 "volk_16i_x4_quad_max_star_16i_a16_sse2_L1:\n\t"
+		 "cmp $0, %[bound]\n\t"
+		 "je volk_16i_x4_quad_max_star_16i_a16_sse2_END\n\t"
+
+		 "movaps (%[src0]), %%xmm1\n\t"
+		 "movaps (%[src1]), %%xmm2\n\t"
+		 "movaps (%[src2]), %%xmm3\n\t"
+		 "movaps (%[src3]), %%xmm4\n\t"
+
+		 "pxor %%xmm5, %%xmm5\n\t"
+		 "pxor %%xmm6, %%xmm6\n\t"
+		 "movaps %%xmm1, %%xmm7\n\t"
+		 "movaps %%xmm3, %%xmm8\n\t"
+		 "psubw %%xmm2, %%xmm1\n\t"
+		 "psubw %%xmm4, %%xmm3\n\t"
+		 
+		 "pcmpgtw %%xmm1, %%xmm5\n\t"
+		 "pcmpgtw %%xmm3, %%xmm6\n\t"
+		 
+		 "pand %%xmm5, %%xmm2\n\t"
+		 "pand %%xmm6, %%xmm4\n\t"
+		 "pandn %%xmm7, %%xmm5\n\t"
+		 "pandn %%xmm8, %%xmm6\n\t"
+		 
+		 "paddw %%xmm2, %%xmm5\n\t"
+		 "paddw %%xmm4, %%xmm6\n\t"
+
+		 "pxor %%xmm1, %%xmm1\n\t"
+		 "movaps %%xmm5, %%xmm2\n\t"
+		 
+		 "psubw %%xmm6, %%xmm5\n\t"
+		 "add $16, %[src0]\n\t"
+		 "add $-1, %[bound]\n\t"
+	  
+		 "pcmpgtw %%xmm5, %%xmm1\n\t"
+		 "add $16, %[src1]\n\t"
+
+		 "pand %%xmm1, %%xmm6\n\t"
+
+		 "pandn %%xmm2, %%xmm1\n\t"
+		 "add $16, %[src2]\n\t"
+
+		 "paddw %%xmm6, %%xmm1\n\t"
+		 "add $16, %[src3]\n\t"
+
+		 "movaps %%xmm1, (%[target])\n\t"
+		 "addw $16, %[target]\n\t"
+		 "jmp volk_16i_x4_quad_max_star_16i_a16_sse2_L1\n\t"
+		 
+		 "volk_16i_x4_quad_max_star_16i_a16_sse2_END:\n\t"
+		 :
+		 :[bound]"r"(bound), [src0]"r"(src0), [src1]"r"(src1), [src2]"r"(src2), [src3]"r"(src3), [target]"r"(target)
+		 :
+		 );
+	*/ 
+	// Scalar tail for the values that did not fill a whole register
+	short temp0 = 0;
+	short temp1 = 0;
+	for(i = bound * 8; i < (bound * 8) + leftovers; ++i) {
+	  temp0 = ((short)(src0[i] - src1[i]) > 0) ? src0[i] : src1[i];
+	  temp1 = ((short)(src2[i] - src3[i])>0) ? src2[i] : src3[i];
+	  target[i] = ((short)(temp0 - temp1)>0) ? temp0 : temp1;
+	}
+	return;
+
+
+}
+
+#endif /*LV_HAVE_SSE2*/
+
+
+#if LV_HAVE_GENERIC
+static inline void volk_16i_x4_quad_max_star_16i_a16_generic(short* target, short* src0, short* src1, short* src2, short* src3, unsigned int num_bytes) {
+  // num_bytes counts bytes; every value occupies two of them
+  const int num_values = num_bytes >> 1;
+  int idx;
+
+  for(idx = 0; idx < num_values; ++idx) {
+    // First round: winner of (src0, src1) and winner of (src2, src3)
+    const short winner01 = ((short)(src0[idx] - src1[idx]) > 0) ? src0[idx] : src1[idx];
+    const short winner23 = ((short)(src2[idx] - src3[idx]) > 0) ? src2[idx] : src3[idx];
+
+    // Final round between the two winners
+    target[idx] = ((short)(winner01 - winner23) > 0) ? winner01 : winner23;
+  }
+}
+
+
+
+
+#endif /*LV_HAVE_GENERIC*/
+
+#endif /*INCLUDED_volk_16i_x4_quad_max_star_16i_a16_H*/
diff --git a/volk/include/volk/volk_16i_x5_add_quad_16i_x4_a16.h b/volk/include/volk/volk_16i_x5_add_quad_16i_x4_a16.h
new file mode 100644
index 000000000..5744ca3a6
--- /dev/null
+++ b/volk/include/volk/volk_16i_x5_add_quad_16i_x4_a16.h
@@ -0,0 +1,136 @@
+#ifndef INCLUDED_volk_16i_x5_add_quad_16i_x4_a16_H
+#define INCLUDED_volk_16i_x5_add_quad_16i_x4_a16_H
+
+
+#include<inttypes.h>
+#include<stdio.h>	
+
+
+
+
+
+#if LV_HAVE_SSE2
+#include<xmmintrin.h>
+#include<emmintrin.h>
+
+static inline  void volk_16i_x5_add_quad_16i_x4_a16_sse2(short* target0, short* target1, short* target2, short* target3, short* src0, short* src1, short* src2, short* src3, short* src4, unsigned int num_bytes) {
+  // Adds src0 to each of src1..src4 element-wise and stores the sums in target0..target3; num_bytes is the buffer size in bytes
+  __m128i xmm0, xmm1, xmm2, xmm3, xmm4;
+  __m128i *p_target0, *p_target1, *p_target2, *p_target3,  *p_src0, *p_src1, *p_src2, *p_src3, *p_src4;
+  p_target0 = (__m128i*)target0;
+  p_target1 = (__m128i*)target1;
+  p_target2 = (__m128i*)target2;
+  p_target3 = (__m128i*)target3;
+  // Vector cursors over the five sources
+  p_src0 = (__m128i*)src0;
+  p_src1 = (__m128i*)src1;
+  p_src2 = (__m128i*)src2;
+  p_src3 = (__m128i*)src3;
+  p_src4 = (__m128i*)src4;
+
+  int i = 0;
+  // bound: number of whole 8 value registers; leftovers: values left for the scalar tail
+  int bound = (num_bytes >> 4);
+  int leftovers = (num_bytes >> 1) & 7;
+
+  for(; i < bound; ++i) {
+    xmm0 = _mm_load_si128(p_src0);
+    xmm1 = _mm_load_si128(p_src1);
+    xmm2 = _mm_load_si128(p_src2);
+    xmm3 = _mm_load_si128(p_src3);
+    xmm4 = _mm_load_si128(p_src4);
+    
+    p_src0 += 1;
+    p_src1 += 1;
+    // xmm0 (src0) is the common addend of all four sums
+    xmm1 = _mm_add_epi16(xmm0, xmm1);
+    xmm2 = _mm_add_epi16(xmm0, xmm2);
+    xmm3 = _mm_add_epi16(xmm0, xmm3);
+    xmm4 = _mm_add_epi16(xmm0, xmm4);
+    
+    
+    p_src2 += 1;
+    p_src3 += 1;
+    p_src4 += 1;
+    // Aligned stores of the four results
+    _mm_store_si128(p_target0, xmm1);
+    _mm_store_si128(p_target1, xmm2);
+    _mm_store_si128(p_target2, xmm3);
+    _mm_store_si128(p_target3, xmm4);
+    
+    p_target0 += 1;
+    p_target1 += 1;
+    p_target2 += 1;
+    p_target3 += 1;
+  }
+    /*asm volatile
+		(
+		 ".%=volk_16i_x5_add_quad_16i_x4_a16_sse2_L1:\n\t"
+		 "cmp $0, %[bound]\n\t"
+		 "je .%=volk_16i_x5_add_quad_16i_x4_a16_sse2_END\n\t"
+		 "movaps (%[src0]), %%xmm1\n\t"
+		 "movaps (%[src1]), %%xmm2\n\t"
+		 "movaps (%[src2]), %%xmm3\n\t"
+		 "movaps (%[src3]), %%xmm4\n\t"
+		 "movaps (%[src4]), %%xmm5\n\t"
+		 "add $16, %[src0]\n\t"
+		 "add $16, %[src1]\n\t"
+		 "add $16, %[src2]\n\t"
+		 "add $16, %[src3]\n\t"
+		 "add $16, %[src4]\n\t"
+		 "paddw %%xmm1, %%xmm2\n\t"
+		 "paddw %%xmm1, %%xmm3\n\t"
+		 "paddw %%xmm1, %%xmm4\n\t"
+		 "paddw %%xmm1, %%xmm5\n\t"
+		 "add $-1, %[bound]\n\t"
+		 "movaps %%xmm2, (%[target0])\n\t"
+		 "movaps %%xmm3, (%[target1])\n\t"
+		 "movaps %%xmm4, (%[target2])\n\t"
+		 "movaps %%xmm5, (%[target3])\n\t"
+		 "add $16, %[target0]\n\t"
+		 "add $16, %[target1]\n\t"
+		 "add $16, %[target2]\n\t"
+		 "add $16, %[target3]\n\t"
+		 "jmp .%=volk_16i_x5_add_quad_16i_x4_a16_sse2_L1\n\t"
+		 ".%=volk_16i_x5_add_quad_16i_x4_a16_sse2_END:\n\t"
+		 :
+		 :[bound]"r"(bound), [src0]"r"(src0), [src1]"r"(src1), [src2]"r"(src2), [src3]"r"(src3), [src4]"r"(src4), [target0]"r"(target0), [target1]"r"(target1), [target2]"r"(target2), [target3]"r"(target3)
+		 :"xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+		 );
+		 
+    */
+  // The block above is a disabled inline assembly variant of the vector loop
+  // Scalar tail for the values that did not fill a whole register
+  for(i = bound * 8; i < (bound * 8) + leftovers; ++i) {
+    target0[i] = src0[i] + src1[i];
+    target1[i] = src0[i] + src2[i];
+    target2[i] = src0[i] + src3[i];
+    target3[i] = src0[i] + src4[i];
+  }
+}
+#endif /*LV_HAVE_SSE2*/
+
+
+#if LV_HAVE_GENERIC
+
+static inline void volk_16i_x5_add_quad_16i_x4_a16_generic(short* target0, short* target1, short* target2, short* target3, short* src0, short* src1, short* src2, short* src3, short* src4, unsigned int num_bytes) {
+  // num_bytes counts bytes; every value occupies two of them
+  const int num_values = num_bytes >> 1;
+  int idx = 0;
+  // Every output is src0 plus one of the remaining four sources
+  while(idx < num_values) {
+    target0[idx] = src0[idx] + src1[idx];
+    target1[idx] = src0[idx] + src2[idx];
+    target2[idx] = src0[idx] + src3[idx];
+    target3[idx] = src0[idx] + src4[idx];
+    ++idx;
+  }
+}
+
+#endif /* LV_HAVE_GENERIC */
+
+
+
+
+
+#endif /*INCLUDED_volk_16i_x5_add_quad_16i_x4_a16_H*/
diff --git a/volk/include/volk/volk_16ic_deinterleave_16i_x2_a16.h b/volk/include/volk/volk_16ic_deinterleave_16i_x2_a16.h
new file mode 100644
index 000000000..7e08bf182
--- /dev/null
+++ b/volk/include/volk/volk_16ic_deinterleave_16i_x2_a16.h
@@ -0,0 +1,158 @@
+#ifndef INCLUDED_volk_16ic_deinterleave_16i_x2_a16_H
+#define INCLUDED_volk_16ic_deinterleave_16i_x2_a16_H
+
+#include <inttypes.h>
+#include <stdio.h>
+
+#if LV_HAVE_SSSE3
+#include <tmmintrin.h>
+/*!
+  \brief Splits interleaved complex 16 bit samples into separate I and Q vectors
+  \param iBuffer Output buffer receiving the I (first) component of each sample; written with aligned 128 bit stores
+  \param qBuffer Output buffer receiving the Q (second) component of each sample; written with aligned 128 bit stores
+  \param complexVector Interleaved complex input; read with aligned 128 bit loads
+  \param num_points The number of complex samples to deinterleave
+*/
+static inline void volk_16ic_deinterleave_16i_x2_a16_ssse3(int16_t* iBuffer, int16_t* qBuffer, const lv_16sc_t* complexVector, unsigned int num_points){
+  const unsigned int blockCount = num_points / 8;
+  const __m128i* inBlock = (const __m128i*)complexVector;
+  __m128i* iOut = (__m128i*)iBuffer;
+  __m128i* qOut = (__m128i*)qBuffer;
+  unsigned int idx;
+
+  // pshufb selectors: a selector byte with its high bit set (0x80) produces a zero byte
+  const __m128i pickILow  = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 13, 12, 9, 8, 5, 4, 1, 0);
+  const __m128i pickIHigh = _mm_set_epi8(13, 12, 9, 8, 5, 4, 1, 0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80);
+  const __m128i pickQLow  = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 15, 14, 11, 10, 7, 6, 3, 2);
+  const __m128i pickQHigh = _mm_set_epi8(15, 14, 11, 10, 7, 6, 3, 2, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80);
+
+  for(idx = 0; idx < blockCount; idx++){
+    // Two aligned loads cover eight complex samples
+    const __m128i first = _mm_load_si128(inBlock++);
+    const __m128i second = _mm_load_si128(inBlock++);
+
+    // The first register fills the lower 64 bits of each output, the second the upper 64 bits
+    const __m128i iLanes = _mm_or_si128(_mm_shuffle_epi8(first, pickILow), _mm_shuffle_epi8(second, pickIHigh));
+    const __m128i qLanes = _mm_or_si128(_mm_shuffle_epi8(first, pickQLow), _mm_shuffle_epi8(second, pickQHigh));
+
+    _mm_store_si128(iOut++, iLanes);
+    _mm_store_si128(qOut++, qLanes);
+  }
+
+  // Scalar tail for the final num_points % 8 samples
+  {
+    const int16_t* tailIn = (const int16_t*)inBlock;
+    int16_t* iTail = (int16_t*)iOut;
+    int16_t* qTail = (int16_t*)qOut;
+    for(idx = blockCount * 8; idx < num_points; idx++){
+      *iTail++ = *tailIn++;
+      *qTail++ = *tailIn++;
+    }
+  }
+}
+#endif /* LV_HAVE_SSSE3 */
+
+#if LV_HAVE_SSE2
+#include <emmintrin.h>
+/*!
+  \brief Deinterleaves the complex 16 bit vector into I & Q vector data using SSE2 shuffles
+  \param iBuffer The I buffer output data (written with aligned 128 bit stores)
+  \param qBuffer The Q buffer output data (written with aligned 128 bit stores)
+  \param complexVector The complex input vector (read with aligned 128 bit loads)
+  \param num_points The number of complex data values to be deinterleaved
+*/
+static inline void volk_16ic_deinterleave_16i_x2_a16_sse2(int16_t* iBuffer, int16_t* qBuffer, const lv_16sc_t* complexVector, unsigned int num_points){
+  unsigned int number = 0;
+  const int16_t* complexVectorPtr = (int16_t*)complexVector;
+  int16_t* iBufferPtr = iBuffer;
+  int16_t* qBufferPtr = qBuffer;
+  __m128i complexVal1, complexVal2, iComplexVal1, iComplexVal2, qComplexVal1, qComplexVal2, iOutputVal, qOutputVal;
+  __m128i lowMask = _mm_set_epi32(0x0, 0x0, 0xFFFFFFFF, 0xFFFFFFFF);
+  __m128i highMask = _mm_set_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0x0, 0x0);
+  // lowMask keeps the lower 64 bits of a register, highMask keeps the upper 64 bits
+  unsigned int eighthPoints = num_points / 8;
+  // Each vector iteration consumes two registers, i.e. eight complex samples
+  for(number = 0; number < eighthPoints; number++){
+    complexVal1 = _mm_load_si128((__m128i*)complexVectorPtr);  complexVectorPtr += 8;
+    complexVal2 = _mm_load_si128((__m128i*)complexVectorPtr);  complexVectorPtr += 8;
+    // I path, low four 16 bit lanes: reorder [I0 Q0 I1 Q1] to [I0 I1 Q0 Q1]
+    iComplexVal1 = _mm_shufflelo_epi16(complexVal1, _MM_SHUFFLE(3,1,2,0));
+    // Same reordering in the high four lanes: [I2 Q2 I3 Q3] to [I2 I3 Q2 Q3]
+    iComplexVal1 = _mm_shufflehi_epi16(iComplexVal1, _MM_SHUFFLE(3,1,2,0));
+    // Swap the two middle 32 bit lanes so I0..I3 occupy the lower 64 bits
+    iComplexVal1 = _mm_shuffle_epi32(iComplexVal1, _MM_SHUFFLE(3,1,2,0));
+    // Apply the same 16 bit lane reordering to the second register
+    iComplexVal2 = _mm_shufflelo_epi16(complexVal2, _MM_SHUFFLE(3,1,2,0));
+
+    iComplexVal2 = _mm_shufflehi_epi16(iComplexVal2, _MM_SHUFFLE(3,1,2,0));
+    // This 32 bit shuffle instead places I4..I7 in the upper 64 bits
+    iComplexVal2 = _mm_shuffle_epi32(iComplexVal2, _MM_SHUFFLE(2,0,3,1));
+    // Merge the lower half of the first register with the upper half of the second
+    iOutputVal = _mm_or_si128(_mm_and_si128(iComplexVal1, lowMask), _mm_and_si128(iComplexVal2, highMask));
+
+    _mm_store_si128((__m128i*)iBufferPtr, iOutputVal);
+    // Q path: reorder each group of four lanes from [I Q I Q] to [Q Q I I]
+    qComplexVal1 = _mm_shufflelo_epi16(complexVal1, _MM_SHUFFLE(2,0,3,1));
+
+    qComplexVal1 = _mm_shufflehi_epi16(qComplexVal1, _MM_SHUFFLE(2,0,3,1));
+    // Swap the two middle 32 bit lanes so Q0..Q3 occupy the lower 64 bits
+    qComplexVal1 = _mm_shuffle_epi32(qComplexVal1, _MM_SHUFFLE(3,1,2,0));
+    // Apply the same 16 bit lane reordering to the second register
+    qComplexVal2 = _mm_shufflelo_epi16(complexVal2, _MM_SHUFFLE(2,0,3,1));
+
+    qComplexVal2 = _mm_shufflehi_epi16(qComplexVal2, _MM_SHUFFLE(2,0,3,1));
+    // This 32 bit shuffle places Q4..Q7 in the upper 64 bits
+    qComplexVal2 = _mm_shuffle_epi32(qComplexVal2, _MM_SHUFFLE(2,0,3,1));
+    // Merge the lower half of the first register with the upper half of the second
+    qOutputVal = _mm_or_si128(_mm_and_si128(qComplexVal1, lowMask), _mm_and_si128(qComplexVal2, highMask));
+
+    _mm_store_si128((__m128i*)qBufferPtr, qOutputVal);
+
+    iBufferPtr += 8;
+    qBufferPtr += 8;
+  }
+  // Scalar tail for the remaining num_points % 8 samples
+  number = eighthPoints * 8;
+  for(; number < num_points; number++){
+    *iBufferPtr++ = *complexVectorPtr++;
+    *qBufferPtr++ = *complexVectorPtr++;
+  }
+}
+#endif /* LV_HAVE_SSE2 */
+
+#if LV_HAVE_GENERIC
+/*!
+  \brief Splits interleaved complex 16 bit samples into separate I and Q vectors
+  \param iBuffer Output buffer receiving the I (first) component of each sample
+  \param qBuffer Output buffer receiving the Q (second) component of each sample
+  \param complexVector Interleaved input, read as consecutive I, Q pairs of int16_t
+  \param num_points The number of complex samples to deinterleave
+*/
+static inline void volk_16ic_deinterleave_16i_x2_a16_generic(int16_t* iBuffer, int16_t* qBuffer, const lv_16sc_t* complexVector, unsigned int num_points){
+  const int16_t* interleaved = (const int16_t*)complexVector;
+  unsigned int n;
+
+  // Each pass consumes one I value followed by one Q value
+  for(n = 0; n < num_points; n++){
+    iBuffer[n] = *interleaved++;
+    qBuffer[n] = *interleaved++;
+  }
+}
+#endif /* LV_HAVE_GENERIC */
+
+#if LV_HAVE_ORC
+/*!
+  \brief ORC variant: forwards all arguments unchanged to volk_16ic_deinterleave_16i_x2_a16_orc_impl, which is only declared (extern) here
+  \param iBuffer The I buffer output data
+  \param qBuffer The Q buffer output data
+  \param complexVector The complex input vector
+  \param num_points The number of complex data values to be deinterleaved
+*/
+extern void volk_16ic_deinterleave_16i_x2_a16_orc_impl(int16_t* iBuffer, int16_t* qBuffer, const lv_16sc_t* complexVector, unsigned int num_points);
+static inline void volk_16ic_deinterleave_16i_x2_a16_orc(int16_t* iBuffer, int16_t* qBuffer, const lv_16sc_t* complexVector, unsigned int num_points){
+    volk_16ic_deinterleave_16i_x2_a16_orc_impl(iBuffer, qBuffer, complexVector, num_points);
+}
+#endif /* LV_HAVE_ORC */
+
+
+#endif /* INCLUDED_volk_16ic_deinterleave_16i_x2_a16_H */
diff --git a/volk/include/volk/volk_16ic_deinterleave_real_16i_a16.h b/volk/include/volk/volk_16ic_deinterleave_real_16i_a16.h
new file mode 100644
index 000000000..388c00592
--- /dev/null
+++ b/volk/include/volk/volk_16ic_deinterleave_real_16i_a16.h
@@ -0,0 +1,120 @@
+#ifndef INCLUDED_volk_16ic_deinterleave_real_16i_a16_H
+#define INCLUDED_volk_16ic_deinterleave_real_16i_a16_H
+
+#include <inttypes.h>
+#include <stdio.h>
+
+#if LV_HAVE_SSSE3
+#include <tmmintrin.h>
+/*!
+  \brief Copies the I (first) component of each complex 16 bit sample into iBuffer
+  \param iBuffer Output buffer receiving one int16_t per complex sample; written with aligned 128 bit stores
+  \param complexVector Interleaved complex input; read with aligned 128 bit loads
+  \param num_points The number of complex samples to process
+*/
+static inline void volk_16ic_deinterleave_real_16i_a16_ssse3(int16_t* iBuffer, const lv_16sc_t* complexVector, unsigned int num_points){
+  const unsigned int blockCount = num_points / 8;
+  const __m128i* inBlock = (const __m128i*)complexVector;
+  __m128i* iOut = (__m128i*)iBuffer;
+  unsigned int idx;
+
+  // pshufb selectors: a selector byte with its high bit set (0x80) produces a zero byte
+  const __m128i pickILow  = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 13, 12, 9, 8, 5, 4, 1, 0);
+  const __m128i pickIHigh = _mm_set_epi8(13, 12, 9, 8, 5, 4, 1, 0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80);
+
+  for(idx = 0; idx < blockCount; idx++){
+    // Two aligned loads cover eight complex samples
+    const __m128i first = _mm_load_si128(inBlock++);
+    const __m128i second = _mm_load_si128(inBlock++);
+
+    // I0..I3 land in the lower 64 bits, I4..I7 in the upper 64 bits
+    const __m128i iLanes = _mm_or_si128(_mm_shuffle_epi8(first, pickILow), _mm_shuffle_epi8(second, pickIHigh));
+
+    // Aligned store of eight I values
+    _mm_store_si128(iOut++, iLanes);
+  }
+
+  // Scalar tail for the final num_points % 8 samples
+  {
+    const int16_t* tailIn = (const int16_t*)inBlock;
+    int16_t* iTail = (int16_t*)iOut;
+    for(idx = blockCount * 8; idx < num_points; idx++){
+      *iTail++ = *tailIn;
+      tailIn += 2; // skip the Q component
+    }
+  }
+}
+#endif /* LV_HAVE_SSSE3 */
+
+
+#if LV_HAVE_SSE2
+#include <emmintrin.h>
+/*!
+  \brief Deinterleaves the complex 16 bit vector into I vector data using SSE2 shuffles
+  \param iBuffer The I buffer output data (written with aligned 128 bit stores)
+  \param complexVector The complex input vector (read with aligned 128 bit loads)
+  \param num_points The number of complex data values to be deinterleaved
+*/
+static inline void volk_16ic_deinterleave_real_16i_a16_sse2(int16_t* iBuffer, const lv_16sc_t* complexVector, unsigned int num_points){
+  unsigned int number = 0;
+  const int16_t* complexVectorPtr = (int16_t*)complexVector;
+  int16_t* iBufferPtr = iBuffer;
+  __m128i complexVal1, complexVal2, iOutputVal;
+  __m128i lowMask = _mm_set_epi32(0x0, 0x0, 0xFFFFFFFF, 0xFFFFFFFF);
+  __m128i highMask = _mm_set_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0x0, 0x0);
+  // lowMask keeps the lower 64 bits of a register, highMask keeps the upper 64 bits
+  unsigned int eighthPoints = num_points / 8;
+  // Each vector iteration consumes two registers, i.e. eight complex samples
+  for(number = 0; number < eighthPoints; number++){
+    complexVal1 = _mm_load_si128((__m128i*)complexVectorPtr);  complexVectorPtr += 8;
+    complexVal2 = _mm_load_si128((__m128i*)complexVectorPtr);  complexVectorPtr += 8;
+    // Low four 16 bit lanes: reorder [I0 Q0 I1 Q1] to [I0 I1 Q0 Q1]
+    complexVal1 = _mm_shufflelo_epi16(complexVal1, _MM_SHUFFLE(3,1,2,0));
+    // Same reordering in the high four lanes: [I2 Q2 I3 Q3] to [I2 I3 Q2 Q3]
+    complexVal1 = _mm_shufflehi_epi16(complexVal1, _MM_SHUFFLE(3,1,2,0));
+    // Swap the two middle 32 bit lanes so I0..I3 occupy the lower 64 bits
+    complexVal1 = _mm_shuffle_epi32(complexVal1, _MM_SHUFFLE(3,1,2,0));
+    // Apply the same 16 bit lane reordering to the second register
+    complexVal2 = _mm_shufflelo_epi16(complexVal2, _MM_SHUFFLE(3,1,2,0));
+
+    complexVal2 = _mm_shufflehi_epi16(complexVal2, _MM_SHUFFLE(3,1,2,0));
+    // This 32 bit shuffle instead places I4..I7 in the upper 64 bits
+    complexVal2 = _mm_shuffle_epi32(complexVal2, _MM_SHUFFLE(2,0,3,1));
+    // Merge the lower half of the first register with the upper half of the second
+    iOutputVal = _mm_or_si128(_mm_and_si128(complexVal1, lowMask), _mm_and_si128(complexVal2, highMask));
+
+    _mm_store_si128((__m128i*)iBufferPtr, iOutputVal);
+
+    iBufferPtr += 8;
+  }
+  // Scalar tail for the remaining num_points % 8 samples; the extra increment skips Q
+  number = eighthPoints * 8;
+  for(; number < num_points; number++){
+    *iBufferPtr++ = *complexVectorPtr++;
+    complexVectorPtr++;
+  }
+}
+#endif /* LV_HAVE_SSE2 */
+
+#if LV_HAVE_GENERIC
+/*!
+  \brief Copies the I (first) component of each complex 16 bit sample into iBuffer
+  \param iBuffer Output buffer receiving one int16_t per complex sample
+  \param complexVector Interleaved input, read as consecutive I, Q pairs of int16_t
+  \param num_points The number of complex samples to process
+*/
+static inline void volk_16ic_deinterleave_real_16i_a16_generic(int16_t* iBuffer, const lv_16sc_t* complexVector, unsigned int num_points){
+  const int16_t* interleaved = (const int16_t*)complexVector;
+  unsigned int n;
+
+  for(n = 0; n < num_points; n++){
+    iBuffer[n] = *interleaved;
+    interleaved += 2; // step over the unused Q component
+  }
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+
+
+#endif /* INCLUDED_volk_16ic_deinterleave_real_16i_a16_H */
diff --git a/volk/include/volk/volk_16ic_deinterleave_real_8i_a16.h b/volk/include/volk/volk_16ic_deinterleave_real_8i_a16.h
new file mode 100644
index 000000000..437d5ab6b
--- /dev/null
+++ b/volk/include/volk/volk_16ic_deinterleave_real_8i_a16.h
@@ -0,0 +1,94 @@
+#ifndef INCLUDED_volk_16ic_deinterleave_real_8i_a16_H
+#define INCLUDED_volk_16ic_deinterleave_real_8i_a16_H
+
+#include <inttypes.h>
+#include <stdio.h>
+
+#if LV_HAVE_SSSE3
+#include <tmmintrin.h>
+/*!
+  \brief Extracts the I component of each complex 16 bit sample and keeps its upper 8 bits
+  \param iBuffer Output buffer receiving one int8_t per complex sample; written with aligned 128 bit stores
+  \param complexVector Interleaved complex input; read with aligned 128 bit loads
+  \param num_points The number of complex samples to process
+*/
+static inline void volk_16ic_deinterleave_real_8i_a16_ssse3(int8_t* iBuffer, const lv_16sc_t* complexVector, unsigned int num_points){
+  unsigned int number = 0;
+  const int8_t* complexVectorPtr = (const int8_t*)complexVector;
+  int8_t* iBufferPtr = iBuffer;
+  __m128i iMoveMask1 = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 13, 12, 9, 8, 5, 4, 1, 0);
+  __m128i iMoveMask2 = _mm_set_epi8(13, 12, 9, 8, 5, 4, 1, 0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80);
+  __m128i complexVal1, complexVal2, complexVal3, complexVal4, iOutputVal;
+
+  unsigned int sixteenthPoints = num_points / 16;
+
+  for(number = 0; number < sixteenthPoints; number++){
+    complexVal1 = _mm_load_si128((const __m128i*)complexVectorPtr);  complexVectorPtr += 16;
+    complexVal2 = _mm_load_si128((const __m128i*)complexVectorPtr);  complexVectorPtr += 16;
+
+    complexVal3 = _mm_load_si128((const __m128i*)complexVectorPtr);  complexVectorPtr += 16;
+    complexVal4 = _mm_load_si128((const __m128i*)complexVectorPtr);  complexVectorPtr += 16;
+    // Gather I0..I7 as 16 bit lanes into complexVal1
+    complexVal1 = _mm_shuffle_epi8(complexVal1, iMoveMask1);
+    complexVal2 = _mm_shuffle_epi8(complexVal2, iMoveMask2);
+
+    complexVal1 = _mm_or_si128(complexVal1, complexVal2);
+    // Gather I8..I15 as 16 bit lanes into complexVal3
+    complexVal3 = _mm_shuffle_epi8(complexVal3, iMoveMask1);
+    complexVal4 = _mm_shuffle_epi8(complexVal4, iMoveMask2);
+
+    complexVal3 = _mm_or_si128(complexVal3, complexVal4);
+
+    // Arithmetic shift keeps the upper 8 bits of every value together with its sign
+    complexVal1 = _mm_srai_epi16(complexVal1, 8);
+    complexVal3 = _mm_srai_epi16(complexVal3, 8);
+    // Narrow to 8 bits; the shifted values already fit, so the pack never has to saturate
+    iOutputVal = _mm_packs_epi16(complexVal1, complexVal3);
+
+    _mm_store_si128((__m128i*)iBufferPtr, iOutputVal);
+
+    iBufferPtr += 16;
+  }
+  // Tail: >> 8 matches _mm_srai_epi16 above; / 256 would round negative values toward zero instead
+  number = sixteenthPoints * 16;
+  const int16_t* int16ComplexVectorPtr = (const int16_t*)complexVectorPtr;
+  for(; number < num_points; number++){
+    *iBufferPtr++ = ((int8_t)(*int16ComplexVectorPtr++ >> 8));
+    int16ComplexVectorPtr++;
+  }
+}
+#endif /* LV_HAVE_SSSE3 */
+
+#if LV_HAVE_GENERIC
+/*!
+  \brief Extracts the I component of each complex 16 bit sample and keeps its upper 8 bits
+  \param iBuffer Output buffer receiving one int8_t per complex sample
+  \param complexVector Interleaved input, read as consecutive I, Q pairs of int16_t
+  \param num_points The number of complex samples to process
+*/
+static inline void volk_16ic_deinterleave_real_8i_a16_generic(int8_t* iBuffer, const lv_16sc_t* complexVector, unsigned int num_points){
+  const int16_t* complexVectorPtr = (const int16_t*)complexVector;
+  unsigned int number;
+  for(number = 0; number < num_points; number++){
+    // Arithmetic shift (not / 256) so negative inputs round like the SSSE3 kernel's _mm_srai_epi16
+    iBuffer[number] = (int8_t)(*complexVectorPtr >> 8);
+    complexVectorPtr += 2; // skip the Q component
+  }
+}
+#endif /* LV_HAVE_GENERIC */
+
+#if LV_HAVE_ORC
+/*!
+  \brief ORC variant: forwards all arguments unchanged to volk_16ic_deinterleave_real_8i_a16_orc_impl, which is only declared (extern) here
+  \param iBuffer The I buffer output data
+  \param complexVector The complex input vector
+  \param num_points The number of complex data values to be deinterleaved
+*/
+extern void volk_16ic_deinterleave_real_8i_a16_orc_impl(int8_t* iBuffer, const lv_16sc_t* complexVector, unsigned int num_points);
+static inline void volk_16ic_deinterleave_real_8i_a16_orc(int8_t* iBuffer, const lv_16sc_t* complexVector, unsigned int num_points){
+    volk_16ic_deinterleave_real_8i_a16_orc_impl(iBuffer, complexVector, num_points);
+}
+#endif /* LV_HAVE_ORC */
+
+
+#endif /* INCLUDED_volk_16ic_deinterleave_real_8i_a16_H */
diff --git a/volk/include/volk/volk_16ic_magnitude_16i_a16.h b/volk/include/volk/volk_16ic_magnitude_16i_a16.h
new file mode 100644
index 000000000..bdcace750
--- /dev/null
+++ b/volk/include/volk/volk_16ic_magnitude_16i_a16.h
@@ -0,0 +1,190 @@
+#ifndef INCLUDED_volk_16ic_magnitude_16i_a16_H
+#define INCLUDED_volk_16ic_magnitude_16i_a16_H
+
+#include <inttypes.h>
+#include <stdio.h>
+#include <math.h>
+
+#if LV_HAVE_SSE3
+#include <pmmintrin.h>
+/*!
+  \brief Computes the magnitude of each complex 16 bit sample, saturating the result to the int16_t range
+  \param magnitudeVector Output buffer receiving one int16_t magnitude per complex sample
+  \param complexVector Interleaved complex input, read as consecutive I, Q pairs of int16_t
+  \param num_points The number of complex samples in complexVector to process
+*/
+static inline void volk_16ic_magnitude_16i_a16_sse3(int16_t* magnitudeVector, const lv_16sc_t* complexVector, unsigned int num_points){
+  unsigned int number = 0;
+  const unsigned int quarterPoints = num_points / 4;
+
+  const int16_t* complexVectorPtr = (const int16_t*)complexVector;
+  int16_t* magnitudeVectorPtr = magnitudeVector;
+
+  __m128 vScalar = _mm_set_ps1(32768.0);
+  __m128 invScalar = _mm_set_ps1(1.0/32768.0);
+  __m128 maxMagnitude = _mm_set_ps1(32767.0); // largest value an int16_t can hold
+  __m128 cplxValue1, cplxValue2, result;
+
+  float inputFloatBuffer[8] __attribute__((aligned(128)));
+  float outputFloatBuffer[4] __attribute__((aligned(128)));
+
+  for(;number < quarterPoints; number++){
+    // Widen four complex samples (eight int16_t values) to float through the staging buffer
+    inputFloatBuffer[0] = (float)(complexVectorPtr[0]);
+    inputFloatBuffer[1] = (float)(complexVectorPtr[1]);
+    inputFloatBuffer[2] = (float)(complexVectorPtr[2]);
+    inputFloatBuffer[3] = (float)(complexVectorPtr[3]);
+
+    inputFloatBuffer[4] = (float)(complexVectorPtr[4]);
+    inputFloatBuffer[5] = (float)(complexVectorPtr[5]);
+    inputFloatBuffer[6] = (float)(complexVectorPtr[6]);
+    inputFloatBuffer[7] = (float)(complexVectorPtr[7]);
+
+    cplxValue1 = _mm_load_ps(&inputFloatBuffer[0]);
+    cplxValue2 = _mm_load_ps(&inputFloatBuffer[4]);
+
+    complexVectorPtr += 8;
+
+    cplxValue1 = _mm_mul_ps(cplxValue1, invScalar);
+    cplxValue2 = _mm_mul_ps(cplxValue2, invScalar);
+
+    cplxValue1 = _mm_mul_ps(cplxValue1, cplxValue1); // Square the values
+    cplxValue2 = _mm_mul_ps(cplxValue2, cplxValue2); // Square the Values
+
+    result = _mm_hadd_ps(cplxValue1, cplxValue2); // Add the I2 and Q2 values
+
+    result = _mm_sqrt_ps(result); // Square root the values
+
+    result = _mm_mul_ps(result, vScalar); // Scale the results
+    result = _mm_min_ps(result, maxMagnitude); // Saturate: values above 32767 cannot be converted to int16_t
+    _mm_store_ps(outputFloatBuffer, result);
+    *magnitudeVectorPtr++ = (int16_t)(outputFloatBuffer[0]);
+    *magnitudeVectorPtr++ = (int16_t)(outputFloatBuffer[1]);
+    *magnitudeVectorPtr++ = (int16_t)(outputFloatBuffer[2]);
+    *magnitudeVectorPtr++ = (int16_t)(outputFloatBuffer[3]);
+  }
+  // Scalar tail for the final num_points % 4 samples, saturated the same way as the vector loop
+  number = quarterPoints * 4;
+  magnitudeVectorPtr = &magnitudeVector[number];
+  complexVectorPtr = (const int16_t*)&complexVector[number];
+  for(; number < num_points; number++){
+    const float val1Real = (float)(*complexVectorPtr++) / 32768.0;
+    const float val1Imag = (float)(*complexVectorPtr++) / 32768.0;
+    const float val1Result = sqrtf((val1Real * val1Real) + (val1Imag * val1Imag)) * 32768.0;
+    *magnitudeVectorPtr++ = (int16_t)((val1Result > 32767.0f) ? 32767.0f : val1Result);
+  }
+}
+#endif /* LV_HAVE_SSE3 */
+
+#if LV_HAVE_SSE
+#include <xmmintrin.h>
+/*!
+  \brief Computes the magnitude of each complex 16 bit sample, saturating the result to the int16_t range
+  \param magnitudeVector Output buffer receiving one int16_t magnitude per complex sample
+  \param complexVector Interleaved complex input, read as consecutive I, Q pairs of int16_t
+  \param num_points The number of complex samples in complexVector to process
+*/
+static inline void volk_16ic_magnitude_16i_a16_sse(int16_t* magnitudeVector, const lv_16sc_t* complexVector, unsigned int num_points){
+  unsigned int number = 0;
+  const unsigned int quarterPoints = num_points / 4;
+
+  const int16_t* complexVectorPtr = (const int16_t*)complexVector;
+  int16_t* magnitudeVectorPtr = magnitudeVector;
+
+  __m128 vScalar = _mm_set_ps1(32768.0);
+  __m128 invScalar = _mm_set_ps1(1.0/32768.0);
+  __m128 maxMagnitude = _mm_set_ps1(32767.0); // largest value an int16_t can hold
+  __m128 cplxValue1, cplxValue2, iValue, qValue, result;
+
+  float inputFloatBuffer[4] __attribute__((aligned(128)));
+  float outputFloatBuffer[4] __attribute__((aligned(128)));
+
+  for(;number < quarterPoints; number++){
+    // Widen two complex samples at a time to float through the staging buffer
+    inputFloatBuffer[0] = (float)(complexVectorPtr[0]);
+    inputFloatBuffer[1] = (float)(complexVectorPtr[1]);
+    inputFloatBuffer[2] = (float)(complexVectorPtr[2]);
+    inputFloatBuffer[3] = (float)(complexVectorPtr[3]);
+
+    cplxValue1 = _mm_load_ps(inputFloatBuffer);
+    complexVectorPtr += 4;
+
+    inputFloatBuffer[0] = (float)(complexVectorPtr[0]);
+    inputFloatBuffer[1] = (float)(complexVectorPtr[1]);
+    inputFloatBuffer[2] = (float)(complexVectorPtr[2]);
+    inputFloatBuffer[3] = (float)(complexVectorPtr[3]);
+
+    cplxValue2 = _mm_load_ps(inputFloatBuffer);
+    complexVectorPtr += 4;
+
+    cplxValue1 = _mm_mul_ps(cplxValue1, invScalar);
+    cplxValue2 = _mm_mul_ps(cplxValue2, invScalar);
+
+    // Arrange in i1i2i3i4 format
+    iValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2,0,2,0));
+    // Arrange in q1q2q3q4 format
+    qValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(3,1,3,1));
+
+    iValue = _mm_mul_ps(iValue, iValue); // Square the I values
+    qValue = _mm_mul_ps(qValue, qValue); // Square the Q Values
+
+    result = _mm_add_ps(iValue, qValue); // Add the I2 and Q2 values
+
+    result = _mm_sqrt_ps(result); // Square root the values
+
+    result = _mm_mul_ps(result, vScalar); // Scale the results
+    result = _mm_min_ps(result, maxMagnitude); // Saturate: values above 32767 cannot be converted to int16_t
+    _mm_store_ps(outputFloatBuffer, result);
+    *magnitudeVectorPtr++ = (int16_t)(outputFloatBuffer[0]);
+    *magnitudeVectorPtr++ = (int16_t)(outputFloatBuffer[1]);
+    *magnitudeVectorPtr++ = (int16_t)(outputFloatBuffer[2]);
+    *magnitudeVectorPtr++ = (int16_t)(outputFloatBuffer[3]);
+  }
+  // Scalar tail for the final num_points % 4 samples, saturated the same way as the vector loop
+  number = quarterPoints * 4;
+  magnitudeVectorPtr = &magnitudeVector[number];
+  complexVectorPtr = (const int16_t*)&complexVector[number];
+  for(; number < num_points; number++){
+    const float val1Real = (float)(*complexVectorPtr++) / 32768.0;
+    const float val1Imag = (float)(*complexVectorPtr++) / 32768.0;
+    const float val1Result = sqrtf((val1Real * val1Real) + (val1Imag * val1Imag)) * 32768.0;
+    *magnitudeVectorPtr++ = (int16_t)((val1Result > 32767.0f) ? 32767.0f : val1Result);
+  }
+}
+#endif /* LV_HAVE_SSE */
+
+#if LV_HAVE_GENERIC
+/*!
+  \brief Computes the magnitude of each complex 16 bit sample, saturating the result to the int16_t range
+  \param magnitudeVector Output buffer receiving one int16_t magnitude per complex sample
+  \param complexVector Interleaved complex input, read as consecutive I, Q pairs of int16_t
+  \param num_points The number of complex samples in complexVector to process
+*/
+static inline void volk_16ic_magnitude_16i_a16_generic(int16_t* magnitudeVector, const lv_16sc_t* complexVector, unsigned int num_points){
+  const int16_t* complexVectorPtr = (const int16_t*)complexVector;
+  const float scalar = 32768.0;
+  unsigned int number;
+  for(number = 0; number < num_points; number++){
+    const float real = ((float)(*complexVectorPtr++)) / scalar;
+    const float imag = ((float)(*complexVectorPtr++)) / scalar;
+    const float magnitude = sqrtf((real*real) + (imag*imag)) * scalar;
+    // Can reach about 46341; saturate because an out-of-range float to int16_t conversion is undefined
+    magnitudeVector[number] = (int16_t)((magnitude > 32767.0f) ? 32767.0f : magnitude);
+  }
+}
+#endif /* LV_HAVE_GENERIC */
+
+#if LV_HAVE_ORC_DISABLED
+/*!
+  \brief ORC variant: forwards to volk_16ic_magnitude_16i_a16_orc_impl (only declared extern here) with a fixed scalar of 32768.0
+  \param magnitudeVector The vector containing the real output values
+  \param complexVector The vector containing the complex input values
+  \param num_points The number of complex values in complexVector to be calculated and stored into magnitudeVector
+*/
+extern void volk_16ic_magnitude_16i_a16_orc_impl(int16_t* magnitudeVector, const lv_16sc_t* complexVector, float scalar, unsigned int num_points);
+static inline void volk_16ic_magnitude_16i_a16_orc(int16_t* magnitudeVector, const lv_16sc_t* complexVector, unsigned int num_points){
+    volk_16ic_magnitude_16i_a16_orc_impl(magnitudeVector, complexVector, 32768.0, num_points);
+}
+#endif /* LV_HAVE_ORC_DISABLED */
+
+
+#endif /* INCLUDED_volk_16ic_magnitude_16i_a16_H */
diff --git a/volk/include/volk/volk_16ic_s32f_deinterleave_32f_x2_a16.h b/volk/include/volk/volk_16ic_s32f_deinterleave_32f_x2_a16.h
new file mode 100644
index 000000000..606de2fc5
--- /dev/null
+++ b/volk/include/volk/volk_16ic_s32f_deinterleave_32f_x2_a16.h
@@ -0,0 +1,108 @@
+#ifndef INCLUDED_volk_16ic_s32f_deinterleave_32f_x2_a16_H
+#define INCLUDED_volk_16ic_s32f_deinterleave_32f_x2_a16_H
+
+#include <inttypes.h>
+#include <stdio.h>
+
+#if LV_HAVE_SSE
+#include <xmmintrin.h>
+  /*!
+    \brief Converts interleaved complex 16 bit samples to float, divides them by scalar, and writes separate I and Q vectors
+    \param iBuffer Output buffer receiving the scaled I components; written with aligned 128 bit stores
+    \param qBuffer Output buffer receiving the scaled Q components; written with aligned 128 bit stores
+    \param complexVector Interleaved complex input, read as consecutive I, Q pairs of int16_t
+    \param scalar The value every input component is divided by
+    \param num_points The number of complex samples to deinterleave
+  */
+static inline void volk_16ic_s32f_deinterleave_32f_x2_a16_sse(float* iBuffer, float* qBuffer, const lv_16sc_t* complexVector, const float scalar, unsigned int num_points){
+    float* iOut = iBuffer;
+    float* qOut = qBuffer;
+
+    uint64_t idx;
+    const uint64_t blockCount = num_points / 4;
+    unsigned int lane;
+
+    // The vector loop multiplies by the reciprocal of scalar instead of dividing
+    const __m128 reciprocal = _mm_set_ps1(1.0/scalar);
+    const int16_t* input = (const int16_t*)complexVector;
+
+    // Staging buffer used to hand the widened values to _mm_load_ps
+    float staged[8] __attribute__((aligned(128)));
+
+    for(idx = 0; idx < blockCount; idx++){
+      __m128 pairsLow, pairsHigh, iLanes, qLanes;
+
+      // Widen four complex samples (eight int16_t values) to float
+      for(lane = 0; lane < 8; lane++){
+        staged[lane] = (float)(input[lane]);
+      }
+      input += 8;
+
+      pairsLow = _mm_load_ps(&staged[0]);
+      pairsHigh = _mm_load_ps(&staged[4]);
+
+      // Apply the scaling
+      pairsLow = _mm_mul_ps(pairsLow, reciprocal);
+      pairsHigh = _mm_mul_ps(pairsHigh, reciprocal);
+
+      // Even lanes of both registers hold the I components
+      iLanes = _mm_shuffle_ps(pairsLow, pairsHigh, _MM_SHUFFLE(2,0,2,0));
+      // Odd lanes of both registers hold the Q components
+      qLanes = _mm_shuffle_ps(pairsLow, pairsHigh, _MM_SHUFFLE(3,1,3,1));
+
+      // Aligned stores of four I and four Q values
+      _mm_store_ps(iOut, iLanes);
+      _mm_store_ps(qOut, qLanes);
+
+      iOut += 4;
+      qOut += 4;
+    }
+
+    // Scalar tail for the final num_points % 4 samples.
+    // Unlike the vector loop, the tail divides by scalar directly.
+    input = (const int16_t*)&complexVector[blockCount * 4];
+    for(idx = blockCount * 4; idx < num_points; idx++){
+      *iOut++ = (float)(*input++) / scalar;
+      *qOut++ = (float)(*input++) / scalar;
+    }
+}
+#endif /* LV_HAVE_SSE */
+
+#if LV_HAVE_GENERIC
+  /*!
+    \brief Converts interleaved complex 16 bit samples to float, divides them by scalar, and writes separate I and Q vectors
+    \param iBuffer Output buffer receiving the scaled I (first) component of each sample
+    \param qBuffer Output buffer receiving the scaled Q (second) component of each sample
+    \param complexVector Interleaved complex input, read as consecutive I, Q pairs of int16_t
+    \param scalar The value every input component is divided by
+    \param num_points The number of complex samples to deinterleave
+  */
+static inline void volk_16ic_s32f_deinterleave_32f_x2_a16_generic(float* iBuffer, float* qBuffer, const lv_16sc_t* complexVector, const float scalar, unsigned int num_points){
+  const int16_t* interleaved = (const int16_t*)complexVector;
+  unsigned int n;
+
+  // Each pass consumes one I value followed by one Q value
+  for(n = 0; n < num_points; n++){
+    iBuffer[n] = (float)(*interleaved++) / scalar;
+    qBuffer[n] = (float)(*interleaved++) / scalar;
+  }
+}
+#endif /* LV_HAVE_GENERIC */
+
+#if LV_HAVE_ORC
+  /*!
+    \brief ORC variant: forwards all arguments unchanged to volk_16ic_s32f_deinterleave_32f_x2_a16_orc_impl, which is only declared (extern) here
+    \param iBuffer The I buffer output data
+    \param qBuffer The Q buffer output data
+    \param complexVector The complex input vector
+    \param scalar The data value to be divided against each input data value of the input complex vector
+    \param num_points The number of complex data values to be deinterleaved
+  */
+extern void volk_16ic_s32f_deinterleave_32f_x2_a16_orc_impl(float* iBuffer, float* qBuffer, const lv_16sc_t* complexVector, const float scalar, unsigned int num_points);
+static inline void volk_16ic_s32f_deinterleave_32f_x2_a16_orc(float* iBuffer, float* qBuffer, const lv_16sc_t* complexVector, const float scalar, unsigned int num_points){
+    volk_16ic_s32f_deinterleave_32f_x2_a16_orc_impl(iBuffer, qBuffer, complexVector, scalar, num_points);
+}
+#endif /* LV_HAVE_ORC */
+
+
+#endif /* INCLUDED_volk_16ic_s32f_deinterleave_32f_x2_a16_H */
diff --git a/volk/include/volk/volk_16ic_s32f_deinterleave_real_32f_a16.h b/volk/include/volk/volk_16ic_s32f_deinterleave_real_32f_a16.h
new file mode 100644
index 000000000..62331e496
--- /dev/null
+++ b/volk/include/volk/volk_16ic_s32f_deinterleave_real_32f_a16.h
@@ -0,0 +1,125 @@
+#ifndef INCLUDED_volk_16ic_s32f_deinterleave_real_32f_a16_H
+#define INCLUDED_volk_16ic_s32f_deinterleave_real_32f_a16_H
+
+#include <inttypes.h>
+#include <stdio.h>
+
+#if LV_HAVE_SSE4_1
+#include <smmintrin.h>
+/*!
+  \brief Converts the I component of each complex 16 bit sample to float and scales it by 1/scalar
+  \param iBuffer Output buffer receiving one float per complex sample; written with aligned 128 bit stores
+  \param complexVector Interleaved complex input; read with aligned 128 bit loads
+  \param scalar The divisor; each I value is multiplied by its reciprocal
+  \param num_points The number of complex samples to process
+*/
+static inline void volk_16ic_s32f_deinterleave_real_32f_a16_sse4_1(float* iBuffer, const lv_16sc_t* complexVector, const float scalar, unsigned int num_points){
+  const unsigned int blockCount = num_points / 4;
+
+  // The samples are multiplied by the reciprocal of scalar instead of being divided by it
+  const float reciprocal = 1.0 / scalar;
+  const __m128 reciprocalLanes = _mm_set_ps1(reciprocal);
+
+  const __m128i* inBlock = (const __m128i*)complexVector;
+  float* iOut = iBuffer;
+  unsigned int idx;
+
+  // pshufb selector: gathers the four I values into the lower 64 bits and zeroes the upper 64 bits
+  const __m128i pickI = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 13, 12, 9, 8, 5, 4, 1, 0);
+
+  for(idx = 0; idx < blockCount; idx++){
+    // One aligned load covers four complex samples
+    const __m128i samples = _mm_load_si128(inBlock++);
+    const __m128i packedI = _mm_shuffle_epi8(samples, pickI);
+
+    // Sign-extend the four int16_t values to 32 bits, then convert them to float
+    const __m128 asFloat = _mm_cvtepi32_ps(_mm_cvtepi16_epi32(packedI));
+
+    const __m128 scaled = _mm_mul_ps(asFloat, reciprocalLanes);
+    _mm_store_ps(iOut, scaled);
+
+    iOut += 4;
+  }
+
+  // Scalar tail for the final num_points % 4 samples
+  {
+    const int16_t* tailIn = (const int16_t*)&complexVector[blockCount * 4];
+    for(idx = blockCount * 4; idx < num_points; idx++){
+      *iOut++ = ((float)(*tailIn)) * reciprocal;
+      tailIn += 2; // skip the Q component
+    }
+  }
+}
+#endif /* LV_HAVE_SSE4_1 */
+
+#if LV_HAVE_SSE
+#include <xmmintrin.h>
+/*!
+  \brief Converts the I component of each complex 16 bit sample to float and scales it by 1/scalar
+  \param iBuffer Output buffer receiving one float per complex sample; written with aligned 128 bit stores
+  \param complexVector Interleaved complex input, read as consecutive I, Q pairs of int16_t
+  \param scalar The divisor; each I value is multiplied by its reciprocal
+  \param num_points The number of complex samples to process
+*/
+static inline void volk_16ic_s32f_deinterleave_real_32f_a16_sse(float* iBuffer, const lv_16sc_t* complexVector, const float scalar, unsigned int num_points){
+  const unsigned int blockCount = num_points / 4;
+
+  // The samples are multiplied by the reciprocal of scalar instead of being divided by it
+  const float reciprocal = 1.0/scalar;
+  const __m128 reciprocalLanes = _mm_set_ps1(reciprocal);
+
+  const int16_t* input = (const int16_t*)complexVector;
+  float* iOut = iBuffer;
+  unsigned int idx, lane;
+
+  // Staging buffer used to hand the widened I values to _mm_load_ps
+  float staged[4] __attribute__((aligned(128)));
+
+  for(idx = 0; idx < blockCount; idx++){
+    // Widen the I component of four consecutive samples; the stride of 2 steps over each Q
+    for(lane = 0; lane < 4; lane++){
+      staged[lane] = (float)(input[2 * lane]);
+    }
+    input += 8;
+
+    const __m128 widened = _mm_load_ps(staged);
+    const __m128 scaled = _mm_mul_ps(widened, reciprocalLanes);
+
+    _mm_store_ps(iOut, scaled);
+
+    iOut += 4;
+  }
+
+  // Scalar tail for the final num_points % 4 samples
+  input = (const int16_t*)&complexVector[blockCount * 4];
+  for(idx = blockCount * 4; idx < num_points; idx++){
+    *iOut++ = ((float)(*input)) * reciprocal;
+    input += 2; // skip the Q component
+  }
+}
+#endif /* LV_HAVE_SSE */
+
+#if LV_HAVE_GENERIC
+/*!
+  \brief Converts the I component of each complex 16 bit sample to float and scales it by 1/scalar
+  \param iBuffer Output buffer receiving one float per complex sample
+  \param complexVector Interleaved complex input, read as consecutive I, Q pairs of int16_t
+  \param scalar The divisor; each I value is multiplied by its reciprocal
+  \param num_points The number of complex samples to process
+*/
+static inline void volk_16ic_s32f_deinterleave_real_32f_a16_generic(float* iBuffer, const lv_16sc_t* complexVector, const float scalar, unsigned int num_points){
+  const int16_t* interleaved = (const int16_t*)complexVector;
+  const float reciprocal = 1.0 / scalar;
+  unsigned int n;
+
+  for(n = 0; n < num_points; n++){
+    iBuffer[n] = ((float)(*interleaved)) * reciprocal;
+    interleaved += 2; // step over the unused Q component
+  }
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+
+
+#endif /* INCLUDED_volk_16ic_s32f_deinterleave_real_32f_a16_H */
diff --git a/volk/include/volk/volk_16ic_s32f_magnitude_32f_a16.h b/volk/include/volk/volk_16ic_s32f_magnitude_32f_a16.h
new file mode 100644
index 000000000..ae64efbeb
--- /dev/null
+++ b/volk/include/volk/volk_16ic_s32f_magnitude_32f_a16.h
@@ -0,0 +1,179 @@
+#ifndef INCLUDED_volk_16ic_s32f_magnitude_32f_a16_H
+#define INCLUDED_volk_16ic_s32f_magnitude_32f_a16_H
+
+#include <inttypes.h>
+#include <stdio.h>
+#include <math.h>
+
+#if LV_HAVE_SSE3
+#include <pmmintrin.h>
+/*!
+  \brief Computes sqrt((I/scalar)^2 + (Q/scalar)^2) for every complex 16 bit sample and stores it as a float
+  \param magnitudeVector The vector receiving the real output values; written with _mm_store_ps, so it must be 16 byte aligned
+  \param complexVector The vector containing the complex input values, read as interleaved int16_t I,Q pairs
+  \param scalar The value each input component is divided by (applied as a multiplication by 1.0/scalar)
+  \param num_points The number of complex values in complexVector to be calculated and stored into magnitudeVector
+*/
+static inline void volk_16ic_s32f_magnitude_32f_a16_sse3(float* magnitudeVector, const lv_16sc_t* complexVector, const float scalar, unsigned int num_points){
+  unsigned int number = 0;
+  const unsigned int quarterPoints = num_points / 4;
+
+  const int16_t* complexVectorPtr = (const int16_t*)complexVector;
+  float* magnitudeVectorPtr = magnitudeVector;
+
+  const float iScalar = 1.0 / scalar; // one reciprocal shared by the SIMD body and the scalar tail
+  __m128 invScalar = _mm_set_ps1(iScalar);
+  __m128 cplxValue1, cplxValue2, result;
+
+  float inputFloatBuffer[8] __attribute__((aligned(128)));
+
+  for(;number < quarterPoints; number++){
+    // Convert 4 complex samples (8 int16_t values) to float through the aligned staging buffer
+    inputFloatBuffer[0] = (float)(complexVectorPtr[0]);
+    inputFloatBuffer[1] = (float)(complexVectorPtr[1]);
+    inputFloatBuffer[2] = (float)(complexVectorPtr[2]);
+    inputFloatBuffer[3] = (float)(complexVectorPtr[3]);
+
+    inputFloatBuffer[4] = (float)(complexVectorPtr[4]);
+    inputFloatBuffer[5] = (float)(complexVectorPtr[5]);
+    inputFloatBuffer[6] = (float)(complexVectorPtr[6]);
+    inputFloatBuffer[7] = (float)(complexVectorPtr[7]);
+
+    cplxValue1 = _mm_load_ps(&inputFloatBuffer[0]); // I0 Q0 I1 Q1
+    cplxValue2 = _mm_load_ps(&inputFloatBuffer[4]); // I2 Q2 I3 Q3
+
+    complexVectorPtr += 8;
+
+    cplxValue1 = _mm_mul_ps(cplxValue1, invScalar);
+    cplxValue2 = _mm_mul_ps(cplxValue2, invScalar);
+
+    cplxValue1 = _mm_mul_ps(cplxValue1, cplxValue1); // Square the values
+    cplxValue2 = _mm_mul_ps(cplxValue2, cplxValue2); // Square the Values
+
+    result = _mm_hadd_ps(cplxValue1, cplxValue2); // Add the I2 and Q2 values; adjacent lanes are summed, so lane k holds sample k
+
+    result = _mm_sqrt_ps(result); // Square root the values
+
+    _mm_store_ps(magnitudeVectorPtr, result);
+
+    magnitudeVectorPtr += 4;
+  }
+
+  number = quarterPoints * 4;
+  magnitudeVectorPtr = &magnitudeVector[number];
+  complexVectorPtr = (const int16_t*)&complexVector[number];
+  for(; number < num_points; number++){
+    float val1Real = (float)(*complexVectorPtr++) * iScalar; // same reciprocal multiply as the SIMD body, so tail rounding matches
+    float val1Imag = (float)(*complexVectorPtr++) * iScalar;
+    *magnitudeVectorPtr++ = sqrtf((val1Real * val1Real) + (val1Imag * val1Imag));
+  }
+}
+#endif /* LV_HAVE_SSE3 */
+
+#if LV_HAVE_SSE
+#include <xmmintrin.h>
+/*!
+  \brief Computes sqrt((I/scalar)^2 + (Q/scalar)^2) for every complex 16 bit sample and stores it as a float
+  \param magnitudeVector The vector receiving the real output values; written with _mm_store_ps, so it must be 16 byte aligned
+  \param complexVector The vector containing the complex input values, read as interleaved int16_t I,Q pairs
+  \param scalar The value each input component is divided by (applied as a multiplication by 1.0/scalar)
+  \param num_points The number of complex values in complexVector to be calculated and stored into magnitudeVector
+*/
+static inline void volk_16ic_s32f_magnitude_32f_a16_sse(float* magnitudeVector, const lv_16sc_t* complexVector, const float scalar, unsigned int num_points){
+  unsigned int number = 0;
+  const unsigned int quarterPoints = num_points / 4;
+
+  const int16_t* complexVectorPtr = (const int16_t*)complexVector;
+  float* magnitudeVectorPtr = magnitudeVector;
+
+  const float iScalar = 1.0 / scalar; // one reciprocal shared by the SIMD body and the scalar tail
+  __m128 invScalar = _mm_set_ps1(iScalar);
+
+  __m128 cplxValue1, cplxValue2, result, re, im;
+
+  float inputFloatBuffer[8] __attribute__((aligned(128)));
+
+  for(;number < quarterPoints; number++){
+    inputFloatBuffer[0] = (float)(complexVectorPtr[0]);
+    inputFloatBuffer[1] = (float)(complexVectorPtr[1]);
+    inputFloatBuffer[2] = (float)(complexVectorPtr[2]);
+    inputFloatBuffer[3] = (float)(complexVectorPtr[3]);
+      
+    inputFloatBuffer[4] = (float)(complexVectorPtr[4]);
+    inputFloatBuffer[5] = (float)(complexVectorPtr[5]);
+    inputFloatBuffer[6] = (float)(complexVectorPtr[6]);
+    inputFloatBuffer[7] = (float)(complexVectorPtr[7]);
+
+    cplxValue1 = _mm_load_ps(&inputFloatBuffer[0]); // I0 Q0 I1 Q1
+    cplxValue2 = _mm_load_ps(&inputFloatBuffer[4]); // I2 Q2 I3 Q3
+    
+    re = _mm_shuffle_ps(cplxValue1, cplxValue2, 0x88); // even lanes of both registers: I0 I1 I2 I3
+    im = _mm_shuffle_ps(cplxValue1, cplxValue2, 0xdd); // odd lanes of both registers: Q0 Q1 Q2 Q3
+
+    complexVectorPtr += 8;
+
+    cplxValue1 = _mm_mul_ps(re, invScalar);
+    cplxValue2 = _mm_mul_ps(im, invScalar);
+
+    cplxValue1 = _mm_mul_ps(cplxValue1, cplxValue1); // Square the values
+    cplxValue2 = _mm_mul_ps(cplxValue2, cplxValue2); // Square the Values
+
+    result = _mm_add_ps(cplxValue1, cplxValue2); // Add the I2 and Q2 values
+
+    result = _mm_sqrt_ps(result); // Square root the values
+
+    _mm_store_ps(magnitudeVectorPtr, result);
+      
+    magnitudeVectorPtr += 4;
+  }
+
+  number = quarterPoints * 4; // first sample not covered by the 4-at-a-time loop
+  magnitudeVectorPtr = &magnitudeVector[number];
+  complexVectorPtr = (const int16_t*)&complexVector[number];
+  for(; number < num_points; number++){
+    float val1Real = (float)(*complexVectorPtr++) * iScalar;
+    float val1Imag = (float)(*complexVectorPtr++) * iScalar;
+    *magnitudeVectorPtr++ = sqrtf((val1Real * val1Real) + (val1Imag * val1Imag));
+  }
+}
+
+ 
+#endif /* LV_HAVE_SSE */
+
+#if LV_HAVE_GENERIC
+/*!
+  \brief Computes sqrt((I/scalar)^2 + (Q/scalar)^2) for every complex 16 bit sample and stores it as a float
+  \param magnitudeVector The vector receiving the real output values (one float per complex input sample)
+  \param complexVector The vector containing the complex input values, read as interleaved int16_t I,Q pairs
+  \param scalar The value each input component is divided by (applied as a multiplication by 1.0/scalar)
+  \param num_points The number of complex values in complexVector to be calculated and stored into magnitudeVector
+*/
+static inline void volk_16ic_s32f_magnitude_32f_a16_generic(float* magnitudeVector, const lv_16sc_t* complexVector, const float scalar, unsigned int num_points){
+  const int16_t* complexVectorPtr = (const int16_t*)complexVector; // walk the input one int16_t at a time
+  float* magnitudeVectorPtr = magnitudeVector;
+  unsigned int number = 0;
+  const float invScalar = 1.0 / scalar; // reciprocal computed once so the loop multiplies instead of divides
+  for(number = 0; number < num_points; number++){
+    float real = ( (float) (*complexVectorPtr++)) * invScalar; // first int16_t of the pair
+    float imag = ( (float) (*complexVectorPtr++)) * invScalar; // second int16_t of the pair
+    *magnitudeVectorPtr++ = sqrtf((real*real) + (imag*imag));
+  }
+}
+#endif /* LV_HAVE_GENERIC */
+
+#if LV_HAVE_ORC_DISABLED
+/*!
+  \brief Thin wrapper that forwards all arguments unchanged to the externally defined volk_16ic_s32f_magnitude_32f_a16_orc_impl
+  \param magnitudeVector The vector receiving the real output values
+  \param complexVector The vector containing the complex input values
+  \param scalar The data value to be divided against each input data value of the input complex vector
+  \param num_points The number of complex values in complexVector to be calculated and stored into magnitudeVector
+*/
+extern void volk_16ic_s32f_magnitude_32f_a16_orc_impl(float* magnitudeVector, const lv_16sc_t* complexVector, const float scalar, unsigned int num_points);
+static inline void volk_16ic_s32f_magnitude_32f_a16_orc(float* magnitudeVector, const lv_16sc_t* complexVector, const float scalar, unsigned int num_points){
+    volk_16ic_s32f_magnitude_32f_a16_orc_impl(magnitudeVector, complexVector, scalar, num_points);
+}
+#endif /* LV_HAVE_ORC_DISABLED */
+
+
+#endif /* INCLUDED_volk_16ic_s32f_magnitude_32f_a16_H */
diff --git a/volk/include/volk/volk_16s_add_quad_a16.h b/volk/include/volk/volk_16s_add_quad_a16.h
deleted file mode 100644
index 67d0c55a3..000000000
--- a/volk/include/volk/volk_16s_add_quad_a16.h
+++ /dev/null
@@ -1,136 +0,0 @@
-#ifndef INCLUDED_volk_16s_add_quad_a16_H
-#define INCLUDED_volk_16s_add_quad_a16_H
-
-
-#include<inttypes.h>
-#include<stdio.h>	
-
-
-
-
-
-#if LV_HAVE_SSE2
-#include<xmmintrin.h>
-#include<emmintrin.h>
-
-static inline  void volk_16s_add_quad_a16_sse2(short* target0, short* target1, short* target2, short* target3, short* src0, short* src1, short* src2, short* src3, short* src4, unsigned int num_bytes) {
-  
-  __m128i xmm0, xmm1, xmm2, xmm3, xmm4;
-  __m128i *p_target0, *p_target1, *p_target2, *p_target3,  *p_src0, *p_src1, *p_src2, *p_src3, *p_src4;
-  p_target0 = (__m128i*)target0;
-  p_target1 = (__m128i*)target1;
-  p_target2 = (__m128i*)target2;
-  p_target3 = (__m128i*)target3;
-
-  p_src0 = (__m128i*)src0;
-  p_src1 = (__m128i*)src1;
-  p_src2 = (__m128i*)src2;
-  p_src3 = (__m128i*)src3;
-  p_src4 = (__m128i*)src4;
-
-  int i = 0;
-
-  int bound = (num_bytes >> 4);
-  int leftovers = (num_bytes >> 1) & 7;
-
-  for(; i < bound; ++i) {
-    xmm0 = _mm_load_si128(p_src0);
-    xmm1 = _mm_load_si128(p_src1);
-    xmm2 = _mm_load_si128(p_src2);
-    xmm3 = _mm_load_si128(p_src3);
-    xmm4 = _mm_load_si128(p_src4);
-    
-    p_src0 += 1;
-    p_src1 += 1;
-    
-    xmm1 = _mm_add_epi16(xmm0, xmm1);
-    xmm2 = _mm_add_epi16(xmm0, xmm2);
-    xmm3 = _mm_add_epi16(xmm0, xmm3);
-    xmm4 = _mm_add_epi16(xmm0, xmm4);
-    
-    
-    p_src2 += 1;
-    p_src3 += 1;
-    p_src4 += 1;
-
-    _mm_store_si128(p_target0, xmm1);
-    _mm_store_si128(p_target1, xmm2);
-    _mm_store_si128(p_target2, xmm3);
-    _mm_store_si128(p_target3, xmm4);
-    
-    p_target0 += 1;
-    p_target1 += 1;
-    p_target2 += 1;
-    p_target3 += 1;
-  }
-    /*asm volatile
-		(
-		 ".%=volk_16s_add_quad_a16_sse2_L1:\n\t"
-		 "cmp $0, %[bound]\n\t"
-		 "je .%=volk_16s_add_quad_a16_sse2_END\n\t"
-		 "movaps (%[src0]), %%xmm1\n\t"
-		 "movaps (%[src1]), %%xmm2\n\t"
-		 "movaps (%[src2]), %%xmm3\n\t"
-		 "movaps (%[src3]), %%xmm4\n\t"
-		 "movaps (%[src4]), %%xmm5\n\t"
-		 "add $16, %[src0]\n\t"
-		 "add $16, %[src1]\n\t"
-		 "add $16, %[src2]\n\t"
-		 "add $16, %[src3]\n\t"
-		 "add $16, %[src4]\n\t"
-		 "paddw %%xmm1, %%xmm2\n\t"
-		 "paddw %%xmm1, %%xmm3\n\t"
-		 "paddw %%xmm1, %%xmm4\n\t"
-		 "paddw %%xmm1, %%xmm5\n\t"
-		 "add $-1, %[bound]\n\t"
-		 "movaps %%xmm2, (%[target0])\n\t"
-		 "movaps %%xmm3, (%[target1])\n\t"
-		 "movaps %%xmm4, (%[target2])\n\t"
-		 "movaps %%xmm5, (%[target3])\n\t"
-		 "add $16, %[target0]\n\t"
-		 "add $16, %[target1]\n\t"
-		 "add $16, %[target2]\n\t"
-		 "add $16, %[target3]\n\t"
-		 "jmp .%=volk_16s_add_quad_a16_sse2_L1\n\t"
-		 ".%=volk_16s_add_quad_a16_sse2_END:\n\t"
-		 :
-		 :[bound]"r"(bound), [src0]"r"(src0), [src1]"r"(src1), [src2]"r"(src2), [src3]"r"(src3), [src4]"r"(src4), [target0]"r"(target0), [target1]"r"(target1), [target2]"r"(target2), [target3]"r"(target3)
-		 :"xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
-		 );
-		 
-    */
-	 
-
-  for(i = bound * 8; i < (bound * 8) + leftovers; ++i) {
-    target0[i] = src0[i] + src1[i];
-    target1[i] = src0[i] + src2[i];
-    target2[i] = src0[i] + src3[i];
-    target3[i] = src0[i] + src4[i];
-  }
-}
-#endif /*LV_HAVE_SSE2*/
-
-
-#if LV_HAVE_GENERIC
-
-static inline void volk_16s_add_quad_a16_generic(short* target0, short* target1, short* target2, short* target3, short* src0, short* src1, short* src2, short* src3, short* src4, unsigned int num_bytes) {
-	
-	int i = 0;
-	
-	int bound = num_bytes >> 1;
-
-	for(i = 0; i < bound; ++i) {
-		target0[i] = src0[i] + src1[i];
-		target1[i] = src0[i] + src2[i];
-		target2[i] = src0[i] + src3[i];
-		target3[i] = src0[i] + src4[i];
-	}
-}
-
-#endif /* LV_HAVE_GENERIC */
-
-
-
-
-
-#endif /*INCLUDED_volk_16s_add_quad_a16_H*/
diff --git a/volk/include/volk/volk_16s_branch_4_state_8_a16.h b/volk/include/volk/volk_16s_branch_4_state_8_a16.h
deleted file mode 100644
index 4c1af8729..000000000
--- a/volk/include/volk/volk_16s_branch_4_state_8_a16.h
+++ /dev/null
@@ -1,194 +0,0 @@
-#ifndef INCLUDED_volk_16s_branch_4_state_8_a16_H
-#define INCLUDED_volk_16s_branch_4_state_8_a16_H
-
-
-#include<inttypes.h>
-#include<stdio.h>	
-
-
-
-
-#if LV_HAVE_SSSE3
-
-#include<xmmintrin.h>
-#include<emmintrin.h>
-#include<tmmintrin.h>
-
-static inline  void volk_16s_branch_4_state_8_a16_ssse3(short* target,  short* src0, char** permuters, short* cntl2, short* cntl3, short* scalars) {
-	
-  
-  __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11;
-
-  __m128i *p_target, *p_src0, *p_cntl2, *p_cntl3, *p_scalars;
-
-  
-  
-  p_target = (__m128i*)target;
-  p_src0 = (__m128i*)src0;
-  p_cntl2 = (__m128i*)cntl2;
-  p_cntl3 = (__m128i*)cntl3;
-  p_scalars = (__m128i*)scalars;
-  
-  int i = 0;
-  
-  int bound = 1;
-  
-  
-  xmm0 = _mm_load_si128(p_scalars);
-  
-  xmm1 = _mm_shufflelo_epi16(xmm0, 0);
-  xmm2 = _mm_shufflelo_epi16(xmm0, 0x55);
-  xmm3 = _mm_shufflelo_epi16(xmm0, 0xaa);
-  xmm4 = _mm_shufflelo_epi16(xmm0, 0xff);
-  
-  xmm1 = _mm_shuffle_epi32(xmm1, 0x00);
-  xmm2 = _mm_shuffle_epi32(xmm2, 0x00);
-  xmm3 = _mm_shuffle_epi32(xmm3, 0x00);
-  xmm4 = _mm_shuffle_epi32(xmm4, 0x00);
-
-  xmm0 = _mm_load_si128((__m128i*)permuters[0]);
-  xmm6 = _mm_load_si128((__m128i*)permuters[1]);
-  xmm8 = _mm_load_si128((__m128i*)permuters[2]);
-  xmm10 = _mm_load_si128((__m128i*)permuters[3]);
-
-  for(; i < bound; ++i) {
-    
-    xmm5 = _mm_load_si128(p_src0);
-    
-    
-    
-    
-    
-    
-    
-
-
-    xmm0 = _mm_shuffle_epi8(xmm5, xmm0);
-    xmm6 = _mm_shuffle_epi8(xmm5, xmm6);
-    xmm8 = _mm_shuffle_epi8(xmm5, xmm8);
-    xmm10 = _mm_shuffle_epi8(xmm5, xmm10);
-    
-    p_src0 += 4;
-   
-    
-    xmm5 = _mm_add_epi16(xmm1, xmm2);
-    
-    xmm6 = _mm_add_epi16(xmm2, xmm6);
-    xmm8 = _mm_add_epi16(xmm1, xmm8);
-   
-     
-    xmm7 = _mm_load_si128(p_cntl2);
-    xmm9 = _mm_load_si128(p_cntl3);
-    
-    xmm0 = _mm_add_epi16(xmm5, xmm0);
-    
-    
-    xmm7 = _mm_and_si128(xmm7, xmm3);
-    xmm9 = _mm_and_si128(xmm9, xmm4);
-    
-    xmm5 = _mm_load_si128(&p_cntl2[1]);
-    xmm11 = _mm_load_si128(&p_cntl3[1]);
-
-    xmm7 = _mm_add_epi16(xmm7, xmm9);
-
-    xmm5 = _mm_and_si128(xmm5, xmm3);
-    xmm11 = _mm_and_si128(xmm11, xmm4);
-
-    xmm0 = _mm_add_epi16(xmm0, xmm7);
-   
-   
- 
-    xmm7 = _mm_load_si128(&p_cntl2[2]);
-    xmm9 = _mm_load_si128(&p_cntl3[2]);
-    
-    xmm5 = _mm_add_epi16(xmm5, xmm11);
-    
-    xmm7 = _mm_and_si128(xmm7, xmm3);
-    xmm9 = _mm_and_si128(xmm9, xmm4);
-    
-    xmm6 = _mm_add_epi16(xmm6, xmm5);
-   
-    
-    xmm5 = _mm_load_si128(&p_cntl2[3]);
-    xmm11 = _mm_load_si128(&p_cntl3[3]);
-    
-    xmm7 = _mm_add_epi16(xmm7, xmm9);
-    
-    xmm5 = _mm_and_si128(xmm5, xmm3);
-    xmm11 = _mm_and_si128(xmm11, xmm4);
-    
-    xmm8 = _mm_add_epi16(xmm8, xmm7);
-    
-    xmm5 = _mm_add_epi16(xmm5, xmm11);
-    
-    _mm_store_si128(p_target, xmm0);
-    _mm_store_si128(&p_target[1], xmm6);
-
-    xmm10 = _mm_add_epi16(xmm5, xmm10);
-    
-    _mm_store_si128(&p_target[2], xmm8);
-    
-    _mm_store_si128(&p_target[3], xmm10);
-    
-    p_target += 3;   
-  }
-}
-	
-	
-#endif /*LV_HAVE_SSEs*/
-
-#if LV_HAVE_GENERIC
-static inline  void volk_16s_branch_4_state_8_a16_generic(short* target,  short* src0, char** permuters, short* cntl2, short* cntl3, short* scalars) {
-	int i = 0;
-	
-	int bound = 4;
-	
-	for(; i < bound; ++i) {
-	  target[i* 8] = src0[((char)permuters[i][0])/2] 
-	    + ((i + 1)%2  * scalars[0])
-	    + (((i >> 1)^1) * scalars[1])
-	    + (cntl2[i * 8] & scalars[2])
-	    + (cntl3[i * 8] & scalars[3]);
-	  target[i* 8 + 1] = src0[((char)permuters[i][1 * 2])/2] 
-	    + ((i + 1)%2  * scalars[0])
-	    + (((i >> 1)^1) * scalars[1])
-	    + (cntl2[i * 8 + 1] & scalars[2])
-	    + (cntl3[i * 8 + 1] & scalars[3]);
-	  target[i* 8 + 2] = src0[((char)permuters[i][2 * 2])/2] 
-	    + ((i + 1)%2  * scalars[0])
-	    + (((i >> 1)^1) * scalars[1])
-	    + (cntl2[i * 8 + 2] & scalars[2])
-	    + (cntl3[i * 8 + 2] & scalars[3]);
-	  target[i* 8 + 3] = src0[((char)permuters[i][3 * 2])/2] 
-	    + ((i + 1)%2  * scalars[0])
-	    + (((i >> 1)^1) * scalars[1])
-	    + (cntl2[i * 8 + 3] & scalars[2])
-	    + (cntl3[i * 8 + 3] & scalars[3]);
-	  target[i* 8 + 4] = src0[((char)permuters[i][4 * 2])/2] 
-	    + ((i + 1)%2  * scalars[0])
-	    + (((i >> 1)^1) * scalars[1])
-	    + (cntl2[i * 8 + 4] & scalars[2])
-	    + (cntl3[i * 8 + 4] & scalars[3]);
-	  target[i* 8 + 5] = src0[((char)permuters[i][5 * 2])/2] 
-	    + ((i + 1)%2  * scalars[0])
-	    + (((i >> 1)^1) * scalars[1])
-	    + (cntl2[i * 8 + 5] & scalars[2])
-	    + (cntl3[i * 8 + 5] & scalars[3]);
-	  target[i* 8 + 6] = src0[((char)permuters[i][6 * 2])/2] 
-	    + ((i + 1)%2  * scalars[0])
-	    + (((i >> 1)^1) * scalars[1])
-	    + (cntl2[i * 8 + 6] & scalars[2])
-	    + (cntl3[i * 8 + 6] & scalars[3]);
-	  target[i* 8 + 7] = src0[((char)permuters[i][7 * 2])/2] 
-	    + ((i + 1)%2  * scalars[0])
-	    + (((i >> 1)^1) * scalars[1])
-	    + (cntl2[i * 8 + 7] & scalars[2])
-	    + (cntl3[i * 8 + 7] & scalars[3]);
-	  
-	}
-}
-
-#endif /*LV_HAVE_GENERIC*/
-
-
-#endif /*INCLUDED_volk_16s_branch_4_state_8_a16_H*/
diff --git a/volk/include/volk/volk_16s_convert_8s_a16.h b/volk/include/volk/volk_16s_convert_8s_a16.h
deleted file mode 100644
index 13db435de..000000000
--- a/volk/include/volk/volk_16s_convert_8s_a16.h
+++ /dev/null
@@ -1,69 +0,0 @@
-#ifndef INCLUDED_volk_16s_convert_8s_a16_H
-#define INCLUDED_volk_16s_convert_8s_a16_H
-
-#include <inttypes.h>
-#include <stdio.h>
-
-#if LV_HAVE_SSE2
-#include <emmintrin.h>
-/*!
-  \brief Converts the input 16 bit integer data into 8 bit integer data
-  \param inputVector The 16 bit input data buffer
-  \param outputVector The 8 bit output data buffer
-  \param num_points The number of data values to be converted
-*/
-static inline void volk_16s_convert_8s_a16_sse2(int8_t* outputVector, const int16_t* inputVector, unsigned int num_points){
-    unsigned int number = 0;
-    const unsigned int sixteenthPoints = num_points / 16;
-    
-     int8_t* outputVectorPtr = outputVector;
-    int16_t* inputPtr = (int16_t*)inputVector;
-    __m128i inputVal1;
-    __m128i inputVal2;
-    __m128i ret;
-
-    for(;number < sixteenthPoints; number++){
-
-      // Load the 16 values
-      inputVal1 = _mm_load_si128((__m128i*)inputPtr); inputPtr += 8;
-      inputVal2 = _mm_load_si128((__m128i*)inputPtr); inputPtr += 8;
-
-      inputVal1 = _mm_srai_epi16(inputVal1, 8);
-      inputVal2 = _mm_srai_epi16(inputVal2, 8);
-      
-      ret = _mm_packs_epi16(inputVal1, inputVal2);
-
-      _mm_store_si128((__m128i*)outputVectorPtr, ret);
-
-      outputVectorPtr += 16;
-    }
-
-    number = sixteenthPoints * 16;
-    for(; number < num_points; number++){
-      outputVector[number] =(int8_t)(inputVector[number] >> 8);
-    }
-}
-#endif /* LV_HAVE_SSE2 */
-
-#ifdef LV_HAVE_GENERIC
-/*!
-  \brief Converts the input 16 bit integer data into 8 bit integer data
-  \param inputVector The 16 bit input data buffer
-  \param outputVector The 8 bit output data buffer
-  \param num_points The number of data values to be converted
-*/
-static inline void volk_16s_convert_8s_a16_generic(int8_t* outputVector, const int16_t* inputVector, unsigned int num_points){
-  int8_t* outputVectorPtr = outputVector;
-  const int16_t* inputVectorPtr = inputVector;
-  unsigned int number = 0;
-
-  for(number = 0; number < num_points; number++){
-    *outputVectorPtr++ = ((int8_t)(*inputVectorPtr++ >> 8));
-  }
-}
-#endif /* LV_HAVE_GENERIC */
-
-
-
-
-#endif /* INCLUDED_volk_16s_convert_8s_a16_H */
diff --git a/volk/include/volk/volk_16s_convert_8s_ua16.h b/volk/include/volk/volk_16s_convert_8s_ua16.h
deleted file mode 100644
index 9941118ae..000000000
--- a/volk/include/volk/volk_16s_convert_8s_ua16.h
+++ /dev/null
@@ -1,71 +0,0 @@
-#ifndef INCLUDED_volk_16s_convert_8s_ua16_H
-#define INCLUDED_volk_16s_convert_8s_ua16_H
-
-#include <inttypes.h>
-#include <stdio.h>
-
-#if LV_HAVE_SSE2
-#include <emmintrin.h>
-/*!
-  \brief Converts the input 16 bit integer data into 8 bit integer data
-  \param inputVector The 16 bit input data buffer
-  \param outputVector The 8 bit output data buffer
-  \param num_points The number of data values to be converted
-  \note Input and output buffers do NOT need to be properly aligned
-*/
-static inline void volk_16s_convert_8s_ua16_sse2(int8_t* outputVector, const int16_t* inputVector, unsigned int num_points){
-    unsigned int number = 0;
-    const unsigned int sixteenthPoints = num_points / 16;
-    
-     int8_t* outputVectorPtr = outputVector;
-    int16_t* inputPtr = (int16_t*)inputVector;
-    __m128i inputVal1;
-    __m128i inputVal2;
-    __m128i ret;
-
-    for(;number < sixteenthPoints; number++){
-
-      // Load the 16 values
-      inputVal1 = _mm_loadu_si128((__m128i*)inputPtr); inputPtr += 8;
-      inputVal2 = _mm_loadu_si128((__m128i*)inputPtr); inputPtr += 8;
-
-      inputVal1 = _mm_srai_epi16(inputVal1, 8);
-      inputVal2 = _mm_srai_epi16(inputVal2, 8);
-      
-      ret = _mm_packs_epi16(inputVal1, inputVal2);
-
-      _mm_storeu_si128((__m128i*)outputVectorPtr, ret);
-
-      outputVectorPtr += 16;
-    }
-
-    number = sixteenthPoints * 16;
-    for(; number < num_points; number++){
-      outputVector[number] =(int8_t)(inputVector[number] >> 8);
-    }
-}
-#endif /* LV_HAVE_SSE2 */
-
-#ifdef LV_HAVE_GENERIC
-/*!
-  \brief Converts the input 16 bit integer data into 8 bit integer data
-  \param inputVector The 16 bit input data buffer
-  \param outputVector The 8 bit output data buffer
-  \param num_points The number of data values to be converted
-  \note Input and output buffers do NOT need to be properly aligned
-*/
-static inline void volk_16s_convert_8s_ua16_generic(int8_t* outputVector, const int16_t* inputVector, unsigned int num_points){
-  int8_t* outputVectorPtr = outputVector;
-  const int16_t* inputVectorPtr = inputVector;
-  unsigned int number = 0;
-
-  for(number = 0; number < num_points; number++){
-    *outputVectorPtr++ = ((int8_t)(*inputVectorPtr++  >> 8));
-  }
-}
-#endif /* LV_HAVE_GENERIC */
-
-
-
-
-#endif /* INCLUDED_volk_16s_convert_8s_ua16_H */
diff --git a/volk/include/volk/volk_16s_max_star_16s_a16.h b/volk/include/volk/volk_16s_max_star_16s_a16.h
deleted file mode 100644
index b2ec90552..000000000
--- a/volk/include/volk/volk_16s_max_star_16s_a16.h
+++ /dev/null
@@ -1,108 +0,0 @@
-#ifndef INCLUDED_volk_16s_max_star_16s_a16_H
-#define INCLUDED_volk_16s_max_star_16s_a16_H
-
-
-#include<inttypes.h>
-#include<stdio.h>	
-
-
-#if LV_HAVE_SSSE3
-
-#include<xmmintrin.h>
-#include<emmintrin.h>
-#include<tmmintrin.h>
-
-static inline  void volk_16s_max_star_16s_a16_ssse3(short* target, short* src0, unsigned int num_bytes) {
-
-
-  
-  short candidate = src0[0];
-  short cands[8];
-  __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6;
-  
-
-  __m128i *p_src0;
-  
-  p_src0 = (__m128i*)src0;
-
-  int bound = num_bytes >> 4;
-  int leftovers = (num_bytes >> 1) & 7;
-  
-  int i = 0;
-  
-  
-  xmm1 = _mm_setzero_si128();
-  xmm0 = _mm_setzero_si128();
-  //_mm_insert_epi16(xmm0, candidate, 0);
-  
-  xmm0 = _mm_shuffle_epi8(xmm0, xmm1); 
-
-  
-  for(i = 0; i < bound; ++i) {
-    xmm1 = _mm_load_si128(p_src0);
-    p_src0 += 1;
-    xmm2 = _mm_sub_epi16(xmm1, xmm0);
-  
-
-    
-  
-  
-  
-    xmm3 = _mm_cmpgt_epi16(xmm0, xmm1);
-    xmm4 = _mm_cmpeq_epi16(xmm0, xmm1);
-    xmm5 = _mm_cmpgt_epi16(xmm1, xmm0);
-
-    xmm6 = _mm_xor_si128(xmm4, xmm5);
-    
-    xmm3 = _mm_and_si128(xmm3, xmm0);
-    xmm4 = _mm_and_si128(xmm6, xmm1);
-    
-    xmm0 = _mm_add_epi16(xmm3, xmm4);
-    
-  
-  }
-  
-  _mm_store_si128((__m128i*)cands, xmm0);
-  
-  for(i = 0; i < 8; ++i) {
-    candidate = ((short)(candidate - cands[i]) > 0) ? candidate : cands[i];
-  }
-  
- 
-  
-  for(i = 0; i < leftovers; ++i) {
-  
-    candidate = ((short)(candidate - src0[(bound << 3) + i]) > 0) ? candidate : src0[(bound << 3) + i];
-  }
-
-  target[0] = candidate;
-  
-    
-    
- 
-
-}   
- 
-#endif /*LV_HAVE_SSSE3*/
-
-#if LV_HAVE_GENERIC
-
-static inline void volk_16s_max_star_16s_a16_generic(short* target, short* src0, unsigned int num_bytes) {
-	
-	int i = 0;
-	
-	int bound = num_bytes >> 1;
-
-	short candidate = src0[0];
-	for(i = 1; i < bound; ++i) {
-	  candidate = ((short)(candidate - src0[i]) > 0) ? candidate : src0[i];
-	}
-	target[0] = candidate;
-	  
-}
-
-
-#endif /*LV_HAVE_GENERIC*/
-
-
-#endif /*INCLUDED_volk_16s_max_star_16s_a16_H*/
diff --git a/volk/include/volk/volk_16s_max_star_horizontal_16s_a16.h b/volk/include/volk/volk_16s_max_star_horizontal_16s_a16.h
deleted file mode 100644
index 68994593b..000000000
--- a/volk/include/volk/volk_16s_max_star_horizontal_16s_a16.h
+++ /dev/null
@@ -1,130 +0,0 @@
-#ifndef INCLUDED_volk_16s_max_star_horizontal_16s_a16_H
-#define INCLUDED_volk_16s_max_star_horizontal_16s_a16_H
-
-
-#include<inttypes.h>
-#include<stdio.h>	
-
-
-#if LV_HAVE_SSSE3
-
-#include<xmmintrin.h>
-#include<emmintrin.h>
-#include<tmmintrin.h>
-
-static inline  void volk_16s_max_star_horizontal_16s_a16_ssse3(int16_t* target, int16_t* src0, unsigned int num_bytes) {
-
-  const static uint8_t shufmask0[16] = {0x00, 0x01, 0x04, 0x05, 0x08, 0x09, 0x0c, 0x0d, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff};
-  const static uint8_t shufmask1[16] = {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x01, 0x04, 0x05, 0x08, 0x09, 0x0c, 0x0d};
-  const static uint8_t andmask0[16] = {0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02,0x02, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00};
-  const static uint8_t andmask1[16] = {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02};
-
-  
-  
-  volatile __m128i xmm0, xmm1, xmm2, xmm3, xmm4; 
-  __m128i  xmm5, xmm6, xmm7, xmm8;
-  
-  xmm4 = _mm_load_si128((__m128i*)shufmask0);
-  xmm5 = _mm_load_si128((__m128i*)shufmask1);
-  xmm6 = _mm_load_si128((__m128i*)andmask0);
-  xmm7 = _mm_load_si128((__m128i*)andmask1);
-  
-  __m128i *p_target, *p_src0;
-  
-  p_target = (__m128i*)target;
-  p_src0 = (__m128i*)src0;
-
-  int bound = num_bytes >> 5;
-  int intermediate = (num_bytes >> 4) & 1;
-  int leftovers = (num_bytes >> 1) & 7;
-  
-  int i = 0;
-  
-  
-  for(i = 0; i < bound; ++i) {
-     
-    xmm0 = _mm_load_si128(p_src0);
-    xmm1 = _mm_load_si128(&p_src0[1]);
-    
-    
-
-    xmm2 = _mm_xor_si128(xmm2, xmm2);
-    p_src0 += 2;
-    
-    xmm3 = _mm_hsub_epi16(xmm0, xmm1);
-    
-    xmm2 = _mm_cmpgt_epi16(xmm2, xmm3);   
-
-    xmm8 = _mm_and_si128(xmm2, xmm6);
-    xmm3 = _mm_and_si128(xmm2, xmm7);
-    
-
-    xmm8 = _mm_add_epi8(xmm8, xmm4);
-    xmm3 = _mm_add_epi8(xmm3, xmm5);
-
-    xmm0 = _mm_shuffle_epi8(xmm0, xmm8);
-    xmm1 = _mm_shuffle_epi8(xmm1, xmm3);
-    
-    
-    xmm3 = _mm_add_epi16(xmm0, xmm1);
-
-    
-    _mm_store_si128(p_target, xmm3);
-    
-    p_target += 1;
-  
-  }
-
-  for(i = 0; i < intermediate; ++i) {
-    
-    xmm0 = _mm_load_si128(p_src0);
-    
-    
-    xmm2 = _mm_xor_si128(xmm2, xmm2);
-    p_src0 += 1;
-    
-    xmm3 = _mm_hsub_epi16(xmm0, xmm1);
-    xmm2 = _mm_cmpgt_epi16(xmm2, xmm3);
-
-    xmm8 = _mm_and_si128(xmm2, xmm6);
-    
-    xmm3 = _mm_add_epi8(xmm8, xmm4);
-    
-    xmm0 = _mm_shuffle_epi8(xmm0, xmm3);
-    
-
-    _mm_storel_pd((double*)p_target, (__m128d)xmm0);
-    
-    p_target = (__m128i*)((int8_t*)p_target + 8);
-
-  }
-    
-  for(i = (bound << 4) + (intermediate << 3); i < (bound << 4) + (intermediate << 3) + leftovers ; i += 2) { 
-    target[i>>1] = ((int16_t)(src0[i] - src0[i + 1]) > 0) ? src0[i] : src0[i + 1];
-  }
-  
-
-}   
- 
-#endif /*LV_HAVE_SSSE3*/
-
-
-#if LV_HAVE_GENERIC
-static inline void volk_16s_max_star_horizontal_16s_a16_generic(int16_t* target, int16_t* src0, unsigned int num_bytes) {
-	
-	int i = 0;
-	
-	int bound = num_bytes >> 1;
-
-      
-	for(i = 0; i < bound; i += 2) {
-	  target[i >> 1] = ((int16_t) (src0[i] - src0[i + 1]) > 0) ? src0[i] : src0[i+1];
-	}
-		
-}
-
-
-
-#endif /*LV_HAVE_GENERIC*/
-
-#endif /*INCLUDED_volk_16s_max_star_horizontal_16s_a16_H*/
diff --git a/volk/include/volk/volk_16s_permute_and_scalar_add_a16.h b/volk/include/volk/volk_16s_permute_and_scalar_add_a16.h
deleted file mode 100644
index 2e7586b57..000000000
--- a/volk/include/volk/volk_16s_permute_and_scalar_add_a16.h
+++ /dev/null
@@ -1,139 +0,0 @@
-#ifndef INCLUDED_volk_16s_permute_and_scalar_add_a16_H
-#define INCLUDED_volk_16s_permute_and_scalar_add_a16_H
-
-
-#include<inttypes.h>
-#include<stdio.h>	
-
-
-
-
-#if LV_HAVE_SSE2
-
-#include<xmmintrin.h>
-#include<emmintrin.h>
-
-static inline  void volk_16s_permute_and_scalar_add_a16_sse2(short* target,  short* src0, short* permute_indexes, short* cntl0, short* cntl1, short* cntl2, short* cntl3, short* scalars, unsigned int num_bytes) {
-	
-
-  __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
-
-  __m128i *p_target, *p_cntl0, *p_cntl1, *p_cntl2, *p_cntl3, *p_scalars;
-
-  short* p_permute_indexes = permute_indexes;
-  
-  p_target = (__m128i*)target;
-  p_cntl0 = (__m128i*)cntl0;
-  p_cntl1 = (__m128i*)cntl1;
-  p_cntl2 = (__m128i*)cntl2;
-  p_cntl3 = (__m128i*)cntl3;
-  p_scalars = (__m128i*)scalars;
-  
-  int i = 0;
-  
-  int bound = (num_bytes >> 4);
-  int leftovers = (num_bytes >> 1) & 7;
-  
-  xmm0 = _mm_load_si128(p_scalars);
-  
-  xmm1 = _mm_shufflelo_epi16(xmm0, 0);
-  xmm2 = _mm_shufflelo_epi16(xmm0, 0x55);
-  xmm3 = _mm_shufflelo_epi16(xmm0, 0xaa);
-  xmm4 = _mm_shufflelo_epi16(xmm0, 0xff);
-  
-  xmm1 = _mm_shuffle_epi32(xmm1, 0x00);
-  xmm2 = _mm_shuffle_epi32(xmm2, 0x00);
-  xmm3 = _mm_shuffle_epi32(xmm3, 0x00);
-  xmm4 = _mm_shuffle_epi32(xmm4, 0x00);
-
-
-  for(; i < bound; ++i) {
-    xmm0 = _mm_setzero_si128();
-    xmm5 = _mm_setzero_si128();
-    xmm6 = _mm_setzero_si128();
-    xmm7 = _mm_setzero_si128();
-
-    xmm0 = _mm_insert_epi16(xmm0, src0[p_permute_indexes[0]], 0);
-    xmm5 = _mm_insert_epi16(xmm5, src0[p_permute_indexes[1]], 1);
-    xmm6 = _mm_insert_epi16(xmm6, src0[p_permute_indexes[2]], 2);
-    xmm7 = _mm_insert_epi16(xmm7, src0[p_permute_indexes[3]], 3);
-    xmm0 = _mm_insert_epi16(xmm0, src0[p_permute_indexes[4]], 4);
-    xmm5 = _mm_insert_epi16(xmm5, src0[p_permute_indexes[5]], 5);
-    xmm6 = _mm_insert_epi16(xmm6, src0[p_permute_indexes[6]], 6);
-    xmm7 = _mm_insert_epi16(xmm7, src0[p_permute_indexes[7]], 7);
-
-    xmm0 = _mm_add_epi16(xmm0, xmm5);
-    xmm6 = _mm_add_epi16(xmm6, xmm7);
-    
-    p_permute_indexes += 8;
-    
-    xmm0 = _mm_add_epi16(xmm0, xmm6);
-    
-    xmm5 = _mm_load_si128(p_cntl0);
-    xmm6 = _mm_load_si128(p_cntl1);
-    xmm7 = _mm_load_si128(p_cntl2);
-    
-    xmm5 = _mm_and_si128(xmm5, xmm1);
-    xmm6 = _mm_and_si128(xmm6, xmm2);
-    xmm7 = _mm_and_si128(xmm7, xmm3);
-    
-    xmm0 = _mm_add_epi16(xmm0, xmm5);
-    
-    xmm5 = _mm_load_si128(p_cntl3);
-    
-    xmm6 = _mm_add_epi16(xmm6, xmm7);
-
-    p_cntl0 += 1;
-    
-    xmm5 = _mm_and_si128(xmm5, xmm4);
-    
-    xmm0 = _mm_add_epi16(xmm0, xmm6);
-    
-    p_cntl1 += 1;
-    p_cntl2 += 1;
-    
-    xmm0 = _mm_add_epi16(xmm0, xmm5); 
-    
-    p_cntl3 += 1;
-
-    _mm_store_si128(p_target, xmm0);
-    
-    p_target += 1;
-  }
-	
-	
-	
-	
-
-  for(i = bound * 8; i < (bound * 8) + leftovers; ++i) {
-    target[i] = src0[permute_indexes[i]] 
-      + (cntl0[i] & scalars[0])
-      + (cntl1[i] & scalars[1])
-      + (cntl2[i] & scalars[2])
-      + (cntl3[i] & scalars[3]);
-  }
-}
-#endif /*LV_HAVE_SSEs*/
-
-
-#if LV_HAVE_GENERIC
-static inline void volk_16s_permute_and_scalar_add_a16_generic(short* target, short* src0, short* permute_indexes, short* cntl0, short* cntl1, short* cntl2, short* cntl3, short* scalars, unsigned int num_bytes) {
-	
-	int i = 0;
-	
-	int bound = num_bytes >> 1;
-
-	for(i = 0; i < bound; ++i) {
-		target[i] = src0[permute_indexes[i]] 
-			+ (cntl0[i] & scalars[0])
-			+ (cntl1[i] & scalars[1])
-			+ (cntl2[i] & scalars[2])
-			+ (cntl3[i] & scalars[3]);
-		
-	}
-}
-
-#endif /*LV_HAVE_GENERIC*/
-
-
-#endif /*INCLUDED_volk_16s_permute_and_scalar_add_a16_H*/
diff --git a/volk/include/volk/volk_16s_quad_max_star_16s_a16.h b/volk/include/volk/volk_16s_quad_max_star_16s_a16.h
deleted file mode 100644
index 3e89ff963..000000000
--- a/volk/include/volk/volk_16s_quad_max_star_16s_a16.h
+++ /dev/null
@@ -1,191 +0,0 @@
-#ifndef INCLUDED_volk_16s_quad_max_star_16s_a16_H
-#define INCLUDED_volk_16s_quad_max_star_16s_a16_H
-
-
-#include<inttypes.h>
-#include<stdio.h>	
-
-
-
-
-
-#if LV_HAVE_SSE2
-
-#include<emmintrin.h>
-
-static inline  void volk_16s_quad_max_star_16s_a16_sse2(short* target, short* src0, short* src1, short* src2, short* src3, unsigned int num_bytes) {
-	/* SSE2 kernel: per element, picks between src0/src1 and between src2/src3, */
-	/* then picks between the two winners; every pick tests whether the */
-	/* wrapped 16-bit difference of the pair is greater than zero. */
-
-	int i = 0;
-
-	int bound = (num_bytes >> 4); /* number of full 16-byte vectors (8 shorts each) */
-	int bound_copy = bound; /* loop counter; bound itself is reused by the scalar tail */
-	int leftovers = (num_bytes >> 1) & 7; /* shorts left over after the full vectors */
-	
-	__m128i *p_target, *p_src0, *p_src1, *p_src2, *p_src3;
-	p_target = (__m128i*) target;
-	p_src0 =  (__m128i*)src0;
-	p_src1 =  (__m128i*)src1;
-	p_src2 =  (__m128i*)src2;
-	p_src3 =  (__m128i*)src3;
-	
-	
-
-	__m128i xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
-
-	while(bound_copy > 0) {
-	 
-	  xmm1 = _mm_load_si128(p_src0); /* aligned loads: pointers must be 16-byte aligned */
-	  xmm2 = _mm_load_si128(p_src1);
-	  xmm3 = _mm_load_si128(p_src2);
-	  xmm4 = _mm_load_si128(p_src3);
-	  
-	  xmm5 = _mm_setzero_si128(); /* zero thresholds for the compares below */
-	  xmm6 = _mm_setzero_si128();
-	  xmm7 = xmm1; /* keep src0: xmm1 is overwritten by the difference */
-	  xmm8 = xmm3; /* keep src2: xmm3 is overwritten by the difference */
-	  
-	  
-	  xmm1 = _mm_sub_epi16(xmm2, xmm1); /* src1 - src0, wrapping */
-
-	  
-
-	  xmm3 = _mm_sub_epi16(xmm4, xmm3); /* src3 - src2, wrapping */
-
-	  xmm5 = _mm_cmpgt_epi16(xmm1, xmm5); /* all-ones lanes where src1 - src0 > 0 */
-	  xmm6 = _mm_cmpgt_epi16(xmm3, xmm6); /* all-ones lanes where src3 - src2 > 0 */
-
-	  
-
-	  xmm2 = _mm_and_si128(xmm5, xmm2); /* src1 in the selected lanes */
-	  xmm4 = _mm_and_si128(xmm6, xmm4); /* src3 in the selected lanes */
-	  xmm5 = _mm_andnot_si128(xmm5, xmm7); /* src0 in the remaining lanes */
-	  xmm6 = _mm_andnot_si128(xmm6, xmm8); /* src2 in the remaining lanes */
-
-	  xmm5 = _mm_add_epi16(xmm2, xmm5); /* lanes are disjoint, so the add merges them: winner of src0/src1 */
-	  xmm6 = _mm_add_epi16(xmm4, xmm6); /* winner of src2/src3 */
-
-	  
-	  xmm1 = _mm_xor_si128(xmm1, xmm1); /* zero xmm1 for the final compare */
-	  xmm2 = xmm5; /* keep the first winner */
-	  xmm5 = _mm_sub_epi16(xmm6, xmm5); /* second winner - first winner, wrapping */
-	  p_src0 += 1;
-	  bound_copy -= 1;
-
-	  xmm1 = _mm_cmpgt_epi16(xmm5, xmm1); /* all-ones lanes where the second winner is chosen */
-	  p_src1 += 1;
-	  
-	  xmm6 = _mm_and_si128(xmm1, xmm6);
-	  	  
-	  xmm1 = _mm_andnot_si128(xmm1, xmm2);
-	  p_src2 += 1;
-
-
-	  
-	  xmm1 = _mm_add_epi16(xmm6, xmm1); /* merged final result for 8 elements */
-	  p_src3 += 1;
-
-	  
-	  _mm_store_si128(p_target, xmm1); /* aligned store */
-	  p_target += 1;
-      
-	}
-	
-	/* The block below is a commented-out inline-assembly variant of the vector loop. */
-	/*asm volatile
-		(
-		 "volk_16s_quad_max_star_16s_a16_sse2_L1:\n\t"
-		 "cmp $0, %[bound]\n\t"
-		 "je volk_16s_quad_max_star_16s_a16_sse2_END\n\t"
-
-		 "movaps (%[src0]), %%xmm1\n\t"
-		 "movaps (%[src1]), %%xmm2\n\t"
-		 "movaps (%[src2]), %%xmm3\n\t"
-		 "movaps (%[src3]), %%xmm4\n\t"
-
-		 "pxor %%xmm5, %%xmm5\n\t"
-		 "pxor %%xmm6, %%xmm6\n\t"
-		 "movaps %%xmm1, %%xmm7\n\t"
-		 "movaps %%xmm3, %%xmm8\n\t"
-		 "psubw %%xmm2, %%xmm1\n\t"
-		 "psubw %%xmm4, %%xmm3\n\t"
-		 
-		 "pcmpgtw %%xmm1, %%xmm5\n\t"
-		 "pcmpgtw %%xmm3, %%xmm6\n\t"
-		 
-		 "pand %%xmm5, %%xmm2\n\t"
-		 "pand %%xmm6, %%xmm4\n\t"
-		 "pandn %%xmm7, %%xmm5\n\t"
-		 "pandn %%xmm8, %%xmm6\n\t"
-		 
-		 "paddw %%xmm2, %%xmm5\n\t"
-		 "paddw %%xmm4, %%xmm6\n\t"
-
-		 "pxor %%xmm1, %%xmm1\n\t"
-		 "movaps %%xmm5, %%xmm2\n\t"
-		 
-		 "psubw %%xmm6, %%xmm5\n\t"
-		 "add $16, %[src0]\n\t"
-		 "add $-1, %[bound]\n\t"
-	  
-		 "pcmpgtw %%xmm5, %%xmm1\n\t"
-		 "add $16, %[src1]\n\t"
-
-		 "pand %%xmm1, %%xmm6\n\t"
-
-		 "pandn %%xmm2, %%xmm1\n\t"
-		 "add $16, %[src2]\n\t"
-
-		 "paddw %%xmm6, %%xmm1\n\t"
-		 "add $16, %[src3]\n\t"
-
-		 "movaps %%xmm1, (%[target])\n\t"
-		 "addw $16, %[target]\n\t"
-		 "jmp volk_16s_quad_max_star_16s_a16_sse2_L1\n\t"
-		 
-		 "volk_16s_quad_max_star_16s_a16_sse2_END:\n\t"
-		 :
-		 :[bound]"r"(bound), [src0]"r"(src0), [src1]"r"(src1), [src2]"r"(src2), [src3]"r"(src3), [target]"r"(target)
-		 :
-		 );
-	*/ 
-
-	short temp0 = 0;
-	short temp1 = 0;
-	for(i = bound * 8; i < (bound * 8) + leftovers; ++i) { /* scalar tail for the elements that do not fill a vector */
-	  temp0 = ((short)(src0[i] - src1[i]) > 0) ? src0[i] : src1[i]; /* NOTE(review): tests src0 - src1 > 0, the vector loop tests src1 - src0 > 0; */
-	  temp1 = ((short)(src2[i] - src3[i])>0) ? src2[i] : src3[i]; /* the picks look like they differ when the difference wraps to -32768 - confirm */
-	  target[i] = ((short)(temp0 - temp1)>0) ? temp0 : temp1;
-	}
-	return;
-
-
-}
-
-#endif /*LV_HAVE_SSE2*/
-
-
-#if LV_HAVE_GENERIC
-static inline void volk_16s_quad_max_star_16s_a16_generic(short* target, short* src0, short* src1, short* src2, short* src3, unsigned int num_bytes) {
-	/* Portable scalar version: pick within each source pair, then pick between the two winners. */
-	int i = 0;
-	
-	int bound = num_bytes >> 1; /* element count; the shift treats every element as 2 bytes */
-	
-	short temp0 = 0;
-	short temp1 = 0;
-	for(i = 0; i < bound; ++i) {
-	  temp0 = ((short)(src0[i] - src1[i]) > 0) ? src0[i] : src1[i]; /* the difference is narrowed to short before the sign test */
-	  temp1 = ((short)(src2[i] - src3[i])>0) ? src2[i] : src3[i];
-	  target[i] = ((short)(temp0 - temp1)>0) ? temp0 : temp1; /* final pick between the two pair winners */
-	}
-}
-
-
-
-
-#endif /*LV_HAVE_GENERIC*/
-
-#endif /*INCLUDED_volk_16s_quad_max_star_16s_a16_H*/
diff --git a/volk/include/volk/volk_16s_s32f_convert_32f_a16.h b/volk/include/volk/volk_16s_s32f_convert_32f_a16.h
deleted file mode 100644
index 8f9b44478..000000000
--- a/volk/include/volk/volk_16s_s32f_convert_32f_a16.h
+++ /dev/null
@@ -1,119 +0,0 @@
-#ifndef INCLUDED_volk_16s_s32f_convert_32f_a16_H
-#define INCLUDED_volk_16s_s32f_convert_32f_a16_H
-
-#include <inttypes.h>
-#include <stdio.h>
-
-#if LV_HAVE_SSE4_1
-#include <smmintrin.h>
-
-  /*!
-    \brief Converts 16 bit integer input data to floating point and divides each output value by the scalar
-    \param outputVector The floating point output data buffer
-    \param inputVector The 16 bit input data buffer
-    \param scalar The divisor applied to every converted value
-    \param num_points The number of data values to be converted
-  */
-static inline void volk_16s_s32f_convert_32f_a16_sse4_1(float* outputVector, const int16_t* inputVector, const float scalar, unsigned int num_points){
-    unsigned int number = 0;
-    const unsigned int eighthPoints = num_points / 8; /* iterations that consume 8 values each */
-    
-     float* outputVectorPtr = outputVector;
-    __m128 invScalar = _mm_set_ps1(1.0/scalar); /* reciprocal, so the loop multiplies instead of dividing */
-    int16_t* inputPtr = (int16_t*)inputVector; /* cast drops const; the data is only read */
-    __m128i inputVal;
-    __m128i inputVal2;
-    __m128 ret;
-
-    for(;number < eighthPoints; number++){
-
-      // Load the 8 values (unaligned load)
-      inputVal = _mm_loadu_si128((__m128i*)inputPtr);
-
-      // Shift the input data to the right by 64 bits ( 8 bytes )
-      inputVal2 = _mm_srli_si128(inputVal, 8);
-
-      // Sign-extend the lower 4 values of each register into 32 bit words
-      inputVal = _mm_cvtepi16_epi32(inputVal);
-      inputVal2 = _mm_cvtepi16_epi32(inputVal2);
-      
-      ret = _mm_cvtepi32_ps(inputVal); /* values 0..3 as floats */
-      ret = _mm_mul_ps(ret, invScalar);
-      _mm_storeu_ps(outputVectorPtr, ret);
-      outputVectorPtr += 4;
-
-      ret = _mm_cvtepi32_ps(inputVal2); /* values 4..7 as floats */
-      ret = _mm_mul_ps(ret, invScalar);
-      _mm_storeu_ps(outputVectorPtr, ret);
-
-      outputVectorPtr += 4;
-
-      inputPtr += 8;
-    }
-
-    number = eighthPoints * 8; /* first value not handled by the vector loop */
-    for(; number < num_points; number++){
-      outputVector[number] =((float)(inputVector[number])) / scalar; /* tail divides directly instead of using invScalar */
-    }
-}
-#endif /* LV_HAVE_SSE4_1 */
-
-#if LV_HAVE_SSE
-#include <xmmintrin.h>
-
-  /*!
-    \brief Converts 16 bit integer input data to floating point and divides each output value by the scalar
-    \param outputVector The floating point output data buffer
-    \param inputVector The 16 bit input data buffer
-    \param scalar The divisor applied to every converted value
-    \param num_points The number of data values to be converted
-  */
-static inline void volk_16s_s32f_convert_32f_a16_sse(float* outputVector, const int16_t* inputVector, const float scalar, unsigned int num_points){
-    unsigned int number = 0;
-    const unsigned int quarterPoints = num_points / 4; /* iterations that consume 4 values each */
-    
-    float* outputVectorPtr = outputVector;
-    __m128 invScalar = _mm_set_ps1(1.0/scalar); /* reciprocal, so the loop multiplies instead of dividing */
-    int16_t* inputPtr = (int16_t*)inputVector; /* cast drops const; the data is only read */
-    __m128 ret;
-
-    for(;number < quarterPoints; number++){
-      ret = _mm_set_ps((float)(inputPtr[3]), (float)(inputPtr[2]), (float)(inputPtr[1]), (float)(inputPtr[0])); /* arguments run high lane to low lane, so inputPtr[0] lands in lane 0 */
-      
-      ret = _mm_mul_ps(ret, invScalar);
-      _mm_storeu_ps(outputVectorPtr, ret); /* unaligned store */
-
-      inputPtr += 4;
-      outputVectorPtr += 4;
-    }
-
-    number = quarterPoints * 4; /* first value not handled by the vector loop */
-    for(; number < num_points; number++){
-      outputVector[number] = (float)(inputVector[number]) / scalar; /* tail divides directly instead of using invScalar */
-    }
-}
-#endif /* LV_HAVE_SSE */
-
-#if LV_HAVE_GENERIC
-  /*!
-    \brief Converts 16 bit integer input data to floating point and divides each output value by the scalar
-    \param outputVector The floating point output data buffer
-    \param inputVector The 16 bit input data buffer
-    \param scalar The divisor applied to every converted value
-    \param num_points The number of data values to be converted
-  */
-static inline void volk_16s_s32f_convert_32f_a16_generic(float* outputVector, const int16_t* inputVector, const float scalar, unsigned int num_points){
-  float* outputVectorPtr = outputVector;
-  const int16_t* inputVectorPtr = inputVector;
-  unsigned int number = 0;
-
-  for(number = 0; number < num_points; number++){
-    *outputVectorPtr++ = ((float)(*inputVectorPtr++)) / scalar; /* convert one value, divide, advance both pointers */
-  }
-}
-#endif /* LV_HAVE_GENERIC */
-
-
-
-
-#endif /* INCLUDED_volk_16s_s32f_convert_32f_a16_H */
diff --git a/volk/include/volk/volk_16s_s32f_convert_32f_ua16.h b/volk/include/volk/volk_16s_s32f_convert_32f_ua16.h
deleted file mode 100644
index ad52aea1a..000000000
--- a/volk/include/volk/volk_16s_s32f_convert_32f_ua16.h
+++ /dev/null
@@ -1,122 +0,0 @@
-#ifndef INCLUDED_volk_16s_s32f_convert_32f_ua16_H
-#define INCLUDED_volk_16s_s32f_convert_32f_ua16_H
-
-#include <inttypes.h>
-#include <stdio.h>
-
-#if LV_HAVE_SSE4_1
-#include <smmintrin.h>
-
-  /*!
-    \brief Converts 16 bit integer input data to floating point and divides each output value by the scalar
-    \param outputVector The floating point output data buffer
-    \param inputVector The 16 bit input data buffer
-    \param scalar The divisor applied to every converted value
-    \param num_points The number of data values to be converted
-    \note Buffers do NOT need to be aligned: only unaligned loads and stores are used
-  */
-static inline void volk_16s_s32f_convert_32f_ua16_sse4_1(float* outputVector, const int16_t* inputVector, const float scalar, unsigned int num_points){
-    unsigned int number = 0;
-    const unsigned int eighthPoints = num_points / 8; /* iterations that consume 8 values each */
-    
-     float* outputVectorPtr = outputVector;
-    __m128 invScalar = _mm_set_ps1(1.0/scalar); /* reciprocal, so the loop multiplies instead of dividing */
-    int16_t* inputPtr = (int16_t*)inputVector; /* cast drops const; the data is only read */
-    __m128i inputVal;
-    __m128i inputVal2;
-    __m128 ret;
-
-    for(;number < eighthPoints; number++){
-
-      // Load the 8 values (unaligned load)
-      inputVal = _mm_loadu_si128((__m128i*)inputPtr);
-
-      // Shift the input data to the right by 64 bits ( 8 bytes )
-      inputVal2 = _mm_srli_si128(inputVal, 8);
-
-      // Sign-extend the lower 4 values of each register into 32 bit words
-      inputVal = _mm_cvtepi16_epi32(inputVal);
-      inputVal2 = _mm_cvtepi16_epi32(inputVal2);
-      
-      ret = _mm_cvtepi32_ps(inputVal); /* values 0..3 as floats */
-      ret = _mm_mul_ps(ret, invScalar);
-      _mm_storeu_ps(outputVectorPtr, ret);
-      outputVectorPtr += 4;
-
-      ret = _mm_cvtepi32_ps(inputVal2); /* values 4..7 as floats */
-      ret = _mm_mul_ps(ret, invScalar);
-      _mm_storeu_ps(outputVectorPtr, ret);
-
-      outputVectorPtr += 4;
-
-      inputPtr += 8;
-    }
-
-    number = eighthPoints * 8; /* first value not handled by the vector loop */
-    for(; number < num_points; number++){
-      outputVector[number] =((float)(inputVector[number])) / scalar; /* tail divides directly instead of using invScalar */
-    }
-}
-#endif /* LV_HAVE_SSE4_1 */
-
-#if LV_HAVE_SSE
-#include <xmmintrin.h>
-
-  /*!
-    \brief Converts 16 bit integer input data to floating point and divides each output value by the scalar
-    \param outputVector The floating point output data buffer
-    \param inputVector The 16 bit input data buffer
-    \param scalar The divisor applied to every converted value
-    \param num_points The number of data values to be converted
-    \note Buffers do NOT need to be aligned: input is read per element, output uses unaligned stores
-  */
-static inline void volk_16s_s32f_convert_32f_ua16_sse(float* outputVector, const int16_t* inputVector, const float scalar, unsigned int num_points){
-    unsigned int number = 0;
-    const unsigned int quarterPoints = num_points / 4; /* iterations that consume 4 values each */
-    
-    float* outputVectorPtr = outputVector;
-    __m128 invScalar = _mm_set_ps1(1.0/scalar); /* reciprocal, so the loop multiplies instead of dividing */
-    int16_t* inputPtr = (int16_t*)inputVector; /* cast drops const; the data is only read */
-    __m128 ret;
-
-    for(;number < quarterPoints; number++){
-      ret = _mm_set_ps((float)(inputPtr[3]), (float)(inputPtr[2]), (float)(inputPtr[1]), (float)(inputPtr[0])); /* arguments run high lane to low lane, so inputPtr[0] lands in lane 0 */
-      
-      ret = _mm_mul_ps(ret, invScalar);
-      _mm_storeu_ps(outputVectorPtr, ret); /* unaligned store */
-
-      inputPtr += 4;
-      outputVectorPtr += 4;
-    }
-
-    number = quarterPoints * 4; /* first value not handled by the vector loop */
-    for(; number < num_points; number++){
-      outputVector[number] = (float)(inputVector[number]) / scalar; /* tail divides directly instead of using invScalar */
-    }
-}
-#endif /* LV_HAVE_SSE */
-
-#if LV_HAVE_GENERIC
-  /*!
-    \brief Converts 16 bit integer input data to floating point and divides each output value by the scalar
-    \param outputVector The floating point output data buffer
-    \param inputVector The 16 bit input data buffer
-    \param scalar The divisor applied to every converted value
-    \param num_points The number of data values to be converted
-    \note Only plain scalar loads and stores are used; no vector alignment is assumed
-  */
-static inline void volk_16s_s32f_convert_32f_ua16_generic(float* outputVector, const int16_t* inputVector, const float scalar, unsigned int num_points){
-  float* outputVectorPtr = outputVector;
-  const int16_t* inputVectorPtr = inputVector;
-  unsigned int number = 0;
-
-  for(number = 0; number < num_points; number++){
-    *outputVectorPtr++ = ((float)(*inputVectorPtr++)) / scalar; /* convert one value, divide, advance both pointers */
-  }
-}
-#endif /* LV_HAVE_GENERIC */
-
-
-
-
-#endif /* INCLUDED_volk_16s_s32f_convert_32f_ua16_H */
diff --git a/volk/include/volk/volk_16sc_deinterleave_16s_16s_a16.h b/volk/include/volk/volk_16sc_deinterleave_16s_16s_a16.h
deleted file mode 100644
index 8e5da24ec..000000000
--- a/volk/include/volk/volk_16sc_deinterleave_16s_16s_a16.h
+++ /dev/null
@@ -1,158 +0,0 @@
-#ifndef INCLUDED_volk_16sc_deinterleave_16s_16s_a16_H
-#define INCLUDED_volk_16sc_deinterleave_16s_16s_a16_H
-
-#include <inttypes.h>
-#include <stdio.h>
-
-#if LV_HAVE_SSSE3
-#include <tmmintrin.h>
-/*!
-  \brief Deinterleaves the complex 16 bit vector into I & Q vector data
-  \param iBuffer The I buffer output data
-  \param qBuffer The Q buffer output data
-  \param complexVector The complex input vector (read as interleaved I, Q 16 bit values)
-  \param num_points The number of complex data values to be deinterleaved
-*/
-static inline void volk_16sc_deinterleave_16s_16s_a16_ssse3(int16_t* iBuffer, int16_t* qBuffer, const lv_16sc_t* complexVector, unsigned int num_points){
-  unsigned int number = 0;
-  const int8_t* complexVectorPtr = (int8_t*)complexVector; /* byte pointer: advanced 16 bytes per vector load */
-  int16_t* iBufferPtr = iBuffer;
-  int16_t* qBufferPtr = qBuffer;
-
-  __m128i iMoveMask1 = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 13, 12, 9, 8, 5, 4, 1, 0); /* I bytes to the low half; 0x80 zeroes a byte */
-  __m128i iMoveMask2 = _mm_set_epi8(13, 12, 9, 8, 5, 4, 1, 0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80); /* I bytes to the high half */
-
-  __m128i qMoveMask1 = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 15, 14, 11, 10, 7, 6, 3, 2); /* Q bytes to the low half */
-  __m128i qMoveMask2 = _mm_set_epi8(15, 14, 11, 10, 7, 6, 3, 2, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80); /* Q bytes to the high half */
-
-  __m128i complexVal1, complexVal2, iOutputVal, qOutputVal;
-
-  unsigned int eighthPoints = num_points / 8; /* each iteration handles 8 complex values (32 bytes) */
-
-  for(number = 0; number < eighthPoints; number++){
-    complexVal1 = _mm_load_si128((__m128i*)complexVectorPtr);  complexVectorPtr += 16; /* aligned load of complex values 0..3 */
-    complexVal2 = _mm_load_si128((__m128i*)complexVectorPtr);  complexVectorPtr += 16; /* aligned load of complex values 4..7 */
-
-    iOutputVal = _mm_or_si128( _mm_shuffle_epi8(complexVal1, iMoveMask1) , _mm_shuffle_epi8(complexVal2, iMoveMask2)); /* zeroed halves make the OR a merge */
-    qOutputVal = _mm_or_si128( _mm_shuffle_epi8(complexVal1, qMoveMask1) , _mm_shuffle_epi8(complexVal2, qMoveMask2));
-
-    _mm_store_si128((__m128i*)iBufferPtr, iOutputVal); /* aligned stores */
-    _mm_store_si128((__m128i*)qBufferPtr, qOutputVal);
-
-    iBufferPtr += 8;
-    qBufferPtr += 8;
-  }
-
-  number = eighthPoints * 8; /* first complex value not handled by the vector loop */
-  int16_t* int16ComplexVectorPtr = (int16_t*)complexVectorPtr; /* tail walks the input as 16 bit values; cast drops const */
-  for(; number < num_points; number++){
-    *iBufferPtr++ = *int16ComplexVectorPtr++;
-    *qBufferPtr++ = *int16ComplexVectorPtr++;
-  }
-}
-#endif /* LV_HAVE_SSSE3 */
-
-#if LV_HAVE_SSE2
-#include <emmintrin.h>
-/*!
-  \brief Deinterleaves the complex 16 bit vector into I & Q vector data
-  \param iBuffer The I buffer output data
-  \param qBuffer The Q buffer output data
-  \param complexVector The complex input vector (read as interleaved I, Q 16 bit values)
-  \param num_points The number of complex data values to be deinterleaved
-*/
-static inline void volk_16sc_deinterleave_16s_16s_a16_sse2(int16_t* iBuffer, int16_t* qBuffer, const lv_16sc_t* complexVector, unsigned int num_points){
-  unsigned int number = 0;
-  const int16_t* complexVectorPtr = (int16_t*)complexVector;
-  int16_t* iBufferPtr = iBuffer;
-  int16_t* qBufferPtr = qBuffer;
-  __m128i complexVal1, complexVal2, iComplexVal1, iComplexVal2, qComplexVal1, qComplexVal2, iOutputVal, qOutputVal;
-  __m128i lowMask = _mm_set_epi32(0x0, 0x0, 0xFFFFFFFF, 0xFFFFFFFF); /* keeps the low 64 bits */
-  __m128i highMask = _mm_set_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0x0, 0x0); /* keeps the high 64 bits */
-
-  unsigned int eighthPoints = num_points / 8; /* each iteration handles 8 complex values */
- 
-  for(number = 0; number < eighthPoints; number++){
-    complexVal1 = _mm_load_si128((__m128i*)complexVectorPtr);  complexVectorPtr += 8; /* aligned load of complex values 0..3 */
-    complexVal2 = _mm_load_si128((__m128i*)complexVectorPtr);  complexVectorPtr += 8; /* aligned load of complex values 4..7 */
-
-    iComplexVal1 = _mm_shufflelo_epi16(complexVal1, _MM_SHUFFLE(3,1,2,0)); /* low four words: I pair first, then Q pair */
-
-    iComplexVal1 = _mm_shufflehi_epi16(iComplexVal1, _MM_SHUFFLE(3,1,2,0)); /* same reordering for the high four words */
-
-    iComplexVal1 = _mm_shuffle_epi32(iComplexVal1, _MM_SHUFFLE(3,1,2,0)); /* both I dwords into the low 64 bits */
-
-    iComplexVal2 = _mm_shufflelo_epi16(complexVal2, _MM_SHUFFLE(3,1,2,0));
-
-    iComplexVal2 = _mm_shufflehi_epi16(iComplexVal2, _MM_SHUFFLE(3,1,2,0));
-
-    iComplexVal2 = _mm_shuffle_epi32(iComplexVal2, _MM_SHUFFLE(2,0,3,1)); /* both I dwords into the high 64 bits */
-
-    iOutputVal = _mm_or_si128(_mm_and_si128(iComplexVal1, lowMask), _mm_and_si128(iComplexVal2, highMask)); /* I0..I3 low, I4..I7 high */
-
-    _mm_store_si128((__m128i*)iBufferPtr, iOutputVal);
-
-    qComplexVal1 = _mm_shufflelo_epi16(complexVal1, _MM_SHUFFLE(2,0,3,1)); /* low four words: Q pair first, then I pair */
-
-    qComplexVal1 = _mm_shufflehi_epi16(qComplexVal1, _MM_SHUFFLE(2,0,3,1));
-
-    qComplexVal1 = _mm_shuffle_epi32(qComplexVal1, _MM_SHUFFLE(3,1,2,0)); /* both Q dwords into the low 64 bits */
-
-    qComplexVal2 = _mm_shufflelo_epi16(complexVal2, _MM_SHUFFLE(2,0,3,1));
-
-    qComplexVal2 = _mm_shufflehi_epi16(qComplexVal2, _MM_SHUFFLE(2,0,3,1));
-
-    qComplexVal2 = _mm_shuffle_epi32(qComplexVal2, _MM_SHUFFLE(2,0,3,1)); /* both Q dwords into the high 64 bits */
-
-    qOutputVal = _mm_or_si128(_mm_and_si128(qComplexVal1, lowMask), _mm_and_si128(qComplexVal2, highMask)); /* Q0..Q3 low, Q4..Q7 high */
-
-    _mm_store_si128((__m128i*)qBufferPtr, qOutputVal);
-
-    iBufferPtr += 8;
-    qBufferPtr += 8;
-  }
-
-  number = eighthPoints * 8; /* first complex value not handled by the vector loop */
-  for(; number < num_points; number++){
-    *iBufferPtr++ = *complexVectorPtr++;
-    *qBufferPtr++ = *complexVectorPtr++;
-  }
-}
-#endif /* LV_HAVE_SSE2 */
-
-#if LV_HAVE_GENERIC
-/*!
-  \brief Deinterleaves the complex 16 bit vector into I & Q vector data
-  \param iBuffer The I buffer output data
-  \param qBuffer The Q buffer output data
-  \param complexVector The complex input vector (read as interleaved I, Q 16 bit values)
-  \param num_points The number of complex data values to be deinterleaved
-*/
-static inline void volk_16sc_deinterleave_16s_16s_a16_generic(int16_t* iBuffer, int16_t* qBuffer, const lv_16sc_t* complexVector, unsigned int num_points){
-  const int16_t* complexVectorPtr = (const int16_t*)complexVector;
-  int16_t* iBufferPtr = iBuffer;
-  int16_t* qBufferPtr = qBuffer;
-  unsigned int number;
-  for(number = 0; number < num_points; number++){
-    *iBufferPtr++ = *complexVectorPtr++; /* first 16 bit value of the pair goes to I */
-    *qBufferPtr++ = *complexVectorPtr++; /* second 16 bit value of the pair goes to Q */
-  }
-}
-#endif /* LV_HAVE_GENERIC */
-
-#if LV_HAVE_ORC
-/*!
-  \brief Deinterleaves the complex 16 bit vector into I & Q vector data by forwarding to the external _orc_impl routine
-  \param iBuffer The I buffer output data
-  \param qBuffer The Q buffer output data
-  \param complexVector The complex input vector
-  \param num_points The number of complex data values to be deinterleaved
-*/
-extern void volk_16sc_deinterleave_16s_16s_a16_orc_impl(int16_t* iBuffer, int16_t* qBuffer, const lv_16sc_t* complexVector, unsigned int num_points); /* defined outside this header */
-static inline void volk_16sc_deinterleave_16s_16s_a16_orc(int16_t* iBuffer, int16_t* qBuffer, const lv_16sc_t* complexVector, unsigned int num_points){
-    volk_16sc_deinterleave_16s_16s_a16_orc_impl(iBuffer, qBuffer, complexVector, num_points); /* arguments are passed through unchanged */
-}
-#endif /* LV_HAVE_ORC */
-
-
-#endif /* INCLUDED_volk_16sc_deinterleave_16s_16s_a16_H */
diff --git a/volk/include/volk/volk_16sc_deinterleave_real_16s_a16.h b/volk/include/volk/volk_16sc_deinterleave_real_16s_a16.h
deleted file mode 100644
index 068c1350c..000000000
--- a/volk/include/volk/volk_16sc_deinterleave_real_16s_a16.h
+++ /dev/null
@@ -1,120 +0,0 @@
-#ifndef INCLUDED_volk_16sc_deinterleave_real_16s_a16_H
-#define INCLUDED_volk_16sc_deinterleave_real_16s_a16_H
-
-#include <inttypes.h>
-#include <stdio.h>
-
-#if LV_HAVE_SSSE3
-#include <tmmintrin.h>
-/*!
-  \brief Deinterleaves the complex 16 bit vector into I vector data
-  \param iBuffer The I buffer output data
-  \param complexVector The complex input vector (read as interleaved I, Q 16 bit values)
-  \param num_points The number of complex data values to be deinterleaved
-*/
-static inline void volk_16sc_deinterleave_real_16s_a16_ssse3(int16_t* iBuffer, const lv_16sc_t* complexVector, unsigned int num_points){
-  unsigned int number = 0;
-  const int16_t* complexVectorPtr = (int16_t*)complexVector;
-  int16_t* iBufferPtr = iBuffer;
-
-  __m128i iMoveMask1 = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 13, 12, 9, 8, 5, 4, 1, 0); /* I bytes to the low half; 0x80 zeroes a byte */
-  __m128i iMoveMask2 = _mm_set_epi8(13, 12, 9, 8, 5, 4, 1, 0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80); /* I bytes to the high half */
-
-  __m128i complexVal1, complexVal2, iOutputVal;
-
-  unsigned int eighthPoints = num_points / 8; /* each iteration handles 8 complex values */
-
-  for(number = 0; number < eighthPoints; number++){
-    complexVal1 = _mm_load_si128((__m128i*)complexVectorPtr);  complexVectorPtr += 8; /* aligned load of complex values 0..3 */
-    complexVal2 = _mm_load_si128((__m128i*)complexVectorPtr);  complexVectorPtr += 8; /* aligned load of complex values 4..7 */
-
-    complexVal1 = _mm_shuffle_epi8(complexVal1, iMoveMask1);
-    complexVal2 = _mm_shuffle_epi8(complexVal2, iMoveMask2);
-
-    iOutputVal = _mm_or_si128(complexVal1, complexVal2); /* zeroed halves make the OR a merge */
-
-    _mm_store_si128((__m128i*)iBufferPtr, iOutputVal); /* aligned store */
-
-    iBufferPtr += 8;
-  }
-
-  number = eighthPoints * 8; /* first complex value not handled by the vector loop */
-  for(; number < num_points; number++){
-    *iBufferPtr++ = *complexVectorPtr++;
-    complexVectorPtr++; /* skip the Q value */
-  }
-}
-#endif /* LV_HAVE_SSSE3 */
-
-
-#if LV_HAVE_SSE2
-#include <emmintrin.h>
-/*!
-  \brief Deinterleaves the complex 16 bit vector into I vector data
-  \param iBuffer The I buffer output data
-  \param complexVector The complex input vector (read as interleaved I, Q 16 bit values)
-  \param num_points The number of complex data values to be deinterleaved
-*/
-static inline void volk_16sc_deinterleave_real_16s_a16_sse2(int16_t* iBuffer, const lv_16sc_t* complexVector, unsigned int num_points){
-  unsigned int number = 0;
-  const int16_t* complexVectorPtr = (int16_t*)complexVector;
-  int16_t* iBufferPtr = iBuffer;
-  __m128i complexVal1, complexVal2, iOutputVal;
-  __m128i lowMask = _mm_set_epi32(0x0, 0x0, 0xFFFFFFFF, 0xFFFFFFFF); /* keeps the low 64 bits */
-  __m128i highMask = _mm_set_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0x0, 0x0); /* keeps the high 64 bits */
-
-  unsigned int eighthPoints = num_points / 8; /* each iteration handles 8 complex values */
- 
-  for(number = 0; number < eighthPoints; number++){
-    complexVal1 = _mm_load_si128((__m128i*)complexVectorPtr);  complexVectorPtr += 8; /* aligned load of complex values 0..3 */
-    complexVal2 = _mm_load_si128((__m128i*)complexVectorPtr);  complexVectorPtr += 8; /* aligned load of complex values 4..7 */
-
-    complexVal1 = _mm_shufflelo_epi16(complexVal1, _MM_SHUFFLE(3,1,2,0)); /* low four words: I pair first, then Q pair */
-
-    complexVal1 = _mm_shufflehi_epi16(complexVal1, _MM_SHUFFLE(3,1,2,0)); /* same reordering for the high four words */
-
-    complexVal1 = _mm_shuffle_epi32(complexVal1, _MM_SHUFFLE(3,1,2,0)); /* both I dwords into the low 64 bits */
-
-    complexVal2 = _mm_shufflelo_epi16(complexVal2, _MM_SHUFFLE(3,1,2,0));
-
-    complexVal2 = _mm_shufflehi_epi16(complexVal2, _MM_SHUFFLE(3,1,2,0));
-
-    complexVal2 = _mm_shuffle_epi32(complexVal2, _MM_SHUFFLE(2,0,3,1)); /* both I dwords into the high 64 bits */
-
-    iOutputVal = _mm_or_si128(_mm_and_si128(complexVal1, lowMask), _mm_and_si128(complexVal2, highMask)); /* I0..I3 low, I4..I7 high */
-
-    _mm_store_si128((__m128i*)iBufferPtr, iOutputVal); /* aligned store */
-
-    iBufferPtr += 8;
-  }
-
-  number = eighthPoints * 8; /* first complex value not handled by the vector loop */
-  for(; number < num_points; number++){
-    *iBufferPtr++ = *complexVectorPtr++;
-    complexVectorPtr++; /* skip the Q value */
-  }
-}
-#endif /* LV_HAVE_SSE2 */
-
-#if LV_HAVE_GENERIC
-/*!
-  \brief Deinterleaves the complex 16 bit vector into I vector data
-  \param iBuffer The I buffer output data
-  \param complexVector The complex input vector (read as interleaved I, Q 16 bit values)
-  \param num_points The number of complex data values to be deinterleaved
-*/
-static inline void volk_16sc_deinterleave_real_16s_a16_generic(int16_t* iBuffer, const lv_16sc_t* complexVector, unsigned int num_points){
-  unsigned int number = 0;
-  const int16_t* complexVectorPtr = (int16_t*)complexVector;
-  int16_t* iBufferPtr = iBuffer;
-  for(number = 0; number < num_points; number++){
-    *iBufferPtr++ = *complexVectorPtr++; /* copy the I value */
-    complexVectorPtr++; /* skip the Q value */
-  }
-}
-#endif /* LV_HAVE_GENERIC */
-
-
-
-
-#endif /* INCLUDED_volk_16sc_deinterleave_real_16s_a16_H */
diff --git a/volk/include/volk/volk_16sc_deinterleave_real_8s_a16.h b/volk/include/volk/volk_16sc_deinterleave_real_8s_a16.h
deleted file mode 100644
index afa21ebc4..000000000
--- a/volk/include/volk/volk_16sc_deinterleave_real_8s_a16.h
+++ /dev/null
@@ -1,94 +0,0 @@
-#ifndef INCLUDED_volk_16sc_deinterleave_real_8s_a16_H
-#define INCLUDED_volk_16sc_deinterleave_real_8s_a16_H
-
-#include <inttypes.h>
-#include <stdio.h>
-
-#if LV_HAVE_SSSE3
-#include <tmmintrin.h>
-/*!
-  \brief Deinterleaves the complex 16 bit vector into 8 bit I vector data (each I value arithmetically shifted right by 8)
-  \param iBuffer The I buffer output data
-  \param complexVector The complex input vector (read as interleaved I, Q 16 bit values)
-  \param num_points The number of complex data values to be deinterleaved
-*/
-static inline void volk_16sc_deinterleave_real_8s_a16_ssse3(int8_t* iBuffer, const lv_16sc_t* complexVector, unsigned int num_points){
-  unsigned int number = 0;
-  const int8_t* complexVectorPtr = (int8_t*)complexVector; /* byte pointer: advanced 16 bytes per vector load */
-  int8_t* iBufferPtr = iBuffer;
-  __m128i iMoveMask1 = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 13, 12, 9, 8, 5, 4, 1, 0); /* I bytes to the low half; 0x80 zeroes a byte */
-  __m128i iMoveMask2 = _mm_set_epi8(13, 12, 9, 8, 5, 4, 1, 0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80); /* I bytes to the high half */
-  __m128i complexVal1, complexVal2, complexVal3, complexVal4, iOutputVal;
-
-  unsigned int sixteenthPoints = num_points / 16; /* each iteration handles 16 complex values (64 bytes) */
-
-  for(number = 0; number < sixteenthPoints; number++){
-    complexVal1 = _mm_load_si128((__m128i*)complexVectorPtr);  complexVectorPtr += 16; /* aligned loads */
-    complexVal2 = _mm_load_si128((__m128i*)complexVectorPtr);  complexVectorPtr += 16;
-
-    complexVal3 = _mm_load_si128((__m128i*)complexVectorPtr);  complexVectorPtr += 16;
-    complexVal4 = _mm_load_si128((__m128i*)complexVectorPtr);  complexVectorPtr += 16;
-
-    complexVal1 = _mm_shuffle_epi8(complexVal1, iMoveMask1);
-    complexVal2 = _mm_shuffle_epi8(complexVal2, iMoveMask2);
-
-    complexVal1 = _mm_or_si128(complexVal1, complexVal2); /* I0..I7 as 16 bit values */
-
-    complexVal3 = _mm_shuffle_epi8(complexVal3, iMoveMask1);
-    complexVal4 = _mm_shuffle_epi8(complexVal4, iMoveMask2);
-
-    complexVal3 = _mm_or_si128(complexVal3, complexVal4); /* I8..I15 as 16 bit values */
-
-
-    complexVal1 = _mm_srai_epi16(complexVal1, 8); /* arithmetic shift keeps the sign; result fits in 8 bits */
-    complexVal3 = _mm_srai_epi16(complexVal3, 8);
-
-    iOutputVal = _mm_packs_epi16(complexVal1, complexVal3); /* narrow the 16 values to 8 bits each */
-
-    _mm_store_si128((__m128i*)iBufferPtr, iOutputVal); /* aligned store */
-
-    iBufferPtr += 16;
-  }
-
-  number = sixteenthPoints * 16; /* first complex value not handled by the vector loop */
-  int16_t* int16ComplexVectorPtr = (int16_t*)complexVectorPtr; /* tail walks the input as 16 bit values; cast drops const */
-  for(; number < num_points; number++){
-    *iBufferPtr++ = ((int8_t)(*int16ComplexVectorPtr++ >> 8)); /* shift, not / 256: matches _mm_srai_epi16 for negative samples (assumes arithmetic >> on signed values) */
-    int16ComplexVectorPtr++; /* skip the Q value */
-  }
-}
-#endif /* LV_HAVE_SSSE3 */
-
-#if LV_HAVE_GENERIC
-/*!
-  \brief Deinterleaves the complex 16 bit vector into 8 bit I vector data (each I value divided by 256)
-  \param iBuffer The I buffer output data
-  \param complexVector The complex input vector (read as interleaved I, Q 16 bit values)
-  \param num_points The number of complex data values to be deinterleaved
-*/
-static inline void volk_16sc_deinterleave_real_8s_a16_generic(int8_t* iBuffer, const lv_16sc_t* complexVector, unsigned int num_points){
-  unsigned int number = 0;
-  const int16_t* complexVectorPtr = (int16_t*)complexVector;
-  int8_t* iBufferPtr = iBuffer;
-  for(number = 0; number < num_points; number++){
-    *iBufferPtr++ = (int8_t)(*complexVectorPtr++ / 256); /* NOTE(review): division rounds toward zero; a shift-based kernel would differ for negative samples - confirm intended rounding */
-    complexVectorPtr++; /* skip the Q value */
-  }
-}
-#endif /* LV_HAVE_GENERIC */
-
-#if LV_HAVE_ORC
-/*!
-  \brief Deinterleaves the complex 16 bit vector into 8 bit I vector data by forwarding to the external _orc_impl routine
-  \param iBuffer The I buffer output data
-  \param complexVector The complex input vector
-  \param num_points The number of complex data values to be deinterleaved
-*/
-extern void volk_16sc_deinterleave_real_8s_a16_orc_impl(int8_t* iBuffer, const lv_16sc_t* complexVector, unsigned int num_points); /* defined outside this header */
-static inline void volk_16sc_deinterleave_real_8s_a16_orc(int8_t* iBuffer, const lv_16sc_t* complexVector, unsigned int num_points){
-    volk_16sc_deinterleave_real_8s_a16_orc_impl(iBuffer, complexVector, num_points); /* arguments are passed through unchanged */
-}
-#endif /* LV_HAVE_ORC */
-
-
-#endif /* INCLUDED_volk_16sc_deinterleave_real_8s_a16_H */
diff --git a/volk/include/volk/volk_16sc_magnitude_16s_a16.h b/volk/include/volk/volk_16sc_magnitude_16s_a16.h
deleted file mode 100644
index d832de5fe..000000000
--- a/volk/include/volk/volk_16sc_magnitude_16s_a16.h
+++ /dev/null
@@ -1,190 +0,0 @@
-#ifndef INCLUDED_volk_16sc_magnitude_16s_a16_H
-#define INCLUDED_volk_16sc_magnitude_16s_a16_H
-
-#include <inttypes.h>
-#include <stdio.h>
-#include <math.h>
-
-#if LV_HAVE_SSE3
-#include <pmmintrin.h>
-/*!
-  \brief Calculates the magnitude of the complexVector and stores the results in the magnitudeVector
-  \param complexVector The vector containing the complex input values
-  \param magnitudeVector The vector containing the real output values
-  \param num_points The number of complex values in complexVector to be calculated and stored into cVector
-*/
-static inline void volk_16sc_magnitude_16s_a16_sse3(int16_t* magnitudeVector, const lv_16sc_t* complexVector, unsigned int num_points){
-  unsigned int number = 0;
-  const unsigned int quarterPoints = num_points / 4;
-   
-  const int16_t* complexVectorPtr = (const int16_t*)complexVector;
-  int16_t* magnitudeVectorPtr = magnitudeVector;
-
-  __m128 vScalar = _mm_set_ps1(32768.0);
-  __m128 invScalar = _mm_set_ps1(1.0/32768.0);
-
-  __m128 cplxValue1, cplxValue2, result;
-
-  float inputFloatBuffer[8] __attribute__((aligned(128)));
-  float outputFloatBuffer[4] __attribute__((aligned(128)));
-
-  for(;number < quarterPoints; number++){
-
-    inputFloatBuffer[0] = (float)(complexVectorPtr[0]);
-    inputFloatBuffer[1] = (float)(complexVectorPtr[1]);
-    inputFloatBuffer[2] = (float)(complexVectorPtr[2]);
-    inputFloatBuffer[3] = (float)(complexVectorPtr[3]);
-      
-    inputFloatBuffer[4] = (float)(complexVectorPtr[4]);
-    inputFloatBuffer[5] = (float)(complexVectorPtr[5]);
-    inputFloatBuffer[6] = (float)(complexVectorPtr[6]);
-    inputFloatBuffer[7] = (float)(complexVectorPtr[7]);
-
-    cplxValue1 = _mm_load_ps(&inputFloatBuffer[0]);
-    cplxValue2 = _mm_load_ps(&inputFloatBuffer[4]);
-
-    complexVectorPtr += 8;
-
-    cplxValue1 = _mm_mul_ps(cplxValue1, invScalar);
-    cplxValue2 = _mm_mul_ps(cplxValue2, invScalar);
-
-    cplxValue1 = _mm_mul_ps(cplxValue1, cplxValue1); // Square the values
-    cplxValue2 = _mm_mul_ps(cplxValue2, cplxValue2); // Square the Values
-
-    result = _mm_hadd_ps(cplxValue1, cplxValue2); // Add the I2 and Q2 values
-
-    result = _mm_sqrt_ps(result); // Square root the values
-
-    result = _mm_mul_ps(result, vScalar); // Scale the results
-
-    _mm_store_ps(outputFloatBuffer, result);
-    *magnitudeVectorPtr++ = (int16_t)(outputFloatBuffer[0]);
-    *magnitudeVectorPtr++ = (int16_t)(outputFloatBuffer[1]);
-    *magnitudeVectorPtr++ = (int16_t)(outputFloatBuffer[2]);
-    *magnitudeVectorPtr++ = (int16_t)(outputFloatBuffer[3]);
-  }
-
-  number = quarterPoints * 4;
-  magnitudeVectorPtr = &magnitudeVector[number];
-  complexVectorPtr = (const int16_t*)&complexVector[number];
-  for(; number < num_points; number++){
-    const float val1Real = (float)(*complexVectorPtr++) / 32768.0;
-    const float val1Imag = (float)(*complexVectorPtr++) / 32768.0;
-    const float val1Result = sqrtf((val1Real * val1Real) + (val1Imag * val1Imag)) * 32768.0;
-    *magnitudeVectorPtr++ = (int16_t)(val1Result);
-  }
-}
-#endif /* LV_HAVE_SSE3 */
-
-#if LV_HAVE_SSE
-#include <xmmintrin.h>
-/*!
-  \brief Calculates the magnitude of the complexVector and stores the results in the magnitudeVector
-  \param complexVector The vector containing the complex input values
-  \param magnitudeVector The vector containing the real output values
-  \param num_points The number of complex values in complexVector to be calculated and stored into cVector
-*/
-static inline void volk_16sc_magnitude_16s_a16_sse(int16_t* magnitudeVector, const lv_16sc_t* complexVector, unsigned int num_points){
-  unsigned int number = 0;
-  const unsigned int quarterPoints = num_points / 4;
-
-  const int16_t* complexVectorPtr = (const int16_t*)complexVector;
-  int16_t* magnitudeVectorPtr = magnitudeVector;
-
-  __m128 vScalar = _mm_set_ps1(32768.0);
-  __m128 invScalar = _mm_set_ps1(1.0/32768.0);
-
-  __m128 cplxValue1, cplxValue2, iValue, qValue, result;
-
-  float inputFloatBuffer[4] __attribute__((aligned(128)));
-  float outputFloatBuffer[4] __attribute__((aligned(128)));
-
-  for(;number < quarterPoints; number++){
-
-    inputFloatBuffer[0] = (float)(complexVectorPtr[0]);
-    inputFloatBuffer[1] = (float)(complexVectorPtr[1]);
-    inputFloatBuffer[2] = (float)(complexVectorPtr[2]);
-    inputFloatBuffer[3] = (float)(complexVectorPtr[3]);
-      
-    cplxValue1 = _mm_load_ps(inputFloatBuffer);
-    complexVectorPtr += 4;
-
-    inputFloatBuffer[0] = (float)(complexVectorPtr[0]);
-    inputFloatBuffer[1] = (float)(complexVectorPtr[1]);
-    inputFloatBuffer[2] = (float)(complexVectorPtr[2]);
-    inputFloatBuffer[3] = (float)(complexVectorPtr[3]);
-
-    cplxValue2 = _mm_load_ps(inputFloatBuffer);
-    complexVectorPtr += 4;
-
-    cplxValue1 = _mm_mul_ps(cplxValue1, invScalar);
-    cplxValue2 = _mm_mul_ps(cplxValue2, invScalar);
-
-    // Arrange in i1i2i3i4 format
-    iValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2,0,2,0));
-    // Arrange in q1q2q3q4 format
-    qValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(3,1,3,1));
-
-    iValue = _mm_mul_ps(iValue, iValue); // Square the I values
-    qValue = _mm_mul_ps(qValue, qValue); // Square the Q Values
-
-    result = _mm_add_ps(iValue, qValue); // Add the I2 and Q2 values
-
-    result = _mm_sqrt_ps(result); // Square root the values
-
-    result = _mm_mul_ps(result, vScalar); // Scale the results
-
-    _mm_store_ps(outputFloatBuffer, result);
-    *magnitudeVectorPtr++ = (int16_t)(outputFloatBuffer[0]);
-    *magnitudeVectorPtr++ = (int16_t)(outputFloatBuffer[1]);
-    *magnitudeVectorPtr++ = (int16_t)(outputFloatBuffer[2]);
-    *magnitudeVectorPtr++ = (int16_t)(outputFloatBuffer[3]);
-  }
-
-  number = quarterPoints * 4;
-  magnitudeVectorPtr = &magnitudeVector[number];
-  complexVectorPtr = (const int16_t*)&complexVector[number];
-  for(; number < num_points; number++){
-    const float val1Real = (float)(*complexVectorPtr++) / 32768.0;
-    const float val1Imag = (float)(*complexVectorPtr++) / 32768.0;
-    const float val1Result = sqrtf((val1Real * val1Real) + (val1Imag * val1Imag)) * 32768.0;
-    *magnitudeVectorPtr++ = (int16_t)(val1Result);
-  }
-}
-#endif /* LV_HAVE_SSE */
-
-#if LV_HAVE_GENERIC
-/*!
-  \brief Calculates the magnitude of the complexVector and stores the results in the magnitudeVector
-  \param complexVector The vector containing the complex input values
-  \param magnitudeVector The vector containing the real output values
-  \param num_points The number of complex values in complexVector to be calculated and stored into cVector
-*/
-static inline void volk_16sc_magnitude_16s_a16_generic(int16_t* magnitudeVector, const lv_16sc_t* complexVector, unsigned int num_points){
-  const int16_t* complexVectorPtr = (const int16_t*)complexVector;
-  int16_t* magnitudeVectorPtr = magnitudeVector;
-  unsigned int number = 0;
-  const float scalar = 32768.0;
-  for(number = 0; number < num_points; number++){
-    float real = ((float)(*complexVectorPtr++)) / scalar;
-    float imag = ((float)(*complexVectorPtr++)) / scalar;
-    *magnitudeVectorPtr++ = (int16_t)(sqrtf((real*real) + (imag*imag)) * scalar);
-  }
-}
-#endif /* LV_HAVE_GENERIC */
-
-#if LV_HAVE_ORC_DISABLED
-/*!
-  \brief Calculates the magnitude of the complexVector and stores the results in the magnitudeVector
-  \param complexVector The vector containing the complex input values
-  \param magnitudeVector The vector containing the real output values
-  \param num_points The number of complex values in complexVector to be calculated and stored into cVector
-*/
-extern void volk_16sc_magnitude_16s_a16_orc_impl(int16_t* magnitudeVector, const lv_16sc_t* complexVector, float scalar, unsigned int num_points);
-static inline void volk_16sc_magnitude_16s_a16_orc(int16_t* magnitudeVector, const lv_16sc_t* complexVector, unsigned int num_points){
-    volk_16sc_magnitude_16s_a16_orc_impl(magnitudeVector, complexVector, 32768.0, num_points);
-}
-#endif /* LV_HAVE_ORC */
-
-
-#endif /* INCLUDED_volk_16sc_magnitude_16s_a16_H */
diff --git a/volk/include/volk/volk_16sc_s32f_deinterleave_32f_32f_a16.h b/volk/include/volk/volk_16sc_s32f_deinterleave_32f_32f_a16.h
deleted file mode 100644
index 53e4253c4..000000000
--- a/volk/include/volk/volk_16sc_s32f_deinterleave_32f_32f_a16.h
+++ /dev/null
@@ -1,108 +0,0 @@
-#ifndef INCLUDED_volk_16sc_s32f_deinterleave_32f_32f_a16_H
-#define INCLUDED_volk_16sc_s32f_deinterleave_32f_32f_a16_H
-
-#include <inttypes.h>
-#include <stdio.h>
-
-#if LV_HAVE_SSE
-#include <xmmintrin.h>
-  /*!
-    \brief Converts the complex 16 bit vector into floats,scales each data point, and deinterleaves into I & Q vector data
-    \param complexVector The complex input vector
-    \param iBuffer The I buffer output data
-    \param qBuffer The Q buffer output data
-    \param scalar The data value to be divided against each input data value of the input complex vector
-    \param num_points The number of complex data values to be deinterleaved
-  */
-static inline void volk_16sc_s32f_deinterleave_32f_32f_a16_sse(float* iBuffer, float* qBuffer, const lv_16sc_t* complexVector, const float scalar, unsigned int num_points){
-    float* iBufferPtr = iBuffer;
-    float* qBufferPtr = qBuffer;
-
-    uint64_t number = 0;
-    const uint64_t quarterPoints = num_points / 4;    
-    __m128 cplxValue1, cplxValue2, iValue, qValue;
-
-    __m128 invScalar = _mm_set_ps1(1.0/scalar);
-    int16_t* complexVectorPtr = (int16_t*)complexVector;
-
-    float floatBuffer[8] __attribute__((aligned(128)));
-
-    for(;number < quarterPoints; number++){
-      
-      floatBuffer[0] = (float)(complexVectorPtr[0]);
-      floatBuffer[1] = (float)(complexVectorPtr[1]);
-      floatBuffer[2] = (float)(complexVectorPtr[2]);
-      floatBuffer[3] = (float)(complexVectorPtr[3]);
-      
-      floatBuffer[4] = (float)(complexVectorPtr[4]);
-      floatBuffer[5] = (float)(complexVectorPtr[5]);
-      floatBuffer[6] = (float)(complexVectorPtr[6]);
-      floatBuffer[7] = (float)(complexVectorPtr[7]);
-
-      cplxValue1 = _mm_load_ps(&floatBuffer[0]);
-      cplxValue2 = _mm_load_ps(&floatBuffer[4]);
-
-      complexVectorPtr += 8;
-
-      cplxValue1 = _mm_mul_ps(cplxValue1, invScalar);
-      cplxValue2 = _mm_mul_ps(cplxValue2, invScalar);
-
-      // Arrange in i1i2i3i4 format
-      iValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2,0,2,0));
-      // Arrange in q1q2q3q4 format
-      qValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(3,1,3,1));
-
-      _mm_store_ps(iBufferPtr, iValue);
-      _mm_store_ps(qBufferPtr, qValue);
-
-      iBufferPtr += 4;
-      qBufferPtr += 4;
-    }
-
-    number = quarterPoints * 4;
-    complexVectorPtr = (int16_t*)&complexVector[number];
-    for(; number < num_points; number++){
-      *iBufferPtr++ = (float)(*complexVectorPtr++) / scalar;
-      *qBufferPtr++ = (float)(*complexVectorPtr++) / scalar;
-    }
-}
-#endif /* LV_HAVE_SSE */
-
-#if LV_HAVE_GENERIC
-  /*!
-    \brief Converts the complex 16 bit vector into floats,scales each data point, and deinterleaves into I & Q vector data
-    \param complexVector The complex input vector
-    \param iBuffer The I buffer output data
-    \param qBuffer The Q buffer output data
-    \param scalar The data value to be divided against each input data value of the input complex vector
-    \param num_points The number of complex data values to be deinterleaved
-  */
-static inline void volk_16sc_s32f_deinterleave_32f_32f_a16_generic(float* iBuffer, float* qBuffer, const lv_16sc_t* complexVector, const float scalar, unsigned int num_points){
-  const int16_t* complexVectorPtr = (const int16_t*)complexVector;
-  float* iBufferPtr = iBuffer;
-  float* qBufferPtr = qBuffer;
-  unsigned int number;
-  for(number = 0; number < num_points; number++){
-    *iBufferPtr++ = (float)(*complexVectorPtr++) / scalar;
-    *qBufferPtr++ = (float)(*complexVectorPtr++) / scalar;
-  }
-}
-#endif /* LV_HAVE_GENERIC */
-
-#if LV_HAVE_ORC
-  /*!
-    \brief Converts the complex 16 bit vector into floats,scales each data point, and deinterleaves into I & Q vector data
-    \param complexVector The complex input vector
-    \param iBuffer The I buffer output data
-    \param qBuffer The Q buffer output data
-    \param scalar The data value to be divided against each input data value of the input complex vector
-    \param num_points The number of complex data values to be deinterleaved
-  */
-extern void volk_16sc_s32f_deinterleave_32f_32f_a16_orc_impl(float* iBuffer, float* qBuffer, const lv_16sc_t* complexVector, const float scalar, unsigned int num_points);
-static inline void volk_16sc_s32f_deinterleave_32f_32f_a16_orc(float* iBuffer, float* qBuffer, const lv_16sc_t* complexVector, const float scalar, unsigned int num_points){
-    volk_16sc_s32f_deinterleave_32f_32f_a16_orc_impl(iBuffer, qBuffer, complexVector, scalar, num_points);
-}
-#endif /* LV_HAVE_ORC */
-
-
-#endif /* INCLUDED_volk_16sc_s32f_deinterleave_32f_32f_a16_H */
diff --git a/volk/include/volk/volk_16sc_s32f_deinterleave_real_32f_a16.h b/volk/include/volk/volk_16sc_s32f_deinterleave_real_32f_a16.h
deleted file mode 100644
index 7320db368..000000000
--- a/volk/include/volk/volk_16sc_s32f_deinterleave_real_32f_a16.h
+++ /dev/null
@@ -1,125 +0,0 @@
-#ifndef INCLUDED_volk_16sc_s32f_deinterleave_real_32f_a16_H
-#define INCLUDED_volk_16sc_s32f_deinterleave_real_32f_a16_H
-
-#include <inttypes.h>
-#include <stdio.h>
-
-#if LV_HAVE_SSE4_1
-#include <smmintrin.h>
-/*!
-  \brief Deinterleaves the complex 16 bit vector into I float vector data
-  \param complexVector The complex input vector
-  \param iBuffer The I buffer output data
-  \param scalar The scaling value being multiplied against each data point
-  \param num_points The number of complex data values to be deinterleaved
-*/
-static inline void volk_16sc_s32f_deinterleave_real_32f_a16_sse4_1(float* iBuffer, const lv_16sc_t* complexVector, const float scalar, unsigned int num_points){
-  float* iBufferPtr = iBuffer;
-
-  unsigned int number = 0;
-  const unsigned int quarterPoints = num_points / 4;    
-
-  __m128 iFloatValue;
-
-  const float iScalar= 1.0 / scalar;
-  __m128 invScalar = _mm_set_ps1(iScalar);
-  __m128i complexVal, iIntVal;
-  int8_t* complexVectorPtr = (int8_t*)complexVector;
-
-  __m128i moveMask = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 13, 12, 9, 8, 5, 4, 1, 0);
-
-  for(;number < quarterPoints; number++){
-    complexVal = _mm_load_si128((__m128i*)complexVectorPtr); complexVectorPtr += 16;
-    complexVal = _mm_shuffle_epi8(complexVal, moveMask);
-
-    iIntVal = _mm_cvtepi16_epi32(complexVal);
-    iFloatValue = _mm_cvtepi32_ps(iIntVal);
-
-    iFloatValue = _mm_mul_ps(iFloatValue, invScalar);
-
-    _mm_store_ps(iBufferPtr, iFloatValue);
-
-    iBufferPtr += 4;
-  }
-
-  number = quarterPoints * 4;
-  int16_t* sixteenTComplexVectorPtr = (int16_t*)&complexVector[number];
-  for(; number < num_points; number++){
-    *iBufferPtr++ = ((float)(*sixteenTComplexVectorPtr++)) * iScalar;
-    sixteenTComplexVectorPtr++;
-  }
-    
-}
-#endif /* LV_HAVE_SSE4_1 */
-
-#if LV_HAVE_SSE
-#include <xmmintrin.h>
-/*!
-  \brief Deinterleaves the complex 16 bit vector into I float vector data
-  \param complexVector The complex input vector
-  \param iBuffer The I buffer output data
-  \param scalar The scaling value being multiplied against each data point
-  \param num_points The number of complex data values to be deinterleaved
-*/
-static inline void volk_16sc_s32f_deinterleave_real_32f_a16_sse(float* iBuffer, const lv_16sc_t* complexVector, const float scalar, unsigned int num_points){
-  float* iBufferPtr = iBuffer;
-
-  unsigned int number = 0;
-  const unsigned int quarterPoints = num_points / 4;    
-  __m128 iValue;
-
-  const float iScalar = 1.0/scalar;
-  __m128 invScalar = _mm_set_ps1(iScalar);
-  int16_t* complexVectorPtr = (int16_t*)complexVector;
-
-  float floatBuffer[4] __attribute__((aligned(128)));
-
-  for(;number < quarterPoints; number++){
-    floatBuffer[0] = (float)(*complexVectorPtr); complexVectorPtr += 2;
-    floatBuffer[1] = (float)(*complexVectorPtr); complexVectorPtr += 2; 
-    floatBuffer[2] = (float)(*complexVectorPtr); complexVectorPtr += 2;
-    floatBuffer[3] = (float)(*complexVectorPtr); complexVectorPtr += 2;
-
-    iValue = _mm_load_ps(floatBuffer);
-
-    iValue = _mm_mul_ps(iValue, invScalar);
-
-    _mm_store_ps(iBufferPtr, iValue);
-
-    iBufferPtr += 4;
-  }
-
-  number = quarterPoints * 4;
-  complexVectorPtr = (int16_t*)&complexVector[number];
-  for(; number < num_points; number++){
-    *iBufferPtr++ = ((float)(*complexVectorPtr++)) * iScalar;
-    complexVectorPtr++;
-  }
-    
-}
-#endif /* LV_HAVE_SSE */
-
-#if LV_HAVE_GENERIC
-/*!
-  \brief Deinterleaves the complex 16 bit vector into I float vector data
-  \param complexVector The complex input vector
-  \param iBuffer The I buffer output data
-  \param scalar The scaling value being multiplied against each data point
-  \param num_points The number of complex data values to be deinterleaved
-*/
-static inline void volk_16sc_s32f_deinterleave_real_32f_a16_generic(float* iBuffer, const lv_16sc_t* complexVector, const float scalar, unsigned int num_points){
-  unsigned int number = 0;
-  const int16_t* complexVectorPtr = (const int16_t*)complexVector;
-  float* iBufferPtr = iBuffer;
-  const float invScalar = 1.0 / scalar;
-  for(number = 0; number < num_points; number++){
-    *iBufferPtr++ = ((float)(*complexVectorPtr++)) * invScalar;
-    complexVectorPtr++;
-  }
-}
-#endif /* LV_HAVE_GENERIC */
-
-
-
-
-#endif /* INCLUDED_volk_16sc_s32f_deinterleave_real_32f_a16_H */
diff --git a/volk/include/volk/volk_16sc_s32f_magnitude_32f_a16.h b/volk/include/volk/volk_16sc_s32f_magnitude_32f_a16.h
deleted file mode 100644
index 649b5cc96..000000000
--- a/volk/include/volk/volk_16sc_s32f_magnitude_32f_a16.h
+++ /dev/null
@@ -1,179 +0,0 @@
-#ifndef INCLUDED_volk_16sc_s32f_magnitude_32f_a16_H
-#define INCLUDED_volk_16sc_s32f_magnitude_32f_a16_H
-
-#include <inttypes.h>
-#include <stdio.h>
-#include <math.h>
-
-#if LV_HAVE_SSE3
-#include <pmmintrin.h>
-/*!
-  \brief Calculates the magnitude of the complexVector and stores the results in the magnitudeVector
-  \param complexVector The vector containing the complex input values
-  \param magnitudeVector The vector containing the real output values
-  \param scalar The data value to be divided against each input data value of the input complex vector
-  \param num_points The number of complex values in complexVector to be calculated and stored into cVector
-*/
-static inline void volk_16sc_s32f_magnitude_32f_a16_sse3(float* magnitudeVector, const lv_16sc_t* complexVector, const float scalar, unsigned int num_points){
-  unsigned int number = 0;
-  const unsigned int quarterPoints = num_points / 4;
-  
-  const int16_t* complexVectorPtr = (const int16_t*)complexVector;
-  float* magnitudeVectorPtr = magnitudeVector;
-
-  __m128 invScalar = _mm_set_ps1(1.0/scalar);
-
-  __m128 cplxValue1, cplxValue2, result;
-
-  float inputFloatBuffer[8] __attribute__((aligned(128)));
-
-  for(;number < quarterPoints; number++){
-
-    inputFloatBuffer[0] = (float)(complexVectorPtr[0]);
-    inputFloatBuffer[1] = (float)(complexVectorPtr[1]);
-    inputFloatBuffer[2] = (float)(complexVectorPtr[2]);
-    inputFloatBuffer[3] = (float)(complexVectorPtr[3]);
-      
-    inputFloatBuffer[4] = (float)(complexVectorPtr[4]);
-    inputFloatBuffer[5] = (float)(complexVectorPtr[5]);
-    inputFloatBuffer[6] = (float)(complexVectorPtr[6]);
-    inputFloatBuffer[7] = (float)(complexVectorPtr[7]);
-
-    cplxValue1 = _mm_load_ps(&inputFloatBuffer[0]);
-    cplxValue2 = _mm_load_ps(&inputFloatBuffer[4]);
-
-    complexVectorPtr += 8;
-
-    cplxValue1 = _mm_mul_ps(cplxValue1, invScalar);
-    cplxValue2 = _mm_mul_ps(cplxValue2, invScalar);
-
-    cplxValue1 = _mm_mul_ps(cplxValue1, cplxValue1); // Square the values
-    cplxValue2 = _mm_mul_ps(cplxValue2, cplxValue2); // Square the Values
-
-    result = _mm_hadd_ps(cplxValue1, cplxValue2); // Add the I2 and Q2 values
-
-    result = _mm_sqrt_ps(result); // Square root the values
-
-    _mm_store_ps(magnitudeVectorPtr, result);
-      
-    magnitudeVectorPtr += 4;
-  }
-
-  number = quarterPoints * 4;
-  magnitudeVectorPtr = &magnitudeVector[number];
-  complexVectorPtr = (const int16_t*)&complexVector[number];
-  for(; number < num_points; number++){
-    float val1Real = (float)(*complexVectorPtr++) / scalar;
-    float val1Imag = (float)(*complexVectorPtr++) / scalar;
-    *magnitudeVectorPtr++ = sqrtf((val1Real * val1Real) + (val1Imag * val1Imag));
-  }
-}
-#endif /* LV_HAVE_SSE3 */
-
-#if LV_HAVE_SSE
-#include <xmmintrin.h>
-/*!
-  \brief Calculates the magnitude of the complexVector and stores the results in the magnitudeVector
-  \param complexVector The vector containing the complex input values
-  \param magnitudeVector The vector containing the real output values
-  \param scalar The data value to be divided against each input data value of the input complex vector
-  \param num_points The number of complex values in complexVector to be calculated and stored into cVector
-*/
-static inline void volk_16sc_s32f_magnitude_32f_a16_sse(float* magnitudeVector, const lv_16sc_t* complexVector, const float scalar, unsigned int num_points){
-  unsigned int number = 0;
-  const unsigned int quarterPoints = num_points / 4;
-
-  const int16_t* complexVectorPtr = (const int16_t*)complexVector;
-  float* magnitudeVectorPtr = magnitudeVector;
-
-  const float iScalar = 1.0 / scalar;
-  __m128 invScalar = _mm_set_ps1(iScalar);
-
-  __m128 cplxValue1, cplxValue2, result, re, im;
-
-  float inputFloatBuffer[8] __attribute__((aligned(128)));
-
-  for(;number < quarterPoints; number++){
-    inputFloatBuffer[0] = (float)(complexVectorPtr[0]);
-    inputFloatBuffer[1] = (float)(complexVectorPtr[1]);
-    inputFloatBuffer[2] = (float)(complexVectorPtr[2]);
-    inputFloatBuffer[3] = (float)(complexVectorPtr[3]);
-      
-    inputFloatBuffer[4] = (float)(complexVectorPtr[4]);
-    inputFloatBuffer[5] = (float)(complexVectorPtr[5]);
-    inputFloatBuffer[6] = (float)(complexVectorPtr[6]);
-    inputFloatBuffer[7] = (float)(complexVectorPtr[7]);
-
-    cplxValue1 = _mm_load_ps(&inputFloatBuffer[0]);
-    cplxValue2 = _mm_load_ps(&inputFloatBuffer[4]);
-    
-    re = _mm_shuffle_ps(cplxValue1, cplxValue2, 0x88);
-    im = _mm_shuffle_ps(cplxValue1, cplxValue2, 0xdd);
-
-    complexVectorPtr += 8;
-
-    cplxValue1 = _mm_mul_ps(re, invScalar);
-    cplxValue2 = _mm_mul_ps(im, invScalar);
-
-    cplxValue1 = _mm_mul_ps(cplxValue1, cplxValue1); // Square the values
-    cplxValue2 = _mm_mul_ps(cplxValue2, cplxValue2); // Square the Values
-
-    result = _mm_add_ps(cplxValue1, cplxValue2); // Add the I2 and Q2 values
-
-    result = _mm_sqrt_ps(result); // Square root the values
-
-    _mm_store_ps(magnitudeVectorPtr, result);
-      
-    magnitudeVectorPtr += 4;
-  }
-
-  number = quarterPoints * 4;
-  magnitudeVectorPtr = &magnitudeVector[number];
-  complexVectorPtr = (const int16_t*)&complexVector[number];
-  for(; number < num_points; number++){
-    float val1Real = (float)(*complexVectorPtr++) * iScalar;
-    float val1Imag = (float)(*complexVectorPtr++) * iScalar;
-    *magnitudeVectorPtr++ = sqrtf((val1Real * val1Real) + (val1Imag * val1Imag));
-  }
-}
-
- 
-#endif /* LV_HAVE_SSE */
-
-#if LV_HAVE_GENERIC
-/*!
-  \brief Calculates the magnitude of the complexVector and stores the results in the magnitudeVector
-  \param complexVector The vector containing the complex input values
-  \param magnitudeVector The vector containing the real output values
-  \param scalar The data value to be divided against each input data value of the input complex vector
-  \param num_points The number of complex values in complexVector to be calculated and stored into cVector
-*/
-static inline void volk_16sc_s32f_magnitude_32f_a16_generic(float* magnitudeVector, const lv_16sc_t* complexVector, const float scalar, unsigned int num_points){
-  const int16_t* complexVectorPtr = (const int16_t*)complexVector;
-  float* magnitudeVectorPtr = magnitudeVector;
-  unsigned int number = 0;
-  const float invScalar = 1.0 / scalar;
-  for(number = 0; number < num_points; number++){
-    float real = ( (float) (*complexVectorPtr++)) * invScalar;
-    float imag = ( (float) (*complexVectorPtr++)) * invScalar;
-    *magnitudeVectorPtr++ = sqrtf((real*real) + (imag*imag));
-  }
-}
-#endif /* LV_HAVE_GENERIC */
-
-#if LV_HAVE_ORC_DISABLED
-/*!
-  \brief Calculates the magnitude of the complexVector and stores the results in the magnitudeVector
-  \param complexVector The vector containing the complex input values
-  \param magnitudeVector The vector containing the real output values
-  \param scalar The data value to be divided against each input data value of the input complex vector
-  \param num_points The number of complex values in complexVector to be calculated and stored into cVector
-*/
-extern void volk_16sc_s32f_magnitude_32f_a16_orc_impl(float* magnitudeVector, const lv_16sc_t* complexVector, const float scalar, unsigned int num_points);
-static inline void volk_16sc_s32f_magnitude_32f_a16_orc(float* magnitudeVector, const lv_16sc_t* complexVector, const float scalar, unsigned int num_points){
-    volk_16sc_s32f_magnitude_32f_a16_orc_impl(magnitudeVector, complexVector, scalar, num_points);
-}
-#endif /* LV_HAVE_ORC */
-
-
-#endif /* INCLUDED_volk_16sc_s32f_magnitude_32f_a16_H */
diff --git a/volk/include/volk/volk_32f_32f_32f_sum_of_poly_32f_a16.h b/volk/include/volk/volk_32f_32f_32f_sum_of_poly_32f_a16.h
deleted file mode 100644
index a0f97f94e..000000000
--- a/volk/include/volk/volk_32f_32f_32f_sum_of_poly_32f_a16.h
+++ /dev/null
@@ -1,151 +0,0 @@
-#ifndef INCLUDED_volk_32f_32f_32f_sum_of_poly_32f_a16_H
-#define INCLUDED_volk_32f_32f_32f_sum_of_poly_32f_a16_H
-
-#include<inttypes.h>
-#include<stdio.h>
-#include<volk/volk_complex.h>
-
-#ifndef MAX
-#define MAX(X,Y) ((X) > (Y)?(X):(Y))
-#endif
-
-#if LV_HAVE_SSE3
-#include<xmmintrin.h>
-#include<pmmintrin.h>
-
-static inline void volk_32f_32f_32f_sum_of_poly_32f_a16_sse3(float* target, float* src0, float* center_point_array, float* cutoff, unsigned int num_bytes) {
-  
-  
-  float result = 0.0;
-  float fst = 0.0;
-  float sq = 0.0;
-  float thrd = 0.0;
-  float frth = 0.0;
-  //float fith = 0.0;
-  
-  
-  
-  __m128 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10;// xmm11, xmm12;
-
-  xmm9 = _mm_setzero_ps();
-  xmm1 = _mm_setzero_ps();
-  
-  xmm0 = _mm_load1_ps(&center_point_array[0]);
-  xmm6 = _mm_load1_ps(&center_point_array[1]);
-  xmm7 = _mm_load1_ps(&center_point_array[2]);
-  xmm8 = _mm_load1_ps(&center_point_array[3]);
-  //xmm11 = _mm_load1_ps(&center_point_array[4]);
-  xmm10 = _mm_load1_ps(cutoff);
-  
-  int bound = num_bytes >> 4;
-  int leftovers = (num_bytes >> 2) & 3;
-  int i = 0;
-  
-  for(; i < bound; ++i) {
-    xmm2 = _mm_load_ps(src0);
-    xmm2 = _mm_max_ps(xmm10, xmm2);
-    xmm3 = _mm_mul_ps(xmm2, xmm2);
-    xmm4 = _mm_mul_ps(xmm2, xmm3);
-    xmm5 = _mm_mul_ps(xmm3, xmm3);
-    //xmm12 = _mm_mul_ps(xmm3, xmm4);
-
-    xmm2 = _mm_mul_ps(xmm2, xmm0);
-    xmm3 = _mm_mul_ps(xmm3, xmm6);
-    xmm4 = _mm_mul_ps(xmm4, xmm7);
-    xmm5 = _mm_mul_ps(xmm5, xmm8);
-    //xmm12 = _mm_mul_ps(xmm12, xmm11);
-
-    xmm2 = _mm_add_ps(xmm2, xmm3);
-    xmm3 = _mm_add_ps(xmm4, xmm5);
-    
-    src0 += 4;
-    
-    xmm9 = _mm_add_ps(xmm2, xmm9);
-    
-    xmm1 = _mm_add_ps(xmm3, xmm1);
-
-    //xmm9 = _mm_add_ps(xmm12, xmm9);
-  }
-  
-  xmm2 = _mm_hadd_ps(xmm9, xmm1);
-  xmm3 = _mm_hadd_ps(xmm2, xmm2);
-  xmm4 = _mm_hadd_ps(xmm3, xmm3);
-
-  _mm_store_ss(&result, xmm4);
-    
-  
-
-  for(i = 0; i < leftovers; ++i) {
-    fst = src0[i];
-    fst = MAX(fst, *cutoff);
-    sq = fst * fst;
-    thrd = fst * sq;
-    frth = sq * sq;
-    //fith = sq * thrd;
-    
-    result += (center_point_array[0] * fst + 
-	       center_point_array[1] * sq + 
-	       center_point_array[2] * thrd + 
-	       center_point_array[3] * frth);// + 
-	       //center_point_array[4] * fith);
-  }
-
-  result += ((float)((bound * 4) + leftovers)) * center_point_array[4]; //center_point_array[5];
-
-  target[0] = result;
-}
- 
-
-#endif /*LV_HAVE_SSE3*/
-
-#if LV_HAVE_GENERIC
-
-static inline void volk_32f_32f_32f_sum_of_poly_32f_a16_generic(float* target, float* src0, float* center_point_array, float* cutoff, unsigned int num_bytes) {
-
-
-    
-  float result = 0.0;
-  float fst = 0.0;
-  float sq = 0.0;
-  float thrd = 0.0;
-  float frth = 0.0;
-  //float fith = 0.0;
-  
-
-
-  int i = 0; 
-  
-  for(; i < num_bytes >> 2; ++i) {
-    fst = src0[i];
-    fst = MAX(fst, *cutoff);
-    
-    sq = fst * fst;
-    thrd = fst * sq;
-    frth = sq * sq;
-    //fith = sq * thrd;
-    
-    result += (center_point_array[0] * fst + 
-	       center_point_array[1] * sq + 
-	       center_point_array[2] * thrd + 
-	       center_point_array[3] * frth); //+
-	       //center_point_array[4] * fith);
-    /*printf("%f12...%d\n", (center_point_array[0] * fst + 
-		  center_point_array[1] * sq + 
-		  center_point_array[2] * thrd + 
-			 center_point_array[3] * frth) +
-	   //center_point_array[4] * fith) + 
-	   (center_point_array[4]), i);
-    */
-  }
-
-  result += ((float)(num_bytes >> 2)) * (center_point_array[4]);//(center_point_array[5]);
-
-  
-  
-  *target = result;
-}
-
-#endif /*LV_HAVE_GENERIC*/
-
-
-#endif /*INCLUDED_volk_32f_32f_32f_sum_of_poly_32f_a16_H*/
diff --git a/volk/include/volk/volk_32f_32f_add_32f_a16.h b/volk/include/volk/volk_32f_32f_add_32f_a16.h
deleted file mode 100644
index ba38c310f..000000000
--- a/volk/include/volk/volk_32f_32f_add_32f_a16.h
+++ /dev/null
@@ -1,81 +0,0 @@
-#ifndef INCLUDED_volk_32f_32f_add_32f_a16_H
-#define INCLUDED_volk_32f_32f_add_32f_a16_H
-
-#include <inttypes.h>
-#include <stdio.h>
-
-#if LV_HAVE_SSE
-#include <xmmintrin.h>
-/*!
-  \brief Adds the two input vectors and store their results in the third vector
-  \param cVector The vector where the results will be stored
-  \param aVector One of the vectors to be added
-  \param bVector One of the vectors to be added
-  \param num_points The number of values in aVector and bVector to be added together and stored into cVector
-*/
-static inline void volk_32f_32f_add_32f_a16_sse(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){
-    unsigned int number = 0;
-    const unsigned int quarterPoints = num_points / 4;
-
-    float* cPtr = cVector;
-    const float* aPtr = aVector;
-    const float* bPtr=  bVector;
-
-    __m128 aVal, bVal, cVal;
-    for(;number < quarterPoints; number++){
-      
-      aVal = _mm_load_ps(aPtr); 
-      bVal = _mm_load_ps(bPtr);
-      
-      cVal = _mm_add_ps(aVal, bVal); 
-      
-      _mm_store_ps(cPtr,cVal); // Store the results back into the C container
-
-      aPtr += 4;
-      bPtr += 4;
-      cPtr += 4;
-    }
-
-    number = quarterPoints * 4;
-    for(;number < num_points; number++){
-      *cPtr++ = (*aPtr++) + (*bPtr++);
-    }
-}
-#endif /* LV_HAVE_SSE */
-
-#if LV_HAVE_GENERIC
-/*!
-  \brief Adds the two input vectors and store their results in the third vector
-  \param cVector The vector where the results will be stored
-  \param aVector One of the vectors to be added
-  \param bVector One of the vectors to be added
-  \param num_points The number of values in aVector and bVector to be added together and stored into cVector
-*/
-static inline void volk_32f_32f_add_32f_a16_generic(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){
-    float* cPtr = cVector;
-    const float* aPtr = aVector;
-    const float* bPtr=  bVector;
-    unsigned int number = 0;
-
-    for(number = 0; number < num_points; number++){
-      *cPtr++ = (*aPtr++) + (*bPtr++);
-    }
-}
-#endif /* LV_HAVE_GENERIC */
-
-#if LV_HAVE_ORC
-/*!
-  \brief Adds the two input vectors and store their results in the third vector
-  \param cVector The vector where the results will be stored
-  \param aVector One of the vectors to be added
-  \param bVector One of the vectors to be added
-  \param num_points The number of values in aVector and bVector to be added together and stored into cVector
-*/
-extern void volk_32f_32f_add_32f_a16_orc_impl(float* cVector, const float* aVector, const float* bVector, unsigned int num_points);
-static inline void volk_32f_32f_add_32f_a16_orc(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){
-    volk_32f_32f_add_32f_a16_orc_impl(cVector, aVector, bVector, num_points);
-}
-#endif /* LV_HAVE_ORC */
-
-
-#endif /* INCLUDED_volk_32f_32f_add_32f_a16_H */
diff --git a/volk/include/volk/volk_32f_32f_divide_32f_a16.h b/volk/include/volk/volk_32f_32f_divide_32f_a16.h
deleted file mode 100644
index a0995e631..000000000
--- a/volk/include/volk/volk_32f_32f_divide_32f_a16.h
+++ /dev/null
@@ -1,82 +0,0 @@
-#ifndef INCLUDED_volk_32f_32f_divide_32f_a16_H
-#define INCLUDED_volk_32f_32f_divide_32f_a16_H
-
-#include <inttypes.h>
-#include <stdio.h>
-
-#if LV_HAVE_SSE
-#include <xmmintrin.h>
-/*!
-  \brief Divides aVector by bVector element by element and stores the quotients in cVector (SSE, four floats per iteration)
-  \param cVector The vector where the results will be stored; written with the aligned store _mm_store_ps
-  \param aVector The vector to be divided (dividend); read with the aligned load _mm_load_ps
-  \param bVector The divisor vector; read with the aligned load _mm_load_ps
-  \param num_points The number of values in aVector and bVector to be divided and stored into cVector
-*/
-static inline void volk_32f_32f_divide_32f_a16_sse(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){
-    unsigned int number = 0;
-    const unsigned int quarterPoints = num_points / 4; // number of complete 4-float groups
-
-    float* cPtr = cVector;
-    const float* aPtr = aVector;
-    const float* bPtr=  bVector;
-
-    __m128 aVal, bVal, cVal;
-    for(;number < quarterPoints; number++){
-      
-      aVal = _mm_load_ps(aPtr); 
-      bVal = _mm_load_ps(bPtr);
-      
-      cVal = _mm_div_ps(aVal, bVal); // lane-wise aVal / bVal
-      
-      _mm_store_ps(cPtr,cVal); // Store the results back into the C container
-
-      aPtr += 4;
-      bPtr += 4;
-      cPtr += 4;
-    }
-
-    number = quarterPoints * 4; // scalar tail: the remaining num_points % 4 values
-    for(;number < num_points; number++){
-      *cPtr++ = (*aPtr++) / (*bPtr++);
-    }
-}
-#endif /* LV_HAVE_SSE */
-
-#if LV_HAVE_GENERIC
-/*!
-  \brief Divides aVector by bVector element by element and stores the quotients in cVector (plain scalar loop)
-  \param cVector The vector where the results will be stored: cVector[i] = aVector[i] / bVector[i]
-  \param aVector The vector to be divided (dividend)
-  \param bVector The divisor vector
-  \param num_points The number of values in aVector and bVector to be divided and stored into cVector
-*/
-static inline void volk_32f_32f_divide_32f_a16_generic(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){
-    float* cPtr = cVector;
-    const float* aPtr = aVector;
-    const float* bPtr=  bVector;
-    unsigned int number = 0;
-    /* One quotient per iteration; no check is made for zero divisors. */
-    for(number = 0; number < num_points; number++){
-      *cPtr++ = (*aPtr++) / (*bPtr++);
-    }
-}
-#endif /* LV_HAVE_GENERIC */
-
-#if LV_HAVE_ORC
-/*!
-  \brief ORC entry point for the vector divide: forwards all four arguments unchanged to volk_32f_32f_divide_32f_a16_orc_impl
-  \param cVector The vector where the results will be stored
-  \param aVector The vector to be divided (dividend)
-  \param bVector The divisor vector
-  \param num_points The number of values in aVector and bVector to be divided and stored into cVector
-*/
-extern void volk_32f_32f_divide_32f_a16_orc_impl(float* cVector, const float* aVector, const float* bVector, unsigned int num_points); /* declaration only; not defined in this header */
-static inline void volk_32f_32f_divide_32f_a16_orc(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){
-    volk_32f_32f_divide_32f_a16_orc_impl(cVector, aVector, bVector, num_points);
-}
-#endif /* LV_HAVE_ORC */
-
-
-
-#endif /* INCLUDED_volk_32f_32f_divide_32f_a16_H */
diff --git a/volk/include/volk/volk_32f_32f_dot_prod_32f_a16.h b/volk/include/volk/volk_32f_32f_dot_prod_32f_a16.h
deleted file mode 100644
index 63f5221d3..000000000
--- a/volk/include/volk/volk_32f_32f_dot_prod_32f_a16.h
+++ /dev/null
@@ -1,184 +0,0 @@
-#ifndef INCLUDED_volk_32f_32f_dot_prod_32f_a16_H
-#define INCLUDED_volk_32f_32f_dot_prod_32f_a16_H
-
-#include<stdio.h>
-
-
-#if LV_HAVE_GENERIC
-
-/*! \brief Scalar dot product: writes sum(input[i] * taps[i]) for i in [0, num_points) to *result. */
-static inline void volk_32f_32f_dot_prod_32f_a16_generic(float * result, const float * input, const float * taps, unsigned int num_points) {
-  /* result: single float output; input, taps: read-only vectors of num_points floats each. */
-  float dotProduct = 0;
-  const float* aPtr = input;
-  const float* bPtr=  taps;
-  unsigned int number = 0;
-
-  for(number = 0; number < num_points; number++){
-    dotProduct += ((*aPtr++) * (*bPtr++)); // accumulate in single precision, in index order
-  }
-  
-  *result = dotProduct; // written even when num_points is 0 (result is then 0)
-}
-
-#endif /*LV_HAVE_GENERIC*/
-
-
-#if LV_HAVE_SSE
-/*! \brief SSE dot product of input and taps over num_points floats, written to *result; inputs are read with the aligned load _mm_load_ps. */
-/* NOTE(review): no <xmmintrin.h> include precedes this block in this header, so the SSE types and intrinsics must come from the includer - confirm. */
-static inline void volk_32f_32f_dot_prod_32f_a16_sse( float* result, const  float* input, const  float* taps, unsigned int num_points) {
-  
-  unsigned int number = 0;
-  const unsigned int quarterPoints = num_points / 4; // number of complete 4-float groups
-
-  float dotProduct = 0;
-  const float* aPtr = input;
-  const float* bPtr = taps;
-
-  __m128 aVal, bVal, cVal;
-
-  __m128 dotProdVal = _mm_setzero_ps(); // four running partial sums, one per lane
-
-  for(;number < quarterPoints; number++){
-      
-    aVal = _mm_load_ps(aPtr); 
-    bVal = _mm_load_ps(bPtr);
-      
-    cVal = _mm_mul_ps(aVal, bVal); 
-
-    dotProdVal = _mm_add_ps(cVal, dotProdVal); // lane k accumulates the products at indices 4*n + k
-
-    aPtr += 4;
-    bPtr += 4;
-  }
-
-  float dotProductVector[4] __attribute__((aligned(16))); // aligned so that _mm_store_ps may be used
-
-  _mm_store_ps(dotProductVector,dotProdVal); // Store the results back into the dot product vector
-  /* Horizontal sum of the four lane totals. */
-  dotProduct = dotProductVector[0];
-  dotProduct += dotProductVector[1];
-  dotProduct += dotProductVector[2];
-  dotProduct += dotProductVector[3];
-
-  number = quarterPoints * 4; // scalar tail: the remaining num_points % 4 products
-  for(;number < num_points; number++){
-    dotProduct += ((*aPtr++) * (*bPtr++));
-  }
-
-  *result = dotProduct;
-  
-}
-
-#endif /*LV_HAVE_SSE*/  
-
-#if LV_HAVE_SSE3
-
-#include <pmmintrin.h>
-/*! \brief SSE3 dot product of input and taps over num_points floats, written to *result; inputs are read with the aligned load _mm_load_ps. */
-static inline void volk_32f_32f_dot_prod_32f_a16_sse3(float * result, const float * input, const float * taps, unsigned int num_points) {
-  unsigned int number = 0;
-  const unsigned int quarterPoints = num_points / 4; // number of complete 4-float groups
-
-  float dotProduct = 0;
-  const float* aPtr = input;
-  const float* bPtr = taps;
-
-  __m128 aVal, bVal, cVal;
-
-  __m128 dotProdVal = _mm_setzero_ps();
-
-  for(;number < quarterPoints; number++){
-      
-    aVal = _mm_load_ps(aPtr); 
-    bVal = _mm_load_ps(bPtr);
-      
-    cVal = _mm_mul_ps(aVal, bVal); 
-    /* hadd(d, c) = [d0+d1, d2+d3, c0+c1, c2+c3]: the lanes of the result always add up to (sum of d) + (sum of c). */
-    dotProdVal = _mm_hadd_ps(dotProdVal, cVal);
-
-    aPtr += 4;
-    bPtr += 4;
-  }
-
-  float dotProductVector[4] __attribute__((aligned(16)));
-  dotProdVal = _mm_hadd_ps(dotProdVal, dotProdVal); // [d0+d1, d2+d3, d0+d1, d2+d3]
-
-  _mm_store_ps(dotProductVector,dotProdVal); // Store the results back into the dot product vector
-  /* After the final hadd, lanes 0 and 1 together hold the complete total, so only those two are added. */
-  dotProduct = dotProductVector[0];
-  dotProduct += dotProductVector[1];
-
-  number = quarterPoints * 4; // scalar tail: the remaining num_points % 4 products
-  for(;number < num_points; number++){
-    dotProduct += ((*aPtr++) * (*bPtr++));
-  }
-
-  *result = dotProduct;
-}  
-
-#endif /*LV_HAVE_SSE3*/
-
-#if LV_HAVE_SSE4_1
-
-#include <smmintrin.h>
-/*! \brief SSE4.1 dot product of input and taps over num_points floats, written to *result; consumes 16 floats per iteration with aligned loads. */
-static inline void volk_32f_32f_dot_prod_32f_a16_sse4_1(float * result, const float * input, const float* taps, unsigned int num_points) {
-  unsigned int number = 0;
-  const unsigned int sixteenthPoints = num_points / 16; // number of complete 16-float groups
-
-  float dotProduct = 0;
-  const float* aPtr = input;
-  const float* bPtr = taps;
-
-  __m128 aVal1, bVal1, cVal1;
-  __m128 aVal2, bVal2, cVal2;
-  __m128 aVal3, bVal3, cVal3;
-  __m128 aVal4, bVal4, cVal4;
-
-  __m128 dotProdVal = _mm_setzero_ps();
-
-  for(;number < sixteenthPoints; number++){      
-
-    aVal1 = _mm_load_ps(aPtr); aPtr += 4;
-    aVal2 = _mm_load_ps(aPtr); aPtr += 4;
-    aVal3 = _mm_load_ps(aPtr); aPtr += 4;
-    aVal4 = _mm_load_ps(aPtr); aPtr += 4;
-
-    bVal1 = _mm_load_ps(bPtr); bPtr += 4;
-    bVal2 = _mm_load_ps(bPtr); bPtr += 4;
-    bVal3 = _mm_load_ps(bPtr); bPtr += 4;
-    bVal4 = _mm_load_ps(bPtr); bPtr += 4;
-    /* Mask 0xF?: the high nibble multiplies all four lanes; the low nibble picks the one output lane, the rest are zeroed. */
-    cVal1 = _mm_dp_ps(aVal1, bVal1, 0xF1);
-    cVal2 = _mm_dp_ps(aVal2, bVal2, 0xF2);
-    cVal3 = _mm_dp_ps(aVal3, bVal3, 0xF4);
-    cVal4 = _mm_dp_ps(aVal4, bVal4, 0xF8);
-    /* Each cValN is non-zero in a different lane only, so OR-ing them packs the four 4-point sums into one vector. */
-    cVal1 = _mm_or_ps(cVal1, cVal2);
-    cVal3 = _mm_or_ps(cVal3, cVal4);
-    cVal1 = _mm_or_ps(cVal1, cVal3);
-
-    dotProdVal = _mm_add_ps(dotProdVal, cVal1);
-  }
-
-  float dotProductVector[4] __attribute__((aligned(16)));
-  _mm_store_ps(dotProductVector, dotProdVal); // Store the results back into the dot product vector
-  /* Horizontal sum of the four lane totals. */
-  dotProduct = dotProductVector[0];
-  dotProduct += dotProductVector[1];
-  dotProduct += dotProductVector[2];
-  dotProduct += dotProductVector[3];
-
-  number = sixteenthPoints * 16; // scalar tail: the remaining num_points % 16 products
-  for(;number < num_points; number++){
-    dotProduct += ((*aPtr++) * (*bPtr++));
-  }
-
-  *result = dotProduct;
-}  
-
-#endif /*LV_HAVE_SSE4_1*/
-
-#endif /*INCLUDED_volk_32f_32f_dot_prod_32f_a16_H*/
diff --git a/volk/include/volk/volk_32f_32f_dot_prod_32f_ua16.h b/volk/include/volk/volk_32f_32f_dot_prod_32f_ua16.h
deleted file mode 100644
index b5fa7d7a4..000000000
--- a/volk/include/volk/volk_32f_32f_dot_prod_32f_ua16.h
+++ /dev/null
@@ -1,184 +0,0 @@
-#ifndef INCLUDED_volk_32f_32f_dot_prod_32f_ua16_H
-#define INCLUDED_volk_32f_32f_dot_prod_32f_ua16_H
-
-#include<stdio.h>
-
-
-#if LV_HAVE_GENERIC
-
-/*! \brief Scalar dot product: writes sum(input[i] * taps[i]) for i in [0, num_points) to *result. */
-static inline void volk_32f_32f_dot_prod_32f_ua16_generic(float * result, const float * input, const float * taps, unsigned int num_points) {
-  /* result: single float output; input, taps: read-only vectors of num_points floats each. */
-  float dotProduct = 0;
-  const float* aPtr = input;
-  const float* bPtr=  taps;
-  unsigned int number = 0;
-
-  for(number = 0; number < num_points; number++){
-    dotProduct += ((*aPtr++) * (*bPtr++)); // accumulate in single precision, in index order
-  }
-  
-  *result = dotProduct; // written even when num_points is 0 (result is then 0)
-}
-
-#endif /*LV_HAVE_GENERIC*/
-
-
-#if LV_HAVE_SSE
-/*! \brief SSE dot product of input and taps over num_points floats, written to *result; inputs are read with the unaligned load _mm_loadu_ps. */
-/* NOTE(review): no <xmmintrin.h> include precedes this block in this header, so the SSE types and intrinsics must come from the includer - confirm. */
-static inline void volk_32f_32f_dot_prod_32f_ua16_sse( float* result, const  float* input, const  float* taps, unsigned int num_points) {
-  
-  unsigned int number = 0;
-  const unsigned int quarterPoints = num_points / 4; // number of complete 4-float groups
-
-  float dotProduct = 0;
-  const float* aPtr = input;
-  const float* bPtr = taps;
-
-  __m128 aVal, bVal, cVal;
-
-  __m128 dotProdVal = _mm_setzero_ps(); // four running partial sums, one per lane
-
-  for(;number < quarterPoints; number++){
-      
-    aVal = _mm_loadu_ps(aPtr); 
-    bVal = _mm_loadu_ps(bPtr);
-      
-    cVal = _mm_mul_ps(aVal, bVal); 
-
-    dotProdVal = _mm_add_ps(cVal, dotProdVal); // lane k accumulates the products at indices 4*n + k
-
-    aPtr += 4;
-    bPtr += 4;
-  }
-
-  float dotProductVector[4] __attribute__((aligned(16))); // local scratch is aligned, so the aligned store below is fine
-
-  _mm_store_ps(dotProductVector,dotProdVal); // Store the results back into the dot product vector
-  /* Horizontal sum of the four lane totals. */
-  dotProduct = dotProductVector[0];
-  dotProduct += dotProductVector[1];
-  dotProduct += dotProductVector[2];
-  dotProduct += dotProductVector[3];
-
-  number = quarterPoints * 4; // scalar tail: the remaining num_points % 4 products
-  for(;number < num_points; number++){
-    dotProduct += ((*aPtr++) * (*bPtr++));
-  }
-
-  *result = dotProduct;
-  
-}
-
-#endif /*LV_HAVE_SSE*/  
-
-#if LV_HAVE_SSE3
-
-#include <pmmintrin.h>
-/*! \brief SSE3 dot product of input and taps over num_points floats, written to *result; inputs are read with the unaligned load _mm_loadu_ps. */
-static inline void volk_32f_32f_dot_prod_32f_ua16_sse3(float * result, const float * input, const float * taps, unsigned int num_points) {
-  unsigned int number = 0;
-  const unsigned int quarterPoints = num_points / 4; // number of complete 4-float groups
-
-  float dotProduct = 0;
-  const float* aPtr = input;
-  const float* bPtr = taps;
-
-  __m128 aVal, bVal, cVal;
-
-  __m128 dotProdVal = _mm_setzero_ps();
-
-  for(;number < quarterPoints; number++){
-      
-    aVal = _mm_loadu_ps(aPtr); 
-    bVal = _mm_loadu_ps(bPtr);
-      
-    cVal = _mm_mul_ps(aVal, bVal); 
-    /* hadd(d, c) = [d0+d1, d2+d3, c0+c1, c2+c3]: the lanes of the result always add up to (sum of d) + (sum of c). */
-    dotProdVal = _mm_hadd_ps(dotProdVal, cVal);
-
-    aPtr += 4;
-    bPtr += 4;
-  }
-
-  float dotProductVector[4] __attribute__((aligned(16)));
-  dotProdVal = _mm_hadd_ps(dotProdVal, dotProdVal); // [d0+d1, d2+d3, d0+d1, d2+d3]
-
-  _mm_store_ps(dotProductVector,dotProdVal); // Store the results back into the dot product vector
-  /* After the final hadd, lanes 0 and 1 together hold the complete total, so only those two are added. */
-  dotProduct = dotProductVector[0];
-  dotProduct += dotProductVector[1];
-
-  number = quarterPoints * 4; // scalar tail: the remaining num_points % 4 products
-  for(;number < num_points; number++){
-    dotProduct += ((*aPtr++) * (*bPtr++));
-  }
-
-  *result = dotProduct;
-}  
-
-#endif /*LV_HAVE_SSE3*/
-
-#if LV_HAVE_SSE4_1
-
-#include <smmintrin.h>
-/*! \brief SSE4.1 dot product of input and taps over num_points floats, written to *result; consumes 16 floats per iteration with unaligned loads. */
-static inline void volk_32f_32f_dot_prod_32f_ua16_sse4_1(float * result, const float * input, const float* taps, unsigned int num_points) {
-  unsigned int number = 0;
-  const unsigned int sixteenthPoints = num_points / 16; // number of complete 16-float groups
-
-  float dotProduct = 0;
-  const float* aPtr = input;
-  const float* bPtr = taps;
-
-  __m128 aVal1, bVal1, cVal1;
-  __m128 aVal2, bVal2, cVal2;
-  __m128 aVal3, bVal3, cVal3;
-  __m128 aVal4, bVal4, cVal4;
-
-  __m128 dotProdVal = _mm_setzero_ps();
-
-  for(;number < sixteenthPoints; number++){
-      
-    aVal1 = _mm_loadu_ps(aPtr); aPtr += 4;
-    aVal2 = _mm_loadu_ps(aPtr); aPtr += 4;
-    aVal3 = _mm_loadu_ps(aPtr); aPtr += 4;
-    aVal4 = _mm_loadu_ps(aPtr); aPtr += 4;
-
-    bVal1 = _mm_loadu_ps(bPtr); bPtr += 4;
-    bVal2 = _mm_loadu_ps(bPtr); bPtr += 4;
-    bVal3 = _mm_loadu_ps(bPtr); bPtr += 4;
-    bVal4 = _mm_loadu_ps(bPtr); bPtr += 4;
-    /* Mask 0xF?: the high nibble multiplies all four lanes; the low nibble picks the one output lane, the rest are zeroed. */
-    cVal1 = _mm_dp_ps(aVal1, bVal1, 0xF1);
-    cVal2 = _mm_dp_ps(aVal2, bVal2, 0xF2);
-    cVal3 = _mm_dp_ps(aVal3, bVal3, 0xF4);
-    cVal4 = _mm_dp_ps(aVal4, bVal4, 0xF8);
-    /* Each cValN is non-zero in a different lane only, so OR-ing them packs the four 4-point sums into one vector. */
-    cVal1 = _mm_or_ps(cVal1, cVal2);
-    cVal3 = _mm_or_ps(cVal3, cVal4);
-    cVal1 = _mm_or_ps(cVal1, cVal3);
-
-    dotProdVal = _mm_add_ps(dotProdVal, cVal1);
-  }
-
-  float dotProductVector[4] __attribute__((aligned(16)));
-  _mm_store_ps(dotProductVector, dotProdVal); // Store the results back into the dot product vector
-  /* Horizontal sum of the four lane totals. */
-  dotProduct = dotProductVector[0];
-  dotProduct += dotProductVector[1];
-  dotProduct += dotProductVector[2];
-  dotProduct += dotProductVector[3];
-
-  number = sixteenthPoints * 16; // scalar tail: the remaining num_points % 16 products
-  for(;number < num_points; number++){
-    dotProduct += ((*aPtr++) * (*bPtr++));
-  }
-
-  *result = dotProduct;
-}  
-
-#endif /*LV_HAVE_SSE4_1*/
-
-#endif /*INCLUDED_volk_32f_32f_dot_prod_32f_ua16_H*/
diff --git a/volk/include/volk/volk_32f_32f_interleave_32fc_a16.h b/volk/include/volk/volk_32f_32f_interleave_32fc_a16.h
deleted file mode 100644
index 34ea93349..000000000
--- a/volk/include/volk/volk_32f_32f_interleave_32fc_a16.h
+++ /dev/null
@@ -1,75 +0,0 @@
-#ifndef INCLUDED_volk_32f_32f_interleave_32fc_a16_H
-#define INCLUDED_volk_32f_32f_interleave_32fc_a16_H
-
-#include <inttypes.h>
-#include <stdio.h>
-
-#if LV_HAVE_SSE
-#include <xmmintrin.h>
-/*!
-  \brief Interleaves the I & Q vector data into the complex vector (SSE, four I/Q pairs per iteration, aligned loads and stores)
-  \param complexVector The complex output vector; written through a float* as I, Q, I, Q, ... (two floats per point)
-  \param iBuffer The I buffer data to be interleaved
-  \param qBuffer The Q buffer data to be interleaved
-  \param num_points The number of complex data values to be interleaved
-*/
-static inline void volk_32f_32f_interleave_32fc_a16_sse(lv_32fc_t* complexVector, const float* iBuffer, const float* qBuffer, unsigned int num_points){
-  unsigned int number = 0;
-  float* complexVectorPtr = (float*)complexVector;
-  const float* iBufferPtr = iBuffer;
-  const float* qBufferPtr = qBuffer;
-
-  const uint64_t quarterPoints = num_points / 4; // NOTE(review): 64-bit here, but compared against the unsigned int counter
-    
-  __m128 iValue, qValue, cplxValue;
-  for(;number < quarterPoints; number++){
-    iValue = _mm_load_ps(iBufferPtr);
-    qValue = _mm_load_ps(qBufferPtr);
-
-    // Interleaves the lower two values in the i and q variables into one buffer
-    cplxValue = _mm_unpacklo_ps(iValue, qValue); // i0, q0, i1, q1
-    _mm_store_ps(complexVectorPtr, cplxValue);
-    complexVectorPtr += 4;
-
-    // Interleaves the upper two values in the i and q variables into one buffer
-    cplxValue = _mm_unpackhi_ps(iValue, qValue); // i2, q2, i3, q3
-    _mm_store_ps(complexVectorPtr, cplxValue);
-    complexVectorPtr += 4;
-
-    iBufferPtr += 4;
-    qBufferPtr += 4;
-  }
-
-  number = quarterPoints * 4; // scalar tail: the remaining num_points % 4 pairs
-  for(; number < num_points; number++){
-    *complexVectorPtr++ = *iBufferPtr++;
-    *complexVectorPtr++ = *qBufferPtr++;
-  }
-}
-#endif /* LV_HAVE_SSE */
-
-#if LV_HAVE_GENERIC
-/*!
-  \brief Interleaves the I & Q vector data into the complex vector (plain scalar loop).
-  \param complexVector The complex output vector; written through a float* as I, Q, I, Q, ... (two floats per point)
-  \param iBuffer The I buffer data to be interleaved
-  \param qBuffer The Q buffer data to be interleaved
-  \param num_points The number of complex data values to be interleaved
-*/
-static inline void volk_32f_32f_interleave_32fc_a16_generic(lv_32fc_t* complexVector, const float* iBuffer, const float* qBuffer, unsigned int num_points){
-  float* complexVectorPtr = (float*)complexVector;
-  const float* iBufferPtr = iBuffer;
-  const float* qBufferPtr = qBuffer;
-  unsigned int number;
-  /* Each iteration emits one I value followed by the matching Q value. */
-  for(number = 0; number < num_points; number++){
-    *complexVectorPtr++ = *iBufferPtr++;
-    *complexVectorPtr++ = *qBufferPtr++;
-  }
-}
-#endif /* LV_HAVE_GENERIC */
-
-
-
-
-#endif /* INCLUDED_volk_32f_32f_interleave_32fc_a16_H */
diff --git a/volk/include/volk/volk_32f_32f_max_32f_a16.h b/volk/include/volk/volk_32f_32f_max_32f_a16.h
deleted file mode 100644
index 8ca7a5ba8..000000000
--- a/volk/include/volk/volk_32f_32f_max_32f_a16.h
+++ /dev/null
@@ -1,85 +0,0 @@
-#ifndef INCLUDED_volk_32f_32f_max_32f_a16_H
-#define INCLUDED_volk_32f_32f_max_32f_a16_H
-
-#include <inttypes.h>
-#include <stdio.h>
-
-#if LV_HAVE_SSE
-#include <xmmintrin.h>
-/*!
-  \brief Selects the maximum of each pair of entries from aVector and bVector and stores it in cVector (SSE, four floats per iteration)
-  \param cVector The vector where the results will be stored; written with the aligned store _mm_store_ps
-  \param aVector The vector to be checked; read with the aligned load _mm_load_ps
-  \param bVector The vector to be checked; read with the aligned load _mm_load_ps
-  \param num_points The number of values in aVector and bVector to be checked and stored into cVector
-*/
-static inline void volk_32f_32f_max_32f_a16_sse(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){
-    unsigned int number = 0;
-    const unsigned int quarterPoints = num_points / 4; // number of complete 4-float groups
-
-    float* cPtr = cVector;
-    const float* aPtr = aVector;
-    const float* bPtr=  bVector;
-
-    __m128 aVal, bVal, cVal;
-    for(;number < quarterPoints; number++){
-      
-      aVal = _mm_load_ps(aPtr); 
-      bVal = _mm_load_ps(bPtr);
-      
-      cVal = _mm_max_ps(aVal, bVal); // lane-wise maximum
-      
-      _mm_store_ps(cPtr,cVal); // Store the results back into the C container
-
-      aPtr += 4;
-      bPtr += 4;
-      cPtr += 4;
-    }
-
-    number = quarterPoints * 4; // scalar tail: the remaining num_points % 4 values
-    for(;number < num_points; number++){
-      const float a = *aPtr++;
-      const float b = *bPtr++;
-      *cPtr++ = ( a > b ? a : b); // b is chosen whenever a > b is false
-    }
-}
-#endif /* LV_HAVE_SSE */
-
-#if LV_HAVE_GENERIC
-/*!
-  \brief Selects the maximum of each pair of entries from aVector and bVector and stores it in cVector (plain scalar loop)
-  \param cVector The vector where the results will be stored
-  \param aVector The vector to be checked
-  \param bVector The vector to be checked
-  \param num_points The number of values in aVector and bVector to be checked and stored into cVector
-*/
-static inline void volk_32f_32f_max_32f_a16_generic(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){
-    float* cPtr = cVector;
-    const float* aPtr = aVector;
-    const float* bPtr=  bVector;
-    unsigned int number = 0;
-
-    for(number = 0; number < num_points; number++){
-      const float a = *aPtr++;
-      const float b = *bPtr++;
-      *cPtr++ = ( a > b ? a : b); // b is chosen whenever a > b is false
-    }
-}
-#endif /* LV_HAVE_GENERIC */
-
-#if LV_HAVE_ORC
-/*!
-  \brief ORC entry point for the element-wise maximum: forwards all four arguments unchanged to volk_32f_32f_max_32f_a16_orc_impl
-  \param cVector The vector where the results will be stored
-  \param aVector The vector to be checked
-  \param bVector The vector to be checked
-  \param num_points The number of values in aVector and bVector to be checked and stored into cVector
-*/
-extern void volk_32f_32f_max_32f_a16_orc_impl(float* cVector, const float* aVector, const float* bVector, unsigned int num_points); /* declaration only; not defined in this header */
-static inline void volk_32f_32f_max_32f_a16_orc(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){
-    volk_32f_32f_max_32f_a16_orc_impl(cVector, aVector, bVector, num_points);
-}
-#endif /* LV_HAVE_ORC */
-
-
-#endif /* INCLUDED_volk_32f_32f_max_32f_a16_H */
diff --git a/volk/include/volk/volk_32f_32f_min_32f_a16.h b/volk/include/volk/volk_32f_32f_min_32f_a16.h
deleted file mode 100644
index dd05988be..000000000
--- a/volk/include/volk/volk_32f_32f_min_32f_a16.h
+++ /dev/null
@@ -1,85 +0,0 @@
-#ifndef INCLUDED_volk_32f_32f_min_32f_a16_H
-#define INCLUDED_volk_32f_32f_min_32f_a16_H
-
-#include <inttypes.h>
-#include <stdio.h>
-
-#if LV_HAVE_SSE
-#include <xmmintrin.h>
-/*!
-  \brief Selects the minimum of each pair of entries from aVector and bVector and stores it in cVector (SSE, four floats per iteration)
-  \param cVector The vector where the results will be stored; written with the aligned store _mm_store_ps
-  \param aVector The vector to be checked; read with the aligned load _mm_load_ps
-  \param bVector The vector to be checked; read with the aligned load _mm_load_ps
-  \param num_points The number of values in aVector and bVector to be checked and stored into cVector
-*/
-static inline void volk_32f_32f_min_32f_a16_sse(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){
-    unsigned int number = 0;
-    const unsigned int quarterPoints = num_points / 4; // number of complete 4-float groups
-
-    float* cPtr = cVector;
-    const float* aPtr = aVector;
-    const float* bPtr=  bVector;
-
-    __m128 aVal, bVal, cVal;
-    for(;number < quarterPoints; number++){
-      
-      aVal = _mm_load_ps(aPtr); 
-      bVal = _mm_load_ps(bPtr);
-      
-      cVal = _mm_min_ps(aVal, bVal); // lane-wise minimum
-      
-      _mm_store_ps(cPtr,cVal); // Store the results back into the C container
-
-      aPtr += 4;
-      bPtr += 4;
-      cPtr += 4;
-    }
-
-    number = quarterPoints * 4; // scalar tail: the remaining num_points % 4 values
-    for(;number < num_points; number++){
-      const float a = *aPtr++;
-      const float b = *bPtr++;
-      *cPtr++ = ( a < b ? a : b); // b is chosen whenever a < b is false
-    }
-}
-#endif /* LV_HAVE_SSE */
-
-#if LV_HAVE_GENERIC
-/*!
-  \brief Selects the minimum of each pair of entries from aVector and bVector and stores it in cVector (plain scalar loop)
-  \param cVector The vector where the results will be stored
-  \param aVector The vector to be checked
-  \param bVector The vector to be checked
-  \param num_points The number of values in aVector and bVector to be checked and stored into cVector
-*/
-static inline void volk_32f_32f_min_32f_a16_generic(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){
-    float* cPtr = cVector;
-    const float* aPtr = aVector;
-    const float* bPtr=  bVector;
-    unsigned int number = 0;
-
-    for(number = 0; number < num_points; number++){
-      const float a = *aPtr++;
-      const float b = *bPtr++;
-      *cPtr++ = ( a < b ? a : b); // b is chosen whenever a < b is false
-    }
-}
-#endif /* LV_HAVE_GENERIC */
-
-#if LV_HAVE_ORC
-/*!
-  \brief ORC entry point for the element-wise minimum: forwards all four arguments unchanged to volk_32f_32f_min_32f_a16_orc_impl
-  \param cVector The vector where the results will be stored
-  \param aVector The vector to be checked
-  \param bVector The vector to be checked
-  \param num_points The number of values in aVector and bVector to be checked and stored into cVector
-*/
-extern void volk_32f_32f_min_32f_a16_orc_impl(float* cVector, const float* aVector, const float* bVector, unsigned int num_points); /* declaration only; not defined in this header */
-static inline void volk_32f_32f_min_32f_a16_orc(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){
-    volk_32f_32f_min_32f_a16_orc_impl(cVector, aVector, bVector, num_points);
-}
-#endif /* LV_HAVE_ORC */
-
-
-#endif /* INCLUDED_volk_32f_32f_min_32f_a16_H */
diff --git a/volk/include/volk/volk_32f_32f_multiply_32f_a16.h b/volk/include/volk/volk_32f_32f_multiply_32f_a16.h
deleted file mode 100644
index 2d004db10..000000000
--- a/volk/include/volk/volk_32f_32f_multiply_32f_a16.h
+++ /dev/null
@@ -1,81 +0,0 @@
-#ifndef INCLUDED_volk_32f_32f_multiply_32f_a16_H
-#define INCLUDED_volk_32f_32f_multiply_32f_a16_H
-
-#include <inttypes.h>
-#include <stdio.h>
-
-#if LV_HAVE_SSE
-#include <xmmintrin.h>
-/*!
-  \brief Multiplies the two input vectors element by element and stores the products in the third vector (SSE, four floats per iteration)
-  \param cVector The vector where the results will be stored; written with the aligned store _mm_store_ps
-  \param aVector One of the vectors to be multiplied; read with the aligned load _mm_load_ps
-  \param bVector One of the vectors to be multiplied; read with the aligned load _mm_load_ps
-  \param num_points The number of values in aVector and bVector to be multiplied together and stored into cVector
-*/
-static inline void volk_32f_32f_multiply_32f_a16_sse(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){
-    unsigned int number = 0;
-    const unsigned int quarterPoints = num_points / 4; // number of complete 4-float groups
-
-    float* cPtr = cVector;
-    const float* aPtr = aVector;
-    const float* bPtr=  bVector;
-
-    __m128 aVal, bVal, cVal;
-    for(;number < quarterPoints; number++){
-      
-      aVal = _mm_load_ps(aPtr); 
-      bVal = _mm_load_ps(bPtr);
-      
-      cVal = _mm_mul_ps(aVal, bVal); // lane-wise aVal * bVal
-      
-      _mm_store_ps(cPtr,cVal); // Store the results back into the C container
-
-      aPtr += 4;
-      bPtr += 4;
-      cPtr += 4;
-    }
-
-    number = quarterPoints * 4; // scalar tail: the remaining num_points % 4 values
-    for(;number < num_points; number++){
-      *cPtr++ = (*aPtr++) * (*bPtr++);
-    }
-}
-#endif /* LV_HAVE_SSE */
-
-#if LV_HAVE_GENERIC
-/*!
-  \brief Multiplies the two input vectors element by element and stores the products in the third vector (plain scalar loop)
-  \param cVector The vector where the results will be stored: cVector[i] = aVector[i] * bVector[i]
-  \param aVector One of the vectors to be multiplied
-  \param bVector One of the vectors to be multiplied
-  \param num_points The number of values in aVector and bVector to be multiplied together and stored into cVector
-*/
-static inline void volk_32f_32f_multiply_32f_a16_generic(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){
-    float* cPtr = cVector;
-    const float* aPtr = aVector;
-    const float* bPtr=  bVector;
-    unsigned int number = 0;
-    /* One product per iteration; the three cursors advance in lock step. */
-    for(number = 0; number < num_points; number++){
-      *cPtr++ = (*aPtr++) * (*bPtr++);
-    }
-}
-#endif /* LV_HAVE_GENERIC */
-
-#if LV_HAVE_ORC
-/*!
-  \brief ORC entry point for the vector multiply: forwards all four arguments unchanged to volk_32f_32f_multiply_32f_a16_orc_impl
-  \param cVector The vector where the results will be stored
-  \param aVector One of the vectors to be multiplied
-  \param bVector One of the vectors to be multiplied
-  \param num_points The number of values in aVector and bVector to be multiplied together and stored into cVector
-*/
-extern void volk_32f_32f_multiply_32f_a16_orc_impl(float* cVector, const float* aVector, const float* bVector, unsigned int num_points); /* declaration only; not defined in this header */
-static inline void volk_32f_32f_multiply_32f_a16_orc(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){
-    volk_32f_32f_multiply_32f_a16_orc_impl(cVector, aVector, bVector, num_points);
-}
-#endif /* LV_HAVE_ORC */
-
-
-#endif /* INCLUDED_volk_32f_32f_multiply_32f_a16_H */
diff --git a/volk/include/volk/volk_32f_32f_s32f_interleave_16sc_a16.h b/volk/include/volk/volk_32f_32f_s32f_interleave_16sc_a16.h
deleted file mode 100644
index 207382a19..000000000
--- a/volk/include/volk/volk_32f_32f_s32f_interleave_16sc_a16.h
+++ /dev/null
@@ -1,155 +0,0 @@
-#ifndef INCLUDED_volk_32f_32f_s32f_interleave_16sc_a16_H
-#define INCLUDED_volk_32f_32f_s32f_interleave_16sc_a16_H
-
-#include <inttypes.h>
-#include <stdio.h>
-
-#if LV_HAVE_SSE2
-#include <emmintrin.h>
-  /*!
-    \brief Interleaves the I & Q vector data into the complex vector, scales the output values by the scalar, and converts to 16 bit data (SSE2, four I/Q pairs per iteration).
-    \param complexVector The complex output vector; written through an int16_t* as I, Q, I, Q, ... with an aligned 128-bit store
-    \param iBuffer The I buffer data to be interleaved; read with the aligned load _mm_load_ps
-    \param qBuffer The Q buffer data to be interleaved; read with the aligned load _mm_load_ps
-    \param scalar The scaling value being multiplied against each data point
-    \param num_points The number of complex data values to be interleaved
-  */
-static inline void volk_32f_32f_s32f_interleave_16sc_a16_sse2(lv_16sc_t* complexVector, const float* iBuffer, const float* qBuffer, const float scalar, unsigned int num_points){
-    unsigned int number = 0;
-    const float* iBufferPtr = iBuffer;
-    const float* qBufferPtr = qBuffer;
-
-    __m128 vScalar = _mm_set_ps1(scalar); // scalar broadcast to all four lanes
-
-    const unsigned int quarterPoints = num_points / 4; // number of complete 4-pair groups
-    
-    __m128 iValue, qValue, cplxValue1, cplxValue2;
-    __m128i intValue1, intValue2;
-
-    int16_t* complexVectorPtr = (int16_t*)complexVector;
-
-    for(;number < quarterPoints; number++){
-      iValue = _mm_load_ps(iBufferPtr);
-      qValue = _mm_load_ps(qBufferPtr);
-
-      // Interleaves the lower two values in the i and q variables into one buffer
-      cplxValue1 = _mm_unpacklo_ps(iValue, qValue);
-      cplxValue1 = _mm_mul_ps(cplxValue1, vScalar);
-
-      // Interleaves the upper two values in the i and q variables into one buffer
-      cplxValue2 = _mm_unpackhi_ps(iValue, qValue);
-      cplxValue2 = _mm_mul_ps(cplxValue2, vScalar);
-      /* Float to int32 using the current SSE rounding mode (not truncation). */
-      intValue1 = _mm_cvtps_epi32(cplxValue1);
-      intValue2 = _mm_cvtps_epi32(cplxValue2);
-      /* Narrow the eight int32 values to int16 with signed saturation. */
-      intValue1 = _mm_packs_epi32(intValue1, intValue2);
-
-      _mm_store_si128((__m128i*)complexVectorPtr, intValue1);
-      complexVectorPtr += 8;
-
-      iBufferPtr += 4;
-      qBufferPtr += 4;
-    }
-
-    number = quarterPoints * 4; // scalar tail: the remaining num_points % 4 pairs
-    complexVectorPtr = (int16_t*)(&complexVector[number]);
-    for(; number < num_points; number++){
-      *complexVectorPtr++ = (int16_t)(*iBufferPtr++ * scalar); // NOTE(review): the cast truncates and does not saturate, unlike the vector loop above
-      *complexVectorPtr++ = (int16_t)(*qBufferPtr++ * scalar);
-    }
-    
-}
-#endif /* LV_HAVE_SSE2 */
-
-#if LV_HAVE_SSE
-#include <xmmintrin.h>
-  /*!
-    \brief Interleaves the I & Q vector data into the complex vector, scales the output values by the scalar, and converts to 16 bit data (SSE multiply, scalar float to int16 casts).
-    \param complexVector The complex output vector; written through an int16_t* as I, Q, I, Q, ...
-    \param iBuffer The I buffer data to be interleaved; read with the aligned load _mm_load_ps
-    \param qBuffer The Q buffer data to be interleaved; read with the aligned load _mm_load_ps
-    \param scalar The scaling value being multiplied against each data point
-    \param num_points The number of complex data values to be interleaved
-  */
-static inline void volk_32f_32f_s32f_interleave_16sc_a16_sse(lv_16sc_t* complexVector, const float* iBuffer, const float* qBuffer, const float scalar, unsigned int num_points){
-    unsigned int number = 0;
-    const float* iBufferPtr = iBuffer;
-    const float* qBufferPtr = qBuffer;
-
-    __m128 vScalar = _mm_set_ps1(scalar); // scalar broadcast to all four lanes
-
-    const unsigned int quarterPoints = num_points / 4; // number of complete 4-pair groups
-    
-    __m128 iValue, qValue, cplxValue;
-
-    int16_t* complexVectorPtr = (int16_t*)complexVector;
-
-    float floatBuffer[4] __attribute__((aligned(128))); // scratch for the scaled lanes; NOTE(review): _mm_store_ps only needs 16-byte alignment
-
-    for(;number < quarterPoints; number++){
-      iValue = _mm_load_ps(iBufferPtr);
-      qValue = _mm_load_ps(qBufferPtr);
-
-      // Interleaves the lower two values in the i and q variables into one buffer
-      cplxValue = _mm_unpacklo_ps(iValue, qValue);
-      cplxValue = _mm_mul_ps(cplxValue, vScalar);
-
-      _mm_store_ps(floatBuffer, cplxValue);
-      /* Convert i0, q0, i1, q1 one at a time with a plain cast. */
-      *complexVectorPtr++ = (int16_t)(floatBuffer[0]);
-      *complexVectorPtr++ = (int16_t)(floatBuffer[1]);
-      *complexVectorPtr++ = (int16_t)(floatBuffer[2]);
-      *complexVectorPtr++ = (int16_t)(floatBuffer[3]);
-
-      // Interleaves the upper two values in the i and q variables into one buffer
-      cplxValue = _mm_unpackhi_ps(iValue, qValue);
-      cplxValue = _mm_mul_ps(cplxValue, vScalar);
- 
-      _mm_store_ps(floatBuffer, cplxValue);
-      /* Convert i2, q2, i3, q3 one at a time with a plain cast. */
-      *complexVectorPtr++ = (int16_t)(floatBuffer[0]);
-      *complexVectorPtr++ = (int16_t)(floatBuffer[1]);
-      *complexVectorPtr++ = (int16_t)(floatBuffer[2]);
-      *complexVectorPtr++ = (int16_t)(floatBuffer[3]);
-
-      iBufferPtr += 4;
-      qBufferPtr += 4;
-    }
-
-    number = quarterPoints * 4; // scalar tail: the remaining num_points % 4 pairs
-    complexVectorPtr = (int16_t*)(&complexVector[number]);
-    for(; number < num_points; number++){
-      *complexVectorPtr++ = (int16_t)(*iBufferPtr++ * scalar);
-      *complexVectorPtr++ = (int16_t)(*qBufferPtr++ * scalar);
-    }
-    
-}
-#endif /* LV_HAVE_SSE */
-
-#if LV_HAVE_GENERIC
-  /*!
-    \brief Interleaves the I & Q vector data into the complex vector, scales the output values by the scalar, and converts to 16 bit data (plain scalar loop).
-    \param complexVector The complex output vector; written through an int16_t* as I, Q, I, Q, ...
-    \param iBuffer The I buffer data to be interleaved
-    \param qBuffer The Q buffer data to be interleaved
-    \param scalar The scaling value being multiplied against each data point
-    \param num_points The number of complex data values to be interleaved
-  */
-static inline void volk_32f_32f_s32f_interleave_16sc_a16_generic(lv_16sc_t* complexVector, const float* iBuffer, const float* qBuffer, const float scalar, unsigned int num_points){
-  int16_t* complexVectorPtr = (int16_t*)complexVector;
-  const float* iBufferPtr = iBuffer;
-  const float* qBufferPtr = qBuffer;
-  unsigned int number = 0;
-  /* Scale in float, then convert each value with a plain (int16_t) cast. */
-  for(number = 0; number < num_points; number++){
-    *complexVectorPtr++ = (int16_t)(*iBufferPtr++ * scalar);
-    *complexVectorPtr++ = (int16_t)(*qBufferPtr++ * scalar);
-  }
-}
-#endif /* LV_HAVE_GENERIC */
-
-
-
-
-#endif /* INCLUDED_volk_32f_32f_s32f_interleave_16sc_a16_H */
diff --git a/volk/include/volk/volk_32f_32f_subtract_32f_a16.h b/volk/include/volk/volk_32f_32f_subtract_32f_a16.h
deleted file mode 100644
index 9fea6aa27..000000000
--- a/volk/include/volk/volk_32f_32f_subtract_32f_a16.h
+++ /dev/null
@@ -1,81 +0,0 @@
-#ifndef INCLUDED_volk_32f_32f_subtract_32f_a16_H
-#define INCLUDED_volk_32f_32f_subtract_32f_a16_H
-
-#include <inttypes.h>
-#include <stdio.h>
-
-#if LV_HAVE_SSE
-#include <xmmintrin.h>
-/*!
-  \brief Subtracts bVector form aVector and store their results in the cVector
-  \param cVector The vector where the results will be stored
-  \param aVector The initial vector
-  \param bVector The vector to be subtracted
-  \param num_points The number of values in aVector and bVector to be subtracted together and stored into cVector
-*/
-static inline void volk_32f_32f_subtract_32f_a16_sse(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){
-    unsigned int number = 0;
-    const unsigned int quarterPoints = num_points / 4;
-
-    float* cPtr = cVector;
-    const float* aPtr = aVector;
-    const float* bPtr=  bVector;
-
-    __m128 aVal, bVal, cVal;
-    for(;number < quarterPoints; number++){
-      
-      aVal = _mm_load_ps(aPtr); 
-      bVal = _mm_load_ps(bPtr);
-      
-      cVal = _mm_sub_ps(aVal, bVal); 
-      
-      _mm_store_ps(cPtr,cVal); // Store the results back into the C container
-
-      aPtr += 4;
-      bPtr += 4;
-      cPtr += 4;
-    }
-
-    number = quarterPoints * 4;
-    for(;number < num_points; number++){
-      *cPtr++ = (*aPtr++) - (*bPtr++);
-    }
-}
-#endif /* LV_HAVE_SSE */
-
-#if LV_HAVE_GENERIC
-/*!
-  \brief Subtracts bVector form aVector and store their results in the cVector
-  \param cVector The vector where the results will be stored
-  \param aVector The initial vector
-  \param bVector The vector to be subtracted
-  \param num_points The number of values in aVector and bVector to be subtracted together and stored into cVector
-*/
-static inline void volk_32f_32f_subtract_32f_a16_generic(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){
-    float* cPtr = cVector;
-    const float* aPtr = aVector;
-    const float* bPtr=  bVector;
-    unsigned int number = 0;
-
-    for(number = 0; number < num_points; number++){
-      *cPtr++ = (*aPtr++) - (*bPtr++);
-    }
-}
-#endif /* LV_HAVE_GENERIC */
-
-#if LV_HAVE_ORC
-/*!
-  \brief Subtracts bVector form aVector and store their results in the cVector
-  \param cVector The vector where the results will be stored
-  \param aVector The initial vector
-  \param bVector The vector to be subtracted
-  \param num_points The number of values in aVector and bVector to be subtracted together and stored into cVector
-*/
-extern void volk_32f_32f_subtract_32f_a16_orc_impl(float* cVector, const float* aVector, const float* bVector, unsigned int num_points);
-static inline void volk_32f_32f_subtract_32f_a16_orc(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){
-    volk_32f_32f_subtract_32f_a16_orc_impl(cVector, aVector, bVector, num_points);
-}
-#endif /* LV_HAVE_ORC */
-
-
-#endif /* INCLUDED_volk_32f_32f_subtract_32f_a16_H */
diff --git a/volk/include/volk/volk_32f_convert_64f_u.h b/volk/include/volk/volk_32f_convert_64f_u.h
new file mode 100644
index 000000000..a825767de
--- /dev/null
+++ b/volk/include/volk/volk_32f_convert_64f_u.h
@@ -0,0 +1,70 @@
+#ifndef INCLUDED_volk_32f_convert_64f_u_H
+#define INCLUDED_volk_32f_convert_64f_u_H
+
+#include <inttypes.h>
+#include <stdio.h>
+
+#if LV_HAVE_SSE2
+#include <emmintrin.h>
+  /*!
+    \brief Converts the float values into double values using SSE2; unaligned loads/stores are used, so neither buffer needs 16-byte alignment
+    \param outputVector The converted double vector values
+    \param inputVector The float vector values to be converted
+    \param num_points The number of points in the two vectors to be converted
+  */
+static inline void volk_32f_convert_64f_u_sse2(double* outputVector, const float* inputVector, unsigned int num_points){
+  unsigned int number = 0;
+
+  const unsigned int quarterPoints = num_points / 4; // number of complete 4-float groups
+    
+  const float* inputVectorPtr = (const float*)inputVector;
+  double* outputVectorPtr = outputVector;
+  __m128d ret;
+  __m128 inputVal;
+
+  for(;number < quarterPoints; number++){
+    inputVal = _mm_loadu_ps(inputVectorPtr); inputVectorPtr += 4; // load 4 floats
+ 
+    ret = _mm_cvtps_pd(inputVal); // widen the two low floats to doubles
+
+    _mm_storeu_pd(outputVectorPtr, ret);
+    outputVectorPtr += 2;
+
+    inputVal = _mm_movehl_ps(inputVal, inputVal); // bring the two high floats down into the low half
+
+    ret = _mm_cvtps_pd(inputVal); // widen what were the two high floats
+
+    _mm_storeu_pd(outputVectorPtr, ret);
+    outputVectorPtr += 2;
+  }
+
+  number = quarterPoints * 4;    // first index the vector loop did not cover
+  for(; number < num_points; number++){ // scalar tail for the remaining 0-3 points
+    outputVector[number] = (double)(inputVector[number]);
+  }
+}
+#endif /* LV_HAVE_SSE2 */
+
+
+#ifdef LV_HAVE_GENERIC
+/*!
+  \brief Converts the float values into double values one element at a time (plain C loop, no SIMD intrinsics)
+  \param outputVector The converted double vector values
+  \param inputVector The float vector values to be converted
+  \param num_points The number of points in the two vectors to be converted
+*/
+static inline void volk_32f_convert_64f_u_generic(double* outputVector, const float* inputVector, unsigned int num_points){
+  double* outputVectorPtr = outputVector;
+  const float* inputVectorPtr = inputVector;
+  unsigned int number = 0;
+
+  for(number = 0; number < num_points; number++){
+    *outputVectorPtr++ = ((double)(*inputVectorPtr++)); // widen one float, then advance both cursors
+  }
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+
+
+#endif /* INCLUDED_volk_32f_convert_64f_u_H */
diff --git a/volk/include/volk/volk_32f_convert_64f_ua16.h b/volk/include/volk/volk_32f_convert_64f_ua16.h
deleted file mode 100644
index c8de768dc..000000000
--- a/volk/include/volk/volk_32f_convert_64f_ua16.h
+++ /dev/null
@@ -1,70 +0,0 @@
-#ifndef INCLUDED_volk_32f_convert_64f_ua16_H
-#define INCLUDED_volk_32f_convert_64f_ua16_H
-
-#include <inttypes.h>
-#include <stdio.h>
-
-#if LV_HAVE_SSE2
-#include <emmintrin.h>
-  /*!
-    \brief Converts the float values into double values
-    \param dVector The converted double vector values
-    \param fVector The float vector values to be converted
-    \param num_points The number of points in the two vectors to be converted
-  */
-static inline void volk_32f_convert_64f_ua16_sse2(double* outputVector, const float* inputVector, unsigned int num_points){
-  unsigned int number = 0;
-
-  const unsigned int quarterPoints = num_points / 4;
-    
-  const float* inputVectorPtr = (const float*)inputVector;
-  double* outputVectorPtr = outputVector;
-  __m128d ret;
-  __m128 inputVal;
-
-  for(;number < quarterPoints; number++){
-    inputVal = _mm_loadu_ps(inputVectorPtr); inputVectorPtr += 4;
- 
-    ret = _mm_cvtps_pd(inputVal);
-
-    _mm_storeu_pd(outputVectorPtr, ret);
-    outputVectorPtr += 2;
-
-    inputVal = _mm_movehl_ps(inputVal, inputVal);
-
-    ret = _mm_cvtps_pd(inputVal);
-
-    _mm_storeu_pd(outputVectorPtr, ret);
-    outputVectorPtr += 2;
-  }
-
-  number = quarterPoints * 4;    
-  for(; number < num_points; number++){
-    outputVector[number] = (double)(inputVector[number]);
-  }
-}
-#endif /* LV_HAVE_SSE2 */
-
-
-#ifdef LV_HAVE_GENERIC
-/*!
-  \brief Converts the float values into double values
-  \param dVector The converted double vector values
-  \param fVector The float vector values to be converted
-  \param num_points The number of points in the two vectors to be converted
-*/
-static inline void volk_32f_convert_64f_ua16_generic(double* outputVector, const float* inputVector, unsigned int num_points){
-  double* outputVectorPtr = outputVector;
-  const float* inputVectorPtr = inputVector;
-  unsigned int number = 0;
-
-  for(number = 0; number < num_points; number++){
-    *outputVectorPtr++ = ((double)(*inputVectorPtr++));
-  }
-}
-#endif /* LV_HAVE_GENERIC */
-
-
-
-
-#endif /* INCLUDED_volk_32f_convert_64f_ua16_H */
diff --git a/volk/include/volk/volk_32f_s32f_convert_16i_a16.h b/volk/include/volk/volk_32f_s32f_convert_16i_a16.h
new file mode 100644
index 000000000..d6b16e336
--- /dev/null
+++ b/volk/include/volk/volk_32f_s32f_convert_16i_a16.h
@@ -0,0 +1,110 @@
+#ifndef INCLUDED_volk_32f_s32f_convert_16i_a16_H
+#define INCLUDED_volk_32f_s32f_convert_16i_a16_H
+
+#include <inttypes.h>
+#include <stdio.h>
+
+#if LV_HAVE_SSE2
+#include <emmintrin.h>
+  /*!
+    \brief Multiplies each point in the input buffer by the scalar value, then converts the result into a 16 bit integer value (SSE2; aligned loads/stores are used, so both buffers must be 16-byte aligned)
+    \param outputVector The 16 bit output data buffer
+    \param inputVector The floating point input data buffer
+    \param scalar The value multiplied against each point in the input buffer
+    \param num_points The number of data values to be converted
+  */
+static inline void volk_32f_s32f_convert_16i_a16_sse2(int16_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){
+  unsigned int number = 0;
+
+  const unsigned int eighthPoints = num_points / 8; // number of complete 8-float groups
+    
+  const float* inputVectorPtr = (const float*)inputVector;
+  int16_t* outputVectorPtr = outputVector;
+  __m128 vScalar = _mm_set_ps1(scalar); // scalar broadcast to all four lanes
+  __m128 inputVal1, inputVal2;
+  __m128i intInputVal1, intInputVal2;
+
+  for(;number < eighthPoints; number++){
+    inputVal1 = _mm_load_ps(inputVectorPtr); inputVectorPtr += 4;
+    inputVal2 = _mm_load_ps(inputVectorPtr); inputVectorPtr += 4;
+
+    intInputVal1 = _mm_cvtps_epi32(_mm_mul_ps(inputVal1, vScalar)); // scale, then convert to int32 using the current MXCSR rounding mode
+    intInputVal2 = _mm_cvtps_epi32(_mm_mul_ps(inputVal2, vScalar));
+    
+    intInputVal1 = _mm_packs_epi32(intInputVal1, intInputVal2); // narrow 2x4 int32 to 8 int16 with signed saturation
+
+    _mm_store_si128((__m128i*)outputVectorPtr, intInputVal1);
+    outputVectorPtr += 8;
+  }
+
+  number = eighthPoints * 8;    // first index the vector loop did not cover
+  for(; number < num_points; number++){
+    *outputVectorPtr++ = (int16_t)(*inputVectorPtr++ * scalar); // NOTE(review): the cast truncates toward zero and does not saturate, unlike the vector loop above
+  }
+}
+#endif /* LV_HAVE_SSE2 */
+
+#if LV_HAVE_SSE
+#include <xmmintrin.h>
+  /*!
+    \brief Multiplies each point in the input buffer by the scalar value, then converts the result into a 16 bit integer value (SSE; aligned loads are used, so the input buffer must be 16-byte aligned)
+    \param outputVector The 16 bit output data buffer
+    \param inputVector The floating point input data buffer
+    \param scalar The value multiplied against each point in the input buffer
+    \param num_points The number of data values to be converted
+  */
+static inline void volk_32f_s32f_convert_16i_a16_sse(int16_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){
+  unsigned int number = 0;
+
+  const unsigned int quarterPoints = num_points / 4; // number of complete 4-float groups
+    
+  const float* inputVectorPtr = (const float*)inputVector;
+  int16_t* outputVectorPtr = outputVector;
+  __m128 vScalar = _mm_set_ps1(scalar); // scalar broadcast to all four lanes
+  __m128 ret;
+
+  float outputFloatBuffer[4] __attribute__((aligned(16))); // _mm_store_ps only needs 16-byte (128-bit) alignment
+
+  for(;number < quarterPoints; number++){
+    ret = _mm_load_ps(inputVectorPtr);
+    inputVectorPtr += 4;
+
+    ret = _mm_mul_ps(ret, vScalar);
+
+    _mm_store_ps(outputFloatBuffer, ret); // spill the scaled lanes so each can be converted with a scalar cast
+    *outputVectorPtr++ = (int16_t)(outputFloatBuffer[0]); // casts truncate toward zero; no range clamping is applied
+    *outputVectorPtr++ = (int16_t)(outputFloatBuffer[1]);
+    *outputVectorPtr++ = (int16_t)(outputFloatBuffer[2]);
+    *outputVectorPtr++ = (int16_t)(outputFloatBuffer[3]);
+  }
+
+  number = quarterPoints * 4;    
+  for(; number < num_points; number++){ // scalar tail for the remaining 0-3 points
+    *outputVectorPtr++ = (int16_t)(*inputVectorPtr++ * scalar);
+  }
+}
+#endif /* LV_HAVE_SSE */
+
+#ifdef LV_HAVE_GENERIC
+  /*!
+    \brief Multiplies each point in the input buffer by the scalar value, then converts the result into a 16 bit integer value (plain C loop, no SIMD intrinsics)
+    \param outputVector The 16 bit output data buffer
+    \param inputVector The floating point input data buffer
+    \param scalar The value multiplied against each point in the input buffer
+    \param num_points The number of data values to be converted
+  */
+static inline void volk_32f_s32f_convert_16i_a16_generic(int16_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){
+  int16_t* outputVectorPtr = outputVector;
+  const float* inputVectorPtr = inputVector;
+  unsigned int number = 0;
+
+  for(number = 0; number < num_points; number++){
+    *outputVectorPtr++ = ((int16_t)(*inputVectorPtr++ * scalar)); // cast truncates toward zero; no range clamping is applied
+  }
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+
+
+#endif /* INCLUDED_volk_32f_s32f_convert_16i_a16_H */
diff --git a/volk/include/volk/volk_32f_s32f_convert_16i_u.h b/volk/include/volk/volk_32f_s32f_convert_16i_u.h
new file mode 100644
index 000000000..4d306e53c
--- /dev/null
+++ b/volk/include/volk/volk_32f_s32f_convert_16i_u.h
@@ -0,0 +1,113 @@
+#ifndef INCLUDED_volk_32f_s32f_convert_16i_u_H
+#define INCLUDED_volk_32f_s32f_convert_16i_u_H
+
+#include <inttypes.h>
+#include <stdio.h>
+
+#if LV_HAVE_SSE2
+#include <emmintrin.h>
+  /*!
+    \brief Multiplies each point in the input buffer by the scalar value, then converts the result into a 16 bit integer value (SSE2)
+    \param outputVector The 16 bit output data buffer
+    \param inputVector The floating point input data buffer
+    \param scalar The value multiplied against each point in the input buffer
+    \param num_points The number of data values to be converted
+    \note Unaligned load/store intrinsics are used, so neither buffer needs to be 16-byte aligned
+  */
+static inline void volk_32f_s32f_convert_16i_u_sse2(int16_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){
+  unsigned int number = 0;
+
+  const unsigned int eighthPoints = num_points / 8; // number of complete 8-float groups
+    
+  const float* inputVectorPtr = (const float*)inputVector;
+  int16_t* outputVectorPtr = outputVector;
+  __m128 vScalar = _mm_set_ps1(scalar); // scalar broadcast to all four lanes
+  __m128 inputVal1, inputVal2;
+  __m128i intInputVal1, intInputVal2;
+
+  for(;number < eighthPoints; number++){
+    inputVal1 = _mm_loadu_ps(inputVectorPtr); inputVectorPtr += 4;
+    inputVal2 = _mm_loadu_ps(inputVectorPtr); inputVectorPtr += 4;
+
+    intInputVal1 = _mm_cvtps_epi32(_mm_mul_ps(inputVal1, vScalar)); // scale, then convert to int32 using the current MXCSR rounding mode
+    intInputVal2 = _mm_cvtps_epi32(_mm_mul_ps(inputVal2, vScalar));
+    
+    intInputVal1 = _mm_packs_epi32(intInputVal1, intInputVal2); // narrow 2x4 int32 to 8 int16 with signed saturation
+
+    _mm_storeu_si128((__m128i*)outputVectorPtr, intInputVal1);
+    outputVectorPtr += 8;
+  }
+
+  number = eighthPoints * 8;    // first index the vector loop did not cover
+  for(; number < num_points; number++){
+    outputVector[number] = (int16_t)(inputVector[number] * scalar); // NOTE(review): the cast truncates toward zero and does not saturate, unlike the vector loop above
+  }
+}
+#endif /* LV_HAVE_SSE2 */
+
+#if LV_HAVE_SSE
+#include <xmmintrin.h>
+  /*!
+    \brief Multiplies each point in the input buffer by the scalar value, then converts the result into a 16 bit integer value (SSE)
+    \param outputVector The 16 bit output data buffer
+    \param inputVector The floating point input data buffer
+    \param scalar The value multiplied against each point in the input buffer
+    \param num_points The number of data values to be converted
+    \note The input is read with unaligned loads and the output is written one element at a time, so neither buffer needs 16-byte alignment
+  */
+static inline void volk_32f_s32f_convert_16i_u_sse(int16_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){
+  unsigned int number = 0;
+
+  const unsigned int quarterPoints = num_points / 4; // number of complete 4-float groups
+    
+  const float* inputVectorPtr = (const float*)inputVector;
+  int16_t* outputVectorPtr = outputVector;
+  __m128 vScalar = _mm_set_ps1(scalar); // scalar broadcast to all four lanes
+  __m128 ret;
+
+  float outputFloatBuffer[4] __attribute__((aligned(16))); // _mm_store_ps only needs 16-byte (128-bit) alignment
+
+  for(;number < quarterPoints; number++){
+    ret = _mm_loadu_ps(inputVectorPtr);
+    inputVectorPtr += 4;
+
+    ret = _mm_mul_ps(ret, vScalar);
+
+    _mm_store_ps(outputFloatBuffer, ret); // spill the scaled lanes so each can be converted with a scalar cast
+    *outputVectorPtr++ = (int16_t)(outputFloatBuffer[0]); // casts truncate toward zero; no range clamping is applied
+    *outputVectorPtr++ = (int16_t)(outputFloatBuffer[1]);
+    *outputVectorPtr++ = (int16_t)(outputFloatBuffer[2]);
+    *outputVectorPtr++ = (int16_t)(outputFloatBuffer[3]);
+  }
+
+  number = quarterPoints * 4;    
+  for(; number < num_points; number++){ // scalar tail for the remaining 0-3 points
+    outputVector[number] = (int16_t)(inputVector[number] * scalar);
+  }
+}
+#endif /* LV_HAVE_SSE */
+
+#ifdef LV_HAVE_GENERIC
+  /*!
+    \brief Multiplies each point in the input buffer by the scalar value, then converts the result into a 16 bit integer value (plain C loop, no SIMD intrinsics)
+    \param outputVector The 16 bit output data buffer
+    \param inputVector The floating point input data buffer
+    \param scalar The value multiplied against each point in the input buffer
+    \param num_points The number of data values to be converted
+    \note Only plain element accesses are used, so no SIMD alignment is required for either buffer
+  */
+static inline void volk_32f_s32f_convert_16i_u_generic(int16_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){
+  int16_t* outputVectorPtr = outputVector;
+  const float* inputVectorPtr = inputVector;
+  unsigned int number = 0;
+
+  for(number = 0; number < num_points; number++){
+    *outputVectorPtr++ = ((int16_t)(*inputVectorPtr++  * scalar)); // cast truncates toward zero; no range clamping is applied
+  }
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+
+
+#endif /* INCLUDED_volk_32f_s32f_convert_16i_u_H */
diff --git a/volk/include/volk/volk_32f_s32f_convert_16s_a16.h b/volk/include/volk/volk_32f_s32f_convert_16s_a16.h
deleted file mode 100644
index cf51cf9c5..000000000
--- a/volk/include/volk/volk_32f_s32f_convert_16s_a16.h
+++ /dev/null
@@ -1,110 +0,0 @@
-#ifndef INCLUDED_volk_32f_s32f_convert_16s_a16_H
-#define INCLUDED_volk_32f_s32f_convert_16s_a16_H
-
-#include <inttypes.h>
-#include <stdio.h>
-
-#if LV_HAVE_SSE2
-#include <emmintrin.h>
-  /*!
-    \brief Multiplies each point in the input buffer by the scalar value, then converts the result into a 16 bit integer value
-    \param inputVector The floating point input data buffer
-    \param outputVector The 16 bit output data buffer
-    \param scalar The value multiplied against each point in the input buffer
-    \param num_points The number of data values to be converted
-  */
-static inline void volk_32f_s32f_convert_16s_a16_sse2(int16_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){
-  unsigned int number = 0;
-
-  const unsigned int eighthPoints = num_points / 8;
-    
-  const float* inputVectorPtr = (const float*)inputVector;
-  int16_t* outputVectorPtr = outputVector;
-  __m128 vScalar = _mm_set_ps1(scalar);
-  __m128 inputVal1, inputVal2;
-  __m128i intInputVal1, intInputVal2;
-
-  for(;number < eighthPoints; number++){
-    inputVal1 = _mm_load_ps(inputVectorPtr); inputVectorPtr += 4;
-    inputVal2 = _mm_load_ps(inputVectorPtr); inputVectorPtr += 4;
-
-    intInputVal1 = _mm_cvtps_epi32(_mm_mul_ps(inputVal1, vScalar));
-    intInputVal2 = _mm_cvtps_epi32(_mm_mul_ps(inputVal2, vScalar));
-    
-    intInputVal1 = _mm_packs_epi32(intInputVal1, intInputVal2);
-
-    _mm_store_si128((__m128i*)outputVectorPtr, intInputVal1);
-    outputVectorPtr += 8;
-  }
-
-  number = eighthPoints * 8;    
-  for(; number < num_points; number++){
-    *outputVectorPtr++ = (int16_t)(*inputVectorPtr++ * scalar);
-  }
-}
-#endif /* LV_HAVE_SSE2 */
-
-#if LV_HAVE_SSE
-#include <xmmintrin.h>
-  /*!
-    \brief Multiplies each point in the input buffer by the scalar value, then converts the result into a 16 bit integer value
-    \param inputVector The floating point input data buffer
-    \param outputVector The 16 bit output data buffer
-    \param scalar The value multiplied against each point in the input buffer
-    \param num_points The number of data values to be converted
-  */
-static inline void volk_32f_s32f_convert_16s_a16_sse(int16_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){
-  unsigned int number = 0;
-
-  const unsigned int quarterPoints = num_points / 4;
-    
-  const float* inputVectorPtr = (const float*)inputVector;
-  int16_t* outputVectorPtr = outputVector;
-  __m128 vScalar = _mm_set_ps1(scalar);
-  __m128 ret;
-
-  float outputFloatBuffer[4] __attribute__((aligned(128)));
-
-  for(;number < quarterPoints; number++){
-    ret = _mm_load_ps(inputVectorPtr);
-    inputVectorPtr += 4;
-
-    ret = _mm_mul_ps(ret, vScalar);
-
-    _mm_store_ps(outputFloatBuffer, ret);
-    *outputVectorPtr++ = (int16_t)(outputFloatBuffer[0]);
-    *outputVectorPtr++ = (int16_t)(outputFloatBuffer[1]);
-    *outputVectorPtr++ = (int16_t)(outputFloatBuffer[2]);
-    *outputVectorPtr++ = (int16_t)(outputFloatBuffer[3]);
-  }
-
-  number = quarterPoints * 4;    
-  for(; number < num_points; number++){
-    *outputVectorPtr++ = (int16_t)(*inputVectorPtr++ * scalar);
-  }
-}
-#endif /* LV_HAVE_SSE */
-
-#ifdef LV_HAVE_GENERIC
-  /*!
-    \brief Multiplies each point in the input buffer by the scalar value, then converts the result into a 16 bit integer value
-    \param inputVector The floating point input data buffer
-    \param outputVector The 16 bit output data buffer
-    \param scalar The value multiplied against each point in the input buffer
-    \param num_points The number of data values to be converted
-  */
-static inline void volk_32f_s32f_convert_16s_a16_generic(int16_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){
-  int16_t* outputVectorPtr = outputVector;
-  const float* inputVectorPtr = inputVector;
-  unsigned int number = 0;
-
-  for(number = 0; number < num_points; number++){
-    *outputVectorPtr++ = ((int16_t)(*inputVectorPtr++ * scalar));
-  }
-}
-#endif /* LV_HAVE_GENERIC */
-
-
-
-
-#endif /* INCLUDED_volk_32f_s32f_convert_16s_a16_H */
diff --git a/volk/include/volk/volk_32f_s32f_convert_16s_ua16.h b/volk/include/volk/volk_32f_s32f_convert_16s_ua16.h
deleted file mode 100644
index 53d159f82..000000000
--- a/volk/include/volk/volk_32f_s32f_convert_16s_ua16.h
+++ /dev/null
@@ -1,113 +0,0 @@
-#ifndef INCLUDED_volk_32f_s32f_convert_16s_ua16_H
-#define INCLUDED_volk_32f_s32f_convert_16s_ua16_H
-
-#include <inttypes.h>
-#include <stdio.h>
-
-#if LV_HAVE_SSE2
-#include <emmintrin.h>
-  /*!
-    \brief Multiplies each point in the input buffer by the scalar value, then converts the result into a 16 bit integer value
-    \param inputVector The floating point input data buffer
-    \param outputVector The 16 bit output data buffer
-    \param scalar The value multiplied against each point in the input buffer
-    \param num_points The number of data values to be converted
-    \note Input buffer does NOT need to be properly aligned
-  */
-static inline void volk_32f_s32f_convert_16s_ua16_sse2(int16_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){
-  unsigned int number = 0;
-
-  const unsigned int eighthPoints = num_points / 8;
-    
-  const float* inputVectorPtr = (const float*)inputVector;
-  int16_t* outputVectorPtr = outputVector;
-  __m128 vScalar = _mm_set_ps1(scalar);
-  __m128 inputVal1, inputVal2;
-  __m128i intInputVal1, intInputVal2;
-
-  for(;number < eighthPoints; number++){
-    inputVal1 = _mm_loadu_ps(inputVectorPtr); inputVectorPtr += 4;
-    inputVal2 = _mm_loadu_ps(inputVectorPtr); inputVectorPtr += 4;
-
-    intInputVal1 = _mm_cvtps_epi32(_mm_mul_ps(inputVal1, vScalar));
-    intInputVal2 = _mm_cvtps_epi32(_mm_mul_ps(inputVal2, vScalar));
-    
-    intInputVal1 = _mm_packs_epi32(intInputVal1, intInputVal2);
-
-    _mm_storeu_si128((__m128i*)outputVectorPtr, intInputVal1);
-    outputVectorPtr += 8;
-  }
-
-  number = eighthPoints * 8;    
-  for(; number < num_points; number++){
-    outputVector[number] = (int16_t)(inputVector[number] * scalar);
-  }
-}
-#endif /* LV_HAVE_SSE2 */
-
-#if LV_HAVE_SSE
-#include <xmmintrin.h>
-  /*!
-    \brief Multiplies each point in the input buffer by the scalar value, then converts the result into a 16 bit integer value
-    \param inputVector The floating point input data buffer
-    \param outputVector The 16 bit output data buffer
-    \param scalar The value multiplied against each point in the input buffer
-    \param num_points The number of data values to be converted
-    \note Input buffer does NOT need to be properly aligned
-  */
-static inline void volk_32f_s32f_convert_16s_ua16_sse(int16_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){
-  unsigned int number = 0;
-
-  const unsigned int quarterPoints = num_points / 4;
-    
-  const float* inputVectorPtr = (const float*)inputVector;
-  int16_t* outputVectorPtr = outputVector;
-  __m128 vScalar = _mm_set_ps1(scalar);
-  __m128 ret;
-
-  float outputFloatBuffer[4] __attribute__((aligned(128)));
-
-  for(;number < quarterPoints; number++){
-    ret = _mm_loadu_ps(inputVectorPtr);
-    inputVectorPtr += 4;
-
-    ret = _mm_mul_ps(ret, vScalar);
-
-    _mm_store_ps(outputFloatBuffer, ret);
-    *outputVectorPtr++ = (int16_t)(outputFloatBuffer[0]);
-    *outputVectorPtr++ = (int16_t)(outputFloatBuffer[1]);
-    *outputVectorPtr++ = (int16_t)(outputFloatBuffer[2]);
-    *outputVectorPtr++ = (int16_t)(outputFloatBuffer[3]);
-  }
-
-  number = quarterPoints * 4;    
-  for(; number < num_points; number++){
-    outputVector[number] = (int16_t)(inputVector[number] * scalar);
-  }
-}
-#endif /* LV_HAVE_SSE */
-
-#ifdef LV_HAVE_GENERIC
-  /*!
-    \brief Multiplies each point in the input buffer by the scalar value, then converts the result into a 16 bit integer value
-    \param inputVector The floating point input data buffer
-    \param outputVector The 16 bit output data buffer
-    \param scalar The value multiplied against each point in the input buffer
-    \param num_points The number of data values to be converted
-    \note Input buffer does NOT need to be properly aligned
-  */
-static inline void volk_32f_s32f_convert_16s_ua16_generic(int16_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){
-  int16_t* outputVectorPtr = outputVector;
-  const float* inputVectorPtr = inputVector;
-  unsigned int number = 0;
-
-  for(number = 0; number < num_points; number++){
-    *outputVectorPtr++ = ((int16_t)(*inputVectorPtr++  * scalar));
-  }
-}
-#endif /* LV_HAVE_GENERIC */
-
-
-
-
-#endif /* INCLUDED_volk_32f_s32f_convert_16s_ua16_H */
diff --git a/volk/include/volk/volk_32f_s32f_convert_32i_a16.h b/volk/include/volk/volk_32f_s32f_convert_32i_a16.h
new file mode 100644
index 000000000..ae874fd7b
--- /dev/null
+++ b/volk/include/volk/volk_32f_s32f_convert_32i_a16.h
@@ -0,0 +1,106 @@
+#ifndef INCLUDED_volk_32f_s32f_convert_32i_a16_H
+#define INCLUDED_volk_32f_s32f_convert_32i_a16_H
+
+#include <inttypes.h>
+#include <stdio.h>
+
+#if LV_HAVE_SSE2
+#include <emmintrin.h>
+  /*!
+    \brief Multiplies each point in the input buffer by the scalar value, then converts the result into a 32 bit integer value (SSE2; aligned loads/stores are used, so both buffers must be 16-byte aligned)
+    \param outputVector The 32 bit output data buffer
+    \param inputVector The floating point input data buffer
+    \param scalar The value multiplied against each point in the input buffer
+    \param num_points The number of data values to be converted
+  */
+static inline void volk_32f_s32f_convert_32i_a16_sse2(int32_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){
+  unsigned int number = 0;
+
+  const unsigned int quarterPoints = num_points / 4; // number of complete 4-float groups
+    
+  const float* inputVectorPtr = (const float*)inputVector;
+  int32_t* outputVectorPtr = outputVector;
+  __m128 vScalar = _mm_set_ps1(scalar); // scalar broadcast to all four lanes
+  __m128 inputVal1;
+  __m128i intInputVal1;
+
+  for(;number < quarterPoints; number++){
+    inputVal1 = _mm_load_ps(inputVectorPtr); inputVectorPtr += 4;
+
+    intInputVal1 = _mm_cvtps_epi32(_mm_mul_ps(inputVal1, vScalar)); // scale, then convert to int32 using the current MXCSR rounding mode
+
+    _mm_store_si128((__m128i*)outputVectorPtr, intInputVal1);
+    outputVectorPtr += 4;
+  }
+
+  number = quarterPoints * 4;    // first index the vector loop did not cover
+  for(; number < num_points; number++){
+    outputVector[number] = (int32_t)(inputVector[number] * scalar); // NOTE(review): the cast truncates toward zero, whereas the vector loop rounds per MXCSR
+  }
+}
+#endif /* LV_HAVE_SSE2 */
+
+#if LV_HAVE_SSE
+#include <xmmintrin.h>
+  /*!
+    \brief Multiplies each point in the input buffer by the scalar value, then converts the result into a 32 bit integer value (SSE; aligned loads are used, so the input buffer must be 16-byte aligned)
+    \param outputVector The 32 bit output data buffer
+    \param inputVector The floating point input data buffer
+    \param scalar The value multiplied against each point in the input buffer
+    \param num_points The number of data values to be converted
+  */
+static inline void volk_32f_s32f_convert_32i_a16_sse(int32_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){
+  unsigned int number = 0;
+
+  const unsigned int quarterPoints = num_points / 4; // number of complete 4-float groups
+    
+  const float* inputVectorPtr = (const float*)inputVector;
+  int32_t* outputVectorPtr = outputVector;
+  __m128 vScalar = _mm_set_ps1(scalar); // scalar broadcast to all four lanes
+  __m128 ret;
+
+  float outputFloatBuffer[4] __attribute__((aligned(16))); // _mm_store_ps only needs 16-byte (128-bit) alignment
+
+  for(;number < quarterPoints; number++){
+    ret = _mm_load_ps(inputVectorPtr);
+    inputVectorPtr += 4;
+
+    ret = _mm_mul_ps(ret, vScalar);
+
+    _mm_store_ps(outputFloatBuffer, ret); // spill the scaled lanes so each can be converted with a scalar cast
+    *outputVectorPtr++ = (int32_t)(outputFloatBuffer[0]); // casts truncate toward zero; no range clamping is applied
+    *outputVectorPtr++ = (int32_t)(outputFloatBuffer[1]);
+    *outputVectorPtr++ = (int32_t)(outputFloatBuffer[2]);
+    *outputVectorPtr++ = (int32_t)(outputFloatBuffer[3]);
+  }
+
+  number = quarterPoints * 4;    
+  for(; number < num_points; number++){ // scalar tail for the remaining 0-3 points
+    outputVector[number] = (int32_t)(inputVector[number] * scalar);
+  }
+}
+#endif /* LV_HAVE_SSE */
+
+#ifdef LV_HAVE_GENERIC
+  /*!
+    \brief Multiplies each point in the input buffer by the scalar value, then converts the result into a 32 bit integer value (plain C loop, no SIMD intrinsics)
+    \param outputVector The 32 bit output data buffer
+    \param inputVector The floating point input data buffer
+    \param scalar The value multiplied against each point in the input buffer
+    \param num_points The number of data values to be converted
+  */
+static inline void volk_32f_s32f_convert_32i_a16_generic(int32_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){
+  int32_t* outputVectorPtr = outputVector;
+  const float* inputVectorPtr = inputVector;
+  unsigned int number = 0;
+
+  for(number = 0; number < num_points; number++){
+    *outputVectorPtr++ = ((int32_t)(*inputVectorPtr++  * scalar)); // cast truncates toward zero; no range clamping is applied
+  }
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+
+
+#endif /* INCLUDED_volk_32f_s32f_convert_32i_a16_H */
diff --git a/volk/include/volk/volk_32f_s32f_convert_32i_u.h b/volk/include/volk/volk_32f_s32f_convert_32i_u.h
new file mode 100644
index 000000000..561fcd800
--- /dev/null
+++ b/volk/include/volk/volk_32f_s32f_convert_32i_u.h
@@ -0,0 +1,109 @@
+#ifndef INCLUDED_volk_32f_s32f_convert_32i_u_H
+#define INCLUDED_volk_32f_s32f_convert_32i_u_H
+
+#include <inttypes.h>
+#include <stdio.h>
+
+#if LV_HAVE_SSE2
+#include <emmintrin.h>
+  /*!
+    \brief Multiplies each point in the input buffer by the scalar value, then converts the result into a 32 bit integer value
+    \param inputVector The floating point input data buffer
+    \param outputVector The 32 bit output data buffer
+    \param scalar The value multiplied against each point in the input buffer
+    \param num_points The number of data values to be converted
+    \note Input buffer does NOT need to be properly aligned
+  */
+static inline void volk_32f_s32f_convert_32i_u_sse2(int32_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){
+  unsigned int number = 0;
+  const unsigned int quarterPoints = num_points / 4;
+
+  const float* inputVectorPtr = (const float*)inputVector;
+  int32_t* outputVectorPtr = outputVector;
+  __m128 vScalar = _mm_set_ps1(scalar);
+  __m128 inputVal1;
+  __m128i intInputVal1;
+
+  /* Four points per pass; _mm_cvtps_epi32 rounds per MXCSR (nearest-even by default). */
+  for(;number < quarterPoints; number++){
+    inputVal1 = _mm_loadu_ps(inputVectorPtr); inputVectorPtr += 4;
+    intInputVal1 = _mm_cvtps_epi32(_mm_mul_ps(inputVal1, vScalar));
+
+    _mm_storeu_si128((__m128i*)outputVectorPtr, intInputVal1);
+    outputVectorPtr += 4;
+  }
+
+  /* Tail: convert with the scalar SSE instruction so leftover points round exactly like
+     the vector body; a plain C cast truncates and makes the result depend on position. */
+  for(number = quarterPoints * 4; number < num_points; number++){
+    outputVector[number] = _mm_cvtss_si32(_mm_mul_ss(_mm_load_ss(&inputVector[number]), vScalar));
+  }
+}
+#endif /* LV_HAVE_SSE2 */
+
+#if LV_HAVE_SSE
+#include <xmmintrin.h>
+  /*!
+    \brief Multiplies each point in the input buffer by the scalar value, then converts the result into a 32 bit integer value
+    \param inputVector The floating point input data buffer
+    \param outputVector The 32 bit output data buffer
+    \param scalar The value multiplied against each point in the input buffer
+    \param num_points The number of data values to be converted
+    \note Input buffer does NOT need to be properly aligned
+  */
+static inline void volk_32f_s32f_convert_32i_u_sse(int32_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){
+  /* Products are computed four at a time, parked in a scratch buffer, and
+     converted with C casts, the same conversion the scalar tail applies. */
+
+  const unsigned int numQuads = num_points / 4;
+  const __m128 scale = _mm_set_ps1(scalar);
+  unsigned int quad;
+  unsigned int lane;
+  unsigned int idx;
+
+  /* scratch for one vector of products; _mm_store_ps needs an aligned target */
+  float scratch[4] __attribute__((aligned(128)));
+
+  for(quad = 0; quad < numQuads; quad++){
+    const float* src = inputVector + 4 * quad;
+    int32_t* dst = outputVector + 4 * quad;
+
+    /* unaligned load: no alignment is assumed for the input */
+    _mm_store_ps(scratch, _mm_mul_ps(_mm_loadu_ps(src), scale));
+
+    for(lane = 0; lane < 4; lane++){
+      dst[lane] = (int32_t)(scratch[lane]);
+    }
+  }
+
+  /* remaining points that do not fill a whole vector */
+  for(idx = numQuads * 4; idx < num_points; idx++){
+    outputVector[idx] = (int32_t)(inputVector[idx] * scalar);
+  }
+}
+#endif /* LV_HAVE_SSE */
+
+#ifdef LV_HAVE_GENERIC
+  /*!
+    \brief Multiplies each point in the input buffer by the scalar value, then converts the result into a 32 bit integer value
+    \param inputVector The floating point input data buffer
+    \param outputVector The 32 bit output data buffer
+    \param scalar The value multiplied against each point in the input buffer
+    \param num_points The number of data values to be converted
+    \note Input buffer does NOT need to be properly aligned
+  */
+static inline void volk_32f_s32f_convert_32i_u_generic(int32_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){
+  /* Plain C path: scale each sample and let the cast truncate toward zero. */
+  const float* src = inputVector;
+  const float* const srcEnd = inputVector + num_points;
+  int32_t* dst = outputVector;
+  while(src != srcEnd){
+    *dst++ = (int32_t)(*src++ * scalar);
+  }
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+
+
+#endif /* INCLUDED_volk_32f_s32f_convert_32i_u_H */
diff --git a/volk/include/volk/volk_32f_s32f_convert_32s_a16.h b/volk/include/volk/volk_32f_s32f_convert_32s_a16.h
deleted file mode 100644
index 0be649418..000000000
--- a/volk/include/volk/volk_32f_s32f_convert_32s_a16.h
+++ /dev/null
@@ -1,106 +0,0 @@
-#ifndef INCLUDED_volk_32f_s32f_convert_32s_a16_H
-#define INCLUDED_volk_32f_s32f_convert_32s_a16_H
-
-#include <inttypes.h>
-#include <stdio.h>
-
-#if LV_HAVE_SSE2
-#include <emmintrin.h>
-  /*!
-    \brief Multiplies each point in the input buffer by the scalar value, then converts the result into a 32 bit integer value
-    \param inputVector The floating point input data buffer
-    \param outputVector The 32 bit output data buffer
-    \param scalar The value multiplied against each point in the input buffer
-    \param num_points The number of data values to be converted
-  */
-static inline void volk_32f_s32f_convert_32s_a16_sse2(int32_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){
-  unsigned int number = 0;
-
-  const unsigned int quarterPoints = num_points / 4;
-    
-  const float* inputVectorPtr = (const float*)inputVector;
-  int32_t* outputVectorPtr = outputVector;
-  __m128 vScalar = _mm_set_ps1(scalar);
-  __m128 inputVal1;
-  __m128i intInputVal1;
-
-  for(;number < quarterPoints; number++){
-    inputVal1 = _mm_load_ps(inputVectorPtr); inputVectorPtr += 4;
-
-    intInputVal1 = _mm_cvtps_epi32(_mm_mul_ps(inputVal1, vScalar));
-
-    _mm_store_si128((__m128i*)outputVectorPtr, intInputVal1);
-    outputVectorPtr += 4;
-  }
-
-  number = quarterPoints * 4;    
-  for(; number < num_points; number++){
-    outputVector[number] = (int32_t)(inputVector[number] * scalar);
-  }
-}
-#endif /* LV_HAVE_SSE2 */
-
-#if LV_HAVE_SSE
-#include <xmmintrin.h>
-  /*!
-    \brief Multiplies each point in the input buffer by the scalar value, then converts the result into a 32 bit integer value
-    \param inputVector The floating point input data buffer
-    \param outputVector The 32 bit output data buffer
-    \param scalar The value multiplied against each point in the input buffer
-    \param num_points The number of data values to be converted
-  */
-static inline void volk_32f_s32f_convert_32s_a16_sse(int32_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){
-  unsigned int number = 0;
-
-  const unsigned int quarterPoints = num_points / 4;
-    
-  const float* inputVectorPtr = (const float*)inputVector;
-  int32_t* outputVectorPtr = outputVector;
-  __m128 vScalar = _mm_set_ps1(scalar);
-  __m128 ret;
-
-  float outputFloatBuffer[4] __attribute__((aligned(128)));
-
-  for(;number < quarterPoints; number++){
-    ret = _mm_load_ps(inputVectorPtr);
-    inputVectorPtr += 4;
-
-    ret = _mm_mul_ps(ret, vScalar);
-
-    _mm_store_ps(outputFloatBuffer, ret);
-    *outputVectorPtr++ = (int32_t)(outputFloatBuffer[0]);
-    *outputVectorPtr++ = (int32_t)(outputFloatBuffer[1]);
-    *outputVectorPtr++ = (int32_t)(outputFloatBuffer[2]);
-    *outputVectorPtr++ = (int32_t)(outputFloatBuffer[3]);
-  }
-
-  number = quarterPoints * 4;    
-  for(; number < num_points; number++){
-    outputVector[number] = (int32_t)(inputVector[number] * scalar);
-  }
-}
-#endif /* LV_HAVE_SSE */
-
-#ifdef LV_HAVE_GENERIC
-  /*!
-    \brief Multiplies each point in the input buffer by the scalar value, then converts the result into a 32 bit integer value
-    \param inputVector The floating point input data buffer
-    \param outputVector The 32 bit output data buffer
-    \param scalar The value multiplied against each point in the input buffer
-    \param num_points The number of data values to be converted
-  */
-static inline void volk_32f_s32f_convert_32s_a16_generic(int32_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){
-  int32_t* outputVectorPtr = outputVector;
-  const float* inputVectorPtr = inputVector;
-  unsigned int number = 0;
-
-  for(number = 0; number < num_points; number++){
-    *outputVectorPtr++ = ((int32_t)(*inputVectorPtr++  * scalar));
-  }
-}
-#endif /* LV_HAVE_GENERIC */
-
-
-
-
-#endif /* INCLUDED_volk_32f_s32f_convert_32s_a16_H */
diff --git a/volk/include/volk/volk_32f_s32f_convert_32s_ua16.h b/volk/include/volk/volk_32f_s32f_convert_32s_ua16.h
deleted file mode 100644
index efb2c3a20..000000000
--- a/volk/include/volk/volk_32f_s32f_convert_32s_ua16.h
+++ /dev/null
@@ -1,109 +0,0 @@
-#ifndef INCLUDED_volk_32f_s32f_convert_32s_ua16_H
-#define INCLUDED_volk_32f_s32f_convert_32s_ua16_H
-
-#include <inttypes.h>
-#include <stdio.h>
-
-#if LV_HAVE_SSE2
-#include <emmintrin.h>
-  /*!
-    \brief Multiplies each point in the input buffer by the scalar value, then converts the result into a 32 bit integer value
-    \param inputVector The floating point input data buffer
-    \param outputVector The 32 bit output data buffer
-    \param scalar The value multiplied against each point in the input buffer
-    \param num_points The number of data values to be converted
-    \note Input buffer does NOT need to be properly aligned
-  */
-static inline void volk_32f_s32f_convert_32s_ua16_sse2(int32_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){
-  unsigned int number = 0;
-
-  const unsigned int quarterPoints = num_points / 4;
-    
-  const float* inputVectorPtr = (const float*)inputVector;
-  int32_t* outputVectorPtr = outputVector;
-  __m128 vScalar = _mm_set_ps1(scalar);
-  __m128 inputVal1;
-  __m128i intInputVal1;
-
-  for(;number < quarterPoints; number++){
-    inputVal1 = _mm_loadu_ps(inputVectorPtr); inputVectorPtr += 4;
-
-    intInputVal1 = _mm_cvtps_epi32(_mm_mul_ps(inputVal1, vScalar));
-
-    _mm_storeu_si128((__m128i*)outputVectorPtr, intInputVal1);
-    outputVectorPtr += 4;
-  }
-
-  number = quarterPoints * 4;    
-  for(; number < num_points; number++){
-    outputVector[number] = (int32_t)(inputVector[number] * scalar);
-  }
-}
-#endif /* LV_HAVE_SSE2 */
-
-#if LV_HAVE_SSE
-#include <xmmintrin.h>
-  /*!
-    \brief Multiplies each point in the input buffer by the scalar value, then converts the result into a 32 bit integer value
-    \param inputVector The floating point input data buffer
-    \param outputVector The 32 bit output data buffer
-    \param scalar The value multiplied against each point in the input buffer
-    \param num_points The number of data values to be converted
-    \note Input buffer does NOT need to be properly aligned
-  */
-static inline void volk_32f_s32f_convert_32s_ua16_sse(int32_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){
-  unsigned int number = 0;
-
-  const unsigned int quarterPoints = num_points / 4;
-    
-  const float* inputVectorPtr = (const float*)inputVector;
-  int32_t* outputVectorPtr = outputVector;
-  __m128 vScalar = _mm_set_ps1(scalar);
-  __m128 ret;
-
-  float outputFloatBuffer[4] __attribute__((aligned(128)));
-
-  for(;number < quarterPoints; number++){
-    ret = _mm_loadu_ps(inputVectorPtr);
-    inputVectorPtr += 4;
-
-    ret = _mm_mul_ps(ret, vScalar);
-
-    _mm_store_ps(outputFloatBuffer, ret);
-    *outputVectorPtr++ = (int32_t)(outputFloatBuffer[0]);
-    *outputVectorPtr++ = (int32_t)(outputFloatBuffer[1]);
-    *outputVectorPtr++ = (int32_t)(outputFloatBuffer[2]);
-    *outputVectorPtr++ = (int32_t)(outputFloatBuffer[3]);
-  }
-
-  number = quarterPoints * 4;    
-  for(; number < num_points; number++){
-    outputVector[number] = (int32_t)(inputVector[number] * scalar);
-  }
-}
-#endif /* LV_HAVE_SSE */
-
-#ifdef LV_HAVE_GENERIC
-  /*!
-    \brief Multiplies each point in the input buffer by the scalar value, then converts the result into a 32 bit integer value
-    \param inputVector The floating point input data buffer
-    \param outputVector The 32 bit output data buffer
-    \param scalar The value multiplied against each point in the input buffer
-    \param num_points The number of data values to be converted
-    \note Input buffer does NOT need to be properly aligned
-  */
-static inline void volk_32f_s32f_convert_32s_ua16_generic(int32_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){
-  int32_t* outputVectorPtr = outputVector;
-  const float* inputVectorPtr = inputVector;
-  unsigned int number = 0;
-
-  for(number = 0; number < num_points; number++){
-    *outputVectorPtr++ = ((int32_t)(*inputVectorPtr++  * scalar));
-  }
-}
-#endif /* LV_HAVE_GENERIC */
-
-
-
-
-#endif /* INCLUDED_volk_32f_s32f_convert_32s_ua16_H */
diff --git a/volk/include/volk/volk_32f_s32f_convert_8i_a16.h b/volk/include/volk/volk_32f_s32f_convert_8i_a16.h
new file mode 100644
index 000000000..c91448951
--- /dev/null
+++ b/volk/include/volk/volk_32f_s32f_convert_8i_a16.h
@@ -0,0 +1,117 @@
+#ifndef INCLUDED_volk_32f_s32f_convert_8i_a16_H
+#define INCLUDED_volk_32f_s32f_convert_8i_a16_H
+
+#include <inttypes.h>
+#include <stdio.h>
+
+#if LV_HAVE_SSE2
+#include <emmintrin.h>
+  /*!
+    \brief Multiplies each point in the input buffer by the scalar value, then converts the result into an 8 bit integer value
+    \param inputVector The floating point input data buffer
+    \param outputVector The 8 bit output data buffer
+    \param scalar The value multiplied against each point in the input buffer
+    \param num_points The number of data values to be converted
+  */
+static inline void volk_32f_s32f_convert_8i_a16_sse2(int8_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){
+  unsigned int number = 0;
+  const unsigned int sixteenthPoints = num_points / 16;
+
+  const float* inputVectorPtr = (const float*)inputVector;
+  int8_t* outputVectorPtr = outputVector;
+  __m128 vScalar = _mm_set_ps1(scalar);
+  __m128 inputVal1, inputVal2, inputVal3, inputVal4;
+  __m128i intInputVal1, intInputVal2, intInputVal3, intInputVal4;
+
+  for(;number < sixteenthPoints; number++){
+    inputVal1 = _mm_load_ps(inputVectorPtr); inputVectorPtr += 4;
+    inputVal2 = _mm_load_ps(inputVectorPtr); inputVectorPtr += 4;
+    inputVal3 = _mm_load_ps(inputVectorPtr); inputVectorPtr += 4;
+    inputVal4 = _mm_load_ps(inputVectorPtr); inputVectorPtr += 4;
+
+    intInputVal1 = _mm_cvtps_epi32(_mm_mul_ps(inputVal1, vScalar));
+    intInputVal2 = _mm_cvtps_epi32(_mm_mul_ps(inputVal2, vScalar));
+    intInputVal3 = _mm_cvtps_epi32(_mm_mul_ps(inputVal3, vScalar));
+    intInputVal4 = _mm_cvtps_epi32(_mm_mul_ps(inputVal4, vScalar));
+    /* signed-saturating packs: int32 -> int16 -> int8, i.e. clamp to [-128, 127] */
+    intInputVal1 = _mm_packs_epi32(intInputVal1, intInputVal2);
+    intInputVal3 = _mm_packs_epi32(intInputVal3, intInputVal4);
+    intInputVal1 = _mm_packs_epi16(intInputVal1, intInputVal3);
+
+    _mm_store_si128((__m128i*)outputVectorPtr, intInputVal1);
+    outputVectorPtr += 16;
+  }
+
+  /* Tail: round with the scalar SSE conversion and clamp, so leftover points get the same
+     rounding and saturation as the vector body instead of a truncating, unclamped C cast. */
+  for(number = sixteenthPoints * 16; number < num_points; number++){
+    const int rounded = _mm_cvtss_si32(_mm_mul_ss(_mm_load_ss(&inputVector[number]), vScalar));
+    outputVector[number] = (int8_t)(rounded > 127 ? 127 : (rounded < -128 ? -128 : rounded));
+  }
+}
+#endif /* LV_HAVE_SSE2 */
+
+#if LV_HAVE_SSE
+#include <xmmintrin.h>
+  /*!
+    \brief Multiplies each point in the input buffer by the scalar value, then converts the result into an 8 bit integer value
+    \param inputVector The floating point input data buffer
+    \param outputVector The 8 bit output data buffer
+    \param scalar The value multiplied against each point in the input buffer
+    \param num_points The number of data values to be converted
+  */
+static inline void volk_32f_s32f_convert_8i_a16_sse(int8_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){
+  /* Products are computed four at a time, parked in a scratch buffer, and
+     narrowed with C casts, the same conversion the scalar tail applies. */
+
+  const unsigned int numQuads = num_points / 4;
+  const __m128 scale = _mm_set_ps1(scalar);
+  unsigned int quad;
+  unsigned int lane;
+  unsigned int idx;
+
+  /* scratch for one vector of products; _mm_store_ps needs an aligned target */
+  float scratch[4] __attribute__((aligned(128)));
+
+  for(quad = 0; quad < numQuads; quad++){
+    const float* src = inputVector + 4 * quad;
+    int8_t* dst = outputVector + 4 * quad;
+
+    /* aligned load: inputVector must sit on a 16-byte boundary */
+    _mm_store_ps(scratch, _mm_mul_ps(_mm_load_ps(src), scale));
+
+    for(lane = 0; lane < 4; lane++){
+      dst[lane] = (int8_t)(scratch[lane]);
+    }
+  }
+
+  /* remaining points that do not fill a whole vector */
+  for(idx = numQuads * 4; idx < num_points; idx++){
+    outputVector[idx] = (int8_t)(inputVector[idx] * scalar);
+  }
+}
+#endif /* LV_HAVE_SSE */
+
+#ifdef LV_HAVE_GENERIC
+  /*!
+    \brief Multiplies each point in the input buffer by the scalar value, then converts the result into an 8 bit integer value
+    \param inputVector The floating point input data buffer
+    \param outputVector The 8 bit output data buffer
+    \param scalar The value multiplied against each point in the input buffer
+    \param num_points The number of data values to be converted
+  */
+static inline void volk_32f_s32f_convert_8i_a16_generic(int8_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){
+  /* Plain C path: scale every sample, then narrow the product to int8_t
+     with a C cast (fractional part discarded). */
+  unsigned int idx;
+
+  for(idx = 0; idx < num_points; idx++){
+    outputVector[idx] = (int8_t)(inputVector[idx] * scalar);
+  }
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+
+
+#endif /* INCLUDED_volk_32f_s32f_convert_8i_a16_H */
diff --git a/volk/include/volk/volk_32f_s32f_convert_8i_u.h b/volk/include/volk/volk_32f_s32f_convert_8i_u.h
new file mode 100644
index 000000000..420693571
--- /dev/null
+++ b/volk/include/volk/volk_32f_s32f_convert_8i_u.h
@@ -0,0 +1,120 @@
+#ifndef INCLUDED_volk_32f_s32f_convert_8i_u_H
+#define INCLUDED_volk_32f_s32f_convert_8i_u_H
+
+#include <inttypes.h>
+#include <stdio.h>
+
+#if LV_HAVE_SSE2
+#include <emmintrin.h>
+  /*!
+    \brief Multiplies each point in the input buffer by the scalar value, then converts the result into an 8 bit integer value
+    \param inputVector The floating point input data buffer
+    \param outputVector The 8 bit output data buffer
+    \param scalar The value multiplied against each point in the input buffer
+    \param num_points The number of data values to be converted
+    \note Input buffer does NOT need to be properly aligned
+  */
+static inline void volk_32f_s32f_convert_8i_u_sse2(int8_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){
+  unsigned int number = 0;
+  const unsigned int sixteenthPoints = num_points / 16;
+
+  const float* inputVectorPtr = (const float*)inputVector;
+  int8_t* outputVectorPtr = outputVector;
+  __m128 vScalar = _mm_set_ps1(scalar);
+  __m128 inputVal1, inputVal2, inputVal3, inputVal4;
+  __m128i intInputVal1, intInputVal2, intInputVal3, intInputVal4;
+
+  for(;number < sixteenthPoints; number++){
+    inputVal1 = _mm_loadu_ps(inputVectorPtr); inputVectorPtr += 4;
+    inputVal2 = _mm_loadu_ps(inputVectorPtr); inputVectorPtr += 4;
+    inputVal3 = _mm_loadu_ps(inputVectorPtr); inputVectorPtr += 4;
+    inputVal4 = _mm_loadu_ps(inputVectorPtr); inputVectorPtr += 4;
+
+    intInputVal1 = _mm_cvtps_epi32(_mm_mul_ps(inputVal1, vScalar));
+    intInputVal2 = _mm_cvtps_epi32(_mm_mul_ps(inputVal2, vScalar));
+    intInputVal3 = _mm_cvtps_epi32(_mm_mul_ps(inputVal3, vScalar));
+    intInputVal4 = _mm_cvtps_epi32(_mm_mul_ps(inputVal4, vScalar));
+    /* signed-saturating packs: int32 -> int16 -> int8, i.e. clamp to [-128, 127] */
+    intInputVal1 = _mm_packs_epi32(intInputVal1, intInputVal2);
+    intInputVal3 = _mm_packs_epi32(intInputVal3, intInputVal4);
+    intInputVal1 = _mm_packs_epi16(intInputVal1, intInputVal3);
+
+    _mm_storeu_si128((__m128i*)outputVectorPtr, intInputVal1);
+    outputVectorPtr += 16;
+  }
+
+  /* Tail: round with the scalar SSE conversion and clamp, so leftover points get the same
+     rounding and saturation as the vector body instead of a truncating, unclamped C cast. */
+  for(number = sixteenthPoints * 16; number < num_points; number++){
+    const int rounded = _mm_cvtss_si32(_mm_mul_ss(_mm_load_ss(&inputVector[number]), vScalar));
+    outputVector[number] = (int8_t)(rounded > 127 ? 127 : (rounded < -128 ? -128 : rounded));
+  }
+}
+#endif /* LV_HAVE_SSE2 */
+
+#if LV_HAVE_SSE
+#include <xmmintrin.h>
+  /*!
+    \brief Multiplies each point in the input buffer by the scalar value, then converts the result into an 8 bit integer value
+    \param inputVector The floating point input data buffer
+    \param outputVector The 8 bit output data buffer
+    \param scalar The value multiplied against each point in the input buffer
+    \param num_points The number of data values to be converted
+    \note Input buffer does NOT need to be properly aligned
+  */
+static inline void volk_32f_s32f_convert_8i_u_sse(int8_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){
+  /* Products are computed four at a time, parked in a scratch buffer, and
+     narrowed with C casts, the same conversion the scalar tail applies. */
+
+  const unsigned int numQuads = num_points / 4;
+  const __m128 scale = _mm_set_ps1(scalar);
+  unsigned int quad;
+  unsigned int lane;
+  unsigned int idx;
+
+  /* scratch for one vector of products; _mm_store_ps needs an aligned target */
+  float scratch[4] __attribute__((aligned(128)));
+
+  for(quad = 0; quad < numQuads; quad++){
+    const float* src = inputVector + 4 * quad;
+    int8_t* dst = outputVector + 4 * quad;
+
+    /* unaligned load: no alignment is assumed for the input */
+    _mm_store_ps(scratch, _mm_mul_ps(_mm_loadu_ps(src), scale));
+
+    for(lane = 0; lane < 4; lane++){
+      dst[lane] = (int8_t)(scratch[lane]);
+    }
+  }
+
+  /* remaining points that do not fill a whole vector */
+  for(idx = numQuads * 4; idx < num_points; idx++){
+    outputVector[idx] = (int8_t)(inputVector[idx] * scalar);
+  }
+}
+#endif /* LV_HAVE_SSE */
+
+#ifdef LV_HAVE_GENERIC
+  /*!
+    \brief Multiplies each point in the input buffer by the scalar value, then converts the result into an 8 bit integer value
+    \param inputVector The floating point input data buffer
+    \param outputVector The 8 bit output data buffer
+    \param scalar The value multiplied against each point in the input buffer
+    \param num_points The number of data values to be converted
+    \note Input buffer does NOT need to be properly aligned
+  */
+static inline void volk_32f_s32f_convert_8i_u_generic(int8_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){
+  /* Plain C path: scale each sample and narrow it to int8_t with a C cast. */
+  const float* src = inputVector;
+  const float* const srcEnd = inputVector + num_points;
+  int8_t* dst = outputVector;
+  while(src != srcEnd){
+    *dst++ = (int8_t)(*src++ * scalar);
+  }
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+
+
+#endif /* INCLUDED_volk_32f_s32f_convert_8i_u_H */
diff --git a/volk/include/volk/volk_32f_s32f_convert_8s_a16.h b/volk/include/volk/volk_32f_s32f_convert_8s_a16.h
deleted file mode 100644
index 69ccec5c6..000000000
--- a/volk/include/volk/volk_32f_s32f_convert_8s_a16.h
+++ /dev/null
@@ -1,117 +0,0 @@
-#ifndef INCLUDED_volk_32f_s32f_convert_8s_a16_H
-#define INCLUDED_volk_32f_s32f_convert_8s_a16_H
-
-#include <inttypes.h>
-#include <stdio.h>
-
-#if LV_HAVE_SSE2
-#include <emmintrin.h>
-  /*!
-    \brief Multiplies each point in the input buffer by the scalar value, then converts the result into a 8 bit integer value
-    \param inputVector The floating point input data buffer
-    \param outputVector The 8 bit output data buffer
-    \param scalar The value multiplied against each point in the input buffer
-    \param num_points The number of data values to be converted
-  */
-static inline void volk_32f_s32f_convert_8s_a16_sse2(int8_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){
-  unsigned int number = 0;
-
-  const unsigned int sixteenthPoints = num_points / 16;
-    
-  const float* inputVectorPtr = (const float*)inputVector;
-  int8_t* outputVectorPtr = outputVector;
-  __m128 vScalar = _mm_set_ps1(scalar);
-  __m128 inputVal1, inputVal2, inputVal3, inputVal4;
-  __m128i intInputVal1, intInputVal2, intInputVal3, intInputVal4;
-
-  for(;number < sixteenthPoints; number++){
-    inputVal1 = _mm_load_ps(inputVectorPtr); inputVectorPtr += 4;
-    inputVal2 = _mm_load_ps(inputVectorPtr); inputVectorPtr += 4;
-    inputVal3 = _mm_load_ps(inputVectorPtr); inputVectorPtr += 4;
-    inputVal4 = _mm_load_ps(inputVectorPtr); inputVectorPtr += 4;
-
-    intInputVal1 = _mm_cvtps_epi32(_mm_mul_ps(inputVal1, vScalar));
-    intInputVal2 = _mm_cvtps_epi32(_mm_mul_ps(inputVal2, vScalar));
-    intInputVal3 = _mm_cvtps_epi32(_mm_mul_ps(inputVal3, vScalar));
-    intInputVal4 = _mm_cvtps_epi32(_mm_mul_ps(inputVal4, vScalar));
-    
-    intInputVal1 = _mm_packs_epi32(intInputVal1, intInputVal2);
-    intInputVal3 = _mm_packs_epi32(intInputVal3, intInputVal4);
-
-    intInputVal1 = _mm_packs_epi16(intInputVal1, intInputVal3);
-
-    _mm_store_si128((__m128i*)outputVectorPtr, intInputVal1);
-    outputVectorPtr += 16;
-  }
-
-  number = sixteenthPoints * 16;    
-  for(; number < num_points; number++){
-    outputVector[number] = (int8_t)(inputVector[number] * scalar);
-  }
-}
-#endif /* LV_HAVE_SSE2 */
-
-#if LV_HAVE_SSE
-#include <xmmintrin.h>
-  /*!
-    \brief Multiplies each point in the input buffer by the scalar value, then converts the result into a 8 bit integer value
-    \param inputVector The floating point input data buffer
-    \param outputVector The 8 bit output data buffer
-    \param scalar The value multiplied against each point in the input buffer
-    \param num_points The number of data values to be converted
-  */
-static inline void volk_32f_s32f_convert_8s_a16_sse(int8_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){
-  unsigned int number = 0;
-
-  const unsigned int quarterPoints = num_points / 4;
-    
-  const float* inputVectorPtr = (const float*)inputVector;
-  int8_t* outputVectorPtr = outputVector;
-  __m128 vScalar = _mm_set_ps1(scalar);
-  __m128 ret;
-
-  float outputFloatBuffer[4] __attribute__((aligned(128)));
-
-  for(;number < quarterPoints; number++){
-    ret = _mm_load_ps(inputVectorPtr);
-    inputVectorPtr += 4;
-
-    ret = _mm_mul_ps(ret, vScalar);
-
-    _mm_store_ps(outputFloatBuffer, ret);
-    *outputVectorPtr++ = (int8_t)(outputFloatBuffer[0]);
-    *outputVectorPtr++ = (int8_t)(outputFloatBuffer[1]);
-    *outputVectorPtr++ = (int8_t)(outputFloatBuffer[2]);
-    *outputVectorPtr++ = (int8_t)(outputFloatBuffer[3]);
-  }
-
-  number = quarterPoints * 4;    
-  for(; number < num_points; number++){
-    outputVector[number] = (int8_t)(inputVector[number] * scalar);
-  }
-}
-#endif /* LV_HAVE_SSE */
-
-#ifdef LV_HAVE_GENERIC
-  /*!
-    \brief Multiplies each point in the input buffer by the scalar value, then converts the result into a 8 bit integer value
-    \param inputVector The floating point input data buffer
-    \param outputVector The 8 bit output data buffer
-    \param scalar The value multiplied against each point in the input buffer
-    \param num_points The number of data values to be converted
-  */
-static inline void volk_32f_s32f_convert_8s_a16_generic(int8_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){
-  int8_t* outputVectorPtr = outputVector;
-  const float* inputVectorPtr = inputVector;
-  unsigned int number = 0;
-
-  for(number = 0; number < num_points; number++){
-    *outputVectorPtr++ = ((int8_t)(*inputVectorPtr++  * scalar));
-  }
-}
-#endif /* LV_HAVE_GENERIC */
-
-
-
-
-#endif /* INCLUDED_volk_32f_s32f_convert_8s_a16_H */
diff --git a/volk/include/volk/volk_32f_s32f_convert_8s_ua16.h b/volk/include/volk/volk_32f_s32f_convert_8s_ua16.h
deleted file mode 100644
index af1652b19..000000000
--- a/volk/include/volk/volk_32f_s32f_convert_8s_ua16.h
+++ /dev/null
@@ -1,120 +0,0 @@
-#ifndef INCLUDED_volk_32f_s32f_convert_8s_ua16_H
-#define INCLUDED_volk_32f_s32f_convert_8s_ua16_H
-
-#include <inttypes.h>
-#include <stdio.h>
-
-#if LV_HAVE_SSE2
-#include <emmintrin.h>
-  /*!
-    \brief Multiplies each point in the input buffer by the scalar value, then converts the result into a 8 bit integer value
-    \param inputVector The floating point input data buffer
-    \param outputVector The 8 bit output data buffer
-    \param scalar The value multiplied against each point in the input buffer
-    \param num_points The number of data values to be converted
-    \note Input buffer does NOT need to be properly aligned
-  */
-static inline void volk_32f_s32f_convert_8s_ua16_sse2(int8_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){
-  unsigned int number = 0;
-
-  const unsigned int sixteenthPoints = num_points / 16;
-    
-  const float* inputVectorPtr = (const float*)inputVector;
-  int8_t* outputVectorPtr = outputVector;
-  __m128 vScalar = _mm_set_ps1(scalar);
-  __m128 inputVal1, inputVal2, inputVal3, inputVal4;
-  __m128i intInputVal1, intInputVal2, intInputVal3, intInputVal4;
-
-  for(;number < sixteenthPoints; number++){
-    inputVal1 = _mm_loadu_ps(inputVectorPtr); inputVectorPtr += 4;
-    inputVal2 = _mm_loadu_ps(inputVectorPtr); inputVectorPtr += 4;
-    inputVal3 = _mm_loadu_ps(inputVectorPtr); inputVectorPtr += 4;
-    inputVal4 = _mm_loadu_ps(inputVectorPtr); inputVectorPtr += 4;
-
-    intInputVal1 = _mm_cvtps_epi32(_mm_mul_ps(inputVal1, vScalar));
-    intInputVal2 = _mm_cvtps_epi32(_mm_mul_ps(inputVal2, vScalar));
-    intInputVal3 = _mm_cvtps_epi32(_mm_mul_ps(inputVal3, vScalar));
-    intInputVal4 = _mm_cvtps_epi32(_mm_mul_ps(inputVal4, vScalar));
-    
-    intInputVal1 = _mm_packs_epi32(intInputVal1, intInputVal2);
-    intInputVal3 = _mm_packs_epi32(intInputVal3, intInputVal4);
-
-    intInputVal1 = _mm_packs_epi16(intInputVal1, intInputVal3);
-
-    _mm_storeu_si128((__m128i*)outputVectorPtr, intInputVal1);
-    outputVectorPtr += 16;
-  }
-
-  number = sixteenthPoints * 16;    
-  for(; number < num_points; number++){
-    outputVector[number] = (int8_t)(inputVector[number] * scalar);
-  }
-}
-#endif /* LV_HAVE_SSE2 */
-
-#if LV_HAVE_SSE
-#include <xmmintrin.h>
-  /*!
-    \brief Multiplies each point in the input buffer by the scalar value, then converts the result into a 8 bit integer value
-    \param inputVector The floating point input data buffer
-    \param outputVector The 8 bit output data buffer
-    \param scalar The value multiplied against each point in the input buffer
-    \param num_points The number of data values to be converted
-    \note Input buffer does NOT need to be properly aligned
-  */
-static inline void volk_32f_s32f_convert_8s_ua16_sse(int8_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){
-  unsigned int number = 0;
-
-  const unsigned int quarterPoints = num_points / 4;
-    
-  const float* inputVectorPtr = (const float*)inputVector;
-  int8_t* outputVectorPtr = outputVector;
-  __m128 vScalar = _mm_set_ps1(scalar);
-  __m128 ret;
-
-  float outputFloatBuffer[4] __attribute__((aligned(128)));
-
-  for(;number < quarterPoints; number++){
-    ret = _mm_loadu_ps(inputVectorPtr);
-    inputVectorPtr += 4;
-
-    ret = _mm_mul_ps(ret, vScalar);
-
-    _mm_store_ps(outputFloatBuffer, ret);
-    *outputVectorPtr++ = (int8_t)(outputFloatBuffer[0]);
-    *outputVectorPtr++ = (int8_t)(outputFloatBuffer[1]);
-    *outputVectorPtr++ = (int8_t)(outputFloatBuffer[2]);
-    *outputVectorPtr++ = (int8_t)(outputFloatBuffer[3]);
-  }
-
-  number = quarterPoints * 4;    
-  for(; number < num_points; number++){
-    outputVector[number] = (int8_t)(inputVector[number] * scalar);
-  }
-}
-#endif /* LV_HAVE_SSE */
-
-#ifdef LV_HAVE_GENERIC
-  /*!
-    \brief Multiplies each point in the input buffer by the scalar value, then converts the result into a 8 bit integer value
-    \param inputVector The floating point input data buffer
-    \param outputVector The 8 bit output data buffer
-    \param scalar The value multiplied against each point in the input buffer
-    \param num_points The number of data values to be converted
-    \note Input buffer does NOT need to be properly aligned
-  */
-static inline void volk_32f_s32f_convert_8s_ua16_generic(int8_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){
-  int8_t* outputVectorPtr = outputVector;
-  const float* inputVectorPtr = inputVector;
-  unsigned int number = 0;
-
-  for(number = 0; number < num_points; number++){
-    *outputVectorPtr++ = ((int8_t)(*inputVectorPtr++  * scalar));
-  }
-}
-#endif /* LV_HAVE_GENERIC */
-
-
-
-
-#endif /* INCLUDED_volk_32f_s32f_convert_8s_ua16_H */
diff --git a/volk/include/volk/volk_32f_stddev_and_mean_32f_32f_a16.h b/volk/include/volk/volk_32f_stddev_and_mean_32f_32f_a16.h
deleted file mode 100644
index 2ba809845..000000000
--- a/volk/include/volk/volk_32f_stddev_and_mean_32f_32f_a16.h
+++ /dev/null
@@ -1,169 +0,0 @@
-#ifndef INCLUDED_volk_32f_stddev_and_mean_32f_32f_a16_H
-#define INCLUDED_volk_32f_stddev_and_mean_32f_32f_a16_H
-
-#include <inttypes.h>
-#include <stdio.h>
-#include <math.h>
-
-#if LV_HAVE_SSE4_1
-#include <smmintrin.h>
-/*!
-  \brief Calculates the standard deviation and mean of the input buffer
-  \param stddev The calculated standard deviation
-  \param mean The mean of the input buffer
-  \param inputBuffer The buffer of points to calculate the std deviation for
-  \param num_points The number of values in input buffer to used in the stddev and mean calculations
-*/
-static inline void volk_32f_stddev_and_mean_32f_32f_a16_sse4_1(float* stddev, float* mean, const float* inputBuffer, unsigned int num_points){
-  float returnValue = 0;
-  float newMean = 0;
-  if(num_points > 0){
-    unsigned int number = 0;
-    const unsigned int sixteenthPoints = num_points / 16;
-
-    const float* aPtr = inputBuffer;
-    float meanBuffer[4] __attribute__((aligned(128)));
-    float squareBuffer[4] __attribute__((aligned(128)));
-
-    __m128 accumulator = _mm_setzero_ps();
-    __m128 squareAccumulator = _mm_setzero_ps();
-    __m128 aVal1, aVal2, aVal3, aVal4;
-    __m128 cVal1, cVal2, cVal3, cVal4;
-    for(;number < sixteenthPoints; number++) {
-      aVal1 = _mm_load_ps(aPtr); aPtr += 4;   
-      cVal1 = _mm_dp_ps(aVal1, aVal1, 0xF1);
-      accumulator = _mm_add_ps(accumulator, aVal1);  // accumulator += x
-
-      aVal2 = _mm_load_ps(aPtr); aPtr += 4;
-      cVal2 = _mm_dp_ps(aVal2, aVal2, 0xF2);
-      accumulator = _mm_add_ps(accumulator, aVal2);  // accumulator += x
-
-      aVal3 = _mm_load_ps(aPtr); aPtr += 4;
-      cVal3 = _mm_dp_ps(aVal3, aVal3, 0xF4);
-      accumulator = _mm_add_ps(accumulator, aVal3);  // accumulator += x
-
-      aVal4 = _mm_load_ps(aPtr); aPtr += 4;
-      cVal4 = _mm_dp_ps(aVal4, aVal4, 0xF8);
-      accumulator = _mm_add_ps(accumulator, aVal4);  // accumulator += x
-
-      cVal1 = _mm_or_ps(cVal1, cVal2);
-      cVal3 = _mm_or_ps(cVal3, cVal4);
-      cVal1 = _mm_or_ps(cVal1, cVal3);
-
-      squareAccumulator = _mm_add_ps(squareAccumulator, cVal1); // squareAccumulator += x^2
-    }
-    _mm_store_ps(meanBuffer,accumulator); // Store the results back into the C container
-    _mm_store_ps(squareBuffer,squareAccumulator); // Store the results back into the C container  
-    newMean = meanBuffer[0];
-    newMean += meanBuffer[1];
-    newMean += meanBuffer[2];
-    newMean += meanBuffer[3];
-    returnValue = squareBuffer[0];
-    returnValue += squareBuffer[1];
-    returnValue += squareBuffer[2];
-    returnValue += squareBuffer[3];
-  
-    number = sixteenthPoints * 16;
-    for(;number < num_points; number++){
-      returnValue += (*aPtr) * (*aPtr);
-      newMean += *aPtr++;
-    }
-    newMean /= num_points;
-    returnValue /= num_points;
-    returnValue -= (newMean * newMean);
-    returnValue = sqrt(returnValue);
-  }
-  *stddev = returnValue;
-  *mean = newMean;
-}
-#endif /* LV_HAVE_SSE4_1 */
-
-#if LV_HAVE_SSE
-#include <xmmintrin.h>
-/*!
-  \brief Calculates the standard deviation and mean of the input buffer
-  \param stddev The calculated standard deviation
-  \param mean The mean of the input buffer
-  \param inputBuffer The buffer of points to calculate the std deviation for
-  \param num_points The number of values in input buffer to used in the stddev and mean calculations
-*/
-static inline void volk_32f_stddev_and_mean_32f_32f_a16_sse(float* stddev, float* mean, const float* inputBuffer, unsigned int num_points){
-  float returnValue = 0;
-  float newMean = 0;
-  if(num_points > 0){
-    unsigned int number = 0;
-    const unsigned int quarterPoints = num_points / 4;
-
-    const float* aPtr = inputBuffer;
-    float meanBuffer[4] __attribute__((aligned(128)));
-    float squareBuffer[4] __attribute__((aligned(128)));
-
-    __m128 accumulator = _mm_setzero_ps();
-    __m128 squareAccumulator = _mm_setzero_ps();
-    __m128 aVal = _mm_setzero_ps();
-    for(;number < quarterPoints; number++) {
-      aVal = _mm_load_ps(aPtr);                     // aVal = x
-      accumulator = _mm_add_ps(accumulator, aVal);  // accumulator += x
-      aVal = _mm_mul_ps(aVal, aVal);                // squareAccumulator += x^2
-      squareAccumulator = _mm_add_ps(squareAccumulator, aVal);
-      aPtr += 4;
-    }
-    _mm_store_ps(meanBuffer,accumulator); // Store the results back into the C container
-    _mm_store_ps(squareBuffer,squareAccumulator); // Store the results back into the C container  
-    newMean = meanBuffer[0];
-    newMean += meanBuffer[1];
-    newMean += meanBuffer[2];
-    newMean += meanBuffer[3];
-    returnValue = squareBuffer[0];
-    returnValue += squareBuffer[1];
-    returnValue += squareBuffer[2];
-    returnValue += squareBuffer[3];
-  
-    number = quarterPoints * 4;
-    for(;number < num_points; number++){
-      returnValue += (*aPtr) * (*aPtr);
-      newMean += *aPtr++;
-    }
-    newMean /= num_points;
-    returnValue /= num_points;
-    returnValue -= (newMean * newMean);
-    returnValue = sqrt(returnValue);
-  }
-  *stddev = returnValue;
-  *mean = newMean;
-}
-#endif /* LV_HAVE_SSE */
-
-#if LV_HAVE_GENERIC
-/*!
-  \brief Calculates the standard deviation and mean of the input buffer
-  \param stddev The calculated standard deviation
-  \param mean The mean of the input buffer
-  \param inputBuffer The buffer of points to calculate the std deviation for
-  \param num_points The number of values in input buffer to used in the stddev and mean calculations
-*/
-static inline void volk_32f_stddev_and_mean_32f_32f_a16_generic(float* stddev, float* mean, const float* inputBuffer, unsigned int num_points){
-  float returnValue = 0;
-  float newMean = 0;
-  if(num_points > 0){
-    const float* aPtr = inputBuffer;
-    unsigned int number = 0;
-    
-    for(number = 0; number < num_points; number++){
-      returnValue += (*aPtr) * (*aPtr);
-      newMean += *aPtr++;
-    }
-    newMean /= num_points;
-    returnValue /= num_points;
-    returnValue -= (newMean * newMean);
-    returnValue = sqrt(returnValue);
-  }
-  *stddev = returnValue;
-  *mean = newMean;
-}
-#endif /* LV_HAVE_GENERIC */
-
-
-
-
-#endif /* INCLUDED_volk_32f_stddev_and_mean_32f_32f_a16_H */
diff --git a/volk/include/volk/volk_32f_stddev_and_mean_32f_x2_a16.h b/volk/include/volk/volk_32f_stddev_and_mean_32f_x2_a16.h
new file mode 100644
index 000000000..278089841
--- /dev/null
+++ b/volk/include/volk/volk_32f_stddev_and_mean_32f_x2_a16.h
@@ -0,0 +1,169 @@
+#ifndef INCLUDED_volk_32f_stddev_and_mean_32f_x2_a16_H
+#define INCLUDED_volk_32f_stddev_and_mean_32f_x2_a16_H
+
+#include <inttypes.h>
+#include <stdio.h>
+#include <math.h>
+
+#if LV_HAVE_SSE4_1
+#include <smmintrin.h>
+/*!
+  \brief Calculates the standard deviation and mean of the input buffer
+  \param stddev The calculated standard deviation (0 when num_points is 0)
+  \param mean The mean of the input buffer (0 when num_points is 0)
+  \param inputBuffer The buffer of points to calculate the std deviation for; read with aligned loads, so it must be 16-byte aligned
+  \param num_points The number of values in input buffer to be used in the stddev and mean calculations
+*/
+static inline void volk_32f_stddev_and_mean_32f_x2_a16_sse4_1(float* stddev, float* mean, const float* inputBuffer, unsigned int num_points){
+  float returnValue = 0;
+  float newMean = 0;
+  if(num_points > 0){
+    unsigned int number = 0;
+    const unsigned int sixteenthPoints = num_points / 16;
+
+    const float* aPtr = inputBuffer;
+    float meanBuffer[4] __attribute__((aligned(128)));
+    float squareBuffer[4] __attribute__((aligned(128)));
+
+    __m128 accumulator = _mm_setzero_ps();
+    __m128 squareAccumulator = _mm_setzero_ps();
+    __m128 aVal1, aVal2, aVal3, aVal4;
+    __m128 cVal1, cVal2, cVal3, cVal4;
+    for(;number < sixteenthPoints; number++) {
+      aVal1 = _mm_load_ps(aPtr); aPtr += 4;
+      cVal1 = _mm_dp_ps(aVal1, aVal1, 0xF1);         // sum of the four squares, written to lane 0 only
+      accumulator = _mm_add_ps(accumulator, aVal1);  // accumulator += x
+
+      aVal2 = _mm_load_ps(aPtr); aPtr += 4;
+      cVal2 = _mm_dp_ps(aVal2, aVal2, 0xF2);         // sum of the four squares, written to lane 1 only
+      accumulator = _mm_add_ps(accumulator, aVal2);  // accumulator += x
+
+      aVal3 = _mm_load_ps(aPtr); aPtr += 4;
+      cVal3 = _mm_dp_ps(aVal3, aVal3, 0xF4);         // sum of the four squares, written to lane 2 only
+      accumulator = _mm_add_ps(accumulator, aVal3);  // accumulator += x
+
+      aVal4 = _mm_load_ps(aPtr); aPtr += 4;
+      cVal4 = _mm_dp_ps(aVal4, aVal4, 0xF8);         // sum of the four squares, written to lane 3 only
+      accumulator = _mm_add_ps(accumulator, aVal4);  // accumulator += x
+
+      cVal1 = _mm_or_ps(cVal1, cVal2);               // the partial sums occupy disjoint lanes, so OR merges them
+      cVal3 = _mm_or_ps(cVal3, cVal4);
+      cVal1 = _mm_or_ps(cVal1, cVal3);
+
+      squareAccumulator = _mm_add_ps(squareAccumulator, cVal1); // squareAccumulator += x^2
+    }
+    _mm_store_ps(meanBuffer,accumulator); // Store the results back into the C container
+    _mm_store_ps(squareBuffer,squareAccumulator); // Store the results back into the C container
+    newMean = meanBuffer[0];
+    newMean += meanBuffer[1];
+    newMean += meanBuffer[2];
+    newMean += meanBuffer[3];
+    returnValue = squareBuffer[0];
+    returnValue += squareBuffer[1];
+    returnValue += squareBuffer[2];
+    returnValue += squareBuffer[3];
+
+    number = sixteenthPoints * 16;
+    for(;number < num_points; number++){             // scalar tail for the points the 16-wide loop did not cover
+      returnValue += (*aPtr) * (*aPtr);
+      newMean += *aPtr++;
+    }
+    newMean /= num_points;
+    returnValue /= num_points;
+    returnValue -= (newMean * newMean);              // variance = E[x^2] - mean^2
+    returnValue = sqrt(returnValue < 0.0f ? 0.0f : returnValue); // rounding can leave the variance slightly negative; clamp so sqrt does not return NaN
+  }
+  *stddev = returnValue;
+  *mean = newMean;
+}
+#endif /* LV_HAVE_SSE4_1 */
+
+#if LV_HAVE_SSE
+#include <xmmintrin.h>
+/*!
+  \brief Calculates the standard deviation and mean of the input buffer
+  \param stddev The calculated standard deviation (0 when num_points is 0)
+  \param mean The mean of the input buffer (0 when num_points is 0)
+  \param inputBuffer The buffer of points to calculate the std deviation for; read with aligned loads, so it must be 16-byte aligned
+  \param num_points The number of values in input buffer to be used in the stddev and mean calculations
+*/
+static inline void volk_32f_stddev_and_mean_32f_x2_a16_sse(float* stddev, float* mean, const float* inputBuffer, unsigned int num_points){
+  float returnValue = 0;
+  float newMean = 0;
+  if(num_points > 0){
+    unsigned int number = 0;
+    const unsigned int quarterPoints = num_points / 4;
+
+    const float* aPtr = inputBuffer;
+    float meanBuffer[4] __attribute__((aligned(128)));
+    float squareBuffer[4] __attribute__((aligned(128)));
+
+    __m128 accumulator = _mm_setzero_ps();
+    __m128 squareAccumulator = _mm_setzero_ps();
+    __m128 aVal = _mm_setzero_ps();
+    for(;number < quarterPoints; number++) {
+      aVal = _mm_load_ps(aPtr);                     // aVal = x
+      accumulator = _mm_add_ps(accumulator, aVal);  // accumulator += x
+      aVal = _mm_mul_ps(aVal, aVal);                // aVal = x^2
+      squareAccumulator = _mm_add_ps(squareAccumulator, aVal); // squareAccumulator += x^2
+      aPtr += 4;
+    }
+    _mm_store_ps(meanBuffer,accumulator); // Store the results back into the C container
+    _mm_store_ps(squareBuffer,squareAccumulator); // Store the results back into the C container
+    newMean = meanBuffer[0];
+    newMean += meanBuffer[1];
+    newMean += meanBuffer[2];
+    newMean += meanBuffer[3];
+    returnValue = squareBuffer[0];
+    returnValue += squareBuffer[1];
+    returnValue += squareBuffer[2];
+    returnValue += squareBuffer[3];
+
+    number = quarterPoints * 4;
+    for(;number < num_points; number++){            // scalar tail for the points the 4-wide loop did not cover
+      returnValue += (*aPtr) * (*aPtr);
+      newMean += *aPtr++;
+    }
+    newMean /= num_points;
+    returnValue /= num_points;
+    returnValue -= (newMean * newMean);             // variance = E[x^2] - mean^2
+    returnValue = sqrt(returnValue < 0.0f ? 0.0f : returnValue); // rounding can leave the variance slightly negative; clamp so sqrt does not return NaN
+  }
+  *stddev = returnValue;
+  *mean = newMean;
+}
+#endif /* LV_HAVE_SSE */
+
+#if LV_HAVE_GENERIC
+/*!
+  \brief Calculates the standard deviation and mean of the input buffer
+  \param stddev The calculated standard deviation (0 when num_points is 0)
+  \param mean The mean of the input buffer (0 when num_points is 0)
+  \param inputBuffer The buffer of points to calculate the std deviation for
+  \param num_points The number of values in input buffer to be used in the stddev and mean calculations
+*/
+static inline void volk_32f_stddev_and_mean_32f_x2_a16_generic(float* stddev, float* mean, const float* inputBuffer, unsigned int num_points){
+  float returnValue = 0;
+  float newMean = 0;
+  if(num_points > 0){
+    const float* aPtr = inputBuffer;
+    unsigned int number = 0;
+
+    for(number = 0; number < num_points; number++){
+      returnValue += (*aPtr) * (*aPtr);   // running sum of x^2
+      newMean += *aPtr++;                 // running sum of x
+    }
+    newMean /= num_points;
+    returnValue /= num_points;
+    returnValue -= (newMean * newMean);   // variance = E[x^2] - mean^2
+    returnValue = sqrt(returnValue < 0.0f ? 0.0f : returnValue); // rounding can leave the variance slightly negative; clamp so sqrt does not return NaN
+  }
+  *stddev = returnValue;
+  *mean = newMean;
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+
+
+#endif /* INCLUDED_volk_32f_stddev_and_mean_32f_x2_a16_H */
diff --git a/volk/include/volk/volk_32f_x2_add_32f_a16.h b/volk/include/volk/volk_32f_x2_add_32f_a16.h
new file mode 100644
index 000000000..d0d0e0a0e
--- /dev/null
+++ b/volk/include/volk/volk_32f_x2_add_32f_a16.h
@@ -0,0 +1,81 @@
+#ifndef INCLUDED_volk_32f_x2_add_32f_a16_H
+#define INCLUDED_volk_32f_x2_add_32f_a16_H
+
+#include <inttypes.h>
+#include <stdio.h>
+
+#if LV_HAVE_SSE
+#include <xmmintrin.h>
+/*!
+  \brief Adds the two input vectors and store their results in the third vector
+  \param cVector The vector where the results will be stored
+  \param aVector One of the vectors to be added
+  \param bVector One of the vectors to be added
+  \param num_points The number of values in aVector and bVector to be added together and stored into cVector
+*/
+static inline void volk_32f_x2_add_32f_a16_sse(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){
+    unsigned int number = 0;
+    const unsigned int quarterPoints = num_points / 4;
+
+    float* cPtr = cVector;
+    const float* aPtr = aVector;
+    const float* bPtr=  bVector;
+
+    __m128 aVal, bVal, cVal;
+    for(;number < quarterPoints; number++){
+      
+      aVal = _mm_load_ps(aPtr); 
+      bVal = _mm_load_ps(bPtr);
+      
+      cVal = _mm_add_ps(aVal, bVal); 
+      
+      _mm_store_ps(cPtr,cVal); // Store the results back into the C container
+
+      aPtr += 4;
+      bPtr += 4;
+      cPtr += 4;
+    }
+
+    number = quarterPoints * 4;
+    for(;number < num_points; number++){
+      *cPtr++ = (*aPtr++) + (*bPtr++);
+    }
+}
+#endif /* LV_HAVE_SSE */
+
+#if LV_HAVE_GENERIC
+/*!
+  \brief Adds the two input vectors and store their results in the third vector
+  \param cVector The vector where the results will be stored
+  \param aVector One of the vectors to be added
+  \param bVector One of the vectors to be added
+  \param num_points The number of values in aVector and bVector to be added together and stored into cVector
+*/
+static inline void volk_32f_x2_add_32f_a16_generic(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){
+    float* cPtr = cVector;
+    const float* aPtr = aVector;
+    const float* bPtr=  bVector;
+    unsigned int number = 0;
+
+    for(number = 0; number < num_points; number++){
+      *cPtr++ = (*aPtr++) + (*bPtr++);
+    }
+}
+#endif /* LV_HAVE_GENERIC */
+
+#if LV_HAVE_ORC
+/*!
+  \brief Adds the two input vectors and store their results in the third vector
+  \param cVector The vector where the results will be stored
+  \param aVector One of the vectors to be added
+  \param bVector One of the vectors to be added
+  \param num_points The number of values in aVector and bVector to be added together and stored into cVector
+*/
+extern void volk_32f_x2_add_32f_a16_orc_impl(float* cVector, const float* aVector, const float* bVector, unsigned int num_points);
+static inline void volk_32f_x2_add_32f_a16_orc(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){
+    volk_32f_x2_add_32f_a16_orc_impl(cVector, aVector, bVector, num_points);
+}
+#endif /* LV_HAVE_ORC */
+
+
+#endif /* INCLUDED_volk_32f_x2_add_32f_a16_H */
diff --git a/volk/include/volk/volk_32f_x2_divide_32f_a16.h b/volk/include/volk/volk_32f_x2_divide_32f_a16.h
new file mode 100644
index 000000000..d844e25b0
--- /dev/null
+++ b/volk/include/volk/volk_32f_x2_divide_32f_a16.h
@@ -0,0 +1,82 @@
+#ifndef INCLUDED_volk_32f_x2_divide_32f_a16_H
+#define INCLUDED_volk_32f_x2_divide_32f_a16_H
+
+#include <inttypes.h>
+#include <stdio.h>
+
+#if LV_HAVE_SSE
+#include <xmmintrin.h>
+/*!
+  \brief Divides the two input vectors and store their results in the third vector
+  \param cVector The vector where the results will be stored
+  \param aVector The vector to be divided
+  \param bVector The divisor vector
+  \param num_points The number of values in aVector and bVector to be divided and stored into cVector
+*/
+static inline void volk_32f_x2_divide_32f_a16_sse(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){
+    unsigned int number = 0;
+    const unsigned int quarterPoints = num_points / 4;
+
+    float* cPtr = cVector;
+    const float* aPtr = aVector;
+    const float* bPtr=  bVector;
+
+    __m128 aVal, bVal, cVal;
+    for(;number < quarterPoints; number++){
+      
+      aVal = _mm_load_ps(aPtr); 
+      bVal = _mm_load_ps(bPtr);
+      
+      cVal = _mm_div_ps(aVal, bVal); 
+      
+      _mm_store_ps(cPtr,cVal); // Store the results back into the C container
+
+      aPtr += 4;
+      bPtr += 4;
+      cPtr += 4;
+    }
+
+    number = quarterPoints * 4;
+    for(;number < num_points; number++){
+      *cPtr++ = (*aPtr++) / (*bPtr++);
+    }
+}
+#endif /* LV_HAVE_SSE */
+
+#if LV_HAVE_GENERIC
+/*!
+  \brief Divides the two input vectors and store their results in the third vector
+  \param cVector The vector where the results will be stored
+  \param aVector The vector to be divided
+  \param bVector The divisor vector
+  \param num_points The number of values in aVector and bVector to be divided and stored into cVector
+*/
+static inline void volk_32f_x2_divide_32f_a16_generic(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){
+    float* cPtr = cVector;
+    const float* aPtr = aVector;
+    const float* bPtr=  bVector;
+    unsigned int number = 0;
+
+    for(number = 0; number < num_points; number++){
+      *cPtr++ = (*aPtr++) / (*bPtr++);
+    }
+}
+#endif /* LV_HAVE_GENERIC */
+
+#if LV_HAVE_ORC
+/*!
+  \brief Divides the two input vectors and store their results in the third vector
+  \param cVector The vector where the results will be stored
+  \param aVector The vector to be divided
+  \param bVector The divisor vector
+  \param num_points The number of values in aVector and bVector to be divided and stored into cVector
+*/
+extern void volk_32f_x2_divide_32f_a16_orc_impl(float* cVector, const float* aVector, const float* bVector, unsigned int num_points);
+static inline void volk_32f_x2_divide_32f_a16_orc(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){
+    volk_32f_x2_divide_32f_a16_orc_impl(cVector, aVector, bVector, num_points);
+}
+#endif /* LV_HAVE_ORC */
+
+
+
+#endif /* INCLUDED_volk_32f_x2_divide_32f_a16_H */
diff --git a/volk/include/volk/volk_32f_x2_dot_prod_32f_a16.h b/volk/include/volk/volk_32f_x2_dot_prod_32f_a16.h
new file mode 100644
index 000000000..61aa56815
--- /dev/null
+++ b/volk/include/volk/volk_32f_x2_dot_prod_32f_a16.h
@@ -0,0 +1,184 @@
+#ifndef INCLUDED_volk_32f_x2_dot_prod_32f_a16_H
+#define INCLUDED_volk_32f_x2_dot_prod_32f_a16_H
+
+#include<stdio.h>
+
+
+#if LV_HAVE_GENERIC
+
+
+static inline void volk_32f_x2_dot_prod_32f_a16_generic(float * result, const float * input, const float * taps, unsigned int num_points) {
+
+  float dotProduct = 0;
+  const float* aPtr = input;
+  const float* bPtr=  taps;
+  unsigned int number = 0;
+
+  for(number = 0; number < num_points; number++){
+    dotProduct += ((*aPtr++) * (*bPtr++));
+  }
+  
+  *result = dotProduct;
+}
+
+#endif /*LV_HAVE_GENERIC*/
+
+
+#if LV_HAVE_SSE
+#include <xmmintrin.h>
+/*! \brief Writes the dot product of input and taps over num_points floats to *result; both buffers are read with aligned loads and must be 16-byte aligned */
+static inline void volk_32f_x2_dot_prod_32f_a16_sse( float* result, const  float* input, const  float* taps, unsigned int num_points) {
+
+  unsigned int number = 0;
+  const unsigned int quarterPoints = num_points / 4;
+
+  float dotProduct = 0;
+  const float* aPtr = input;
+  const float* bPtr = taps;
+
+  __m128 aVal, bVal, cVal;
+
+  __m128 dotProdVal = _mm_setzero_ps();
+
+  for(;number < quarterPoints; number++){
+
+    aVal = _mm_load_ps(aPtr);
+    bVal = _mm_load_ps(bPtr);
+
+    cVal = _mm_mul_ps(aVal, bVal);
+
+    dotProdVal = _mm_add_ps(cVal, dotProdVal); // four independent per-lane partial sums
+
+    aPtr += 4;
+    bPtr += 4;
+  }
+
+  float dotProductVector[4] __attribute__((aligned(16)));
+
+  _mm_store_ps(dotProductVector,dotProdVal); // Store the results back into the dot product vector
+
+  dotProduct = dotProductVector[0];
+  dotProduct += dotProductVector[1];
+  dotProduct += dotProductVector[2];
+  dotProduct += dotProductVector[3];
+
+  number = quarterPoints * 4;
+  for(;number < num_points; number++){ // scalar tail for the points the 4-wide loop did not cover
+    dotProduct += ((*aPtr++) * (*bPtr++));
+  }
+
+  *result = dotProduct;
+
+}
+
+#endif /*LV_HAVE_SSE*/
+
+#if LV_HAVE_SSE3
+
+#include <pmmintrin.h>
+
+static inline void volk_32f_x2_dot_prod_32f_a16_sse3(float * result, const float * input, const float * taps, unsigned int num_points) {
+  unsigned int number = 0;
+  const unsigned int quarterPoints = num_points / 4;
+
+  float dotProduct = 0;
+  const float* aPtr = input;
+  const float* bPtr = taps;
+
+  __m128 aVal, bVal, cVal;
+
+  __m128 dotProdVal = _mm_setzero_ps();
+
+  for(;number < quarterPoints; number++){
+      
+    aVal = _mm_load_ps(aPtr); 
+    bVal = _mm_load_ps(bPtr);
+      
+    cVal = _mm_mul_ps(aVal, bVal); 
+
+    dotProdVal = _mm_hadd_ps(dotProdVal, cVal);
+
+    aPtr += 4;
+    bPtr += 4;
+  }
+
+  float dotProductVector[4] __attribute__((aligned(16)));
+  dotProdVal = _mm_hadd_ps(dotProdVal, dotProdVal);
+
+  _mm_store_ps(dotProductVector,dotProdVal); // Store the results back into the dot product vector
+
+  dotProduct = dotProductVector[0];
+  dotProduct += dotProductVector[1];
+
+  number = quarterPoints * 4;
+  for(;number < num_points; number++){
+    dotProduct += ((*aPtr++) * (*bPtr++));
+  }
+
+  *result = dotProduct;
+}  
+
+#endif /*LV_HAVE_SSE3*/
+
+#if LV_HAVE_SSE4_1
+
+#include <smmintrin.h>
+
+static inline void volk_32f_x2_dot_prod_32f_a16_sse4_1(float * result, const float * input, const float* taps, unsigned int num_points) {
+  unsigned int number = 0;
+  const unsigned int sixteenthPoints = num_points / 16;
+
+  float dotProduct = 0;
+  const float* aPtr = input;
+  const float* bPtr = taps;
+
+  __m128 aVal1, bVal1, cVal1;
+  __m128 aVal2, bVal2, cVal2;
+  __m128 aVal3, bVal3, cVal3;
+  __m128 aVal4, bVal4, cVal4;
+
+  __m128 dotProdVal = _mm_setzero_ps();
+
+  for(;number < sixteenthPoints; number++){      
+
+    aVal1 = _mm_load_ps(aPtr); aPtr += 4;
+    aVal2 = _mm_load_ps(aPtr); aPtr += 4;
+    aVal3 = _mm_load_ps(aPtr); aPtr += 4;
+    aVal4 = _mm_load_ps(aPtr); aPtr += 4;
+
+    bVal1 = _mm_load_ps(bPtr); bPtr += 4;
+    bVal2 = _mm_load_ps(bPtr); bPtr += 4;
+    bVal3 = _mm_load_ps(bPtr); bPtr += 4;
+    bVal4 = _mm_load_ps(bPtr); bPtr += 4;
+    
+    cVal1 = _mm_dp_ps(aVal1, bVal1, 0xF1);
+    cVal2 = _mm_dp_ps(aVal2, bVal2, 0xF2);
+    cVal3 = _mm_dp_ps(aVal3, bVal3, 0xF4);
+    cVal4 = _mm_dp_ps(aVal4, bVal4, 0xF8);
+
+    cVal1 = _mm_or_ps(cVal1, cVal2);
+    cVal3 = _mm_or_ps(cVal3, cVal4);
+    cVal1 = _mm_or_ps(cVal1, cVal3);
+
+    dotProdVal = _mm_add_ps(dotProdVal, cVal1);
+  }
+
+  float dotProductVector[4] __attribute__((aligned(16)));
+  _mm_store_ps(dotProductVector, dotProdVal); // Store the results back into the dot product vector
+
+  dotProduct = dotProductVector[0];
+  dotProduct += dotProductVector[1];
+  dotProduct += dotProductVector[2];
+  dotProduct += dotProductVector[3];
+
+  number = sixteenthPoints * 16;
+  for(;number < num_points; number++){
+    dotProduct += ((*aPtr++) * (*bPtr++));
+  }
+
+  *result = dotProduct;
+}  
+
+#endif /*LV_HAVE_SSE4_1*/
+
+#endif /*INCLUDED_volk_32f_x2_dot_prod_32f_a16_H*/
diff --git a/volk/include/volk/volk_32f_x2_dot_prod_32f_u.h b/volk/include/volk/volk_32f_x2_dot_prod_32f_u.h
new file mode 100644
index 000000000..8469a3cea
--- /dev/null
+++ b/volk/include/volk/volk_32f_x2_dot_prod_32f_u.h
@@ -0,0 +1,184 @@
+#ifndef INCLUDED_volk_32f_x2_dot_prod_32f_u_H
+#define INCLUDED_volk_32f_x2_dot_prod_32f_u_H
+
+#include<stdio.h>
+
+
+#if LV_HAVE_GENERIC
+
+
+static inline void volk_32f_x2_dot_prod_32f_u_generic(float * result, const float * input, const float * taps, unsigned int num_points) {
+
+  float dotProduct = 0;
+  const float* aPtr = input;
+  const float* bPtr=  taps;
+  unsigned int number = 0;
+
+  for(number = 0; number < num_points; number++){
+    dotProduct += ((*aPtr++) * (*bPtr++));
+  }
+  
+  *result = dotProduct;
+}
+
+#endif /*LV_HAVE_GENERIC*/
+
+
+#if LV_HAVE_SSE
+#include <xmmintrin.h>
+/*! \brief Writes the dot product of input and taps over num_points floats to *result; both buffers are read with unaligned loads, so no alignment is required */
+static inline void volk_32f_x2_dot_prod_32f_u_sse( float* result, const  float* input, const  float* taps, unsigned int num_points) {
+
+  unsigned int number = 0;
+  const unsigned int quarterPoints = num_points / 4;
+
+  float dotProduct = 0;
+  const float* aPtr = input;
+  const float* bPtr = taps;
+
+  __m128 aVal, bVal, cVal;
+
+  __m128 dotProdVal = _mm_setzero_ps();
+
+  for(;number < quarterPoints; number++){
+
+    aVal = _mm_loadu_ps(aPtr);
+    bVal = _mm_loadu_ps(bPtr);
+
+    cVal = _mm_mul_ps(aVal, bVal);
+
+    dotProdVal = _mm_add_ps(cVal, dotProdVal); // four independent per-lane partial sums
+
+    aPtr += 4;
+    bPtr += 4;
+  }
+
+  float dotProductVector[4] __attribute__((aligned(16)));
+
+  _mm_store_ps(dotProductVector,dotProdVal); // Store the results back into the dot product vector
+
+  dotProduct = dotProductVector[0];
+  dotProduct += dotProductVector[1];
+  dotProduct += dotProductVector[2];
+  dotProduct += dotProductVector[3];
+
+  number = quarterPoints * 4;
+  for(;number < num_points; number++){ // scalar tail for the points the 4-wide loop did not cover
+    dotProduct += ((*aPtr++) * (*bPtr++));
+  }
+
+  *result = dotProduct;
+
+}
+
+#endif /*LV_HAVE_SSE*/
+
+#if LV_HAVE_SSE3
+
+#include <pmmintrin.h>
+
+static inline void volk_32f_x2_dot_prod_32f_u_sse3(float * result, const float * input, const float * taps, unsigned int num_points) {
+  unsigned int number = 0;
+  const unsigned int quarterPoints = num_points / 4;
+
+  float dotProduct = 0;
+  const float* aPtr = input;
+  const float* bPtr = taps;
+
+  __m128 aVal, bVal, cVal;
+
+  __m128 dotProdVal = _mm_setzero_ps();
+
+  for(;number < quarterPoints; number++){
+      
+    aVal = _mm_loadu_ps(aPtr); 
+    bVal = _mm_loadu_ps(bPtr);
+      
+    cVal = _mm_mul_ps(aVal, bVal); 
+
+    dotProdVal = _mm_hadd_ps(dotProdVal, cVal);
+
+    aPtr += 4;
+    bPtr += 4;
+  }
+
+  float dotProductVector[4] __attribute__((aligned(16)));
+  dotProdVal = _mm_hadd_ps(dotProdVal, dotProdVal);
+
+  _mm_store_ps(dotProductVector,dotProdVal); // Store the results back into the dot product vector
+
+  dotProduct = dotProductVector[0];
+  dotProduct += dotProductVector[1];
+
+  number = quarterPoints * 4;
+  for(;number < num_points; number++){
+    dotProduct += ((*aPtr++) * (*bPtr++));
+  }
+
+  *result = dotProduct;
+}  
+
+#endif /*LV_HAVE_SSE3*/
+
+#if LV_HAVE_SSE4_1
+
+#include <smmintrin.h>
+
+static inline void volk_32f_x2_dot_prod_32f_u_sse4_1(float * result, const float * input, const float* taps, unsigned int num_points) {
+  unsigned int number = 0;
+  const unsigned int sixteenthPoints = num_points / 16;
+
+  float dotProduct = 0;
+  const float* aPtr = input;
+  const float* bPtr = taps;
+
+  __m128 aVal1, bVal1, cVal1;
+  __m128 aVal2, bVal2, cVal2;
+  __m128 aVal3, bVal3, cVal3;
+  __m128 aVal4, bVal4, cVal4;
+
+  __m128 dotProdVal = _mm_setzero_ps();
+
+  for(;number < sixteenthPoints; number++){
+      
+    aVal1 = _mm_loadu_ps(aPtr); aPtr += 4;
+    aVal2 = _mm_loadu_ps(aPtr); aPtr += 4;
+    aVal3 = _mm_loadu_ps(aPtr); aPtr += 4;
+    aVal4 = _mm_loadu_ps(aPtr); aPtr += 4;
+
+    bVal1 = _mm_loadu_ps(bPtr); bPtr += 4;
+    bVal2 = _mm_loadu_ps(bPtr); bPtr += 4;
+    bVal3 = _mm_loadu_ps(bPtr); bPtr += 4;
+    bVal4 = _mm_loadu_ps(bPtr); bPtr += 4;
+    
+    cVal1 = _mm_dp_ps(aVal1, bVal1, 0xF1);
+    cVal2 = _mm_dp_ps(aVal2, bVal2, 0xF2);
+    cVal3 = _mm_dp_ps(aVal3, bVal3, 0xF4);
+    cVal4 = _mm_dp_ps(aVal4, bVal4, 0xF8);
+
+    cVal1 = _mm_or_ps(cVal1, cVal2);
+    cVal3 = _mm_or_ps(cVal3, cVal4);
+    cVal1 = _mm_or_ps(cVal1, cVal3);
+
+    dotProdVal = _mm_add_ps(dotProdVal, cVal1);
+  }
+
+  float dotProductVector[4] __attribute__((aligned(16)));
+  _mm_store_ps(dotProductVector, dotProdVal); // Store the results back into the dot product vector
+
+  dotProduct = dotProductVector[0];
+  dotProduct += dotProductVector[1];
+  dotProduct += dotProductVector[2];
+  dotProduct += dotProductVector[3];
+
+  number = sixteenthPoints * 16;
+  for(;number < num_points; number++){
+    dotProduct += ((*aPtr++) * (*bPtr++));
+  }
+
+  *result = dotProduct;
+}  
+
+#endif /*LV_HAVE_SSE4_1*/
+
+#endif /*INCLUDED_volk_32f_x2_dot_prod_32f_u_H*/
diff --git a/volk/include/volk/volk_32f_x2_interleave_32fc_a16.h b/volk/include/volk/volk_32f_x2_interleave_32fc_a16.h
new file mode 100644
index 000000000..29c9392df
--- /dev/null
+++ b/volk/include/volk/volk_32f_x2_interleave_32fc_a16.h
@@ -0,0 +1,75 @@
+#ifndef INCLUDED_volk_32f_x2_interleave_32fc_a16_H
+#define INCLUDED_volk_32f_x2_interleave_32fc_a16_H
+
+#include <inttypes.h>
+#include <stdio.h>
+
+#if LV_HAVE_SSE
+#include <xmmintrin.h>
+/*!
+  \brief Interleaves the I & Q vector data into the complex vector
+  \param iBuffer The I buffer data to be interleaved
+  \param qBuffer The Q buffer data to be interleaved
+  \param complexVector The complex output vector
+  \param num_points The number of complex data values to be interleaved
+*/
+static inline void volk_32f_x2_interleave_32fc_a16_sse(lv_32fc_t* complexVector, const float* iBuffer, const float* qBuffer, unsigned int num_points){
+  unsigned int number = 0;
+  float* complexVectorPtr = (float*)complexVector;
+  const float* iBufferPtr = iBuffer;
+  const float* qBufferPtr = qBuffer;
+
+  const uint64_t quarterPoints = num_points / 4;
+    
+  __m128 iValue, qValue, cplxValue;
+  for(;number < quarterPoints; number++){
+    iValue = _mm_load_ps(iBufferPtr);
+    qValue = _mm_load_ps(qBufferPtr);
+
+    // Interleaves the lower two values in the i and q variables into one buffer
+    cplxValue = _mm_unpacklo_ps(iValue, qValue);
+    _mm_store_ps(complexVectorPtr, cplxValue);
+    complexVectorPtr += 4;
+
+    // Interleaves the upper two values in the i and q variables into one buffer
+    cplxValue = _mm_unpackhi_ps(iValue, qValue);
+    _mm_store_ps(complexVectorPtr, cplxValue);
+    complexVectorPtr += 4;
+
+    iBufferPtr += 4;
+    qBufferPtr += 4;
+  }
+
+  number = quarterPoints * 4;
+  for(; number < num_points; number++){
+    *complexVectorPtr++ = *iBufferPtr++;
+    *complexVectorPtr++ = *qBufferPtr++;
+  }
+}
+#endif /* LV_HAVE_SSE */
+
+#if LV_HAVE_GENERIC
+/*!
+  \brief Interleaves the I & Q vector data into the complex vector.
+  \param iBuffer The I buffer data to be interleaved
+  \param qBuffer The Q buffer data to be interleaved
+  \param complexVector The complex output vector
+  \param num_points The number of complex data values to be interleaved
+*/
+static inline void volk_32f_x2_interleave_32fc_a16_generic(lv_32fc_t* complexVector, const float* iBuffer, const float* qBuffer, unsigned int num_points){
+  float* complexVectorPtr = (float*)complexVector;
+  const float* iBufferPtr = iBuffer;
+  const float* qBufferPtr = qBuffer;
+  unsigned int number;
+
+  for(number = 0; number < num_points; number++){
+    *complexVectorPtr++ = *iBufferPtr++;
+    *complexVectorPtr++ = *qBufferPtr++;
+  }
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+
+
+#endif /* INCLUDED_volk_32f_x2_interleave_32fc_a16_H */
diff --git a/volk/include/volk/volk_32f_x2_max_32f_a16.h b/volk/include/volk/volk_32f_x2_max_32f_a16.h
new file mode 100644
index 000000000..26e7f1246
--- /dev/null
+++ b/volk/include/volk/volk_32f_x2_max_32f_a16.h
@@ -0,0 +1,85 @@
+#ifndef INCLUDED_volk_32f_x2_max_32f_a16_H
+#define INCLUDED_volk_32f_x2_max_32f_a16_H
+
+#include <inttypes.h>
+#include <stdio.h>
+
+#if LV_HAVE_SSE
+#include <xmmintrin.h>
+/*!
+  \brief Element-wise maximum of two float vectors using SSE: cVector[i] = max(aVector[i], bVector[i])
+  \param cVector Output vector that receives num_points results
+  \param aVector First input vector
+  \param bVector Second input vector
+  \param num_points Number of floats to compare and write
+  \note All three buffers are accessed with _mm_load_ps/_mm_store_ps, which require 16-byte aligned addresses
+*/
+static inline void volk_32f_x2_max_32f_a16_sse(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){
+    // Elements [0, vecEnd) are handled four at a time; the 0..3 elements
+    // left over are finished by the scalar loop at the bottom.
+    const unsigned int vecEnd = num_points & ~3u;
+    unsigned int idx = 0;
+
+    while(idx < vecEnd){
+      const __m128 lhs = _mm_load_ps(aVector + idx);
+      const __m128 rhs = _mm_load_ps(bVector + idx);
+
+      // Four lane-wise maxima at once, written straight to the output
+      _mm_store_ps(cVector + idx, _mm_max_ps(lhs, rhs));
+
+      idx += 4;
+    }
+
+    // Scalar tail: idx == vecEnd here, so at most three iterations run
+    for(; idx < num_points; idx++){
+      const float lhs = aVector[idx];
+      const float rhs = bVector[idx];
+
+      if(lhs > rhs){
+        cVector[idx] = lhs;
+      }
+      else{
+        cVector[idx] = rhs;
+      }
+    }
+}
+#endif /* LV_HAVE_SSE */
+
+#if LV_HAVE_GENERIC
+/*!
+  \brief Portable scalar element-wise maximum: cVector[i] = (aVector[i] > bVector[i]) ? aVector[i] : bVector[i]
+  \param cVector Output vector of num_points floats
+  \param aVector First input vector of num_points floats
+  \param bVector Second input vector of num_points floats
+  \param num_points Number of floats to compare and write
+*/
+static inline void volk_32f_x2_max_32f_a16_generic(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){
+    unsigned int idx;
+
+    for(idx = 0; idx < num_points; idx++){
+      const float lhs = aVector[idx];
+      const float rhs = bVector[idx];
+
+      // When the comparison is false (ties included) the bVector element is written
+      if(lhs > rhs) cVector[idx] = lhs;
+      else          cVector[idx] = rhs;
+    }
+}
+#endif /* LV_HAVE_GENERIC */
+
+#if LV_HAVE_ORC
+/*!
+  \brief ORC entry point for the element-wise maximum kernel; forwards every argument unchanged to volk_32f_x2_max_32f_a16_orc_impl, which is only declared (extern) in this header
+  \param cVector The vector where the results will be stored
+  \param aVector The first vector to be compared
+  \param bVector The second vector to be compared
+  \param num_points The number of values in aVector and bVector to be compared and stored into cVector
+*/
+extern void volk_32f_x2_max_32f_a16_orc_impl(float* cVector, const float* aVector, const float* bVector, unsigned int num_points);
+static inline void volk_32f_x2_max_32f_a16_orc(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){
+    volk_32f_x2_max_32f_a16_orc_impl(cVector, aVector, bVector, num_points);
+}
+#endif /* LV_HAVE_ORC */
+
+
+#endif /* INCLUDED_volk_32f_x2_max_32f_a16_H */
diff --git a/volk/include/volk/volk_32f_x2_min_32f_a16.h b/volk/include/volk/volk_32f_x2_min_32f_a16.h
new file mode 100644
index 000000000..23bae044c
--- /dev/null
+++ b/volk/include/volk/volk_32f_x2_min_32f_a16.h
@@ -0,0 +1,85 @@
+#ifndef INCLUDED_volk_32f_x2_min_32f_a16_H
+#define INCLUDED_volk_32f_x2_min_32f_a16_H
+
+#include <inttypes.h>
+#include <stdio.h>
+
+#if LV_HAVE_SSE
+#include <xmmintrin.h>
+/*!
+  \brief Element-wise minimum of two float vectors using SSE: cVector[i] = min(aVector[i], bVector[i])
+  \param cVector Output vector that receives num_points results
+  \param aVector First input vector
+  \param bVector Second input vector
+  \param num_points Number of floats to compare and write
+  \note All three buffers are accessed with _mm_load_ps/_mm_store_ps, which require 16-byte aligned addresses
+*/
+static inline void volk_32f_x2_min_32f_a16_sse(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){
+    // Elements [0, vecEnd) are handled four at a time; the 0..3 elements
+    // left over are finished by the scalar loop at the bottom.
+    const unsigned int vecEnd = num_points & ~3u;
+    unsigned int idx = 0;
+
+    while(idx < vecEnd){
+      const __m128 lhs = _mm_load_ps(aVector + idx);
+      const __m128 rhs = _mm_load_ps(bVector + idx);
+
+      // Four lane-wise minima at once, written straight to the output
+      _mm_store_ps(cVector + idx, _mm_min_ps(lhs, rhs));
+
+      idx += 4;
+    }
+
+    // Scalar tail: idx == vecEnd here, so at most three iterations run
+    for(; idx < num_points; idx++){
+      const float lhs = aVector[idx];
+      const float rhs = bVector[idx];
+
+      if(lhs < rhs){
+        cVector[idx] = lhs;
+      }
+      else{
+        cVector[idx] = rhs;
+      }
+    }
+}
+#endif /* LV_HAVE_SSE */
+
+#if LV_HAVE_GENERIC
+/*!
+  \brief Portable scalar element-wise minimum: cVector[i] = (aVector[i] < bVector[i]) ? aVector[i] : bVector[i]
+  \param cVector Output vector of num_points floats
+  \param aVector First input vector of num_points floats
+  \param bVector Second input vector of num_points floats
+  \param num_points Number of floats to compare and write
+*/
+static inline void volk_32f_x2_min_32f_a16_generic(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){
+    unsigned int idx;
+
+    for(idx = 0; idx < num_points; idx++){
+      const float lhs = aVector[idx];
+      const float rhs = bVector[idx];
+
+      // When the comparison is false (ties included) the bVector element is written
+      if(lhs < rhs) cVector[idx] = lhs;
+      else          cVector[idx] = rhs;
+    }
+}
+#endif /* LV_HAVE_GENERIC */
+
+#if LV_HAVE_ORC
+/*!
+  \brief ORC entry point for the element-wise minimum kernel; forwards every argument unchanged to volk_32f_x2_min_32f_a16_orc_impl, which is only declared (extern) in this header
+  \param cVector The vector where the results will be stored
+  \param aVector The first vector to be compared
+  \param bVector The second vector to be compared
+  \param num_points The number of values in aVector and bVector to be compared and stored into cVector
+*/
+extern void volk_32f_x2_min_32f_a16_orc_impl(float* cVector, const float* aVector, const float* bVector, unsigned int num_points);
+static inline void volk_32f_x2_min_32f_a16_orc(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){
+    volk_32f_x2_min_32f_a16_orc_impl(cVector, aVector, bVector, num_points);
+}
+#endif /* LV_HAVE_ORC */
+
+
+#endif /* INCLUDED_volk_32f_x2_min_32f_a16_H */
diff --git a/volk/include/volk/volk_32f_x2_multiply_32f_a16.h b/volk/include/volk/volk_32f_x2_multiply_32f_a16.h
new file mode 100644
index 000000000..a0dcfa86e
--- /dev/null
+++ b/volk/include/volk/volk_32f_x2_multiply_32f_a16.h
@@ -0,0 +1,81 @@
+#ifndef INCLUDED_volk_32f_x2_multiply_32f_a16_H
+#define INCLUDED_volk_32f_x2_multiply_32f_a16_H
+
+#include <inttypes.h>
+#include <stdio.h>
+
+#if LV_HAVE_SSE
+#include <xmmintrin.h>
+/*!
+  \brief Element-wise product of two float vectors using SSE:
+         cVector[i] = aVector[i] * bVector[i]
+  \param cVector Output vector that receives num_points products
+  \param aVector First input vector
+  \param bVector Second input vector
+  \param num_points Number of floats to multiply and write
+  \note All three buffers are accessed with _mm_load_ps/_mm_store_ps, which require 16-byte aligned addresses
+*/
+static inline void volk_32f_x2_multiply_32f_a16_sse(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){
+    // Elements [0, vecEnd) are handled four at a time; the 0..3 elements
+    // left over are finished by the scalar loop at the bottom.
+    const unsigned int vecEnd = num_points & ~3u;
+    unsigned int idx = 0;
+
+    while(idx < vecEnd){
+      const __m128 lhs = _mm_load_ps(aVector + idx);
+      const __m128 rhs = _mm_load_ps(bVector + idx);
+      const __m128 prod = _mm_mul_ps(lhs, rhs);
+
+      // Four products are written back with one aligned store
+      _mm_store_ps(cVector + idx, prod);
+
+      idx += 4;
+    }
+
+    // Scalar tail: idx == vecEnd here, so at most three iterations run,
+    // each producing one product.
+    for(; idx < num_points; idx++){
+      const float lhs = aVector[idx];
+      const float rhs = bVector[idx];
+
+      cVector[idx] = lhs * rhs;
+    }
+}
+#endif /* LV_HAVE_SSE */
+
+#if LV_HAVE_GENERIC
+/*!
+  \brief Portable scalar element-wise product: cVector[i] = aVector[i] * bVector[i]
+  \param cVector Output vector of num_points floats
+  \param aVector First input vector of num_points floats
+  \param bVector Second input vector of num_points floats
+  \param num_points Number of floats to multiply and write
+*/
+static inline void volk_32f_x2_multiply_32f_a16_generic(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){
+    unsigned int idx;
+
+    // Plain indexed loop; only ordinary scalar loads and stores are used
+    for(idx = 0; idx < num_points; idx++){
+      const float lhs = aVector[idx];
+      const float rhs = bVector[idx];
+      cVector[idx] = lhs * rhs;
+    }
+}
+#endif /* LV_HAVE_GENERIC */
+
+#if LV_HAVE_ORC
+/*!
+  \brief ORC entry point for the element-wise multiply kernel; forwards every argument unchanged to volk_32f_x2_multiply_32f_a16_orc_impl, which is only declared (extern) in this header
+  \param cVector The vector where the results will be stored
+  \param aVector One of the vectors to be multiplied
+  \param bVector One of the vectors to be multiplied
+  \param num_points The number of values in aVector and bVector to be multiplied together and stored into cVector
+*/
+extern void volk_32f_x2_multiply_32f_a16_orc_impl(float* cVector, const float* aVector, const float* bVector, unsigned int num_points);
+static inline void volk_32f_x2_multiply_32f_a16_orc(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){
+    volk_32f_x2_multiply_32f_a16_orc_impl(cVector, aVector, bVector, num_points);
+}
+#endif /* LV_HAVE_ORC */
+
+
+#endif /* INCLUDED_volk_32f_x2_multiply_32f_a16_H */
diff --git a/volk/include/volk/volk_32f_x2_s32f_interleave_16ic_a16.h b/volk/include/volk/volk_32f_x2_s32f_interleave_16ic_a16.h
new file mode 100644
index 000000000..30306774d
--- /dev/null
+++ b/volk/include/volk/volk_32f_x2_s32f_interleave_16ic_a16.h
@@ -0,0 +1,155 @@
+#ifndef INCLUDED_volk_32f_x2_s32f_interleave_16ic_a16_H
+#define INCLUDED_volk_32f_x2_s32f_interleave_16ic_a16_H
+
+#include <inttypes.h>
+#include <stdio.h>
+
+#if LV_HAVE_SSE2
+#include <emmintrin.h>
+  /*!
+    \brief Interleaves the I & Q vector data into the complex vector, scales each value by the scalar, and converts to 16 bit data by truncating toward zero.
+    \param complexVector The complex output vector; the main loop writes it with aligned 128-bit stores
+    \param iBuffer The I buffer data to be interleaved; read with aligned 128-bit loads
+    \param qBuffer The Q buffer data to be interleaved; read with aligned 128-bit loads
+    \param scalar The scaling value being multiplied against each data point
+    \param num_points The number of complex data values to be interleaved
+  */
+static inline void volk_32f_x2_s32f_interleave_16ic_a16_sse2(lv_16sc_t* complexVector, const float* iBuffer, const float* qBuffer, const float scalar, unsigned int num_points){
+    unsigned int number = 0;
+    const float* iBufferPtr = iBuffer;
+    const float* qBufferPtr = qBuffer;
+
+    __m128 vScalar = _mm_set_ps1(scalar);
+
+    const unsigned int quarterPoints = num_points / 4;
+
+    __m128 iValue, qValue, cplxValue1, cplxValue2;
+    __m128i intValue1, intValue2;
+
+    int16_t* complexVectorPtr = (int16_t*)complexVector;
+
+    for(;number < quarterPoints; number++){
+      iValue = _mm_load_ps(iBufferPtr);
+      qValue = _mm_load_ps(qBufferPtr);
+
+      // Interleaves the lower two values in the i and q variables into one buffer
+      cplxValue1 = _mm_unpacklo_ps(iValue, qValue);
+      cplxValue1 = _mm_mul_ps(cplxValue1, vScalar);
+
+      // Interleaves the upper two values in the i and q variables into one buffer
+      cplxValue2 = _mm_unpackhi_ps(iValue, qValue);
+      cplxValue2 = _mm_mul_ps(cplxValue2, vScalar);
+      // Truncate toward zero (cvtt form) so these lanes agree with the (int16_t) casts in the tail loop below
+      intValue1 = _mm_cvttps_epi32(cplxValue1);
+      intValue2 = _mm_cvttps_epi32(cplxValue2);
+      // Narrow the eight 32-bit integers to eight 16-bit integers with signed saturation
+      intValue1 = _mm_packs_epi32(intValue1, intValue2);
+
+      _mm_store_si128((__m128i*)complexVectorPtr, intValue1);
+      complexVectorPtr += 8;
+
+      iBufferPtr += 4;
+      qBufferPtr += 4;
+    }
+
+    number = quarterPoints * 4;
+    complexVectorPtr = (int16_t*)(&complexVector[number]);
+    for(; number < num_points; number++){
+      *complexVectorPtr++ = (int16_t)(*iBufferPtr++ * scalar);
+      *complexVectorPtr++ = (int16_t)(*qBufferPtr++ * scalar);
+    }
+
+}
+#endif /* LV_HAVE_SSE2 */
+
+#if LV_HAVE_SSE
+#include <xmmintrin.h>
+  /*!
+    \brief Interleaves the I & Q vector data into the complex vector, scales each value by the scalar, and converts to 16 bit data (SSE multiply, scalar conversion).
+    \param complexVector The complex output vector, written as consecutive int16_t I,Q pairs
+    \param iBuffer The I buffer data to be interleaved; read with aligned 128-bit loads
+    \param qBuffer The Q buffer data to be interleaved; read with aligned 128-bit loads
+    \param scalar The scaling value multiplied against each data point before conversion
+    \param num_points The number of complex data values to be interleaved
+    \note The float to int16_t conversion is a C cast in both the main loop and the tail loop
+  */
+static inline void volk_32f_x2_s32f_interleave_16ic_a16_sse(lv_16sc_t* complexVector, const float* iBuffer, const float* qBuffer, const float scalar, unsigned int num_points){
+    unsigned int number;
+    unsigned int lane;
+
+    const __m128 vScalar = _mm_set_ps1(scalar);
+    const unsigned int quarterPoints = num_points / 4;
+
+    // The output is addressed as a flat run of int16_t values
+    int16_t* out = (int16_t*)complexVector;
+
+    // Scaled vectors are spilled to this scratch array and then cast one
+    // element at a time.
+    // _mm_store_ps needs the array 16-byte aligned; the attribute requests 128.
+    float floatBuffer[4] __attribute__((aligned(128)));
+
+    // Main loop: four complex samples (eight int16_t outputs) per iteration
+    for(number = 0; number < quarterPoints; number++){
+      const __m128 iValue = _mm_load_ps(iBuffer + 4 * number);
+      const __m128 qValue = _mm_load_ps(qBuffer + 4 * number);
+      __m128 cplxValue;
+
+      // Lower two I/Q pairs (i0 q0 i1 q1), then scale
+      cplxValue = _mm_unpacklo_ps(iValue, qValue);
+      cplxValue = _mm_mul_ps(cplxValue, vScalar);
+      _mm_store_ps(floatBuffer, cplxValue);
+
+      // Convert the four scaled floats in order
+      for(lane = 0; lane < 4; lane++){
+        *out++ = (int16_t)(floatBuffer[lane]);
+      }
+
+      // Upper two I/Q pairs (i2 q2 i3 q3), then scale
+      cplxValue = _mm_unpackhi_ps(iValue, qValue);
+      cplxValue = _mm_mul_ps(cplxValue, vScalar);
+      _mm_store_ps(floatBuffer, cplxValue);
+
+      // Convert the four scaled floats in order
+      for(lane = 0; lane < 4; lane++){
+        *out++ = (int16_t)(floatBuffer[lane]);
+      }
+    }
+
+    // Finish the 0..3 complex samples that did not fill a group of four,
+    // restarting the output pointer at the first unwritten complex element.
+    number = quarterPoints * 4;
+    out = (int16_t*)(&complexVector[number]);
+    for(; number < num_points; number++){
+      *out++ = (int16_t)(iBuffer[number] * scalar);
+      *out++ = (int16_t)(qBuffer[number] * scalar);
+    }
+
+}
+#endif /* LV_HAVE_SSE */
+
+#if LV_HAVE_GENERIC
+  /*!
+    \brief Scalar reference kernel: scales I and Q float samples by the scalar, casts each to int16_t, and writes them interleaved.
+    \param complexVector The complex output vector, written as consecutive int16_t I,Q pairs
+    \param iBuffer The I buffer data to be interleaved
+    \param qBuffer The Q buffer data to be interleaved
+    \param scalar The scaling value multiplied against each data point before conversion
+    \param num_points The number of complex data values to be interleaved
+  */
+static inline void volk_32f_x2_s32f_interleave_16ic_a16_generic(lv_16sc_t* complexVector, const float* iBuffer, const float* qBuffer, const float scalar, unsigned int num_points){
+  int16_t* out = (int16_t*)complexVector;
+  unsigned int idx = 0;
+
+  while(idx < num_points){
+    // Scale in float first, then convert with a plain C cast
+    *out++ = (int16_t)(iBuffer[idx] * scalar);
+    *out++ = (int16_t)(qBuffer[idx] * scalar);
+    idx++;
+  }
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+
+
+#endif /* INCLUDED_volk_32f_x2_s32f_interleave_16ic_a16_H */
diff --git a/volk/include/volk/volk_32f_x2_subtract_32f_a16.h b/volk/include/volk/volk_32f_x2_subtract_32f_a16.h
new file mode 100644
index 000000000..7404bfe79
--- /dev/null
+++ b/volk/include/volk/volk_32f_x2_subtract_32f_a16.h
@@ -0,0 +1,81 @@
+#ifndef INCLUDED_volk_32f_x2_subtract_32f_a16_H
+#define INCLUDED_volk_32f_x2_subtract_32f_a16_H
+
+#include <inttypes.h>
+#include <stdio.h>
+
+#if LV_HAVE_SSE
+#include <xmmintrin.h>
+/*!
+  \brief Element-wise difference of two float vectors using SSE:
+         cVector[i] = aVector[i] - bVector[i]
+  \param cVector Output vector that receives num_points differences
+  \param aVector The initial vector (minuend)
+  \param bVector The vector to be subtracted from aVector
+  \param num_points Number of floats to subtract and write
+  \note All three buffers are accessed with _mm_load_ps/_mm_store_ps, which require 16-byte aligned addresses
+*/
+static inline void volk_32f_x2_subtract_32f_a16_sse(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){
+    // Elements [0, vecEnd) are handled four at a time; the 0..3 elements
+    // left over are finished by the scalar loop at the bottom.
+    const unsigned int vecEnd = num_points & ~3u;
+    unsigned int idx = 0;
+
+    while(idx < vecEnd){
+      const __m128 lhs = _mm_load_ps(aVector + idx);
+      const __m128 rhs = _mm_load_ps(bVector + idx);
+      const __m128 diff = _mm_sub_ps(lhs, rhs);
+
+      // Four differences are written back with one aligned store
+      _mm_store_ps(cVector + idx, diff);
+
+      idx += 4;
+    }
+
+    // Scalar tail: idx == vecEnd here, so at most three iterations run,
+    // each producing one difference.
+    for(; idx < num_points; idx++){
+      const float lhs = aVector[idx];
+      const float rhs = bVector[idx];
+
+      cVector[idx] = lhs - rhs;
+    }
+}
+#endif /* LV_HAVE_SSE */
+
+#if LV_HAVE_GENERIC
+/*!
+  \brief Portable scalar element-wise difference: cVector[i] = aVector[i] - bVector[i]
+  \param cVector Output vector of num_points floats
+  \param aVector The initial vector (minuend), num_points floats
+  \param bVector The vector to be subtracted from aVector, num_points floats
+  \param num_points Number of floats to subtract and write
+*/
+static inline void volk_32f_x2_subtract_32f_a16_generic(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){
+    unsigned int idx;
+
+    // Plain indexed loop; only ordinary scalar loads and stores are used
+    for(idx = 0; idx < num_points; idx++){
+      const float lhs = aVector[idx];
+      const float rhs = bVector[idx];
+      cVector[idx] = lhs - rhs;
+    }
+}
+#endif /* LV_HAVE_GENERIC */
+
+#if LV_HAVE_ORC
+/*!
+  \brief ORC entry point for the element-wise subtract kernel; forwards every argument unchanged to volk_32f_x2_subtract_32f_a16_orc_impl, which is only declared (extern) in this header
+  \param cVector The vector where the results will be stored
+  \param aVector The initial vector
+  \param bVector The vector to be subtracted from aVector
+  \param num_points The number of values in aVector and bVector to be subtracted and stored into cVector
+*/
+extern void volk_32f_x2_subtract_32f_a16_orc_impl(float* cVector, const float* aVector, const float* bVector, unsigned int num_points);
+static inline void volk_32f_x2_subtract_32f_a16_orc(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){
+    volk_32f_x2_subtract_32f_a16_orc_impl(cVector, aVector, bVector, num_points);
+}
+#endif /* LV_HAVE_ORC */
+
+
+#endif /* INCLUDED_volk_32f_x2_subtract_32f_a16_H */
diff --git a/volk/include/volk/volk_32f_x3_sum_of_poly_32f_a16.h b/volk/include/volk/volk_32f_x3_sum_of_poly_32f_a16.h
new file mode 100644
index 000000000..af9e39537
--- /dev/null
+++ b/volk/include/volk/volk_32f_x3_sum_of_poly_32f_a16.h
@@ -0,0 +1,151 @@
+#ifndef INCLUDED_volk_32f_x3_sum_of_poly_32f_a16_H
+#define INCLUDED_volk_32f_x3_sum_of_poly_32f_a16_H
+
+#include<inttypes.h>
+#include<stdio.h>
+#include<volk/volk_complex.h>
+
+#ifndef MAX
+#define MAX(X,Y) ((X) > (Y)?(X):(Y))
+#endif
+
+#if LV_HAVE_SSE3
+#include<xmmintrin.h>
+#include<pmmintrin.h>
+
+/*!
+  \brief Sums a clamped fourth-order polynomial over a float vector using SSE3.
+         target[0] = sum over i of (c[0]*x + c[1]*x^2 + c[2]*x^3 + c[3]*x^4) + N*c[4],
+         where x = max(src0[i], *cutoff), c = center_point_array and N = num_bytes / 4.
+  \param target Receives the single float result in target[0]
+  \param src0 Input samples; read with _mm_load_ps, which requires 16-byte alignment
+  \param center_point_array Five coefficients: [0..3] weight x, x^2, x^3, x^4; [4] is added once per sample
+  \param cutoff Pointer to the lower bound every sample is clamped to
+  \param num_bytes Size of src0 in bytes (four per float); bytes beyond the last whole float are ignored
+*/
+static inline void volk_32f_x3_sum_of_poly_32f_a16_sse3(float* target, float* src0, float* center_point_array, float* cutoff, unsigned int num_bytes) {
+  float result = 0.0;
+  float fst = 0.0;
+  float sq = 0.0;
+  float thrd = 0.0;
+  float frth = 0.0;
+
+  __m128 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10;
+
+  // xmm9 accumulates the x and x^2 terms, xmm1 the x^3 and x^4 terms
+  xmm9 = _mm_setzero_ps();
+  xmm1 = _mm_setzero_ps();
+
+  // Broadcast each coefficient and the cutoff to all four lanes
+  xmm0 = _mm_load1_ps(&center_point_array[0]);
+  xmm6 = _mm_load1_ps(&center_point_array[1]);
+  xmm7 = _mm_load1_ps(&center_point_array[2]);
+  xmm8 = _mm_load1_ps(&center_point_array[3]);
+  xmm10 = _mm_load1_ps(cutoff);
+
+  int bound = num_bytes >> 4;            // whole groups of four floats
+  int leftovers = (num_bytes >> 2) & 3;  // floats remaining after those groups
+  int i = 0;
+
+  for(; i < bound; ++i) {
+    xmm2 = _mm_load_ps(src0);
+    // Sample first, cutoff second: when the comparison is unordered (NaN) or both
+    // values are zero the instruction returns its second operand, which is what the
+    // scalar MAX(fst, *cutoff) in the tail loop below yields for the same inputs.
+    xmm2 = _mm_max_ps(xmm2, xmm10);
+    xmm3 = _mm_mul_ps(xmm2, xmm2);  // x^2
+    xmm4 = _mm_mul_ps(xmm2, xmm3);  // x^3
+    xmm5 = _mm_mul_ps(xmm3, xmm3);  // x^4
+
+    xmm2 = _mm_mul_ps(xmm2, xmm0);
+    xmm3 = _mm_mul_ps(xmm3, xmm6);
+    xmm4 = _mm_mul_ps(xmm4, xmm7);
+    xmm5 = _mm_mul_ps(xmm5, xmm8);
+
+    xmm2 = _mm_add_ps(xmm2, xmm3);
+    xmm3 = _mm_add_ps(xmm4, xmm5);
+
+    src0 += 4;
+    xmm9 = _mm_add_ps(xmm2, xmm9);
+    xmm1 = _mm_add_ps(xmm3, xmm1);
+  }
+
+  // Three horizontal adds fold all eight accumulator lanes into lane 0
+  xmm2 = _mm_hadd_ps(xmm9, xmm1);
+  xmm3 = _mm_hadd_ps(xmm2, xmm2);
+  xmm4 = _mm_hadd_ps(xmm3, xmm3);
+  _mm_store_ss(&result, xmm4);
+
+  // src0 now points past the vectorized part; finish the 0..3 remaining floats
+  for(i = 0; i < leftovers; ++i) {
+    fst = src0[i];
+    fst = MAX(fst, *cutoff);
+    sq = fst * fst;
+    thrd = fst * sq;
+    frth = sq * sq;
+
+    result += (center_point_array[0] * fst +
+               center_point_array[1] * sq +
+               center_point_array[2] * thrd +
+               center_point_array[3] * frth);
+  }
+
+  result += ((float)((bound * 4) + leftovers)) * center_point_array[4];
+
+  target[0] = result;
+}
+ 
+
+#endif /*LV_HAVE_SSE3*/
+
+#if LV_HAVE_GENERIC
+
+/*!
+  \brief Portable scalar kernel that sums a clamped fourth-order polynomial over a float vector.
+         *target = sum over i of (c[0]*x + c[1]*x^2 + c[2]*x^3 + c[3]*x^4) + N*c[4],
+         where x = MAX(src0[i], *cutoff), c = center_point_array
+         and N = num_bytes / 4 (the number of whole floats).
+  \param target Receives the single float result
+  \param src0 Input samples
+  \param center_point_array Five coefficients: [0..3] weight x, x^2, x^3, x^4; [4] is added once per sample
+  \param cutoff Pointer to the lower bound every sample is clamped to
+  \param num_bytes Size of src0 in bytes (four per float); bytes beyond the last whole float are ignored
+  \note Only terms up to the fourth power are evaluated
+*/
+static inline void volk_32f_x3_sum_of_poly_32f_a16_generic(float* target, float* src0, float* center_point_array, float* cutoff, unsigned int num_bytes) {
+  // num_bytes is a byte count; every float sample occupies four of them
+  const unsigned int num_floats = num_bytes >> 2;
+
+  float total = 0.0;
+  float x, x2, x3, x4;
+  unsigned int idx;
+
+  for(idx = 0; idx < num_floats; ++idx) {
+    // Clamp the sample from below before raising it to powers
+    x = src0[idx];
+    x = MAX(x, *cutoff);
+
+    // Higher powers are built from x and x^2
+    x2 = x * x;
+    x3 = x * x2;
+    x4 = x2 * x2;
+
+    // Polynomial without its constant term
+    total += (center_point_array[0] * x +
+              center_point_array[1] * x2 +
+              center_point_array[2] * x3 +
+              center_point_array[3] * x4);
+  }
+
+  // The constant term center_point_array[4] contributes once per
+  // processed sample, so it is added as a single product after the loop.
+  total += ((float)num_floats) * (center_point_array[4]);
+
+  // Single scalar result
+  *target = total;
+}
+
+#endif /*LV_HAVE_GENERIC*/
+
+
+#endif /*INCLUDED_volk_32f_x3_sum_of_poly_32f_a16_H*/
diff --git a/volk/include/volk/volk_32fc_32fc_conjugate_dot_prod_32fc_a16.h b/volk/include/volk/volk_32fc_32fc_conjugate_dot_prod_32fc_a16.h
deleted file mode 100644
index cd9cc8160..000000000
--- a/volk/include/volk/volk_32fc_32fc_conjugate_dot_prod_32fc_a16.h
+++ /dev/null
@@ -1,344 +0,0 @@
-#ifndef INCLUDED_volk_32fc_32fc_conjugate_dot_prod_32fc_a16_H
-#define INCLUDED_volk_32fc_32fc_conjugate_dot_prod_32fc_a16_H
-
-#include<volk/volk_complex.h>
-#include<stdio.h>
-
-
-#if LV_HAVE_GENERIC
-
-
-static inline void volk_32fc_32fc_conjugate_dot_prod_32fc_a16_generic(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_bytes) {
-  
-  float * res = (float*) result;
-  float * in = (float*) input;
-  float * tp = (float*) taps;
-  unsigned int n_2_ccomplex_blocks = num_bytes >> 4;
-  unsigned int isodd = (num_bytes >> 3) &1;
-  
-  
-  
-  float sum0[2] = {0,0};
-  float sum1[2] = {0,0};
-  int i = 0;
-
-  
-  for(i = 0; i < n_2_ccomplex_blocks; ++i) {
-    
-
-    sum0[0] += in[0] * tp[0] + in[1] * tp[1];
-    sum0[1] += (-in[0] * tp[1]) + in[1] * tp[0];
-    sum1[0] += in[2] * tp[2] + in[3] * tp[3];
-    sum1[1] += (-in[2] * tp[3]) + in[3] * tp[2];
-    
-    
-    in += 4;
-    tp += 4;
-
-  }
- 
-  
-  res[0] = sum0[0] + sum1[0];
-  res[1] = sum0[1] + sum1[1];
-  
-  
-  
-  for(i = 0; i < isodd; ++i) {
-
-
-    *result += input[(num_bytes >> 3) - 1] * lv_conj(taps[(num_bytes >> 3) - 1]);
-
-  }
-  /*
-  for(i = 0; i < num_bytes >> 3; ++i) {
-    *result += input[i] * conjf(taps[i]);
-  }
-  */
-}
-
-#endif /*LV_HAVE_GENERIC*/
-
-
-#if LV_HAVE_SSE && LV_HAVE_64
-
-
-static inline void volk_32fc_32fc_conjugate_dot_prod_32fc_a16_sse(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_bytes) {
-  
-  static const uint32_t conjugator[4] __attribute__((aligned(16)))= {0x00000000, 0x80000000, 0x00000000, 0x80000000};
-  
-
-
-
-  asm volatile 
-    (
-     "#  ccomplex_conjugate_dotprod_generic (float* result, const float *input,\n\t"
-     "#                         const float *taps, unsigned num_bytes)\n\t"
-     "#    float sum0 = 0;\n\t"
-     "#    float sum1 = 0;\n\t"
-     "#    float sum2 = 0;\n\t"
-     "#    float sum3 = 0;\n\t"
-     "#    do {\n\t"
-     "#      sum0 += input[0] * taps[0] - input[1] * taps[1];\n\t"
-     "#      sum1 += input[0] * taps[1] + input[1] * taps[0];\n\t"
-     "#      sum2 += input[2] * taps[2] - input[3] * taps[3];\n\t"
-     "#      sum3 += input[2] * taps[3] + input[3] * taps[2];\n\t"
-     "#      input += 4;\n\t"
-     "#      taps += 4;  \n\t"
-     "#    } while (--n_2_ccomplex_blocks != 0);\n\t"
-     "#    result[0] = sum0 + sum2;\n\t"
-     "#    result[1] = sum1 + sum3;\n\t"
-     "# TODO: prefetch and better scheduling\n\t"
-     "  xor    %%r9,  %%r9\n\t"
-     "  xor    %%r10, %%r10\n\t"
-     "  movq   %[conjugator], %%r9\n\t"
-     "  movq   %%rcx, %%rax\n\t"
-     "  movaps 0(%%r9), %%xmm8\n\t"
-     "  movq   %%rcx, %%r8\n\t"
-     "  movq   %[rsi],  %%r9\n\t"
-     "  movq   %[rdx], %%r10\n\t"
-     "	xorps	%%xmm6, %%xmm6		# zero accumulators\n\t"
-     "	movaps	0(%%r9), %%xmm0\n\t"
-     "	xorps	%%xmm7, %%xmm7		# zero accumulators\n\t"
-     "	movups	0(%%r10), %%xmm2\n\t"
-     "	shr	$5, %%rax		# rax = n_2_ccomplex_blocks / 2\n\t"
-     "  shr     $4, %%r8\n\t"
-     "  xorps  %%xmm8, %%xmm2\n\t"
-     "	jmp	.%=L1_test\n\t"
-     "	# 4 taps / loop\n\t"
-     "	# something like ?? cycles / loop\n\t"
-     ".%=Loop1:	\n\t"
-     "# complex prod: C += A * B,  w/ temp Z & Y (or B), xmmPN=$0x8000000080000000\n\t"
-     "#	movaps	(%%r9), %%xmmA\n\t"
-     "#	movaps	(%%r10), %%xmmB\n\t"
-     "#	movaps	%%xmmA, %%xmmZ\n\t"
-     "#	shufps	$0xb1, %%xmmZ, %%xmmZ	# swap internals\n\t"
-     "#	mulps	%%xmmB, %%xmmA\n\t"
-     "#	mulps	%%xmmZ, %%xmmB\n\t"
-     "#	# SSE replacement for: pfpnacc %%xmmB, %%xmmA\n\t"
-     "#	xorps	%%xmmPN, %%xmmA\n\t"
-     "#	movaps	%%xmmA, %%xmmZ\n\t"
-     "#	unpcklps %%xmmB, %%xmmA\n\t"
-     "#	unpckhps %%xmmB, %%xmmZ\n\t"
-     "#	movaps	%%xmmZ, %%xmmY\n\t"
-     "#	shufps	$0x44, %%xmmA, %%xmmZ	# b01000100\n\t"
-     "#	shufps	$0xee, %%xmmY, %%xmmA	# b11101110\n\t"
-     "#	addps	%%xmmZ, %%xmmA\n\t"
-     "#	addps	%%xmmA, %%xmmC\n\t"
-     "# A=xmm0, B=xmm2, Z=xmm4\n\t"
-     "# A'=xmm1, B'=xmm3, Z'=xmm5\n\t"
-     "	movaps	16(%%r9), %%xmm1\n\t"
-     "	movaps	%%xmm0, %%xmm4\n\t"
-     "	mulps	%%xmm2, %%xmm0\n\t"
-     "	shufps	$0xb1, %%xmm4, %%xmm4	# swap internals\n\t"
-     "	movaps	16(%%r10), %%xmm3\n\t"
-     "	movaps	%%xmm1, %%xmm5\n\t"
-     "  xorps   %%xmm8, %%xmm3\n\t"
-     "	addps	%%xmm0, %%xmm6\n\t"
-     "	mulps	%%xmm3, %%xmm1\n\t"
-     "	shufps	$0xb1, %%xmm5, %%xmm5	# swap internals\n\t"
-     "	addps	%%xmm1, %%xmm6\n\t"
-     "	mulps	%%xmm4, %%xmm2\n\t"
-     "	movaps	32(%%r9), %%xmm0\n\t"
-     "	addps	%%xmm2, %%xmm7\n\t"
-     "	mulps	%%xmm5, %%xmm3\n\t"
-     "	add	$32, %%r9\n\t"
-     "	movaps	32(%%r10), %%xmm2\n\t"
-     "	addps	%%xmm3, %%xmm7\n\t"
-     "	add	$32, %%r10\n\t"
-     "  xorps   %%xmm8, %%xmm2\n\t"
-     ".%=L1_test:\n\t"
-     "	dec	%%rax\n\t"
-     "	jge	.%=Loop1\n\t"
-     "	# We've handled the bulk of multiplies up to here.\n\t"
-     "	# Let's sse if original n_2_ccomplex_blocks was odd.\n\t"
-     "	# If so, we've got 2 more taps to do.\n\t"
-     "	and	$1, %%r8\n\t"
-     "	je	.%=Leven\n\t"
-     "	# The count was odd, do 2 more taps.\n\t"
-     "	# Note that we've already got mm0/mm2 preloaded\n\t"
-     "	# from the main loop.\n\t"
-     "	movaps	%%xmm0, %%xmm4\n\t"
-     "	mulps	%%xmm2, %%xmm0\n\t"
-     "	shufps	$0xb1, %%xmm4, %%xmm4	# swap internals\n\t"
-     "	addps	%%xmm0, %%xmm6\n\t"
-     "	mulps	%%xmm4, %%xmm2\n\t"
-     "	addps	%%xmm2, %%xmm7\n\t"
-     ".%=Leven:\n\t"
-     "	# neg inversor\n\t"
-     "	xorps	%%xmm1, %%xmm1\n\t"
-     "	mov	$0x80000000, %%r9\n\t"
-     "	movd	%%r9, %%xmm1\n\t"
-     "	shufps	$0x11, %%xmm1, %%xmm1	# b00010001 # 0 -0 0 -0\n\t"
-     "	# pfpnacc\n\t"
-     "	xorps	%%xmm1, %%xmm6\n\t"
-     "	movaps	%%xmm6, %%xmm2\n\t"
-     "	unpcklps %%xmm7, %%xmm6\n\t"
-     "	unpckhps %%xmm7, %%xmm2\n\t"
-     "	movaps	%%xmm2, %%xmm3\n\t"
-     "	shufps	$0x44, %%xmm6, %%xmm2	# b01000100\n\t"
-     "	shufps	$0xee, %%xmm3, %%xmm6	# b11101110\n\t"
-     "	addps	%%xmm2, %%xmm6\n\t"
-     "					# xmm6 = r1 i2 r3 i4\n\t"
-     "	movhlps	%%xmm6, %%xmm4		# xmm4 = r3 i4 ?? ??\n\t"
-     "	addps	%%xmm4, %%xmm6		# xmm6 = r1+r3 i2+i4 ?? ??\n\t"
-     "	movlps	%%xmm6, (%[rdi])		# store low 2x32 bits (complex) to memory\n\t"
-     :
-     :[rsi] "r" (input), [rdx] "r" (taps), "c" (num_bytes), [rdi] "r" (result), [conjugator] "r" (conjugator)
-     :"rax", "r8", "r9", "r10"
-     );
-  
-  
-  int getem = num_bytes % 16;
-  
-  
-  for(; getem > 0; getem -= 8) {
-  
-    
-    *result += (input[(num_bytes >> 3) - 1] * lv_conj(taps[(num_bytes >> 3) - 1]));
-  
-  }
-
-  return;
-}  
-#endif
-
-#if LV_HAVE_SSE && LV_HAVE_32
-static inline void volk_32fc_32fc_conjugate_dot_prod_32fc_a16_sse_32(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_bytes) {
-  
-  static const uint32_t conjugator[4] __attribute__((aligned(16)))= {0x00000000, 0x80000000, 0x00000000, 0x80000000};
-
-  int bound = num_bytes >> 4;
-  int leftovers = num_bytes % 16;
-
-  
-  asm volatile 
-    (
-     "	#pushl	%%ebp\n\t"
-     "	#movl	%%esp, %%ebp\n\t"
-     "	#movl	12(%%ebp), %%eax		# input\n\t"
-     "	#movl	16(%%ebp), %%edx		# taps\n\t"
-     "	#movl	20(%%ebp), %%ecx                # n_bytes\n\t"
-     "  movaps  0(%[conjugator]), %%xmm1\n\t"
-     "	xorps	%%xmm6, %%xmm6		# zero accumulators\n\t"
-     "	movaps	0(%[eax]), %%xmm0\n\t"
-     "	xorps	%%xmm7, %%xmm7		# zero accumulators\n\t"
-     "	movaps	0(%[edx]), %%xmm2\n\t"
-     "  movl    %[ecx], (%[out])\n\t"
-     "	shrl	$5, %[ecx]		# ecx = n_2_ccomplex_blocks / 2\n\t"
-     
-     "  xorps   %%xmm1, %%xmm2\n\t"
-     "	jmp	.%=L1_test\n\t"
-     "	# 4 taps / loop\n\t"
-     "	# something like ?? cycles / loop\n\t"
-     ".%=Loop1:	\n\t"
-     "# complex prod: C += A * B,  w/ temp Z & Y (or B), xmmPN=$0x8000000080000000\n\t"
-     "#	movaps	(%[eax]), %%xmmA\n\t"
-     "#	movaps	(%[edx]), %%xmmB\n\t"
-     "#	movaps	%%xmmA, %%xmmZ\n\t"
-     "#	shufps	$0xb1, %%xmmZ, %%xmmZ	# swap internals\n\t"
-     "#	mulps	%%xmmB, %%xmmA\n\t"
-     "#	mulps	%%xmmZ, %%xmmB\n\t"
-     "#	# SSE replacement for: pfpnacc %%xmmB, %%xmmA\n\t"
-     "#	xorps	%%xmmPN, %%xmmA\n\t"
-     "#	movaps	%%xmmA, %%xmmZ\n\t"
-     "#	unpcklps %%xmmB, %%xmmA\n\t"
-     "#	unpckhps %%xmmB, %%xmmZ\n\t"
-     "#	movaps	%%xmmZ, %%xmmY\n\t"
-     "#	shufps	$0x44, %%xmmA, %%xmmZ	# b01000100\n\t"
-     "#	shufps	$0xee, %%xmmY, %%xmmA	# b11101110\n\t"
-     "#	addps	%%xmmZ, %%xmmA\n\t"
-     "#	addps	%%xmmA, %%xmmC\n\t"
-     "# A=xmm0, B=xmm2, Z=xmm4\n\t"
-     "# A'=xmm1, B'=xmm3, Z'=xmm5\n\t"
-     "	movaps	16(%[edx]), %%xmm3\n\t"
-     "	movaps	%%xmm0, %%xmm4\n\t"
-     "  xorps   %%xmm1, %%xmm3\n\t"
-     "	mulps	%%xmm2, %%xmm0\n\t"
-     "	movaps	16(%[eax]), %%xmm1\n\t"
-     "	shufps	$0xb1, %%xmm4, %%xmm4	# swap internals\n\t"
-     "	movaps	%%xmm1, %%xmm5\n\t"
-     "	addps	%%xmm0, %%xmm6\n\t"
-     "	mulps	%%xmm3, %%xmm1\n\t"
-     "	shufps	$0xb1, %%xmm5, %%xmm5	# swap internals\n\t"
-     "	addps	%%xmm1, %%xmm6\n\t"
-     "  movaps  0(%[conjugator]), %%xmm1\n\t"
-     "	mulps	%%xmm4, %%xmm2\n\t"
-     "	movaps	32(%[eax]), %%xmm0\n\t"
-     "	addps	%%xmm2, %%xmm7\n\t"
-     "	mulps	%%xmm5, %%xmm3\n\t"
-     "	addl	$32, %[eax]\n\t"
-     "	movaps	32(%[edx]), %%xmm2\n\t"
-     "	addps	%%xmm3, %%xmm7\n\t"
-     "  xorps   %%xmm1, %%xmm2\n\t"
-     "	addl	$32, %[edx]\n\t"
-     ".%=L1_test:\n\t"
-     "	decl	%[ecx]\n\t"
-     "	jge	.%=Loop1\n\t"
-     "	# We've handled the bulk of multiplies up to here.\n\t"
-     "	# Let's sse if original n_2_ccomplex_blocks was odd.\n\t"
-     "	# If so, we've got 2 more taps to do.\n\t"
-     "	movl	0(%[out]), %[ecx]		# n_2_ccomplex_blocks\n\t"
-     "  shrl    $4, %[ecx]\n\t"
-     "	andl	$1, %[ecx]\n\t"
-     "	je	.%=Leven\n\t"
-     "	# The count was odd, do 2 more taps.\n\t"
-     "	# Note that we've already got mm0/mm2 preloaded\n\t"
-     "	# from the main loop.\n\t"
-     "	movaps	%%xmm0, %%xmm4\n\t"
-     "	mulps	%%xmm2, %%xmm0\n\t"
-     "	shufps	$0xb1, %%xmm4, %%xmm4	# swap internals\n\t"
-     "	addps	%%xmm0, %%xmm6\n\t"
-     "	mulps	%%xmm4, %%xmm2\n\t"
-     "	addps	%%xmm2, %%xmm7\n\t"
-     ".%=Leven:\n\t"
-     "	# neg inversor\n\t"
-     "  #movl 8(%%ebp), %[eax] \n\t"
-     "	xorps	%%xmm1, %%xmm1\n\t"
-     "  movl	$0x80000000, (%[out])\n\t"
-     "	movss	(%[out]), %%xmm1\n\t"
-     "	shufps	$0x11, %%xmm1, %%xmm1	# b00010001 # 0 -0 0 -0\n\t"
-     "	# pfpnacc\n\t"
-     "	xorps	%%xmm1, %%xmm6\n\t"
-     "	movaps	%%xmm6, %%xmm2\n\t"
-     "	unpcklps %%xmm7, %%xmm6\n\t"
-     "	unpckhps %%xmm7, %%xmm2\n\t"
-     "	movaps	%%xmm2, %%xmm3\n\t"
-     "	shufps	$0x44, %%xmm6, %%xmm2	# b01000100\n\t"
-     "	shufps	$0xee, %%xmm3, %%xmm6	# b11101110\n\t"
-     "	addps	%%xmm2, %%xmm6\n\t"
-     "					# xmm6 = r1 i2 r3 i4\n\t"
-     "	#movl	8(%%ebp), %[eax]		# @result\n\t"
-     "	movhlps	%%xmm6, %%xmm4		# xmm4 = r3 i4 ?? ??\n\t"
-     "	addps	%%xmm4, %%xmm6		# xmm6 = r1+r3 i2+i4 ?? ??\n\t"
-     "	movlps	%%xmm6, (%[out])		# store low 2x32 bits (complex) to memory\n\t"
-     "	#popl	%%ebp\n\t"
-     :
-     : [eax] "r" (input), [edx] "r" (taps), [ecx] "r" (num_bytes), [out] "r" (result), [conjugator] "r" (conjugator)
-     );
-
-  
-  
-  
-  printf("%d, %d\n", leftovers, bound);
-  
-  for(; leftovers > 0; leftovers -= 8) {
-    
-    
-    *result += (input[(bound << 1)] * lv_conj(taps[(bound << 1)]));
-    
-  }
-  
-  return;
-  
-  
-  
-
-  
-  
-}
-
-#endif /*LV_HAVE_SSE*/  
-
-
-
-#endif /*INCLUDED_volk_32fc_32fc_conjugate_dot_prod_32fc_a16_H*/
diff --git a/volk/include/volk/volk_32fc_32fc_dot_prod_32fc_a16.h b/volk/include/volk/volk_32fc_32fc_dot_prod_32fc_a16.h
deleted file mode 100644
index 2ccfcf2f2..000000000
--- a/volk/include/volk/volk_32fc_32fc_dot_prod_32fc_a16.h
+++ /dev/null
@@ -1,468 +0,0 @@
-#ifndef INCLUDED_volk_32fc_32fc_dot_prod_32fc_a16_H
-#define INCLUDED_volk_32fc_32fc_dot_prod_32fc_a16_H
-
-#include <volk/volk_complex.h>
-#include <stdio.h>
-#include <string.h>
-
-
-#if LV_HAVE_GENERIC 
-
-
-static inline void volk_32fc_32fc_dot_prod_32fc_a16_generic(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_bytes) {
-  
-  float * res = (float*) result;
-  float * in = (float*) input;
-  float * tp = (float*) taps;
-  unsigned int n_2_ccomplex_blocks = num_bytes >> 4;
-  unsigned int isodd = (num_bytes >> 3) &1;
-  
-  
-  
-  float sum0[2] = {0,0};
-  float sum1[2] = {0,0};
-  int i = 0;
-
-  
-  for(i = 0; i < n_2_ccomplex_blocks; ++i) {
-    
-
-    sum0[0] += in[0] * tp[0] - in[1] * tp[1];
-    sum0[1] += in[0] * tp[1] + in[1] * tp[0];
-    sum1[0] += in[2] * tp[2] - in[3] * tp[3];
-    sum1[1] += in[2] * tp[3] + in[3] * tp[2];
-    
-    
-    in += 4;
-    tp += 4;
-
-  }
-
-  
-  res[0] = sum0[0] + sum1[0];
-  res[1] = sum0[1] + sum1[1];
-  
-  
-  
-  for(i = 0; i < isodd; ++i) {
-
-
-    *result += input[(num_bytes >> 3) - 1] * taps[(num_bytes >> 3) - 1];
-
-  }
-
-}
-
-#endif /*LV_HAVE_GENERIC*/
-
-
-#if LV_HAVE_SSE && LV_HAVE_64
-
-
-static inline void volk_32fc_32fc_dot_prod_32fc_a16_sse_64(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_bytes) {
-  
-
-  asm 
-    (
-     "#  ccomplex_dotprod_generic (float* result, const float *input,\n\t"
-     "#                         const float *taps, unsigned num_bytes)\n\t"
-     "#    float sum0 = 0;\n\t"
-     "#    float sum1 = 0;\n\t"
-     "#    float sum2 = 0;\n\t"
-     "#    float sum3 = 0;\n\t"
-     "#    do {\n\t"
-     "#      sum0 += input[0] * taps[0] - input[1] * taps[1];\n\t"
-     "#      sum1 += input[0] * taps[1] + input[1] * taps[0];\n\t"
-     "#      sum2 += input[2] * taps[2] - input[3] * taps[3];\n\t"
-     "#      sum3 += input[2] * taps[3] + input[3] * taps[2];\n\t"
-     "#      input += 4;\n\t"
-     "#      taps += 4;  \n\t"
-     "#    } while (--n_2_ccomplex_blocks != 0);\n\t"
-     "#    result[0] = sum0 + sum2;\n\t"
-     "#    result[1] = sum1 + sum3;\n\t"
-     "# TODO: prefetch and better scheduling\n\t"
-     "  xor    %%r9,  %%r9\n\t"
-     "  xor    %%r10, %%r10\n\t"
-     "  movq   %%rcx, %%rax\n\t"
-     "  movq   %%rcx, %%r8\n\t"
-     "  movq   %[rsi],  %%r9\n\t"
-     "  movq   %[rdx], %%r10\n\t"
-     "	xorps	%%xmm6, %%xmm6		# zero accumulators\n\t"
-     "	movaps	0(%%r9), %%xmm0\n\t"
-     "	xorps	%%xmm7, %%xmm7		# zero accumulators\n\t"
-     "	movaps	0(%%r10), %%xmm2\n\t"
-     "	shr	$5, %%rax		# rax = n_2_ccomplex_blocks / 2\n\t"
-     "  shr     $4, %%r8\n\t"
-     "	jmp	.%=L1_test\n\t"
-     "	# 4 taps / loop\n\t"
-     "	# something like ?? cycles / loop\n\t"
-     ".%=Loop1:	\n\t"
-     "# complex prod: C += A * B,  w/ temp Z & Y (or B), xmmPN=$0x8000000080000000\n\t"
-     "#	movaps	(%%r9), %%xmmA\n\t"
-     "#	movaps	(%%r10), %%xmmB\n\t"
-     "#	movaps	%%xmmA, %%xmmZ\n\t"
-     "#	shufps	$0xb1, %%xmmZ, %%xmmZ	# swap internals\n\t"
-     "#	mulps	%%xmmB, %%xmmA\n\t"
-     "#	mulps	%%xmmZ, %%xmmB\n\t"
-     "#	# SSE replacement for: pfpnacc %%xmmB, %%xmmA\n\t"
-     "#	xorps	%%xmmPN, %%xmmA\n\t"
-     "#	movaps	%%xmmA, %%xmmZ\n\t"
-     "#	unpcklps %%xmmB, %%xmmA\n\t"
-     "#	unpckhps %%xmmB, %%xmmZ\n\t"
-     "#	movaps	%%xmmZ, %%xmmY\n\t"
-     "#	shufps	$0x44, %%xmmA, %%xmmZ	# b01000100\n\t"
-     "#	shufps	$0xee, %%xmmY, %%xmmA	# b11101110\n\t"
-     "#	addps	%%xmmZ, %%xmmA\n\t"
-     "#	addps	%%xmmA, %%xmmC\n\t"
-     "# A=xmm0, B=xmm2, Z=xmm4\n\t"
-     "# A'=xmm1, B'=xmm3, Z'=xmm5\n\t"
-     "	movaps	16(%%r9), %%xmm1\n\t"
-     "	movaps	%%xmm0, %%xmm4\n\t"
-     "	mulps	%%xmm2, %%xmm0\n\t"
-     "	shufps	$0xb1, %%xmm4, %%xmm4	# swap internals\n\t"
-     "	movaps	16(%%r10), %%xmm3\n\t"
-     "	movaps	%%xmm1, %%xmm5\n\t"
-     "	addps	%%xmm0, %%xmm6\n\t"
-     "	mulps	%%xmm3, %%xmm1\n\t"
-     "	shufps	$0xb1, %%xmm5, %%xmm5	# swap internals\n\t"
-     "	addps	%%xmm1, %%xmm6\n\t"
-     "	mulps	%%xmm4, %%xmm2\n\t"
-     "	movaps	32(%%r9), %%xmm0\n\t"
-     "	addps	%%xmm2, %%xmm7\n\t"
-     "	mulps	%%xmm5, %%xmm3\n\t"
-     "	add	$32, %%r9\n\t"
-     "	movaps	32(%%r10), %%xmm2\n\t"
-     "	addps	%%xmm3, %%xmm7\n\t"
-     "	add	$32, %%r10\n\t"
-     ".%=L1_test:\n\t"
-     "	dec	%%rax\n\t"
-     "	jge	.%=Loop1\n\t"
-     "	# We've handled the bulk of multiplies up to here.\n\t"
-     "	# Let's sse if original n_2_ccomplex_blocks was odd.\n\t"
-     "	# If so, we've got 2 more taps to do.\n\t"
-     "	and	$1, %%r8\n\t"
-     "	je	.%=Leven\n\t"
-     "	# The count was odd, do 2 more taps.\n\t"
-     "	# Note that we've already got mm0/mm2 preloaded\n\t"
-     "	# from the main loop.\n\t"
-     "	movaps	%%xmm0, %%xmm4\n\t"
-     "	mulps	%%xmm2, %%xmm0\n\t"
-     "	shufps	$0xb1, %%xmm4, %%xmm4	# swap internals\n\t"
-     "	addps	%%xmm0, %%xmm6\n\t"
-     "	mulps	%%xmm4, %%xmm2\n\t"
-     "	addps	%%xmm2, %%xmm7\n\t"
-     ".%=Leven:\n\t"
-     "	# neg inversor\n\t"
-     "	xorps	%%xmm1, %%xmm1\n\t"
-     "	mov	$0x80000000, %%r9\n\t"
-     "	movd	%%r9, %%xmm1\n\t"
-     "	shufps	$0x11, %%xmm1, %%xmm1	# b00010001 # 0 -0 0 -0\n\t"
-     "	# pfpnacc\n\t"
-     "	xorps	%%xmm1, %%xmm6\n\t"
-     "	movaps	%%xmm6, %%xmm2\n\t"
-     "	unpcklps %%xmm7, %%xmm6\n\t"
-     "	unpckhps %%xmm7, %%xmm2\n\t"
-     "	movaps	%%xmm2, %%xmm3\n\t"
-     "	shufps	$0x44, %%xmm6, %%xmm2	# b01000100\n\t"
-     "	shufps	$0xee, %%xmm3, %%xmm6	# b11101110\n\t"
-     "	addps	%%xmm2, %%xmm6\n\t"
-     "					# xmm6 = r1 i2 r3 i4\n\t"
-     "	movhlps	%%xmm6, %%xmm4		# xmm4 = r3 i4 ?? ??\n\t"
-     "	addps	%%xmm4, %%xmm6		# xmm6 = r1+r3 i2+i4 ?? ??\n\t"
-     "	movlps	%%xmm6, (%[rdi])		# store low 2x32 bits (complex) to memory\n\t"
-     :
-     :[rsi] "r" (input), [rdx] "r" (taps), "c" (num_bytes), [rdi] "r" (result)
-     :"rax", "r8", "r9", "r10"
-     );
-  
-  
-  int getem = num_bytes % 16;
-  
-  
-  for(; getem > 0; getem -= 8) {
-  
-    
-    *result += (input[(num_bytes >> 3) - 1] * taps[(num_bytes >> 3) - 1]);
-  
-  }
-
-  return;
-  
-}
-
-#endif
-
-#if LV_HAVE_SSE && LV_HAVE_32
-
-static inline void volk_32fc_32fc_dot_prod_32fc_a16_sse_32(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_bytes) {
-  
-  asm volatile 
-    (
-     "	#pushl	%%ebp\n\t"
-     "	#movl	%%esp, %%ebp\n\t"
-     "	movl	12(%%ebp), %%eax		# input\n\t"
-     "	movl	16(%%ebp), %%edx		# taps\n\t"
-     "	movl	20(%%ebp), %%ecx                # n_bytes\n\t"
-     "	xorps	%%xmm6, %%xmm6		# zero accumulators\n\t"
-     "	movaps	0(%%eax), %%xmm0\n\t"
-     "	xorps	%%xmm7, %%xmm7		# zero accumulators\n\t"
-     "	movaps	0(%%edx), %%xmm2\n\t"
-     "	shrl	$5, %%ecx		# ecx = n_2_ccomplex_blocks / 2\n\t"
-     "	jmp	.%=L1_test\n\t"
-     "	# 4 taps / loop\n\t"
-     "	# something like ?? cycles / loop\n\t"
-     ".%=Loop1:	\n\t"
-     "# complex prod: C += A * B,  w/ temp Z & Y (or B), xmmPN=$0x8000000080000000\n\t"
-     "#	movaps	(%%eax), %%xmmA\n\t"
-     "#	movaps	(%%edx), %%xmmB\n\t"
-     "#	movaps	%%xmmA, %%xmmZ\n\t"
-     "#	shufps	$0xb1, %%xmmZ, %%xmmZ	# swap internals\n\t"
-     "#	mulps	%%xmmB, %%xmmA\n\t"
-     "#	mulps	%%xmmZ, %%xmmB\n\t"
-     "#	# SSE replacement for: pfpnacc %%xmmB, %%xmmA\n\t"
-     "#	xorps	%%xmmPN, %%xmmA\n\t"
-     "#	movaps	%%xmmA, %%xmmZ\n\t"
-     "#	unpcklps %%xmmB, %%xmmA\n\t"
-     "#	unpckhps %%xmmB, %%xmmZ\n\t"
-     "#	movaps	%%xmmZ, %%xmmY\n\t"
-     "#	shufps	$0x44, %%xmmA, %%xmmZ	# b01000100\n\t"
-     "#	shufps	$0xee, %%xmmY, %%xmmA	# b11101110\n\t"
-     "#	addps	%%xmmZ, %%xmmA\n\t"
-     "#	addps	%%xmmA, %%xmmC\n\t"
-     "# A=xmm0, B=xmm2, Z=xmm4\n\t"
-     "# A'=xmm1, B'=xmm3, Z'=xmm5\n\t"
-     "	movaps	16(%%eax), %%xmm1\n\t"
-     "	movaps	%%xmm0, %%xmm4\n\t"
-     "	mulps	%%xmm2, %%xmm0\n\t"
-     "	shufps	$0xb1, %%xmm4, %%xmm4	# swap internals\n\t"
-     "	movaps	16(%%edx), %%xmm3\n\t"
-     "	movaps	%%xmm1, %%xmm5\n\t"
-     "	addps	%%xmm0, %%xmm6\n\t"
-     "	mulps	%%xmm3, %%xmm1\n\t"
-     "	shufps	$0xb1, %%xmm5, %%xmm5	# swap internals\n\t"
-     "	addps	%%xmm1, %%xmm6\n\t"
-     "	mulps	%%xmm4, %%xmm2\n\t"
-     "	movaps	32(%%eax), %%xmm0\n\t"
-     "	addps	%%xmm2, %%xmm7\n\t"
-     "	mulps	%%xmm5, %%xmm3\n\t"
-     "	addl	$32, %%eax\n\t"
-     "	movaps	32(%%edx), %%xmm2\n\t"
-     "	addps	%%xmm3, %%xmm7\n\t"
-     "	addl	$32, %%edx\n\t"
-     ".%=L1_test:\n\t"
-     "	decl	%%ecx\n\t"
-     "	jge	.%=Loop1\n\t"
-     "	# We've handled the bulk of multiplies up to here.\n\t"
-     "	# Let's sse if original n_2_ccomplex_blocks was odd.\n\t"
-     "	# If so, we've got 2 more taps to do.\n\t"
-     "	movl	20(%%ebp), %%ecx		# n_2_ccomplex_blocks\n\t"
-     "  shrl    $4, %%ecx\n\t"
-     "	andl	$1, %%ecx\n\t"
-     "	je	.%=Leven\n\t"
-     "	# The count was odd, do 2 more taps.\n\t"
-     "	# Note that we've already got mm0/mm2 preloaded\n\t"
-     "	# from the main loop.\n\t"
-     "	movaps	%%xmm0, %%xmm4\n\t"
-     "	mulps	%%xmm2, %%xmm0\n\t"
-     "	shufps	$0xb1, %%xmm4, %%xmm4	# swap internals\n\t"
-     "	addps	%%xmm0, %%xmm6\n\t"
-     "	mulps	%%xmm4, %%xmm2\n\t"
-     "	addps	%%xmm2, %%xmm7\n\t"
-     ".%=Leven:\n\t"
-     "	# neg inversor\n\t"
-     "  movl 8(%%ebp), %%eax \n\t"
-     "	xorps	%%xmm1, %%xmm1\n\t"
-     "  movl	$0x80000000, (%%eax)\n\t"
-     "	movss	(%%eax), %%xmm1\n\t"
-     "	shufps	$0x11, %%xmm1, %%xmm1	# b00010001 # 0 -0 0 -0\n\t"
-     "	# pfpnacc\n\t"
-     "	xorps	%%xmm1, %%xmm6\n\t"
-     "	movaps	%%xmm6, %%xmm2\n\t"
-     "	unpcklps %%xmm7, %%xmm6\n\t"
-     "	unpckhps %%xmm7, %%xmm2\n\t"
-     "	movaps	%%xmm2, %%xmm3\n\t"
-     "	shufps	$0x44, %%xmm6, %%xmm2	# b01000100\n\t"
-     "	shufps	$0xee, %%xmm3, %%xmm6	# b11101110\n\t"
-     "	addps	%%xmm2, %%xmm6\n\t"
-     "					# xmm6 = r1 i2 r3 i4\n\t"
-     "	#movl	8(%%ebp), %%eax		# @result\n\t"
-     "	movhlps	%%xmm6, %%xmm4		# xmm4 = r3 i4 ?? ??\n\t"
-     "	addps	%%xmm4, %%xmm6		# xmm6 = r1+r3 i2+i4 ?? ??\n\t"
-     "	movlps	%%xmm6, (%%eax)		# store low 2x32 bits (complex) to memory\n\t"
-     "	#popl	%%ebp\n\t"
-     :
-     :
-     : "eax", "ecx", "edx"
-     );
-
-  
-  int getem = num_bytes % 16;
-  
-  for(; getem > 0; getem -= 8) {
-    
-    
-    *result += (input[(num_bytes >> 3) - 1] * taps[(num_bytes >> 3) - 1]);
-    
-  }
-  
-  return;
-  
-  
-  
-
-  
-  
-}
-
-#endif /*LV_HAVE_SSE*/  
-
-#if LV_HAVE_SSE3
-
-#include <pmmintrin.h>
-
-static inline void volk_32fc_32fc_dot_prod_32fc_a16_sse3(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_bytes) {
-  
-
-  lv_32fc_t dotProduct;
-  memset(&dotProduct, 0x0, 2*sizeof(float));
-
-  unsigned int number = 0;
-  const unsigned int halfPoints = num_bytes >> 4;
-
-  __m128 x, y, yl, yh, z, tmp1, tmp2, dotProdVal;
-
-  const lv_32fc_t* a = input;
-  const lv_32fc_t* b = taps;
-
-  dotProdVal = _mm_setzero_ps();
-
-  for(;number < halfPoints; number++){
-      
-    x = _mm_load_ps((float*)a); // Load the ar + ai, br + bi as ar,ai,br,bi
-    y = _mm_load_ps((float*)b); // Load the cr + ci, dr + di as cr,ci,dr,di
-      
-    yl = _mm_moveldup_ps(y); // Load yl with cr,cr,dr,dr
-    yh = _mm_movehdup_ps(y); // Load yh with ci,ci,di,di
-      
-    tmp1 = _mm_mul_ps(x,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
-      
-    x = _mm_shuffle_ps(x,x,0xB1); // Re-arrange x to be ai,ar,bi,br
-      
-    tmp2 = _mm_mul_ps(x,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
-      
-    z = _mm_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
-
-    dotProdVal = _mm_add_ps(dotProdVal, z); // Add the complex multiplication results together
-
-    a += 2;
-    b += 2;
-  }
-
-  lv_32fc_t dotProductVector[2] __attribute__((aligned(16)));
-
-  _mm_store_ps((float*)dotProductVector,dotProdVal); // Store the results back into the dot product vector
-
-  dotProduct += ( dotProductVector[0] + dotProductVector[1] );
-
-  if((num_bytes >> 2) != 0) {
-    dotProduct += (*a) * (*b);
-  }
-
-  *result = dotProduct;
-}  
-
-#endif /*LV_HAVE_SSE3*/
-
-#if LV_HAVE_SSE4_1
-
-#include <smmintrin.h>
-
-static inline void volk_32fc_32fc_dot_prod_32fc_a16_sse4_1(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_bytes) {
-  volk_32fc_32fc_dot_prod_32fc_a16_sse3(result, input, taps, num_bytes);
-  // SSE3 version runs twice as fast as the SSE4.1 version, so turning off SSE4 version for now
-   /* 
-    __m128 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, real0, real1, im0, im1;
-    float *p_input, *p_taps;
-    __m64 *p_result;
-
-    p_result = (__m64*)result;
-    p_input = (float*)input;
-    p_taps = (float*)taps;
-
-    static const __m128i neg = {0x000000000000000080000000};
-
-    int i = 0;
-  
-    int bound = (num_bytes >> 5);
-    int leftovers = (num_bytes & 24) >> 3;
-
-    real0 = _mm_sub_ps(real0, real0);
-    real1 = _mm_sub_ps(real1, real1);
-    im0 = _mm_sub_ps(im0, im0);
-    im1 = _mm_sub_ps(im1, im1);
-  
-    for(; i < bound; ++i) {
-  
-    
-    xmm0 = _mm_load_ps(p_input);
-    xmm1 = _mm_load_ps(p_taps);
-    
-    p_input += 4;
-    p_taps += 4;
-    
-    xmm2 = _mm_load_ps(p_input);
-    xmm3 = _mm_load_ps(p_taps);
-    
-    p_input += 4;
-    p_taps += 4;
-    
-    xmm4 = _mm_unpackhi_ps(xmm0, xmm2);
-    xmm5 = _mm_unpackhi_ps(xmm1, xmm3);
-    xmm0 = _mm_unpacklo_ps(xmm0, xmm2);
-    xmm2 = _mm_unpacklo_ps(xmm1, xmm3);
-    
-    //imaginary vector from input
-    xmm1 = _mm_unpackhi_ps(xmm0, xmm4);
-    //real vector from input
-    xmm3 = _mm_unpacklo_ps(xmm0, xmm4);
-    //imaginary vector from taps
-    xmm0 = _mm_unpackhi_ps(xmm2, xmm5);
-    //real vector from taps
-    xmm2 = _mm_unpacklo_ps(xmm2, xmm5);
-    
-    xmm4 = _mm_dp_ps(xmm3, xmm2, 0xf1);
-    xmm5 = _mm_dp_ps(xmm1, xmm0, 0xf1);
-    
-    xmm6 = _mm_dp_ps(xmm3, xmm0, 0xf2);
-    xmm7 = _mm_dp_ps(xmm1, xmm2, 0xf2);
-    
-    real0 = _mm_add_ps(xmm4, real0);
-    real1 = _mm_add_ps(xmm5, real1);
-    im0 = _mm_add_ps(xmm6, im0);
-    im1 = _mm_add_ps(xmm7, im1);
-    
-    }
-
-
-    
-    
-    real1 = _mm_xor_ps(real1, (__m128)neg);
-    
-  
-    im0 = _mm_add_ps(im0, im1);
-    real0 = _mm_add_ps(real0, real1);
-  
-    im0 = _mm_add_ps(im0, real0);
-  
-    _mm_storel_pi(p_result, im0);
-  
-    for(i = bound * 4; i < (bound * 4) + leftovers; ++i) {
-    
-    *result += input[i] * taps[i];
-    }
-  */
-}  
-
-#endif /*LV_HAVE_SSE4_1*/
-
-#endif /*INCLUDED_volk_32fc_32fc_dot_prod_32fc_a16_H*/
diff --git a/volk/include/volk/volk_32fc_32fc_multiply_32fc_a16.h b/volk/include/volk/volk_32fc_32fc_multiply_32fc_a16.h
deleted file mode 100644
index 59259882c..000000000
--- a/volk/include/volk/volk_32fc_32fc_multiply_32fc_a16.h
+++ /dev/null
@@ -1,95 +0,0 @@
-#ifndef INCLUDED_volk_32fc_32fc_multiply_32fc_a16_H
-#define INCLUDED_volk_32fc_32fc_multiply_32fc_a16_H
-
-#include <inttypes.h>
-#include <stdio.h>
-#include <volk/volk_complex.h>
-#include <float.h>
-
-#if LV_HAVE_SSE3
-#include <pmmintrin.h>
-  /*!
-    \brief Multiplies the two input complex vectors and stores their results in the third vector
-    \param cVector The vector where the results will be stored
-    \param aVector One of the vectors to be multiplied
-    \param bVector One of the vectors to be multiplied
-    \param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector
-  */
-static inline void volk_32fc_32fc_multiply_32fc_a16_sse3(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t* bVector, unsigned int num_points){
-  unsigned int number = 0;
-    const unsigned int halfPoints = num_points / 2;
-
-    __m128 x, y, yl, yh, z, tmp1, tmp2;
-    lv_32fc_t* c = cVector;
-    const lv_32fc_t* a = aVector;
-    const lv_32fc_t* b = bVector;
-
-    for(;number < halfPoints; number++){
-      
-      x = _mm_load_ps((float*)a); // Load the ar + ai, br + bi as ar,ai,br,bi
-      y = _mm_load_ps((float*)b); // Load the cr + ci, dr + di as cr,ci,dr,di
-      
-      yl = _mm_moveldup_ps(y); // Load yl with cr,cr,dr,dr
-      yh = _mm_movehdup_ps(y); // Load yh with ci,ci,di,di
-      
-      tmp1 = _mm_mul_ps(x,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
-      
-      x = _mm_shuffle_ps(x,x,0xB1); // Re-arrange x to be ai,ar,bi,br
-      
-      tmp2 = _mm_mul_ps(x,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
-      
-      z = _mm_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
-    
-      _mm_store_ps((float*)c,z); // Store the results back into the C container
-
-      a += 2;
-      b += 2;
-      c += 2;
-    }
-
-    if((num_points % 2) != 0) {
-      *c = (*a) * (*b);
-    }
-}
-#endif /* LV_HAVE_SSE */
-
-#if LV_HAVE_GENERIC
-  /*!
-    \brief Multiplies the two input complex vectors and stores their results in the third vector
-    \param cVector The vector where the results will be stored
-    \param aVector One of the vectors to be multiplied
-    \param bVector One of the vectors to be multiplied
-    \param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector
-  */
-static inline void volk_32fc_32fc_multiply_32fc_a16_generic(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t* bVector, unsigned int num_points){
-    lv_32fc_t* cPtr = cVector;
-    const lv_32fc_t* aPtr = aVector;
-    const lv_32fc_t* bPtr=  bVector;
-    unsigned int number = 0;
-
-    for(number = 0; number < num_points; number++){
-      *cPtr++ = (*aPtr++) * (*bPtr++);
-    }
-}
-#endif /* LV_HAVE_GENERIC */
-
-#if LV_HAVE_ORC
-  /*!
-    \brief Multiplies the two input complex vectors and stores their results in the third vector
-    \param cVector The vector where the results will be stored
-    \param aVector One of the vectors to be multiplied
-    \param bVector One of the vectors to be multiplied
-    \param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector
-  */
-extern void volk_32fc_32fc_multiply_32fc_a16_orc_impl(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t* bVector, float mask, unsigned int num_points);
-static inline void volk_32fc_32fc_multiply_32fc_a16_orc(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t* bVector, unsigned int num_points){
-    static const float mask = -0.0;
-    volk_32fc_32fc_multiply_32fc_a16_orc_impl(cVector, aVector, bVector, mask, num_points);
-}
-#endif /* LV_HAVE_ORC */
-
-
-
-
-
-#endif /* INCLUDED_volk_32fc_32fc_multiply_32fc_a16_H */
diff --git a/volk/include/volk/volk_32fc_32fc_s32f_square_dist_scalar_mult_32f_a16.h b/volk/include/volk/volk_32fc_32fc_s32f_square_dist_scalar_mult_32f_a16.h
deleted file mode 100644
index 14f511697..000000000
--- a/volk/include/volk/volk_32fc_32fc_s32f_square_dist_scalar_mult_32f_a16.h
+++ /dev/null
@@ -1,126 +0,0 @@
-#ifndef INCLUDED_volk_32fc_32fc_s32f_square_dist_scalar_mult_32f_a16_H
-#define INCLUDED_volk_32fc_32fc_s32f_square_dist_scalar_mult_32f_a16_H
-
-#include<inttypes.h>
-#include<stdio.h>
-#include<volk/volk_complex.h>
-#include <string.h>
-
-#if LV_HAVE_SSE3
-#include<xmmintrin.h>
-#include<pmmintrin.h>
-
-static inline void volk_32fc_32fc_s32f_square_dist_scalar_mult_32f_a16_sse3(float* target, lv_32fc_t* src0, lv_32fc_t* points, float scalar, unsigned int num_bytes) {
-  
-
-  __m128 xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
-
-  lv_32fc_t diff;
-  memset(&diff, 0x0, 2*sizeof(float));
-
-  float sq_dist = 0.0;
-  int bound = num_bytes >> 5;
-  int leftovers0 = (num_bytes >> 4) & 1;
-  int leftovers1 = (num_bytes >> 3) & 1;
-  int i = 0;
-  
-  
-  
-  xmm1 = _mm_setzero_ps();
-  xmm1 = _mm_loadl_pi(xmm1, (__m64*)src0);  
-  xmm2 = _mm_load_ps((float*)&points[0]);
-  xmm8 = _mm_load1_ps(&scalar);
-  xmm1 = _mm_movelh_ps(xmm1, xmm1);
-  xmm3 = _mm_load_ps((float*)&points[2]);
-  
-  
-  for(; i < bound - 1; ++i) {
-  
-    xmm4 = _mm_sub_ps(xmm1, xmm2);
-    xmm5 = _mm_sub_ps(xmm1, xmm3);
-    points += 4;
-    xmm6 = _mm_mul_ps(xmm4, xmm4);
-    xmm7 = _mm_mul_ps(xmm5, xmm5);
-    
-    xmm2 = _mm_load_ps((float*)&points[0]);
-    
-    xmm4 = _mm_hadd_ps(xmm6, xmm7);
-
-    xmm3 = _mm_load_ps((float*)&points[2]);
-    
-    xmm4 = _mm_mul_ps(xmm4, xmm8);
-
-    _mm_store_ps(target, xmm4);
-
-    target += 4;
-
-  }
-  
-  xmm4 = _mm_sub_ps(xmm1, xmm2);
-  xmm5 = _mm_sub_ps(xmm1, xmm3);
-  
-  
-
-  points += 4;
-  xmm6 = _mm_mul_ps(xmm4, xmm4);
-  xmm7 = _mm_mul_ps(xmm5, xmm5);
-    
-  xmm4 = _mm_hadd_ps(xmm6, xmm7);
-  
-  xmm4 = _mm_mul_ps(xmm4, xmm8);
-   
-  _mm_store_ps(target, xmm4);
-  
-  target += 4;
-  
-
-  for(i = 0; i < leftovers0; ++i) {
-    
-    xmm2 = _mm_load_ps((float*)&points[0]);
-    
-    xmm4 = _mm_sub_ps(xmm1, xmm2);
-    
-    points += 2;
-    
-    xmm6 = _mm_mul_ps(xmm4, xmm4);
-
-    xmm4 = _mm_hadd_ps(xmm6, xmm6);
-
-    xmm4 = _mm_mul_ps(xmm4, xmm8);
-    
-    _mm_storeh_pi((__m64*)target, xmm4);
-
-    target += 2;
-  }
-
-  for(i = 0; i < leftovers1; ++i) {
-    
-    diff = src0[0] - points[0];
-
-    sq_dist = scalar * (lv_creal(diff) * lv_creal(diff) + lv_cimag(diff) * lv_cimag(diff));
-
-    target[0] = sq_dist;
-  }
-}
-
-#endif /*LV_HAVE_SSE3*/
-
-#if LV_HAVE_GENERIC
-static inline void volk_32fc_32fc_s32f_square_dist_scalar_mult_32f_a16_generic(float* target, lv_32fc_t* src0, lv_32fc_t* points, float scalar, unsigned int num_bytes) {
-  lv_32fc_t diff;
-  float sq_dist;
-  int i = 0; 
-  
-  for(; i < num_bytes >> 3; ++i) {
-    diff = src0[0] - points[i];
-
-    sq_dist = scalar * (lv_creal(diff) * lv_creal(diff) + lv_cimag(diff) * lv_cimag(diff));
-    
-    target[i] = sq_dist;
-  }
-}
-
-#endif /*LV_HAVE_GENERIC*/
-
-
-#endif /*INCLUDED_volk_32fc_32fc_s32f_square_dist_scalar_mult_32f_a16_H*/
diff --git a/volk/include/volk/volk_32fc_32fc_square_dist_32f_a16.h b/volk/include/volk/volk_32fc_32fc_square_dist_32f_a16.h
deleted file mode 100644
index b6c72adbf..000000000
--- a/volk/include/volk/volk_32fc_32fc_square_dist_32f_a16.h
+++ /dev/null
@@ -1,112 +0,0 @@
-#ifndef INCLUDED_volk_32fc_32fc_square_dist_32f_a16_H
-#define INCLUDED_volk_32fc_32fc_square_dist_32f_a16_H
-
-#include<inttypes.h>
-#include<stdio.h>
-#include<volk/volk_complex.h>
-
-#if LV_HAVE_SSE3
-#include<xmmintrin.h>
-#include<pmmintrin.h>
-
-static inline void volk_32fc_32fc_square_dist_32f_a16_sse3(float* target, lv_32fc_t* src0, lv_32fc_t* points, unsigned int num_bytes) {
-  
-
-  __m128 xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
-
-  lv_32fc_t diff;
-  float sq_dist;
-  int bound = num_bytes >> 5;
-  int leftovers0 = (num_bytes >> 4) & 1;
-  int leftovers1 = (num_bytes >> 3) & 1;
-  int i = 0;
-
-  xmm1 = _mm_setzero_ps();
-  xmm1 = _mm_loadl_pi(xmm1, (__m64*)src0);  
-  xmm2 = _mm_load_ps((float*)&points[0]);
-  xmm1 = _mm_movelh_ps(xmm1, xmm1);
-  xmm3 = _mm_load_ps((float*)&points[2]);
-  
-
-  for(; i < bound - 1; ++i) {
-    xmm4 = _mm_sub_ps(xmm1, xmm2);
-    xmm5 = _mm_sub_ps(xmm1, xmm3);
-    points += 4;
-    xmm6 = _mm_mul_ps(xmm4, xmm4);
-    xmm7 = _mm_mul_ps(xmm5, xmm5);
-    
-    xmm2 = _mm_load_ps((float*)&points[0]);
-    
-    xmm4 = _mm_hadd_ps(xmm6, xmm7);
-
-    xmm3 = _mm_load_ps((float*)&points[2]);
-
-    _mm_store_ps(target, xmm4);
-
-    target += 4;
-
-  }
-  
-  xmm4 = _mm_sub_ps(xmm1, xmm2);
-  xmm5 = _mm_sub_ps(xmm1, xmm3);
-  
-  
-
-  points += 4;
-  xmm6 = _mm_mul_ps(xmm4, xmm4);
-  xmm7 = _mm_mul_ps(xmm5, xmm5);
-    
-  xmm4 = _mm_hadd_ps(xmm6, xmm7);
-   
-  _mm_store_ps(target, xmm4);
-  
-  target += 4;
-
-  for(i = 0; i < leftovers0; ++i) {
-    
-    xmm2 = _mm_load_ps((float*)&points[0]);
-    
-    xmm4 = _mm_sub_ps(xmm1, xmm2);
-    
-    points += 2;
-    
-    xmm6 = _mm_mul_ps(xmm4, xmm4);
-
-    xmm4 = _mm_hadd_ps(xmm6, xmm6);
-    
-    _mm_storeh_pi((__m64*)target, xmm4);
-
-    target += 2;
-  }
-
-  for(i = 0; i < leftovers1; ++i) {
-    
-    diff = src0[0] - points[0];
-
-    sq_dist = lv_creal(diff) * lv_creal(diff) + lv_cimag(diff) * lv_cimag(diff);
-
-    target[0] = sq_dist;
-  }
-}
-
-#endif /*LV_HAVE_SSE3*/
-
-#if LV_HAVE_GENERIC
-static inline void volk_32fc_32fc_square_dist_32f_a16_generic(float* target, lv_32fc_t* src0, lv_32fc_t* points, unsigned int num_bytes) {
-  lv_32fc_t diff;
-  float sq_dist;
-  int i = 0; 
-  
-  for(; i < num_bytes >> 3; ++i) {
-    diff = src0[0] - points[i];
-
-    sq_dist = lv_creal(diff) * lv_creal(diff) + lv_cimag(diff) * lv_cimag(diff);
-    
-    target[i] = sq_dist;
-  }
-}
-
-#endif /*LV_HAVE_GENERIC*/
-
-
-#endif /*INCLUDED_volk_32fc_32fc_square_dist_32f_a16_H*/
diff --git a/volk/include/volk/volk_32fc_deinterleave_32f_32f_a16.h b/volk/include/volk/volk_32fc_deinterleave_32f_32f_a16.h
deleted file mode 100644
index 3ee579c2e..000000000
--- a/volk/include/volk/volk_32fc_deinterleave_32f_32f_a16.h
+++ /dev/null
@@ -1,75 +0,0 @@
-#ifndef INCLUDED_volk_32fc_deinterleave_32f_32f_a16_H
-#define INCLUDED_volk_32fc_deinterleave_32f_32f_a16_H
-
-#include <inttypes.h>
-#include <stdio.h>
-
-#if LV_HAVE_SSE
-#include <xmmintrin.h>
-/*!
-  \brief Deinterleaves the complex vector into I & Q vector data
-  \param complexVector The complex input vector
-  \param iBuffer The I buffer output data
-  \param qBuffer The Q buffer output data
-  \param num_points The number of complex data values to be deinterleaved
-*/
-static inline void volk_32fc_deinterleave_32f_32f_a16_sse(float* iBuffer, float* qBuffer, const lv_32fc_t* complexVector, unsigned int num_points){
-  const float* complexVectorPtr = (float*)complexVector;
-  float* iBufferPtr = iBuffer;
-  float* qBufferPtr = qBuffer;
-
-  unsigned int number = 0;
-  const unsigned int quarterPoints = num_points / 4;    
-  __m128 cplxValue1, cplxValue2, iValue, qValue;
-  for(;number < quarterPoints; number++){
-      
-    cplxValue1 = _mm_load_ps(complexVectorPtr);
-    complexVectorPtr += 4;
-
-    cplxValue2 = _mm_load_ps(complexVectorPtr);
-    complexVectorPtr += 4;
-
-    // Arrange in i1i2i3i4 format
-    iValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2,0,2,0));
-    // Arrange in q1q2q3q4 format
-    qValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(3,1,3,1));
-
-    _mm_store_ps(iBufferPtr, iValue);
-    _mm_store_ps(qBufferPtr, qValue);
-
-    iBufferPtr += 4;
-    qBufferPtr += 4;
-  }
-
-  number = quarterPoints * 4;
-  for(; number < num_points; number++){
-    *iBufferPtr++ = *complexVectorPtr++;
-    *qBufferPtr++ = *complexVectorPtr++;
-  }
-}
-#endif /* LV_HAVE_SSE */
-
-#if LV_HAVE_GENERIC
-/*!
-  \brief Deinterleaves the complex vector into I & Q vector data
-  \param complexVector The complex input vector
-  \param iBuffer The I buffer output data
-  \param qBuffer The Q buffer output data
-  \param num_points The number of complex data values to be deinterleaved
-*/
-static inline void volk_32fc_deinterleave_32f_32f_a16_generic(float* iBuffer, float* qBuffer, const lv_32fc_t* complexVector, unsigned int num_points){
-  const float* complexVectorPtr = (float*)complexVector;
-  float* iBufferPtr = iBuffer;
-  float* qBufferPtr = qBuffer;
-  unsigned int number;
-  for(number = 0; number < num_points; number++){
-    *iBufferPtr++ = *complexVectorPtr++;
-    *qBufferPtr++ = *complexVectorPtr++;
-  }
-}
-#endif /* LV_HAVE_GENERIC */
-
-
-
-
-#endif /* INCLUDED_volk_32fc_deinterleave_32f_32f_a16_H */
diff --git a/volk/include/volk/volk_32fc_deinterleave_32f_x2_a16.h b/volk/include/volk/volk_32fc_deinterleave_32f_x2_a16.h
new file mode 100644
index 000000000..84d2576ed
--- /dev/null
+++ b/volk/include/volk/volk_32fc_deinterleave_32f_x2_a16.h
@@ -0,0 +1,75 @@
+#ifndef INCLUDED_volk_32fc_deinterleave_32f_x2_a16_H
+#define INCLUDED_volk_32fc_deinterleave_32f_x2_a16_H
+
+#include <inttypes.h>
+#include <stdio.h>
+
+#if LV_HAVE_SSE
+#include <xmmintrin.h>
+/*!
+  \brief Deinterleaves the complex vector into I & Q vector data (SSE; uses aligned loads and stores)
+  \param iBuffer The I buffer output data (written with _mm_store_ps, so it must be 16-byte aligned)
+  \param qBuffer The Q buffer output data (written with _mm_store_ps, so it must be 16-byte aligned)
+  \param complexVector The complex input vector, read as interleaved I,Q floats (16-byte aligned)
+  \param num_points The number of complex data values to be deinterleaved
+*/
+static inline void volk_32fc_deinterleave_32f_x2_a16_sse(float* iBuffer, float* qBuffer, const lv_32fc_t* complexVector, unsigned int num_points){
+  const float* complexVectorPtr = (const float*)complexVector; // input is read-only: keep the const qualifier
+  float* iBufferPtr = iBuffer;
+  float* qBufferPtr = qBuffer;
+
+  unsigned int number = 0;
+  const unsigned int quarterPoints = num_points / 4; // each vector iteration consumes 4 complex points
+  __m128 cplxValue1, cplxValue2, iValue, qValue;
+  for(;number < quarterPoints; number++){
+    // Two aligned loads fetch four complex points: i1,q1,i2,q2 then i3,q3,i4,q4
+    cplxValue1 = _mm_load_ps(complexVectorPtr);
+    complexVectorPtr += 4;
+
+    cplxValue2 = _mm_load_ps(complexVectorPtr);
+    complexVectorPtr += 4;
+
+    // Arrange in i1i2i3i4 format (even lanes of both registers)
+    iValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2,0,2,0));
+    // Arrange in q1q2q3q4 format (odd lanes of both registers)
+    qValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(3,1,3,1));
+
+    _mm_store_ps(iBufferPtr, iValue);
+    _mm_store_ps(qBufferPtr, qValue);
+
+    iBufferPtr += 4;
+    qBufferPtr += 4;
+  }
+
+  number = quarterPoints * 4; // scalar tail handles the remaining num_points % 4 points
+  for(; number < num_points; number++){
+    *iBufferPtr++ = *complexVectorPtr++;
+    *qBufferPtr++ = *complexVectorPtr++;
+  }
+}
+#endif /* LV_HAVE_SSE */
+
+#if LV_HAVE_GENERIC
+/*!
+  \brief Deinterleaves the complex vector into I & Q vector data (portable scalar implementation)
+  \param iBuffer The I buffer output data
+  \param qBuffer The Q buffer output data
+  \param complexVector The complex input vector, read as interleaved I,Q floats
+  \param num_points The number of complex data values to be deinterleaved
+*/
+static inline void volk_32fc_deinterleave_32f_x2_a16_generic(float* iBuffer, float* qBuffer, const lv_32fc_t* complexVector, unsigned int num_points){
+  const float* complexVectorPtr = (const float*)complexVector; // input is read-only: keep the const qualifier
+  float* iBufferPtr = iBuffer;
+  float* qBufferPtr = qBuffer;
+  unsigned int number;
+  for(number = 0; number < num_points; number++){
+    *iBufferPtr++ = *complexVectorPtr++; // first float of the pair goes to the I buffer
+    *qBufferPtr++ = *complexVectorPtr++; // second float of the pair goes to the Q buffer
+  }
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+
+
+#endif /* INCLUDED_volk_32fc_deinterleave_32f_x2_a16_H */
diff --git a/volk/include/volk/volk_32fc_deinterleave_64f_64f_a16.h b/volk/include/volk/volk_32fc_deinterleave_64f_64f_a16.h
deleted file mode 100644
index 404defc36..000000000
--- a/volk/include/volk/volk_32fc_deinterleave_64f_64f_a16.h
+++ /dev/null
@@ -1,78 +0,0 @@
-#ifndef INCLUDED_volk_32fc_deinterleave_64f_64f_a16_H
-#define INCLUDED_volk_32fc_deinterleave_64f_64f_a16_H
-
-#include <inttypes.h>
-#include <stdio.h>
-
-#if LV_HAVE_SSE2
-#include <emmintrin.h>
-/*!
-  \brief Deinterleaves the lv_32fc_t vector into double I & Q vector data
-  \param complexVector The complex input vector
-  \param iBuffer The I buffer output data
-  \param qBuffer The Q buffer output data
-  \param num_points The number of complex data values to be deinterleaved
-*/
-static inline void volk_32fc_deinterleave_64f_64f_a16_sse2(double* iBuffer, double* qBuffer, const lv_32fc_t* complexVector, unsigned int num_points){
-  unsigned int number = 0;
-
-    const float* complexVectorPtr = (float*)complexVector;
-    double* iBufferPtr = iBuffer;
-    double* qBufferPtr = qBuffer;
-
-    const unsigned int halfPoints = num_points / 2;    
-    __m128 cplxValue, fVal;
-    __m128d dVal;
-
-    for(;number < halfPoints; number++){
-      
-      cplxValue = _mm_load_ps(complexVectorPtr);
-      complexVectorPtr += 4;
-
-      // Arrange in i1i2i1i2 format
-      fVal = _mm_shuffle_ps(cplxValue, cplxValue, _MM_SHUFFLE(2,0,2,0));
-      dVal = _mm_cvtps_pd(fVal); 
-      _mm_store_pd(iBufferPtr, dVal);
-
-      // Arrange in q1q2q1q2 format
-      fVal = _mm_shuffle_ps(cplxValue, cplxValue, _MM_SHUFFLE(3,1,3,1));
-      dVal = _mm_cvtps_pd(fVal); 
-      _mm_store_pd(qBufferPtr, dVal);
-
-      iBufferPtr += 2;
-      qBufferPtr += 2;
-    }
-
-    number = halfPoints * 2;
-    for(; number < num_points; number++){
-      *iBufferPtr++ = *complexVectorPtr++;
-      *qBufferPtr++ = *complexVectorPtr++;
-    }
-}
-#endif /* LV_HAVE_SSE */
-
-#if LV_HAVE_GENERIC
-/*!
-  \brief Deinterleaves the lv_32fc_t vector into double I & Q vector data
-  \param complexVector The complex input vector
-  \param iBuffer The I buffer output data
-  \param qBuffer The Q buffer output data
-  \param num_points The number of complex data values to be deinterleaved
-*/
-static inline void volk_32fc_deinterleave_64f_64f_a16_generic(double* iBuffer, double* qBuffer, const lv_32fc_t* complexVector, unsigned int num_points){
-  unsigned int number = 0;
-  const float* complexVectorPtr = (float*)complexVector;
-  double* iBufferPtr = iBuffer;
-  double* qBufferPtr = qBuffer;
-
-  for(number = 0; number < num_points; number++){
-    *iBufferPtr++ = (double)*complexVectorPtr++;
-    *qBufferPtr++ = (double)*complexVectorPtr++;
-  }
-}
-#endif /* LV_HAVE_GENERIC */
-
-
-
-
-#endif /* INCLUDED_volk_32fc_deinterleave_64f_64f_a16_H */
diff --git a/volk/include/volk/volk_32fc_deinterleave_64f_x2_a16.h b/volk/include/volk/volk_32fc_deinterleave_64f_x2_a16.h
new file mode 100644
index 000000000..34262a7af
--- /dev/null
+++ b/volk/include/volk/volk_32fc_deinterleave_64f_x2_a16.h
@@ -0,0 +1,78 @@
+#ifndef INCLUDED_volk_32fc_deinterleave_64f_x2_a16_H
+#define INCLUDED_volk_32fc_deinterleave_64f_x2_a16_H
+
+#include <inttypes.h>
+#include <stdio.h>
+
+#if LV_HAVE_SSE2
+#include <emmintrin.h>
+/*!
+  \brief Deinterleaves the lv_32fc_t vector into separate double-precision I & Q vectors (SSE2 path)
+  \param iBuffer The I buffer output data (written with the aligned store _mm_store_pd)
+  \param qBuffer The Q buffer output data (written with the aligned store _mm_store_pd)
+  \param complexVector The complex input vector (read with the aligned load _mm_load_ps)
+  \param num_points The number of complex data values to be deinterleaved
+*/
+static inline void volk_32fc_deinterleave_64f_x2_a16_sse2(double* iBuffer, double* qBuffer, const lv_32fc_t* complexVector, unsigned int num_points){
+  unsigned int number = 0;
+
+    const float* complexVectorPtr = (float*)complexVector; // walk the input as interleaved floats: i, q, i, q, ...
+    double* iBufferPtr = iBuffer;
+    double* qBufferPtr = qBuffer;
+
+    const unsigned int halfPoints = num_points / 2;    // each SIMD iteration consumes two complex points
+    __m128 cplxValue, fVal;
+    __m128d dVal;
+
+    for(;number < halfPoints; number++){
+      // Load two complex points: i1 q1 i2 q2
+      cplxValue = _mm_load_ps(complexVectorPtr);
+      complexVectorPtr += 4;
+
+      // Arrange in i1i2i1i2 format
+      fVal = _mm_shuffle_ps(cplxValue, cplxValue, _MM_SHUFFLE(2,0,2,0));
+      dVal = _mm_cvtps_pd(fVal); // widen the two low floats (i1, i2) to doubles
+      _mm_store_pd(iBufferPtr, dVal);
+
+      // Arrange in q1q2q1q2 format
+      fVal = _mm_shuffle_ps(cplxValue, cplxValue, _MM_SHUFFLE(3,1,3,1));
+      dVal = _mm_cvtps_pd(fVal); // widen the two low floats (q1, q2) to doubles
+      _mm_store_pd(qBufferPtr, dVal);
+
+      iBufferPtr += 2;
+      qBufferPtr += 2;
+    }
+    // Scalar tail: picks up the final point when num_points is odd
+    number = halfPoints * 2;
+    for(; number < num_points; number++){
+      *iBufferPtr++ = *complexVectorPtr++;
+      *qBufferPtr++ = *complexVectorPtr++;
+    }
+}
+#endif /* LV_HAVE_SSE2 */
+
+#if LV_HAVE_GENERIC
+/*!
+  \brief Deinterleaves the lv_32fc_t vector into separate double-precision I & Q vectors (portable C path)
+  \param iBuffer The I buffer output data
+  \param qBuffer The Q buffer output data
+  \param complexVector The complex input vector
+  \param num_points The number of complex data values to be deinterleaved
+*/
+static inline void volk_32fc_deinterleave_64f_x2_a16_generic(double* iBuffer, double* qBuffer, const lv_32fc_t* complexVector, unsigned int num_points){
+  unsigned int number = 0;
+  const float* complexVectorPtr = (float*)complexVector; // walk the input as interleaved floats: i, q, i, q, ...
+  double* iBufferPtr = iBuffer;
+  double* qBufferPtr = qBuffer;
+
+  for(number = 0; number < num_points; number++){
+    *iBufferPtr++ = (double)*complexVectorPtr++; // first float of the pair goes to I
+    *qBufferPtr++ = (double)*complexVectorPtr++; // second float of the pair goes to Q
+  }
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+
+
+#endif /* INCLUDED_volk_32fc_deinterleave_64f_x2_a16_H */
diff --git a/volk/include/volk/volk_32fc_deinterleave_real_16i_a16.h b/volk/include/volk/volk_32fc_deinterleave_real_16i_a16.h
new file mode 100644
index 000000000..6042e6d62
--- /dev/null
+++ b/volk/include/volk/volk_32fc_deinterleave_real_16i_a16.h
@@ -0,0 +1,80 @@
+#ifndef INCLUDED_volk_32fc_deinterleave_real_16i_a16_H
+#define INCLUDED_volk_32fc_deinterleave_real_16i_a16_H
+
+#include <inttypes.h>
+#include <stdio.h>
+
+#if LV_HAVE_SSE
+#include <xmmintrin.h>
+/*!
+  \brief Extracts the I (first) component of each complex value, multiplies it by the scalar, and stores it as a 16-bit integer
+  \param iBuffer The I buffer output data
+  \param complexVector The complex input vector (read with the aligned load _mm_load_ps)
+  \param scalar The value each I component is multiplied by before conversion
+  \param num_points The number of complex data values to be deinterleaved
+*/
+static inline void volk_32fc_deinterleave_real_16i_a16_sse(int16_t* iBuffer, const lv_32fc_t* complexVector, const float scalar, unsigned int num_points){
+  unsigned int number = 0;
+  const unsigned int quarterPoints = num_points / 4; // each SIMD iteration consumes four complex points
+
+  const float* complexVectorPtr = (float*)complexVector; // walk the input as interleaved floats: i, q, i, q, ...
+  int16_t* iBufferPtr = iBuffer;
+
+  __m128 vScalar = _mm_set_ps1(scalar); // scalar broadcast to all four lanes
+
+  __m128 cplxValue1, cplxValue2, iValue;
+  // NOTE(review): 128-byte alignment is requested; _mm_store_ps only requires 16 - confirm intent
+  float floatBuffer[4] __attribute__((aligned(128)));
+
+  for(;number < quarterPoints; number++){
+    cplxValue1 = _mm_load_ps(complexVectorPtr); // i1 q1 i2 q2
+    complexVectorPtr += 4;
+
+    cplxValue2 = _mm_load_ps(complexVectorPtr); // i3 q3 i4 q4
+    complexVectorPtr += 4;
+
+    // Arrange in i1i2i3i4 format
+    iValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2,0,2,0));
+
+    iValue = _mm_mul_ps(iValue, vScalar);
+    // Spill to memory and convert lane by lane; the casts do not saturate
+    _mm_store_ps(floatBuffer, iValue);
+    *iBufferPtr++ = (int16_t)(floatBuffer[0]);
+    *iBufferPtr++ = (int16_t)(floatBuffer[1]);
+    *iBufferPtr++ = (int16_t)(floatBuffer[2]);
+    *iBufferPtr++ = (int16_t)(floatBuffer[3]);
+  }
+  // Scalar tail for the remaining num_points % 4 points
+  number = quarterPoints * 4;
+  iBufferPtr = &iBuffer[number];
+  for(; number < num_points; number++){
+    *iBufferPtr++ = (int16_t)(*complexVectorPtr++ * scalar);
+    complexVectorPtr++; // skip the Q component
+  }
+}
+#endif /* LV_HAVE_SSE */
+
+#if LV_HAVE_GENERIC
+/*!
+  \brief Extracts the I (first) component of each complex value, multiplies it by the scalar, and stores it as a 16-bit integer
+  \param iBuffer The I buffer output data
+  \param complexVector The complex input vector
+  \param scalar The value each I component is multiplied by before conversion
+  \param num_points The number of complex data values to be deinterleaved
+*/
+static inline void volk_32fc_deinterleave_real_16i_a16_generic(int16_t* iBuffer, const lv_32fc_t* complexVector, const float scalar, unsigned int num_points){
+  const float* complexVectorPtr = (float*)complexVector; // walk the input as interleaved floats: i, q, i, q, ...
+  int16_t* iBufferPtr = iBuffer;
+  unsigned int number = 0;
+  for(number = 0; number < num_points; number++){
+    *iBufferPtr++ = (int16_t)(*complexVectorPtr++ * scalar); // the cast does not saturate
+    complexVectorPtr++; // skip the Q component
+  }
+
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+
+
+#endif /* INCLUDED_volk_32fc_deinterleave_real_16i_a16_H */
diff --git a/volk/include/volk/volk_32fc_deinterleave_real_16s_a16.h b/volk/include/volk/volk_32fc_deinterleave_real_16s_a16.h
deleted file mode 100644
index 53235e5f7..000000000
--- a/volk/include/volk/volk_32fc_deinterleave_real_16s_a16.h
+++ /dev/null
@@ -1,80 +0,0 @@
-#ifndef INCLUDED_volk_32fc_deinterleave_real_16s_a16_H
-#define INCLUDED_volk_32fc_deinterleave_real_16s_a16_H
-
-#include <inttypes.h>
-#include <stdio.h>
-
-#if LV_HAVE_SSE
-#include <xmmintrin.h>
-/*!
-  \brief Deinterleaves the complex vector, multiply the value by the scalar, convert to 16t, and in I vector data
-  \param complexVector The complex input vector
-  \param scalar The value to be multiply against each of the input values
-  \param iBuffer The I buffer output data
-  \param num_points The number of complex data values to be deinterleaved
-*/
-static inline void volk_32fc_deinterleave_real_16s_a16_sse(int16_t* iBuffer, const lv_32fc_t* complexVector, const float scalar, unsigned int num_points){
-  unsigned int number = 0;
-  const unsigned int quarterPoints = num_points / 4;
-
-  const float* complexVectorPtr = (float*)complexVector;
-  int16_t* iBufferPtr = iBuffer;
-
-  __m128 vScalar = _mm_set_ps1(scalar);
-
-  __m128 cplxValue1, cplxValue2, iValue;
-
-  float floatBuffer[4] __attribute__((aligned(128)));
-
-  for(;number < quarterPoints; number++){
-    cplxValue1 = _mm_load_ps(complexVectorPtr);
-    complexVectorPtr += 4;
-
-    cplxValue2 = _mm_load_ps(complexVectorPtr);
-    complexVectorPtr += 4;
-
-    // Arrange in i1i2i3i4 format
-    iValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2,0,2,0));
-
-    iValue = _mm_mul_ps(iValue, vScalar);
-
-    _mm_store_ps(floatBuffer, iValue);
-    *iBufferPtr++ = (int16_t)(floatBuffer[0]);
-    *iBufferPtr++ = (int16_t)(floatBuffer[1]);
-    *iBufferPtr++ = (int16_t)(floatBuffer[2]);
-    *iBufferPtr++ = (int16_t)(floatBuffer[3]);
-  }
-
-  number = quarterPoints * 4;
-  iBufferPtr = &iBuffer[number];
-  for(; number < num_points; number++){
-    *iBufferPtr++ = (int16_t)(*complexVectorPtr++ * scalar);
-    complexVectorPtr++;
-  }
-}
-#endif /* LV_HAVE_SSE */
-
-#if LV_HAVE_GENERIC
-/*!
-  \brief Deinterleaves the complex vector, multiply the value by the scalar, convert to 16t, and in I vector data
-  \param complexVector The complex input vector
-  \param scalar The value to be multiply against each of the input values
-  \param iBuffer The I buffer output data
-  \param num_points The number of complex data values to be deinterleaved
-*/
-static inline void volk_32fc_deinterleave_real_16s_a16_generic(int16_t* iBuffer, const lv_32fc_t* complexVector, const float scalar, unsigned int num_points){
-  const float* complexVectorPtr = (float*)complexVector;
-  int16_t* iBufferPtr = iBuffer;
-  unsigned int number = 0;
-  for(number = 0; number < num_points; number++){
-    *iBufferPtr++ = (int16_t)(*complexVectorPtr++ * scalar);
-    complexVectorPtr++;
-  }
-
-}
-#endif /* LV_HAVE_GENERIC */
-
-
-
-
-#endif /* INCLUDED_volk_32fc_deinterleave_real_16s_a16_H */
diff --git a/volk/include/volk/volk_32fc_s32f_magnitude_16i_a16.h b/volk/include/volk/volk_32fc_s32f_magnitude_16i_a16.h
new file mode 100644
index 000000000..530359600
--- /dev/null
+++ b/volk/include/volk/volk_32fc_s32f_magnitude_16i_a16.h
@@ -0,0 +1,158 @@
+#ifndef INCLUDED_volk_32fc_s32f_magnitude_16i_a16_H
+#define INCLUDED_volk_32fc_s32f_magnitude_16i_a16_H
+
+#include <inttypes.h>
+#include <stdio.h>
+#include <math.h>
+
+#if LV_HAVE_SSE3
+#include <pmmintrin.h>
+/*!
+  \brief Calculates the magnitude of the complexVector, scales the resulting value and stores the results in the magnitudeVector
+  \param magnitudeVector The vector containing the 16-bit integer output values
+  \param complexVector The vector containing the complex input values (read with the aligned load _mm_load_ps)
+  \param scalar The scale value multiplied to the magnitude of each complex value
+  \param num_points The number of complex values in complexVector to be calculated and stored into magnitudeVector
+*/
+static inline void volk_32fc_s32f_magnitude_16i_a16_sse3(int16_t* magnitudeVector, const lv_32fc_t* complexVector, const float scalar, unsigned int num_points){
+  unsigned int number = 0;
+  const unsigned int quarterPoints = num_points / 4; // each SIMD iteration consumes four complex points
+
+  const float* complexVectorPtr = (const float*)complexVector;
+  int16_t* magnitudeVectorPtr = magnitudeVector;
+
+  __m128 vScalar = _mm_set_ps1(scalar); // scalar broadcast to all four lanes
+
+  __m128 cplxValue1, cplxValue2, result;
+  // NOTE(review): 128-byte alignment is requested; _mm_store_ps only requires 16 - confirm intent
+  float floatBuffer[4] __attribute__((aligned(128)));
+
+  for(;number < quarterPoints; number++){
+    cplxValue1 = _mm_load_ps(complexVectorPtr); // i1 q1 i2 q2
+    complexVectorPtr += 4;
+
+    cplxValue2 = _mm_load_ps(complexVectorPtr); // i3 q3 i4 q4
+    complexVectorPtr += 4;
+
+    cplxValue1 = _mm_mul_ps(cplxValue1, cplxValue1); // Square the values
+    cplxValue2 = _mm_mul_ps(cplxValue2, cplxValue2); // Square the values
+
+    result = _mm_hadd_ps(cplxValue1, cplxValue2); // Adjacent-pair sums: i^2 + q^2 for points 1..4
+
+    result = _mm_sqrt_ps(result);
+
+    result = _mm_mul_ps(result, vScalar);
+    // Spill to memory and convert lane by lane; the casts do not saturate
+    _mm_store_ps(floatBuffer, result);
+    *magnitudeVectorPtr++ = (int16_t)(floatBuffer[0]);
+    *magnitudeVectorPtr++ = (int16_t)(floatBuffer[1]);
+    *magnitudeVectorPtr++ = (int16_t)(floatBuffer[2]);
+    *magnitudeVectorPtr++ = (int16_t)(floatBuffer[3]);
+  }
+  // Scalar tail for the remaining num_points % 4 points
+  number = quarterPoints * 4;
+  magnitudeVectorPtr = &magnitudeVector[number];
+  for(; number < num_points; number++){
+    float val1Real = *complexVectorPtr++;
+    float val1Imag = *complexVectorPtr++;
+    *magnitudeVectorPtr++ = (int16_t)(sqrtf((val1Real * val1Real) + (val1Imag * val1Imag)) * scalar);
+  }
+}
+#endif /* LV_HAVE_SSE3 */
+
+#if LV_HAVE_SSE
+#include <xmmintrin.h>
+/*!
+  \brief Calculates the magnitude of the complexVector, scales the resulting value and stores the results in the magnitudeVector
+  \param magnitudeVector The vector containing the 16-bit integer output values
+  \param complexVector The vector containing the complex input values (read with the aligned load _mm_load_ps)
+  \param scalar The scale value multiplied to the magnitude of each complex value
+  \param num_points The number of complex values in complexVector to be calculated and stored into magnitudeVector
+*/
+static inline void volk_32fc_s32f_magnitude_16i_a16_sse(int16_t* magnitudeVector, const lv_32fc_t* complexVector, const float scalar, unsigned int num_points){
+  unsigned int number = 0;
+  const unsigned int quarterPoints = num_points / 4; // each SIMD iteration consumes four complex points
+
+  const float* complexVectorPtr = (const float*)complexVector;
+  int16_t* magnitudeVectorPtr = magnitudeVector;
+
+  __m128 vScalar = _mm_set_ps1(scalar); // scalar broadcast to all four lanes
+
+  __m128 cplxValue1, cplxValue2, iValue, qValue, result;
+  // NOTE(review): 128-byte alignment is requested; _mm_store_ps only requires 16 - confirm intent
+  float floatBuffer[4] __attribute__((aligned(128)));
+
+  for(;number < quarterPoints; number++){
+    cplxValue1 = _mm_load_ps(complexVectorPtr); // i1 q1 i2 q2
+    complexVectorPtr += 4;
+
+    cplxValue2 = _mm_load_ps(complexVectorPtr); // i3 q3 i4 q4
+    complexVectorPtr += 4;
+
+    // Arrange in i1i2i3i4 format
+    iValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2,0,2,0));
+    // Arrange in q1q2q3q4 format
+    qValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(3,1,3,1));
+
+    iValue = _mm_mul_ps(iValue, iValue); // Square the I values
+    qValue = _mm_mul_ps(qValue, qValue); // Square the Q values
+
+    result = _mm_add_ps(iValue, qValue); // i^2 + q^2 for points 1..4
+
+    result = _mm_sqrt_ps(result);
+
+    result = _mm_mul_ps(result, vScalar);
+    // Spill to memory and convert lane by lane; the casts do not saturate
+    _mm_store_ps(floatBuffer, result);
+    *magnitudeVectorPtr++ = (int16_t)(floatBuffer[0]);
+    *magnitudeVectorPtr++ = (int16_t)(floatBuffer[1]);
+    *magnitudeVectorPtr++ = (int16_t)(floatBuffer[2]);
+    *magnitudeVectorPtr++ = (int16_t)(floatBuffer[3]);
+  }
+  // Scalar tail for the remaining num_points % 4 points
+  number = quarterPoints * 4;
+  magnitudeVectorPtr = &magnitudeVector[number];
+  for(; number < num_points; number++){
+    float val1Real = *complexVectorPtr++;
+    float val1Imag = *complexVectorPtr++;
+    *magnitudeVectorPtr++ = (int16_t)(sqrtf((val1Real * val1Real) + (val1Imag * val1Imag)) * scalar);
+  }
+}
+#endif /* LV_HAVE_SSE */
+
+#if LV_HAVE_GENERIC
+/*!
+  \brief Calculates the magnitude of the complexVector, scales the resulting value and stores the results in the magnitudeVector
+  \param magnitudeVector The vector containing the 16-bit integer output values
+  \param complexVector The vector containing the complex input values
+  \param scalar The scale value multiplied to the magnitude of each complex value
+  \param num_points The number of complex values in complexVector to be calculated and stored into magnitudeVector
+*/
+static inline void volk_32fc_s32f_magnitude_16i_a16_generic(int16_t* magnitudeVector, const lv_32fc_t* complexVector, const float scalar, unsigned int num_points){
+  const float* complexVectorPtr = (float*)complexVector; // walk the input as interleaved floats: real, imag, ...
+  int16_t* magnitudeVectorPtr = magnitudeVector;
+  unsigned int number = 0;
+  for(number = 0; number < num_points; number++){
+    const float real = *complexVectorPtr++;
+    const float imag = *complexVectorPtr++;
+    *magnitudeVectorPtr++ = (int16_t)(sqrtf((real*real) + (imag*imag)) * scalar); // the cast does not saturate
+  }
+}
+#endif /* LV_HAVE_GENERIC */
+
+#if LV_HAVE_ORC
+/*!
+  \brief Calculates the magnitude of the complexVector, scales the resulting value and stores the results in the magnitudeVector
+  \param magnitudeVector The vector containing the 16-bit integer output values
+  \param complexVector The vector containing the complex input values
+  \param scalar The scale value multiplied to the magnitude of each complex value
+  \param num_points The number of complex values in complexVector to be calculated and stored into magnitudeVector
+*/
+extern void volk_32fc_s32f_magnitude_16i_a16_orc_impl(int16_t* magnitudeVector, const lv_32fc_t* complexVector, const float scalar, unsigned int num_points); // defined outside this header
+static inline void volk_32fc_s32f_magnitude_16i_a16_orc(int16_t* magnitudeVector, const lv_32fc_t* complexVector, const float scalar, unsigned int num_points){
+    volk_32fc_s32f_magnitude_16i_a16_orc_impl(magnitudeVector, complexVector, scalar, num_points); // thin wrapper: forwards all arguments unchanged
+}
+#endif /* LV_HAVE_ORC */
+
+
+#endif /* INCLUDED_volk_32fc_s32f_magnitude_16i_a16_H */
diff --git a/volk/include/volk/volk_32fc_s32f_magnitude_16s_a16.h b/volk/include/volk/volk_32fc_s32f_magnitude_16s_a16.h
deleted file mode 100644
index dc3c6741a..000000000
--- a/volk/include/volk/volk_32fc_s32f_magnitude_16s_a16.h
+++ /dev/null
@@ -1,158 +0,0 @@
-#ifndef INCLUDED_volk_32fc_s32f_magnitude_16s_a16_H
-#define INCLUDED_volk_32fc_s32f_magnitude_16s_a16_H
-
-#include <inttypes.h>
-#include <stdio.h>
-#include <math.h>
-
-#if LV_HAVE_SSE3
-#include <pmmintrin.h>
-/*!
-  \brief Calculates the magnitude of the complexVector, scales the resulting value and stores the results in the magnitudeVector
-  \param complexVector The vector containing the complex input values
-  \param scalar The scale value multiplied to the magnitude of each complex vector
-  \param magnitudeVector The vector containing the real output values
-  \param num_points The number of complex values in complexVector to be calculated and stored into cVector
-*/
-static inline void volk_32fc_s32f_magnitude_16s_a16_sse3(int16_t* magnitudeVector, const lv_32fc_t* complexVector, const float scalar, unsigned int num_points){
-  unsigned int number = 0;
-  const unsigned int quarterPoints = num_points / 4;
-
-  const float* complexVectorPtr = (const float*)complexVector;
-  int16_t* magnitudeVectorPtr = magnitudeVector;
-
-  __m128 vScalar = _mm_set_ps1(scalar);
-
-  __m128 cplxValue1, cplxValue2, result;
-
-  float floatBuffer[4] __attribute__((aligned(128)));
-
-  for(;number < quarterPoints; number++){
-    cplxValue1 = _mm_load_ps(complexVectorPtr);
-    complexVectorPtr += 4;
-
-    cplxValue2 = _mm_load_ps(complexVectorPtr);
-    complexVectorPtr += 4;
-
-    cplxValue1 = _mm_mul_ps(cplxValue1, cplxValue1); // Square the values
-    cplxValue2 = _mm_mul_ps(cplxValue2, cplxValue2); // Square the Values
-
-    result = _mm_hadd_ps(cplxValue1, cplxValue2); // Add the I2 and Q2 values
-
-    result = _mm_sqrt_ps(result);
-
-    result = _mm_mul_ps(result, vScalar);
-
-    _mm_store_ps(floatBuffer, result);
-    *magnitudeVectorPtr++ = (int16_t)(floatBuffer[0]);
-    *magnitudeVectorPtr++ = (int16_t)(floatBuffer[1]);
-    *magnitudeVectorPtr++ = (int16_t)(floatBuffer[2]);
-    *magnitudeVectorPtr++ = (int16_t)(floatBuffer[3]);
-  }
-
-  number = quarterPoints * 4;
-  magnitudeVectorPtr = &magnitudeVector[number];
-  for(; number < num_points; number++){
-    float val1Real = *complexVectorPtr++;
-    float val1Imag = *complexVectorPtr++;
-    *magnitudeVectorPtr++ = (int16_t)(sqrtf((val1Real * val1Real) + (val1Imag * val1Imag)) * scalar);
-  }
-}
-#endif /* LV_HAVE_SSE3 */
-
-#if LV_HAVE_SSE
-#include <xmmintrin.h>
-/*!
-  \brief Calculates the magnitude of the complexVector, scales the resulting value and stores the results in the magnitudeVector
-  \param complexVector The vector containing the complex input values
-  \param scalar The scale value multiplied to the magnitude of each complex vector
-  \param magnitudeVector The vector containing the real output values
-  \param num_points The number of complex values in complexVector to be calculated and stored into cVector
-*/
-static inline void volk_32fc_s32f_magnitude_16s_a16_sse(int16_t* magnitudeVector, const lv_32fc_t* complexVector, const float scalar, unsigned int num_points){
-  unsigned int number = 0;
-  const unsigned int quarterPoints = num_points / 4;
-
-  const float* complexVectorPtr = (const float*)complexVector;
-  int16_t* magnitudeVectorPtr = magnitudeVector;
-
-  __m128 vScalar = _mm_set_ps1(scalar);
-
-  __m128 cplxValue1, cplxValue2, iValue, qValue, result;
-
-  float floatBuffer[4] __attribute__((aligned(128)));
-
-  for(;number < quarterPoints; number++){
-    cplxValue1 = _mm_load_ps(complexVectorPtr);
-    complexVectorPtr += 4;
-
-    cplxValue2 = _mm_load_ps(complexVectorPtr);
-    complexVectorPtr += 4;
-
-    // Arrange in i1i2i3i4 format
-    iValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2,0,2,0));
-    // Arrange in q1q2q3q4 format
-    qValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(3,1,3,1));
-
-    iValue = _mm_mul_ps(iValue, iValue); // Square the I values
-    qValue = _mm_mul_ps(qValue, qValue); // Square the Q Values
-
-    result = _mm_add_ps(iValue, qValue); // Add the I2 and Q2 values
-
-    result = _mm_sqrt_ps(result);
-
-    result = _mm_mul_ps(result, vScalar);
-
-    _mm_store_ps(floatBuffer, result);
-    *magnitudeVectorPtr++ = (int16_t)(floatBuffer[0]);
-    *magnitudeVectorPtr++ = (int16_t)(floatBuffer[1]);
-    *magnitudeVectorPtr++ = (int16_t)(floatBuffer[2]);
-    *magnitudeVectorPtr++ = (int16_t)(floatBuffer[3]);
-  }
-
-  number = quarterPoints * 4;
-  magnitudeVectorPtr = &magnitudeVector[number];
-  for(; number < num_points; number++){
-    float val1Real = *complexVectorPtr++;
-    float val1Imag = *complexVectorPtr++;
-    *magnitudeVectorPtr++ = (int16_t)(sqrtf((val1Real * val1Real) + (val1Imag * val1Imag)) * scalar);
-  }
-}
-#endif /* LV_HAVE_SSE */
-
-#if LV_HAVE_GENERIC
-/*!
-  \brief Calculates the magnitude of the complexVector, scales the resulting value and stores the results in the magnitudeVector
-  \param complexVector The vector containing the complex input values
-  \param scalar The scale value multiplied to the magnitude of each complex vector
-  \param magnitudeVector The vector containing the real output values
-  \param num_points The number of complex values in complexVector to be calculated and stored into cVector
-*/
-static inline void volk_32fc_s32f_magnitude_16s_a16_generic(int16_t* magnitudeVector, const lv_32fc_t* complexVector, const float scalar, unsigned int num_points){
-  const float* complexVectorPtr = (float*)complexVector;
-  int16_t* magnitudeVectorPtr = magnitudeVector;
-  unsigned int number = 0;
-  for(number = 0; number < num_points; number++){
-    const float real = *complexVectorPtr++;
-    const float imag = *complexVectorPtr++;
-    *magnitudeVectorPtr++ = (int16_t)(sqrtf((real*real) + (imag*imag)) * scalar);
-  }
-}
-#endif /* LV_HAVE_GENERIC */
-
-#if LV_HAVE_ORC
-/*!
-  \brief Calculates the magnitude of the complexVector, scales the resulting value and stores the results in the magnitudeVector
-  \param complexVector The vector containing the complex input values
-  \param scalar The scale value multiplied to the magnitude of each complex vector
-  \param magnitudeVector The vector containing the real output values
-  \param num_points The number of complex values in complexVector to be calculated and stored into cVector
-*/
-extern void volk_32fc_s32f_magnitude_16s_a16_orc_impl(int16_t* magnitudeVector, const lv_32fc_t* complexVector, const float scalar, unsigned int num_points);
-static inline void volk_32fc_s32f_magnitude_16s_a16_orc(int16_t* magnitudeVector, const lv_32fc_t* complexVector, const float scalar, unsigned int num_points){
-    volk_32fc_s32f_magnitude_16s_a16_orc_impl(magnitudeVector, complexVector, scalar, num_points);
-}
-#endif /* LV_HAVE_ORC */
-
-
-#endif /* INCLUDED_volk_32fc_s32f_magnitude_16s_a16_H */
diff --git a/volk/include/volk/volk_32fc_s32f_s32f_power_spectral_density_32f_a16.h b/volk/include/volk/volk_32fc_s32f_s32f_power_spectral_density_32f_a16.h
deleted file mode 100644
index 29ccdaef7..000000000
--- a/volk/include/volk/volk_32fc_s32f_s32f_power_spectral_density_32f_a16.h
+++ /dev/null
@@ -1,134 +0,0 @@
-#ifndef INCLUDED_volk_32fc_s32f_s32f_power_spectral_density_32f_a16_H
-#define INCLUDED_volk_32fc_s32f_s32f_power_spectral_density_32f_a16_H
-
-#include <inttypes.h>
-#include <stdio.h>
-#include <math.h>
-
-#if LV_HAVE_SSE3
-#include <pmmintrin.h>
-
-#if LV_HAVE_LIB_SIMDMATH
-#include <simdmath.h>
-#endif /* LV_HAVE_LIB_SIMDMATH */
-
-/*!
-  \brief Calculates the log10 power value divided by the RBW for each input point
-  \param logPowerOutput The 10.0 * log10((r*r + i*i)/RBW) for each data point
-  \param complexFFTInput The complex data output from the FFT point
-  \param normalizationFactor This value is divided against all the input values before the power is calculated
-  \param rbw The resolution bandwith of the fft spectrum
-  \param num_points The number of fft data points
-*/
-static inline void volk_32fc_s32f_s32f_power_spectral_density_32f_a16_sse3(float* logPowerOutput, const lv_32fc_t* complexFFTInput, const float normalizationFactor, const float rbw, unsigned int num_points){
-  const float* inputPtr = (const float*)complexFFTInput;
-  float* destPtr = logPowerOutput;
-  uint64_t number = 0;
-  const float iRBW = 1.0 / rbw;
-  const float iNormalizationFactor = 1.0 / normalizationFactor;
-
-#if LV_HAVE_LIB_SIMDMATH
-  __m128 magScalar = _mm_set_ps1(10.0);
-  magScalar = _mm_div_ps(magScalar, logf4(magScalar));
-
-  __m128 invRBW = _mm_set_ps1(iRBW);
-  
-  __m128 invNormalizationFactor = _mm_set_ps1(iNormalizationFactor);
-
-  __m128 power;
-  __m128 input1, input2;
-  const uint64_t quarterPoints = num_points / 4;
-  for(;number < quarterPoints; number++){
-    // Load the complex values 
-    input1 =_mm_load_ps(inputPtr);
-    inputPtr += 4;
-    input2 =_mm_load_ps(inputPtr);
-    inputPtr += 4;
-    
-    // Apply the normalization factor
-    input1 = _mm_mul_ps(input1, invNormalizationFactor);
-    input2 = _mm_mul_ps(input2, invNormalizationFactor);
-
-    // Multiply each value by itself
-    // (r1*r1), (i1*i1), (r2*r2), (i2*i2)
-    input1 = _mm_mul_ps(input1, input1);
-    // (r3*r3), (i3*i3), (r4*r4), (i4*i4)
-    input2 = _mm_mul_ps(input2, input2);
-    
-    // Horizontal add, to add (r*r) + (i*i) for each complex value
-    // (r1*r1)+(i1*i1), (r2*r2) + (i2*i2), (r3*r3)+(i3*i3), (r4*r4)+(i4*i4)
-    power = _mm_hadd_ps(input1, input2);
-
-    // Divide by the rbw
-    power = _mm_mul_ps(power, invRBW);
-
-    // Calculate the natural log power
-    power = logf4(power);
-    
-    // Convert to log10 and multiply by 10.0
-    power = _mm_mul_ps(power, magScalar);
-    
-    // Store the floating point results
-    _mm_store_ps(destPtr, power);
-    
-    destPtr += 4;
-  }
-  
-  number = quarterPoints*4;  
-#endif /* LV_HAVE_LIB_SIMDMATH */
-  // Calculate the FFT for any remaining points
-  for(; number < num_points; number++){
-    // Calculate dBm
-    // 50 ohm load assumption
-    // 10 * log10 (v^2 / (2 * 50.0 * .001)) = 10 * log10( v^2 * 10)
-    // 75 ohm load assumption
-    // 10 * log10 (v^2 / (2 * 75.0 * .001)) = 10 * log10( v^2 * 15)
-    
-    const float real = *inputPtr++ * iNormalizationFactor;
-    const float imag = *inputPtr++ * iNormalizationFactor;
-
-    *destPtr = 10.0*log10f((((real * real) + (imag * imag)) + 1e-20) * iRBW);
-    destPtr++;
-  }
-  
-}
-#endif /* LV_HAVE_SSE3 */
-
-#if LV_HAVE_GENERIC
-/*!
-  \brief Calculates the log10 power value divided by the RBW for each input point
-  \param logPowerOutput The 10.0 * log10((r*r + i*i)/RBW) for each data point
-  \param complexFFTInput The complex data output from the FFT point
-  \param normalizationFactor This value is divided against all the input values before the power is calculated
-  \param rbw The resolution bandwith of the fft spectrum
-  \param num_points The number of fft data points
-*/
-static inline void volk_32fc_s32f_s32f_power_spectral_density_32f_a16_generic(float* logPowerOutput, const lv_32fc_t* complexFFTInput, const float normalizationFactor, const float rbw, unsigned int num_points){
-  // Calculate the Power of the complex point
-  const float* inputPtr = (float*)complexFFTInput;
-  float* realFFTDataPointsPtr = logPowerOutput;
-  unsigned int point;
-  const float invRBW = 1.0 / rbw;
-  const float iNormalizationFactor = 1.0 / normalizationFactor;
-
-  for(point = 0; point < num_points; point++){
-    // Calculate dBm
-    // 50 ohm load assumption
-    // 10 * log10 (v^2 / (2 * 50.0 * .001)) = 10 * log10( v^2 * 10)
-    // 75 ohm load assumption
-    // 10 * log10 (v^2 / (2 * 75.0 * .001)) = 10 * log10( v^2 * 15)
-
-    const float real = *inputPtr++ * iNormalizationFactor;
-    const float imag = *inputPtr++ * iNormalizationFactor;
-
-    *realFFTDataPointsPtr = 10.0*log10f((((real * real) + (imag * imag)) + 1e-20) * invRBW);
-    
-    realFFTDataPointsPtr++;
-  }
-}
-#endif /* LV_HAVE_GENERIC */
-
-
-
-
-#endif /* INCLUDED_volk_32fc_s32f_s32f_power_spectral_density_32f_a16_H */
diff --git a/volk/include/volk/volk_32fc_s32f_x2_power_spectral_density_32f_a16.h b/volk/include/volk/volk_32fc_s32f_x2_power_spectral_density_32f_a16.h
new file mode 100644
index 000000000..0120b5307
--- /dev/null
+++ b/volk/include/volk/volk_32fc_s32f_x2_power_spectral_density_32f_a16.h
@@ -0,0 +1,134 @@
+#ifndef INCLUDED_volk_32fc_s32f_x2_power_spectral_density_32f_a16_H
+#define INCLUDED_volk_32fc_s32f_x2_power_spectral_density_32f_a16_H
+
+#include <inttypes.h>
+#include <stdio.h>
+#include <math.h>
+
+#if LV_HAVE_SSE3
+#include <pmmintrin.h>
+
+#if LV_HAVE_LIB_SIMDMATH
+#include <simdmath.h>
+#endif /* LV_HAVE_LIB_SIMDMATH */
+
+/*!
+  \brief Calculates the log10 power value divided by the RBW for each input point
+  \param logPowerOutput The 10.0 * log10((r*r + i*i)/RBW) for each data point
+  \param complexFFTInput The complex data output from the FFT point
+  \param normalizationFactor Every input value is divided by this factor before the power is calculated
+  \param rbw The resolution bandwidth of the fft spectrum
+  \param num_points The number of fft data points
+*/
+static inline void volk_32fc_s32f_x2_power_spectral_density_32f_a16_sse3(float* logPowerOutput, const lv_32fc_t* complexFFTInput, const float normalizationFactor, const float rbw, unsigned int num_points){
+  const float* inputPtr = (const float*)complexFFTInput;
+  float* destPtr = logPowerOutput;
+  uint64_t number = 0;
+  const float iRBW = 1.0 / rbw; // reciprocals, so the loops multiply instead of divide
+  const float iNormalizationFactor = 1.0 / normalizationFactor;
+  // The vector path below is compiled only when the simdmath library (logf4) is available
+#if LV_HAVE_LIB_SIMDMATH
+  __m128 magScalar = _mm_set_ps1(10.0);
+  magScalar = _mm_div_ps(magScalar, logf4(magScalar)); // 10 / log(10): rescales the log taken below to 10*log10
+
+  __m128 invRBW = _mm_set_ps1(iRBW);
+  
+  __m128 invNormalizationFactor = _mm_set_ps1(iNormalizationFactor);
+
+  __m128 power;
+  __m128 input1, input2;
+  const uint64_t quarterPoints = num_points / 4;
+  for(;number < quarterPoints; number++){
+    // Load four complex values (aligned loads)
+    input1 =_mm_load_ps(inputPtr);
+    inputPtr += 4;
+    input2 =_mm_load_ps(inputPtr);
+    inputPtr += 4;
+    
+    // Apply the normalization factor
+    input1 = _mm_mul_ps(input1, invNormalizationFactor);
+    input2 = _mm_mul_ps(input2, invNormalizationFactor);
+
+    // Multiply each value by itself
+    // (r1*r1), (i1*i1), (r2*r2), (i2*i2)
+    input1 = _mm_mul_ps(input1, input1);
+    // (r3*r3), (i3*i3), (r4*r4), (i4*i4)
+    input2 = _mm_mul_ps(input2, input2);
+    
+    // Horizontal add, to add (r*r) + (i*i) for each complex value
+    // (r1*r1)+(i1*i1), (r2*r2) + (i2*i2), (r3*r3)+(i3*i3), (r4*r4)+(i4*i4)
+    power = _mm_hadd_ps(input1, input2);
+    // NOTE(review): the scalar loop adds 1e-20 before the log; this path does not - confirm
+    // Divide by the rbw
+    power = _mm_mul_ps(power, invRBW);
+
+    // Calculate the natural log power
+    power = logf4(power);
+    
+    // Convert to log10 and multiply by 10.0
+    power = _mm_mul_ps(power, magScalar);
+    
+    // Store the floating point results (aligned store)
+    _mm_store_ps(destPtr, power);
+    
+    destPtr += 4;
+  }
+  
+  number = quarterPoints*4;  // resume the scalar loop at the first unprocessed point
+#endif /* LV_HAVE_LIB_SIMDMATH */
+  // Calculate the log power for any remaining points (all points when simdmath is unavailable)
+  for(; number < num_points; number++){
+    // NOTE(review): the dBm load factors listed below are not applied by the code - confirm
+    // 50 ohm load assumption
+    // 10 * log10 (v^2 / (2 * 50.0 * .001)) = 10 * log10( v^2 * 10)
+    // 75 ohm load assumption
+    // 10 * log10 (v^2 / (2 * 75.0 * .001)) = 10 * log10( v^2 * 15)
+    
+    const float real = *inputPtr++ * iNormalizationFactor;
+    const float imag = *inputPtr++ * iNormalizationFactor;
+
+    *destPtr = 10.0*log10f((((real * real) + (imag * imag)) + 1e-20) * iRBW); // 1e-20 keeps the log argument above zero
+    destPtr++;
+  }
+  
+}
+#endif /* LV_HAVE_SSE3 */
+
+#if LV_HAVE_GENERIC
+/*!
+  \brief Calculates the log10 power value divided by the RBW for each input point
+  \param logPowerOutput The 10.0 * log10((r*r + i*i)/RBW) for each data point
+  \param complexFFTInput The complex data output from the FFT point
+  \param normalizationFactor Every input value is divided by this factor before the power is calculated
+  \param rbw The resolution bandwidth of the fft spectrum
+  \param num_points The number of fft data points
+*/
+static inline void volk_32fc_s32f_x2_power_spectral_density_32f_a16_generic(float* logPowerOutput, const lv_32fc_t* complexFFTInput, const float normalizationFactor, const float rbw, unsigned int num_points){
+  // Calculate the power of each complex point
+  const float* inputPtr = (float*)complexFFTInput; // walk the input as interleaved floats: real, imag, ...
+  float* realFFTDataPointsPtr = logPowerOutput;
+  unsigned int point;
+  const float invRBW = 1.0 / rbw; // reciprocals, so the loop multiplies instead of dividing
+  const float iNormalizationFactor = 1.0 / normalizationFactor;
+
+  for(point = 0; point < num_points; point++){
+    // NOTE(review): the dBm load factors listed below are not applied by the code - confirm
+    // 50 ohm load assumption
+    // 10 * log10 (v^2 / (2 * 50.0 * .001)) = 10 * log10( v^2 * 10)
+    // 75 ohm load assumption
+    // 10 * log10 (v^2 / (2 * 75.0 * .001)) = 10 * log10( v^2 * 15)
+
+    const float real = *inputPtr++ * iNormalizationFactor;
+    const float imag = *inputPtr++ * iNormalizationFactor;
+
+    *realFFTDataPointsPtr = 10.0*log10f((((real * real) + (imag * imag)) + 1e-20) * invRBW); // 1e-20 keeps the log argument above zero
+    
+    realFFTDataPointsPtr++;
+  }
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+
+
+#endif /* INCLUDED_volk_32fc_s32f_x2_power_spectral_density_32f_a16_H */
diff --git a/volk/include/volk/volk_32fc_x2_conjugate_dot_prod_32fc_a16.h b/volk/include/volk/volk_32fc_x2_conjugate_dot_prod_32fc_a16.h
new file mode 100644
index 000000000..a01971df3
--- /dev/null
+++ b/volk/include/volk/volk_32fc_x2_conjugate_dot_prod_32fc_a16.h
@@ -0,0 +1,344 @@
+#ifndef INCLUDED_volk_32fc_x2_conjugate_dot_prod_32fc_a16_H
+#define INCLUDED_volk_32fc_x2_conjugate_dot_prod_32fc_a16_H
+
+#include<volk/volk_complex.h>
+#include<stdio.h>
+
+
+#if LV_HAVE_GENERIC
+
+
+static inline void volk_32fc_x2_conjugate_dot_prod_32fc_a16_generic(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_bytes) {
+  /*
+    Conjugate dot product: *result = sum over k of input[k] * conj(taps[k]).
+    num_bytes is the byte length of each vector; every 8 bytes are treated as
+    one complex point, so num_bytes >> 3 points are processed.
+  */
+  float * out_f = (float*) result;
+  float * in_f = (float*) input;
+  float * tap_f = (float*) taps;
+  const unsigned int num_points = num_bytes >> 3;
+  const unsigned int num_pairs = num_bytes >> 4;
+
+  // Two independent accumulators, each holding {real, imaginary}:
+  // acc_a sums the first point of every pair, acc_b the second one.
+  float acc_a[2] = {0,0};
+  float acc_b[2] = {0,0};
+  unsigned int pair;
+
+  for(pair = 0; pair < num_pairs; ++pair) {
+    // Four floats = two complex points: {re0, im0, re1, im1}
+    const float * x = in_f + 4 * pair;
+    const float * h = tap_f + 4 * pair;
+
+    // (a + jb) * conj(c + jd) = (ac + bd) + j(bc - ad)
+    acc_a[0] += x[0] * h[0] + x[1] * h[1];
+    acc_a[1] += (-x[0] * h[1]) + x[1] * h[0];
+    acc_b[0] += x[2] * h[2] + x[3] * h[3];
+    acc_b[1] += (-x[2] * h[3]) + x[3] * h[2];
+  }
+
+  // Merge the two partial sums and write them over whatever *result held
+  out_f[0] = acc_a[0] + acc_b[0];
+  out_f[1] = acc_a[1] + acc_b[1];
+
+  // An odd point count leaves the final point outside the paired loop
+  if(num_points & 1) {
+    *result += input[num_points - 1] * lv_conj(taps[num_points - 1]);
+  }
+
+  /*
+    Disabled single-accumulator form of the same sum, one point at a time
+    (it would need *result zeroed first):
+      for(i = 0; i < num_bytes >> 3; ++i) {
+        *result += input[i] * conjf(taps[i]);
+      }
+  */
+}
+
+#endif /*LV_HAVE_GENERIC*/
+
+
+#if LV_HAVE_SSE && LV_HAVE_64
+
+
+static inline void volk_32fc_x2_conjugate_dot_prod_32fc_a16_sse(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_bytes) {
+  // Conjugate dot product, x86-64 SSE inline assembly: *result = sum over k of input[k] * conj(taps[k]).
+  static const uint32_t conjugator[4] __attribute__((aligned(16)))= {0x00000000, 0x80000000, 0x00000000, 0x80000000};
+  // XORing a taps vector with 'conjugator' flips the sign bit of both imaginary lanes, i.e. conjugates two taps at once.
+  // (The pseudo-code at the top of the asm text shows the plain complex product; the conjugation comes from that XOR.)
+  // num_bytes is copied with 32-bit moves: they zero-extend, whereas the upper half of %rcx is unspecified for an unsigned int.
+  // The asm reads input/taps from memory and stores the 8-byte sum to *result, so it declares "memory", "cc" and every xmm register it uses.
+  asm volatile 
+    (
+     "#  ccomplex_conjugate_dotprod_generic (float* result, const float *input,\n\t"
+     "#                         const float *taps, unsigned num_bytes)\n\t"
+     "#    float sum0 = 0;\n\t"
+     "#    float sum1 = 0;\n\t"
+     "#    float sum2 = 0;\n\t"
+     "#    float sum3 = 0;\n\t"
+     "#    do {\n\t"
+     "#      sum0 += input[0] * taps[0] - input[1] * taps[1];\n\t"
+     "#      sum1 += input[0] * taps[1] + input[1] * taps[0];\n\t"
+     "#      sum2 += input[2] * taps[2] - input[3] * taps[3];\n\t"
+     "#      sum3 += input[2] * taps[3] + input[3] * taps[2];\n\t"
+     "#      input += 4;\n\t"
+     "#      taps += 4;  \n\t"
+     "#    } while (--n_2_ccomplex_blocks != 0);\n\t"
+     "#    result[0] = sum0 + sum2;\n\t"
+     "#    result[1] = sum1 + sum3;\n\t"
+     "# TODO: prefetch and better scheduling\n\t"
+     "  xor    %%r9,  %%r9\n\t"
+     "  xor    %%r10, %%r10\n\t"
+     "  movq   %[conjugator], %%r9\n\t"
+     "  movl   %%ecx, %%eax\n\t"
+     "  movaps 0(%%r9), %%xmm8\n\t"
+     "  movl   %%ecx, %%r8d\n\t"
+     "  movq   %[rsi],  %%r9\n\t"
+     "  movq   %[rdx], %%r10\n\t"
+     "	xorps	%%xmm6, %%xmm6		# zero accumulators\n\t"
+     "	movaps	0(%%r9), %%xmm0\n\t"
+     "	xorps	%%xmm7, %%xmm7		# zero accumulators\n\t"
+     "	movups	0(%%r10), %%xmm2\n\t"
+     "	shr	$5, %%rax		# rax = n_2_ccomplex_blocks / 2\n\t"
+     "  shr     $4, %%r8\n\t"
+     "  xorps  %%xmm8, %%xmm2\n\t"
+     "	jmp	.%=L1_test\n\t"
+     "	# 4 taps / loop\n\t"
+     "	# something like ?? cycles / loop\n\t"
+     ".%=Loop1:	\n\t"
+     "# complex prod: C += A * B,  w/ temp Z & Y (or B), xmmPN=$0x8000000080000000\n\t"
+     "#	movaps	(%%r9), %%xmmA\n\t"
+     "#	movaps	(%%r10), %%xmmB\n\t"
+     "#	movaps	%%xmmA, %%xmmZ\n\t"
+     "#	shufps	$0xb1, %%xmmZ, %%xmmZ	# swap internals\n\t"
+     "#	mulps	%%xmmB, %%xmmA\n\t"
+     "#	mulps	%%xmmZ, %%xmmB\n\t"
+     "#	# SSE replacement for: pfpnacc %%xmmB, %%xmmA\n\t"
+     "#	xorps	%%xmmPN, %%xmmA\n\t"
+     "#	movaps	%%xmmA, %%xmmZ\n\t"
+     "#	unpcklps %%xmmB, %%xmmA\n\t"
+     "#	unpckhps %%xmmB, %%xmmZ\n\t"
+     "#	movaps	%%xmmZ, %%xmmY\n\t"
+     "#	shufps	$0x44, %%xmmA, %%xmmZ	# b01000100\n\t"
+     "#	shufps	$0xee, %%xmmY, %%xmmA	# b11101110\n\t"
+     "#	addps	%%xmmZ, %%xmmA\n\t"
+     "#	addps	%%xmmA, %%xmmC\n\t"
+     "# A=xmm0, B=xmm2, Z=xmm4\n\t"
+     "# A'=xmm1, B'=xmm3, Z'=xmm5\n\t"
+     "	movaps	16(%%r9), %%xmm1\n\t"
+     "	movaps	%%xmm0, %%xmm4\n\t"
+     "	mulps	%%xmm2, %%xmm0\n\t"
+     "	shufps	$0xb1, %%xmm4, %%xmm4	# swap internals\n\t"
+     "	movaps	16(%%r10), %%xmm3\n\t"
+     "	movaps	%%xmm1, %%xmm5\n\t"
+     "  xorps   %%xmm8, %%xmm3\n\t"
+     "	addps	%%xmm0, %%xmm6\n\t"
+     "	mulps	%%xmm3, %%xmm1\n\t"
+     "	shufps	$0xb1, %%xmm5, %%xmm5	# swap internals\n\t"
+     "	addps	%%xmm1, %%xmm6\n\t"
+     "	mulps	%%xmm4, %%xmm2\n\t"
+     "	movaps	32(%%r9), %%xmm0\n\t"
+     "	addps	%%xmm2, %%xmm7\n\t"
+     "	mulps	%%xmm5, %%xmm3\n\t"
+     "	add	$32, %%r9\n\t"
+     "	movaps	32(%%r10), %%xmm2\n\t"
+     "	addps	%%xmm3, %%xmm7\n\t"
+     "	add	$32, %%r10\n\t"
+     "  xorps   %%xmm8, %%xmm2\n\t"
+     ".%=L1_test:\n\t"
+     "	dec	%%rax\n\t"
+     "	jge	.%=Loop1\n\t"
+     "	# We've handled the bulk of multiplies up to here.\n\t"
+     "	# Let's sse if original n_2_ccomplex_blocks was odd.\n\t"
+     "	# If so, we've got 2 more taps to do.\n\t"
+     "	and	$1, %%r8\n\t"
+     "	je	.%=Leven\n\t"
+     "	# The count was odd, do 2 more taps.\n\t"
+     "	# Note that we've already got mm0/mm2 preloaded\n\t"
+     "	# from the main loop.\n\t"
+     "	movaps	%%xmm0, %%xmm4\n\t"
+     "	mulps	%%xmm2, %%xmm0\n\t"
+     "	shufps	$0xb1, %%xmm4, %%xmm4	# swap internals\n\t"
+     "	addps	%%xmm0, %%xmm6\n\t"
+     "	mulps	%%xmm4, %%xmm2\n\t"
+     "	addps	%%xmm2, %%xmm7\n\t"
+     ".%=Leven:\n\t"
+     "	# neg inversor\n\t"
+     "	xorps	%%xmm1, %%xmm1\n\t"
+     "	mov	$0x80000000, %%r9\n\t"
+     "	movd	%%r9, %%xmm1\n\t"
+     "	shufps	$0x11, %%xmm1, %%xmm1	# b00010001 # 0 -0 0 -0\n\t"
+     "	# pfpnacc\n\t"
+     "	xorps	%%xmm1, %%xmm6\n\t"
+     "	movaps	%%xmm6, %%xmm2\n\t"
+     "	unpcklps %%xmm7, %%xmm6\n\t"
+     "	unpckhps %%xmm7, %%xmm2\n\t"
+     "	movaps	%%xmm2, %%xmm3\n\t"
+     "	shufps	$0x44, %%xmm6, %%xmm2	# b01000100\n\t"
+     "	shufps	$0xee, %%xmm3, %%xmm6	# b11101110\n\t"
+     "	addps	%%xmm2, %%xmm6\n\t"
+     "					# xmm6 = r1 i2 r3 i4\n\t"
+     "	movhlps	%%xmm6, %%xmm4		# xmm4 = r3 i4 ?? ??\n\t"
+     "	addps	%%xmm4, %%xmm6		# xmm6 = r1+r3 i2+i4 ?? ??\n\t"
+     "	movlps	%%xmm6, (%[rdi])		# store low 2x32 bits (complex) to memory\n\t"
+     :
+     :[rsi] "r" (input), [rdx] "r" (taps), "c" (num_bytes), [rdi] "r" (result), [conjugator] "r" (conjugator)
+     :"rax", "r8", "r9", "r10", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "memory", "cc"
+     );
+  // The asm covers whole 16-byte blocks (two complex points each). When 8 more bytes remain,
+  // the point count is odd and the last point is folded in by the loop below.
+  int getem = num_bytes % 16;
+
+
+  for(; getem > 0; getem -= 8) {
+
+
+    *result += (input[(num_bytes >> 3) - 1] * lv_conj(taps[(num_bytes >> 3) - 1]));
+
+  }
+
+  return;
+}  
+#endif
+
+#if LV_HAVE_SSE && LV_HAVE_32
+static inline void volk_32fc_x2_conjugate_dot_prod_32fc_a16_sse_32(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_bytes) {
+  // Conjugate dot product for 32-bit x86 SSE: *result = sum over k of input[k] * conj(taps[k]).
+  // XORing a taps vector with this mask flips the sign bit of both imaginary lanes (conjugation).
+  static const uint32_t conjugator[4] __attribute__((aligned(16)))= {0x00000000, 0x80000000, 0x00000000, 0x80000000};
+
+  int bound = num_bytes >> 4;
+  int leftovers = num_bytes % 16;
+
+  // The asm advances both data pointers and consumes the counter, so it works on
+  // scratch copies declared as read-write operands; the parameters stay intact.
+  const lv_32fc_t* in_cursor = input;
+  const lv_32fc_t* tap_cursor = taps;
+  unsigned int counter = num_bytes;
+
+  asm volatile 
+    (
+     "	#pushl	%%ebp\n\t"
+     "	#movl	%%esp, %%ebp\n\t"
+     "	#movl	12(%%ebp), %%eax		# input\n\t"
+     "	#movl	16(%%ebp), %%edx		# taps\n\t"
+     "	#movl	20(%%ebp), %%ecx                # n_bytes\n\t"
+     "  movaps  0(%[conjugator]), %%xmm1\n\t"
+     "	xorps	%%xmm6, %%xmm6		# zero accumulators\n\t"
+     "	movaps	0(%[eax]), %%xmm0\n\t"
+     "	xorps	%%xmm7, %%xmm7		# zero accumulators\n\t"
+     "	movaps	0(%[edx]), %%xmm2\n\t"
+     "  movl    %[ecx], (%[out])\n\t"
+     "	shrl	$5, %[ecx]		# ecx = n_2_ccomplex_blocks / 2\n\t"
+     "  xorps   %%xmm1, %%xmm2\n\t"
+     "	jmp	.%=L1_test\n\t"
+     "	# 4 taps / loop\n\t"
+     "	# something like ?? cycles / loop\n\t"
+     ".%=Loop1:	\n\t"
+     "# complex prod: C += A * B,  w/ temp Z & Y (or B), xmmPN=$0x8000000080000000\n\t"
+     "#	movaps	(%[eax]), %%xmmA\n\t"
+     "#	movaps	(%[edx]), %%xmmB\n\t"
+     "#	movaps	%%xmmA, %%xmmZ\n\t"
+     "#	shufps	$0xb1, %%xmmZ, %%xmmZ	# swap internals\n\t"
+     "#	mulps	%%xmmB, %%xmmA\n\t"
+     "#	mulps	%%xmmZ, %%xmmB\n\t"
+     "#	# SSE replacement for: pfpnacc %%xmmB, %%xmmA\n\t"
+     "#	xorps	%%xmmPN, %%xmmA\n\t"
+     "#	movaps	%%xmmA, %%xmmZ\n\t"
+     "#	unpcklps %%xmmB, %%xmmA\n\t"
+     "#	unpckhps %%xmmB, %%xmmZ\n\t"
+     "#	movaps	%%xmmZ, %%xmmY\n\t"
+     "#	shufps	$0x44, %%xmmA, %%xmmZ	# b01000100\n\t"
+     "#	shufps	$0xee, %%xmmY, %%xmmA	# b11101110\n\t"
+     "#	addps	%%xmmZ, %%xmmA\n\t"
+     "#	addps	%%xmmA, %%xmmC\n\t"
+     "# A=xmm0, B=xmm2, Z=xmm4\n\t"
+     "# A'=xmm1, B'=xmm3, Z'=xmm5\n\t"
+     "	movaps	16(%[edx]), %%xmm3\n\t"
+     "	movaps	%%xmm0, %%xmm4\n\t"
+     "  xorps   %%xmm1, %%xmm3\n\t"
+     "	mulps	%%xmm2, %%xmm0\n\t"
+     "	movaps	16(%[eax]), %%xmm1\n\t"
+     "	shufps	$0xb1, %%xmm4, %%xmm4	# swap internals\n\t"
+     "	movaps	%%xmm1, %%xmm5\n\t"
+     "	addps	%%xmm0, %%xmm6\n\t"
+     "	mulps	%%xmm3, %%xmm1\n\t"
+     "	shufps	$0xb1, %%xmm5, %%xmm5	# swap internals\n\t"
+     "	addps	%%xmm1, %%xmm6\n\t"
+     "  movaps  0(%[conjugator]), %%xmm1\n\t"
+     "	mulps	%%xmm4, %%xmm2\n\t"
+     "	movaps	32(%[eax]), %%xmm0\n\t"
+     "	addps	%%xmm2, %%xmm7\n\t"
+     "	mulps	%%xmm5, %%xmm3\n\t"
+     "	addl	$32, %[eax]\n\t"
+     "	movaps	32(%[edx]), %%xmm2\n\t"
+     "	addps	%%xmm3, %%xmm7\n\t"
+     "  xorps   %%xmm1, %%xmm2\n\t"
+     "	addl	$32, %[edx]\n\t"
+     ".%=L1_test:\n\t"
+     "	decl	%[ecx]\n\t"
+     "	jge	.%=Loop1\n\t"
+     "	# We've handled the bulk of multiplies up to here.\n\t"
+     "	# Let's sse if original n_2_ccomplex_blocks was odd.\n\t"
+     "	# If so, we've got 2 more taps to do.\n\t"
+     "	movl	0(%[out]), %[ecx]		# n_2_ccomplex_blocks\n\t"
+     "  shrl    $4, %[ecx]\n\t"
+     "	andl	$1, %[ecx]\n\t"
+     "	je	.%=Leven\n\t"
+     "	# The count was odd, do 2 more taps.\n\t"
+     "	# Note that we've already got mm0/mm2 preloaded\n\t"
+     "	# from the main loop.\n\t"
+     "	movaps	%%xmm0, %%xmm4\n\t"
+     "	mulps	%%xmm2, %%xmm0\n\t"
+     "	shufps	$0xb1, %%xmm4, %%xmm4	# swap internals\n\t"
+     "	addps	%%xmm0, %%xmm6\n\t"
+     "	mulps	%%xmm4, %%xmm2\n\t"
+     "	addps	%%xmm2, %%xmm7\n\t"
+     ".%=Leven:\n\t"
+     "	# neg inversor\n\t"
+     "  #movl 8(%%ebp), %[eax] \n\t"
+     "	xorps	%%xmm1, %%xmm1\n\t"
+     "  movl	$0x80000000, (%[out])\n\t"
+     "	movss	(%[out]), %%xmm1\n\t"
+     "	shufps	$0x11, %%xmm1, %%xmm1	# b00010001 # 0 -0 0 -0\n\t"
+     "	# pfpnacc\n\t"
+     "	xorps	%%xmm1, %%xmm6\n\t"
+     "	movaps	%%xmm6, %%xmm2\n\t"
+     "	unpcklps %%xmm7, %%xmm6\n\t"
+     "	unpckhps %%xmm7, %%xmm2\n\t"
+     "	movaps	%%xmm2, %%xmm3\n\t"
+     "	shufps	$0x44, %%xmm6, %%xmm2	# b01000100\n\t"
+     "	shufps	$0xee, %%xmm3, %%xmm6	# b11101110\n\t"
+     "	addps	%%xmm2, %%xmm6\n\t"
+     "					# xmm6 = r1 i2 r3 i4\n\t"
+     "	#movl	8(%%ebp), %[eax]		# @result\n\t"
+     "	movhlps	%%xmm6, %%xmm4		# xmm4 = r3 i4 ?? ??\n\t"
+     "	addps	%%xmm4, %%xmm6		# xmm6 = r1+r3 i2+i4 ?? ??\n\t"
+     "	movlps	%%xmm6, (%[out])		# store low 2x32 bits (complex) to memory\n\t"
+     "	#popl	%%ebp\n\t"
+     : [eax] "+r" (in_cursor), [edx] "+r" (tap_cursor), [ecx] "+r" (counter)
+     : [out] "r" (result), [conjugator] "r" (conjugator)
+     : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
+     );
+
+  // The asm covers whole 16-byte blocks (two complex points each); 8 leftover bytes mean
+  // one more point at index 2 * bound, which is added here.
+  for(; leftovers > 0; leftovers -= 8) {
+
+    *result += (input[(bound << 1)] * lv_conj(taps[(bound << 1)]));
+
+  }
+
+  return;
+
+
+
+
+}
+
+#endif /*LV_HAVE_SSE*/  
+
+
+
+#endif /*INCLUDED_volk_32fc_x2_conjugate_dot_prod_32fc_a16_H*/
diff --git a/volk/include/volk/volk_32fc_x2_dot_prod_32fc_a16.h b/volk/include/volk/volk_32fc_x2_dot_prod_32fc_a16.h
new file mode 100644
index 000000000..9a7b65ab4
--- /dev/null
+++ b/volk/include/volk/volk_32fc_x2_dot_prod_32fc_a16.h
@@ -0,0 +1,468 @@
+#ifndef INCLUDED_volk_32fc_x2_dot_prod_32fc_a16_H
+#define INCLUDED_volk_32fc_x2_dot_prod_32fc_a16_H
+
+#include <volk/volk_complex.h>
+#include <stdio.h>
+#include <string.h>
+
+
+#if LV_HAVE_GENERIC 
+
+
+static inline void volk_32fc_x2_dot_prod_32fc_a16_generic(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_bytes) {
+  /*
+    Complex dot product: *result = sum over k of input[k] * taps[k].
+    num_bytes is the byte length of each vector; every 8 bytes are treated as
+    one complex point, so num_bytes >> 3 points are processed.
+    Whatever *result held on entry is overwritten.
+  */
+
+  float * out_f = (float*) result;
+  float * in_f = (float*) input;
+  float * tap_f = (float*) taps;
+  const unsigned int num_points = num_bytes >> 3;
+  const unsigned int num_pairs = num_bytes >> 4;
+
+  // Two independent accumulators, each holding {real, imaginary}:
+  // acc_a sums the first point of every pair, acc_b the second one.
+  float acc_a[2] = {0,0};
+  float acc_b[2] = {0,0};
+  unsigned int pair;
+
+  for(pair = 0; pair < num_pairs; ++pair) {
+    // Four floats = two complex points: {re0, im0, re1, im1}
+    const float * x = in_f + 4 * pair;
+    const float * h = tap_f + 4 * pair;
+
+    // (a + jb) * (c + jd) = (ac - bd) + j(ad + bc)
+    acc_a[0] += x[0] * h[0] - x[1] * h[1];
+    acc_a[1] += x[0] * h[1] + x[1] * h[0];
+    acc_b[0] += x[2] * h[2] - x[3] * h[3];
+    acc_b[1] += x[2] * h[3] + x[3] * h[2];
+  }
+
+  // Merge the two partial sums into the output
+  out_f[0] = acc_a[0] + acc_b[0];
+  out_f[1] = acc_a[1] + acc_b[1];
+
+  // An odd point count leaves the final point outside the paired loop;
+  // it is added with ordinary complex arithmetic.
+  if(num_points & 1) {
+    *result += input[num_points - 1] * taps[num_points - 1];
+  }
+
+}
+
+#endif /*LV_HAVE_GENERIC*/
+
+
+#if LV_HAVE_SSE && LV_HAVE_64
+
+
+static inline void volk_32fc_x2_dot_prod_32fc_a16_sse_64(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_bytes) {
+  // Complex dot product, x86-64 SSE inline assembly: *result = sum over k of input[k] * taps[k].
+  // num_bytes is copied with 32-bit moves (zero-extending; the upper half of %rcx is unspecified); memory, flags and xmm0-xmm7 are declared clobbered.
+  asm volatile
+    (
+     "#  ccomplex_dotprod_generic (float* result, const float *input,\n\t"
+     "#                         const float *taps, unsigned num_bytes)\n\t"
+     "#    float sum0 = 0;\n\t"
+     "#    float sum1 = 0;\n\t"
+     "#    float sum2 = 0;\n\t"
+     "#    float sum3 = 0;\n\t"
+     "#    do {\n\t"
+     "#      sum0 += input[0] * taps[0] - input[1] * taps[1];\n\t"
+     "#      sum1 += input[0] * taps[1] + input[1] * taps[0];\n\t"
+     "#      sum2 += input[2] * taps[2] - input[3] * taps[3];\n\t"
+     "#      sum3 += input[2] * taps[3] + input[3] * taps[2];\n\t"
+     "#      input += 4;\n\t"
+     "#      taps += 4;  \n\t"
+     "#    } while (--n_2_ccomplex_blocks != 0);\n\t"
+     "#    result[0] = sum0 + sum2;\n\t"
+     "#    result[1] = sum1 + sum3;\n\t"
+     "# TODO: prefetch and better scheduling\n\t"
+     "  xor    %%r9,  %%r9\n\t"
+     "  xor    %%r10, %%r10\n\t"
+     "  movl   %%ecx, %%eax\n\t"
+     "  movl   %%ecx, %%r8d\n\t"
+     "  movq   %[rsi],  %%r9\n\t"
+     "  movq   %[rdx], %%r10\n\t"
+     "	xorps	%%xmm6, %%xmm6		# zero accumulators\n\t"
+     "	movaps	0(%%r9), %%xmm0\n\t"
+     "	xorps	%%xmm7, %%xmm7		# zero accumulators\n\t"
+     "	movaps	0(%%r10), %%xmm2\n\t"
+     "	shr	$5, %%rax		# rax = n_2_ccomplex_blocks / 2\n\t"
+     "  shr     $4, %%r8\n\t"
+     "	jmp	.%=L1_test\n\t"
+     "	# 4 taps / loop\n\t"
+     "	# something like ?? cycles / loop\n\t"
+     ".%=Loop1:	\n\t"
+     "# complex prod: C += A * B,  w/ temp Z & Y (or B), xmmPN=$0x8000000080000000\n\t"
+     "#	movaps	(%%r9), %%xmmA\n\t"
+     "#	movaps	(%%r10), %%xmmB\n\t"
+     "#	movaps	%%xmmA, %%xmmZ\n\t"
+     "#	shufps	$0xb1, %%xmmZ, %%xmmZ	# swap internals\n\t"
+     "#	mulps	%%xmmB, %%xmmA\n\t"
+     "#	mulps	%%xmmZ, %%xmmB\n\t"
+     "#	# SSE replacement for: pfpnacc %%xmmB, %%xmmA\n\t"
+     "#	xorps	%%xmmPN, %%xmmA\n\t"
+     "#	movaps	%%xmmA, %%xmmZ\n\t"
+     "#	unpcklps %%xmmB, %%xmmA\n\t"
+     "#	unpckhps %%xmmB, %%xmmZ\n\t"
+     "#	movaps	%%xmmZ, %%xmmY\n\t"
+     "#	shufps	$0x44, %%xmmA, %%xmmZ	# b01000100\n\t"
+     "#	shufps	$0xee, %%xmmY, %%xmmA	# b11101110\n\t"
+     "#	addps	%%xmmZ, %%xmmA\n\t"
+     "#	addps	%%xmmA, %%xmmC\n\t"
+     "# A=xmm0, B=xmm2, Z=xmm4\n\t"
+     "# A'=xmm1, B'=xmm3, Z'=xmm5\n\t"
+     "	movaps	16(%%r9), %%xmm1\n\t"
+     "	movaps	%%xmm0, %%xmm4\n\t"
+     "	mulps	%%xmm2, %%xmm0\n\t"
+     "	shufps	$0xb1, %%xmm4, %%xmm4	# swap internals\n\t"
+     "	movaps	16(%%r10), %%xmm3\n\t"
+     "	movaps	%%xmm1, %%xmm5\n\t"
+     "	addps	%%xmm0, %%xmm6\n\t"
+     "	mulps	%%xmm3, %%xmm1\n\t"
+     "	shufps	$0xb1, %%xmm5, %%xmm5	# swap internals\n\t"
+     "	addps	%%xmm1, %%xmm6\n\t"
+     "	mulps	%%xmm4, %%xmm2\n\t"
+     "	movaps	32(%%r9), %%xmm0\n\t"
+     "	addps	%%xmm2, %%xmm7\n\t"
+     "	mulps	%%xmm5, %%xmm3\n\t"
+     "	add	$32, %%r9\n\t"
+     "	movaps	32(%%r10), %%xmm2\n\t"
+     "	addps	%%xmm3, %%xmm7\n\t"
+     "	add	$32, %%r10\n\t"
+     ".%=L1_test:\n\t"
+     "	dec	%%rax\n\t"
+     "	jge	.%=Loop1\n\t"
+     "	# We've handled the bulk of multiplies up to here.\n\t"
+     "	# Let's sse if original n_2_ccomplex_blocks was odd.\n\t"
+     "	# If so, we've got 2 more taps to do.\n\t"
+     "	and	$1, %%r8\n\t"
+     "	je	.%=Leven\n\t"
+     "	# The count was odd, do 2 more taps.\n\t"
+     "	# Note that we've already got mm0/mm2 preloaded\n\t"
+     "	# from the main loop.\n\t"
+     "	movaps	%%xmm0, %%xmm4\n\t"
+     "	mulps	%%xmm2, %%xmm0\n\t"
+     "	shufps	$0xb1, %%xmm4, %%xmm4	# swap internals\n\t"
+     "	addps	%%xmm0, %%xmm6\n\t"
+     "	mulps	%%xmm4, %%xmm2\n\t"
+     "	addps	%%xmm2, %%xmm7\n\t"
+     ".%=Leven:\n\t"
+     "	# neg inversor\n\t"
+     "	xorps	%%xmm1, %%xmm1\n\t"
+     "	mov	$0x80000000, %%r9\n\t"
+     "	movd	%%r9, %%xmm1\n\t"
+     "	shufps	$0x11, %%xmm1, %%xmm1	# b00010001 # 0 -0 0 -0\n\t"
+     "	# pfpnacc\n\t"
+     "	xorps	%%xmm1, %%xmm6\n\t"
+     "	movaps	%%xmm6, %%xmm2\n\t"
+     "	unpcklps %%xmm7, %%xmm6\n\t"
+     "	unpckhps %%xmm7, %%xmm2\n\t"
+     "	movaps	%%xmm2, %%xmm3\n\t"
+     "	shufps	$0x44, %%xmm6, %%xmm2	# b01000100\n\t"
+     "	shufps	$0xee, %%xmm3, %%xmm6	# b11101110\n\t"
+     "	addps	%%xmm2, %%xmm6\n\t"
+     "					# xmm6 = r1 i2 r3 i4\n\t"
+     "	movhlps	%%xmm6, %%xmm4		# xmm4 = r3 i4 ?? ??\n\t"
+     "	addps	%%xmm4, %%xmm6		# xmm6 = r1+r3 i2+i4 ?? ??\n\t"
+     "	movlps	%%xmm6, (%[rdi])		# store low 2x32 bits (complex) to memory\n\t"
+     :
+     :[rsi] "r" (input), [rdx] "r" (taps), "c" (num_bytes), [rdi] "r" (result)
+     :"rax", "r8", "r9", "r10", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7", "memory", "cc"
+     );
+  // The asm covers whole 16-byte blocks (two complex points each). When 8 more bytes remain,
+  // the point count is odd and the last point is folded in by the loop below.
+  int getem = num_bytes % 16;
+
+
+  for(; getem > 0; getem -= 8) {
+
+
+    *result += (input[(num_bytes >> 3) - 1] * taps[(num_bytes >> 3) - 1]);
+
+  }
+
+  return;
+
+}
+
+#endif
+
+#if LV_HAVE_SSE && LV_HAVE_32
+
+static inline void volk_32fc_x2_dot_prod_32fc_a16_sse_32(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_bytes) {
+  // Complex dot product for 32-bit x86 SSE: *result = sum over k of input[k] * taps[k].
+  //
+  // Every value reaches the asm as a named operand. Fixed %ebp-relative argument offsets
+  // are deliberately avoided: they are only valid when the function is emitted out of line
+  // with a frame pointer, which a static inline function cannot rely on.
+  // The asm advances both data pointers and consumes the counter, so those three are
+  // scratch copies declared as read-write operands; the parameters themselves stay intact.
+  const lv_32fc_t* in_cursor = input;
+  const lv_32fc_t* tap_cursor = taps;
+  unsigned int counter = num_bytes;
+
+  asm volatile 
+    (
+     "	xorps	%%xmm6, %%xmm6		# zero accumulators\n\t"
+     "	movaps	0(%[eax]), %%xmm0\n\t"
+     "	xorps	%%xmm7, %%xmm7		# zero accumulators\n\t"
+     "	movaps	0(%[edx]), %%xmm2\n\t"
+     "	movl	%[ecx], (%[out])		# park n_bytes in *result until the odd-block test\n\t"
+     "	shrl	$5, %[ecx]		# ecx = n_2_ccomplex_blocks / 2\n\t"
+     "	jmp	.%=L1_test\n\t"
+     "	# 4 taps / loop\n\t"
+     "	# something like ?? cycles / loop\n\t"
+     ".%=Loop1:	\n\t"
+     "# complex prod: C += A * B,  w/ temp Z & Y (or B), xmmPN=$0x8000000080000000\n\t"
+     "#	movaps	(%[eax]), %%xmmA\n\t"
+     "#	movaps	(%[edx]), %%xmmB\n\t"
+     "#	movaps	%%xmmA, %%xmmZ\n\t"
+     "#	shufps	$0xb1, %%xmmZ, %%xmmZ	# swap internals\n\t"
+     "#	mulps	%%xmmB, %%xmmA\n\t"
+     "#	mulps	%%xmmZ, %%xmmB\n\t"
+     "#	# SSE replacement for: pfpnacc %%xmmB, %%xmmA\n\t"
+     "#	xorps	%%xmmPN, %%xmmA\n\t"
+     "#	movaps	%%xmmA, %%xmmZ\n\t"
+     "#	unpcklps %%xmmB, %%xmmA\n\t"
+     "#	unpckhps %%xmmB, %%xmmZ\n\t"
+     "#	movaps	%%xmmZ, %%xmmY\n\t"
+     "#	shufps	$0x44, %%xmmA, %%xmmZ	# b01000100\n\t"
+     "#	shufps	$0xee, %%xmmY, %%xmmA	# b11101110\n\t"
+     "#	addps	%%xmmZ, %%xmmA\n\t"
+     "#	addps	%%xmmA, %%xmmC\n\t"
+     "# A=xmm0, B=xmm2, Z=xmm4\n\t"
+     "# A'=xmm1, B'=xmm3, Z'=xmm5\n\t"
+     "	movaps	16(%[eax]), %%xmm1\n\t"
+     "	movaps	%%xmm0, %%xmm4\n\t"
+     "	mulps	%%xmm2, %%xmm0\n\t"
+     "	shufps	$0xb1, %%xmm4, %%xmm4	# swap internals\n\t"
+     "	movaps	16(%[edx]), %%xmm3\n\t"
+     "	movaps	%%xmm1, %%xmm5\n\t"
+     "	addps	%%xmm0, %%xmm6\n\t"
+     "	mulps	%%xmm3, %%xmm1\n\t"
+     "	shufps	$0xb1, %%xmm5, %%xmm5	# swap internals\n\t"
+     "	addps	%%xmm1, %%xmm6\n\t"
+     "	mulps	%%xmm4, %%xmm2\n\t"
+     "	movaps	32(%[eax]), %%xmm0\n\t"
+     "	addps	%%xmm2, %%xmm7\n\t"
+     "	mulps	%%xmm5, %%xmm3\n\t"
+     "	addl	$32, %[eax]\n\t"
+     "	movaps	32(%[edx]), %%xmm2\n\t"
+     "	addps	%%xmm3, %%xmm7\n\t"
+     "	addl	$32, %[edx]\n\t"
+     ".%=L1_test:\n\t"
+     "	decl	%[ecx]\n\t"
+     "	jge	.%=Loop1\n\t"
+     "	# We've handled the bulk of multiplies up to here.\n\t"
+     "	# Let's sse if original n_2_ccomplex_blocks was odd.\n\t"
+     "	# If so, we've got 2 more taps to do.\n\t"
+     "	movl	0(%[out]), %[ecx]		# n_2_ccomplex_blocks\n\t"
+     "  shrl    $4, %[ecx]\n\t"
+     "	andl	$1, %[ecx]\n\t"
+     "	je	.%=Leven\n\t"
+     "	# The count was odd, do 2 more taps.\n\t"
+     "	# Note that we've already got mm0/mm2 preloaded\n\t"
+     "	# from the main loop.\n\t"
+     "	movaps	%%xmm0, %%xmm4\n\t"
+     "	mulps	%%xmm2, %%xmm0\n\t"
+     "	shufps	$0xb1, %%xmm4, %%xmm4	# swap internals\n\t"
+     "	addps	%%xmm0, %%xmm6\n\t"
+     "	mulps	%%xmm4, %%xmm2\n\t"
+     "	addps	%%xmm2, %%xmm7\n\t"
+     ".%=Leven:\n\t"
+     "	# neg inversor\n\t"
+     "	xorps	%%xmm1, %%xmm1\n\t"
+     "  movl	$0x80000000, (%[out])\n\t"
+     "	movss	(%[out]), %%xmm1\n\t"
+     "	shufps	$0x11, %%xmm1, %%xmm1	# b00010001 # 0 -0 0 -0\n\t"
+     "	# pfpnacc\n\t"
+     "	xorps	%%xmm1, %%xmm6\n\t"
+     "	movaps	%%xmm6, %%xmm2\n\t"
+     "	unpcklps %%xmm7, %%xmm6\n\t"
+     "	unpckhps %%xmm7, %%xmm2\n\t"
+     "	movaps	%%xmm2, %%xmm3\n\t"
+     "	shufps	$0x44, %%xmm6, %%xmm2	# b01000100\n\t"
+     "	shufps	$0xee, %%xmm3, %%xmm6	# b11101110\n\t"
+     "	addps	%%xmm2, %%xmm6\n\t"
+     "					# xmm6 = r1 i2 r3 i4\n\t"
+     "	movhlps	%%xmm6, %%xmm4		# xmm4 = r3 i4 ?? ??\n\t"
+     "	addps	%%xmm4, %%xmm6		# xmm6 = r1+r3 i2+i4 ?? ??\n\t"
+     "	movlps	%%xmm6, (%[out])		# store low 2x32 bits (complex) to memory\n\t"
+     : [eax] "+r" (in_cursor), [edx] "+r" (tap_cursor), [ecx] "+r" (counter)
+     : [out] "r" (result)
+     : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
+     );
+
+  // The asm covers whole 16-byte blocks (two complex points each). When 8 more bytes remain,
+  // the point count is odd and the last point is folded in by the loop below.
+  int getem = num_bytes % 16;
+
+  for(; getem > 0; getem -= 8) {
+
+    *result += (input[(num_bytes >> 3) - 1] * taps[(num_bytes >> 3) - 1]);
+
+  }
+
+  return;
+
+
+
+}
+
+#endif /*LV_HAVE_SSE*/  
+
+#if LV_HAVE_SSE3
+
+#include <pmmintrin.h>
+
+static inline void volk_32fc_x2_dot_prod_32fc_a16_sse3(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_bytes) {
+  // Complex dot product using SSE3 intrinsics: *result = sum over k of input[k] * taps[k].
+  // num_bytes is the byte length of each vector (8 bytes per complex point); both vectors are read with aligned 16-byte loads.
+  lv_32fc_t dotProduct;
+  memset(&dotProduct, 0x0, 2*sizeof(float));
+
+  unsigned int number = 0;
+  const unsigned int halfPoints = num_bytes >> 4;
+  const unsigned int isodd = (num_bytes >> 3) & 1; // nonzero when one point remains after the paired loop
+  __m128 x, y, yl, yh, z, tmp1, tmp2, dotProdVal;
+
+  const lv_32fc_t* a = input;
+  const lv_32fc_t* b = taps;
+
+  dotProdVal = _mm_setzero_ps();
+
+  for(;number < halfPoints; number++){
+      
+    x = _mm_load_ps((float*)a); // Load the ar + ai, br + bi as ar,ai,br,bi
+    y = _mm_load_ps((float*)b); // Load the cr + ci, dr + di as cr,ci,dr,di
+      
+    yl = _mm_moveldup_ps(y); // Load yl with cr,cr,dr,dr
+    yh = _mm_movehdup_ps(y); // Load yh with ci,ci,di,di
+      
+    tmp1 = _mm_mul_ps(x,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
+      
+    x = _mm_shuffle_ps(x,x,0xB1); // Re-arrange x to be ai,ar,bi,br
+      
+    tmp2 = _mm_mul_ps(x,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
+      
+    z = _mm_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
+
+    dotProdVal = _mm_add_ps(dotProdVal, z); // Add the complex multiplication results together
+
+    a += 2;
+    b += 2;
+  }
+
+  lv_32fc_t dotProductVector[2] __attribute__((aligned(16)));
+
+  _mm_store_ps((float*)dotProductVector,dotProdVal); // Store the results back into the dot product vector
+
+  dotProduct += ( dotProductVector[0] + dotProductVector[1] );
+  // Only an odd point count leaves a final point at a/b; for an even count a and b already point past the end.
+  if(isodd) {
+    dotProduct += (*a) * (*b);
+  }
+
+  *result = dotProduct;
+}  
+
+#endif /*LV_HAVE_SSE3*/
+
+#if LV_HAVE_SSE4_1
+
+#include <smmintrin.h>
+
+static inline void volk_32fc_x2_dot_prod_32fc_a16_sse4_1(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_bytes) {
+  volk_32fc_x2_dot_prod_32fc_a16_sse3(result, input, taps, num_bytes);
+  // The call above forwards every argument unchanged to the SSE3 kernel. Original author's note: the SSE3 version runs twice as fast as the SSE4.1 version, so the SSE4.1 version is turned off for now
+   /* Disabled SSE4.1 implementation (based on _mm_dp_ps), kept for reference only; it is not compiled:
+    __m128 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, real0, real1, im0, im1;
+    float *p_input, *p_taps;
+    __m64 *p_result;
+
+    p_result = (__m64*)result;
+    p_input = (float*)input;
+    p_taps = (float*)taps;
+
+    static const __m128i neg = {0x000000000000000080000000};
+
+    int i = 0;
+  
+    int bound = (num_bytes >> 5);
+    int leftovers = (num_bytes & 24) >> 3;
+
+    real0 = _mm_sub_ps(real0, real0);
+    real1 = _mm_sub_ps(real1, real1);
+    im0 = _mm_sub_ps(im0, im0);
+    im1 = _mm_sub_ps(im1, im1);
+  
+    for(; i < bound; ++i) {
+  
+    
+    xmm0 = _mm_load_ps(p_input);
+    xmm1 = _mm_load_ps(p_taps);
+    
+    p_input += 4;
+    p_taps += 4;
+    
+    xmm2 = _mm_load_ps(p_input);
+    xmm3 = _mm_load_ps(p_taps);
+    
+    p_input += 4;
+    p_taps += 4;
+    
+    xmm4 = _mm_unpackhi_ps(xmm0, xmm2);
+    xmm5 = _mm_unpackhi_ps(xmm1, xmm3);
+    xmm0 = _mm_unpacklo_ps(xmm0, xmm2);
+    xmm2 = _mm_unpacklo_ps(xmm1, xmm3);
+    
+    //imaginary vector from input
+    xmm1 = _mm_unpackhi_ps(xmm0, xmm4);
+    //real vector from input
+    xmm3 = _mm_unpacklo_ps(xmm0, xmm4);
+    //imaginary vector from taps
+    xmm0 = _mm_unpackhi_ps(xmm2, xmm5);
+    //real vector from taps
+    xmm2 = _mm_unpacklo_ps(xmm2, xmm5);
+    
+    xmm4 = _mm_dp_ps(xmm3, xmm2, 0xf1);
+    xmm5 = _mm_dp_ps(xmm1, xmm0, 0xf1);
+    
+    xmm6 = _mm_dp_ps(xmm3, xmm0, 0xf2);
+    xmm7 = _mm_dp_ps(xmm1, xmm2, 0xf2);
+    
+    real0 = _mm_add_ps(xmm4, real0);
+    real1 = _mm_add_ps(xmm5, real1);
+    im0 = _mm_add_ps(xmm6, im0);
+    im1 = _mm_add_ps(xmm7, im1);
+    
+    }
+
+
+    
+    
+    real1 = _mm_xor_ps(real1, (__m128)neg);
+    
+  
+    im0 = _mm_add_ps(im0, im1);
+    real0 = _mm_add_ps(real0, real1);
+  
+    im0 = _mm_add_ps(im0, real0);
+  
+    _mm_storel_pi(p_result, im0);
+  
+    for(i = bound * 4; i < (bound * 4) + leftovers; ++i) {
+    
+    *result += input[i] * taps[i];
+    }
+  */
+}  
+
+#endif /*LV_HAVE_SSE4_1*/
+
+#endif /*INCLUDED_volk_32fc_x2_dot_prod_32fc_a16_H*/
diff --git a/volk/include/volk/volk_32fc_x2_multiply_32fc_a16.h b/volk/include/volk/volk_32fc_x2_multiply_32fc_a16.h
new file mode 100644
index 000000000..224ab19c8
--- /dev/null
+++ b/volk/include/volk/volk_32fc_x2_multiply_32fc_a16.h
@@ -0,0 +1,95 @@
+#ifndef INCLUDED_volk_32fc_x2_multiply_32fc_a16_H
+#define INCLUDED_volk_32fc_x2_multiply_32fc_a16_H
+
+#include <inttypes.h>
+#include <stdio.h>
+#include <volk/volk_complex.h>
+#include <float.h>
+
+#if LV_HAVE_SSE3
+#include <pmmintrin.h>
+  /*!
+    \brief Computes the element-wise complex product of two vectors using SSE3 intrinsics
+    \param cVector Destination vector; cVector[k] receives aVector[k] * bVector[k]
+    \param aVector First vector of complex factors
+    \param bVector Second vector of complex factors
+    \param num_points The number of complex values read from aVector and bVector and written to cVector
+    \note The paired loop uses _mm_load_ps/_mm_store_ps, which are aligned 16-byte accesses,
+          on all three vectors.
+  */
+static inline void volk_32fc_x2_multiply_32fc_a16_sse3(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t* bVector, unsigned int num_points){
+    const unsigned int pairCount = num_points / 2;
+    unsigned int pair;
+
+    for(pair = 0; pair < pairCount; pair++){
+      // Each iteration handles points 2*pair and 2*pair+1 (one 16-byte vector per operand)
+      const unsigned int offset = 2 * pair;
+
+      // aVec = ar,ai,br,bi and bVec = cr,ci,dr,di
+      const __m128 aVec = _mm_load_ps((float*)(aVector + offset));
+      const __m128 bVec = _mm_load_ps((float*)(bVector + offset));
+
+      // Duplicate the real parts (cr,cr,dr,dr) and the imaginary parts (ci,ci,di,di) of bVec
+      const __m128 bReal = _mm_moveldup_ps(bVec);
+      const __m128 bImag = _mm_movehdup_ps(bVec);
+
+      // aSwapped = ai,ar,bi,br
+      const __m128 aSwapped = _mm_shuffle_ps(aVec,aVec,0xB1);
+
+      // straight = ar*cr,ai*cr,br*dr,bi*dr and crossed = ai*ci,ar*ci,bi*di,br*di
+      const __m128 straight = _mm_mul_ps(aVec,bReal);
+      const __m128 crossed = _mm_mul_ps(aSwapped,bImag);
+
+      // addsub subtracts in the even lanes and adds in the odd lanes:
+      // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
+      _mm_store_ps((float*)(cVector + offset), _mm_addsub_ps(straight,crossed));
+    }
+
+    // An odd num_points leaves one last point, multiplied with ordinary complex arithmetic
+    if(num_points & 1) {
+      const unsigned int last = 2 * pairCount;
+      cVector[last] = aVector[last] * bVector[last];
+    }
+}
+#endif /* LV_HAVE_SSE3 */
+
+#if LV_HAVE_GENERIC
+  /*!
+    \brief Computes the element-wise complex product of two vectors in portable C
+    \param cVector Destination vector; cVector[k] receives aVector[k] * bVector[k]
+    \param aVector First vector of complex factors
+    \param bVector Second vector of complex factors
+    \param num_points The number of complex values read from aVector and bVector and written to cVector
+    \note Nothing is read or written when num_points is 0
+  */
+static inline void volk_32fc_x2_multiply_32fc_a16_generic(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t* bVector, unsigned int num_points){
+    unsigned int idx;
+
+    // One complex multiplication per point, in increasing index order;
+    // the product itself is the language's own complex arithmetic.
+    for(idx = 0; idx < num_points; idx++){
+      cVector[idx] = aVector[idx] * bVector[idx];
+    }
+}
+#endif /* LV_HAVE_GENERIC */
+
+#if LV_HAVE_ORC
+  /*!
+    \brief Computes the element-wise complex product of two vectors by delegating to the ORC implementation
+    \param cVector Destination vector for the products
+    \param aVector First vector of complex factors
+    \param bVector Second vector of complex factors
+    \param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector
+  */
+extern void volk_32fc_x2_multiply_32fc_a16_orc_impl(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t* bVector, float mask, unsigned int num_points);
+static inline void volk_32fc_x2_multiply_32fc_a16_orc(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t* bVector, unsigned int num_points){
+    static const float signMask = -0.0; // negative zero, forwarded as the 'mask' argument; how the ORC code uses it is not visible here
+    volk_32fc_x2_multiply_32fc_a16_orc_impl(cVector, aVector, bVector, signMask, num_points);
+}
+#endif /* LV_HAVE_ORC */
+
+
+
+
+
+#endif /* INCLUDED_volk_32fc_x2_multiply_32fc_a16_H */
diff --git a/volk/include/volk/volk_32fc_x2_s32f_square_dist_scalar_mult_32f_a16.h b/volk/include/volk/volk_32fc_x2_s32f_square_dist_scalar_mult_32f_a16.h
new file mode 100644
index 000000000..6a863b16d
--- /dev/null
+++ b/volk/include/volk/volk_32fc_x2_s32f_square_dist_scalar_mult_32f_a16.h
@@ -0,0 +1,126 @@
+#ifndef INCLUDED_volk_32fc_x2_s32f_square_dist_scalar_mult_32f_a16_H
+#define INCLUDED_volk_32fc_x2_s32f_square_dist_scalar_mult_32f_a16_H
+
+#include<inttypes.h>
+#include<stdio.h>
+#include<volk/volk_complex.h>
+#include <string.h>
+
+#if LV_HAVE_SSE3
+#include<xmmintrin.h>
+#include<pmmintrin.h>
+
+/*!
+  \brief Computes the scaled squared distance between one complex point and every entry of a complex vector
+  \param target Output buffer; entry i receives scalar * |src0[0] - points[i]|^2
+  \param src0 The reference point; only src0[0] is read
+  \param points The complex points to compare against the reference point
+  \param scalar The factor applied to every squared distance
+  \param num_bytes The size of the points buffer in bytes; each complex point is counted as 8 bytes
+  \note The main loop uses aligned accesses (_mm_load_ps / _mm_store_ps); inputs shorter than four points are processed without touching memory outside points and target
+*/
+static inline void volk_32fc_x2_s32f_square_dist_scalar_mult_32f_a16_sse3(float* target, lv_32fc_t* src0, lv_32fc_t* points, float scalar, unsigned int num_bytes) {
+
+  __m128 xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
+
+  lv_32fc_t diff;
+  float sq_dist = 0.0;
+
+  /* Split the byte count into groups of four points, one optional pair and one optional single point. */
+  const int bound = num_bytes >> 5;
+  const int leftovers0 = (num_bytes >> 4) & 1;
+  const int leftovers1 = (num_bytes >> 3) & 1;
+  int i = 0;
+
+  /* Replicate the reference point (two floats) into both halves of xmm1. */
+  xmm1 = _mm_setzero_ps();
+  xmm1 = _mm_loadl_pi(xmm1, (__m64*)src0);
+  xmm1 = _mm_movelh_ps(xmm1, xmm1);
+
+  /* Replicate the scale factor into all four lanes. */
+  xmm8 = _mm_load1_ps(&scalar);
+
+  /*
+    Main loop: four points per iteration.  Both loads are issued inside the
+    loop body, so nothing is read from points or written to target here when
+    fewer than four points are supplied (bound == 0).
+  */
+  for(; i < bound; ++i) {
+
+    xmm2 = _mm_load_ps((float*)&points[0]);
+    xmm3 = _mm_load_ps((float*)&points[2]);
+    points += 4;
+
+    /* Per-component differences against the reference point. */
+    xmm4 = _mm_sub_ps(xmm1, xmm2);
+    xmm5 = _mm_sub_ps(xmm1, xmm3);
+
+    /* Square every component. */
+    xmm6 = _mm_mul_ps(xmm4, xmm4);
+    xmm7 = _mm_mul_ps(xmm5, xmm5);
+
+    /* The horizontal add sums each pair of squares, giving four squared distances in point order. */
+    xmm4 = _mm_hadd_ps(xmm6, xmm7);
+
+    /* Apply the scale factor. */
+    xmm4 = _mm_mul_ps(xmm4, xmm8);
+
+    _mm_store_ps(target, xmm4);
+
+    target += 4;
+
+  }
+
+  /* At most one remaining pair of points. */
+  for(i = 0; i < leftovers0; ++i) {
+
+    xmm2 = _mm_load_ps((float*)&points[0]);
+
+    xmm4 = _mm_sub_ps(xmm1, xmm2);
+
+    points += 2;
+
+    xmm6 = _mm_mul_ps(xmm4, xmm4);
+
+    /* Both halves of the result hold the same two squared distances, so storing the upper half is sufficient. */
+    xmm4 = _mm_hadd_ps(xmm6, xmm6);
+
+    xmm4 = _mm_mul_ps(xmm4, xmm8);
+
+    _mm_storeh_pi((__m64*)target, xmm4);
+
+    target += 2;
+  }
+
+  /* At most one remaining single point, handled in scalar code. */
+  for(i = 0; i < leftovers1; ++i) {
+
+    diff = src0[0] - points[0];
+
+    sq_dist = scalar * (lv_creal(diff) * lv_creal(diff) + lv_cimag(diff) * lv_cimag(diff));
+
+    target[0] = sq_dist;
+  }
+}
+
+#endif /*LV_HAVE_SSE3*/
+
+#if LV_HAVE_GENERIC
+static inline void volk_32fc_x2_s32f_square_dist_scalar_mult_32f_a16_generic(float* target, lv_32fc_t* src0, lv_32fc_t* points, float scalar, unsigned int num_bytes) {
+  /* Scalar reference path: target[k] = scalar * |src0[0] - points[k]|^2. */
+  const unsigned int point_count = num_bytes >> 3;  /* each complex point is counted as 8 bytes */
+  unsigned int k;
+
+  for(k = 0; k < point_count; ++k) {
+    /* Offset of the k-th point from the single reference point. */
+    const lv_32fc_t delta = src0[0] - points[k];
+
+    /* Squared magnitude of the offset, scaled by the caller's factor. */
+    target[k] = scalar * (lv_creal(delta) * lv_creal(delta) + lv_cimag(delta) * lv_cimag(delta));
+  }
+}
+
+#endif /*LV_HAVE_GENERIC*/
+
+
+#endif /*INCLUDED_volk_32fc_x2_s32f_square_dist_scalar_mult_32f_a16_H*/
diff --git a/volk/include/volk/volk_32fc_x2_square_dist_32f_a16.h b/volk/include/volk/volk_32fc_x2_square_dist_32f_a16.h
new file mode 100644
index 000000000..406097fc8
--- /dev/null
+++ b/volk/include/volk/volk_32fc_x2_square_dist_32f_a16.h
@@ -0,0 +1,112 @@
+#ifndef INCLUDED_volk_32fc_x2_square_dist_32f_a16_H
+#define INCLUDED_volk_32fc_x2_square_dist_32f_a16_H
+
+#include<inttypes.h>
+#include<stdio.h>
+#include<volk/volk_complex.h>
+
+#if LV_HAVE_SSE3
+#include<xmmintrin.h>
+#include<pmmintrin.h>
+
+/*!
+  \brief Computes the squared distance between one complex point and every entry of a complex vector
+  \param target Output buffer; entry i receives |src0[0] - points[i]|^2
+  \param src0 The reference point; only src0[0] is read
+  \param points The complex points to compare against the reference point
+  \param num_bytes The size of the points buffer in bytes; each complex point is counted as 8 bytes
+  \note The main loop uses aligned accesses (_mm_load_ps / _mm_store_ps); inputs shorter than four points are processed without touching memory outside points and target
+*/
+static inline void volk_32fc_x2_square_dist_32f_a16_sse3(float* target, lv_32fc_t* src0, lv_32fc_t* points, unsigned int num_bytes) {
+  __m128 xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
+
+  lv_32fc_t diff;
+  float sq_dist;
+
+  /* Split the byte count into groups of four points, one optional pair and one optional single point. */
+  const int bound = num_bytes >> 5;
+  const int leftovers0 = (num_bytes >> 4) & 1;
+  const int leftovers1 = (num_bytes >> 3) & 1;
+  int i = 0;
+
+  /* Replicate the reference point (two floats) into both halves of xmm1. */
+  xmm1 = _mm_setzero_ps();
+  xmm1 = _mm_loadl_pi(xmm1, (__m64*)src0);
+  xmm1 = _mm_movelh_ps(xmm1, xmm1);
+
+  /*
+    Main loop: four points per iteration.  Both loads are issued inside the
+    loop body, so nothing is read from points or written to target here when
+    fewer than four points are supplied (bound == 0).
+  */
+  for(; i < bound; ++i) {
+    xmm2 = _mm_load_ps((float*)&points[0]);
+    xmm3 = _mm_load_ps((float*)&points[2]);
+    points += 4;
+
+    /* Per-component differences against the reference point. */
+    xmm4 = _mm_sub_ps(xmm1, xmm2);
+    xmm5 = _mm_sub_ps(xmm1, xmm3);
+
+    /* Square every component. */
+    xmm6 = _mm_mul_ps(xmm4, xmm4);
+    xmm7 = _mm_mul_ps(xmm5, xmm5);
+
+    /* The horizontal add sums each pair of squares, giving four squared distances in point order. */
+    xmm4 = _mm_hadd_ps(xmm6, xmm7);
+
+    _mm_store_ps(target, xmm4);
+
+    target += 4;
+  }
+
+  /* At most one remaining pair of points. */
+  for(i = 0; i < leftovers0; ++i) {
+    xmm2 = _mm_load_ps((float*)&points[0]);
+
+    xmm4 = _mm_sub_ps(xmm1, xmm2);
+
+    points += 2;
+
+    xmm6 = _mm_mul_ps(xmm4, xmm4);
+
+    /* Both halves of the result hold the same two squared distances, so storing the upper half is sufficient. */
+    xmm4 = _mm_hadd_ps(xmm6, xmm6);
+
+    _mm_storeh_pi((__m64*)target, xmm4);
+
+    target += 2;
+  }
+
+  /* At most one remaining single point, handled in scalar code. */
+  for(i = 0; i < leftovers1; ++i) {
+
+    diff = src0[0] - points[0];
+
+    sq_dist = lv_creal(diff) * lv_creal(diff) + lv_cimag(diff) * lv_cimag(diff);
+
+    target[0] = sq_dist;
+  }
+}
+
+#endif /*LV_HAVE_SSE3*/
+
+#if LV_HAVE_GENERIC
+static inline void volk_32fc_x2_square_dist_32f_a16_generic(float* target, lv_32fc_t* src0, lv_32fc_t* points, unsigned int num_bytes) {
+  /* Scalar reference path: target[k] = |src0[0] - points[k]|^2. */
+  const unsigned int point_count = num_bytes >> 3;  /* each complex point is counted as 8 bytes */
+  unsigned int k;
+
+  for(k = 0; k < point_count; ++k) {
+    /* Offset of the k-th point from the single reference point. */
+    const lv_32fc_t delta = src0[0] - points[k];
+
+    /* Squared magnitude of the offset. */
+    target[k] = lv_creal(delta) * lv_creal(delta) + lv_cimag(delta) * lv_cimag(delta);
+  }
+}
+
+#endif /*LV_HAVE_GENERIC*/
+
+
+#endif /*INCLUDED_volk_32fc_x2_square_dist_32f_a16_H*/
diff --git a/volk/include/volk/volk_32i_s32f_convert_32f_a16.h b/volk/include/volk/volk_32i_s32f_convert_32f_a16.h
new file mode 100644
index 000000000..0fcadd9cb
--- /dev/null
+++ b/volk/include/volk/volk_32i_s32f_convert_32f_a16.h
@@ -0,0 +1,73 @@
+#ifndef INCLUDED_volk_32i_s32f_convert_32f_a16_H
+#define INCLUDED_volk_32i_s32f_convert_32f_a16_H
+
+#include <inttypes.h>
+#include <stdio.h>
+
+#if LV_HAVE_SSE2
+#include <emmintrin.h>
+
+  /*!
+    \brief Converts the input 32 bit integer data into floating point data, and divides each floating point output data point by the scalar value
+    \param inputVector The 32 bit input data buffer (read with aligned loads, _mm_load_si128)
+    \param outputVector The floating point output data buffer (written with aligned stores, _mm_store_ps)
+    \param scalar The value divided against each point in the output buffer
+    \param num_points The number of data values to be converted
+  */
+static inline void volk_32i_s32f_convert_32f_a16_sse2(float* outputVector, const int32_t* inputVector, const float scalar, unsigned int num_points){
+    /* The division is carried out as a multiplication by the reciprocal, computed once. */
+    const float iScalar = 1.0 / scalar;
+    const __m128 reciprocal = _mm_set_ps1(iScalar);
+
+    /* Largest multiple of four that does not exceed num_points. */
+    const unsigned int vectorizedPoints = (num_points / 4) * 4;
+    unsigned int idx;
+
+    __m128i ints;
+    __m128 floats;
+
+    /* Vector part: four values per iteration. */
+    for(idx = 0; idx < vectorizedPoints; idx += 4){
+      /* Load four 32 bit integers. */
+      ints = _mm_load_si128((__m128i*)(inputVector + idx));
+
+      /* Integer to float conversion, then scaling. */
+      floats = _mm_cvtepi32_ps(ints);
+      floats = _mm_mul_ps(floats, reciprocal);
+
+      /* Write the four results. */
+      _mm_store_ps(outputVector + idx, floats);
+    }
+
+    /* Scalar part: the remaining num_points % 4 values. */
+    for(; idx < num_points; idx++){
+      outputVector[idx] = ((float)(inputVector[idx])) * iScalar;
+    }
+}
+#endif /* LV_HAVE_SSE2 */
+
+
+#if LV_HAVE_GENERIC
+  /*!
+    \brief Converts the input 32 bit integer data into floating point data, and divides each floating point output data point by the scalar value
+    \param inputVector The 32 bit input data buffer
+    \param outputVector The floating point output data buffer
+    \param scalar The value divided against each point in the output buffer
+    \param num_points The number of data values to be converted
+  */
+static inline void volk_32i_s32f_convert_32f_a16_generic(float* outputVector, const int32_t* inputVector, const float scalar, unsigned int num_points){
+  /* The division is carried out as a multiplication by the reciprocal, computed once. */
+  const float iScalar = 1.0 / scalar;
+  unsigned int idx;
+
+  for(idx = 0; idx < num_points; idx++){
+    /* Convert to float first, then scale. */
+    outputVector[idx] = ((float)(inputVector[idx])) * iScalar;
+  }
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+
+
+#endif /* INCLUDED_volk_32i_s32f_convert_32f_a16_H */
diff --git a/volk/include/volk/volk_32i_s32f_convert_32f_u.h b/volk/include/volk/volk_32i_s32f_convert_32f_u.h
new file mode 100644
index 000000000..1dd6422f8
--- /dev/null
+++ b/volk/include/volk/volk_32i_s32f_convert_32f_u.h
@@ -0,0 +1,75 @@
+#ifndef INCLUDED_volk_32i_s32f_convert_32f_u_H
+#define INCLUDED_volk_32i_s32f_convert_32f_u_H
+
+#include <inttypes.h>
+#include <stdio.h>
+
+#if LV_HAVE_SSE2
+#include <emmintrin.h>
+
+  /*!
+    \brief Converts the input 32 bit integer data into floating point data, and divides each floating point output data point by the scalar value
+    \param inputVector The 32 bit input data buffer
+    \param outputVector The floating point output data buffer
+    \param scalar The value divided against each point in the output buffer
+    \param num_points The number of data values to be converted
+    \note Neither the input nor the output buffer needs to be aligned; unaligned loads and stores are used
+  */
+static inline void volk_32i_s32f_convert_32f_u_sse2(float* outputVector, const int32_t* inputVector, const float scalar, unsigned int num_points){
+    /* The division is carried out as a multiplication by the reciprocal, computed once. */
+    const float iScalar = 1.0 / scalar;
+    const __m128 reciprocal = _mm_set_ps1(iScalar);
+
+    /* Largest multiple of four that does not exceed num_points. */
+    const unsigned int vectorizedPoints = (num_points / 4) * 4;
+    unsigned int idx;
+
+    __m128i ints;
+    __m128 floats;
+
+    /* Vector part: four values per iteration. */
+    for(idx = 0; idx < vectorizedPoints; idx += 4){
+      /* Load four 32 bit integers from a possibly unaligned address. */
+      ints = _mm_loadu_si128((__m128i*)(inputVector + idx));
+
+      /* Integer to float conversion, then scaling. */
+      floats = _mm_cvtepi32_ps(ints);
+      floats = _mm_mul_ps(floats, reciprocal);
+
+      /* Write the four results to a possibly unaligned address. */
+      _mm_storeu_ps(outputVector + idx, floats);
+    }
+
+    /* Scalar part: the remaining num_points % 4 values. */
+    for(; idx < num_points; idx++){
+      outputVector[idx] = ((float)(inputVector[idx])) * iScalar;
+    }
+}
+#endif /* LV_HAVE_SSE2 */
+
+
+#if LV_HAVE_GENERIC
+  /*!
+    \brief Converts the input 32 bit integer data into floating point data, and divides each floating point output data point by the scalar value
+    \param inputVector The 32 bit input data buffer
+    \param outputVector The floating point output data buffer
+    \param scalar The value divided against each point in the output buffer
+    \param num_points The number of data values to be converted
+    \note Neither buffer needs SIMD alignment; elements are accessed one at a time
+  */
+static inline void volk_32i_s32f_convert_32f_u_generic(float* outputVector, const int32_t* inputVector, const float scalar, unsigned int num_points){
+  /* The division is carried out as a multiplication by the reciprocal, computed once. */
+  const float iScalar = 1.0 / scalar;
+  unsigned int idx;
+
+  for(idx = 0; idx < num_points; idx++){
+    /* Convert to float first, then scale. */
+    outputVector[idx] = ((float)(inputVector[idx])) * iScalar;
+  }
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+
+
+#endif /* INCLUDED_volk_32i_s32f_convert_32f_u_H */
diff --git a/volk/include/volk/volk_32i_x2_and_32i_a16.h b/volk/include/volk/volk_32i_x2_and_32i_a16.h
new file mode 100644
index 000000000..3baa1d856
--- /dev/null
+++ b/volk/include/volk/volk_32i_x2_and_32i_a16.h
@@ -0,0 +1,81 @@
+#ifndef INCLUDED_volk_32i_x2_and_32i_a16_H
+#define INCLUDED_volk_32i_x2_and_32i_a16_H
+
+#include <inttypes.h>
+#include <stdio.h>
+
+#if LV_HAVE_SSE
+#include <xmmintrin.h>
+/*!
+  \brief Ands the two input vectors and stores the results in the third vector
+  \param cVector The vector where the results will be stored (written with aligned stores, _mm_store_ps)
+  \param aVector One of the vectors to be anded (read with aligned loads, _mm_load_ps)
+  \param bVector One of the vectors to be anded (read with aligned loads, _mm_load_ps)
+  \param num_points The number of values in aVector and bVector to be anded together and stored into cVector
+*/
+static inline void volk_32i_x2_and_32i_a16_sse(int32_t* cVector, const int32_t* aVector, const int32_t* bVector, unsigned int num_points){
+    /* The integer buffers are accessed through float pointers so that _mm_and_ps
+       can combine the raw bit patterns; no numeric conversion takes place. */
+    float* const cFloats = (float*)cVector;
+    const float* const aFloats = (float*)aVector;
+    const float* const bFloats = (float*)bVector;
+
+    /* Largest multiple of four that does not exceed num_points. */
+    const unsigned int vectorizedPoints = (num_points / 4) * 4;
+    unsigned int idx;
+
+    __m128 lhs, rhs, combined;
+
+    /* Vector part: four values per iteration. */
+    for(idx = 0; idx < vectorizedPoints; idx += 4){
+      lhs = _mm_load_ps(aFloats + idx);
+      rhs = _mm_load_ps(bFloats + idx);
+
+      combined = _mm_and_ps(lhs, rhs);
+
+      _mm_store_ps(cFloats + idx, combined);
+    }
+
+    /* Scalar part: the remaining num_points % 4 values. */
+    for(; idx < num_points; idx++){
+      cVector[idx] = aVector[idx] & bVector[idx];
+    }
+}
+#endif /* LV_HAVE_SSE */
+
+#if LV_HAVE_GENERIC
+/*!
+  \brief Ands the two input vectors and stores the results in the third vector
+  \param cVector The vector where the results will be stored
+  \param aVector One of the vectors to be anded
+  \param bVector One of the vectors to be anded
+  \param num_points The number of values in aVector and bVector to be anded together and stored into cVector
+*/
+static inline void volk_32i_x2_and_32i_a16_generic(int32_t* cVector, const int32_t* aVector, const int32_t* bVector, unsigned int num_points){
+    /* Portable reference path: one bitwise AND per output element, in index order. */
+    unsigned int idx;
+
+    for(idx = 0; idx < num_points; idx++){
+      /* cVector[idx] receives aVector[idx] & bVector[idx]. */
+      cVector[idx] = aVector[idx] & bVector[idx];
+    }
+    /* Elements of cVector at or beyond num_points are left untouched. */
+}
+#endif /* LV_HAVE_GENERIC */
+
+#if LV_HAVE_ORC
+/*!
+  \brief Ands the two input vectors and stores the results in the third vector; all arguments are forwarded unchanged to the external ORC implementation
+  \param cVector The vector where the results will be stored
+  \param aVector One of the vectors to be anded
+  \param bVector One of the vectors to be anded
+  \param num_points The number of values in aVector and bVector to be anded together and stored into cVector
+*/
+extern void volk_32i_x2_and_32i_a16_orc_impl(int32_t* cVector, const int32_t* aVector, const int32_t* bVector, unsigned int num_points);
+static inline void volk_32i_x2_and_32i_a16_orc(int32_t* cVector, const int32_t* aVector, const int32_t* bVector, unsigned int num_points){
+    volk_32i_x2_and_32i_a16_orc_impl(cVector, aVector, bVector, num_points);
+}
+#endif /* LV_HAVE_ORC */
+
+
+#endif /* INCLUDED_volk_32i_x2_and_32i_a16_H */
diff --git a/volk/include/volk/volk_32i_x2_or_32i_a16.h b/volk/include/volk/volk_32i_x2_or_32i_a16.h
new file mode 100644
index 000000000..0be22f00a
--- /dev/null
+++ b/volk/include/volk/volk_32i_x2_or_32i_a16.h
@@ -0,0 +1,81 @@
+#ifndef INCLUDED_volk_32i_x2_or_32i_a16_H
+#define INCLUDED_volk_32i_x2_or_32i_a16_H
+
+#include <inttypes.h>
+#include <stdio.h>
+
+#if LV_HAVE_SSE
+#include <xmmintrin.h>
+/*!
+  \brief Ors the two input vectors and stores the results in the third vector
+  \param cVector The vector where the results will be stored (written with aligned stores, _mm_store_ps)
+  \param aVector One of the vectors to be ored (read with aligned loads, _mm_load_ps)
+  \param bVector One of the vectors to be ored (read with aligned loads, _mm_load_ps)
+  \param num_points The number of values in aVector and bVector to be ored together and stored into cVector
+*/
+static inline void volk_32i_x2_or_32i_a16_sse(int32_t* cVector, const int32_t* aVector, const int32_t* bVector, unsigned int num_points){
+    /* The integer buffers are accessed through float pointers so that _mm_or_ps
+       can combine the raw bit patterns; no numeric conversion takes place. */
+    float* const cFloats = (float*)cVector;
+    const float* const aFloats = (float*)aVector;
+    const float* const bFloats = (float*)bVector;
+
+    /* Largest multiple of four that does not exceed num_points. */
+    const unsigned int vectorizedPoints = (num_points / 4) * 4;
+    unsigned int idx;
+
+    __m128 lhs, rhs, combined;
+
+    /* Vector part: four values per iteration. */
+    for(idx = 0; idx < vectorizedPoints; idx += 4){
+      lhs = _mm_load_ps(aFloats + idx);
+      rhs = _mm_load_ps(bFloats + idx);
+
+      combined = _mm_or_ps(lhs, rhs);
+
+      _mm_store_ps(cFloats + idx, combined);
+    }
+
+    /* Scalar part: the remaining num_points % 4 values. */
+    for(; idx < num_points; idx++){
+      cVector[idx] = aVector[idx] | bVector[idx];
+    }
+}
+#endif /* LV_HAVE_SSE */
+
+#if LV_HAVE_GENERIC
+/*!
+  \brief Ors the two input vectors and stores the results in the third vector
+  \param cVector The vector where the results will be stored
+  \param aVector One of the vectors to be ored
+  \param bVector One of the vectors to be ored
+  \param num_points The number of values in aVector and bVector to be ored together and stored into cVector
+*/
+static inline void volk_32i_x2_or_32i_a16_generic(int32_t* cVector, const int32_t* aVector, const int32_t* bVector, unsigned int num_points){
+    /* Portable reference path: one bitwise OR per output element, in index order. */
+    unsigned int idx;
+
+    for(idx = 0; idx < num_points; idx++){
+      /* cVector[idx] receives aVector[idx] | bVector[idx]. */
+      cVector[idx] = aVector[idx] | bVector[idx];
+    }
+    /* Elements of cVector at or beyond num_points are left untouched. */
+}
+#endif /* LV_HAVE_GENERIC */
+
+#if LV_HAVE_ORC
+/*!
+  \brief Ors the two input vectors and stores the results in the third vector; all arguments are forwarded unchanged to the external ORC implementation
+  \param cVector The vector where the results will be stored
+  \param aVector One of the vectors to be ored
+  \param bVector One of the vectors to be ored
+  \param num_points The number of values in aVector and bVector to be ored together and stored into cVector
+*/
+extern void volk_32i_x2_or_32i_a16_orc_impl(int32_t* cVector, const int32_t* aVector, const int32_t* bVector, unsigned int num_points);
+static inline void volk_32i_x2_or_32i_a16_orc(int32_t* cVector, const int32_t* aVector, const int32_t* bVector, unsigned int num_points){
+    volk_32i_x2_or_32i_a16_orc_impl(cVector, aVector, bVector, num_points);
+}
+#endif /* LV_HAVE_ORC */
+
+
+#endif /* INCLUDED_volk_32i_x2_or_32i_a16_H */
diff --git a/volk/include/volk/volk_32s_32s_and_32s_a16.h b/volk/include/volk/volk_32s_32s_and_32s_a16.h
deleted file mode 100644
index 0e8380757..000000000
--- a/volk/include/volk/volk_32s_32s_and_32s_a16.h
+++ /dev/null
@@ -1,81 +0,0 @@
-#ifndef INCLUDED_volk_32s_32s_and_32s_a16_H
-#define INCLUDED_volk_32s_32s_and_32s_a16_H
-
-#include <inttypes.h>
-#include <stdio.h>
-
-#if LV_HAVE_SSE
-#include <xmmintrin.h>
-/*!
-  \brief Ands the two input vectors and store their results in the third vector
-  \param cVector The vector where the results will be stored
-  \param aVector One of the vectors
-  \param bVector One of the vectors
-  \param num_points The number of values in aVector and bVector to be anded together and stored into cVector
-*/
-static inline void volk_32s_32s_and_32s_a16_sse(int32_t* cVector, const int32_t* aVector, const int32_t* bVector, unsigned int num_points){
-    unsigned int number = 0;
-    const unsigned int quarterPoints = num_points / 4;
-
-    float* cPtr = (float*)cVector;
-    const float* aPtr = (float*)aVector;
-    const float* bPtr = (float*)bVector;
-
-    __m128 aVal, bVal, cVal;
-    for(;number < quarterPoints; number++){
-      
-      aVal = _mm_load_ps(aPtr); 
-      bVal = _mm_load_ps(bPtr);
-      
-      cVal = _mm_and_ps(aVal, bVal); 
-      
-      _mm_store_ps(cPtr,cVal); // Store the results back into the C container
-
-      aPtr += 4;
-      bPtr += 4;
-      cPtr += 4;
-    }
-
-    number = quarterPoints * 4;
-    for(;number < num_points; number++){
-      cVector[number] = aVector[number] & bVector[number];
-    }
-}
-#endif /* LV_HAVE_SSE */
-
-#if LV_HAVE_GENERIC
-/*!
-  \brief Ands the two input vectors and store their results in the third vector
-  \param cVector The vector where the results will be stored
-  \param aVector One of the vectors
-  \param bVector One of the vectors
-  \param num_points The number of values in aVector and bVector to be anded together and stored into cVector
-*/
-static inline void volk_32s_32s_and_32s_a16_generic(int32_t* cVector, const int32_t* aVector, const int32_t* bVector, unsigned int num_points){
-    int32_t* cPtr = cVector;
-    const int32_t* aPtr = aVector;
-    const int32_t* bPtr=  bVector;
-    unsigned int number = 0;
-
-    for(number = 0; number < num_points; number++){
-      *cPtr++ = (*aPtr++) & (*bPtr++);
-    }
-}
-#endif /* LV_HAVE_GENERIC */
-
-#if LV_HAVE_ORC
-/*!
-  \brief Ands the two input vectors and store their results in the third vector
-  \param cVector The vector where the results will be stored
-  \param aVector One of the vectors
-  \param bVector One of the vectors
-  \param num_points The number of values in aVector and bVector to be anded together and stored into cVector
-*/
-extern void volk_32s_32s_and_32s_a16_orc_impl(int32_t* cVector, const int32_t* aVector, const int32_t* bVector, unsigned int num_points);
-static inline void volk_32s_32s_and_32s_a16_orc(int32_t* cVector, const int32_t* aVector, const int32_t* bVector, unsigned int num_points){
-    volk_32s_32s_and_32s_a16_orc_impl(cVector, aVector, bVector, num_points);
-}
-#endif /* LV_HAVE_ORC */
-
-
-#endif /* INCLUDED_volk_32s_32s_and_32s_a16_H */
diff --git a/volk/include/volk/volk_32s_32s_or_32s_a16.h b/volk/include/volk/volk_32s_32s_or_32s_a16.h
deleted file mode 100644
index 2dcf2e551..000000000
--- a/volk/include/volk/volk_32s_32s_or_32s_a16.h
+++ /dev/null
@@ -1,81 +0,0 @@
-#ifndef INCLUDED_volk_32s_32s_or_32s_a16_H
-#define INCLUDED_volk_32s_32s_or_32s_a16_H
-
-#include <inttypes.h>
-#include <stdio.h>
-
-#if LV_HAVE_SSE
-#include <xmmintrin.h>
-/*!
-  \brief Ors the two input vectors and store their results in the third vector
-  \param cVector The vector where the results will be stored
-  \param aVector One of the vectors to be ored
-  \param bVector One of the vectors to be ored
-  \param num_points The number of values in aVector and bVector to be ored together and stored into cVector
-*/
-static inline void volk_32s_32s_or_32s_a16_sse(int32_t* cVector, const int32_t* aVector, const int32_t* bVector, unsigned int num_points){
-    unsigned int number = 0;
-    const unsigned int quarterPoints = num_points / 4;
-
-    float* cPtr = (float*)cVector;
-    const float* aPtr = (float*)aVector;
-    const float* bPtr = (float*)bVector;
-
-    __m128 aVal, bVal, cVal;
-    for(;number < quarterPoints; number++){
-      
-      aVal = _mm_load_ps(aPtr); 
-      bVal = _mm_load_ps(bPtr);
-      
-      cVal = _mm_or_ps(aVal, bVal); 
-      
-      _mm_store_ps(cPtr,cVal); // Store the results back into the C container
-
-      aPtr += 4;
-      bPtr += 4;
-      cPtr += 4;
-    }
-
-    number = quarterPoints * 4;
-    for(;number < num_points; number++){
-      cVector[number] = aVector[number] | bVector[number];
-    }
-}
-#endif /* LV_HAVE_SSE */
-
-#if LV_HAVE_GENERIC
-/*!
-  \brief Ors the two input vectors and store their results in the third vector
-  \param cVector The vector where the results will be stored
-  \param aVector One of the vectors to be ored
-  \param bVector One of the vectors to be ored
-  \param num_points The number of values in aVector and bVector to be ored together and stored into cVector
-*/
-static inline void volk_32s_32s_or_32s_a16_generic(int32_t* cVector, const int32_t* aVector, const int32_t* bVector, unsigned int num_points){
-    int32_t* cPtr = cVector;
-    const int32_t* aPtr = aVector;
-    const int32_t* bPtr=  bVector;
-    unsigned int number = 0;
-
-    for(number = 0; number < num_points; number++){
-      *cPtr++ = (*aPtr++) | (*bPtr++);
-    }
-}
-#endif /* LV_HAVE_GENERIC */
-
-#if LV_HAVE_ORC
-/*!
-  \brief Ors the two input vectors and store their results in the third vector
-  \param cVector The vector where the results will be stored
-  \param aVector One of the vectors to be ored
-  \param bVector One of the vectors to be ored
-  \param num_points The number of values in aVector and bVector to be ored together and stored into cVector
-*/
-extern void volk_32s_32s_or_32s_a16_orc_impl(int32_t* cVector, const int32_t* aVector, const int32_t* bVector, unsigned int num_points);
-static inline void volk_32s_32s_or_32s_a16_orc(int32_t* cVector, const int32_t* aVector, const int32_t* bVector, unsigned int num_points){
-    volk_32s_32s_or_32s_a16_orc_impl(cVector, aVector, bVector, num_points);
-}
-#endif /* LV_HAVE_ORC */
-
-
-#endif /* INCLUDED_volk_32s_32s_or_32s_a16_H */
diff --git a/volk/include/volk/volk_32s_s32f_convert_32f_a16.h b/volk/include/volk/volk_32s_s32f_convert_32f_a16.h
deleted file mode 100644
index c16ecc9dd..000000000
--- a/volk/include/volk/volk_32s_s32f_convert_32f_a16.h
+++ /dev/null
@@ -1,73 +0,0 @@
-#ifndef INCLUDED_volk_32s_s32f_convert_32f_a16_H
-#define INCLUDED_volk_32s_s32f_convert_32f_a16_H
-
-#include <inttypes.h>
-#include <stdio.h>
-
-#if LV_HAVE_SSE2
-#include <emmintrin.h>
-
-  /*!
-    \brief Converts the input 32 bit integer data into floating point data, and divides the each floating point output data point by the scalar value
-    \param inputVector The 32 bit input data buffer
-    \param outputVector The floating point output data buffer
-    \param scalar The value divided against each point in the output buffer
-    \param num_points The number of data values to be converted
-  */
-static inline void volk_32s_s32f_convert_32f_a16_sse2(float* outputVector, const int32_t* inputVector, const float scalar, unsigned int num_points){
-    unsigned int number = 0;
-    const unsigned int quarterPoints = num_points / 4;
-    
-     float* outputVectorPtr = outputVector;
-     const float iScalar = 1.0 / scalar;
-    __m128 invScalar = _mm_set_ps1(iScalar);
-    int32_t* inputPtr = (int32_t*)inputVector;
-    __m128i inputVal;
-    __m128 ret;
-
-    for(;number < quarterPoints; number++){
-
-      // Load the 4 values
-      inputVal = _mm_load_si128((__m128i*)inputPtr);
-
-      ret = _mm_cvtepi32_ps(inputVal);
-      ret = _mm_mul_ps(ret, invScalar);
-
-      _mm_store_ps(outputVectorPtr, ret);
-
-      outputVectorPtr += 4;
-      inputPtr += 4;
-    }
-
-    number = quarterPoints * 4;
-    for(; number < num_points; number++){
-      outputVector[number] =((float)(inputVector[number])) * iScalar;
-    }
-}
-#endif /* LV_HAVE_SSE2 */
-
-
-#if LV_HAVE_GENERIC
-  /*!
-    \brief Converts the input 32 bit integer data into floating point data, and divides the each floating point output data point by the scalar value
-    \param inputVector The 32 bit input data buffer
-    \param outputVector The floating point output data buffer
-    \param scalar The value divided against each point in the output buffer
-    \param num_points The number of data values to be converted
-  */
-static inline void volk_32s_s32f_convert_32f_a16_generic(float* outputVector, const int32_t* inputVector, const float scalar, unsigned int num_points){
-  float* outputVectorPtr = outputVector;
-  const int32_t* inputVectorPtr = inputVector;
-  unsigned int number = 0;
-  const float iScalar = 1.0 / scalar;
-
-  for(number = 0; number < num_points; number++){
-    *outputVectorPtr++ = ((float)(*inputVectorPtr++)) * iScalar;
-  }
-}
-#endif /* LV_HAVE_GENERIC */
-
-
-
-
-#endif /* INCLUDED_volk_32s_s32f_convert_32f_a16_H */
diff --git a/volk/include/volk/volk_32s_s32f_convert_32f_ua16.h b/volk/include/volk/volk_32s_s32f_convert_32f_ua16.h
deleted file mode 100644
index 4eb5a5b85..000000000
--- a/volk/include/volk/volk_32s_s32f_convert_32f_ua16.h
+++ /dev/null
@@ -1,75 +0,0 @@
-#ifndef INCLUDED_volk_32s_s32f_convert_32f_ua16_H
-#define INCLUDED_volk_32s_s32f_convert_32f_ua16_H
-
-#include <inttypes.h>
-#include <stdio.h>
-
-#if LV_HAVE_SSE2
-#include <emmintrin.h>
-
-  /*!
-    \brief Converts the input 32 bit integer data into floating point data, and divides the each floating point output data point by the scalar value
-    \param inputVector The 32 bit input data buffer
-    \param outputVector The floating point output data buffer
-    \param scalar The value divided against each point in the output buffer
-    \param num_points The number of data values to be converted
-    \note Output buffer does NOT need to be properly aligned
-  */
-static inline void volk_32s_s32f_convert_32f_ua16_sse2(float* outputVector, const int32_t* inputVector, const float scalar, unsigned int num_points){
-    unsigned int number = 0;
-    const unsigned int quarterPoints = num_points / 4;
-    
-     float* outputVectorPtr = outputVector;
-     const float iScalar = 1.0 / scalar;
-    __m128 invScalar = _mm_set_ps1(iScalar);
-    int32_t* inputPtr = (int32_t*)inputVector;
-    __m128i inputVal;
-    __m128 ret;
-
-    for(;number < quarterPoints; number++){
-
-      // Load the 4 values
-      inputVal = _mm_loadu_si128((__m128i*)inputPtr);
-
-      ret = _mm_cvtepi32_ps(inputVal);
-      ret = _mm_mul_ps(ret, invScalar);
-
-      _mm_storeu_ps(outputVectorPtr, ret);
-
-      outputVectorPtr += 4;
-      inputPtr += 4;
-    }
-
-    number = quarterPoints * 4;
-    for(; number < num_points; number++){
-      outputVector[number] =((float)(inputVector[number])) * iScalar;
-    }
-}
-#endif /* LV_HAVE_SSE2 */
-
-
-#if LV_HAVE_GENERIC
-  /*!
-    \brief Converts the input 32 bit integer data into floating point data, and divides the each floating point output data point by the scalar value
-    \param inputVector The 32 bit input data buffer
-    \param outputVector The floating point output data buffer
-    \param scalar The value divided against each point in the output buffer
-    \param num_points The number of data values to be converted
-    \note Output buffer does NOT need to be properly aligned
-  */
-static inline void volk_32s_s32f_convert_32f_ua16_generic(float* outputVector, const int32_t* inputVector, const float scalar, unsigned int num_points){
-  float* outputVectorPtr = outputVector;
-  const int32_t* inputVectorPtr = inputVector;
-  unsigned int number = 0;
-  const float iScalar = 1.0 / scalar;
-
-  for(number = 0; number < num_points; number++){
-    *outputVectorPtr++ = ((float)(*inputVectorPtr++)) * iScalar;
-  }
-}
-#endif /* LV_HAVE_GENERIC */
-
-
-
-
-#endif /* INCLUDED_volk_32s_s32f_convert_32f_ua16_H */
diff --git a/volk/include/volk/volk_64f_64f_max_64f_a16.h b/volk/include/volk/volk_64f_64f_max_64f_a16.h
deleted file mode 100644
index 7e091851f..000000000
--- a/volk/include/volk/volk_64f_64f_max_64f_a16.h
+++ /dev/null
@@ -1,71 +0,0 @@
-#ifndef INCLUDED_volk_64f_64f_max_64f_a16_H
-#define INCLUDED_volk_64f_64f_max_64f_a16_H
-
-#include <inttypes.h>
-#include <stdio.h>
-
-#if LV_HAVE_SSE2
-#include <emmintrin.h>
-/*!
-  \brief Selects maximum value from each entry between bVector and aVector and store their results in the cVector
-  \param cVector The vector where the results will be stored
-  \param aVector The vector to be checked
-  \param bVector The vector to be checked
-  \param num_points The number of values in aVector and bVector to be checked and stored into cVector
-*/
-static inline void volk_64f_64f_max_64f_a16_sse2(double* cVector, const double* aVector, const double* bVector, unsigned int num_points){
-    unsigned int number = 0;
-    const unsigned int halfPoints = num_points / 2;
-
-    double* cPtr = cVector;
-    const double* aPtr = aVector;
-    const double* bPtr=  bVector;
-
-    __m128d aVal, bVal, cVal;
-    for(;number < halfPoints; number++){
-      
-      aVal = _mm_load_pd(aPtr); 
-      bVal = _mm_load_pd(bPtr);
-      
-      cVal = _mm_max_pd(aVal, bVal); 
-      
-      _mm_store_pd(cPtr,cVal); // Store the results back into the C container
-
-      aPtr += 2;
-      bPtr += 2;
-      cPtr += 2;
-    }
-
-    number = halfPoints * 2;
-    for(;number < num_points; number++){
-      const double a = *aPtr++;
-      const double b = *bPtr++;
-      *cPtr++ = ( a > b ? a : b);
-    }
-}
-#endif /* LV_HAVE_SSE2 */
-
-#if LV_HAVE_GENERIC
-/*!
-  \brief Selects maximum value from each entry between bVector and aVector and store their results in the cVector
-  \param cVector The vector where the results will be stored
-  \param aVector The vector to be checked
-  \param bVector The vector to be checked
-  \param num_points The number of values in aVector and bVector to be checked and stored into cVector
-*/
-static inline void volk_64f_64f_max_64f_a16_generic(double* cVector, const double* aVector, const double* bVector, unsigned int num_points){
-    double* cPtr = cVector;
-    const double* aPtr = aVector;
-    const double* bPtr=  bVector;
-    unsigned int number = 0;
-
-    for(number = 0; number < num_points; number++){
-      const double a = *aPtr++;
-      const double b = *bPtr++;
-      *cPtr++ = ( a > b ? a : b);
-    }
-}
-#endif /* LV_HAVE_GENERIC */
-
-
-#endif /* INCLUDED_volk_64f_64f_max_64f_a16_H */
diff --git a/volk/include/volk/volk_64f_64f_min_64f_a16.h b/volk/include/volk/volk_64f_64f_min_64f_a16.h
deleted file mode 100644
index f2bcbe83b..000000000
--- a/volk/include/volk/volk_64f_64f_min_64f_a16.h
+++ /dev/null
@@ -1,71 +0,0 @@
-#ifndef INCLUDED_volk_64f_64f_min_64f_a16_H
-#define INCLUDED_volk_64f_64f_min_64f_a16_H
-
-#include <inttypes.h>
-#include <stdio.h>
-
-#if LV_HAVE_SSE2
-#include <emmintrin.h>
-/*!
-  \brief Selects minimum value from each entry between bVector and aVector and store their results in the cVector
-  \param cVector The vector where the results will be stored
-  \param aVector The vector to be checked
-  \param bVector The vector to be checked
-  \param num_points The number of values in aVector and bVector to be checked and stored into cVector
-*/
-static inline void volk_64f_64f_min_64f_a16_sse2(double* cVector, const double* aVector, const double* bVector, unsigned int num_points){
-    unsigned int number = 0;
-    const unsigned int halfPoints = num_points / 2;
-
-    double* cPtr = cVector;
-    const double* aPtr = aVector;
-    const double* bPtr=  bVector;
-
-    __m128d aVal, bVal, cVal;
-    for(;number < halfPoints; number++){
-      
-      aVal = _mm_load_pd(aPtr); 
-      bVal = _mm_load_pd(bPtr);
-      
-      cVal = _mm_min_pd(aVal, bVal); 
-      
-      _mm_store_pd(cPtr,cVal); // Store the results back into the C container
-
-      aPtr += 2;
-      bPtr += 2;
-      cPtr += 2;
-    }
-
-    number = halfPoints * 2;
-    for(;number < num_points; number++){
-      const double a = *aPtr++;
-      const double b = *bPtr++;
-      *cPtr++ = ( a < b ? a : b);
-    }
-}
-#endif /* LV_HAVE_SSE2 */
-
-#if LV_HAVE_GENERIC
-/*!
-  \brief Selects minimum value from each entry between bVector and aVector and store their results in the cVector
-  \param cVector The vector where the results will be stored
-  \param aVector The vector to be checked
-  \param bVector The vector to be checked
-  \param num_points The number of values in aVector and bVector to be checked and stored into cVector
-*/
-static inline void volk_64f_64f_min_64f_a16_generic(double* cVector, const double* aVector, const double* bVector, unsigned int num_points){
-    double* cPtr = cVector;
-    const double* aPtr = aVector;
-    const double* bPtr=  bVector;
-    unsigned int number = 0;
-
-    for(number = 0; number < num_points; number++){
-      const double a = *aPtr++;
-      const double b = *bPtr++;
-      *cPtr++ = ( a < b ? a : b);
-    }
-}
-#endif /* LV_HAVE_GENERIC */
-
-
-#endif /* INCLUDED_volk_64f_64f_min_64f_a16_H */
diff --git a/volk/include/volk/volk_64f_convert_32f_u.h b/volk/include/volk/volk_64f_convert_32f_u.h
new file mode 100644
index 000000000..6338c1433
--- /dev/null
+++ b/volk/include/volk/volk_64f_convert_32f_u.h
@@ -0,0 +1,67 @@
+#ifndef INCLUDED_volk_64f_convert_32f_u_H
+#define INCLUDED_volk_64f_convert_32f_u_H
+
+#include <inttypes.h>
+#include <stdio.h>
+
+#if LV_HAVE_SSE2
+#include <emmintrin.h>
+  /*!
+    \brief Narrows a vector of doubles to floats using unaligned SSE2 loads and stores
+    \param outputVector Destination buffer that receives the converted float values
+    \param inputVector Source buffer holding the double values to be converted
+    \param num_points The number of values to read from inputVector and write to outputVector
+  */
+static inline void volk_64f_convert_32f_u_sse2(float* outputVector, const double* inputVector, unsigned int num_points){
+  /* Four doubles are consumed per SIMD pass; any remainder is handled by the scalar tail. */
+  const unsigned int simdPasses = num_points / 4;
+  const unsigned int simdPoints = simdPasses * 4;
+
+  const double* src = inputVector;
+  float* dst = outputVector;
+  unsigned int idx;
+
+  for(idx = 0; idx < simdPasses; idx++){
+    /* Each conversion leaves its two floats in the low half of the result register. */
+    const __m128 firstPair = _mm_cvtpd_ps(_mm_loadu_pd(src));
+    const __m128 secondPair = _mm_cvtpd_ps(_mm_loadu_pd(src + 2));
+
+    /* Pack the two low halves into one four-float register. */
+    const __m128 packed = _mm_movelh_ps(firstPair, secondPair);
+
+    _mm_storeu_ps(dst, packed);
+
+    src += 4;
+    dst += 4;
+  }
+
+  /* Scalar tail: the last num_points % 4 values. */
+  for(idx = simdPoints; idx < num_points; idx++){
+    outputVector[idx] = (float)(inputVector[idx]);
+  }
+}
+#endif /* LV_HAVE_SSE2 */
+
+
+#ifdef LV_HAVE_GENERIC
+/*!
+  \brief Narrows a vector of doubles to floats one element at a time (portable C version)
+  \param outputVector Destination buffer that receives the converted float values
+  \param inputVector Source buffer holding the double values to be converted
+  \param num_points The number of values to read from inputVector and write to outputVector
+*/
+static inline void volk_64f_convert_32f_u_generic(float* outputVector, const double* inputVector, unsigned int num_points){
+  unsigned int idx;
+
+  /* Plain C cast applied to every element in order. */
+  for(idx = 0; idx < num_points; idx++){
+    const double wide = inputVector[idx];
+    outputVector[idx] = (float)wide;
+  }
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+
+
+#endif /* INCLUDED_volk_64f_convert_32f_u_H */
diff --git a/volk/include/volk/volk_64f_convert_32f_ua16.h b/volk/include/volk/volk_64f_convert_32f_ua16.h
deleted file mode 100644
index 7774db1b7..000000000
--- a/volk/include/volk/volk_64f_convert_32f_ua16.h
+++ /dev/null
@@ -1,67 +0,0 @@
-#ifndef INCLUDED_volk_64f_convert_32f_ua16_H
-#define INCLUDED_volk_64f_convert_32f_ua16_H
-
-#include <inttypes.h>
-#include <stdio.h>
-
-#if LV_HAVE_SSE2
-#include <emmintrin.h>
-  /*!
-    \brief Converts the double values into float values
-    \param dVector The converted float vector values
-    \param fVector The double vector values to be converted
-    \param num_points The number of points in the two vectors to be converted
-  */
-static inline void volk_64f_convert_32f_ua16_sse2(float* outputVector, const double* inputVector, unsigned int num_points){
-  unsigned int number = 0;
-
-  const unsigned int quarterPoints = num_points / 4;
-    
-  const double* inputVectorPtr = (const double*)inputVector;
-  float* outputVectorPtr = outputVector;
-  __m128 ret, ret2;
-  __m128d inputVal1, inputVal2;
-
-  for(;number < quarterPoints; number++){
-    inputVal1 = _mm_loadu_pd(inputVectorPtr); inputVectorPtr += 2;
-    inputVal2 = _mm_loadu_pd(inputVectorPtr); inputVectorPtr += 2;
- 
-    ret = _mm_cvtpd_ps(inputVal1);
-    ret2 = _mm_cvtpd_ps(inputVal2);
-
-    ret = _mm_movelh_ps(ret, ret2);
-
-    _mm_storeu_ps(outputVectorPtr, ret);
-    outputVectorPtr += 4;
-  }
-
-  number = quarterPoints * 4;    
-  for(; number < num_points; number++){
-    outputVector[number] = (float)(inputVector[number]);
-  }
-}
-#endif /* LV_HAVE_SSE2 */
-
-
-#ifdef LV_HAVE_GENERIC
-/*!
-  \brief Converts the double values into float values
-  \param dVector The converted float vector values
-  \param fVector The double vector values to be converted
-  \param num_points The number of points in the two vectors to be converted
-*/
-static inline void volk_64f_convert_32f_ua16_generic(float* outputVector, const double* inputVector, unsigned int num_points){
-  float* outputVectorPtr = outputVector;
-  const double* inputVectorPtr = inputVector;
-  unsigned int number = 0;
-
-  for(number = 0; number < num_points; number++){
-    *outputVectorPtr++ = ((float)(*inputVectorPtr++));
-  }
-}
-#endif /* LV_HAVE_GENERIC */
-
-
-
-
-#endif /* INCLUDED_volk_64f_convert_32f_ua16_H */
diff --git a/volk/include/volk/volk_64f_x2_max_64f_a16.h b/volk/include/volk/volk_64f_x2_max_64f_a16.h
new file mode 100644
index 000000000..4b0c1f5f1
--- /dev/null
+++ b/volk/include/volk/volk_64f_x2_max_64f_a16.h
@@ -0,0 +1,71 @@
+#ifndef INCLUDED_volk_64f_x2_max_64f_a16_H
+#define INCLUDED_volk_64f_x2_max_64f_a16_H
+
+#include <inttypes.h>
+#include <stdio.h>
+
+#if LV_HAVE_SSE2
+#include <emmintrin.h>
+/*!
+  \brief Writes the element-wise maximum of aVector and bVector into cVector (aligned SSE2 version)
+  \param cVector The vector where the results will be stored
+  \param aVector The first vector to be compared
+  \param bVector The second vector to be compared
+  \param num_points The number of values in aVector and bVector to be compared and stored into cVector
+*/
+static inline void volk_64f_x2_max_64f_a16_sse2(double* cVector, const double* aVector, const double* bVector, unsigned int num_points){
+    /* Two doubles fit in one 128-bit register, so the SIMD loop runs num_points / 2 times. */
+    const unsigned int pairCount = num_points / 2;
+    const unsigned int simdPoints = pairCount * 2;
+
+    const double* lhs = aVector;
+    const double* rhs = bVector;
+    double* dst = cVector;
+    unsigned int idx;
+
+    for(idx = 0; idx < pairCount; idx++){
+      /* _mm_load_pd / _mm_store_pd require 16-byte aligned addresses. */
+      const __m128d lhsPair = _mm_load_pd(lhs);
+      const __m128d rhsPair = _mm_load_pd(rhs);
+
+      _mm_store_pd(dst, _mm_max_pd(lhsPair, rhsPair));
+
+      lhs += 2;
+      rhs += 2;
+      dst += 2;
+    }
+
+    /* Scalar tail for an odd num_points; lhs/rhs/dst already point at the first unprocessed element. */
+    for(idx = simdPoints; idx < num_points; idx++){
+      const double lhsVal = *lhs++;
+      const double rhsVal = *rhs++;
+
+      *dst++ = ( lhsVal > rhsVal ? lhsVal : rhsVal);
+    }
+}
+#endif /* LV_HAVE_SSE2 */
+
+#if LV_HAVE_GENERIC
+/*!
+  \brief Writes the element-wise maximum of aVector and bVector into cVector (portable C version)
+  \param cVector The vector where the results will be stored
+  \param aVector The first vector to be compared
+  \param bVector The second vector to be compared
+  \param num_points The number of values in aVector and bVector to be compared and stored into cVector
+*/
+static inline void volk_64f_x2_max_64f_a16_generic(double* cVector, const double* aVector, const double* bVector, unsigned int num_points){
+    unsigned int idx;
+
+    for(idx = 0; idx < num_points; idx++){
+      const double lhsVal = aVector[idx];
+      const double rhsVal = bVector[idx];
+
+      /* Strict '>' means bVector's value is taken on ties and whenever the comparison is false. */
+      if(lhsVal > rhsVal){ cVector[idx] = lhsVal; }
+      else{ cVector[idx] = rhsVal; }
+    }
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+#endif /* INCLUDED_volk_64f_x2_max_64f_a16_H */
diff --git a/volk/include/volk/volk_64f_x2_min_64f_a16.h b/volk/include/volk/volk_64f_x2_min_64f_a16.h
new file mode 100644
index 000000000..aa961e384
--- /dev/null
+++ b/volk/include/volk/volk_64f_x2_min_64f_a16.h
@@ -0,0 +1,71 @@
+#ifndef INCLUDED_volk_64f_x2_min_64f_a16_H
+#define INCLUDED_volk_64f_x2_min_64f_a16_H
+
+#include <inttypes.h>
+#include <stdio.h>
+
+#if LV_HAVE_SSE2
+#include <emmintrin.h>
+/*!
+  \brief Writes the element-wise minimum of aVector and bVector into cVector (aligned SSE2 version)
+  \param cVector The vector where the results will be stored
+  \param aVector The first vector to be compared
+  \param bVector The second vector to be compared
+  \param num_points The number of values in aVector and bVector to be compared and stored into cVector
+*/
+static inline void volk_64f_x2_min_64f_a16_sse2(double* cVector, const double* aVector, const double* bVector, unsigned int num_points){
+    /* Two doubles fit in one 128-bit register, so the SIMD loop runs num_points / 2 times. */
+    const unsigned int pairCount = num_points / 2;
+    const unsigned int simdPoints = pairCount * 2;
+
+    const double* lhs = aVector;
+    const double* rhs = bVector;
+    double* dst = cVector;
+    unsigned int idx;
+
+    for(idx = 0; idx < pairCount; idx++){
+      /* _mm_load_pd / _mm_store_pd require 16-byte aligned addresses. */
+      const __m128d lhsPair = _mm_load_pd(lhs);
+      const __m128d rhsPair = _mm_load_pd(rhs);
+
+      _mm_store_pd(dst, _mm_min_pd(lhsPair, rhsPair));
+
+      lhs += 2;
+      rhs += 2;
+      dst += 2;
+    }
+
+    /* Scalar tail for an odd num_points; lhs/rhs/dst already point at the first unprocessed element. */
+    for(idx = simdPoints; idx < num_points; idx++){
+      const double lhsVal = *lhs++;
+      const double rhsVal = *rhs++;
+
+      *dst++ = ( lhsVal < rhsVal ? lhsVal : rhsVal);
+    }
+}
+#endif /* LV_HAVE_SSE2 */
+
+#if LV_HAVE_GENERIC
+/*!
+  \brief Writes the element-wise minimum of aVector and bVector into cVector (portable C version)
+  \param cVector The vector where the results will be stored
+  \param aVector The first vector to be compared
+  \param bVector The second vector to be compared
+  \param num_points The number of values in aVector and bVector to be compared and stored into cVector
+*/
+static inline void volk_64f_x2_min_64f_a16_generic(double* cVector, const double* aVector, const double* bVector, unsigned int num_points){
+    unsigned int idx;
+
+    for(idx = 0; idx < num_points; idx++){
+      const double lhsVal = aVector[idx];
+      const double rhsVal = bVector[idx];
+
+      /* Strict '<' means bVector's value is taken on ties and whenever the comparison is false. */
+      if(lhsVal < rhsVal){ cVector[idx] = lhsVal; }
+      else{ cVector[idx] = rhsVal; }
+    }
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+#endif /* INCLUDED_volk_64f_x2_min_64f_a16_H */
diff --git a/volk/include/volk/volk_8i_convert_16i_a16.h b/volk/include/volk/volk_8i_convert_16i_a16.h
new file mode 100644
index 000000000..3d7045753
--- /dev/null
+++ b/volk/include/volk/volk_8i_convert_16i_a16.h
@@ -0,0 +1,83 @@
+#ifndef INCLUDED_volk_8i_convert_16i_a16_H
+#define INCLUDED_volk_8i_convert_16i_a16_H
+
+#include <inttypes.h>
+#include <stdio.h>
+
+#if LV_HAVE_SSE4_1
+#include <smmintrin.h>
+
+  /*!
+    \brief Widens 8 bit integers to 16 bit integers and scales them by 256 (aligned SSE4.1 version)
+    \param outputVector The 16 bit output data buffer
+    \param inputVector The 8 bit input data buffer
+    \param num_points The number of data values to be converted
+  */
+static inline void volk_8i_convert_16i_a16_sse4_1(int16_t* outputVector, const int8_t* inputVector, unsigned int num_points){
+    /* Sixteen int8 inputs fill one 128-bit load and produce two 128-bit stores of eight int16 each. */
+    const unsigned int blockCount = num_points / 16;
+    const unsigned int simdPoints = blockCount * 16;
+
+    const __m128i* src = (const __m128i*)inputVector;
+    __m128i* dst = (__m128i*)outputVector;
+    unsigned int idx;
+
+    for(idx = 0; idx < blockCount; idx++){
+      const __m128i packedBytes = _mm_load_si128(src);
+      /* Move bytes 8..15 down so the widening conversion below can reach them. */
+      const __m128i upperBytes = _mm_srli_si128(packedBytes, 8);
+
+      /* Sign-extend to 16 bits, then shift left by 8 (multiply by 256). */
+      const __m128i lowerWords = _mm_slli_epi16(_mm_cvtepi8_epi16(packedBytes), 8);
+      const __m128i upperWords = _mm_slli_epi16(_mm_cvtepi8_epi16(upperBytes), 8);
+
+      _mm_store_si128(dst, lowerWords);
+      _mm_store_si128(dst + 1, upperWords);
+
+      dst += 2;
+      src += 1;
+    }
+
+    /* Scalar tail for the last num_points % 16 values. */
+    for(idx = simdPoints; idx < num_points; idx++){
+      const int16_t widened = (int16_t)(inputVector[idx]);
+
+      outputVector[idx] = widened*256;
+    }
+}
+#endif /* LV_HAVE_SSE4_1 */
+
+#if LV_HAVE_GENERIC
+  /*!
+    \brief Widens 8 bit integers to 16 bit integers and scales them by 256 (portable C version)
+    \param outputVector The 16 bit output data buffer
+    \param inputVector The 8 bit input data buffer
+    \param num_points The number of data values to be converted
+  */
+static inline void volk_8i_convert_16i_a16_generic(int16_t* outputVector, const int8_t* inputVector, unsigned int num_points){
+  unsigned int idx;
+
+  for(idx = 0; idx < num_points; idx++){
+    /* Widen first, then scale by 256 so the byte lands in the upper half of the word. */
+    const int16_t widened = (int16_t)(inputVector[idx]);
+    outputVector[idx] = widened * 256;
+  }
+}
+#endif /* LV_HAVE_GENERIC */
+
+#if LV_HAVE_ORC
+  /*!
+    \brief Converts 8 bit integer data into 16 bit integer data by forwarding to volk_8i_convert_16i_a16_orc_impl
+    \param outputVector The 16 bit output data buffer
+    \param inputVector The 8 bit input data buffer
+    \param num_points The number of data values to be converted
+  */
+extern void volk_8i_convert_16i_a16_orc_impl(int16_t* outputVector, const int8_t* inputVector, unsigned int num_points);
+static inline void volk_8i_convert_16i_a16_orc(int16_t* outputVector, const int8_t* inputVector, unsigned int num_points){
+    volk_8i_convert_16i_a16_orc_impl(outputVector, inputVector, num_points);
+}
+#endif /* LV_HAVE_ORC */
+
+
+
+#endif /* INCLUDED_volk_8i_convert_16i_a16_H */
diff --git a/volk/include/volk/volk_8i_convert_16i_u.h b/volk/include/volk/volk_8i_convert_16i_u.h
new file mode 100644
index 000000000..bcff13406
--- /dev/null
+++ b/volk/include/volk/volk_8i_convert_16i_u.h
@@ -0,0 +1,73 @@
+#ifndef INCLUDED_volk_8i_convert_16i_u_H
+#define INCLUDED_volk_8i_convert_16i_u_H
+
+#include <inttypes.h>
+#include <stdio.h>
+
+#if LV_HAVE_SSE4_1
+#include <smmintrin.h>
+
+  /*!
+    \brief Widens 8 bit integers to 16 bit integers and scales them by 256 (unaligned SSE4.1 version)
+    \param outputVector The 16 bit output data buffer
+    \param inputVector The 8 bit input data buffer
+    \param num_points The number of data values to be converted
+    \note Input and output buffers do NOT need to be properly aligned
+  */
+static inline void volk_8i_convert_16i_u_sse4_1(int16_t* outputVector, const int8_t* inputVector, unsigned int num_points){
+    /* Sixteen int8 inputs fill one 128-bit load and produce two 128-bit stores of eight int16 each. */
+    const unsigned int blockCount = num_points / 16;
+    const unsigned int simdPoints = blockCount * 16;
+
+    const __m128i* src = (const __m128i*)inputVector;
+    __m128i* dst = (__m128i*)outputVector;
+    unsigned int idx;
+
+    for(idx = 0; idx < blockCount; idx++){
+      const __m128i packedBytes = _mm_loadu_si128(src);
+      /* Move bytes 8..15 down so the widening conversion below can reach them. */
+      const __m128i upperBytes = _mm_srli_si128(packedBytes, 8);
+
+      /* Sign-extend to 16 bits, then shift left by 8 (multiply by 256). */
+      const __m128i lowerWords = _mm_slli_epi16(_mm_cvtepi8_epi16(packedBytes), 8);
+      const __m128i upperWords = _mm_slli_epi16(_mm_cvtepi8_epi16(upperBytes), 8);
+
+      _mm_storeu_si128(dst, lowerWords);
+      _mm_storeu_si128(dst + 1, upperWords);
+
+      dst += 2;
+      src += 1;
+    }
+
+    /* Scalar tail for the last num_points % 16 values. */
+    for(idx = simdPoints; idx < num_points; idx++){
+      const int16_t widened = (int16_t)(inputVector[idx]);
+
+      outputVector[idx] = widened*256;
+    }
+}
+#endif /* LV_HAVE_SSE4_1 */
+
+#if LV_HAVE_GENERIC
+  /*!
+    \brief Widens 8 bit integers to 16 bit integers and scales them by 256 (portable C version)
+    \param outputVector The 16 bit output data buffer
+    \param inputVector The 8 bit input data buffer
+    \param num_points The number of data values to be converted
+    \note Input and output buffers do NOT need to be properly aligned
+  */
+static inline void volk_8i_convert_16i_u_generic(int16_t* outputVector, const int8_t* inputVector, unsigned int num_points){
+  unsigned int idx;
+
+  for(idx = 0; idx < num_points; idx++){
+    /* Widen first, then scale by 256 so the byte lands in the upper half of the word. */
+    const int16_t widened = (int16_t)(inputVector[idx]);
+    outputVector[idx] = widened * 256;
+  }
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+
+
+#endif /* INCLUDED_volk_8i_convert_16i_u_H */
diff --git a/volk/include/volk/volk_8i_s32f_convert_32f_a16.h b/volk/include/volk/volk_8i_s32f_convert_32f_a16.h
new file mode 100644
index 000000000..d5c8eeb51
--- /dev/null
+++ b/volk/include/volk/volk_8i_s32f_convert_32f_a16.h
@@ -0,0 +1,105 @@
+#ifndef INCLUDED_volk_8i_s32f_convert_32f_a16_H
+#define INCLUDED_volk_8i_s32f_convert_32f_a16_H
+
+#include <inttypes.h>
+#include <stdio.h>
+
+#if LV_HAVE_SSE4_1
+#include <smmintrin.h>
+
+  /*!
+    \brief Converts the input 8 bit integer data into floating point data, and divides each floating point output data point by the scalar value
+    \param outputVector The floating point output data buffer
+    \param inputVector The 8 bit input data buffer
+    \param scalar The value divided against each point in the output buffer
+    \param num_points The number of data values to be converted
+  */
+static inline void volk_8i_s32f_convert_32f_a16_sse4_1(float* outputVector, const int8_t* inputVector, const float scalar, unsigned int num_points){
+    unsigned int number = 0;
+    const unsigned int sixteenthPoints = num_points / 16;
+    /* The division is done once: every sample is multiplied by 1/scalar instead. */
+    float* outputVectorPtr = outputVector;
+    const float iScalar = 1.0 / scalar;
+    __m128 invScalar = _mm_set_ps1(iScalar);
+    const int8_t* inputVectorPtr = inputVector;
+    __m128 ret;
+    __m128i inputVal;
+    __m128i interimVal;
+
+    for(;number < sixteenthPoints; number++){
+      inputVal = _mm_load_si128((const __m128i*)inputVectorPtr);
+      /* Bytes 0..3: sign-extend to 32 bit, convert to float, scale, store. */
+      interimVal = _mm_cvtepi8_epi32(inputVal);
+      ret = _mm_cvtepi32_ps(interimVal);
+      ret = _mm_mul_ps(ret, invScalar);
+      _mm_store_ps(outputVectorPtr, ret);
+      outputVectorPtr += 4;
+      /* Bytes 4..7: shift the register right by four bytes, then repeat. */
+      inputVal = _mm_srli_si128(inputVal, 4);
+      interimVal = _mm_cvtepi8_epi32(inputVal);
+      ret = _mm_cvtepi32_ps(interimVal);
+      ret = _mm_mul_ps(ret, invScalar);
+      _mm_store_ps(outputVectorPtr, ret);
+      outputVectorPtr += 4;
+      /* Bytes 8..11. */
+      inputVal = _mm_srli_si128(inputVal, 4);
+      interimVal = _mm_cvtepi8_epi32(inputVal);
+      ret = _mm_cvtepi32_ps(interimVal);
+      ret = _mm_mul_ps(ret, invScalar);
+      _mm_store_ps(outputVectorPtr, ret);
+      outputVectorPtr += 4;
+      /* Bytes 12..15. */
+      inputVal = _mm_srli_si128(inputVal, 4);
+      interimVal = _mm_cvtepi8_epi32(inputVal);
+      ret = _mm_cvtepi32_ps(interimVal);
+      ret = _mm_mul_ps(ret, invScalar);
+      _mm_store_ps(outputVectorPtr, ret);
+      outputVectorPtr += 4;
+      /* One full 16-byte input block has been consumed. */
+      inputVectorPtr += 16;
+    }
+    /* Scalar tail for the last num_points % 16 values. */
+    number = sixteenthPoints * 16;
+    for(; number < num_points; number++){
+      outputVector[number] = (float)(inputVector[number]) * iScalar;
+    }
+}
+#endif /* LV_HAVE_SSE4_1 */
+
+#if LV_HAVE_GENERIC
+  /*!
+    \brief Converts 8 bit integer data into floating point data and divides each output value by the scalar (portable C version)
+    \param outputVector The floating point output data buffer
+    \param inputVector The 8 bit input data buffer
+    \param scalar The value divided against each point in the output buffer
+    \param num_points The number of data values to be converted
+  */
+static inline void volk_8i_s32f_convert_32f_a16_generic(float* outputVector, const int8_t* inputVector, const float scalar, unsigned int num_points){
+  /* One reciprocal up front turns the per-sample division into a multiplication. */
+  const float reciprocal = 1.0 / scalar;
+  unsigned int idx;
+
+  for(idx = 0; idx < num_points; idx++){
+    const float sample = (float)(inputVector[idx]);
+    outputVector[idx] = sample * reciprocal;
+  }
+}
+#endif /* LV_HAVE_GENERIC */
+
+#if LV_HAVE_ORC
+  /*!
+    \brief Converts 8 bit integer data into scaled floating point data by forwarding to volk_8i_s32f_convert_32f_a16_orc_impl
+    \param outputVector The floating point output data buffer
+    \param inputVector The 8 bit input data buffer
+    \param scalar The value divided against each point in the output buffer
+    \param num_points The number of data values to be converted
+  */
+extern void volk_8i_s32f_convert_32f_a16_orc_impl(float* outputVector, const int8_t* inputVector, const float scalar, unsigned int num_points);
+static inline void volk_8i_s32f_convert_32f_a16_orc(float* outputVector, const int8_t* inputVector, const float scalar, unsigned int num_points){
+    volk_8i_s32f_convert_32f_a16_orc_impl(outputVector, inputVector, scalar, num_points);
+}
+#endif /* LV_HAVE_ORC */
+
+
+
+#endif /* INCLUDED_volk_8i_s32f_convert_32f_a16_H */
diff --git a/volk/include/volk/volk_8i_s32f_convert_32f_u.h b/volk/include/volk/volk_8i_s32f_convert_32f_u.h
new file mode 100644
index 000000000..1e30957e8
--- /dev/null
+++ b/volk/include/volk/volk_8i_s32f_convert_32f_u.h
@@ -0,0 +1,94 @@
+#ifndef INCLUDED_volk_8i_s32f_convert_32f_u_H
+#define INCLUDED_volk_8i_s32f_convert_32f_u_H
+
+#include <inttypes.h>
+#include <stdio.h>
+
+#if LV_HAVE_SSE4_1
+#include <smmintrin.h>
+
+  /*!
+    \brief Converts the input 8 bit integer data into floating point data, and divides each floating point output data point by the scalar value
+    \param outputVector The floating point output data buffer
+    \param inputVector The 8 bit input data buffer
+    \param scalar The value divided against each point in the output buffer
+    \param num_points The number of data values to be converted
+    \note Input and output buffers do NOT need to be properly aligned
+  */
+static inline void volk_8i_s32f_convert_32f_u_sse4_1(float* outputVector, const int8_t* inputVector, const float scalar, unsigned int num_points){
+    unsigned int number = 0;
+    const unsigned int sixteenthPoints = num_points / 16;
+    /* The division is done once: every sample is multiplied by 1/scalar instead. */
+    float* outputVectorPtr = outputVector;
+    const float iScalar = 1.0 / scalar;
+    __m128 invScalar = _mm_set_ps1( iScalar );
+    const int8_t* inputVectorPtr = inputVector;
+    __m128 ret;
+    __m128i inputVal;
+    __m128i interimVal;
+
+    for(;number < sixteenthPoints; number++){
+      inputVal = _mm_loadu_si128((const __m128i*)inputVectorPtr);
+      /* Bytes 0..3: sign-extend to 32 bit, convert to float, scale, store. */
+      interimVal = _mm_cvtepi8_epi32(inputVal);
+      ret = _mm_cvtepi32_ps(interimVal);
+      ret = _mm_mul_ps(ret, invScalar);
+      _mm_storeu_ps(outputVectorPtr, ret);
+      outputVectorPtr += 4;
+      /* Bytes 4..7: shift the register right by four bytes, then repeat. */
+      inputVal = _mm_srli_si128(inputVal, 4);
+      interimVal = _mm_cvtepi8_epi32(inputVal);
+      ret = _mm_cvtepi32_ps(interimVal);
+      ret = _mm_mul_ps(ret, invScalar);
+      _mm_storeu_ps(outputVectorPtr, ret);
+      outputVectorPtr += 4;
+      /* Bytes 8..11. */
+      inputVal = _mm_srli_si128(inputVal, 4);
+      interimVal = _mm_cvtepi8_epi32(inputVal);
+      ret = _mm_cvtepi32_ps(interimVal);
+      ret = _mm_mul_ps(ret, invScalar);
+      _mm_storeu_ps(outputVectorPtr, ret);
+      outputVectorPtr += 4;
+      /* Bytes 12..15. */
+      inputVal = _mm_srli_si128(inputVal, 4);
+      interimVal = _mm_cvtepi8_epi32(inputVal);
+      ret = _mm_cvtepi32_ps(interimVal);
+      ret = _mm_mul_ps(ret, invScalar);
+      _mm_storeu_ps(outputVectorPtr, ret);
+      outputVectorPtr += 4;
+      /* One full 16-byte input block has been consumed. */
+      inputVectorPtr += 16;
+    }
+    /* Scalar tail for the last num_points % 16 values. */
+    number = sixteenthPoints * 16;
+    for(; number < num_points; number++){
+      outputVector[number] = (float)(inputVector[number]) * iScalar;
+    }
+}
+#endif /* LV_HAVE_SSE4_1 */
+
+#if LV_HAVE_GENERIC
+  /*!
+    \brief Converts 8 bit integer data into floating point data and divides each output value by the scalar (portable C version)
+    \param outputVector The floating point output data buffer
+    \param inputVector The 8 bit input data buffer
+    \param scalar The value divided against each point in the output buffer
+    \param num_points The number of data values to be converted
+    \note Output buffer does NOT need to be properly aligned
+  */
+static inline void volk_8i_s32f_convert_32f_u_generic(float* outputVector, const int8_t* inputVector, const float scalar, unsigned int num_points){
+  /* One reciprocal up front turns the per-sample division into a multiplication. */
+  const float reciprocal = 1.0 / scalar;
+  unsigned int idx;
+
+  for(idx = 0; idx < num_points; idx++){
+    const float sample = (float)(inputVector[idx]);
+    outputVector[idx] = sample * reciprocal;
+  }
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+
+
+#endif /* INCLUDED_volk_8i_s32f_convert_32f_u_H */
diff --git a/volk/include/volk/volk_8ic_deinterleave_16i_x2_a16.h b/volk/include/volk/volk_8ic_deinterleave_16i_x2_a16.h
new file mode 100644
index 000000000..91c9b2c58
--- /dev/null
+++ b/volk/include/volk/volk_8ic_deinterleave_16i_x2_a16.h
@@ -0,0 +1,77 @@
+#ifndef INCLUDED_volk_8ic_deinterleave_16i_x2_a16_H
+#define INCLUDED_volk_8ic_deinterleave_16i_x2_a16_H
+
+#include <inttypes.h>
+#include <stdio.h>
+
+#if LV_HAVE_SSE4_1
+#include <smmintrin.h>
+/*!
+  \brief Deinterleaves the complex 8 bit vector into I & Q 16 bit vector data, scaling each value by 256
+  \param iBuffer The I buffer output data
+  \param qBuffer The Q buffer output data
+  \param complexVector The complex input vector
+  \param num_points The number of complex data values to be deinterleaved
+*/
+static inline void volk_8ic_deinterleave_16i_x2_a16_sse4_1(int16_t* iBuffer, int16_t* qBuffer, const lv_8sc_t* complexVector, unsigned int num_points){
+  unsigned int number = 0;
+  const int8_t* complexVectorPtr = (const int8_t*)complexVector;
+  int16_t* iBufferPtr = iBuffer;
+  int16_t* qBufferPtr = qBuffer;
+  __m128i iMoveMask = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 14, 12, 10, 8, 6, 4, 2, 0);
+  __m128i qMoveMask = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 15, 13, 11, 9, 7, 5, 3, 1);
+  __m128i complexVal, iOutputVal, qOutputVal;
+  /* One 16-byte load holds eight interleaved I/Q byte pairs. */
+  unsigned int eighthPoints = num_points / 8;
+
+  for(number = 0; number < eighthPoints; number++){
+    complexVal = _mm_load_si128((const __m128i*)complexVectorPtr);  complexVectorPtr += 16;
+    /* Gather even bytes (I) and odd bytes (Q) into the low 8 bytes; 0x80 selectors zero the rest. */
+    iOutputVal = _mm_shuffle_epi8(complexVal, iMoveMask);
+    qOutputVal = _mm_shuffle_epi8(complexVal, qMoveMask);
+    /* Sign-extend each byte to 16 bit, then shift left by 8 (multiply by 256). */
+    iOutputVal = _mm_cvtepi8_epi16(iOutputVal);
+    iOutputVal = _mm_slli_epi16(iOutputVal, 8);
+
+    qOutputVal = _mm_cvtepi8_epi16(qOutputVal);
+    qOutputVal = _mm_slli_epi16(qOutputVal, 8);
+
+    _mm_store_si128((__m128i*)iBufferPtr, iOutputVal);
+    _mm_store_si128((__m128i*)qBufferPtr, qOutputVal);
+
+    iBufferPtr += 8;
+    qBufferPtr += 8;
+  }
+  /* Scalar tail for the last num_points % 8 complex values. */
+  number = eighthPoints * 8;
+  for(; number < num_points; number++){
+    *iBufferPtr++ = ((int16_t)*complexVectorPtr++) * 256;
+    *qBufferPtr++ = ((int16_t)*complexVectorPtr++) * 256;
+  }
+}
+#endif /* LV_HAVE_SSE4_1 */
+
+#if LV_HAVE_GENERIC
+/*!
+  \brief Deinterleaves the complex 8 bit vector into I & Q 16 bit vector data, scaling each value by 256
+  \param iBuffer The I buffer output data
+  \param qBuffer The Q buffer output data
+  \param complexVector The complex input vector
+  \param num_points The number of complex data values to be deinterleaved
+*/
+static inline void volk_8ic_deinterleave_16i_x2_a16_generic(int16_t* iBuffer, int16_t* qBuffer, const lv_8sc_t* complexVector, unsigned int num_points){
+  /* View the complex samples as a flat byte stream laid out I, Q, I, Q, ... */
+  const int8_t* interleaved = (const int8_t*)complexVector;
+  unsigned int idx;
+
+  for(idx = 0; idx < num_points; idx++, interleaved += 2){
+    iBuffer[idx] = (int16_t)(interleaved[0])*256;
+    qBuffer[idx] = (int16_t)(interleaved[1])*256;
+  }
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+
+
+#endif /* INCLUDED_volk_8ic_deinterleave_16i_x2_a16_H */
diff --git a/volk/include/volk/volk_8ic_deinterleave_real_16i_a16.h b/volk/include/volk/volk_8ic_deinterleave_real_16i_a16.h
new file mode 100644
index 000000000..bf3dc20dd
--- /dev/null
+++ b/volk/include/volk/volk_8ic_deinterleave_real_16i_a16.h
@@ -0,0 +1,66 @@
+#ifndef INCLUDED_volk_8ic_deinterleave_real_16i_a16_H
+#define INCLUDED_volk_8ic_deinterleave_real_16i_a16_H
+
+#include <inttypes.h>
+#include <stdio.h>
+
+#if LV_HAVE_SSE4_1
+#include <smmintrin.h>
+/*!
+  \brief Deinterleaves the complex 8 bit vector into I 16 bit vector data, scaling each value by 128
+  \param iBuffer The I buffer output data
+  \param complexVector The complex input vector
+  \param num_points The number of complex data values to be deinterleaved
+*/
+static inline void volk_8ic_deinterleave_real_16i_a16_sse4_1(int16_t* iBuffer, const lv_8sc_t* complexVector, unsigned int num_points){
+  unsigned int number = 0;
+  const int8_t* complexVectorPtr = (const int8_t*)complexVector;
+  int16_t* iBufferPtr = iBuffer;
+  __m128i moveMask = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 14, 12, 10, 8, 6, 4, 2, 0);
+  __m128i complexVal, outputVal;
+  /* One 16-byte load holds eight complex samples; only their I bytes are kept. */
+  unsigned int eighthPoints = num_points / 8;
+
+  for(number = 0; number < eighthPoints; number++){
+    complexVal = _mm_load_si128((const __m128i*)complexVectorPtr);  complexVectorPtr += 16;
+    /* Gather the even (I) bytes into the low 8 bytes; 0x80 selectors zero the upper 8. */
+    complexVal = _mm_shuffle_epi8(complexVal, moveMask);
+    /* Sign-extend to 16 bit, then shift left by 7 (multiply by 128) like the scalar tail. */
+    outputVal = _mm_cvtepi8_epi16(complexVal);
+    outputVal = _mm_slli_epi16(outputVal, 7);
+    /* NOTE(review): volk_8ic_deinterleave_16i_x2_a16 scales by 256 - confirm 128 is intended here. */
+    _mm_store_si128((__m128i*)iBufferPtr, outputVal);
+    iBufferPtr += 8;
+  }
+  /* Scalar tail for the last num_points % 8 complex values; the Q byte is skipped. */
+  number = eighthPoints * 8;
+  for(; number < num_points; number++){
+    *iBufferPtr++ = ((int16_t)*complexVectorPtr++) * 128;
+    complexVectorPtr++;
+  }
+}
+#endif /* LV_HAVE_SSE4_1 */
+
+
+#if LV_HAVE_GENERIC
+/*!
+  \brief Deinterleaves the complex 8 bit vector into I 16 bit vector data, scaling each value by 128
+  \param iBuffer The I buffer output data
+  \param complexVector The complex input vector
+  \param num_points The number of complex data values to be deinterleaved
+*/
+static inline void volk_8ic_deinterleave_real_16i_a16_generic(int16_t* iBuffer, const lv_8sc_t* complexVector, unsigned int num_points){
+  /* Only the first (I) byte of every two-byte complex sample is used; the second is skipped. */
+  const int8_t* interleaved = (const int8_t*)complexVector;
+  unsigned int idx;
+
+  for(idx = 0; idx < num_points; idx++, interleaved += 2){
+    iBuffer[idx] = ((int16_t)(interleaved[0])) * 128;
+  }
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+
+
+#endif /* INCLUDED_volk_8ic_deinterleave_real_16i_a16_H */
diff --git a/volk/include/volk/volk_8ic_deinterleave_real_8i_a16.h b/volk/include/volk/volk_8ic_deinterleave_real_8i_a16.h
new file mode 100644
index 000000000..13de79423
--- /dev/null
+++ b/volk/include/volk/volk_8ic_deinterleave_real_8i_a16.h
@@ -0,0 +1,67 @@
+#ifndef INCLUDED_VOLK_8sc_DEINTERLEAVE_REAL_8s_ALIGNED8_H
+#define INCLUDED_VOLK_8sc_DEINTERLEAVE_REAL_8s_ALIGNED8_H
+
+#include <inttypes.h>
+#include <stdio.h>
+
+#if LV_HAVE_SSSE3
+#include <tmmintrin.h>
+/*!
+  \brief Deinterleaves the complex 8 bit vector into I vector data
+  \param complexVector The complex input vector
+  \param iBuffer The I buffer output data
+  \param num_points The number of complex data values to be deinterleaved
+*/
+static inline void volk_8ic_deinterleave_real_8i_a16_ssse3(int8_t* iBuffer, const lv_8sc_t* complexVector, unsigned int num_points){
+  unsigned int number = 0;
+  const int8_t* complexVectorPtr = (int8_t*)complexVector;
+  int8_t* iBufferPtr = iBuffer;
+  __m128i moveMask1 = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 14, 12, 10, 8, 6, 4, 2, 0);
+  __m128i moveMask2 = _mm_set_epi8(14, 12, 10, 8, 6, 4, 2, 0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80);
+  __m128i complexVal1, complexVal2, outputVal;
+  // moveMask1 packs the even-offset bytes into the low half, moveMask2 into the high half; 0x80 zeroes a byte
+  unsigned int sixteenthPoints = num_points / 16;
+  // Vector loop: every pass reads 32 input bytes (two aligned loads) and writes 16 output bytes
+  for(number = 0; number < sixteenthPoints; number++){
+    complexVal1 = _mm_load_si128((__m128i*)complexVectorPtr);  complexVectorPtr += 16;
+    complexVal2 = _mm_load_si128((__m128i*)complexVectorPtr);  complexVectorPtr += 16;
+
+    complexVal1 = _mm_shuffle_epi8(complexVal1, moveMask1);
+    complexVal2 = _mm_shuffle_epi8(complexVal2, moveMask2);
+    // The two shuffled vectors occupy disjoint halves, so OR merges them into one 16-byte result
+    outputVal = _mm_or_si128(complexVal1, complexVal2);
+
+    _mm_store_si128((__m128i*)iBufferPtr, outputVal);
+    iBufferPtr += 16;
+  }
+  // Scalar tail: handles the num_points % 16 points the vector loop did not cover
+  number = sixteenthPoints * 16;
+  for(; number < num_points; number++){
+    *iBufferPtr++ = *complexVectorPtr++;
+    complexVectorPtr++;  // skip the second byte of the pair
+  }
+}
+#endif /* LV_HAVE_SSSE3 */
+
+#if LV_HAVE_GENERIC
+/*!
+  \brief Deinterleaves the complex 8 bit vector into I vector data
+  \param complexVector The complex input vector
+  \param iBuffer The I buffer output data
+  \param num_points The number of complex data values to be deinterleaved
+*/
+static inline void volk_8ic_deinterleave_real_8i_a16_generic(int8_t* iBuffer, const lv_8sc_t* complexVector, unsigned int num_points){
+  const int8_t* src = (const int8_t*)complexVector;  // input walked as byte pairs
+  int8_t* dst = iBuffer;
+  unsigned int remaining = num_points;
+  while(remaining-- > 0){
+    *dst++ = src[0];  // copy the first byte of the pair unchanged
+    src += 2;         // the second byte of the pair is not used
+  }
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+
+
+#endif /* INCLUDED_VOLK_8sc_DEINTERLEAVE_REAL_8s_ALIGNED8_H */
diff --git a/volk/include/volk/volk_8ic_s32f_deinterleave_32f_x2_a16.h b/volk/include/volk/volk_8ic_s32f_deinterleave_32f_x2_a16.h
new file mode 100644
index 000000000..22c3ebb23
--- /dev/null
+++ b/volk/include/volk/volk_8ic_s32f_deinterleave_32f_x2_a16.h
@@ -0,0 +1,164 @@
+#ifndef INCLUDED_volk_8ic_s32f_deinterleave_32f_x2_a16_H
+#define INCLUDED_volk_8ic_s32f_deinterleave_32f_x2_a16_H
+
+#include <inttypes.h>
+#include <stdio.h>
+
+#if LV_HAVE_SSE4_1
+#include <smmintrin.h>
+/*!
+  \brief Deinterleaves the complex 8 bit vector into I & Q floating point vector data
+  \param complexVector The complex input vector
+  \param iBuffer The I buffer output data
+  \param qBuffer The Q buffer output data
+  \param scalar The scaling value that each data point is divided by
+  \param num_points The number of complex data values to be deinterleaved
+*/
+static inline void volk_8ic_s32f_deinterleave_32f_x2_a16_sse4_1(float* iBuffer, float* qBuffer, const lv_8sc_t* complexVector, const float scalar, unsigned int num_points){
+  float* iBufferPtr = iBuffer;
+  float* qBufferPtr = qBuffer;
+
+  unsigned int number = 0;
+  const unsigned int eighthPoints = num_points / 8;
+  __m128 iFloatValue, qFloatValue;
+  // Scaling is applied as a multiplication by the reciprocal of scalar
+  const float iScalar= 1.0 / scalar;
+  __m128 invScalar = _mm_set_ps1(iScalar);
+  __m128i complexVal, iIntVal, qIntVal, iComplexVal, qComplexVal;
+  int8_t* complexVectorPtr = (int8_t*)complexVector;
+  // iMoveMask packs the even-offset bytes, qMoveMask the odd-offset bytes, into the low 8 bytes (0x80 zeroes a byte)
+  __m128i iMoveMask = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 14, 12, 10, 8, 6, 4, 2, 0);
+  __m128i qMoveMask = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 15, 13, 11, 9, 7, 5, 3, 1);
+  // Vector loop: every pass reads 16 input bytes and writes 8 floats to each output buffer (aligned load/store)
+  for(;number < eighthPoints; number++){
+    complexVal = _mm_load_si128((__m128i*)complexVectorPtr); complexVectorPtr += 16;
+    iComplexVal = _mm_shuffle_epi8(complexVal, iMoveMask);
+    qComplexVal = _mm_shuffle_epi8(complexVal, qMoveMask);
+    // First four I values: sign-extend the low 4 bytes to 32 bits, convert to float, scale, store
+    iIntVal = _mm_cvtepi8_epi32(iComplexVal);
+    iFloatValue = _mm_cvtepi32_ps(iIntVal);
+    iFloatValue = _mm_mul_ps(iFloatValue, invScalar);
+    _mm_store_ps(iBufferPtr, iFloatValue);
+    iBufferPtr += 4;
+    // Shift the next four I bytes down into the low 4 bytes and repeat
+    iComplexVal = _mm_srli_si128(iComplexVal, 4);
+
+    iIntVal = _mm_cvtepi8_epi32(iComplexVal);
+    iFloatValue = _mm_cvtepi32_ps(iIntVal);
+    iFloatValue = _mm_mul_ps(iFloatValue, invScalar);
+    _mm_store_ps(iBufferPtr, iFloatValue);
+    iBufferPtr += 4;
+    // Same two-step conversion for the eight Q bytes
+    qIntVal = _mm_cvtepi8_epi32(qComplexVal);
+    qFloatValue = _mm_cvtepi32_ps(qIntVal);
+    qFloatValue = _mm_mul_ps(qFloatValue, invScalar);
+    _mm_store_ps(qBufferPtr, qFloatValue);
+    qBufferPtr += 4;
+
+    qComplexVal = _mm_srli_si128(qComplexVal, 4);
+
+    qIntVal = _mm_cvtepi8_epi32(qComplexVal);
+    qFloatValue = _mm_cvtepi32_ps(qIntVal);
+    qFloatValue = _mm_mul_ps(qFloatValue, invScalar);
+    _mm_store_ps(qBufferPtr, qFloatValue);
+
+    qBufferPtr += 4;
+  }
+  // Scalar tail: handles the num_points % 8 points the vector loop did not cover
+  number = eighthPoints * 8;
+  for(; number < num_points; number++){
+    *iBufferPtr++ = (float)(*complexVectorPtr++) * iScalar;
+    *qBufferPtr++ = (float)(*complexVectorPtr++) * iScalar;
+  }
+
+}
+#endif /* LV_HAVE_SSE4_1 */
+
+#if LV_HAVE_SSE
+#include <xmmintrin.h>
+/*!
+  \brief Deinterleaves the complex 8 bit vector into I & Q floating point vector data
+  \param complexVector The complex input vector
+  \param iBuffer The I buffer output data
+  \param qBuffer The Q buffer output data
+  \param scalar The scaling value that each data point is divided by
+  \param num_points The number of complex data values to be deinterleaved
+*/
+static inline void volk_8ic_s32f_deinterleave_32f_x2_a16_sse(float* iBuffer, float* qBuffer, const lv_8sc_t* complexVector, const float scalar, unsigned int num_points){
+  float* iBufferPtr = iBuffer;
+  float* qBufferPtr = qBuffer;
+
+  unsigned int number = 0;
+  const unsigned int quarterPoints = num_points / 4;
+  __m128 cplxValue1, cplxValue2, iValue, qValue;
+
+  __m128 invScalar = _mm_set_ps1(1.0/scalar);
+  int8_t* complexVectorPtr = (int8_t*)complexVector;
+  // Scratch buffer: bytes are converted to float in scalar code, then loaded as two 4-float vectors
+  float floatBuffer[8] __attribute__((aligned(128)));
+  // Vector loop: every pass handles 8 input bytes and writes 4 floats to each output buffer
+  for(;number < quarterPoints; number++){
+    floatBuffer[0] = (float)(complexVectorPtr[0]);
+    floatBuffer[1] = (float)(complexVectorPtr[1]);
+    floatBuffer[2] = (float)(complexVectorPtr[2]);
+    floatBuffer[3] = (float)(complexVectorPtr[3]);
+
+    floatBuffer[4] = (float)(complexVectorPtr[4]);
+    floatBuffer[5] = (float)(complexVectorPtr[5]);
+    floatBuffer[6] = (float)(complexVectorPtr[6]);
+    floatBuffer[7] = (float)(complexVectorPtr[7]);
+
+    cplxValue1 = _mm_load_ps(&floatBuffer[0]);
+    cplxValue2 = _mm_load_ps(&floatBuffer[4]);
+
+    complexVectorPtr += 8;
+
+    cplxValue1 = _mm_mul_ps(cplxValue1, invScalar);
+    cplxValue2 = _mm_mul_ps(cplxValue2, invScalar);
+
+    // Arrange in i1i2i3i4 format
+    iValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2,0,2,0));
+    qValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(3,1,3,1));
+
+    _mm_store_ps(iBufferPtr, iValue);
+    _mm_store_ps(qBufferPtr, qValue);
+
+    iBufferPtr += 4;
+    qBufferPtr += 4;
+  }
+  // Scalar tail for the remaining num_points % 4 points; it divides by scalar instead of multiplying by 1/scalar
+  number = quarterPoints * 4;
+  complexVectorPtr = (int8_t*)&complexVector[number];
+  for(; number < num_points; number++){
+    *iBufferPtr++ = (float)(*complexVectorPtr++) / scalar;
+    *qBufferPtr++ = (float)(*complexVectorPtr++) / scalar;
+  }
+}
+#endif /* LV_HAVE_SSE */
+
+#if LV_HAVE_GENERIC
+/*!
+  \brief Deinterleaves the complex 8 bit vector into I & Q floating point vector data
+  \param complexVector The complex input vector
+  \param iBuffer The I buffer output data
+  \param qBuffer The Q buffer output data
+  \param scalar The scaling value that each data point is divided by
+  \param num_points The number of complex data values to be deinterleaved
+*/
+static inline void volk_8ic_s32f_deinterleave_32f_x2_a16_generic(float* iBuffer, float* qBuffer, const lv_8sc_t* complexVector, const float scalar, unsigned int num_points){
+  // One division up front; every point is then scaled with a multiplication
+  const float reciprocal = 1.0 / scalar;
+  const int8_t* src = (const int8_t*)complexVector;
+  unsigned int idx;
+  for(idx = 0; idx < num_points; idx++){
+    iBuffer[idx] = (float)src[0] * reciprocal;  // first byte of the pair goes to iBuffer
+    qBuffer[idx] = (float)src[1] * reciprocal;  // second byte of the pair goes to qBuffer
+    src += 2;
+  }
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+
+
+#endif /* INCLUDED_volk_8ic_s32f_deinterleave_32f_x2_a16_H */
diff --git a/volk/include/volk/volk_8ic_s32f_deinterleave_real_32f_a16.h b/volk/include/volk/volk_8ic_s32f_deinterleave_real_32f_a16.h
new file mode 100644
index 000000000..5f1430394
--- /dev/null
+++ b/volk/include/volk/volk_8ic_s32f_deinterleave_real_32f_a16.h
@@ -0,0 +1,133 @@
+#ifndef INCLUDED_volk_8ic_s32f_deinterleave_real_32f_a16_H
+#define INCLUDED_volk_8ic_s32f_deinterleave_real_32f_a16_H
+
+#include <inttypes.h>
+#include <stdio.h>
+
+#if LV_HAVE_SSE4_1
+#include <smmintrin.h>
+/*!
+  \brief Deinterleaves the complex 8 bit vector into I float vector data
+  \param complexVector The complex input vector
+  \param iBuffer The I buffer output data
+  \param scalar The scaling value that each data point is divided by
+  \param num_points The number of complex data values to be deinterleaved
+*/
+static inline void volk_8ic_s32f_deinterleave_real_32f_a16_sse4_1(float* iBuffer, const lv_8sc_t* complexVector, const float scalar, unsigned int num_points){
+  float* iBufferPtr = iBuffer;
+
+  unsigned int number = 0;
+  const unsigned int eighthPoints = num_points / 8;
+  __m128 iFloatValue;
+  // Scaling is applied as a multiplication by the reciprocal of scalar
+  const float iScalar= 1.0 / scalar;
+  __m128 invScalar = _mm_set_ps1(iScalar);
+  __m128i complexVal, iIntVal;
+  int8_t* complexVectorPtr = (int8_t*)complexVector;
+  // moveMask packs the even-offset bytes into the low 8 bytes; the 0x80 entries zero the high 8 bytes
+  __m128i moveMask = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 14, 12, 10, 8, 6, 4, 2, 0);
+  // Vector loop: every pass reads 16 input bytes and writes 8 floats (aligned load/store)
+  for(;number < eighthPoints; number++){
+    complexVal = _mm_load_si128((__m128i*)complexVectorPtr); complexVectorPtr += 16;
+    complexVal = _mm_shuffle_epi8(complexVal, moveMask);
+    // First four values: sign-extend the low 4 bytes to 32 bits, convert to float, scale, store
+    iIntVal = _mm_cvtepi8_epi32(complexVal);
+    iFloatValue = _mm_cvtepi32_ps(iIntVal);
+
+    iFloatValue = _mm_mul_ps(iFloatValue, invScalar);
+
+    _mm_store_ps(iBufferPtr, iFloatValue);
+
+    iBufferPtr += 4;
+    // Shift the next four selected bytes down into the low 4 bytes and repeat
+    complexVal = _mm_srli_si128(complexVal, 4);
+    iIntVal = _mm_cvtepi8_epi32(complexVal);
+    iFloatValue = _mm_cvtepi32_ps(iIntVal);
+
+    iFloatValue = _mm_mul_ps(iFloatValue, invScalar);
+
+    _mm_store_ps(iBufferPtr, iFloatValue);
+
+    iBufferPtr += 4;
+  }
+  // Scalar tail: handles the num_points % 8 points the vector loop did not cover
+  number = eighthPoints * 8;
+  for(; number < num_points; number++){
+    *iBufferPtr++ = (float)(*complexVectorPtr++) * iScalar;
+    complexVectorPtr++;  // skip the second byte of the pair
+  }
+
+}
+#endif /* LV_HAVE_SSE4_1 */
+
+
+#if LV_HAVE_SSE
+#include <xmmintrin.h>
+/*!
+  \brief Deinterleaves the complex 8 bit vector into I float vector data
+  \param complexVector The complex input vector
+  \param iBuffer The I buffer output data
+  \param scalar The scaling value that each data point is divided by
+  \param num_points The number of complex data values to be deinterleaved
+*/
+static inline void volk_8ic_s32f_deinterleave_real_32f_a16_sse(float* iBuffer, const lv_8sc_t* complexVector, const float scalar, unsigned int num_points){
+  float* iBufferPtr = iBuffer;
+
+  unsigned int number = 0;
+  const unsigned int quarterPoints = num_points / 4;
+  __m128 iValue;
+  // Scaling is applied as a multiplication by the reciprocal of scalar
+  const float iScalar= 1.0 / scalar;
+  __m128 invScalar = _mm_set_ps1(iScalar);
+  int8_t* complexVectorPtr = (int8_t*)complexVector;
+  // Scratch buffer: bytes are converted to float in scalar code, then loaded as one 4-float vector
+  float floatBuffer[4] __attribute__((aligned(128)));
+  // Vector loop: every pass takes the first byte of 4 consecutive pairs and writes 4 floats
+  for(;number < quarterPoints; number++){
+    floatBuffer[0] = (float)(*complexVectorPtr); complexVectorPtr += 2;
+    floatBuffer[1] = (float)(*complexVectorPtr); complexVectorPtr += 2;
+    floatBuffer[2] = (float)(*complexVectorPtr); complexVectorPtr += 2;
+    floatBuffer[3] = (float)(*complexVectorPtr); complexVectorPtr += 2;
+
+    iValue = _mm_load_ps(floatBuffer);
+
+    iValue = _mm_mul_ps(iValue, invScalar);
+
+    _mm_store_ps(iBufferPtr, iValue);
+
+    iBufferPtr += 4;
+  }
+  // Scalar tail: handles the num_points % 4 points the vector loop did not cover
+  number = quarterPoints * 4;
+  for(; number < num_points; number++){
+    *iBufferPtr++ = (float)(*complexVectorPtr++) * iScalar;
+    complexVectorPtr++;  // skip the second byte of the pair
+  }
+
+}
+#endif /* LV_HAVE_SSE */
+
+#if LV_HAVE_GENERIC
+/*!
+  \brief Deinterleaves the complex 8 bit vector into I float vector data
+  \param complexVector The complex input vector
+  \param iBuffer The I buffer output data
+  \param scalar The scaling value that each data point is divided by
+  \param num_points The number of complex data values to be deinterleaved
+*/
+static inline void volk_8ic_s32f_deinterleave_real_32f_a16_generic(float* iBuffer, const lv_8sc_t* complexVector, const float scalar, unsigned int num_points){
+  const float reciprocal = 1.0 / scalar;  // divide once, multiply per point
+  const int8_t* src = (const int8_t*)complexVector;
+  float* dst = iBuffer;
+  unsigned int remaining = num_points;
+  while(remaining-- > 0){
+    *dst++ = ((float)src[0]) * reciprocal;  // first byte of the pair, converted and scaled
+    src += 2;                               // the second byte of the pair is not used
+  }
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+
+
+#endif /* INCLUDED_volk_8ic_s32f_deinterleave_real_32f_a16_H */
diff --git a/volk/include/volk/volk_8ic_x2_multiply_conjugate_16ic_a16.h b/volk/include/volk/volk_8ic_x2_multiply_conjugate_16ic_a16.h
new file mode 100644
index 000000000..d9cacbf46
--- /dev/null
+++ b/volk/include/volk/volk_8ic_x2_multiply_conjugate_16ic_a16.h
@@ -0,0 +1,102 @@
+#ifndef INCLUDED_volk_8ic_x2_multiply_conjugate_16ic_a16_H
+#define INCLUDED_volk_8ic_x2_multiply_conjugate_16ic_a16_H
+
+#include <inttypes.h>
+#include <stdio.h>
+#include <volk/volk_complex.h>
+
+#if LV_HAVE_SSE4_1
+#include <smmintrin.h>
+/*!
+  \brief Multiplies one complex vector by the complex conjugate of a second complex vector and stores the results in a third vector
+  \param cVector The complex vector where the results will be stored
+  \param aVector One of the complex vectors to be multiplied
+  \param bVector The complex vector which will be converted to complex conjugate and multiplied
+  \param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector
+*/
+static inline void volk_8ic_x2_multiply_conjugate_16ic_a16_sse4_1(lv_16sc_t* cVector, const lv_8sc_t* aVector, const lv_8sc_t* bVector, unsigned int num_points){
+  unsigned int number = 0;
+  const unsigned int quarterPoints = num_points / 4;
+
+  __m128i x, y, realz, imagz;
+  lv_16sc_t* c = cVector;
+  const lv_8sc_t* a = aVector;
+  const lv_8sc_t* b = bVector;
+  __m128i conjugateSign = _mm_set_epi16(-1, 1, -1, 1, -1, 1, -1, 1);
+  const int shuffleMask = _MM_SHUFFLE(2,3,0,1);
+  // Vector loop: 4 complex points per pass; the output store is an aligned 16-byte store
+  for(;number < quarterPoints; number++){
+    // Load 8 bytes (4 complex points) with an SSE2 load, avoiding __m64/MMX state, and sign-extend to 16 bits
+    x = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*)a));
+    y = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*)b));
+
+    // Real part of a * conj(b): ar*br + ai*bi (pairwise multiply-add)
+    realz = _mm_madd_epi16(x,y);
+
+    // Calculate the complex conjugate of the cr + ci j values
+    y = _mm_sign_epi16(y, conjugateSign);
+
+    // Shift the order of the cr and ci values
+    y = _mm_shufflehi_epi16(_mm_shufflelo_epi16(y, shuffleMask ), shuffleMask);
+
+    // Imaginary part of a * conj(b): ar*(-bi) + ai*br
+    imagz = _mm_madd_epi16(x,y);
+    // Interleave real/imag 32-bit results and pack them to 16 bits with signed saturation
+    _mm_store_si128((__m128i*)c, _mm_packs_epi32(_mm_unpacklo_epi32(realz, imagz), _mm_unpackhi_epi32(realz, imagz)));
+
+    a += 4;
+    b += 4;
+    c += 4;
+  }
+  // Scalar tail: handles the num_points % 4 points the vector loop did not cover
+  number = quarterPoints * 4;
+  int16_t* c16Ptr = (int16_t*)&cVector[number];
+  int8_t* a8Ptr = (int8_t*)&aVector[number];
+  int8_t* b8Ptr = (int8_t*)&bVector[number];
+  for(; number < num_points; number++){
+    float aReal =  (float)*a8Ptr++;
+    float aImag =  (float)*a8Ptr++;
+    lv_32fc_t aVal = lv_32fc_init(aReal, aImag );
+    float bReal = (float)*b8Ptr++;
+    float bImag = (float)*b8Ptr++;
+    lv_32fc_t bVal = lv_32fc_init( bReal, -bImag );  // conjugate of b
+    lv_32fc_t temp = aVal * bVal;
+
+    *c16Ptr++ = (int16_t)lv_creal(temp);
+    *c16Ptr++ = (int16_t)lv_cimag(temp);
+  }
+}
+#endif /* LV_HAVE_SSE4_1 */
+
+#if LV_HAVE_GENERIC
+/*!
+  \brief Multiplies one complex vector by the complex conjugate of a second complex vector and stores the results in a third vector
+  \param cVector The complex vector where the results will be stored
+  \param aVector One of the complex vectors to be multiplied
+  \param bVector The complex vector which will be converted to complex conjugate and multiplied
+  \param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector
+*/
+static inline void volk_8ic_x2_multiply_conjugate_16ic_a16_generic(lv_16sc_t* cVector, const lv_8sc_t* aVector, const lv_8sc_t* bVector, unsigned int num_points){
+  const int8_t* aBytes = (const int8_t*)aVector;
+  const int8_t* bBytes = (const int8_t*)bVector;
+  int16_t* out = (int16_t*)cVector;
+  unsigned int remaining = num_points;
+  while(remaining-- > 0){
+    // Promote both operands to float complex; b is conjugated by negating its second component
+    float aRe = (float)aBytes[0], aIm = (float)aBytes[1];
+    float bRe = (float)bBytes[0], bIm = (float)bBytes[1];
+    lv_32fc_t lhs = lv_32fc_init(aRe, aIm);
+    lv_32fc_t rhsConj = lv_32fc_init(bRe, -bIm);
+    lv_32fc_t product = lhs * rhsConj;
+    // Each component of the product is converted to a 16-bit integer with a plain cast
+    out[0] = (int16_t)lv_creal(product);
+    out[1] = (int16_t)lv_cimag(product);
+    aBytes += 2; bBytes += 2; out += 2;
+  }
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+
+
+#endif /* INCLUDED_volk_8ic_x2_multiply_conjugate_16ic_a16_H */
diff --git a/volk/include/volk/volk_8ic_x2_s32f_multiply_conjugate_32fc_a16.h b/volk/include/volk/volk_8ic_x2_s32f_multiply_conjugate_32fc_a16.h
new file mode 100644
index 000000000..6ec923a4f
--- /dev/null
+++ b/volk/include/volk/volk_8ic_x2_s32f_multiply_conjugate_32fc_a16.h
@@ -0,0 +1,122 @@
+#ifndef INCLUDED_volk_8ic_x2_s32f_multiply_conjugate_32fc_a16_H
+#define INCLUDED_volk_8ic_x2_s32f_multiply_conjugate_32fc_a16_H
+
+#include <inttypes.h>
+#include <stdio.h>
+#include <volk/volk_complex.h>
+
+#if LV_HAVE_SSE4_1
+#include <smmintrin.h>
+/*!
+  \brief Multiplies one complex vector by the complex conjugate of a second complex vector, divides each product by scalar, and stores the results in a third vector
+  \param cVector The complex vector where the results will be stored
+  \param aVector One of the complex vectors to be multiplied
+  \param bVector The complex vector which will be converted to complex conjugate and multiplied
+  \param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector
+*/
+static inline void volk_8ic_x2_s32f_multiply_conjugate_32fc_a16_sse4_1(lv_32fc_t* cVector, const lv_8sc_t* aVector, const lv_8sc_t* bVector, const float scalar, unsigned int num_points){
+  unsigned int number = 0;
+  const unsigned int quarterPoints = num_points / 4;
+
+  __m128i x, y, realz, imagz;
+  __m128 ret;
+  lv_32fc_t* c = cVector;
+  const lv_8sc_t* a = aVector;
+  const lv_8sc_t* b = bVector;
+  __m128i conjugateSign = _mm_set_epi16(-1, 1, -1, 1, -1, 1, -1, 1);
+  const int shuffleMask = _MM_SHUFFLE(2,3,0,1);
+  __m128 invScalar = _mm_set_ps1(1.0/scalar);
+  // Vector loop: 4 complex points per pass; results are divided by scalar via a multiply by 1/scalar
+  for(;number < quarterPoints; number++){
+    // Load 8 bytes (4 complex points) with an SSE2 load, avoiding __m64/MMX state, and sign-extend to 16 bits
+    x = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*)a));
+    y = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*)b));
+
+    // Real part of a * conj(b): ar*br + ai*bi (pairwise multiply-add)
+    realz = _mm_madd_epi16(x,y);
+
+    // Calculate the complex conjugate of the cr + ci j values
+    y = _mm_sign_epi16(y, conjugateSign);
+
+    // Shift the order of the cr and ci values
+    y = _mm_shufflehi_epi16(_mm_shufflelo_epi16(y, shuffleMask ), shuffleMask);
+
+    // Imaginary part of a * conj(b): ar*(-bi) + ai*br
+    imagz = _mm_madd_epi16(x,y);
+
+    // Interleave real and imaginary and then convert to float values
+    ret = _mm_cvtepi32_ps(_mm_unpacklo_epi32(realz, imagz));
+
+    // Normalize the floating point values
+    ret = _mm_mul_ps(ret, invScalar);
+
+    // Store the floating point values
+    _mm_store_ps((float*)c, ret);
+    c += 2;
+
+    // Interleave real and imaginary and then convert to float values
+    ret = _mm_cvtepi32_ps(_mm_unpackhi_epi32(realz, imagz));
+
+    // Normalize the floating point values
+    ret = _mm_mul_ps(ret, invScalar);
+
+    // Store the floating point values
+    _mm_store_ps((float*)c, ret);
+    c += 2;
+
+    a += 4;
+    b += 4;
+  }
+  // Scalar tail for the remaining num_points % 4 points; it divides by scalar instead of multiplying by 1/scalar
+  number = quarterPoints * 4;
+  float* cFloatPtr = (float*)&cVector[number];
+  int8_t* a8Ptr = (int8_t*)&aVector[number];
+  int8_t* b8Ptr = (int8_t*)&bVector[number];
+  for(; number < num_points; number++){
+    float aReal =  (float)*a8Ptr++;
+    float aImag =  (float)*a8Ptr++;
+    lv_32fc_t aVal = lv_32fc_init(aReal, aImag );
+    float bReal = (float)*b8Ptr++;
+    float bImag = (float)*b8Ptr++;
+    lv_32fc_t bVal = lv_32fc_init( bReal, -bImag );  // conjugate of b
+    lv_32fc_t temp = aVal * bVal;
+
+    *cFloatPtr++ = lv_creal(temp) / scalar;
+    *cFloatPtr++ = lv_cimag(temp) / scalar;
+  }
+}
+#endif /* LV_HAVE_SSE4_1 */
+
+#if LV_HAVE_GENERIC
+/*!
+  \brief Multiplies one complex vector by the complex conjugate of a second complex vector, divides each product by scalar, and stores the results in a third vector
+  \param cVector The complex vector where the results will be stored
+  \param aVector One of the complex vectors to be multiplied
+  \param bVector The complex vector which will be converted to complex conjugate and multiplied
+  \param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector
+*/
+static inline void volk_8ic_x2_s32f_multiply_conjugate_32fc_a16_generic(lv_32fc_t* cVector, const lv_8sc_t* aVector, const lv_8sc_t* bVector, const float scalar, unsigned int num_points){
+  const float reciprocal = 1.0 / scalar;  // divide once, multiply per component
+  const int8_t* aBytes = (const int8_t*)aVector;
+  const int8_t* bBytes = (const int8_t*)bVector;
+  float* out = (float*)cVector;
+  unsigned int remaining = num_points;
+  while(remaining-- > 0){
+    // Promote both operands to float complex; b is conjugated by negating its second component
+    float aRe = (float)aBytes[0], aIm = (float)aBytes[1];
+    float bRe = (float)bBytes[0], bIm = (float)bBytes[1];
+    lv_32fc_t lhs = lv_32fc_init(aRe, aIm);
+    lv_32fc_t rhsConj = lv_32fc_init(bRe, -bIm);
+    lv_32fc_t product = lhs * rhsConj;
+    // Scale each component of the product and write it out as two consecutive floats
+    out[0] = (lv_creal(product) * reciprocal);
+    out[1] = (lv_cimag(product) * reciprocal);
+    aBytes += 2; bBytes += 2; out += 2;
+  }
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+
+
+#endif /* INCLUDED_volk_8ic_x2_s32f_multiply_conjugate_32fc_a16_H */
diff --git a/volk/include/volk/volk_8s_convert_16s_a16.h b/volk/include/volk/volk_8s_convert_16s_a16.h
deleted file mode 100644
index 38efdb6a3..000000000
--- a/volk/include/volk/volk_8s_convert_16s_a16.h
+++ /dev/null
@@ -1,83 +0,0 @@
-#ifndef INCLUDED_volk_8s_convert_16s_a16_H
-#define INCLUDED_volk_8s_convert_16s_a16_H
-
-#include <inttypes.h>
-#include <stdio.h>
-
-#if LV_HAVE_SSE4_1
-#include <smmintrin.h>
-
-  /*!
-    \brief Converts the input 8 bit integer data into 16 bit integer data
-    \param inputVector The 8 bit input data buffer
-    \param outputVector The 16 bit output data buffer
-    \param num_points The number of data values to be converted
-  */
-static inline void volk_8s_convert_16s_a16_sse4_1(int16_t* outputVector, const int8_t* inputVector, unsigned int num_points){
-    unsigned int number = 0;
-    const unsigned int sixteenthPoints = num_points / 16;
-
-    const __m128i* inputVectorPtr = (const __m128i*)inputVector;
-    __m128i* outputVectorPtr = (__m128i*)outputVector;
-    __m128i inputVal;
-    __m128i ret;
-
-    for(;number < sixteenthPoints; number++){
-      inputVal = _mm_load_si128(inputVectorPtr);
-      ret = _mm_cvtepi8_epi16(inputVal);
-      ret = _mm_slli_epi16(ret, 8); // Multiply by 256
-      _mm_store_si128(outputVectorPtr, ret);
-
-      outputVectorPtr++;
-
-      inputVal = _mm_srli_si128(inputVal, 8);
-      ret = _mm_cvtepi8_epi16(inputVal);
-      ret = _mm_slli_epi16(ret, 8); // Multiply by 256
-      _mm_store_si128(outputVectorPtr, ret);
-
-      outputVectorPtr++;
-
-      inputVectorPtr++;
-    }
-
-    number = sixteenthPoints * 16;
-    for(; number < num_points; number++){
-      outputVector[number] = (int16_t)(inputVector[number])*256;
-    }
-}
-#endif /* LV_HAVE_SSE4_1 */
-
-#if LV_HAVE_GENERIC
-  /*!
-    \brief Converts the input 8 bit integer data into 16 bit integer data
-    \param inputVector The 8 bit input data buffer
-    \param outputVector The 16 bit output data buffer
-    \param num_points The number of data values to be converted
-  */
-static inline void volk_8s_convert_16s_a16_generic(int16_t* outputVector, const int8_t* inputVector, unsigned int num_points){
-  int16_t* outputVectorPtr = outputVector;
-  const int8_t* inputVectorPtr = inputVector;
-  unsigned int number = 0;
-
-  for(number = 0; number < num_points; number++){
-    *outputVectorPtr++ = ((int16_t)(*inputVectorPtr++)) * 256;
-  }
-}
-#endif /* LV_HAVE_GENERIC */
-
-#if LV_HAVE_ORC
-  /*!
-    \brief Converts the input 8 bit integer data into 16 bit integer data
-    \param inputVector The 8 bit input data buffer
-    \param outputVector The 16 bit output data buffer
-    \param num_points The number of data values to be converted
-  */
-extern void volk_8s_convert_16s_a16_orc_impl(int16_t* outputVector, const int8_t* inputVector, unsigned int num_points);
-static inline void volk_8s_convert_16s_a16_orc(int16_t* outputVector, const int8_t* inputVector, unsigned int num_points){
-    volk_8s_convert_16s_a16_orc_impl(outputVector, inputVector, num_points);
-}
-#endif /* LV_HAVE_ORC */
-
-
-
-#endif /* INCLUDED_VOLK_8s_CONVERT_16s_ALIGNED8_H */
diff --git a/volk/include/volk/volk_8s_convert_16s_ua16.h b/volk/include/volk/volk_8s_convert_16s_ua16.h
deleted file mode 100644
index a726bfb5e..000000000
--- a/volk/include/volk/volk_8s_convert_16s_ua16.h
+++ /dev/null
@@ -1,73 +0,0 @@
-#ifndef INCLUDED_volk_8s_convert_16s_ua16_H
-#define INCLUDED_volk_8s_convert_16s_ua16_H
-
-#include <inttypes.h>
-#include <stdio.h>
-
-#if LV_HAVE_SSE4_1
-#include <smmintrin.h>
-
-  /*!
-    \brief Converts the input 8 bit integer data into 16 bit integer data
-    \param inputVector The 8 bit input data buffer
-    \param outputVector The 16 bit output data buffer
-    \param num_points The number of data values to be converted
-    \note Input and output buffers do NOT need to be properly aligned
-  */
-static inline void volk_8s_convert_16s_ua16_sse4_1(int16_t* outputVector, const int8_t* inputVector, unsigned int num_points){
-    unsigned int number = 0;
-    const unsigned int sixteenthPoints = num_points / 16;
-
-    const __m128i* inputVectorPtr = (const __m128i*)inputVector;
-    __m128i* outputVectorPtr = (__m128i*)outputVector;
-    __m128i inputVal;
-    __m128i ret;
-
-    for(;number < sixteenthPoints; number++){
-      inputVal = _mm_loadu_si128(inputVectorPtr);
-      ret = _mm_cvtepi8_epi16(inputVal);
-      ret = _mm_slli_epi16(ret, 8); // Multiply by 256
-      _mm_storeu_si128(outputVectorPtr, ret);
-
-      outputVectorPtr++;
-
-      inputVal = _mm_srli_si128(inputVal, 8);
-      ret = _mm_cvtepi8_epi16(inputVal);
-      ret = _mm_slli_epi16(ret, 8); // Multiply by 256
-      _mm_storeu_si128(outputVectorPtr, ret);
-
-      outputVectorPtr++;
-
-      inputVectorPtr++;
-    }
-
-    number = sixteenthPoints * 16;
-    for(; number < num_points; number++){
-      outputVector[number] = (int16_t)(inputVector[number])*256;
-    }
-}
-#endif /* LV_HAVE_SSE4_1 */
-
-#if LV_HAVE_GENERIC
-  /*!
-    \brief Converts the input 8 bit integer data into 16 bit integer data
-    \param inputVector The 8 bit input data buffer
-    \param outputVector The 16 bit output data buffer
-    \param num_points The number of data values to be converted
-    \note Input and output buffers do NOT need to be properly aligned
-  */
-static inline void volk_8s_convert_16s_ua16_generic(int16_t* outputVector, const int8_t* inputVector, unsigned int num_points){
-  int16_t* outputVectorPtr = outputVector;
-  const int8_t* inputVectorPtr = inputVector;
-  unsigned int number = 0;
-
-  for(number = 0; number < num_points; number++){
-    *outputVectorPtr++ = ((int16_t)(*inputVectorPtr++)) * 256;
-  }
-}
-#endif /* LV_HAVE_GENERIC */
-
-
-
-
-#endif /* INCLUDED_VOLK_8s_CONVERT_16s_UNALIGNED8_H */
diff --git a/volk/include/volk/volk_8s_s32f_convert_32f_a16.h b/volk/include/volk/volk_8s_s32f_convert_32f_a16.h
deleted file mode 100644
index 45185ac2e..000000000
--- a/volk/include/volk/volk_8s_s32f_convert_32f_a16.h
+++ /dev/null
@@ -1,105 +0,0 @@
-#ifndef INCLUDED_volk_8s_s32f_convert_32f_a16_H
-#define INCLUDED_volk_8s_s32f_convert_32f_a16_H
-
-#include <inttypes.h>
-#include <stdio.h>
-
-#if LV_HAVE_SSE4_1
-#include <smmintrin.h>
-
-  /*!
-    \brief Converts the input 8 bit integer data into floating point data, and divides the each floating point output data point by the scalar value
-    \param inputVector The 8 bit input data buffer
-    \param outputVector The floating point output data buffer
-    \param scalar The value divided against each point in the output buffer
-    \param num_points The number of data values to be converted
-  */
-static inline void volk_8s_s32f_convert_32f_a16_sse4_1(float* outputVector, const int8_t* inputVector, const float scalar, unsigned int num_points){
-    unsigned int number = 0;
-    const unsigned int sixteenthPoints = num_points / 16;
-    
-    float* outputVectorPtr = outputVector;
-    const float iScalar = 1.0 / scalar;
-    __m128 invScalar = _mm_set_ps1(iScalar);
-    const int8_t* inputVectorPtr = inputVector;
-    __m128 ret;
-    __m128i inputVal;
-    __m128i interimVal;
-
-    for(;number < sixteenthPoints; number++){
-      inputVal = _mm_load_si128((__m128i*)inputVectorPtr);
-
-      interimVal = _mm_cvtepi8_epi32(inputVal);
-      ret = _mm_cvtepi32_ps(interimVal);
-      ret = _mm_mul_ps(ret, invScalar);
-      _mm_store_ps(outputVectorPtr, ret);
-      outputVectorPtr += 4;
-
-      inputVal = _mm_srli_si128(inputVal, 4);
-      interimVal = _mm_cvtepi8_epi32(inputVal);
-      ret = _mm_cvtepi32_ps(interimVal);
-      ret = _mm_mul_ps(ret, invScalar);
-      _mm_store_ps(outputVectorPtr, ret);
-      outputVectorPtr += 4;
-
-      inputVal = _mm_srli_si128(inputVal, 4);
-      interimVal = _mm_cvtepi8_epi32(inputVal);
-      ret = _mm_cvtepi32_ps(interimVal);
-      ret = _mm_mul_ps(ret, invScalar);
-      _mm_store_ps(outputVectorPtr, ret);
-      outputVectorPtr += 4;
-
-      inputVal = _mm_srli_si128(inputVal, 4);
-      interimVal = _mm_cvtepi8_epi32(inputVal);
-      ret = _mm_cvtepi32_ps(interimVal);
-      ret = _mm_mul_ps(ret, invScalar);
-      _mm_store_ps(outputVectorPtr, ret);
-      outputVectorPtr += 4;
-
-      inputVectorPtr += 16;
-    }
-
-    number = sixteenthPoints * 16;
-    for(; number < num_points; number++){
-      outputVector[number] = (float)(inputVector[number]) * iScalar;
-    }
-}
-#endif /* LV_HAVE_SSE4_1 */
-
-#if LV_HAVE_GENERIC
-  /*!
-    \brief Converts the input 8 bit integer data into floating point data, and divides the each floating point output data point by the scalar value
-    \param inputVector The 8 bit input data buffer
-    \param outputVector The floating point output data buffer
-    \param scalar The value divided against each point in the output buffer
-    \param num_points The number of data values to be converted
-  */
-static inline void volk_8s_s32f_convert_32f_a16_generic(float* outputVector, const int8_t* inputVector, const float scalar, unsigned int num_points){
-  float* outputVectorPtr = outputVector;
-  const int8_t* inputVectorPtr = inputVector;
-  unsigned int number = 0;
-  const float iScalar = 1.0 / scalar;
-
-  for(number = 0; number < num_points; number++){
-    *outputVectorPtr++ = ((float)(*inputVectorPtr++)) * iScalar;
-  }
-}
-#endif /* LV_HAVE_GENERIC */
-
-#if LV_HAVE_ORC
-  /*!
-    \brief Converts the input 8 bit integer data into floating point data, and divides the each floating point output data point by the scalar value
-    \param inputVector The 8 bit input data buffer
-    \param outputVector The floating point output data buffer
-    \param scalar The value divided against each point in the output buffer
-    \param num_points The number of data values to be converted
-  */
-extern void volk_8s_s32f_convert_32f_a16_orc_impl(float* outputVector, const int8_t* inputVector, const float scalar, unsigned int num_points);
-static inline void volk_8s_s32f_convert_32f_a16_orc(float* outputVector, const int8_t* inputVector, const float scalar, unsigned int num_points){
-    volk_8s_s32f_convert_32f_a16_orc_impl(outputVector, inputVector, scalar, num_points);
-}
-#endif /* LV_HAVE_ORC */
-
-
-
-#endif /* INCLUDED_VOLK_8s_CONVERT_32f_ALIGNED8_H */
diff --git a/volk/include/volk/volk_8s_s32f_convert_32f_ua16.h b/volk/include/volk/volk_8s_s32f_convert_32f_ua16.h
deleted file mode 100644
index 310824580..000000000
--- a/volk/include/volk/volk_8s_s32f_convert_32f_ua16.h
+++ /dev/null
@@ -1,94 +0,0 @@
-#ifndef INCLUDED_volk_8s_s32f_convert_32f_ua16_H
-#define INCLUDED_volk_8s_s32f_convert_32f_ua16_H
-
-#include <inttypes.h>
-#include <stdio.h>
-
-#if LV_HAVE_SSE4_1
-#include <smmintrin.h>
-
-  /*!
-    \brief Converts the input 8 bit integer data into floating point data, and divides the each floating point output data point by the scalar value
-    \param inputVector The 8 bit input data buffer
-    \param outputVector The floating point output data buffer
-    \param scalar The value divided against each point in the output buffer
-    \param num_points The number of data values to be converted
-    \note Output buffer does NOT need to be properly aligned
-  */
-static inline void volk_8s_s32f_convert_32f_ua16_sse4_1(float* outputVector, const int8_t* inputVector, const float scalar, unsigned int num_points){
-    unsigned int number = 0;
-    const unsigned int sixteenthPoints = num_points / 16;
-    
-    float* outputVectorPtr = outputVector;
-    const float iScalar = 1.0 / scalar;
-    __m128 invScalar = _mm_set_ps1( iScalar );
-    const int8_t* inputVectorPtr = inputVector;
-    __m128 ret;
-    __m128i inputVal;
-    __m128i interimVal;
-
-    for(;number < sixteenthPoints; number++){
-      inputVal = _mm_loadu_si128((__m128i*)inputVectorPtr);
-
-      interimVal = _mm_cvtepi8_epi32(inputVal);
-      ret = _mm_cvtepi32_ps(interimVal);
-      ret = _mm_mul_ps(ret, invScalar);
-      _mm_storeu_ps(outputVectorPtr, ret);
-      outputVectorPtr += 4;
-
-      inputVal = _mm_srli_si128(inputVal, 4);
-      interimVal = _mm_cvtepi8_epi32(inputVal);
-      ret = _mm_cvtepi32_ps(interimVal);
-      ret = _mm_mul_ps(ret, invScalar);
-      _mm_storeu_ps(outputVectorPtr, ret);
-      outputVectorPtr += 4;
-
-      inputVal = _mm_srli_si128(inputVal, 4);
-      interimVal = _mm_cvtepi8_epi32(inputVal);
-      ret = _mm_cvtepi32_ps(interimVal);
-      ret = _mm_mul_ps(ret, invScalar);
-      _mm_storeu_ps(outputVectorPtr, ret);
-      outputVectorPtr += 4;
-
-      inputVal = _mm_srli_si128(inputVal, 4);
-      interimVal = _mm_cvtepi8_epi32(inputVal);
-      ret = _mm_cvtepi32_ps(interimVal);
-      ret = _mm_mul_ps(ret, invScalar);
-      _mm_storeu_ps(outputVectorPtr, ret);
-      outputVectorPtr += 4;
-
-      inputVectorPtr += 16;
-    }
-
-    number = sixteenthPoints * 16;
-    for(; number < num_points; number++){
-      outputVector[number] = (float)(inputVector[number]) * iScalar;
-    }
-}
-#endif /* LV_HAVE_SSE4_1 */
-
-#if LV_HAVE_GENERIC
-  /*!
-    \brief Converts the input 8 bit integer data into floating point data, and divides the each floating point output data point by the scalar value
-    \param inputVector The 8 bit input data buffer
-    \param outputVector The floating point output data buffer
-    \param scalar The value divided against each point in the output buffer
-    \param num_points The number of data values to be converted
-    \note Output buffer does NOT need to be properly aligned
-  */
-static inline void volk_8s_s32f_convert_32f_ua16_generic(float* outputVector, const int8_t* inputVector, const float scalar, unsigned int num_points){
-  float* outputVectorPtr = outputVector;
-  const int8_t* inputVectorPtr = inputVector;
-  unsigned int number = 0;
-  const float iScalar = 1.0 / scalar;
-
-  for(number = 0; number < num_points; number++){
-    *outputVectorPtr++ = ((float)(*inputVectorPtr++)) * iScalar;
-  }
-}
-#endif /* LV_HAVE_GENERIC */
-
-
-
-
-#endif /* INCLUDED_VOLK_8s_CONVERT_32f_UNALIGNED8_H */
diff --git a/volk/include/volk/volk_8sc_8sc_multiply_conjugate_16sc_a16.h b/volk/include/volk/volk_8sc_8sc_multiply_conjugate_16sc_a16.h
deleted file mode 100644
index eae1185ec..000000000
--- a/volk/include/volk/volk_8sc_8sc_multiply_conjugate_16sc_a16.h
+++ /dev/null
@@ -1,102 +0,0 @@
-#ifndef INCLUDED_volk_8sc_8sc_multiply_conjugate_16sc_a16_H
-#define INCLUDED_volk_8sc_8sc_multiply_conjugate_16sc_a16_H
-
-#include <inttypes.h>
-#include <stdio.h>
-#include <volk/volk_complex.h>
-
-#if LV_HAVE_SSE4_1
-#include <smmintrin.h>
-/*!
-  \brief Multiplys the one complex vector with the complex conjugate of the second complex vector and stores their results in the third vector
-  \param cVector The complex vector where the results will be stored
-  \param aVector One of the complex vectors to be multiplied
-  \param bVector The complex vector which will be converted to complex conjugate and multiplied
-  \param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector
-*/
-static inline void volk_8sc_8sc_multiply_conjugate_16sc_a16_sse4_1(lv_16sc_t* cVector, const lv_8sc_t* aVector, const lv_8sc_t* bVector, unsigned int num_points){
-  unsigned int number = 0;
-  const unsigned int quarterPoints = num_points / 4;
-
-  __m128i x, y, realz, imagz;
-  lv_16sc_t* c = cVector;
-  const lv_8sc_t* a = aVector;
-  const lv_8sc_t* b = bVector;
-  __m128i conjugateSign = _mm_set_epi16(-1, 1, -1, 1, -1, 1, -1, 1);
-  const int shuffleMask = _MM_SHUFFLE(2,3,0,1);
-    
-  for(;number < quarterPoints; number++){
-    // Convert into 8 bit values into 16 bit values
-    x = _mm_cvtepi8_epi16(_mm_movpi64_epi64(*(__m64*)a));
-    y = _mm_cvtepi8_epi16(_mm_movpi64_epi64(*(__m64*)b));
-      
-    // Calculate the ar*cr - ai*(-ci) portions
-    realz = _mm_madd_epi16(x,y);
-      
-    // Calculate the complex conjugate of the cr + ci j values
-    y = _mm_sign_epi16(y, conjugateSign);
-
-    // Shift the order of the cr and ci values
-    y = _mm_shufflehi_epi16(_mm_shufflelo_epi16(y, shuffleMask ), shuffleMask);
-
-    // Calculate the ar*(-ci) + cr*(ai)
-    imagz = _mm_madd_epi16(x,y);
-
-    _mm_store_si128((__m128i*)c, _mm_packs_epi32(_mm_unpacklo_epi32(realz, imagz), _mm_unpackhi_epi32(realz, imagz)));
-
-    a += 4;
-    b += 4;
-    c += 4;
-  }
-    
-  number = quarterPoints * 4;
-  int16_t* c16Ptr = (int16_t*)&cVector[number];
-  int8_t* a8Ptr = (int8_t*)&aVector[number];
-  int8_t* b8Ptr = (int8_t*)&bVector[number];
-  for(; number < num_points; number++){
-    float aReal =  (float)*a8Ptr++;
-    float aImag =  (float)*a8Ptr++;
-    lv_32fc_t aVal = lv_32fc_init(aReal, aImag );
-    float bReal = (float)*b8Ptr++;
-    float bImag = (float)*b8Ptr++;
-    lv_32fc_t bVal = lv_32fc_init( bReal, -bImag );
-    lv_32fc_t temp = aVal * bVal;
-
-    *c16Ptr++ = (int16_t)lv_creal(temp);
-    *c16Ptr++ = (int16_t)lv_cimag(temp);
-  }
-}
-#endif /* LV_HAVE_SSE4_1 */
-
-#if LV_HAVE_GENERIC
-/*!
-  \brief Multiplys the one complex vector with the complex conjugate of the second complex vector and stores their results in the third vector
-  \param cVector The complex vector where the results will be stored
-  \param aVector One of the complex vectors to be multiplied
-  \param bVector The complex vector which will be converted to complex conjugate and multiplied
-  \param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector
-*/
-static inline void volk_8sc_8sc_multiply_conjugate_16sc_a16_generic(lv_16sc_t* cVector, const lv_8sc_t* aVector, const lv_8sc_t* bVector, unsigned int num_points){
-  unsigned int number = 0;
-  int16_t* c16Ptr = (int16_t*)cVector;
-  int8_t* a8Ptr = (int8_t*)aVector;
-  int8_t* b8Ptr = (int8_t*)bVector;
-  for(number =0; number < num_points; number++){
-    float aReal =  (float)*a8Ptr++;
-    float aImag =  (float)*a8Ptr++;
-    lv_32fc_t aVal = lv_32fc_init(aReal, aImag );
-    float bReal = (float)*b8Ptr++;
-    float bImag = (float)*b8Ptr++;
-    lv_32fc_t bVal = lv_32fc_init( bReal, -bImag );
-    lv_32fc_t temp = aVal * bVal;
-
-    *c16Ptr++ = (int16_t)lv_creal(temp);
-    *c16Ptr++ = (int16_t)lv_cimag(temp);
-  }
-}
-#endif /* LV_HAVE_GENERIC */
-
-
-
-
-#endif /* INCLUDED_volk_8sc_8sc_multiply_conjugate_16sc_a16_H */
diff --git a/volk/include/volk/volk_8sc_8sc_s32f_multiply_conjugate_32fc_a16.h b/volk/include/volk/volk_8sc_8sc_s32f_multiply_conjugate_32fc_a16.h
deleted file mode 100644
index 621276b08..000000000
--- a/volk/include/volk/volk_8sc_8sc_s32f_multiply_conjugate_32fc_a16.h
+++ /dev/null
@@ -1,122 +0,0 @@
-#ifndef INCLUDED_volk_8sc_8sc_s32f_multiply_conjugate_32fc_a16_H
-#define INCLUDED_volk_8sc_8sc_s32f_multiply_conjugate_32fc_a16_H
-
-#include <inttypes.h>
-#include <stdio.h>
-#include <volk/volk_complex.h>
-
-#if LV_HAVE_SSE4_1
-#include <smmintrin.h>
-/*!
-  \brief Multiplys the one complex vector with the complex conjugate of the second complex vector and stores their results in the third vector
-  \param cVector The complex vector where the results will be stored
-  \param aVector One of the complex vectors to be multiplied
-  \param bVector The complex vector which will be converted to complex conjugate and multiplied
-  \param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector
-*/
-static inline void volk_8sc_8sc_s32f_multiply_conjugate_32fc_a16_sse4_1(lv_32fc_t* cVector, const lv_8sc_t* aVector, const lv_8sc_t* bVector, const float scalar, unsigned int num_points){
-  unsigned int number = 0;
-  const unsigned int quarterPoints = num_points / 4;
-
-  __m128i x, y, realz, imagz;
-  __m128 ret;
-  lv_32fc_t* c = cVector;
-  const lv_8sc_t* a = aVector;
-  const lv_8sc_t* b = bVector;
-  __m128i conjugateSign = _mm_set_epi16(-1, 1, -1, 1, -1, 1, -1, 1);
-  const int shuffleMask = _MM_SHUFFLE(2,3,0,1);
-  __m128 invScalar = _mm_set_ps1(1.0/scalar);
-
-  for(;number < quarterPoints; number++){
-    // Convert into 8 bit values into 16 bit values
-    x = _mm_cvtepi8_epi16(_mm_movpi64_epi64(*(__m64*)a));
-    y = _mm_cvtepi8_epi16(_mm_movpi64_epi64(*(__m64*)b));
-
-    // Calculate the ar*cr - ai*(-ci) portions
-    realz = _mm_madd_epi16(x,y);
-
-    // Calculate the complex conjugate of the cr + ci j values
-    y = _mm_sign_epi16(y, conjugateSign);
-
-    // Shift the order of the cr and ci values
-    y = _mm_shufflehi_epi16(_mm_shufflelo_epi16(y, shuffleMask ), shuffleMask);
-
-    // Calculate the ar*(-ci) + cr*(ai)
-    imagz = _mm_madd_epi16(x,y);
-
-    // Interleave real and imaginary and then convert to float values
-    ret = _mm_cvtepi32_ps(_mm_unpacklo_epi32(realz, imagz));
-
-    // Normalize the floating point values
-    ret = _mm_mul_ps(ret, invScalar);
-
-    // Store the floating point values
-    _mm_store_ps((float*)c, ret);
-    c += 2;
-
-    // Interleave real and imaginary and then convert to float values
-    ret = _mm_cvtepi32_ps(_mm_unpackhi_epi32(realz, imagz));
-
-    // Normalize the floating point values
-    ret = _mm_mul_ps(ret, invScalar);
-
-    // Store the floating point values
-    _mm_store_ps((float*)c, ret);
-    c += 2;
-
-    a += 4;
-    b += 4;
-  }
-
-  number = quarterPoints * 4;
-  float* cFloatPtr = (float*)&cVector[number];
-  int8_t* a8Ptr = (int8_t*)&aVector[number];
-  int8_t* b8Ptr = (int8_t*)&bVector[number];
-  for(; number < num_points; number++){
-    float aReal =  (float)*a8Ptr++;
-    float aImag =  (float)*a8Ptr++;
-    lv_32fc_t aVal = lv_32fc_init(aReal, aImag );
-    float bReal = (float)*b8Ptr++;
-    float bImag = (float)*b8Ptr++;
-    lv_32fc_t bVal = lv_32fc_init( bReal, -bImag );
-    lv_32fc_t temp = aVal * bVal;
-    
-    *cFloatPtr++ = lv_creal(temp) / scalar;
-    *cFloatPtr++ = lv_cimag(temp) / scalar;
-  }
-}
-#endif /* LV_HAVE_SSE4_1 */
-
-#if LV_HAVE_GENERIC
-/*!
-  \brief Multiplys the one complex vector with the complex conjugate of the second complex vector and stores their results in the third vector
-  \param cVector The complex vector where the results will be stored
-  \param aVector One of the complex vectors to be multiplied
-  \param bVector The complex vector which will be converted to complex conjugate and multiplied
-  \param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector
-*/
-static inline void volk_8sc_8sc_s32f_multiply_conjugate_32fc_a16_generic(lv_32fc_t* cVector, const lv_8sc_t* aVector, const lv_8sc_t* bVector, const float scalar, unsigned int num_points){
-  unsigned int number = 0;
-  float* cPtr = (float*)cVector;
-  const float invScalar = 1.0 / scalar;
-  int8_t* a8Ptr = (int8_t*)aVector;
-  int8_t* b8Ptr = (int8_t*)bVector;
-  for(number = 0; number < num_points; number++){
-    float aReal =  (float)*a8Ptr++;
-    float aImag =  (float)*a8Ptr++;
-    lv_32fc_t aVal = lv_32fc_init(aReal, aImag );
-    float bReal = (float)*b8Ptr++;
-    float bImag = (float)*b8Ptr++;
-    lv_32fc_t bVal = lv_32fc_init( bReal, -bImag );
-    lv_32fc_t temp = aVal * bVal;
-    
-    *cPtr++ = (lv_creal(temp) * invScalar);
-    *cPtr++ = (lv_cimag(temp) * invScalar);
-  }
-}
-#endif /* LV_HAVE_GENERIC */
-
-
-
-
-#endif /* INCLUDED_volk_8sc_8sc_s32f_multiply_conjugate_32fc_a16_H */
diff --git a/volk/include/volk/volk_8sc_deinterleave_16s_16s_a16.h b/volk/include/volk/volk_8sc_deinterleave_16s_16s_a16.h
deleted file mode 100644
index 6a35e969d..000000000
--- a/volk/include/volk/volk_8sc_deinterleave_16s_16s_a16.h
+++ /dev/null
@@ -1,77 +0,0 @@
-#ifndef INCLUDED_volk_8sc_deinterleave_16s_16s_a16_H
-#define INCLUDED_volk_8sc_deinterleave_16s_16s_a16_H
-
-#include <inttypes.h>
-#include <stdio.h>
-
-#if LV_HAVE_SSE4_1
-#include <smmintrin.h>
-/*!
-  \brief Deinterleaves the complex 8 bit vector into I & Q 16 bit vector data
-  \param complexVector The complex input vector
-  \param iBuffer The I buffer output data
-  \param qBuffer The Q buffer output data
-  \param num_points The number of complex data values to be deinterleaved
-*/
-static inline void volk_8sc_deinterleave_16s_16s_a16_sse4_1(int16_t* iBuffer, int16_t* qBuffer, const lv_8sc_t* complexVector, unsigned int num_points){
-  unsigned int number = 0;
-  const int8_t* complexVectorPtr = (int8_t*)complexVector;
-  int16_t* iBufferPtr = iBuffer;
-  int16_t* qBufferPtr = qBuffer;
-  __m128i iMoveMask = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 14, 12, 10, 8, 6, 4, 2, 0);
-  __m128i qMoveMask = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 15, 13, 11, 9, 7, 5, 3, 1);
-  __m128i complexVal, iOutputVal, qOutputVal;
-
-  unsigned int eighthPoints = num_points / 8;
-
-  for(number = 0; number < eighthPoints; number++){
-    complexVal = _mm_load_si128((__m128i*)complexVectorPtr);  complexVectorPtr += 16;
-
-    iOutputVal = _mm_shuffle_epi8(complexVal, iMoveMask);
-    qOutputVal = _mm_shuffle_epi8(complexVal, qMoveMask);
-
-    iOutputVal = _mm_cvtepi8_epi16(iOutputVal);
-    iOutputVal = _mm_slli_epi16(iOutputVal, 8);
-
-    qOutputVal = _mm_cvtepi8_epi16(qOutputVal);
-    qOutputVal = _mm_slli_epi16(qOutputVal, 8);
-
-    _mm_store_si128((__m128i*)iBufferPtr, iOutputVal);
-    _mm_store_si128((__m128i*)qBufferPtr, qOutputVal);
-
-    iBufferPtr += 8;
-    qBufferPtr += 8;
-  }
-
-  number = eighthPoints * 8;
-  for(; number < num_points; number++){
-    *iBufferPtr++ = ((int16_t)*complexVectorPtr++) * 256;
-    *qBufferPtr++ = ((int16_t)*complexVectorPtr++) * 256;
-  }
-}
-#endif /* LV_HAVE_SSE4_1 */
-
-#if LV_HAVE_GENERIC
-/*!
-  \brief Deinterleaves the complex 8 bit vector into I & Q 16 bit vector data
-  \param complexVector The complex input vector
-  \param iBuffer The I buffer output data
-  \param qBuffer The Q buffer output data
-  \param num_points The number of complex data values to be deinterleaved
-*/
-static inline void volk_8sc_deinterleave_16s_16s_a16_generic(int16_t* iBuffer, int16_t* qBuffer, const lv_8sc_t* complexVector, unsigned int num_points){
-  const int8_t* complexVectorPtr = (const int8_t*)complexVector;
-  int16_t* iBufferPtr = iBuffer;
-  int16_t* qBufferPtr = qBuffer;
-  unsigned int number;
-  for(number = 0; number < num_points; number++){
-    *iBufferPtr++ = (int16_t)(*complexVectorPtr++)*256;
-    *qBufferPtr++ = (int16_t)(*complexVectorPtr++)*256;
-  }
-}
-#endif /* LV_HAVE_GENERIC */
-
-
-
-
-#endif /* INCLUDED_volk_8sc_deinterleave_16s_16s_a16_H */
diff --git a/volk/include/volk/volk_8sc_deinterleave_real_16s_a16.h b/volk/include/volk/volk_8sc_deinterleave_real_16s_a16.h
deleted file mode 100644
index 67ffebd99..000000000
--- a/volk/include/volk/volk_8sc_deinterleave_real_16s_a16.h
+++ /dev/null
@@ -1,66 +0,0 @@
-#ifndef INCLUDED_volk_8sc_deinterleave_real_16s_a16_H
-#define INCLUDED_volk_8sc_deinterleave_real_16s_a16_H
-
-#include <inttypes.h>
-#include <stdio.h>
-
-#if LV_HAVE_SSE4_1
-#include <smmintrin.h>
-/*!
-  \brief Deinterleaves the complex 8 bit vector into I 16 bit vector data
-  \param complexVector The complex input vector
-  \param iBuffer The I buffer output data
-  \param num_points The number of complex data values to be deinterleaved
-*/
-static inline void volk_8sc_deinterleave_real_16s_a16_sse4_1(int16_t* iBuffer, const lv_8sc_t* complexVector, unsigned int num_points){
-  unsigned int number = 0;
-  const int8_t* complexVectorPtr = (int8_t*)complexVector;
-  int16_t* iBufferPtr = iBuffer;
-  __m128i moveMask = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 14, 12, 10, 8, 6, 4, 2, 0);
-  __m128i complexVal, outputVal;
-
-  unsigned int eighthPoints = num_points / 8;
-
-  for(number = 0; number < eighthPoints; number++){
-    complexVal = _mm_load_si128((__m128i*)complexVectorPtr);  complexVectorPtr += 16;
-
-    complexVal = _mm_shuffle_epi8(complexVal, moveMask);
-
-    outputVal = _mm_cvtepi8_epi16(complexVal);
-    outputVal = _mm_slli_epi16(outputVal, 7);
-
-    _mm_store_si128((__m128i*)iBufferPtr, outputVal);
-    iBufferPtr += 8;
-  }
-
-  number = eighthPoints * 8;
-  for(; number < num_points; number++){
-    *iBufferPtr++ = ((int16_t)*complexVectorPtr++) * 128;
-    complexVectorPtr++;
-  }
-}
-#endif /* LV_HAVE_SSE4_1 */
-
-
-#if LV_HAVE_GENERIC
-/*!
-  \brief Deinterleaves the complex 8 bit vector into I 16 bit vector data
-  \param complexVector The complex input vector
-  \param iBuffer The I buffer output data
-  \param num_points The number of complex data values to be deinterleaved
-*/
-static inline void volk_8sc_deinterleave_real_16s_a16_generic(int16_t* iBuffer, const lv_8sc_t* complexVector, unsigned int num_points){
-  unsigned int number = 0;
-  const int8_t* complexVectorPtr = (const int8_t*)complexVector;
-  int16_t* iBufferPtr = iBuffer;
-  for(number = 0; number < num_points; number++){
-    *iBufferPtr++ = ((int16_t)(*complexVectorPtr++)) * 128;
-    complexVectorPtr++;
-  }
-}
-#endif /* LV_HAVE_GENERIC */
-
-
-
-
-#endif /* INCLUDED_volk_8sc_deinterleave_real_16s_a16_H */
diff --git a/volk/include/volk/volk_8sc_deinterleave_real_8s_a16.h b/volk/include/volk/volk_8sc_deinterleave_real_8s_a16.h
deleted file mode 100644
index ecffc092e..000000000
--- a/volk/include/volk/volk_8sc_deinterleave_real_8s_a16.h
+++ /dev/null
@@ -1,67 +0,0 @@
-#ifndef INCLUDED_VOLK_8sc_DEINTERLEAVE_REAL_8s_ALIGNED8_H
-#define INCLUDED_VOLK_8sc_DEINTERLEAVE_REAL_8s_ALIGNED8_H
-
-#include <inttypes.h>
-#include <stdio.h>
-
-#if LV_HAVE_SSSE3
-#include <tmmintrin.h>
-/*!
-  \brief Deinterleaves the complex 8 bit vector into I vector data
-  \param complexVector The complex input vector
-  \param iBuffer The I buffer output data
-  \param num_points The number of complex data values to be deinterleaved
-*/
-static inline void volk_8sc_deinterleave_real_8s_a16_ssse3(int8_t* iBuffer, const lv_8sc_t* complexVector, unsigned int num_points){
-  unsigned int number = 0;
-  const int8_t* complexVectorPtr = (int8_t*)complexVector;
-  int8_t* iBufferPtr = iBuffer;
-  __m128i moveMask1 = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 14, 12, 10, 8, 6, 4, 2, 0);
-  __m128i moveMask2 = _mm_set_epi8(14, 12, 10, 8, 6, 4, 2, 0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80);
-  __m128i complexVal1, complexVal2, outputVal;
-
-  unsigned int sixteenthPoints = num_points / 16;
-
-  for(number = 0; number < sixteenthPoints; number++){
-    complexVal1 = _mm_load_si128((__m128i*)complexVectorPtr);  complexVectorPtr += 16;
-    complexVal2 = _mm_load_si128((__m128i*)complexVectorPtr);  complexVectorPtr += 16;
-
-    complexVal1 = _mm_shuffle_epi8(complexVal1, moveMask1);
-    complexVal2 = _mm_shuffle_epi8(complexVal2, moveMask2);
-
-    outputVal = _mm_or_si128(complexVal1, complexVal2);
-
-    _mm_store_si128((__m128i*)iBufferPtr, outputVal);
-    iBufferPtr += 16;
-  }
-
-  number = sixteenthPoints * 16;
-  for(; number < num_points; number++){
-    *iBufferPtr++ = *complexVectorPtr++;
-    complexVectorPtr++;
-  }
-}
-#endif /* LV_HAVE_SSSE3 */
-
-#if LV_HAVE_GENERIC
-/*!
-  \brief Deinterleaves the complex 8 bit vector into I vector data
-  \param complexVector The complex input vector
-  \param iBuffer The I buffer output data
-  \param num_points The number of complex data values to be deinterleaved
-*/
-static inline void volk_8sc_deinterleave_real_8s_a16_generic(int8_t* iBuffer, const lv_8sc_t* complexVector, unsigned int num_points){
-  unsigned int number = 0;
-  const int8_t* complexVectorPtr = (int8_t*)complexVector;
-  int8_t* iBufferPtr = iBuffer;
-  for(number = 0; number < num_points; number++){
-    *iBufferPtr++ = *complexVectorPtr++;
-    complexVectorPtr++;
-  }
-}
-#endif /* LV_HAVE_GENERIC */
-
-
-
-
-#endif /* INCLUDED_VOLK_8sc_DEINTERLEAVE_REAL_8s_ALIGNED8_H */
diff --git a/volk/include/volk/volk_8sc_s32f_deinterleave_32f_32f_a16.h b/volk/include/volk/volk_8sc_s32f_deinterleave_32f_32f_a16.h
deleted file mode 100644
index cedbf202c..000000000
--- a/volk/include/volk/volk_8sc_s32f_deinterleave_32f_32f_a16.h
+++ /dev/null
@@ -1,164 +0,0 @@
-#ifndef INCLUDED_volk_8sc_s32f_deinterleave_32f_32f_a16_H
-#define INCLUDED_volk_8sc_s32f_deinterleave_32f_32f_a16_H
-
-#include <inttypes.h>
-#include <stdio.h>
-
-#if LV_HAVE_SSE4_1
-#include <smmintrin.h>
-/*!
-  \brief Deinterleaves the complex 8 bit vector into I & Q floating point vector data
-  \param complexVector The complex input vector
-  \param iBuffer The I buffer output data
-  \param qBuffer The Q buffer output data
-  \param scalar The scaling value being multiplied against each data point
-  \param num_points The number of complex data values to be deinterleaved
-*/
-static inline void volk_8sc_s32f_deinterleave_32f_32f_a16_sse4_1(float* iBuffer, float* qBuffer, const lv_8sc_t* complexVector, const float scalar, unsigned int num_points){
-  float* iBufferPtr = iBuffer;
-  float* qBufferPtr = qBuffer;
-
-  unsigned int number = 0;
-  const unsigned int eighthPoints = num_points / 8;    
-  __m128 iFloatValue, qFloatValue;
-
-  const float iScalar= 1.0 / scalar;
-  __m128 invScalar = _mm_set_ps1(iScalar);
-  __m128i complexVal, iIntVal, qIntVal, iComplexVal, qComplexVal;
-  int8_t* complexVectorPtr = (int8_t*)complexVector;
-
-  __m128i iMoveMask = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 14, 12, 10, 8, 6, 4, 2, 0);
-  __m128i qMoveMask = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 15, 13, 11, 9, 7, 5, 3, 1);
-
-  for(;number < eighthPoints; number++){
-    complexVal = _mm_load_si128((__m128i*)complexVectorPtr); complexVectorPtr += 16;
-    iComplexVal = _mm_shuffle_epi8(complexVal, iMoveMask);
-    qComplexVal = _mm_shuffle_epi8(complexVal, qMoveMask);
-
-    iIntVal = _mm_cvtepi8_epi32(iComplexVal);
-    iFloatValue = _mm_cvtepi32_ps(iIntVal);
-    iFloatValue = _mm_mul_ps(iFloatValue, invScalar);
-    _mm_store_ps(iBufferPtr, iFloatValue);
-    iBufferPtr += 4;
-
-    iComplexVal = _mm_srli_si128(iComplexVal, 4);
-
-    iIntVal = _mm_cvtepi8_epi32(iComplexVal);
-    iFloatValue = _mm_cvtepi32_ps(iIntVal);
-    iFloatValue = _mm_mul_ps(iFloatValue, invScalar);
-    _mm_store_ps(iBufferPtr, iFloatValue);
-    iBufferPtr += 4;
-
-    qIntVal = _mm_cvtepi8_epi32(qComplexVal);
-    qFloatValue = _mm_cvtepi32_ps(qIntVal);
-    qFloatValue = _mm_mul_ps(qFloatValue, invScalar);
-    _mm_store_ps(qBufferPtr, qFloatValue);
-    qBufferPtr += 4;
-
-    qComplexVal = _mm_srli_si128(qComplexVal, 4);
-
-    qIntVal = _mm_cvtepi8_epi32(qComplexVal);
-    qFloatValue = _mm_cvtepi32_ps(qIntVal);
-    qFloatValue = _mm_mul_ps(qFloatValue, invScalar);
-    _mm_store_ps(qBufferPtr, qFloatValue);
-
-    qBufferPtr += 4;
-  }
-
-  number = eighthPoints * 8;
-  for(; number < num_points; number++){
-    *iBufferPtr++ = (float)(*complexVectorPtr++) * iScalar;
-    *qBufferPtr++ = (float)(*complexVectorPtr++) * iScalar;
-  }
-    
-}
-#endif /* LV_HAVE_SSE4_1 */
-
-#if LV_HAVE_SSE
-#include <xmmintrin.h>
-/*!
-  \brief Deinterleaves the complex 8 bit vector into I & Q floating point vector data
-  \param complexVector The complex input vector
-  \param iBuffer The I buffer output data
-  \param qBuffer The Q buffer output data
-  \param scalar The scaling value being multiplied against each data point
-  \param num_points The number of complex data values to be deinterleaved
-*/
-static inline void volk_8sc_s32f_deinterleave_32f_32f_a16_sse(float* iBuffer, float* qBuffer, const lv_8sc_t* complexVector, const float scalar, unsigned int num_points){
-  float* iBufferPtr = iBuffer;
-  float* qBufferPtr = qBuffer;
-
-  unsigned int number = 0;
-  const unsigned int quarterPoints = num_points / 4;    
-  __m128 cplxValue1, cplxValue2, iValue, qValue;
-
-  __m128 invScalar = _mm_set_ps1(1.0/scalar);
-  int8_t* complexVectorPtr = (int8_t*)complexVector;
-
-  float floatBuffer[8] __attribute__((aligned(128)));
-
-  for(;number < quarterPoints; number++){
-    floatBuffer[0] = (float)(complexVectorPtr[0]);
-    floatBuffer[1] = (float)(complexVectorPtr[1]);
-    floatBuffer[2] = (float)(complexVectorPtr[2]);
-    floatBuffer[3] = (float)(complexVectorPtr[3]);
-      
-    floatBuffer[4] = (float)(complexVectorPtr[4]);
-    floatBuffer[5] = (float)(complexVectorPtr[5]);
-    floatBuffer[6] = (float)(complexVectorPtr[6]);
-    floatBuffer[7] = (float)(complexVectorPtr[7]);
-
-    cplxValue1 = _mm_load_ps(&floatBuffer[0]);
-    cplxValue2 = _mm_load_ps(&floatBuffer[4]);
-
-    complexVectorPtr += 8;
-
-    cplxValue1 = _mm_mul_ps(cplxValue1, invScalar);
-    cplxValue2 = _mm_mul_ps(cplxValue2, invScalar);
-
-    // Arrange in i1i2i3i4 format
-    iValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2,0,2,0));
-    qValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(3,1,3,1));
-
-    _mm_store_ps(iBufferPtr, iValue);
-    _mm_store_ps(qBufferPtr, qValue);
-
-    iBufferPtr += 4;
-    qBufferPtr += 4;
-  }
-
-  number = quarterPoints * 4;
-  complexVectorPtr = (int8_t*)&complexVector[number];
-  for(; number < num_points; number++){
-    *iBufferPtr++ = (float)(*complexVectorPtr++) / scalar;
-    *qBufferPtr++ = (float)(*complexVectorPtr++) / scalar;
-  }
-}
-#endif /* LV_HAVE_SSE */
-
-#if LV_HAVE_GENERIC
-/*!
-  \brief Deinterleaves the complex 8 bit vector into I & Q floating point vector data
-  \param complexVector The complex input vector
-  \param iBuffer The I buffer output data
-  \param qBuffer The Q buffer output data
-  \param scalar The scaling value being multiplied against each data point
-  \param num_points The number of complex data values to be deinterleaved
-*/
-static inline void volk_8sc_s32f_deinterleave_32f_32f_a16_generic(float* iBuffer, float* qBuffer, const lv_8sc_t* complexVector, const float scalar, unsigned int num_points){
-  const int8_t* complexVectorPtr = (const int8_t*)complexVector;
-  float* iBufferPtr = iBuffer;
-  float* qBufferPtr = qBuffer;
-  unsigned int number;
-  const float invScalar = 1.0 / scalar;
-  for(number = 0; number < num_points; number++){
-    *iBufferPtr++ = (float)(*complexVectorPtr++)*invScalar;
-    *qBufferPtr++ = (float)(*complexVectorPtr++)*invScalar;
-  }
-}
-#endif /* LV_HAVE_GENERIC */
-
-
-
-
-#endif /* INCLUDED_volk_8sc_s32f_deinterleave_32f_32f_a16_H */
diff --git a/volk/include/volk/volk_8sc_s32f_deinterleave_real_32f_a16.h b/volk/include/volk/volk_8sc_s32f_deinterleave_real_32f_a16.h
deleted file mode 100644
index 902795131..000000000
--- a/volk/include/volk/volk_8sc_s32f_deinterleave_real_32f_a16.h
+++ /dev/null
@@ -1,133 +0,0 @@
-#ifndef INCLUDED_volk_8sc_s32f_deinterleave_real_32f_a16_H
-#define INCLUDED_volk_8sc_s32f_deinterleave_real_32f_a16_H
-
-#include <inttypes.h>
-#include <stdio.h>
-
-#if LV_HAVE_SSE4_1
-#include <smmintrin.h>
-/*!
-  \brief Deinterleaves the complex 8 bit vector into I float vector data
-  \param complexVector The complex input vector
-  \param iBuffer The I buffer output data
-  \param scalar The scaling value being multiplied against each data point
-  \param num_points The number of complex data values to be deinterleaved
-*/
-static inline void volk_8sc_s32f_deinterleave_real_32f_a16_sse4_1(float* iBuffer, const lv_8sc_t* complexVector, const float scalar, unsigned int num_points){
-  float* iBufferPtr = iBuffer;
-
-  unsigned int number = 0;
-  const unsigned int eighthPoints = num_points / 8;    
-  __m128 iFloatValue;
-
-  const float iScalar= 1.0 / scalar;
-  __m128 invScalar = _mm_set_ps1(iScalar);
-  __m128i complexVal, iIntVal;
-  int8_t* complexVectorPtr = (int8_t*)complexVector;
-
-  __m128i moveMask = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 14, 12, 10, 8, 6, 4, 2, 0);
-
-  for(;number < eighthPoints; number++){
-    complexVal = _mm_load_si128((__m128i*)complexVectorPtr); complexVectorPtr += 16;
-    complexVal = _mm_shuffle_epi8(complexVal, moveMask);
-
-    iIntVal = _mm_cvtepi8_epi32(complexVal);
-    iFloatValue = _mm_cvtepi32_ps(iIntVal);
-
-    iFloatValue = _mm_mul_ps(iFloatValue, invScalar);
-
-    _mm_store_ps(iBufferPtr, iFloatValue);
-
-    iBufferPtr += 4;
-
-    complexVal = _mm_srli_si128(complexVal, 4);
-    iIntVal = _mm_cvtepi8_epi32(complexVal);
-    iFloatValue = _mm_cvtepi32_ps(iIntVal);
-
-    iFloatValue = _mm_mul_ps(iFloatValue, invScalar);
-
-    _mm_store_ps(iBufferPtr, iFloatValue);
-
-    iBufferPtr += 4;
-  }
-
-  number = eighthPoints * 8;
-  for(; number < num_points; number++){
-    *iBufferPtr++ = (float)(*complexVectorPtr++) * iScalar;
-    complexVectorPtr++;
-  }
-    
-}
-#endif /* LV_HAVE_SSE4_1 */
-
-
-#if LV_HAVE_SSE
-#include <xmmintrin.h>
-/*!
-  \brief Deinterleaves the complex 8 bit vector into I float vector data
-  \param complexVector The complex input vector
-  \param iBuffer The I buffer output data
-  \param scalar The scaling value being multiplied against each data point
-  \param num_points The number of complex data values to be deinterleaved
-*/
-static inline void volk_8sc_s32f_deinterleave_real_32f_a16_sse(float* iBuffer, const lv_8sc_t* complexVector, const float scalar, unsigned int num_points){
-  float* iBufferPtr = iBuffer;
-
-  unsigned int number = 0;
-  const unsigned int quarterPoints = num_points / 4;    
-  __m128 iValue;
-
-  const float iScalar= 1.0 / scalar;
-  __m128 invScalar = _mm_set_ps1(iScalar);
-  int8_t* complexVectorPtr = (int8_t*)complexVector;
-
-  float floatBuffer[4] __attribute__((aligned(128)));
-
-  for(;number < quarterPoints; number++){
-    floatBuffer[0] = (float)(*complexVectorPtr); complexVectorPtr += 2;
-    floatBuffer[1] = (float)(*complexVectorPtr); complexVectorPtr += 2;
-    floatBuffer[2] = (float)(*complexVectorPtr); complexVectorPtr += 2;
-    floatBuffer[3] = (float)(*complexVectorPtr); complexVectorPtr += 2; 
-
-    iValue = _mm_load_ps(floatBuffer);
-
-    iValue = _mm_mul_ps(iValue, invScalar);
-
-    _mm_store_ps(iBufferPtr, iValue);
-
-    iBufferPtr += 4;
-  }
-
-  number = quarterPoints * 4;
-  for(; number < num_points; number++){
-    *iBufferPtr++ = (float)(*complexVectorPtr++) * iScalar;
-    complexVectorPtr++;
-  }
-    
-}
-#endif /* LV_HAVE_SSE */
-
-#if LV_HAVE_GENERIC
-/*!
-  \brief Deinterleaves the complex 8 bit vector into I float vector data
-  \param complexVector The complex input vector
-  \param iBuffer The I buffer output data
-  \param scalar The scaling value being multiplied against each data point
-  \param num_points The number of complex data values to be deinterleaved
-*/
-static inline void volk_8sc_s32f_deinterleave_real_32f_a16_generic(float* iBuffer, const lv_8sc_t* complexVector, const float scalar, unsigned int num_points){
-  unsigned int number = 0;
-  const int8_t* complexVectorPtr = (const int8_t*)complexVector;
-  float* iBufferPtr = iBuffer;
-  const float invScalar = 1.0 / scalar;
-  for(number = 0; number < num_points; number++){
-    *iBufferPtr++ = ((float)(*complexVectorPtr++)) * invScalar;
-    complexVectorPtr++;
-  }
-}
-#endif /* LV_HAVE_GENERIC */
-
-
-
-
-#endif /* INCLUDED_volk_8sc_s32f_deinterleave_real_32f_a16_H */
diff --git a/volk/include/volk/volk_register.py b/volk/include/volk/volk_register.py
index fc1ec10ef..bc8f959af 100755
--- a/volk/include/volk/volk_register.py
+++ b/volk/include/volk/volk_register.py
@@ -55,7 +55,7 @@ functions = [];
 
 
 for line in mfile:
-    subline = re.search(".*(a16).*", line);
+    subline = re.search(".*_(a16|u)\.h.*", line);
     if subline:
         subsubline = re.search("(?<=volk_).*", subline.group(0));
         if subsubline:
@@ -70,7 +70,7 @@ datatypes = set(datatypes);
 for line in mfile:
     for dt in datatypes:
         if dt in line:
-            subline = re.search("(volk_" + dt +"_.*(a16).*\.h)", line);
+            subline = re.search("(volk_" + dt +"_.*(a16|u).*\.h)", line);
             if subline:
                 
                 subsubline = re.search(".+(?=\.h)", subline.group(0));
diff --git a/volk/lib/qa_utils.cc b/volk/lib/qa_utils.cc
index e73b70985..4c151bd6f 100644
--- a/volk/lib/qa_utils.cc
+++ b/volk/lib/qa_utils.cc
@@ -19,7 +19,8 @@ float uniform() {
   return 2.0 * ((float) rand() / RAND_MAX - 0.5);	// uniformly (-1, 1)
 }
 
-void random_floats (float *buf, unsigned n)
+template <class t>
+void random_floats (t *buf, unsigned n)
 {
   for (unsigned i = 0; i < n; i++)
     buf[i] = uniform ();
@@ -28,8 +29,8 @@ void random_floats (float *buf, unsigned n)
 void load_random_data(void *data, volk_type_t type, unsigned int n) {
     if(type.is_complex) n *= 2;
     if(type.is_float) {
-        assert(type.size == 4); //TODO: double support
-        random_floats((float *)data, n);
+        if(type.size == 8) random_floats<double>((double *)data, n);
+        else random_floats<float>((float *)data, n);
     } else {
         float int_max = pow(2, type.size*8);
         if(type.is_signed) int_max /= 2.0;
@@ -54,7 +55,7 @@ void load_random_data(void *data, volk_type_t type, unsigned int n) {
                 else ((uint8_t *)data)[i] = (uint8_t) scaled_rand;
             break;
             default:
-                throw; //no shenanigans here
+                throw "load_random_data: no support for data size > 8 or < 1"; //no shenanigans here
             }
         }
     }
@@ -94,6 +95,9 @@ static std::vector<std::string> get_arch_list(const int archs[]) {
         case (1<<LV_SSE2):
             archlist.push_back("sse2");
             break;
+        case (1<<LV_SSE3):
+            archlist.push_back("sse3");
+            break;
         case (1<<LV_SSSE3):
             archlist.push_back("ssse3");
             break;
@@ -128,7 +132,7 @@ volk_type_t volk_type_from_string(std::string name) {
     type.size = 0;
     type.str = name;
     
-    assert(name.size() > 1);
+    if(name.size() < 2) throw std::string("name too short to be a datatype");
     
     //is it a scalar?
     if(name[0] == 's') { 
@@ -138,7 +142,7 @@ volk_type_t volk_type_from_string(std::string name) {
     
     //get the data size
     int last_size_pos = name.find_last_of("0123456789");
-    if(last_size_pos < 0) throw 0;
+    if(last_size_pos < 0) throw std::string("no size spec in type ").append(name);
     //will throw if malformed
     int size = boost::lexical_cast<int>(name.substr(0, last_size_pos+1));
 
@@ -182,12 +186,14 @@ static void get_signatures_from_name(std::vector<volk_type_t> &inputsig,
     //ok. we're assuming a string in the form
     //(sig)_(multiplier-opt)_..._(name)_(sig)_(multiplier-opt)_..._(alignment)
 
-    enum { SIDE_INPUT, SIDE_OUTPUT } side = SIDE_INPUT;
+    enum { SIDE_INPUT, SIDE_NAME, SIDE_OUTPUT } side = SIDE_INPUT;
     std::string fn_name;
     volk_type_t type;
     BOOST_FOREACH(std::string token, toked) {
         try {
             type = volk_type_from_string(token);
+            if(side == SIDE_NAME) side = SIDE_OUTPUT; //if this is the first one after the name...
+            
             if(side == SIDE_INPUT) inputsig.push_back(type);
             else outputsig.push_back(type);
         } catch (...){
@@ -201,9 +207,11 @@ static void get_signatures_from_name(std::vector<volk_type_t> &inputsig,
                 }
             }
             else if(side == SIDE_INPUT) { //it's the function name, at least it better be
-                side = SIDE_OUTPUT;
-                fn_name = token;
-            } else {
+                side = SIDE_NAME;
+                fn_name.append("_");
+                fn_name.append(token);
+            } 
+            else if(side == SIDE_OUTPUT) {
                 if(token != toked.back()) throw; //the last token in the name is the alignment
             }
         }
@@ -236,20 +244,40 @@ inline void run_cast_test2_s32f(volk_fn_2arg_s32f func, void *outbuff, std::vect
     while(iter--) func(outbuff, inbuffs[0], scalar, vlen, arch.c_str());
 }
 
+inline void run_cast_test3_s32f(volk_fn_3arg_s32f func, void *outbuff, std::vector<void *> &inbuffs, float scalar, unsigned int vlen, unsigned int iter, std::string arch) {
+    for(unsigned int pass = 0; pass < iter; pass++) func(outbuff, inbuffs[0], inbuffs[1], scalar, vlen, arch.c_str()); // call the kernel iter times on the first two input buffers
+}
+
 template <class t>
 bool fcompare(t *in1, t *in2, unsigned int vlen, float tol) {
+    bool fail = false; // set once any element is out of tolerance; remaining elements are still checked
+    int print_max_errs = 10; // only the first 10 mismatches are printed
     for(int i=0; i<vlen; i++) {
-        if(fabs(((t *)(in1))[i] - ((t *)(in2))[i]) > tol) return 1;
+        if(fabs(((t *)(in1))[i] - ((t *)(in2))[i]) > tol * fabs(((t *)(in1))[i])) { // relative error against |in1|; multiplying avoids a negative or zero divisor
+            fail=true;
+            if(print_max_errs-- > 0) {
+                std::cout << "offset " << i << " in1: " << t(((t *)(in1))[i]) << " in2: " << t(((t *)(in2))[i]) << std::endl;
+            }
+        }
     }
-    return 0;
+    
+    return fail;
 }
 
 template <class t>
-bool icompare(t *in1, t *in2, unsigned int vlen) {
+bool icompare(t *in1, t *in2, unsigned int vlen, float tol) { // NOTE(review): tol is accepted but never read in this body
+    bool fail = false; // becomes true if any element differs; the loop keeps scanning afterwards
+    int print_max_errs = 10; // cap on how many mismatches are printed
     for(int i=0; i<vlen; i++) {
-        if(((t *)(in1))[i] != ((t *)(in2))[i]) return 1;
+        if(((t *)(in1))[i] != ((t *)(in2))[i]) { // exact element-wise comparison
+            fail=true;
+            if(print_max_errs-- > 0) {
+                std::cout << "offset " << i << " in1: " << int(((t *)(in1))[i]) << " in2: " << int(((t *)(in2))[i]) << std::endl; // values are cast to int for printing
+            }
+        }
     }
-    return 0;
+    
+    return fail;
 }
 
 bool run_volk_tests(const int archs[], void (*manual_func)(), std::string name, float tol, int vlen, int iter) {
@@ -300,7 +328,7 @@ bool run_volk_tests(const int archs[], void (*manual_func)(), std::string name,
             load_random_data(inbuffs[i], inputsig[i], vlen);        
         }
     }
-    
+
     //now run the test
     clock_t start, end;
     for(int i = 0; i < arch_list.size(); i++) {
@@ -311,18 +339,22 @@ bool run_volk_tests(const int archs[], void (*manual_func)(), std::string name,
                 if(inputsc.size() == 0) {
                     run_cast_test1((volk_fn_1arg)(manual_func), outbuffs[i], vlen, iter, arch_list[i]); 
                 } else if(inputsc.size() == 1 && inputsc[0].is_float) {
-                    run_cast_test1_s32f((volk_fn_1arg_s32f)(manual_func), outbuffs[i], 1000.0, vlen, iter, arch_list[i]);
+                    run_cast_test1_s32f((volk_fn_1arg_s32f)(manual_func), outbuffs[i], 255.0, vlen, iter, arch_list[i]);
                 } else throw "unsupported 1 arg function >1 scalars";
                 break;
             case 2:
                 if(inputsc.size() == 0) {
                     run_cast_test2((volk_fn_2arg)(manual_func), outbuffs[i], inbuffs, vlen, iter, arch_list[i]);
                 } else if(inputsc.size() == 1 && inputsc[0].is_float) {
-                    run_cast_test2_s32f((volk_fn_2arg_s32f)(manual_func), outbuffs[i], inbuffs, 1000.0, vlen, iter, arch_list[i]);
+                    run_cast_test2_s32f((volk_fn_2arg_s32f)(manual_func), outbuffs[i], inbuffs, 255.0, vlen, iter, arch_list[i]);
                 } else throw "unsupported 2 arg function >1 scalars";
                 break;
             case 3:
-                run_cast_test3((volk_fn_3arg)(manual_func), outbuffs[i], inbuffs, vlen, iter, arch_list[i]);
+                if(inputsc.size() == 0) {
+                    run_cast_test3((volk_fn_3arg)(manual_func), outbuffs[i], inbuffs, vlen, iter, arch_list[i]);
+                } else if(inputsc.size() == 1 && inputsc[0].is_float) {
+                    run_cast_test3_s32f((volk_fn_3arg_s32f)(manual_func), outbuffs[i], inbuffs, 255.0, vlen, iter, arch_list[i]);
+                } else throw "unsupported 3 arg function >1 scalars";
                 break;
             case 4:
                 run_cast_test4((volk_fn_4arg)(manual_func), outbuffs[i], inbuffs, vlen, iter, arch_list[i]);
@@ -337,29 +369,24 @@ bool run_volk_tests(const int archs[], void (*manual_func)(), std::string name,
     }
     //and now compare each output to the generic output
     //first we have to know which output is the generic one, they aren't in order...
-    int generic_offset;
+    int generic_offset=0;
     for(int i=0; i<arch_list.size(); i++) 
         if(arch_list[i] == "generic") generic_offset=i;
-        
+
     //now compare
     if(outputsig.size() == 0) outputsig = inputsig; //a hack, i know
 
     bool fail = false;
     for(int i=0; i<arch_list.size(); i++) {
         if(i != generic_offset) {
-            if(outputsig[0].str == "32fc") {
-                fail = fcompare((float *) outbuffs[generic_offset], (float *) outbuffs[i], vlen*2, tol);
-            } else if(outputsig[0].str == "32f") {
-                fail = fcompare((float *) outbuffs[generic_offset], (float *) outbuffs[i], vlen, tol);
-            } else if(outputsig[0].str == "32u" || outputsig[0].str == "32s" || outputsig[0].str == "16sc") {
-                fail = icompare((uint32_t *) outbuffs[generic_offset], (uint32_t *) outbuffs[i], vlen);
-            } else if(outputsig[0].size == 2) {
-                fail = icompare((uint16_t *) outbuffs[generic_offset], (uint16_t *) outbuffs[i], vlen);
-            } else if(outputsig[0].size == 1) {
-                fail = icompare((uint8_t *) outbuffs[generic_offset], (uint8_t *) outbuffs[i], vlen);
-            } else { 
-                std::cout << "Error: invalid type " << outputsig[0].str << std::endl;
-                fail = true;
+            if(outputsig[0].is_float) {
+                if(outputsig[0].size == 8) {
+                    fail = fcompare((double *) outbuffs[generic_offset], (double *) outbuffs[i], vlen*(outputsig[0].is_complex ? 2 : 1), tol);
+                } else {
+                    fail = fcompare((float *) outbuffs[generic_offset], (float *) outbuffs[i], vlen*(outputsig[0].is_complex ? 2 : 1), tol);
+                }
+            } else {
+                fail = memcmp(outbuffs[generic_offset], outbuffs[i], outputsig[0].size * vlen * (outputsig[0].is_complex ? 2:1));
             }
             if(fail) {
                 std::cout << name << ": fail on arch " << arch_list[i] << std::endl;
@@ -367,12 +394,6 @@ bool run_volk_tests(const int archs[], void (*manual_func)(), std::string name,
         }
     }
 
-//    BOOST_FOREACH(void *buf, inbuffs) {
-//        free(buf);
-//    }
-//    BOOST_FOREACH(void *buf, outbuffs) {
-//        free(buf);
-//    }
     return fail;
 }
 
diff --git a/volk/lib/qa_utils.h b/volk/lib/qa_utils.h
index 79c5d7778..79fc8f006 100644
--- a/volk/lib/qa_utils.h
+++ b/volk/lib/qa_utils.h
@@ -28,5 +28,6 @@ typedef void (*volk_fn_3arg)(void *, void *, void *, unsigned int, const char*);
 typedef void (*volk_fn_4arg)(void *, void *, void *, void *, unsigned int, const char*);
 typedef void (*volk_fn_1arg_s32f)(void *, float, unsigned int, const char*); //one input vector, one scalar float input
 typedef void (*volk_fn_2arg_s32f)(void *, void *, float, unsigned int, const char*);
+typedef void (*volk_fn_3arg_s32f)(void *, void *, void *, float, unsigned int, const char*);
 
 #endif //VOLK_QA_UTILS_H
diff --git a/volk/orc/Makefile.am b/volk/orc/Makefile.am
index 43f38543c..6b5e4f8b6 100644
--- a/volk/orc/Makefile.am
+++ b/volk/orc/Makefile.am
@@ -25,27 +25,28 @@ lib_LTLIBRARIES = libvolk_orc.la
 libvolk_orc_la_LDFLAGS = $(ORC_LDFLAGS)
 
 libvolk_orc_la_SOURCES = \
-volk_8s_convert_16s_a16_orc_impl.orc \
-volk_8s_s32f_convert_32f_a16_orc_impl.orc \
+volk_8i_convert_16i_a16_orc_impl.orc \
+volk_8i_s32f_convert_32f_a16_orc_impl.orc \
 volk_16u_byteswap_a16_orc_impl.orc \
-volk_32s_32s_and_32s_a16_orc_impl.orc \
-volk_32s_32s_or_32s_a16_orc_impl.orc \
-volk_32f_32f_add_32f_a16_orc_impl.orc \
-volk_32f_32f_subtract_32f_a16_orc_impl.orc \
-volk_32f_32f_divide_32f_a16_orc_impl.orc \
-volk_32f_32f_multiply_32f_a16_orc_impl.orc \
-volk_32fc_32fc_multiply_32fc_a16_orc_impl.orc \
+volk_32i_x2_and_32i_a16_orc_impl.orc \
+volk_32i_x2_or_32i_a16_orc_impl.orc \
+volk_32f_x2_add_32f_a16_orc_impl.orc \
+volk_32f_x2_subtract_32f_a16_orc_impl.orc \
+volk_32f_x2_divide_32f_a16_orc_impl.orc \
+volk_32f_x2_multiply_32f_a16_orc_impl.orc \
+volk_32fc_x2_multiply_32fc_a16_orc_impl.orc \
 volk_32fc_32f_multiply_32fc_a16_orc_impl.orc \
 volk_32f_sqrt_32f_a16_orc_impl.orc \
-volk_32f_32f_max_32f_a16_orc_impl.orc \
-volk_32f_32f_min_32f_a16_orc_impl.orc \
+volk_32f_x2_max_32f_a16_orc_impl.orc \
+volk_32f_x2_min_32f_a16_orc_impl.orc \
 volk_32f_s32f_normalize_a16_orc_impl.orc \
 volk_32fc_magnitude_32f_a16_orc_impl.orc \
-volk_32fc_s32f_magnitude_16s_a16_orc_impl.orc \
-volk_16sc_magnitude_16s_a16_orc_impl.orc \
-volk_16sc_deinterleave_16s_16s_a16_orc_impl.orc \
-volk_16sc_s32f_deinterleave_32f_32f_a16_orc_impl.orc \
-volk_16sc_deinterleave_real_8s_a16_orc_impl.orc
+volk_32fc_s32f_magnitude_16i_a16_orc_impl.orc \
+volk_16ic_magnitude_16i_a16_orc_impl.orc \
+volk_16ic_deinterleave_16i_x2_a16_orc_impl.orc \
+volk_16i_s32f_deinterleave_32f_x2_a16_orc_impl.orc \
+volk_16ic_deinterleave_real_8i_a16_orc_impl.orc
+
 
 
 
diff --git a/volk/orc/volk_16i_s32f_deinterleave_32f_x2_a16_orc_impl.orc b/volk/orc/volk_16i_s32f_deinterleave_32f_x2_a16_orc_impl.orc
new file mode 100644
index 000000000..0189fbf5d
--- /dev/null
+++ b/volk/orc/volk_16i_s32f_deinterleave_32f_x2_a16_orc_impl.orc
@@ -0,0 +1,12 @@
+.function volk_16ic_s32f_deinterleave_32f_x2_a16_orc_impl
+.dest 4 idst
+.dest 4 qdst
+.source 4 src
+.floatparam 4 scalar
+.temp 8 iql
+.temp 8 iqf
+
+x2 convswl iql, src
+x2 convlf iqf, iql
+x2 divf iqf, iqf, scalar
+splitql qdst, idst, iqf
diff --git a/volk/orc/volk_16ic_deinterleave_16i_x2_a16_orc_impl.orc b/volk/orc/volk_16ic_deinterleave_16i_x2_a16_orc_impl.orc
new file mode 100644
index 000000000..56018edda
--- /dev/null
+++ b/volk/orc/volk_16ic_deinterleave_16i_x2_a16_orc_impl.orc
@@ -0,0 +1,5 @@
+.function volk_16ic_deinterleave_16i_x2_a16_orc_impl
+.dest 2 idst
+.dest 2 qdst
+.source 4 src
+splitlw qdst, idst, src
diff --git a/volk/orc/volk_16ic_deinterleave_real_8i_a16_orc_impl.orc b/volk/orc/volk_16ic_deinterleave_real_8i_a16_orc_impl.orc
new file mode 100644
index 000000000..dba9a4c8e
--- /dev/null
+++ b/volk/orc/volk_16ic_deinterleave_real_8i_a16_orc_impl.orc
@@ -0,0 +1,6 @@
+.function volk_16ic_deinterleave_real_8i_a16_orc_impl
+.dest 1 dst
+.source 4 src
+.temp 2 iw
+select0lw iw, src
+convhwb dst, iw
diff --git a/volk/orc/volk_16ic_magnitude_16i_a16_orc_impl.orc b/volk/orc/volk_16ic_magnitude_16i_a16_orc_impl.orc
new file mode 100644
index 000000000..37225e9b8
--- /dev/null
+++ b/volk/orc/volk_16ic_magnitude_16i_a16_orc_impl.orc
@@ -0,0 +1,23 @@
+.function volk_16ic_magnitude_16i_a16_orc_impl
+.source 4 src
+.dest 2 dst
+.floatparam 4 scalar
+.temp 8 iql
+.temp 8 iqf
+.temp 8 prodiqf
+.temp 4 qf
+.temp 4 if
+.temp 4 sumf
+.temp 4 rootf
+.temp 4 rootl
+
+x2 convswl iql, src
+x2 convlf iqf, iql
+x2 divf iqf, iqf, scalar
+x2 mulf prodiqf, iqf, iqf
+splitql qf, if, prodiqf
+addf sumf, if, qf
+sqrtf rootf, sumf
+mulf rootf, rootf, scalar
+convfl rootl, rootf
+convlw dst, rootl
diff --git a/volk/orc/volk_16sc_deinterleave_16s_16s_a16_orc_impl.orc b/volk/orc/volk_16sc_deinterleave_16s_16s_a16_orc_impl.orc
deleted file mode 100644
index d396a0052..000000000
--- a/volk/orc/volk_16sc_deinterleave_16s_16s_a16_orc_impl.orc
+++ /dev/null
@@ -1,5 +0,0 @@
-.function volk_16sc_deinterleave_16s_16s_a16_orc_impl
-.dest 2 idst
-.dest 2 qdst
-.source 4 src
-splitlw qdst, idst, src
diff --git a/volk/orc/volk_16sc_deinterleave_real_8s_a16_orc_impl.orc b/volk/orc/volk_16sc_deinterleave_real_8s_a16_orc_impl.orc
deleted file mode 100644
index 5954c406f..000000000
--- a/volk/orc/volk_16sc_deinterleave_real_8s_a16_orc_impl.orc
+++ /dev/null
@@ -1,6 +0,0 @@
-.function volk_16sc_deinterleave_real_8s_a16_orc_impl
-.dest 1 dst
-.source 4 src
-.temp 2 iw
-select0lw iw, src
-convhwb dst, iw
diff --git a/volk/orc/volk_16sc_magnitude_16s_a16_orc_impl.orc b/volk/orc/volk_16sc_magnitude_16s_a16_orc_impl.orc
deleted file mode 100644
index 2a49d4ecb..000000000
--- a/volk/orc/volk_16sc_magnitude_16s_a16_orc_impl.orc
+++ /dev/null
@@ -1,23 +0,0 @@
-.function volk_16sc_magnitude_16s_a16_orc_impl
-.source 4 src
-.dest 2 dst
-.floatparam 4 scalar
-.temp 8 iql
-.temp 8 iqf
-.temp 8 prodiqf
-.temp 4 qf
-.temp 4 if
-.temp 4 sumf
-.temp 4 rootf
-.temp 4 rootl
-
-x2 convswl iql, src
-x2 convlf iqf, iql
-x2 divf iqf, iqf, scalar
-x2 mulf prodiqf, iqf, iqf
-splitql qf, if, prodiqf
-addf sumf, if, qf
-sqrtf rootf, sumf
-mulf rootf, rootf, scalar
-convfl rootl, rootf
-convlw dst, rootl
diff --git a/volk/orc/volk_16sc_magnitude_32f_aligned16_orc_impl.orc b/volk/orc/volk_16sc_magnitude_32f_aligned16_orc_impl.orc
index 6d2ed8197..1e2380837 100644
--- a/volk/orc/volk_16sc_magnitude_32f_aligned16_orc_impl.orc
+++ b/volk/orc/volk_16sc_magnitude_32f_aligned16_orc_impl.orc
@@ -1,4 +1,4 @@
-.function volk_16sc_magnitude_32f_aligned16_orc_impl
+.function volk_16ic_magnitude_32f_a16_orc_impl
 .source 4 src
 .dest 4 dst
 .floatparam 4 scalar
diff --git a/volk/orc/volk_16sc_s32f_deinterleave_32f_32f_a16_orc_impl.orc b/volk/orc/volk_16sc_s32f_deinterleave_32f_32f_a16_orc_impl.orc
deleted file mode 100644
index 47c3d28a9..000000000
--- a/volk/orc/volk_16sc_s32f_deinterleave_32f_32f_a16_orc_impl.orc
+++ /dev/null
@@ -1,12 +0,0 @@
-.function volk_16sc_s32f_deinterleave_32f_32f_a16_orc_impl
-.dest 4 idst
-.dest 4 qdst
-.source 4 src
-.floatparam 4 scalar
-.temp 8 iql
-.temp 8 iqf
-
-x2 convswl iql, src
-x2 convlf iqf, iql
-x2 divf iqf, iqf, scalar
-splitql qdst, idst, iqf
diff --git a/volk/orc/volk_32f_32f_add_32f_a16_orc_impl.orc b/volk/orc/volk_32f_32f_add_32f_a16_orc_impl.orc
deleted file mode 100644
index e6a30cf01..000000000
--- a/volk/orc/volk_32f_32f_add_32f_a16_orc_impl.orc
+++ /dev/null
@@ -1,5 +0,0 @@
-.function volk_32f_32f_add_32f_a16_orc_impl
-.dest 4 dst
-.source 4 src1
-.source 4 src2
-addf dst, src1, src2
diff --git a/volk/orc/volk_32f_32f_divide_32f_a16_orc_impl.orc b/volk/orc/volk_32f_32f_divide_32f_a16_orc_impl.orc
deleted file mode 100644
index 0bdcd0010..000000000
--- a/volk/orc/volk_32f_32f_divide_32f_a16_orc_impl.orc
+++ /dev/null
@@ -1,5 +0,0 @@
-.function volk_32f_32f_divide_32f_a16_orc_impl
-.dest 4 dst
-.source 4 src1
-.source 4 src2
-divf dst, src1, src2
diff --git a/volk/orc/volk_32f_32f_max_32f_a16_orc_impl.orc b/volk/orc/volk_32f_32f_max_32f_a16_orc_impl.orc
deleted file mode 100644
index 9584e6634..000000000
--- a/volk/orc/volk_32f_32f_max_32f_a16_orc_impl.orc
+++ /dev/null
@@ -1,5 +0,0 @@
-.function volk_32f_32f_max_32f_a16_orc_impl
-.dest 4 dst
-.source 4 src1
-.source 4 src2
-maxf dst, src1, src2
diff --git a/volk/orc/volk_32f_32f_min_32f_a16_orc_impl.orc b/volk/orc/volk_32f_32f_min_32f_a16_orc_impl.orc
deleted file mode 100644
index 47b9c05db..000000000
--- a/volk/orc/volk_32f_32f_min_32f_a16_orc_impl.orc
+++ /dev/null
@@ -1,5 +0,0 @@
-.function volk_32f_32f_min_32f_a16_orc_impl
-.dest 4 dst
-.source 4 src1
-.source 4 src2
-minf dst, src1, src2
diff --git a/volk/orc/volk_32f_32f_multiply_32f_a16_orc_impl.orc b/volk/orc/volk_32f_32f_multiply_32f_a16_orc_impl.orc
deleted file mode 100644
index e5a049c16..000000000
--- a/volk/orc/volk_32f_32f_multiply_32f_a16_orc_impl.orc
+++ /dev/null
@@ -1,5 +0,0 @@
-.function volk_32f_32f_multiply_32f_a16_orc_impl
-.dest 4 dst
-.source 4 src1
-.source 4 src2
-mulf dst, src1, src2
diff --git a/volk/orc/volk_32f_32f_subtract_32f_a16_orc_impl.orc b/volk/orc/volk_32f_32f_subtract_32f_a16_orc_impl.orc
deleted file mode 100644
index 2ab42d5f6..000000000
--- a/volk/orc/volk_32f_32f_subtract_32f_a16_orc_impl.orc
+++ /dev/null
@@ -1,5 +0,0 @@
-.function volk_32f_32f_subtract_32f_a16_orc_impl
-.dest 4 dst
-.source 4 src1
-.source 4 src2
-subf dst, src1, src2
diff --git a/volk/orc/volk_32f_x2_add_32f_a16_orc_impl.orc b/volk/orc/volk_32f_x2_add_32f_a16_orc_impl.orc
new file mode 100644
index 000000000..8d095a052
--- /dev/null
+++ b/volk/orc/volk_32f_x2_add_32f_a16_orc_impl.orc
@@ -0,0 +1,5 @@
+.function volk_32f_x2_add_32f_a16_orc_impl
+.dest 4 dst
+.source 4 src1
+.source 4 src2
+addf dst, src1, src2
diff --git a/volk/orc/volk_32f_x2_divide_32f_a16_orc_impl.orc b/volk/orc/volk_32f_x2_divide_32f_a16_orc_impl.orc
new file mode 100644
index 000000000..0097646cb
--- /dev/null
+++ b/volk/orc/volk_32f_x2_divide_32f_a16_orc_impl.orc
@@ -0,0 +1,5 @@
+.function volk_32f_x2_divide_32f_a16_orc_impl
+.dest 4 dst
+.source 4 src1
+.source 4 src2
+divf dst, src1, src2
diff --git a/volk/orc/volk_32f_x2_max_32f_a16_orc_impl.orc b/volk/orc/volk_32f_x2_max_32f_a16_orc_impl.orc
new file mode 100644
index 000000000..b7f008737
--- /dev/null
+++ b/volk/orc/volk_32f_x2_max_32f_a16_orc_impl.orc
@@ -0,0 +1,5 @@
+.function volk_32f_x2_max_32f_a16_orc_impl
+.dest 4 dst
+.source 4 src1
+.source 4 src2
+maxf dst, src1, src2
diff --git a/volk/orc/volk_32f_x2_min_32f_a16_orc_impl.orc b/volk/orc/volk_32f_x2_min_32f_a16_orc_impl.orc
new file mode 100644
index 000000000..78328b576
--- /dev/null
+++ b/volk/orc/volk_32f_x2_min_32f_a16_orc_impl.orc
@@ -0,0 +1,5 @@
+.function volk_32f_x2_min_32f_a16_orc_impl
+.dest 4 dst
+.source 4 src1
+.source 4 src2
+minf dst, src1, src2
diff --git a/volk/orc/volk_32f_x2_multiply_32f_a16_orc_impl.orc b/volk/orc/volk_32f_x2_multiply_32f_a16_orc_impl.orc
new file mode 100644
index 000000000..e8fadff19
--- /dev/null
+++ b/volk/orc/volk_32f_x2_multiply_32f_a16_orc_impl.orc
@@ -0,0 +1,5 @@
+.function volk_32f_x2_multiply_32f_a16_orc_impl
+.dest 4 dst
+.source 4 src1
+.source 4 src2
+mulf dst, src1, src2
diff --git a/volk/orc/volk_32f_x2_subtract_32f_a16_orc_impl.orc b/volk/orc/volk_32f_x2_subtract_32f_a16_orc_impl.orc
new file mode 100644
index 000000000..13fbe8c83
--- /dev/null
+++ b/volk/orc/volk_32f_x2_subtract_32f_a16_orc_impl.orc
@@ -0,0 +1,5 @@
+.function volk_32f_x2_subtract_32f_a16_orc_impl
+.dest 4 dst
+.source 4 src1
+.source 4 src2
+subf dst, src1, src2
diff --git a/volk/orc/volk_32fc_32fc_multiply_32fc_a16_orc_impl.orc b/volk/orc/volk_32fc_32fc_multiply_32fc_a16_orc_impl.orc
deleted file mode 100644
index ed928b90f..000000000
--- a/volk/orc/volk_32fc_32fc_multiply_32fc_a16_orc_impl.orc
+++ /dev/null
@@ -1,6 +0,0 @@
-.function volk_32fc_32fc_multiply_32fc_a16_orc_impl
-.source 8 src1
-.source 8 src2
-.dest 8 dst
-.temp 8 tmp
-x2 mulf dst, src1, src2
diff --git a/volk/orc/volk_32fc_s32f_magnitude_16i_a16_orc_impl.orc b/volk/orc/volk_32fc_s32f_magnitude_16i_a16_orc_impl.orc
new file mode 100644
index 000000000..9e2599084
--- /dev/null
+++ b/volk/orc/volk_32fc_s32f_magnitude_16i_a16_orc_impl.orc
@@ -0,0 +1,23 @@
+.function volk_32fc_s32f_magnitude_16i_a16_orc_impl
+.source 8 src
+.dest 2 dst
+.floatparam 4 scalar
+.temp 8 iqf
+.temp 8 prodiqf
+.temp 4 qf
+.temp 4 if
+.temp 4 sumf
+.temp 4 rootf
+.temp 4 rootl
+.temp 4 maskl
+
+x2 mulf prodiqf, src, src
+splitql qf, if, prodiqf
+addf sumf, if, qf
+sqrtf rootf, sumf
+mulf rootf, rootf, scalar
+cmpltf maskl, scalar, rootf
+andl maskl, maskl, 0x80000000
+orl rootf, rootf, maskl
+convfl rootl, rootf
+convssslw dst, rootl
diff --git a/volk/orc/volk_32fc_s32f_magnitude_16s_a16_orc_impl.orc b/volk/orc/volk_32fc_s32f_magnitude_16s_a16_orc_impl.orc
deleted file mode 100644
index cccda8a0f..000000000
--- a/volk/orc/volk_32fc_s32f_magnitude_16s_a16_orc_impl.orc
+++ /dev/null
@@ -1,23 +0,0 @@
-.function volk_32fc_s32f_magnitude_16s_a16_orc_impl
-.source 8 src
-.dest 2 dst
-.floatparam 4 scalar
-.temp 8 iqf
-.temp 8 prodiqf
-.temp 4 qf
-.temp 4 if
-.temp 4 sumf
-.temp 4 rootf
-.temp 4 rootl
-.temp 4 maskl
-
-x2 mulf prodiqf, src, src
-splitql qf, if, prodiqf
-addf sumf, if, qf
-sqrtf rootf, sumf
-mulf rootf, rootf, scalar
-cmpltf maskl, scalar, rootf
-andl maskl, maskl, 0x80000000
-orl rootf, rootf, maskl
-convfl rootl, rootf
-convssslw dst, rootl
diff --git a/volk/orc/volk_32fc_x2_multiply_32fc_a16_orc_impl.orc b/volk/orc/volk_32fc_x2_multiply_32fc_a16_orc_impl.orc
new file mode 100644
index 000000000..d23892880
--- /dev/null
+++ b/volk/orc/volk_32fc_x2_multiply_32fc_a16_orc_impl.orc
@@ -0,0 +1,22 @@
+# Complex multiply of interleaved (re, im) float pairs:
+#   (a + jb) * (c + jd) = (ac - bd) + j(bc + ad)
+.function volk_32fc_x2_multiply_32fc_a16_orc_impl
+.source 8 src1
+.source 8 src2
+.dest 8 dst
+.temp 8 iqprod
+.temp 8 swapped
+.temp 4 lo
+.temp 4 hi
+.temp 4 real
+.temp 4 imag
+# lane-wise product gives (a*c, b*d); their difference is the real part
+x2 mulf iqprod, src1, src2
+splitql hi, lo, iqprod
+subf real, lo, hi
+# swap src1 to (b, a) so the lane-wise product gives (b*c, a*d); their sum is the imaginary part
+swaplq swapped, src1
+x2 mulf iqprod, swapped, src2
+splitql hi, lo, iqprod
+addf imag, lo, hi
+mergelq dst, real, imag
diff --git a/volk/orc/volk_32i_x2_and_32i_a16_orc_impl.orc b/volk/orc/volk_32i_x2_and_32i_a16_orc_impl.orc
new file mode 100644
index 000000000..7b331f8ed
--- /dev/null
+++ b/volk/orc/volk_32i_x2_and_32i_a16_orc_impl.orc
@@ -0,0 +1,5 @@
+.function volk_32i_x2_and_32i_a16_orc_impl
+.dest 4 dst
+.source 4 src1
+.source 4 src2
+andl dst, src1, src2
diff --git a/volk/orc/volk_32i_x2_or_32i_a16_orc_impl.orc b/volk/orc/volk_32i_x2_or_32i_a16_orc_impl.orc
new file mode 100644
index 000000000..4984a9ced
--- /dev/null
+++ b/volk/orc/volk_32i_x2_or_32i_a16_orc_impl.orc
@@ -0,0 +1,5 @@
+.function volk_32i_x2_or_32i_a16_orc_impl
+.dest 4 dst
+.source 4 src1
+.source 4 src2
+orl dst, src1, src2
diff --git a/volk/orc/volk_32s_32s_and_32s_a16_orc_impl.orc b/volk/orc/volk_32s_32s_and_32s_a16_orc_impl.orc
deleted file mode 100644
index bff3af875..000000000
--- a/volk/orc/volk_32s_32s_and_32s_a16_orc_impl.orc
+++ /dev/null
@@ -1,5 +0,0 @@
-.function volk_32s_32s_and_32s_a16_orc_impl
-.dest 4 dst
-.source 4 src1
-.source 4 src2
-andl dst, src1, src2
diff --git a/volk/orc/volk_32s_32s_or_32s_a16_orc_impl.orc b/volk/orc/volk_32s_32s_or_32s_a16_orc_impl.orc
deleted file mode 100644
index b6961f79e..000000000
--- a/volk/orc/volk_32s_32s_or_32s_a16_orc_impl.orc
+++ /dev/null
@@ -1,5 +0,0 @@
-.function volk_32s_32s_or_32s_a16_orc_impl
-.dest 4 dst
-.source 4 src1
-.source 4 src2
-orl dst, src1, src2
diff --git a/volk/orc/volk_8i_convert_16i_a16_orc_impl.orc b/volk/orc/volk_8i_convert_16i_a16_orc_impl.orc
new file mode 100644
index 000000000..f44845c88
--- /dev/null
+++ b/volk/orc/volk_8i_convert_16i_a16_orc_impl.orc
@@ -0,0 +1,5 @@
+.function volk_8i_convert_16i_a16_orc_impl
+.source 1 src
+.dest 2 dst
+convsbw dst, src
+shlw dst, dst, 8
diff --git a/volk/orc/volk_8i_s32f_convert_32f_a16_orc_impl.orc b/volk/orc/volk_8i_s32f_convert_32f_a16_orc_impl.orc
new file mode 100644
index 000000000..4e33f7b3b
--- /dev/null
+++ b/volk/orc/volk_8i_s32f_convert_32f_a16_orc_impl.orc
@@ -0,0 +1,14 @@
+# Converts signed 8-bit samples to float and multiplies each by `scalar`.
+# The source is declared 1 byte wide so one input element is consumed per
+# output; it is widened byte -> word -> long before the int-to-float step.
+.function volk_8i_s32f_convert_32f_a16_orc_impl
+.source 1 src
+.dest 4 dst
+.floatparam 4 scalar
+.temp 4 flsrc
+.temp 4 lsrc
+.temp 2 wsrc
+convsbw wsrc, src
+convswl lsrc, wsrc
+convlf flsrc, lsrc
+mulf dst, flsrc, scalar
diff --git a/volk/orc/volk_8s_convert_16s_a16_orc_impl.orc b/volk/orc/volk_8s_convert_16s_a16_orc_impl.orc
deleted file mode 100644
index a55c7f723..000000000
--- a/volk/orc/volk_8s_convert_16s_a16_orc_impl.orc
+++ /dev/null
@@ -1,5 +0,0 @@
-.function volk_8s_convert_16s_a16_orc_impl
-.source 1 src
-.dest 2 dst
-convsbw dst, src
-shlw dst, dst, 8
diff --git a/volk/orc/volk_8s_s32f_convert_32f_a16_orc_impl.orc b/volk/orc/volk_8s_s32f_convert_32f_a16_orc_impl.orc
deleted file mode 100644
index 3274ab9d6..000000000
--- a/volk/orc/volk_8s_s32f_convert_32f_a16_orc_impl.orc
+++ /dev/null
@@ -1,9 +0,0 @@
-.function volk_8s_s32f_convert_32f_a16_orc_impl
-.source 2 src
-.dest 4 dst
-.floatparam 4 scalar
-.temp 4 flsrc
-.temp 4 lsrc
-convswl lsrc, src
-convlf flsrc, lsrc
-mulf dst, flsrc, scalar
-- 
cgit