346 files changed, 25748 insertions, 0 deletions
diff --git a/volk/AUTHORS b/volk/AUTHORS
new file mode 100644
index 000000000..e69de29bb
--- /dev/null
+++ b/volk/AUTHORS
diff --git a/volk/COPYING b/volk/COPYING
new file mode 100644
index 000000000..94a9ed024
--- /dev/null
+++ b/volk/COPYING
@@ -0,0 +1,674 @@
+                    GNU GENERAL PUBLIC LICENSE
+                       Version 3, 29 June 2007
+
+ Copyright (C) 2007 Free Software Foundation, Inc. <http://fsf.org/>
+ Everyone is permitted to copy and distribute verbatim copies
+ of this license document, but changing it is not allowed.
+
+                            Preamble
+
+  The GNU General Public License is a free, copyleft license for
+software and other kinds of works.
+
+  The licenses for most software and other practical works are designed
+to take away your freedom to share and change the works.  By contrast,
+the GNU General Public License is intended to guarantee your freedom to
+share and change all versions of a program--to make sure it remains free
+software for all its users.  We, the Free Software Foundation, use the
+GNU General Public License for most of our software; it applies also to
+any other work released this way by its authors.  You can apply it to
+your programs, too.
+
+  When we speak of free software, we are referring to freedom, not
+price.  Our General Public Licenses are designed to make sure that you
+have the freedom to distribute copies of free software (and charge for
+them if you wish), that you receive source code or can get it if you
+want it, that you can change the software or use pieces of it in new
+free programs, and that you know you can do these things.
+
+  To protect your rights, we need to prevent others from denying you
+these rights or asking you to surrender the rights.  Therefore, you have
+certain responsibilities if you distribute copies of the software, or if
+you modify it: responsibilities to respect the freedom of others.
+
+  For example, if you distribute copies of such a program, whether
+gratis or for a fee, you must pass on to the recipients the same
+freedoms that you received.  You must make sure that they, too, receive
+or can get the source code.  And you must show them these terms so they
+know their rights.
+
+  Developers that use the GNU GPL protect your rights with two steps:
+(1) assert copyright on the software, and (2) offer you this License
+giving you legal permission to copy, distribute and/or modify it.
+
+  For the developers' and authors' protection, the GPL clearly explains
+that there is no warranty for this free software.  For both users' and
+authors' sake, the GPL requires that modified versions be marked as
+changed, so that their problems will not be attributed erroneously to
+authors of previous versions.
+
+  Some devices are designed to deny users access to install or run
+modified versions of the software inside them, although the manufacturer
+can do so.  This is fundamentally incompatible with the aim of
+protecting users' freedom to change the software.  The systematic
+pattern of such abuse occurs in the area of products for individuals to
+use, which is precisely where it is most unacceptable.  Therefore, we
+have designed this version of the GPL to prohibit the practice for those
+products.  If such problems arise substantially in other domains, we
+stand ready to extend this provision to those domains in future versions
+of the GPL, as needed to protect the freedom of users.
+
+  Finally, every program is threatened constantly by software patents.
+States should not allow patents to restrict development and use of
+software on general-purpose computers, but in those that do, we wish to
+avoid the special danger that patents applied to a free program could
+make it effectively proprietary.  To prevent this, the GPL assures that
+patents cannot be used to render the program non-free.
+
+  The precise terms and conditions for copying, distribution and
+modification follow.
+
+                       TERMS AND CONDITIONS
+
+  0. Definitions.
+
+  "This License" refers to version 3 of the GNU General Public License.
+
+  "Copyright" also means copyright-like laws that apply to other kinds of
+works, such as semiconductor masks.
+
+  "The Program" refers to any copyrightable work licensed under this
+License.  Each licensee is addressed as "you".  "Licensees" and
+"recipients" may be individuals or organizations.
+
+  To "modify" a work means to copy from or adapt all or part of the work
+in a fashion requiring copyright permission, other than the making of an
+exact copy.  The resulting work is called a "modified version" of the
+earlier work or a work "based on" the earlier work.
+
+  A "covered work" means either the unmodified Program or a work based
+on the Program.
+
+  To "propagate" a work means to do anything with it that, without
+permission, would make you directly or secondarily liable for
+infringement under applicable copyright law, except executing it on a
+computer or modifying a private copy.  Propagation includes copying,
+distribution (with or without modification), making available to the
+public, and in some countries other activities as well.
+
+  To "convey" a work means any kind of propagation that enables other
+parties to make or receive copies.  Mere interaction with a user through
+a computer network, with no transfer of a copy, is not conveying.
+
+  An interactive user interface displays "Appropriate Legal Notices"
+to the extent that it includes a convenient and prominently visible
+feature that (1) displays an appropriate copyright notice, and (2)
+tells the user that there is no warranty for the work (except to the
+extent that warranties are provided), that licensees may convey the
+work under this License, and how to view a copy of this License.  If
+the interface presents a list of user commands or options, such as a
+menu, a prominent item in the list meets this criterion.
+
+  1. Source Code.
+
+  The "source code" for a work means the preferred form of the work
+for making modifications to it.  "Object code" means any non-source
+form of a work.
+
+  A "Standard Interface" means an interface that either is an official
+standard defined by a recognized standards body, or, in the case of
+interfaces specified for a particular programming language, one that
+is widely used among developers working in that language.
+
+  The "System Libraries" of an executable work include anything, other
+than the work as a whole, that (a) is included in the normal form of
+packaging a Major Component, but which is not part of that Major
+Component, and (b) serves only to enable use of the work with that
+Major Component, or to implement a Standard Interface for which an
+implementation is available to the public in source code form.  A
+"Major Component", in this context, means a major essential component
+(kernel, window system, and so on) of the specific operating system
+(if any) on which the executable work runs, or a compiler used to
+produce the work, or an object code interpreter used to run it.
+
+  The "Corresponding Source" for a work in object code form means all
+the source code needed to generate, install, and (for an executable
+work) run the object code and to modify the work, including scripts to
+control those activities.  However, it does not include the work's
+System Libraries, or general-purpose tools or generally available free
+programs which are used unmodified in performing those activities but
+which are not part of the work.  For example, Corresponding Source
+includes interface definition files associated with source files for
+the work, and the source code for shared libraries and dynamically
+linked subprograms that the work is specifically designed to require,
+such as by intimate data communication or control flow between those
+subprograms and other parts of the work.
+
+  The Corresponding Source need not include anything that users
+can regenerate automatically from other parts of the Corresponding
+Source.
+
+  The Corresponding Source for a work in source code form is that
+same work.
+
+  2. Basic Permissions.
+
+  All rights granted under this License are granted for the term of
+copyright on the Program, and are irrevocable provided the stated
+conditions are met.  This License explicitly affirms your unlimited
+permission to run the unmodified Program.  The output from running a
+covered work is covered by this License only if the output, given its
+content, constitutes a covered work.  This License acknowledges your
+rights of fair use or other equivalent, as provided by copyright law.
+
+  You may make, run and propagate covered works that you do not
+convey, without conditions so long as your license otherwise remains
+in force.  You may convey covered works to others for the sole purpose
+of having them make modifications exclusively for you, or provide you
+with facilities for running those works, provided that you comply with
+the terms of this License in conveying all material for which you do
+not control copyright.  Those thus making or running the covered works
+for you must do so exclusively on your behalf, under your direction
+and control, on terms that prohibit them from making any copies of
+your copyrighted material outside their relationship with you.
+
+  Conveying under any other circumstances is permitted solely under
+the conditions stated below.  Sublicensing is not allowed; section 10
+makes it unnecessary.
+
+  3. Protecting Users' Legal Rights From Anti-Circumvention Law.
+
+  No covered work shall be deemed part of an effective technological
+measure under any applicable law fulfilling obligations under article
+11 of the WIPO copyright treaty adopted on 20 December 1996, or
+similar laws prohibiting or restricting circumvention of such
+measures.
+
+  When you convey a covered work, you waive any legal power to forbid
+circumvention of technological measures to the extent such circumvention
+is effected by exercising rights under this License with respect to
+the covered work, and you disclaim any intention to limit operation or
+modification of the work as a means of enforcing, against the work's
+users, your or third parties' legal rights to forbid circumvention of
+technological measures.
+
+  4. Conveying Verbatim Copies.
+
+  You may convey verbatim copies of the Program's source code as you
+receive it, in any medium, provided that you conspicuously and
+appropriately publish on each copy an appropriate copyright notice;
+keep intact all notices stating that this License and any
+non-permissive terms added in accord with section 7 apply to the code;
+keep intact all notices of the absence of any warranty; and give all
+recipients a copy of this License along with the Program.
+
+  You may charge any price or no price for each copy that you convey,
+and you may offer support or warranty protection for a fee.
+
+  5. Conveying Modified Source Versions.
+
+  You may convey a work based on the Program, or the modifications to
+produce it from the Program, in the form of source code under the
+terms of section 4, provided that you also meet all of these conditions:
+
+    a) The work must carry prominent notices stating that you modified
+    it, and giving a relevant date.
+
+    b) The work must carry prominent notices stating that it is
+    released under this License and any conditions added under section
+    7.  This requirement modifies the requirement in section 4 to
+    "keep intact all notices".
+
+    c) You must license the entire work, as a whole, under this
+    License to anyone who comes into possession of a copy.  This
+    License will therefore apply, along with any applicable section 7
+    additional terms, to the whole of the work, and all its parts,
+    regardless of how they are packaged.  This License gives no
+    permission to license the work in any other way, but it does not
+    invalidate such permission if you have separately received it.
+
+    d) If the work has interactive user interfaces, each must display
+    Appropriate Legal Notices; however, if the Program has interactive
+    interfaces that do not display Appropriate Legal Notices, your
+    work need not make them do so.
+
+  A compilation of a covered work with other separate and independent
+works, which are not by their nature extensions of the covered work,
+and which are not combined with it such as to form a larger program,
+in or on a volume of a storage or distribution medium, is called an
+"aggregate" if the compilation and its resulting copyright are not
+used to limit the access or legal rights of the compilation's users
+beyond what the individual works permit.  Inclusion of a covered work
+in an aggregate does not cause this License to apply to the other
+parts of the aggregate.
+
+  6. Conveying Non-Source Forms.
+
+  You may convey a covered work in object code form under the terms
+of sections 4 and 5, provided that you also convey the
+machine-readable Corresponding Source under the terms of this License,
+in one of these ways:
+
+    a) Convey the object code in, or embodied in, a physical product
+    (including a physical distribution medium), accompanied by the
+    Corresponding Source fixed on a durable physical medium
+    customarily used for software interchange.
+
+    b) Convey the object code in, or embodied in, a physical product
+    (including a physical distribution medium), accompanied by a
+    written offer, valid for at least three years and valid for as
+    long as you offer spare parts or customer support for that product
+    model, to give anyone who possesses the object code either (1) a
+    copy of the Corresponding Source for all the software in the
+    product that is covered by this License, on a durable physical
+    medium customarily used for software interchange, for a price no
+    more than your reasonable cost of physically performing this
+    conveying of source, or (2) access to copy the
+    Corresponding Source from a network server at no charge.
+
+    c) Convey individual copies of the object code with a copy of the
+    written offer to provide the Corresponding Source.  This
+    alternative is allowed only occasionally and noncommercially, and
+    only if you received the object code with such an offer, in accord
+    with subsection 6b.
+
+    d) Convey the object code by offering access from a designated
+    place (gratis or for a charge), and offer equivalent access to the
+    Corresponding Source in the same way through the same place at no
+    further charge.  You need not require recipients to copy the
+    Corresponding Source along with the object code.  If the place to
+    copy the object code is a network server, the Corresponding Source
+    may be on a different server (operated by you or a third party)
+    that supports equivalent copying facilities, provided you maintain
+    clear directions next to the object code saying where to find the
+    Corresponding Source.  Regardless of what server hosts the
+    Corresponding Source, you remain obligated to ensure that it is
+    available for as long as needed to satisfy these requirements.
+
+    e) Convey the object code using peer-to-peer transmission, provided
+    you inform other peers where the object code and Corresponding
+    Source of the work are being offered to the general public at no
+    charge under subsection 6d.
+
+  A separable portion of the object code, whose source code is excluded
+from the Corresponding Source as a System Library, need not be
+included in conveying the object code work.
+
+  A "User Product" is either (1) a "consumer product", which means any
+tangible personal property which is normally used for personal, family,
+or household purposes, or (2) anything designed or sold for incorporation
+into a dwelling.  In determining whether a product is a consumer product,
+doubtful cases shall be resolved in favor of coverage.  For a particular
+product received by a particular user, "normally used" refers to a
+typical or common use of that class of product, regardless of the status
+of the particular user or of the way in which the particular user
+actually uses, or expects or is expected to use, the product.  A product
+is a consumer product regardless of whether the product has substantial
+commercial, industrial or non-consumer uses, unless such uses represent
+the only significant mode of use of the product.
+
+  "Installation Information" for a User Product means any methods,
+procedures, authorization keys, or other information required to install
+and execute modified versions of a covered work in that User Product from
+a modified version of its Corresponding Source.  The information must
+suffice to ensure that the continued functioning of the modified object
+code is in no case prevented or interfered with solely because
+modification has been made.
+
+  If you convey an object code work under this section in, or with, or
+specifically for use in, a User Product, and the conveying occurs as
+part of a transaction in which the right of possession and use of the
+User Product is transferred to the recipient in perpetuity or for a
+fixed term (regardless of how the transaction is characterized), the
+Corresponding Source conveyed under this section must be accompanied
+by the Installation Information.  But this requirement does not apply
+if neither you nor any third party retains the ability to install
+modified object code on the User Product (for example, the work has
+been installed in ROM).
+
+  The requirement to provide Installation Information does not include a
+requirement to continue to provide support service, warranty, or updates
+for a work that has been modified or installed by the recipient, or for
+the User Product in which it has been modified or installed.  Access to a
+network may be denied when the modification itself materially and
+adversely affects the operation of the network or violates the rules and
+protocols for communication across the network.
+
+  Corresponding Source conveyed, and Installation Information provided,
+in accord with this section must be in a format that is publicly
+documented (and with an implementation available to the public in
+source code form), and must require no special password or key for
+unpacking, reading or copying.
+
+  7. Additional Terms.
+
+  "Additional permissions" are terms that supplement the terms of this
+License by making exceptions from one or more of its conditions.
+Additional permissions that are applicable to the entire Program shall
+be treated as though they were included in this License, to the extent
+that they are valid under applicable law.  If additional permissions
+apply only to part of the Program, that part may be used separately
+under those permissions, but the entire Program remains governed by
+this License without regard to the additional permissions.
+
+  When you convey a copy of a covered work, you may at your option
+remove any additional permissions from that copy, or from any part of
+it.  (Additional permissions may be written to require their own
+removal in certain cases when you modify the work.)  You may place
+additional permissions on material, added by you to a covered work,
+for which you have or can give appropriate copyright permission.
+
+  Notwithstanding any other provision of this License, for material you
+add to a covered work, you may (if authorized by the copyright holders of
+that material) supplement the terms of this License with terms:
+
+    a) Disclaiming warranty or limiting liability differently from the
+    terms of sections 15 and 16 of this License; or
+
+    b) Requiring preservation of specified reasonable legal notices or
+    author attributions in that material or in the Appropriate Legal
+    Notices displayed by works containing it; or
+
+    c) Prohibiting misrepresentation of the origin of that material, or
+    requiring that modified versions of such material be marked in
+    reasonable ways as different from the original version; or
+
+    d) Limiting the use for publicity purposes of names of licensors or
+    authors of the material; or
+
+    e) Declining to grant rights under trademark law for use of some
+    trade names, trademarks, or service marks; or
+
+    f) Requiring indemnification of licensors and authors of that
+    material by anyone who conveys the material (or modified versions of
+    it) with contractual assumptions of liability to the recipient, for
+    any liability that these contractual assumptions directly impose on
+    those licensors and authors.
+
+  All other non-permissive additional terms are considered "further
+restrictions" within the meaning of section 10.  If the Program as you
+received it, or any part of it, contains a notice stating that it is
+governed by this License along with a term that is a further
+restriction, you may remove that term.  If a license document contains
+a further restriction but permits relicensing or conveying under this
+License, you may add to a covered work material governed by the terms
+of that license document, provided that the further restriction does
+not survive such relicensing or conveying.
+
+  If you add terms to a covered work in accord with this section, you
+must place, in the relevant source files, a statement of the
+additional terms that apply to those files, or a notice indicating
+where to find the applicable terms.
+
+  Additional terms, permissive or non-permissive, may be stated in the
+form of a separately written license, or stated as exceptions;
+the above requirements apply either way.
+
+  8. Termination.
+
+  You may not propagate or modify a covered work except as expressly
+provided under this License.  Any attempt otherwise to propagate or
+modify it is void, and will automatically terminate your rights under
+this License (including any patent licenses granted under the third
+paragraph of section 11).
+
+  However, if you cease all violation of this License, then your
+license from a particular copyright holder is reinstated (a)
+provisionally, unless and until the copyright holder explicitly and
+finally terminates your license, and (b) permanently, if the copyright
+holder fails to notify you of the violation by some reasonable means
+prior to 60 days after the cessation.
+
+  Moreover, your license from a particular copyright holder is
+reinstated permanently if the copyright holder notifies you of the
+violation by some reasonable means, this is the first time you have
+received notice of violation of this License (for any work) from that
+copyright holder, and you cure the violation prior to 30 days after
+your receipt of the notice.
+
+  Termination of your rights under this section does not terminate the
+licenses of parties who have received copies or rights from you under
+this License.  If your rights have been terminated and not permanently
+reinstated, you do not qualify to receive new licenses for the same
+material under section 10.
+
+  9. Acceptance Not Required for Having Copies.
+
+  You are not required to accept this License in order to receive or
+run a copy of the Program.  Ancillary propagation of a covered work
+occurring solely as a consequence of using peer-to-peer transmission
+to receive a copy likewise does not require acceptance.  However,
+nothing other than this License grants you permission to propagate or
+modify any covered work.  These actions infringe copyright if you do
+not accept this License.  Therefore, by modifying or propagating a
+covered work, you indicate your acceptance of this License to do so.
+
+  10. Automatic Licensing of Downstream Recipients.
+
+  Each time you convey a covered work, the recipient automatically
+receives a license from the original licensors, to run, modify and
+propagate that work, subject to this License.  You are not responsible
+for enforcing compliance by third parties with this License.
+
+  An "entity transaction" is a transaction transferring control of an
+organization, or substantially all assets of one, or subdividing an
+organization, or merging organizations.  If propagation of a covered
+work results from an entity transaction, each party to that
+transaction who receives a copy of the work also receives whatever
+licenses to the work the party's predecessor in interest had or could
+give under the previous paragraph, plus a right to possession of the
+Corresponding Source of the work from the predecessor in interest, if
+the predecessor has it or can get it with reasonable efforts.
+
+  You may not impose any further restrictions on the exercise of the
+rights granted or affirmed under this License.  For example, you may
+not impose a license fee, royalty, or other charge for exercise of
+rights granted under this License, and you may not initiate litigation
+(including a cross-claim or counterclaim in a lawsuit) alleging that
+any patent claim is infringed by making, using, selling, offering for
+sale, or importing the Program or any portion of it.
+
+  11. Patents.
+
+  A "contributor" is a copyright holder who authorizes use under this
+License of the Program or a work on which the Program is based.  The
+work thus licensed is called the contributor's "contributor version".
+
+  A contributor's "essential patent claims" are all patent claims
+owned or controlled by the contributor, whether already acquired or
+hereafter acquired, that would be infringed by some manner, permitted
+by this License, of making, using, or selling its contributor version,
+but do not include claims that would be infringed only as a
+consequence of further modification of the contributor version.  For
+purposes of this definition, "control" includes the right to grant
+patent sublicenses in a manner consistent with the requirements of
+this License.
+
+  Each contributor grants you a non-exclusive, worldwide, royalty-free
+patent license under the contributor's essential patent claims, to
+make, use, sell, offer for sale, import and otherwise run, modify and
+propagate the contents of its contributor version.
+
+  In the following three paragraphs, a "patent license" is any express
+agreement or commitment, however denominated, not to enforce a patent
+(such as an express permission to practice a patent or covenant not to
+sue for patent infringement).  To "grant" such a patent license to a
+party means to make such an agreement or commitment not to enforce a
+patent against the party.
+
+  If you convey a covered work, knowingly relying on a patent license,
+and the Corresponding Source of the work is not available for anyone
+to copy, free of charge and under the terms of this License, through a
+publicly available network server or other readily accessible means,
+then you must either (1) cause the Corresponding Source to be so
+available, or (2) arrange to deprive yourself of the benefit of the
+patent license for this particular work, or (3) arrange, in a manner
+consistent with the requirements of this License, to extend the patent
+license to downstream recipients.  "Knowingly relying" means you have
+actual knowledge that, but for the patent license, your conveying the
+covered work in a country, or your recipient's use of the covered work
+in a country, would infringe one or more identifiable patents in that
+country that you have reason to believe are valid.
+
+  If, pursuant to or in connection with a single transaction or
+arrangement, you convey, or propagate by procuring conveyance of, a
+covered work, and grant a patent license to some of the parties
+receiving the covered work authorizing them to use, propagate, modify
+or convey a specific copy of the covered work, then the patent license
+you grant is automatically extended to all recipients of the covered
+work and works based on it.
+
+  A patent license is "discriminatory" if it does not include within
+the scope of its coverage, prohibits the exercise of, or is
+conditioned on the non-exercise of one or more of the rights that are
+specifically granted under this License.  You may not convey a covered
+work if you are a party to an arrangement with a third party that is
+in the business of distributing software, under which you make payment
+to the third party based on the extent of your activity of conveying
+the work, and under which the third party grants, to any of the
+parties who would receive the covered work from you, a discriminatory
+patent license (a) in connection with copies of the covered work
+conveyed by you (or copies made from those copies), or (b) primarily
+for and in connection with specific products or compilations that
+contain the covered work, unless you entered into that arrangement,
+or that patent license was granted, prior to 28 March 2007.
+
+  Nothing in this License shall be construed as excluding or limiting
+any implied license or other defenses to infringement that may
+otherwise be available to you under applicable patent law.
+
+  12. No Surrender of Others' Freedom.
+
+  If conditions are imposed on you (whether by court order, agreement or
+otherwise) that contradict the conditions of this License, they do not
+excuse you from the conditions of this License.  If you cannot convey a
+covered work so as to satisfy simultaneously your obligations under this
+License and any other pertinent obligations, then as a consequence you may
+not convey it at all.  For example, if you agree to terms that obligate you
+to collect a royalty for further conveying from those to whom you convey
+the Program, the only way you could satisfy both those terms and this
+License would be to refrain entirely from conveying the Program.
+
+  13. Use with the GNU Affero General Public License.
+
+  Notwithstanding any other provision of this License, you have
+permission to link or combine any covered work with a work licensed
+under version 3 of the GNU Affero General Public License into a single
+combined work, and to convey the resulting work.  The terms of this
+License will continue to apply to the part which is the covered work,
+but the special requirements of the GNU Affero General Public License,
+section 13, concerning interaction through a network will apply to the
+combination as such.
+
+  14. Revised Versions of this License.
+
+  The Free Software Foundation may publish revised and/or new versions of
+the GNU General Public License from time to time.  Such new versions will
+be similar in spirit to the present version, but may differ in detail to
+address new problems or concerns.
+
+  Each version is given a distinguishing version number.  If the
+Program specifies that a certain numbered version of the GNU General
+Public License "or any later version" applies to it, you have the
+option of following the terms and conditions either of that numbered
+version or of any later version published by the Free Software
+Foundation.  If the Program does not specify a version number of the
+GNU General Public License, you may choose any version ever published
+by the Free Software Foundation.
+
+  If the Program specifies that a proxy can decide which future
+versions of the GNU General Public License can be used, that proxy's
+public statement of acceptance of a version permanently authorizes you
+to choose that version for the Program.
+
+  Later license versions may give you additional or different
+permissions.  However, no additional obligations are imposed on any
+author or copyright holder as a result of your choosing to follow a
+later version.
+
+  15. Disclaimer of Warranty.
+
+  THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY
+APPLICABLE LAW.  EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT
+HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY
+OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO,
+THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+PURPOSE.  THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM
+IS WITH YOU.  SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF
+ALL NECESSARY SERVICING, REPAIR OR CORRECTION.
+
+  16. Limitation of Liability.
+
+  IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING
+WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MODIFIES AND/OR CONVEYS
+THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY
+GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE
+USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF
+DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD
+PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS),
+EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF
+SUCH DAMAGES.
+
+  17. Interpretation of Sections 15 and 16.
+
+  If the disclaimer of warranty and limitation of liability provided
+above cannot be given local legal effect according to their terms,
+reviewing courts shall apply local law that most closely approximates
+an absolute waiver of all civil liability in connection with the
+Program, unless a warranty or assumption of liability accompanies a
+copy of the Program in return for a fee.
+
+                     END OF TERMS AND CONDITIONS
+
+            How to Apply These Terms to Your New Programs
+
+  If you develop a new program, and you want it to be of the greatest
+possible use to the public, the best way to achieve this is to make it
+free software which everyone can redistribute and change under these terms.
+
+  To do so, attach the following notices to the program.  It is safest
+to attach them to the start of each source file to most effectively
+state the exclusion of warranty; and each file should have at least
+the "copyright" line and a pointer to where the full notice is found.
+
+    <one line to give the program's name and a brief idea of what it does.>
+    Copyright (C) <year>  <name of author>
+
+    This program is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation, either version 3 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with this program.  If not, see <http://www.gnu.org/licenses/>.
+
+Also add information on how to contact you by electronic and paper mail.
+
+  If the program does terminal interaction, make it output a short
+notice like this when it starts in an interactive mode:
+
+    <program>  Copyright (C) <year>  <name of author>
+    This program comes with ABSOLUTELY NO WARRANTY; for details type `show w'.
+    This is free software, and you are welcome to redistribute it
+    under certain conditions; type `show c' for details.
+
+The hypothetical commands `show w' and `show c' should show the appropriate
+parts of the General Public License.  Of course, your program's commands
+might be different; for a GUI interface, you would use an "about box".
+
+  You should also get your employer (if you work as a programmer) or school,
+if any, to sign a "copyright disclaimer" for the program, if necessary.
+For more information on this, and how to apply and follow the GNU GPL, see
+<http://www.gnu.org/licenses/>.
+
+  The GNU General Public License does not permit incorporating your program
+into proprietary programs.  If your program is a subroutine library, you
+may consider it more useful to permit linking proprietary applications with
+the library.  If this is what you want to do, use the GNU Lesser General
+Public License instead of this License.  But first, please read
+<http://www.gnu.org/philosophy/why-not-lgpl.html>.
diff --git a/volk/ChangeLog b/volk/ChangeLog
new file mode 100644
index 000000000..e69de29bb
--- /dev/null
+++ b/volk/ChangeLog
diff --git a/volk/INSTALL b/volk/INSTALL
new file mode 100644
index 000000000..23e5f25d0
--- /dev/null
+++ b/volk/INSTALL
@@ -0,0 +1,236 @@
+Installation Instructions
+*************************
+
+Copyright (C) 1994, 1995, 1996, 1999, 2000, 2001, 2002, 2004, 2005 Free
+Software Foundation, Inc.
+
+This file is free documentation; the Free Software Foundation gives
+unlimited permission to copy, distribute and modify it.
+
+Basic Installation
+==================
+
+These are generic installation instructions.
+
+   The `configure' shell script attempts to guess correct values for
+various system-dependent variables used during compilation.  It uses
+those values to create a `Makefile' in each directory of the package.
+It may also create one or more `.h' files containing system-dependent
+definitions.  Finally, it creates a shell script `config.status' that
+you can run in the future to recreate the current configuration, and a
+file `config.log' containing compiler output (useful mainly for
+debugging `configure').
+
+   It can also use an optional file (typically called `config.cache'
+and enabled with `--cache-file=config.cache' or simply `-C') that saves
+the results of its tests to speed up reconfiguring.  (Caching is
+disabled by default to prevent problems with accidental use of stale
+cache files.)
+
+   If you need to do unusual things to compile the package, please try
+to figure out how `configure' could check whether to do them, and mail
+diffs or instructions to the address given in the `README' so they can
+be considered for the next release.  If you are using the cache, and at
+some point `config.cache' contains results you don't want to keep, you
+may remove or edit it.
+
+   The file `configure.ac' (or `configure.in') is used to create
+`configure' by a program called `autoconf'.  You only need
+`configure.ac' if you want to change it or regenerate `configure' using
+a newer version of `autoconf'.
+
+The simplest way to compile this package is:
+
+  1. `cd' to the directory containing the package's source code and type
+     `./configure' to configure the package for your system.  If you're
+     using `csh' on an old version of System V, you might need to type
+     `sh ./configure' instead to prevent `csh' from trying to execute
+     `configure' itself.
+
+     Running `configure' takes awhile.  While running, it prints some
+     messages telling which features it is checking for.
+
+  2. Type `make' to compile the package.
+
+  3. Optionally, type `make check' to run any self-tests that come with
+     the package.
+
+  4. Type `make install' to install the programs and any data files and
+     documentation.
+
+  5. You can remove the program binaries and object files from the
+     source code directory by typing `make clean'.  To also remove the
+     files that `configure' created (so you can compile the package for
+     a different kind of computer), type `make distclean'.  There is
+     also a `make maintainer-clean' target, but that is intended mainly
+     for the package's developers.  If you use it, you may have to get
+     all sorts of other programs in order to regenerate files that came
+     with the distribution.
+
+Compilers and Options
+=====================
+
+Some systems require unusual options for compilation or linking that the
+`configure' script does not know about.  Run `./configure --help' for
+details on some of the pertinent environment variables.
+
+   You can give `configure' initial values for configuration parameters
+by setting variables in the command line or in the environment.  Here
+is an example:
+
+     ./configure CC=c89 CFLAGS=-O2 LIBS=-lposix
+
+   *Note Defining Variables::, for more details.
+
+Compiling For Multiple Architectures
+====================================
+
+You can compile the package for more than one kind of computer at the
+same time, by placing the object files for each architecture in their
+own directory.  To do this, you must use a version of `make' that
+supports the `VPATH' variable, such as GNU `make'.  `cd' to the
+directory where you want the object files and executables to go and run
+the `configure' script.  `configure' automatically checks for the
+source code in the directory that `configure' is in and in `..'.
+
+   If you have to use a `make' that does not support the `VPATH'
+variable, you have to compile the package for one architecture at a
+time in the source code directory.  After you have installed the
+package for one architecture, use `make distclean' before reconfiguring
+for another architecture.
+
+Installation Names
+==================
+
+By default, `make install' installs the package's commands under
+`/usr/local/bin', include files under `/usr/local/include', etc.  You
+can specify an installation prefix other than `/usr/local' by giving
+`configure' the option `--prefix=PREFIX'.
+
+   You can specify separate installation prefixes for
+architecture-specific files and architecture-independent files.  If you
+pass the option `--exec-prefix=PREFIX' to `configure', the package uses
+PREFIX as the prefix for installing programs and libraries.
+Documentation and other data files still use the regular prefix.
+
+   In addition, if you use an unusual directory layout you can give
+options like `--bindir=DIR' to specify different values for particular
+kinds of files.  Run `configure --help' for a list of the directories
+you can set and what kinds of files go in them.
+
+   If the package supports it, you can cause programs to be installed
+with an extra prefix or suffix on their names by giving `configure' the
+option `--program-prefix=PREFIX' or `--program-suffix=SUFFIX'.
+
+Optional Features
+=================
+
+Some packages pay attention to `--enable-FEATURE' options to
+`configure', where FEATURE indicates an optional part of the package.
+They may also pay attention to `--with-PACKAGE' options, where PACKAGE
+is something like `gnu-as' or `x' (for the X Window System).  The
+`README' should mention any `--enable-' and `--with-' options that the
+package recognizes.
+
+   For packages that use the X Window System, `configure' can usually
+find the X include and library files automatically, but if it doesn't,
+you can use the `configure' options `--x-includes=DIR' and
+`--x-libraries=DIR' to specify their locations.
+
+Specifying the System Type
+==========================
+
+There may be some features `configure' cannot figure out automatically,
+but needs to determine by the type of machine the package will run on.
+Usually, assuming the package is built to be run on the _same_
+architectures, `configure' can figure that out, but if it prints a
+message saying it cannot guess the machine type, give it the
+`--build=TYPE' option.  TYPE can either be a short name for the system
+type, such as `sun4', or a canonical name which has the form:
+
+     CPU-COMPANY-SYSTEM
+
+where SYSTEM can have one of these forms:
+
+     OS KERNEL-OS
+
+   See the file `config.sub' for the possible values of each field.  If
+`config.sub' isn't included in this package, then this package doesn't
+need to know the machine type.
+
+   If you are _building_ compiler tools for cross-compiling, you should
+use the option `--target=TYPE' to select the type of system they will
+produce code for.
+
+   If you want to _use_ a cross compiler, that generates code for a
+platform different from the build platform, you should specify the
+"host" platform (i.e., that on which the generated programs will
+eventually be run) with `--host=TYPE'.
+
+Sharing Defaults
+================
+
+If you want to set default values for `configure' scripts to share, you
+can create a site shell script called `config.site' that gives default
+values for variables like `CC', `cache_file', and `prefix'.
+`configure' looks for `PREFIX/share/config.site' if it exists, then
+`PREFIX/etc/config.site' if it exists.  Or, you can set the
+`CONFIG_SITE' environment variable to the location of the site script.
+A warning: not all `configure' scripts look for a site script.
+
+Defining Variables
+==================
+
+Variables not defined in a site shell script can be set in the
+environment passed to `configure'.  However, some packages may run
+configure again during the build, and the customized values of these
+variables may be lost.  In order to avoid this problem, you should set
+them in the `configure' command line, using `VAR=value'.  For example:
+
+     ./configure CC=/usr/local2/bin/gcc
+
+causes the specified `gcc' to be used as the C compiler (unless it is
+overridden in the site shell script).  Here is a another example:
+
+     /bin/bash ./configure CONFIG_SHELL=/bin/bash
+
+Here the `CONFIG_SHELL=/bin/bash' operand causes subsequent
+configuration-related scripts to be executed by `/bin/bash'.
+
+`configure' Invocation
+======================
+
+`configure' recognizes the following options to control how it operates.
+
+`--help'
+`-h'
+     Print a summary of the options to `configure', and exit.
+
+`--version'
+`-V'
+     Print the version of Autoconf used to generate the `configure'
+     script, and exit.
+
+`--cache-file=FILE'
+     Enable the cache: use and save the results of the tests in FILE,
+     traditionally `config.cache'.  FILE defaults to `/dev/null' to
+     disable caching.
+
+`--config-cache'
+`-C'
+     Alias for `--cache-file=config.cache'.
+
+`--quiet'
+`--silent'
+`-q'
+     Do not print messages saying which checks are being made.  To
+     suppress all normal output, redirect it to `/dev/null' (any error
+     messages will still be shown).
+
+`--srcdir=DIR'
+     Look for the package's source code in directory DIR.  Usually
+     `configure' can determine that directory automatically.
+
+`configure' also accepts some other, not widely useful, options.  Run
+`configure --help' for more details.
+
diff --git a/volk/Makefile.am b/volk/Makefile.am
new file mode 100644
index 000000000..d2ef3dad4
--- /dev/null
+++ b/volk/Makefile.am
@@ -0,0 +1,56 @@
+#
+# Copyright 2004,2008 Free Software Foundation, Inc.
+# 
+# This file is part of GNU Radio
+# 
+# GNU Radio is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 3, or (at your option)
+# any later version.
+# 
+# GNU Radio is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+# 
+# You should have received a copy of the GNU General Public License
+# along with GNU Radio; see the file COPYING.  If not, write to
+# the Free Software Foundation, Inc., 51 Franklin Street,
+# Boston, MA 02110-1301, USA.
+# 
+
+ACLOCAL_AMFLAGS = -I config
+
+include $(top_srcdir)/Makefile.common
+
+EXTRA_DIST = bootstrap configure config.h.in
+SUBDIRS = config include lib
+#if USE_PYTHON
+#SUBDIRS += python 
+#endif
+
+pkgconfigdir = $(libdir)/pkgconfig
+pkgconfig_DATA = volk.pc
+
+distclean-local:
+	-rm -f config/lv_set_simd_flags.m4
+	-rm -rf autom4te.cache
+	-rm -f config.*
+	-rm -f depcomp
+	-rm -f install-sh
+	-rm -f ltmain.sh
+	-rm -f py-compile
+	-rm -f missing
+	-rm -f volk_config.h.in
+	-rm -f aclocal.m4
+	-rm -f Makefile.in
+	-rm -f config/lt*
+	-rm -f config/libtool*
+	-rm -f apps/Makefile.in
+	-rm -f config/Makefile.in
+	-rm -f data/Makefile.in
+	-rm -f doc/Makefile.in
+	-rm -f include/Makefile.in
+	-rm -f lib/Makefile.in
+	-rm -f python/Makefile.in
+	-rm -f configure
+\ No newline at end of file
diff --git a/volk/Makefile.common b/volk/Makefile.common
new file mode 100644
index 000000000..3b028b147
--- /dev/null
+++ b/volk/Makefile.common
@@ -0,0 +1,46 @@
+# -*- Makefile -*-
+#
+# Copyright 2004,2006,2008 Free Software Foundation, Inc.
+# 
+# This file is part of GNU Radio
+# 
+# GNU Radio is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 3, or (at your option)
+# any later version.
+# 
+# GNU Radio is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+# 
+# You should have received a copy of the GNU General Public License
+# along with GNU Radio; see the file COPYING.  If not, write to
+# the Free Software Foundation, Inc., 51 Franklin Street,
+# Boston, MA 02110-1301, USA.
+# 
+
+ourincludedir   = $(includedir)/volk
+
+# swig includes 
+ourswigincludedir = $(ourincludedir)/swig
+
+# Install this stuff in the appropriate subdirectory
+# This usually ends up at:
+#   ${prefix}/lib/python${python_version}/site-packages/libvector
+
+ourpythondir = $(pythondir)/volk
+ourpyexecdir = $(pyexecdir)/volk
+
+
+# swig flags
+SWIGPYTHONFLAGS = -fvirtual -python -modern
+SWIGGRFLAGS = -I$(GNURADIO_CORE_INCLUDEDIR)/swig -I$(GNURADIO_CORE_INCLUDEDIR)
+
+# standard defins and includes
+STD_DEFINES_AND_INCLUDES=-I$(top_srcdir)/include -I$(top_srcdir)/lib  $(GNURADIO_CORE_CPPFLAGS)
+
+# Don't assume that make predefines $(RM), because BSD make does
+# not. We define it now in configure.ac using AM_PATH_PROG, but now
+# here have to add a -f to be like GNU make.
+RM=$(RM_PROG) -f
diff --git a/volk/NEWS b/volk/NEWS
new file mode 100644
index 000000000..e69de29bb
--- /dev/null
+++ b/volk/NEWS
diff --git a/volk/README b/volk/README
new file mode 100644
index 000000000..c21b99220
--- /dev/null
+++ b/volk/README
@@ -0,0 +1 @@
+Follow the install instructions.  After ./configure, read LIBVECTOR_MANUAL.  
+\ No newline at end of file
diff --git a/volk/apps/Makefile.am b/volk/apps/Makefile.am
new file mode 100644
index 000000000..a51823971
--- /dev/null
+++ b/volk/apps/Makefile.am
@@ -0,0 +1,27 @@
+#
+# Copyright 2008 Free Software Foundation, Inc.
+# 
+# This file is part of GNU Radio
+# 
+# GNU Radio is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 3, or (at your option)
+# any later version.
+# 
+# GNU Radio is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+# 
+# You should have received a copy of the GNU General Public License along
+# with this program; if not, write to the Free Software Foundation, Inc.,
+# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+#
+
+include $(top_srcdir)/Makefile.common
+
+# C++  stuff here
+
+if USE_PYTHON
+# python stuff here
+endif
diff --git a/volk/bootstrap b/volk/bootstrap
new file mode 100755
index 000000000..ff239c88c
--- /dev/null
+++ b/volk/bootstrap
@@ -0,0 +1,29 @@
+#!/bin/sh
+
+# Copyright 2001,2005,2008 Free Software Foundation, Inc.
+# 
+# This file is part of GNU Radio
+# 
+# GNU Radio is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 3, or (at your option)
+# any later version.
+# 
+# GNU Radio is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+# 
+# You should have received a copy of the GNU General Public License
+# along with GNU Radio; see the file COPYING.  If not, write to
+# the Free Software Foundation, Inc., 51 Franklin Street,
+# Boston, MA 02110-1301, USA.
+rm -fr config.cache autom4te*.cache
+
+cd include/volk && chmod +x volk_register.py && ./volk_register.py && cd ../..
+aclocal -I config
+autoconf
+autoheader
+libtoolize --automake
+automake --add-missing -Wno-portability -Wno-override -Wnone
+
diff --git a/volk/config/Makefile.am b/volk/config/Makefile.am
new file mode 100644
index 000000000..b7dc2c161
--- /dev/null
+++ b/volk/config/Makefile.am
@@ -0,0 +1,60 @@
+#
+# Copyright 2001 Free Software Foundation, Inc.
+# 
+# This file is part of GNU Radio
+# 
+# GNU Radio is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 3, or (at your option)
+# any later version.
+# 
+# GNU Radio is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+# 
+# You should have received a copy of the GNU General Public License
+# along with GNU Radio; see the file COPYING.  If not, write to
+# the Free Software Foundation, Inc., 51 Franklin Street,
+# Boston, MA 02110-1301, USA.
+# 
+
+include $(top_srcdir)/Makefile.common
+
+# Install m4 macros in this directory
+m4datadir = $(datadir)/aclocal
+
+# List your m4 macros here
+m4macros = \
+	acx_pthread.m4 \
+	bnv_have_qt.m4 \
+	cppunit.m4 \
+	gr_check_shm_open.m4 \
+	gr_lib64.m4 \
+	gr_libgnuradio_core_extra_ldflags.m4 \
+	gr_no_undefined.m4 \
+	gr_omnithread.m4 \
+	gr_pwin32.m4 \
+	gr_set_md_cpu.m4 \
+	lv_configure.m4 \
+	gr_subversion.m4 \
+	gr_sysv_shm.m4 \
+	lf_cc.m4 \
+	lf_cxx.m4 \
+	lf_warnings.m4 \
+	lv_lf_warnings.m4 \
+	lf_x11.m4 \
+	lv_set_simd_flags.m4 \
+	lv_set_lv_arch.m4 \
+	mkstemp.m4 \
+	onceonly.m4 \
+	pkg.m4 \
+	gcc_version_workaround.m4
+
+
+
+
+# Don't install m4 macros anymore
+# m4data_DATA = $(m4macros) 
+
+EXTRA_DIST = $(m4macros) 
diff --git a/volk/config/acx_pthread.m4 b/volk/config/acx_pthread.m4
new file mode 100644
index 000000000..eb09f5acc
--- /dev/null
+++ b/volk/config/acx_pthread.m4
@@ -0,0 +1,275 @@
+# ===========================================================================
+#              http://autoconf-archive.cryp.to/acx_pthread.html
+# ===========================================================================
+#
+# SYNOPSIS
+#
+#   ACX_PTHREAD([ACTION-IF-FOUND[, ACTION-IF-NOT-FOUND]])
+#
+# DESCRIPTION
+#
+#   This macro figures out how to build C programs using POSIX threads. It
+#   sets the PTHREAD_LIBS output variable to the threads library and linker
+#   flags, and the PTHREAD_CFLAGS output variable to any special C compiler
+#   flags that are needed. (The user can also force certain compiler
+#   flags/libs to be tested by setting these environment variables.)
+#
+#   Also sets PTHREAD_CC to any special C compiler that is needed for
+#   multi-threaded programs (defaults to the value of CC otherwise). (This
+#   is necessary on AIX to use the special cc_r compiler alias.)
+#
+#   NOTE: You are assumed to not only compile your program with these flags,
+#   but also link it with them as well. e.g. you should link with
+#   $PTHREAD_CC $CFLAGS $PTHREAD_CFLAGS $LDFLAGS ... $PTHREAD_LIBS $LIBS
+#
+#   If you are only building threads programs, you may wish to use these
+#   variables in your default LIBS, CFLAGS, and CC:
+#
+#          LIBS="$PTHREAD_LIBS $LIBS"
+#          CFLAGS="$CFLAGS $PTHREAD_CFLAGS"
+#          CC="$PTHREAD_CC"
+#
+#   In addition, if the PTHREAD_CREATE_JOINABLE thread-attribute constant
+#   has a nonstandard name, defines PTHREAD_CREATE_JOINABLE to that name
+#   (e.g. PTHREAD_CREATE_UNDETACHED on AIX).
+#
+#   ACTION-IF-FOUND is a list of shell commands to run if a threads library
+#   is found, and ACTION-IF-NOT-FOUND is a list of commands to run it if it
+#   is not found. If ACTION-IF-FOUND is not specified, the default action
+#   will define HAVE_PTHREAD.
+#
+#   Please let the authors know if this macro fails on any platform, or if
+#   you have any other suggestions or comments. This macro was based on work
+#   by SGJ on autoconf scripts for FFTW (http://www.fftw.org/) (with help
+#   from M. Frigo), as well as ac_pthread and hb_pthread macros posted by
+#   Alejandro Forero Cuervo to the autoconf macro repository. We are also
+#   grateful for the helpful feedback of numerous users.
+#
+# LAST MODIFICATION
+#
+#   2008-04-12
+#
+# COPYLEFT
+#
+#   Copyright (c) 2008 Steven G. Johnson <stevenj@alum.mit.edu>
+#
+#   This program is free software: you can redistribute it and/or modify it
+#   under the terms of the GNU General Public License as published by the
+#   Free Software Foundation, either version 3 of the License, or (at your
+#   option) any later version.
+#
+#   This program is distributed in the hope that it will be useful, but
+#   WITHOUT ANY WARRANTY; without even the implied warranty of
+#   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General
+#   Public License for more details.
+#
+#   You should have received a copy of the GNU General Public License along
+#   with this program. If not, see <http://www.gnu.org/licenses/>.
+#
+#   As a special exception, the respective Autoconf Macro's copyright owner
+#   gives unlimited permission to copy, distribute and modify the configure
+#   scripts that are the output of Autoconf when processing the Macro. You
+#   need not follow the terms of the GNU General Public License when using
+#   or distributing such scripts, even though portions of the text of the
+#   Macro appear in them. The GNU General Public License (GPL) does govern
+#   all other use of the material that constitutes the Autoconf Macro.
+#
+#   This special exception to the GPL applies to versions of the Autoconf
+#   Macro released by the Autoconf Macro Archive. When you make and
+#   distribute a modified version of the Autoconf Macro, you may extend this
+#   special exception to the GPL to apply to your modified version as well.
+
+AC_DEFUN([ACX_PTHREAD], [
+AC_REQUIRE([AC_CANONICAL_HOST])
+AC_LANG_SAVE
+AC_LANG_C
+acx_pthread_ok=no
+
+# We used to check for pthread.h first, but this fails if pthread.h
+# requires special compiler flags (e.g. on True64 or Sequent).
+# It gets checked for in the link test anyway.
+
+# First of all, check if the user has set any of the PTHREAD_LIBS,
+# etcetera environment variables, and if threads linking works using
+# them:
+if test x"$PTHREAD_LIBS$PTHREAD_CFLAGS" != x; then
+        save_CFLAGS="$CFLAGS"
+        CFLAGS="$CFLAGS $PTHREAD_CFLAGS"
+        save_LIBS="$LIBS"
+        LIBS="$PTHREAD_LIBS $LIBS"
+        AC_MSG_CHECKING([for pthread_join in LIBS=$PTHREAD_LIBS with CFLAGS=$PTHREAD_CFLAGS])
+        AC_TRY_LINK_FUNC(pthread_join, acx_pthread_ok=yes)
+        AC_MSG_RESULT($acx_pthread_ok)
+        if test x"$acx_pthread_ok" = xno; then
+                PTHREAD_LIBS=""
+                PTHREAD_CFLAGS=""
+        fi
+        LIBS="$save_LIBS"
+        CFLAGS="$save_CFLAGS"
+fi
+
+# We must check for the threads library under a number of different
+# names; the ordering is very important because some systems
+# (e.g. DEC) have both -lpthread and -lpthreads, where one of the
+# libraries is broken (non-POSIX).
+
+# Create a list of thread flags to try.  Items starting with a "-" are
+# C compiler flags, and other items are library names, except for "none"
+# which indicates that we try without any flags at all, and "pthread-config"
+# which is a program returning the flags for the Pth emulation library.
+
+acx_pthread_flags="pthreads none -Kthread -kthread lthread -pthread -pthreads -mthreads pthread --thread-safe -mt pthread-config"
+
+# The ordering *is* (sometimes) important.  Some notes on the
+# individual items follow:
+
+# pthreads: AIX (must check this before -lpthread)
+# none: in case threads are in libc; should be tried before -Kthread and
+#       other compiler flags to prevent continual compiler warnings
+# -Kthread: Sequent (threads in libc, but -Kthread needed for pthread.h)
+# -kthread: FreeBSD kernel threads (preferred to -pthread since SMP-able)
+# lthread: LinuxThreads port on FreeBSD (also preferred to -pthread)
+# -pthread: Linux/gcc (kernel threads), BSD/gcc (userland threads)
+# -pthreads: Solaris/gcc
+# -mthreads: Mingw32/gcc, Lynx/gcc
+# -mt: Sun Workshop C (may only link SunOS threads [-lthread], but it
+#      doesn't hurt to check since this sometimes defines pthreads too;
+#      also defines -D_REENTRANT)
+#      ... -mt is also the pthreads flag for HP/aCC
+# pthread: Linux, etcetera
+# --thread-safe: KAI C++
+# pthread-config: use pthread-config program (for GNU Pth library)
+
+case "${host_cpu}-${host_os}" in
+        *solaris*)
+
+        # On Solaris (at least, for some versions), libc contains stubbed
+        # (non-functional) versions of the pthreads routines, so link-based
+        # tests will erroneously succeed.  (We need to link with -pthreads/-mt/
+        # -lpthread.)  (The stubs are missing pthread_cleanup_push, or rather
+        # a function called by this macro, so we could check for that, but
+        # who knows whether they'll stub that too in a future libc.)  So,
+        # we'll just look for -pthreads and -lpthread first:
+
+        acx_pthread_flags="-pthreads pthread -mt -pthread $acx_pthread_flags"
+        ;;
+esac
+
+if test x"$acx_pthread_ok" = xno; then
+for flag in $acx_pthread_flags; do
+
+        case $flag in
+                none)
+                AC_MSG_CHECKING([whether pthreads work without any flags])
+                ;;
+
+                -*)
+                AC_MSG_CHECKING([whether pthreads work with $flag])
+                PTHREAD_CFLAGS="$flag"
+                ;;
+
+		pthread-config)
+		AC_CHECK_PROG(acx_pthread_config, pthread-config, yes, no)
+		if test x"$acx_pthread_config" = xno; then continue; fi
+		PTHREAD_CFLAGS="`pthread-config --cflags`"
+		PTHREAD_LIBS="`pthread-config --ldflags` `pthread-config --libs`"
+		;;
+
+                *)
+                AC_MSG_CHECKING([for the pthreads library -l$flag])
+                PTHREAD_LIBS="-l$flag"
+                ;;
+        esac
+
+        save_LIBS="$LIBS"
+        save_CFLAGS="$CFLAGS"
+        LIBS="$PTHREAD_LIBS $LIBS"
+        CFLAGS="$CFLAGS $PTHREAD_CFLAGS"
+
+        # Check for various functions.  We must include pthread.h,
+        # since some functions may be macros.  (On the Sequent, we
+        # need a special flag -Kthread to make this header compile.)
+        # We check for pthread_join because it is in -lpthread on IRIX
+        # while pthread_create is in libc.  We check for pthread_attr_init
+        # due to DEC craziness with -lpthreads.  We check for
+        # pthread_cleanup_push because it is one of the few pthread
+        # functions on Solaris that doesn't have a non-functional libc stub.
+        # We try pthread_create on general principles.
+        AC_TRY_LINK([#include <pthread.h>],
+                    [pthread_t th; pthread_join(th, 0);
+                     pthread_attr_init(0); pthread_cleanup_push(0, 0);
+                     pthread_create(0,0,0,0); pthread_cleanup_pop(0); ],
+                    [acx_pthread_ok=yes])
+
+        LIBS="$save_LIBS"
+        CFLAGS="$save_CFLAGS"
+
+        AC_MSG_RESULT($acx_pthread_ok)
+        if test "x$acx_pthread_ok" = xyes; then
+                break;
+        fi
+
+        PTHREAD_LIBS=""
+        PTHREAD_CFLAGS=""
+done
+fi
+
+# Various other checks:
+if test "x$acx_pthread_ok" = xyes; then
+        save_LIBS="$LIBS"
+        LIBS="$PTHREAD_LIBS $LIBS"
+        save_CFLAGS="$CFLAGS"
+        CFLAGS="$CFLAGS $PTHREAD_CFLAGS"
+
+        # Detect AIX lossage: JOINABLE attribute is called UNDETACHED.
+	AC_MSG_CHECKING([for joinable pthread attribute])
+	attr_name=unknown
+	for attr in PTHREAD_CREATE_JOINABLE PTHREAD_CREATE_UNDETACHED; do
+	    AC_TRY_LINK([#include <pthread.h>], [int attr=$attr; return attr;],
+                        [attr_name=$attr; break])
+	done
+        AC_MSG_RESULT($attr_name)
+        if test "$attr_name" != PTHREAD_CREATE_JOINABLE; then
+            AC_DEFINE_UNQUOTED(PTHREAD_CREATE_JOINABLE, $attr_name,
+                               [Define to necessary symbol if this constant
+                                uses a non-standard name on your system.])
+        fi
+
+        AC_MSG_CHECKING([if more special flags are required for pthreads])
+        flag=no
+        case "${host_cpu}-${host_os}" in
+            *-aix* | *-freebsd* | *-darwin*) flag="-D_THREAD_SAFE";;
+            *solaris* | *-osf* | *-hpux*) flag="-D_REENTRANT";;
+        esac
+        AC_MSG_RESULT(${flag})
+        if test "x$flag" != xno; then
+            PTHREAD_CFLAGS="$flag $PTHREAD_CFLAGS"
+        fi
+
+        LIBS="$save_LIBS"
+        CFLAGS="$save_CFLAGS"
+
+        # More AIX lossage: must compile with xlc_r or cc_r
+	if test x"$GCC" != xyes; then
+          AC_CHECK_PROGS(PTHREAD_CC, xlc_r cc_r, ${CC})
+        else
+          PTHREAD_CC=$CC
+	fi
+else
+        PTHREAD_CC="$CC"
+fi
+
+AC_SUBST(PTHREAD_LIBS)
+AC_SUBST(PTHREAD_CFLAGS)
+AC_SUBST(PTHREAD_CC)
+
+# Finally, execute ACTION-IF-FOUND/ACTION-IF-NOT-FOUND:
+if test x"$acx_pthread_ok" = xyes; then
+        ifelse([$1],,AC_DEFINE(HAVE_PTHREAD,1,[Define if you have POSIX threads libraries and header files.]),[$1])
+        :
+else
+        acx_pthread_ok=no
+        $2
+fi
+AC_LANG_RESTORE
+])dnl ACX_PTHREAD
diff --git a/volk/config/bnv_have_qt.m4 b/volk/config/bnv_have_qt.m4
new file mode 100644
index 000000000..1469bfbfd
--- /dev/null
+++ b/volk/config/bnv_have_qt.m4
@@ -0,0 +1,404 @@
+dnl Available from the GNU Autoconf Macro Archive at:
+dnl http://www.gnu.org/software/ac-archive/htmldoc/bnv_have_qt.html
+dnl
+AC_DEFUN([BNV_HAVE_QT],
+[
+  dnl THANKS! This code includes bug fixes by:
+  dnl Tim McClarren.
+
+  AC_REQUIRE([AC_PROG_CXX])
+  AC_REQUIRE([AC_PATH_X])
+  AC_REQUIRE([AC_PATH_XTRA])
+
+  AC_MSG_CHECKING(for Qt)
+
+  AC_ARG_WITH([Qt-dir],
+    [  --with-Qt-dir=DIR       DIR is equal to \$QTDIR if you have followed the
+                          installation instructions of Trolltech. Header
+                          files are in DIR/include, binary utilities are
+                          in DIR/bin and the library is in DIR/lib])
+  AC_ARG_WITH([Qt-include-dir],
+    [  --with-Qt-include-dir=DIR
+                          Qt header files are in DIR])
+  AC_ARG_WITH([Qt-bin-dir],
+    [  --with-Qt-bin-dir=DIR   Qt utilities such as moc and uic are in DIR])
+  AC_ARG_WITH([Qt-lib-dir],
+    [  --with-Qt-lib-dir=DIR   The Qt library is in DIR])
+  AC_ARG_WITH([Qt-lib],
+    [  --with-Qt-lib=LIB       Use -lLIB to link with the Qt library])
+  if test x"$with_Qt_dir" = x"no" ||
+     test x"$with_Qt_include-dir" = x"no" ||
+     test x"$with_Qt_bin_dir" = x"no" ||
+     test x"$with_Qt_lib_dir" = x"no" ||
+     test x"$with_Qt_lib" = x"no"; then
+    # user disabled Qt. Leave cache alone.
+    have_qt="User disabled Qt."
+  else
+    # "yes" is a bogus option
+    if test x"$with_Qt_dir" = xyes; then
+      with_Qt_dir=
+    fi
+    if test x"$with_Qt_include_dir" = xyes; then
+      with_Qt_include_dir=
+    fi
+    if test x"$with_Qt_bin_dir" = xyes; then
+      with_Qt_bin_dir=
+    fi
+    if test x"$with_Qt_lib_dir" = xyes; then
+      with_Qt_lib_dir=
+    fi
+    if test x"$with_Qt_lib" = xyes; then
+      with_Qt_lib=
+    fi
+    # No Qt unless we discover otherwise
+    have_qt=no
+    # Check whether we are requested to link with a specific version
+    if test x"$with_Qt_lib" != x; then
+      bnv_qt_lib="$with_Qt_lib"
+    fi
+    # Check whether we were supplied with an answer already
+    if test x"$with_Qt_dir" != x; then
+      have_qt=yes
+      bnv_qt_dir="$with_Qt_dir"
+      bnv_qt_include_dir="$with_Qt_dir/include"
+      bnv_qt_bin_dir="$with_Qt_dir/bin"
+      bnv_qt_lib_dir="$with_Qt_dir/lib"
+      # Only search for the lib if the user did not define one already
+      if test x"$bnv_qt_lib" = x; then
+        bnv_qt_lib="`ls $bnv_qt_lib_dir/libqt* | sed -n 1p |
+                     sed s@$bnv_qt_lib_dir/lib@@ | [sed s@[.].*@@]`"
+      fi
+      bnv_qt_LIBS="-L$bnv_qt_lib_dir -l$bnv_qt_lib $X_PRE_LIBS $X_LIBS -lX11 -lXext -lXmu -lXt -lXi $X_EXTRA_LIBS"
+    else
+      # Use cached value or do search, starting with suggestions from
+      # the command line
+      AC_CACHE_VAL(bnv_cv_have_qt,
+      [
+        # We are not given a solution and there is no cached value.
+        bnv_qt_dir=NO
+        bnv_qt_include_dir=NO
+        bnv_qt_lib_dir=NO
+        if test x"$bnv_qt_lib" = x; then
+          bnv_qt_lib=NO
+        fi
+        BNV_PATH_QT_DIRECT
+        if test "$bnv_qt_dir" = NO ||
+           test "$bnv_qt_include_dir" = NO ||
+           test "$bnv_qt_lib_dir" = NO ||
+           test "$bnv_qt_lib" = NO; then
+          # Problem with finding complete Qt.  Cache the known absence of Qt.
+          bnv_cv_have_qt="have_qt=no"
+        else
+          # Record where we found Qt for the cache.
+          bnv_cv_have_qt="have_qt=yes                  \
+                       bnv_qt_dir=$bnv_qt_dir          \
+               bnv_qt_include_dir=$bnv_qt_include_dir  \
+                   bnv_qt_bin_dir=$bnv_qt_bin_dir      \
+                      bnv_qt_LIBS=\"$bnv_qt_LIBS\""
+        fi
+      ])dnl
+      eval "$bnv_cv_have_qt"
+    fi # all $bnv_qt_* are set
+  fi   # $have_qt reflects the system status
+  if test x"$have_qt" = xyes; then
+    QT_CXXFLAGS="-I$bnv_qt_include_dir"
+    QT_DIR="$bnv_qt_dir"
+    QT_LIBS="$bnv_qt_LIBS"
+    # If bnv_qt_dir is defined, utilities are expected to be in the
+    # bin subdirectory
+    if test x"$bnv_qt_dir" != x; then
+        if test -x "$bnv_qt_dir/bin/uic"; then
+          QT_UIC="$bnv_qt_dir/bin/uic"
+        else
+          # Old versions of Qt don't have uic
+          QT_UIC=
+        fi
+      QT_MOC="$bnv_qt_dir/bin/moc"
+    else
+      # Or maybe we are told where to look for the utilities
+      if test x"$bnv_qt_bin_dir" != x; then
+        if test -x "$bnv_qt_bin_dir/uic"; then
+          QT_UIC="$bnv_qt_bin_dir/uic"
+        else
+          # Old versions of Qt don't have uic
+          QT_UIC=
+        fi
+        QT_MOC="$bnv_qt_bin_dir/moc"
+      else
+      # Last possibility is that they are in $PATH
+        QT_UIC="`which uic`"
+        QT_MOC="`which moc`"
+      fi
+    fi
+    # All variables are defined, report the result
+    AC_MSG_RESULT([$have_qt:
+    QT_CXXFLAGS=$QT_CXXFLAGS
+    QT_DIR=$QT_DIR
+    QT_LIBS=$QT_LIBS
+    QT_UIC=$QT_UIC
+    QT_MOC=$QT_MOC])
+  else
+    # Qt was not found
+    QT_CXXFLAGS=
+    QT_DIR=
+    QT_LIBS=
+    QT_UIC=
+    QT_MOC=
+    AC_MSG_RESULT($have_qt)
+  fi
+  AC_SUBST(QT_CXXFLAGS)
+  AC_SUBST(QT_DIR)
+  AC_SUBST(QT_LIBS)
+  AC_SUBST(QT_UIC)
+  AC_SUBST(QT_MOC)
+
+  #### Being paranoid:
+  if test x"$have_qt" = xyes; then
+    AC_MSG_CHECKING(correct functioning of Qt installation)
+    AC_CACHE_VAL(bnv_cv_qt_test_result,
+    [
+      cat > bnv_qt_test.h << EOF
+#include <qobject.h>
+class Test : public QObject
+{
+Q_OBJECT
+public:
+  Test() {}
+  ~Test() {}
+public slots:
+  void receive() {}
+signals:
+  void send();
+};
+EOF
+
+      cat > bnv_qt_main.$ac_ext << EOF
+#include "bnv_qt_test.h"
+#include <qapplication.h>
+int main( int argc, char **argv )
+{
+  QApplication app( argc, argv );
+  Test t;
+  QObject::connect( &t, SIGNAL(send()), &t, SLOT(receive()) );
+}
+EOF
+
+      bnv_cv_qt_test_result="failure"
+      bnv_try_1="$QT_MOC bnv_qt_test.h -o moc_bnv_qt_test.$ac_ext >/dev/null 2>bnv_qt_test_1.out"
+      AC_TRY_EVAL(bnv_try_1)
+      bnv_err_1=`grep -v '^ *+' bnv_qt_test_1.out | grep -v "^bnv_qt_test.h\$"`
+      if test x"$bnv_err_1" != x; then
+        echo "$bnv_err_1" >&AC_FD_CC
+        echo "configure: could not run $QT_MOC on:" >&AC_FD_CC
+        cat bnv_qt_test.h >&AC_FD_CC
+      else
+        bnv_try_2="$CXX $QT_CXXFLAGS -c $CXXFLAGS -o moc_bnv_qt_test.o moc_bnv_qt_test.$ac_ext >/dev/null 2>bnv_qt_test_2.out"
+        AC_TRY_EVAL(bnv_try_2)
+        bnv_err_2=`grep -v '^ *+' bnv_qt_test_2.out | grep -v "^bnv_qt_test.{$ac_ext}\$"`
+        if test x"$bnv_err_2" != x; then
+          echo "$bnv_err_2" >&AC_FD_CC
+          echo "configure: could not compile:" >&AC_FD_CC
+          cat bnv_qt_test.$ac_ext >&AC_FD_CC
+        else
+          bnv_try_3="$CXX $QT_CXXFLAGS -c $CXXFLAGS -o bnv_qt_main.o bnv_qt_main.$ac_ext >/dev/null 2>bnv_qt_test_3.out"
+          AC_TRY_EVAL(bnv_try_3)
+          bnv_err_3=`grep -v '^ *+' bnv_qt_test_3.out | grep -v "^bnv_qt_main.{$ac_ext}\$"`
+          if test x"$bnv_err_3" != x; then
+            echo "$bnv_err_3" >&AC_FD_CC
+            echo "configure: could not compile:" >&AC_FD_CC
+            cat bnv_qt_main.$ac_ext >&AC_FD_CC
+          else
+            bnv_try_4="$CXX $QT_LIBS $LIBS -o bnv_qt_main bnv_qt_main.o moc_bnv_qt_test.o >/dev/null 2>bnv_qt_test_4.out"
+            AC_TRY_EVAL(bnv_try_4)
+            bnv_err_4=`grep -v '^ *+' bnv_qt_test_4.out`
+            if test x"$bnv_err_4" != x; then
+              echo "$bnv_err_4" >&AC_FD_CC
+            else
+              bnv_cv_qt_test_result="success"
+            fi
+          fi
+        fi
+      fi
+    ])dnl AC_CACHE_VAL bnv_cv_qt_test_result
+    AC_MSG_RESULT([$bnv_cv_qt_test_result]);
+    if test x"$bnv_cv_qt_test_result" = "xfailure"; then
+      # working Qt was not found
+      QT_CXXFLAGS=
+      QT_DIR=
+      QT_LIBS=
+      QT_UIC=
+      QT_MOC=
+      have_qt=no
+      AC_MSG_WARN([Failed to find matching components of a complete
+                  Qt installation. Try using more options,
+                  see ./configure --help.])
+    fi
+
+    rm -f bnv_qt_test.h moc_bnv_qt_test.$ac_ext moc_bnv_qt_test.o \
+          bnv_qt_main.$ac_ext bnv_qt_main.o bnv_qt_main \
+          bnv_qt_test_1.out bnv_qt_test_2.out bnv_qt_test_3.out bnv_qt_test_4.out
+  fi
+])
+
+dnl Internal subroutine of BNV_HAVE_QT
+dnl Set bnv_qt_dir bnv_qt_include_dir bnv_qt_bin_dir bnv_qt_lib_dir bnv_qt_lib
+dnl Copyright 2001 Bastiaan N. Veelo <Bastiaan.N.Veelo@immtek.ntnu.no>
+AC_DEFUN([BNV_PATH_QT_DIRECT],
+[
+  ## Binary utilities ##
+  if test x"$with_Qt_bin_dir" != x; then
+    bnv_qt_bin_dir=$with_Qt_bin_dir
+  fi
+  ## Look for header files ##
+  if test x"$with_Qt_include_dir" != x; then
+    bnv_qt_include_dir="$with_Qt_include_dir"
+  else
+    # The following header file is expected to define QT_VERSION.
+    qt_direct_test_header=qglobal.h
+    # Look for the header file in a standard set of common directories.
+    bnv_include_path_list="
+      /usr/include
+      `ls -dr /usr/include/qt* 2>/dev/null`
+      `ls -dr /usr/lib/qt*/include 2>/dev/null`
+      `ls -dr /usr/local/qt*/include 2>/dev/null`
+      `ls -dr /opt/qt*/include 2>/dev/null`
+    "
+    for bnv_dir in $bnv_include_path_list; do
+      if test -r "$bnv_dir/$qt_direct_test_header"; then
+        bnv_dirs="$bnv_dirs $bnv_dir"
+      fi
+    done
+    # Now look for the newest in this list
+    bnv_prev_ver=0
+    for bnv_dir in $bnv_dirs; do
+      bnv_this_ver=`egrep -w '#define QT_VERSION' $bnv_dir/$qt_direct_test_header | sed s/'#define QT_VERSION'//`
+      if expr $bnv_this_ver '>' $bnv_prev_ver > /dev/null; then
+        bnv_qt_include_dir=$bnv_dir
+        bnv_prev_ver=$bnv_this_ver
+      fi
+    done
+  fi dnl Found header files.
+
+  # Are these headers located in a traditional Trolltech installation?
+  # That would be $bnv_qt_include_dir stripped from its last element:
+  bnv_possible_qt_dir=`dirname $bnv_qt_include_dir`
+  if test -x $bnv_possible_qt_dir/bin/moc &&
+     ls $bnv_possible_qt_dir/lib/libqt* > /dev/null; then
+    # Then the rest is a piece of cake
+    bnv_qt_dir=$bnv_possible_qt_dir
+    bnv_qt_bin_dir="$bnv_qt_dir/bin"
+    bnv_qt_lib_dir="$bnv_qt_dir/lib"
+    # Only look for lib if the user did not supply it already
+    if test x"$bnv_qt_lib" = xNO; then
+      bnv_qt_lib="`ls $bnv_qt_lib_dir/libqt* | sed -n 1p |
+                   sed s@$bnv_qt_lib_dir/lib@@ | [sed s@[.].*@@]`"
+    fi
+    bnv_qt_LIBS="-L$bnv_qt_lib_dir -l$bnv_qt_lib $X_PRE_LIBS $X_LIBS -lX11 -lXext -lXmu -lXt -lXi $X_EXTRA_LIBS"
+  else
+    # There is no valid definition for $QTDIR as Trolltech likes to see it
+    bnv_qt_dir=
+    ## Look for Qt library ##
+    if test x"$with_Qt_lib_dir" != x; then
+      bnv_qt_lib_dir="$with_Qt_lib_dir"
+      # Only look for lib if the user did not supply it already
+      if test x"$bnv_qt_lib" = xNO; then
+        bnv_qt_lib="`ls $bnv_qt_lib_dir/libqt* | sed -n 1p |
+                     sed s@$bnv_qt_lib_dir/lib@@ | [sed s@[.].*@@]`"
+      fi
+      bnv_qt_LIBS="-L$bnv_qt_lib_dir -l$bnv_qt_lib $X_PRE_LIBS $X_LIBS -lX11 -lXext -lXmu -lXt -lXi $X_EXTRA_LIBS"
+    else
+      # Normally, when there is no traditional Trolltech installation,
+      # the library is installed in a place where the linker finds it
+      # automatically.
+      # If the user did not define the library name, try with qt
+      if test x"$bnv_qt_lib" = xNO; then
+        bnv_qt_lib=qt
+      fi
+      qt_direct_test_header=qapplication.h
+      qt_direct_test_main="
+        int argc;
+        char ** argv;
+        QApplication app(argc,argv);
+      "
+      # See if we find the library without any special options.
+      # Don't add top $LIBS permanently yet
+      bnv_save_LIBS="$LIBS"
+      LIBS="-l$bnv_qt_lib $X_PRE_LIBS $X_LIBS -lX11 -lXext -lXmu -lXt -lXi $X_EXTRA_LIBS"
+      bnv_qt_LIBS="$LIBS"
+      bnv_save_CXXFLAGS="$CXXFLAGS"
+      CXXFLAGS="-I$bnv_qt_include_dir"
+      AC_TRY_LINK([#include <$qt_direct_test_header>],
+        $qt_direct_test_main,
+      [
+        # Success.
+        # We can link with no special library directory.
+        bnv_qt_lib_dir=
+      ], [
+        # That did not work. Try the multi-threaded version
+        echo "Non-critical error, please neglect the above." >&AC_FD_CC
+        bnv_qt_lib=qt-mt
+        LIBS="-l$bnv_qt_lib $X_PRE_LIBS $X_LIBS -lX11 -lXext -lXmu -lXt -lXi $X_EXTRA_LIBS"
+        AC_TRY_LINK([#include <$qt_direct_test_header>],
+          $qt_direct_test_main,
+        [
+          # Success.
+          # We can link with no special library directory.
+          bnv_qt_lib_dir=
+        ], [
+          # That did not work. Try the OpenGL version
+          echo "Non-critical error, please neglect the above." >&AC_FD_CC
+          bnv_qt_lib=qt-gl
+          LIBS="-l$bnv_qt_lib $X_PRE_LIBS $X_LIBS -lX11 -lXext -lXmu -lXt -lXi $X_EXTRA_LIBS"
+          AC_TRY_LINK([#include <$qt_direct_test_header>],
+            $qt_direct_test_main,
+          [
+            # Succes.
+            # We can link with no special library directory.
+            bnv_qt_lib_dir=
+          ], [
+            # That did not work. Maybe a library version I don't know about?
+            echo "Non-critical error, please neglect the above." >&AC_FD_CC
+            # Look for some Qt lib in a standard set of common directories.
+            bnv_dir_list="
+              `echo $bnv_qt_includes | sed ss/includess`
+              /lib
+              /usr/lib
+              /usr/local/lib
+              /opt/lib
+              `ls -dr /usr/lib/qt* 2>/dev/null`
+              `ls -dr /usr/local/qt* 2>/dev/null`
+              `ls -dr /opt/qt* 2>/dev/null`
+            "
+            for bnv_dir in $bnv_dir_list; do
+              if ls $bnv_dir/libqt*; then
+                # Gamble that it's the first one...
+                bnv_qt_lib="`ls $bnv_dir/libqt* | sed -n 1p |
+                            sed s@$bnv_dir/lib@@ | sed s/[.].*//`"
+                bnv_qt_lib_dir="$bnv_dir"
+                break
+              fi
+            done
+            # Try with that one
+            LIBS="-l$bnv_qt_lib $X_PRE_LIBS $X_LIBS -lX11 -lXext -lXmu -lXt -lXi $X_EXTRA_LIBS"
+            AC_TRY_LINK([#include <$qt_direct_test_header>],
+              $qt_direct_test_main,
+            [
+              # Succes.
+              # We can link with no special library directory.
+              bnv_qt_lib_dir=
+            ], [
+              # Leave bnv_qt_lib_dir defined
+            ])
+          ])
+        ])
+      ])
+      if test x"$bnv_qt_lib_dir" != x; then
+        bnv_qt_LIBS="-l$bnv_qt_lib_dir $LIBS"
+      else
+        bnv_qt_LIBS="$LIBS"
+      fi
+      LIBS="$bnv_save_LIBS"
+      CXXFLAGS="$bnv_save_CXXFLAGS"
+    fi dnl $with_Qt_lib_dir was not given
+  fi dnl Done setting up for non-traditional Trolltech installation
+])
diff --git a/volk/config/cppunit.m4 b/volk/config/cppunit.m4
new file mode 100644
index 000000000..0991d51ec
--- /dev/null
+++ b/volk/config/cppunit.m4
@@ -0,0 +1,80 @@
+dnl
+dnl AM_PATH_CPPUNIT(MINIMUM-VERSION, [ACTION-IF-FOUND [, ACTION-IF-NOT-FOUND]])
+dnl
+AC_DEFUN([AM_PATH_CPPUNIT],
+[
+
+AC_ARG_WITH(cppunit-prefix,[  --with-cppunit-prefix=PFX   Prefix where CppUnit is installed (optional)],
+            cppunit_config_prefix="$withval", cppunit_config_prefix="")
+AC_ARG_WITH(cppunit-exec-prefix,[  --with-cppunit-exec-prefix=PFX  Exec prefix where CppUnit is installed (optional)],
+            cppunit_config_exec_prefix="$withval", cppunit_config_exec_prefix="")
+
+  if test x$cppunit_config_exec_prefix != x ; then
+     cppunit_config_args="$cppunit_config_args --exec-prefix=$cppunit_config_exec_prefix"
+     if test x${CPPUNIT_CONFIG+set} != xset ; then
+        CPPUNIT_CONFIG=$cppunit_config_exec_prefix/bin/cppunit-config
+     fi
+  fi
+  if test x$cppunit_config_prefix != x ; then
+     cppunit_config_args="$cppunit_config_args --prefix=$cppunit_config_prefix"
+     if test x${CPPUNIT_CONFIG+set} != xset ; then
+        CPPUNIT_CONFIG=$cppunit_config_prefix/bin/cppunit-config
+     fi
+  fi
+
+  AC_PATH_PROG(CPPUNIT_CONFIG, cppunit-config, no)
+  cppunit_version_min=$1
+
+  AC_MSG_CHECKING(for Cppunit - version >= $cppunit_version_min)
+  no_cppunit=""
+  if test "$CPPUNIT_CONFIG" = "no" ; then
+    no_cppunit=yes
+  else
+    CPPUNIT_CFLAGS=`$CPPUNIT_CONFIG --cflags`
+    CPPUNIT_LIBS=`$CPPUNIT_CONFIG --libs`
+    cppunit_version=`$CPPUNIT_CONFIG --version`
+
+    cppunit_major_version=`echo $cppunit_version | \
+           sed 's/\([[0-9]]*\).\([[0-9]]*\).\([[0-9]]*\)/\1/'`
+    cppunit_minor_version=`echo $cppunit_version | \
+           sed 's/\([[0-9]]*\).\([[0-9]]*\).\([[0-9]]*\)/\2/'`
+    cppunit_micro_version=`echo $cppunit_version | \
+           sed 's/\([[0-9]]*\).\([[0-9]]*\).\([[0-9]]*\)/\3/'`
+
+    cppunit_major_min=`echo $cppunit_version_min | \
+           sed 's/\([[0-9]]*\).\([[0-9]]*\).\([[0-9]]*\)/\1/'`
+    cppunit_minor_min=`echo $cppunit_version_min | \
+           sed 's/\([[0-9]]*\).\([[0-9]]*\).\([[0-9]]*\)/\2/'`
+    cppunit_micro_min=`echo $cppunit_version_min | \
+           sed 's/\([[0-9]]*\).\([[0-9]]*\).\([[0-9]]*\)/\3/'`
+
+    cppunit_version_proper=`expr \
+        $cppunit_major_version \> $cppunit_major_min \| \
+        $cppunit_major_version \= $cppunit_major_min \& \
+        $cppunit_minor_version \> $cppunit_minor_min \| \
+        $cppunit_major_version \= $cppunit_major_min \& \
+        $cppunit_minor_version \= $cppunit_minor_min \& \
+        $cppunit_micro_version \>= $cppunit_micro_min `
+
+    if test "$cppunit_version_proper" = "1" ; then
+      AC_MSG_RESULT([$cppunit_major_version.$cppunit_minor_version.$cppunit_micro_version])
+    else
+      AC_MSG_RESULT(no)
+      no_cppunit=yes
+    fi
+  fi
+
+  if test "x$no_cppunit" = x ; then
+     ifelse([$2], , :, [$2])     
+  else
+     CPPUNIT_CFLAGS=""
+     CPPUNIT_LIBS=""
+     ifelse([$3], , :, [$3])
+  fi
+
+  AC_SUBST(CPPUNIT_CFLAGS)
+  AC_SUBST(CPPUNIT_LIBS)
+])
+
+
+
diff --git a/volk/config/gcc_version_workaround.m4 b/volk/config/gcc_version_workaround.m4
new file mode 100644
index 000000000..b3ba0b6f3
--- /dev/null
+++ b/volk/config/gcc_version_workaround.m4
@@ -0,0 +1,49 @@
+AC_DEFUN([LV_GCC_VERSION_WORKAROUND], 
+[
+	AC_REQUIRE([LF_CONFIGURE_CXX])
+	
+	cxx_version=`$CXX --version`
+
+	cxx_major_version=`echo $cxx_version | sed 's/[[^)]]*) \([[0-9]]*\).\([[0-9]]*\).\([[0-9]]*\).*/\1/'`
+	cxx_minor_version=`echo $cxx_version | sed 's/g++ [[^)]]*) \([[0-9]]*\).\([[0-9]]*\).\([[0-9]]*\).*/\2/'`
+	cxx_micro_version=`echo $cxx_version | sed 's/g++ [[^)]]*) \([[0-9]]*\).\([[0-9]]*\).\([[0-9]]*\).*/\3/'`
+    	
+	if test ["$cxx_minor_version" -lt "3"] -o ["$cxx_major_version" -lt "4"]; then
+		cxx_proper_version="$cxx_major_version.$cxx_minor_version.$cxx_micro_version"
+		
+
+		my_arch=`uname -m`
+
+		if test "${my_arch}" = i686; then
+		   my_arch="${my_arch} i586 i486 i386"
+		fi
+		if test "${my_arch}" = i586; then
+		   my_arch = "${my_arch} i686 i386 i486"
+		fi
+		if test "${my_arch}" = i486; then
+		   my_arch = "${my_arch} i686 i386 i586"
+		fi
+		if test "${my_arch}" = i386; then
+		   my_arch = "${my_arch} i686 i586 i486"
+		fi
+		
+		for i in $my_arch
+		do
+		  if test -n "`ls /usr/include/c++/$cxx_proper_version | grep $i`"; then
+                    mystery_dir=`ls /usr/include/c++/$cxx_proper_version | grep $i`
+		    echo `ls /usr/include/c++/$cxx_proper_version | grep $i`
+		  fi
+		done
+		echo "${mystery_dir}"
+		
+		LV_CXXFLAGS="${LV_CXXFLAGS} -nostdinc++ -I/usr/include/c++/$cxx_proper_version -I/usr/include/c++/$cxx_proper_version/$mystery_dir"
+		
+		CXXFLAGS="${CXXFLAGS} -nostdinc++ -I/usr/include/c++/$cxx_proper_version -I/usr/include/c++/$cxx_proper_version/$mystery_dir"
+		
+
+	
+	fi
+    	
+	
+
+	])
+\ No newline at end of file
diff --git a/volk/config/gr_lib64.m4 b/volk/config/gr_lib64.m4
new file mode 100644
index 000000000..751f774b4
--- /dev/null
+++ b/volk/config/gr_lib64.m4
@@ -0,0 +1,85 @@
+dnl
+dnl Copyright 2005,2008 Free Software Foundation, Inc.
+dnl 
+dnl This file is part of GNU Radio
+dnl 
+dnl GNU Radio is free software; you can redistribute it and/or modify
+dnl it under the terms of the GNU General Public License as published by
+dnl the Free Software Foundation; either version 3, or (at your option)
+dnl any later version.
+dnl 
+dnl GNU Radio is distributed in the hope that it will be useful,
+dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
+dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+dnl GNU General Public License for more details.
+dnl 
+dnl You should have received a copy of the GNU General Public License
+dnl along with GNU Radio; see the file COPYING.  If not, write to
+dnl the Free Software Foundation, Inc., 51 Franklin Street,
+dnl Boston, MA 02110-1301, USA.
+dnl 
+
+dnl GR_LIB64()
+dnl
+dnl Checks to see if we're on a x86_64 or powerpc64 machine, and if so, determine
+dnl if libdir should end in "64" or not.
+dnl
+dnl Sets gr_libdir_suffix to "" or "64" and calls AC_SUBST(gr_libdir_suffix)
+dnl May append "64" to libdir.
+dnl
+dnl The current heuristic is:
+dnl   if the host_cpu isn't x86_64 or powerpc64, then ""
+dnl   if the host_os isn't linux, then ""
+dnl   if we're cross-compiling, ask the linker, by way of the selected compiler
+dnl   if we're x86_64 and there's a /lib64 and it's not a symlink, then "64", else ""
+dnl   else ask the compiler
+dnl
+AC_DEFUN([GR_LIB64],[
+  AC_REQUIRE([AC_CANONICAL_HOST])
+  AC_REQUIRE([AC_PROG_CXX])
+
+  AC_MSG_CHECKING([gr_libdir_suffix])
+  gr_libdir_suffix=""
+  AC_SUBST(gr_libdir_suffix)
+
+  case "$host_os" in
+    linux*) is_linux=yes ;;
+    *)      is_linux=no  ;;
+  esac
+
+  if test "$is_linux" = no || test "$host_cpu" != "x86_64" && test "$host_cpu" != "powerpc64"; then
+    gr_libdir_suffix=""
+  elif test "$cross_compiling" = yes; then
+    _GR_LIB64_ASK_COMPILER
+  elif test "$host_cpu" = "x86_64"; then
+    if test -d /lib64 && test ! -L /lib64; then
+      gr_libdir_suffix=64
+    fi
+  else
+    _GR_LIB64_ASK_COMPILER  
+  fi
+  AC_MSG_RESULT([$gr_libdir_suffix])
+
+
+  AC_MSG_CHECKING([whether to append 64 to libdir])
+  t=${libdir##*/lib}
+  if test "$t" != 64 && test "$gr_libdir_suffix" = "64"; then
+    libdir=${libdir}64
+    AC_MSG_RESULT([yes. Setting libdir to $libdir])
+  else
+    AC_MSG_RESULT([no])
+  fi
+])
+
+dnl If we're using g++, extract the first SEARCH_DIR("...") entry from the linker script
+dnl and see if it contains a suffix after the final .../lib part of the path.
+dnl (This works because the linker script varies depending on whether we're generating
+dnl 32-bit or 64-bit executables)
+dnl
+AC_DEFUN([_GR_LIB64_ASK_COMPILER],[
+  if test "$ac_cv_cxx_compiler_gnu" = "yes";
+  then
+    gr_libdir_suffix=`$CXX -Wl,--verbose 2>/dev/null | sed -n -e '/SEARCH_DIR/{s/;.*$//; s,^.*/,,; s/".*$//; s/^lib//; p}'`
+  fi
+])
+
diff --git a/volk/config/gr_libgnuradio_core_extra_ldflags.m4 b/volk/config/gr_libgnuradio_core_extra_ldflags.m4
new file mode 100644
index 000000000..43f872c04
--- /dev/null
+++ b/volk/config/gr_libgnuradio_core_extra_ldflags.m4
@@ -0,0 +1,40 @@
+# Check for (MinGW)win32 extra ld options.             -*- Autoconf -*-
+
+# Copyright 2003,2004,2005 Free Software Foundation, Inc.
+#
+# This file is part of GNU Radio
+#
+# GNU Radio is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 3, or (at your option)
+# any later version.
+#
+# GNU Radio is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with GNU Radio; see the file COPYING.  If not, write to
+# the Free Software Foundation, Inc., 51 Franklin Street,
+# Boston, MA 02110-1301, USA.
+
+dnl
+AC_DEFUN([GR_LIBGNURADIO_CORE_EXTRA_LDFLAGS], [
+AC_REQUIRE([AC_PROG_LD])
+# on Mingw32 extra LDFLAGS are required to ease global variable linking
+LIBGNURADIO_CORE_EXTRA_LDFLAGS=""
+
+AC_MSG_CHECKING([whether $LD accepts --enable-runtime-pseudo-reloc])
+if ${LD} --enable-runtime-pseudo-reloc --version >/dev/null 2>&1
+then
+        # libtool requires the quotes
+        LIBGNURADIO_CORE_EXTRA_LDFLAGS="\"-Wl,--enable-runtime-pseudo-reloc\""
+        AC_MSG_RESULT(yes)
+else
+        AC_MSG_RESULT(no)
+fi
+
+AC_SUBST(LIBGNURADIO_CORE_EXTRA_LDFLAGS)
+
+])
diff --git a/volk/config/gr_no_undefined.m4 b/volk/config/gr_no_undefined.m4
new file mode 100644
index 000000000..c8d745d5f
--- /dev/null
+++ b/volk/config/gr_no_undefined.m4
@@ -0,0 +1,44 @@
+dnl
+dnl Copyright 2005 Free Software Foundation, Inc.
+dnl 
+dnl This file is part of GNU Radio
+dnl 
+dnl GNU Radio is free software; you can redistribute it and/or modify
+dnl it under the terms of the GNU General Public License as published by
+dnl the Free Software Foundation; either version 3, or (at your option)
+dnl any later version.
+dnl 
+dnl GNU Radio is distributed in the hope that it will be useful,
+dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
+dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+dnl GNU General Public License for more details.
+dnl 
+dnl You should have received a copy of the GNU General Public License
+dnl along with GNU Radio; see the file COPYING.  If not, write to
+dnl the Free Software Foundation, Inc., 51 Franklin Street,
+dnl Boston, MA 02110-1301, USA.
+dnl 
+
+# GR_NO_UNDEFINED()
+#
+# Detemine whether we need to use the -no-undefined linker flag
+# when building shared libraries.  
+# Sets NO_UNDEFINED to "" or "-no-undefined"
+# 
+# As far as I can tell, we need -no-undefined only when building
+# windows DLLs.  This occurs when using MinGW and Cygwin.
+#
+# For now, we stub this out.
+
+AC_DEFUN([GR_NO_UNDEFINED],[
+	AC_REQUIRE([AC_CANONICAL_HOST])
+	no_undefined=""
+	case "${host_os}" in
+		*mingw* | *cygwin*)
+
+		# on MinGW/Cygwin extra LDFLAGS are required
+		no_undefined="-no-undefined"
+		;;
+	esac
+	AC_SUBST(NO_UNDEFINED,[$no_undefined])
+])
diff --git a/volk/config/gr_omnithread.m4 b/volk/config/gr_omnithread.m4
new file mode 100644
index 000000000..054f07824
--- /dev/null
+++ b/volk/config/gr_omnithread.m4
@@ -0,0 +1,52 @@
+# Check for Omnithread (pthread/NT) thread support.             -*- Autoconf -*-
+
+# Copyright 2003,2007 Free Software Foundation, Inc.
+
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 3, or (at your option)
+# any later version.
+
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 51 Franklin Street, Boston, MA
+# 02110-1301, USA.
+
+AC_DEFUN([GR_OMNITHREAD],
+[
+  # Check first for POSIX
+  ACX_PTHREAD(
+  [ AC_DEFINE(HAVE_PTHREAD,1,[Define if you have POSIX threads libraries and header files.])
+    ot_posix="yes"
+    DEFINES="$DEFINES -DOMNITHREAD_POSIX=1"
+  ],[
+    # If no POSIX support found, then check for NT threads
+    AC_MSG_CHECKING([for NT threads])
+
+    AC_LINK_IFELSE([
+        #include <windows.h>
+        #include <winbase.h>
+        int main() { InitializeCriticalSection(NULL); return 0; }
+      ],
+      [ 
+        ot_nt="yes"
+        DEFINES="$DEFINES -DOMNITHREAD_NT=1"
+      ],
+      [AC_MSG_FAILURE([GNU Radio requires POSIX threads.  pthreads not found.])]
+    )
+    AC_MSG_RESULT(yes)
+  ])
+  AM_CONDITIONAL(OMNITHREAD_POSIX, test "x$ot_posix" = xyes)
+  AM_CONDITIONAL(OMNITHREAD_NT, test "x$ot_nt" = xyes)
+
+  save_LIBS="$LIBS"
+  AC_SEARCH_LIBS([clock_gettime], [rt], [PTHREAD_LIBS="$PTHREAD_LIBS $LIBS"])
+  AC_CHECK_FUNCS([clock_gettime gettimeofday nanosleep])
+  LIBS="$save_LIBS"
+])
+
diff --git a/volk/config/gr_pwin32.m4 b/volk/config/gr_pwin32.m4
new file mode 100644
index 000000000..7b99cba6b
--- /dev/null
+++ b/volk/config/gr_pwin32.m4
@@ -0,0 +1,146 @@
+# Check for (mingw)win32 POSIX replacements.             -*- Autoconf -*-
+
+# Copyright 2003,2004,2005 Free Software Foundation, Inc.
+# 
+# This file is part of GNU Radio
+# 
+# GNU Radio is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 3, or (at your option)
+# any later version.
+# 
+# GNU Radio is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+# 
+# You should have received a copy of the GNU General Public License
+# along with GNU Radio; see the file COPYING.  If not, write to
+# the Free Software Foundation, Inc., 51 Franklin Street,
+# Boston, MA 02110-1301, USA.
+
+
+AC_DEFUN([GR_PWIN32],
+[
+AC_REQUIRE([AC_HEADER_TIME])
+AC_CHECK_HEADERS([sys/types.h fcntl.h io.h])
+AC_CHECK_HEADERS([windows.h])
+AC_CHECK_HEADERS([winioctl.h winbase.h], [], [], [
+	#if HAVE_WINDOWS_H
+	#include <windows.h>
+	#endif
+])
+
+AC_CHECK_FUNCS([getopt usleep gettimeofday nanosleep rand srand random srandom sleep sigaction])
+AC_CHECK_TYPES([struct timezone, struct timespec, ssize_t],[],[],[
+     #if HAVE_SYS_TYPES_H
+     # include <sys/types.h>
+     #endif
+     #if TIME_WITH_SYS_TIME
+     # include <sys/time.h>
+     # include <time.h>
+     #else
+     # if HAVE_SYS_TIME_H
+     #  include <sys/time.h>
+     # else
+     #  include <time.h>
+     # endif
+     #endif
+])
+
+dnl Checks for replacements
+AC_REPLACE_FUNCS([getopt usleep gettimeofday])
+
+
+AC_MSG_CHECKING(for Sleep)
+AC_TRY_LINK([   #include <windows.h>
+                #include <winbase.h>
+                ], [ Sleep(0); ],
+                [AC_DEFINE(HAVE_SSLEEP,1,[Define to 1 if you have win32 Sleep])
+                AC_MSG_RESULT(yes)],
+                AC_MSG_RESULT(no)
+                )
+
+dnl Under Win32, mkdir prototype in io.h has only one arg
+AC_MSG_CHECKING(whether mkdir accepts only one arg)
+AC_TRY_COMPILE([#include <sys/types.h>
+       #include <sys/stat.h>
+       #include <fcntl.h>], [
+       mkdir("")
+ ], [ AC_MSG_RESULT(yes)
+     AC_DEFINE(MKDIR_TAKES_ONE_ARG,[],[Define if mkdir accepts only one arg]) ],
+ [ AC_MSG_RESULT(no)
+ ])
+
+AH_BOTTOM(
+[
+/* Define missing prototypes, implemented in replacement lib */
+#ifdef  __cplusplus
+extern "C" {
+#endif
+
+#ifndef HAVE_GETOPT
+int getopt (int argc, char * const argv[], const char * optstring);
+extern char * optarg;
+extern int optind, opterr, optopt;
+#endif
+
+#ifndef HAVE_USLEEP
+int usleep(unsigned long usec);	/* SUSv2 */
+#endif
+
+#ifndef HAVE_NANOSLEEP
+#ifndef HAVE_STRUCT_TIMESPEC
+#if HAVE_SYS_TYPES_H
+# include <sys/types.h>	/* need time_t */
+#endif
+struct timespec {
+	time_t	tv_sec;
+	long	tv_nsec;
+};
+#endif
+static inline int nanosleep(const struct timespec *req, struct timespec *rem) { return usleep(req->tv_sec*1000000+req->tv_nsec/1000); }
+#endif
+
+#if defined(HAVE_SSLEEP) && !defined(HAVE_SLEEP)
+#ifdef HAVE_WINBASE_H
+#include <windows.h>
+#include <winbase.h>
+#endif
+/* TODO: what about SleepEx? */
+static inline unsigned int sleep (unsigned int nb_sec) { Sleep(nb_sec*1000); return 0; }
+#endif
+
+#ifndef HAVE_GETTIMEOFDAY
+#ifdef HAVE_SYS_TIME_H
+#include <sys/time.h>
+#endif
+#ifndef HAVE_STRUCT_TIMEZONE
+struct timezone {
+        int  tz_minuteswest;
+	int  tz_dsttime;
+};
+#endif
+int gettimeofday(struct timeval *tv, struct timezone *tz);
+#endif
+
+#if !defined(HAVE_RANDOM) && defined(HAVE_RAND)
+#include <stdlib.h>
+static inline long int random (void) { return rand(); }
+#endif
+
+#if !defined(HAVE_SRANDOM) && defined(HAVE_SRAND)
+static inline void srandom (unsigned int seed) { srand(seed); }
+#endif
+
+#ifndef HAVE_SSIZE_T
+typedef size_t ssize_t;
+#endif
+
+#ifdef  __cplusplus
+}
+#endif
+])
+
+
+])
diff --git a/volk/config/gr_set_md_cpu.m4 b/volk/config/gr_set_md_cpu.m4
new file mode 100644
index 000000000..56fd83bac
--- /dev/null
+++ b/volk/config/gr_set_md_cpu.m4
@@ -0,0 +1,63 @@
+dnl
+dnl Copyright 2003,2008 Free Software Foundation, Inc.
+dnl 
+dnl This file is part of GNU Radio
+dnl 
+dnl GNU Radio is free software; you can redistribute it and/or modify
+dnl it under the terms of the GNU General Public License as published by
+dnl the Free Software Foundation; either version 3, or (at your option)
+dnl any later version.
+dnl 
+dnl GNU Radio is distributed in the hope that it will be useful,
+dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
+dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+dnl GNU General Public License for more details.
+dnl 
+dnl You should have received a copy of the GNU General Public License
+dnl along with GNU Radio; see the file COPYING.  If not, write to
+dnl the Free Software Foundation, Inc., 51 Franklin Street,
+dnl Boston, MA 02110-1301, USA.
+dnl 
+
+AC_DEFUN([_TRY_ADD_ALTIVEC],
+[
+  LF_CHECK_CC_FLAG([-mabi=altivec -maltivec])
+  LF_CHECK_CXX_FLAG([-mabi=altivec -maltivec])
+])
+
+AC_DEFUN([GR_SET_MD_CPU],[
+  AC_REQUIRE([AC_CANONICAL_HOST])
+  AC_ARG_WITH(md-cpu,
+	AC_HELP_STRING([--with-md-cpu=ARCH],[set machine dependent speedups (auto)]),
+		[cf_with_md_cpu="$withval"],
+		[cf_with_md_cpu="$host_cpu"])
+
+  case "$cf_with_md_cpu" in
+   x86 | i[[3-7]]86)	MD_CPU=x86	MD_SUBCPU=x86 ;;
+   x86_64)		MD_CPU=x86	MD_SUBCPU=x86_64 ;;
+   powerpc*)            MD_CPU=powerpc ;;
+   *)           	MD_CPU=generic ;;
+  esac
+
+  AC_ARG_ENABLE(altivec,
+    AC_HELP_STRING([--enable-altivec],[enable altivec on PowerPC (yes)]),
+    [ if test $MD_CPU = powerpc; then
+        case "$enableval" in
+          (no)  MD_CPU=generic ;;
+          (yes) _TRY_ADD_ALTIVEC ;;
+          (*) AC_MSG_ERROR([Invalid argument ($enableval) to --enable-altivec]) ;;
+        esac
+      fi],
+    [ if test $MD_CPU = powerpc; then _TRY_ADD_ALTIVEC fi])
+
+
+  AC_MSG_CHECKING([for machine dependent speedups])
+  AC_MSG_RESULT($MD_CPU)
+  AC_SUBST(MD_CPU)
+  AC_SUBST(MD_SUBCPU) 
+
+  AM_CONDITIONAL(MD_CPU_x86,     test "$MD_CPU" = "x86")
+  AM_CONDITIONAL(MD_SUBCPU_x86_64,  test "$MD_SUBCPU" = "x86_64")
+  AM_CONDITIONAL(MD_CPU_powerpc, test "$MD_CPU" = "powerpc")
+  AM_CONDITIONAL(MD_CPU_generic, test "$MD_CPU" = "generic")
+])
diff --git a/volk/config/gr_sysv_shm.m4 b/volk/config/gr_sysv_shm.m4
new file mode 100644
index 000000000..db5c8351e
--- /dev/null
+++ b/volk/config/gr_sysv_shm.m4
@@ -0,0 +1,36 @@
+# Check for IPC System V shm support.             -*- Autoconf -*-
+
+# Copyright 2003 Free Software Foundation, Inc.
+
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 3, or (at your option)
+# any later version.
+
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 51 Franklin Street, Boston, MA
+# 02110-1301, USA.
+
+AC_DEFUN([GR_SYSV_SHM],
+[
+	AC_LANG_SAVE
+	AC_LANG_C
+
+	AC_CHECK_HEADERS([sys/ipc.h sys/shm.h])
+
+        save_LIBS="$LIBS"
+	AC_SEARCH_LIBS(shmat, [cygipc ipc],
+	  [ IPC_LIBS="$LIBS" ],
+	  [ AC_MSG_WARN([SystemV IPC support not found. ]) ]
+	)
+        LIBS="$save_LIBS"
+
+	AC_LANG_RESTORE
+	AC_SUBST(IPC_LIBS)
+])
diff --git a/volk/config/lf_cc.m4 b/volk/config/lf_cc.m4
new file mode 100644
index 000000000..b75e1a4c5
--- /dev/null
+++ b/volk/config/lf_cc.m4
@@ -0,0 +1,41 @@
+dnl Autoconf support for C++
+dnl Copyright (C) 1988 Eleftherios Gkioulekas <lf@amath.washington.edu>
+dnl  
+dnl This program is free software; you can redistribute it and/or modify
+dnl it under the terms of the GNU General Public License as published by
+dnl the Free Software Foundation; either version 3 of the License, or
+dnl (at your option) any later version.
+dnl 
+dnl This program is distributed in the hope that it will be useful,
+dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
+dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+dnl GNU General Public License for more details.
+dnl 
+dnl You should have received a copy of the GNU General Public License
+dnl along with this program; if not, write to the Free Software 
+dnl Foundation, Inc., 51 Franklin Street, Boston, MA 02110-1301, USA.
+dnl 
+dnl As a special exception to the GNU General Public License, if you 
+dnl distribute this file as part of a program that contains a configuration 
+dnl script generated by Autoconf, you may include it under the same 
+dnl distribution terms that you use for the rest of that program.
+
+# -------------------------------------------------------------------------
+# Use this macro to configure your C compiler
+# When called the macro does the following things:
+# 1. It finds an appropriate C compiler.
+#    If you passed the flag --with-cc=foo then it uses that
+#    particular compiler
+# 2. Check whether the compiler works.
+# 3. Checks whether the compiler accepts the -g 
+# -------------------------------------------------------------------------
+
+AC_DEFUN([LF_CONFIGURE_CC],[
+  dnl Sing the song
+  AC_REQUIRE([AC_PROG_CC])dnl
+  AC_REQUIRE([AC_PROG_CPP])dnl
+  AC_REQUIRE([AC_AIX])dnl
+  AC_REQUIRE([AC_ISC_POSIX])dnl
+  AC_REQUIRE([AC_HEADER_STDC])dnl
+])
+
diff --git a/volk/config/lf_cxx.m4 b/volk/config/lf_cxx.m4
new file mode 100644
index 000000000..dfc6bfbfe
--- /dev/null
+++ b/volk/config/lf_cxx.m4
@@ -0,0 +1,67 @@
+dnl Autoconf support for C++
+dnl Copyright (C) 1988 Eleftherios Gkioulekas <lf@amath.washington.edu>
+dnl  
+dnl This program is free software; you can redistribute it and/or modify
+dnl it under the terms of the GNU General Public License as published by
+dnl the Free Software Foundation; either version 3 of the License, or
+dnl (at your option) any later version.
+dnl 
+dnl This program is distributed in the hope that it will be useful,
+dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
+dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+dnl GNU General Public License for more details.
+dnl 
+dnl You should have received a copy of the GNU General Public License
+dnl along with this program; if not, write to the Free Software 
+dnl Foundation, Inc., 51 Franklin Street, Boston, MA 02110-1301, USA.
+dnl 
+dnl As a special exception to the GNU General Public License, if you 
+dnl distribute this file as part of a program that contains a configuration 
+dnl script generated by Autoconf, you may include it under the same 
+dnl distribution terms that you use for the rest of that program.
+
+# -----------------------------------------------------------------
+# This macro should be called to configure your C++ compiler.
+# When called, the macro does the following things:
+# 1. It finds an appropriate C++ compiler
+#    If you passed the flag --with-cxx=foo, then it uses that
+#    particular compiler
+# 2. Checks whether the compiler accepts the -g 
+# ------------------------------------------------------------------
+
+AC_DEFUN([LF_CONFIGURE_CXX],[
+ AC_REQUIRE([AC_PROG_CXX])dnl
+ AC_REQUIRE([AC_PROG_CXXCPP])dnl
+ LF_CXX_PORTABILITY
+])
+
+# -----------------------------------------------------------------------
+# This macro tests the C++ compiler for various portability problem.
+# -----------------------------------------------------------------------
+
+
+AC_DEFUN([LF_CXX_PORTABILITY],[
+
+  dnl
+  dnl Check for common C++ portability problems
+  dnl
+
+  dnl AC_LANG_PUSH
+  dnl AC_LANG_CPLUSPLUS
+  AC_LANG_SAVE
+  AC_LANG_CPLUSPLUS
+
+
+  dnl Test whether C++ has std::isnan
+  AC_MSG_CHECKING(whether C++ has std::isnan)
+  AC_TRY_COMPILE([#include <cmath>], [
+   std::isnan(0);
+], [ AC_MSG_RESULT(yes)
+	AC_DEFINE(CXX_HAS_STD_ISNAN,[],[Define if has std::isnan]) ],
+   [ AC_MSG_RESULT(no) ])
+
+  dnl Done with the portability checks
+  dnl AC_LANG_POP([C++])
+  AC_LANG_RESTORE
+])
+
diff --git a/volk/config/lf_warnings.m4 b/volk/config/lf_warnings.m4
new file mode 100644
index 000000000..d40c77f14
--- /dev/null
+++ b/volk/config/lf_warnings.m4
@@ -0,0 +1,121 @@
+dnl Copyright (C) 1988 Eleftherios Gkioulekas <lf@amath.washington.edu>
+dnl Copyright (C) 2009 Free Software Foundation, Inc.
+dnl  
+dnl This program is free software; you can redistribute it and/or modify
+dnl it under the terms of the GNU General Public License as published by
+dnl the Free Software Foundation; either version 3 of the License, or
+dnl (at your option) any later version.
+dnl 
+dnl This program is distributed in the hope that it will be useful,
+dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
+dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+dnl GNU General Public License for more details.
+dnl 
+dnl You should have received a copy of the GNU General Public License
+dnl along with this program; if not, write to the Free Software 
+dnl Foundation, Inc., 51 Franklin Street, Boston, MA 02110-1301, USA.
+dnl 
+dnl As a special exception to the GNU General Public License, if you 
+dnl distribute this file as part of a program that contains a configuration 
+dnl script generated by Autoconf, you may include it under the same 
+dnl distribution terms that you use for the rest of that program.
+
+# --------------------------------------------------------------------------
+# Check whether the C++ compiler accepts a certain flag
+# If it does it adds the flag to lf_CXXFLAGS
+# If it does not then it returns an error to lf_ok
+# Usage:
+#   LF_CHECK_CXX_FLAG(-flag1 -flag2 -flag3 ...)
+# -------------------------------------------------------------------------
+
+AC_DEFUN([LF_CHECK_CXX_FLAG],[
+  echo 'void f(){}' > conftest.cc
+  for i in $1
+  do
+    AC_MSG_CHECKING([whether $CXX accepts $i])
+    if test -z "`${CXX} $i -c conftest.cc 2>&1`"
+    then
+      lf_CXXFLAGS="${lf_CXXFLAGS} $i"
+      AC_MSG_RESULT(yes)
+    else
+      AC_MSG_RESULT(no)
+    fi
+  done
+  rm -f conftest.cc conftest.o
+  AC_SUBST(lf_CXXFLAGS)
+])
+
+# --------------------------------------------------------------------------
+# Check whether the C compiler accepts a certain flag
+# If it does it adds the flag to lf_CFLAGS
+# If it does not then it returns an error to lf_ok
+# Usage:
+#  LF_CHECK_CC_FLAG(-flag1 -flag2 -flag3 ...)
+# -------------------------------------------------------------------------
+
+AC_DEFUN([LF_CHECK_CC_FLAG],[
+  echo 'void f(){}' > conftest.c
+  for i in $1
+  do
+    AC_MSG_CHECKING([whether $CC accepts $i])
+    if test -z "`${CC} $i -c conftest.c 2>&1`"
+    then
+      lf_CFLAGS="${lf_CFLAGS} $i"
+      AC_MSG_RESULT(yes)
+    else
+      AC_MSG_RESULT(no)
+    fi
+  done
+  rm -f conftest.c conftest.o
+  AC_SUBST(lf_CFLAGS)
+])
+
+# --------------------------------------------------------------------------
+# Check whether the Fortran compiler accepts a certain flag
+# If it does it adds the flag to lf_FFLAGS
+# If it does not then it returns an error to lf_ok
+# Usage:
+#  LF_CHECK_F77_FLAG(-flag1 -flag2 -flag3 ...)
+# -------------------------------------------------------------------------
+
+AC_DEFUN([LF_CHECK_F77_FLAG],[
+  cat << EOF > conftest.f
+c....:++++++++++++++++++++++++
+      PROGRAM MAIN
+      PRINT*,'Hello World!'
+      END
+EOF
+  for i in $1
+  do
+    AC_MSG_CHECKING([whether $F77 accepts $i])
+    if test -z "`${F77} $i -c conftest.f 2>&1`"
+    then
+      lf_FFLAGS="${lf_FFLAGS} $i"
+      AC_MSG_RESULT(yes)  
+    else
+      AC_MSG_RESULT(no)
+    fi
+  done
+  rm -f conftest.f conftest.o
+  AC_SUBST(lf_FFLAGS)
+])
+
+# ----------------------------------------------------------------------
+# Enable compiler warnings. 
+# Call this command AFTER you have configured ALL your compilers. 
+# ----------------------------------------------------------------------
+
+AC_DEFUN([LF_SET_WARNINGS],[
+  dnl Warnings for the two main compilers
+  dnl add -Wextra when you're got time to fix a bunch of them ;-)
+  cc_warning_flags="-Wall -Werror-implicit-function-declaration"
+  cxx_warning_flags="-Wall -Woverloaded-virtual"
+  if test -n "${CC}"
+  then
+    LF_CHECK_CC_FLAG($cc_warning_flags)
+  fi
+  if test -n "${CXX}" 
+  then
+    LF_CHECK_CXX_FLAG($cxx_warning_flags)
+  fi
+])
diff --git a/volk/config/lf_x11.m4 b/volk/config/lf_x11.m4
new file mode 100644
index 000000000..460cd605f
--- /dev/null
+++ b/volk/config/lf_x11.m4
@@ -0,0 +1,39 @@
+dnl Copyright (C) 1988 Eleftherios Gkioulekas <lf@amath.washington.edu>
+dnl  
+dnl This program is free software; you can redistribute it and/or modify
+dnl it under the terms of the GNU General Public License as published by
+dnl the Free Software Foundation; either version 3 of the License, or
+dnl (at your option) any later version.
+dnl 
+dnl This program is distributed in the hope that it will be useful,
+dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
+dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+dnl GNU General Public License for more details.
+dnl 
+dnl You should have received a copy of the GNU General Public License
+dnl along with this program; if not, write to the Free Software 
+dnl Foundation, Inc., 51 Franklin Street, Boston, MA 02110-1301, USA.
+dnl 
+dnl As a special exception to the GNU General Public License, if you 
+dnl distribute this file as part of a program that contains a configuration 
+dnl script generated by Autoconf, you may include it under the same 
+dnl distribution terms that you use for the rest of that program.
+
+ 
+#-----------------------------------------------------------------------
+# This macro searches for Xlib and when it finds it it adds the 
+# appropriate flags to CXXFLAGS and export the link sequence to 
+# the variable XLIB. 
+# In your configure.in file add:
+#   LF_PATH_XLIB
+# In your Makefile.am add
+#   program_LDADD = .... $(XLIB)
+#------------------------------------------------------------------------
+
+AC_DEFUN([LF_PATH_XLIB],[
+  AC_PATH_XTRA
+  CXXFLAGS="$CXXFLAGS $X_CFLAGS"
+  XLIB="$X_LIBS $X_PRE_LIBS -lX11 $X_EXTRA_LIBS"
+  AC_SUBST(XLIB)
+])
+
diff --git a/volk/config/lv_configure.m4 b/volk/config/lv_configure.m4
new file mode 100644
index 000000000..fc6a0a567
--- /dev/null
+++ b/volk/config/lv_configure.m4
@@ -0,0 +1,110 @@
+dnl all this stuff taken and modified from GNURADIO!
+dnl
+dnl LV_CONFIGURE
+dnl
+dnl Handles the bulk of the configure.ac work for an out-of-tree build
+dnl
+dnl You must invoke:
+dnl
+dnl   AC_INIT(package_name,version)
+dnl   AC_PREREQ(2.57)
+dnl   AC_CONFIG_AUX_DIR([.])	
+dnl
+dnl in configure.ac before LV_CONFIGURE
+dnl
+dnl
+dnl N.B., this is an m4_define because if it were an AC_DEFUN it would
+dnl get called too late to be useful.
+
+m4_define([LV_CONFIGURE],
+[
+
+  AC_CONFIG_SRCDIR([config/lv_configure.m4])
+  AC_CONFIG_AUX_DIR([.])	
+  AM_CONFIG_HEADER(config.h)
+
+  AC_CANONICAL_BUILD
+  AC_CANONICAL_HOST
+  AC_CANONICAL_TARGET
+
+  AM_INIT_AUTOMAKE(libvector,0.0svn)
+
+  LF_CONFIGURE_CC
+  LF_CONFIGURE_CXX
+  GR_LIB64		dnl check for lib64 suffix after choosing compilers
+
+  LV_GCC_VERSION_WORKAROUND
+  dnl add ${prefix}/lib${gr_libdir_suffix}/pkgconfig to the head of the PKG_CONFIG_PATH
+  if test x${PKG_CONFIG_PATH} = x; then
+      PKG_CONFIG_PATH=${prefix}/lib${gr_libdir_suffix}/pkgconfig
+  else
+      PKG_CONFIG_PATH=${prefix}/lib${gr_libdir_suffix}/pkgconfig:${PKG_CONFIG_PATH}
+  fi
+  export PKG_CONFIG_PATH
+
+  LF_SET_WARNINGS
+dnl  GR_SET_GPROF
+dnl  GR_SET_PROF
+  AM_PROG_AS
+  AC_PROG_LN_S
+  AC_PROG_MAKE_SET
+  AC_PROG_INSTALL
+  AC_PATH_PROG([RM_PROG], [rm])
+
+  AC_LIBTOOL_WIN32_DLL
+  AC_ENABLE_SHARED	dnl do build shared libraries... important for qa
+  AC_DISABLE_STATIC	dnl don't build static libraries... important for qa
+  m4_ifdef([LT_INIT],[LT_INIT],[AC_PROG_LIBTOOL])
+  dnl GR_FORTRAN
+
+  GR_NO_UNDEFINED	dnl do we need the -no-undefined linker flag
+dnl  GR_SCRIPTING		dnl Locate python, SWIG, etc
+
+dnl  AC_ARG_WITH([python],
+dnl    AC_HELP_STRING([--with-python], [Should we use python? [[default=yes]]]),
+dnl    [case "$with_python" in
+dnl      (no | yes) ;;
+dnl      (*) AC_MSG_ERROR([Invalid argument ($with_python) to --with-python]) ;;
+dnl     esac],
+dnl    [with_python=yes])
+
+dnl  AM_CONDITIONAL([USE_PYTHON], [test "$with_python" = yes])
+
+
+  dnl Set the c++ compiler that we use for the build system when cross compiling
+  if test "x$CXX_FOR_BUILD" = x
+  then
+    CXX_FOR_BUILD=${CXX}
+  fi
+  AC_SUBST(CXX_FOR_BUILD)
+
+  dnl Checks for header files.
+  AC_HEADER_STDC
+
+  dnl Checks for typedefs, structures, and compiler characteristics.
+  AC_C_CONST
+  AC_C_INLINE
+  AC_TYPE_SIZE_T
+  AC_HEADER_TIME
+  AC_C_BIGENDIAN
+
+  dnl Check for Mingw support
+  GR_PWIN32
+  GR_LIBGNURADIO_CORE_EXTRA_LDFLAGS
+  
+  LDFLAGS="$LDFLAGS $LIBGNURADIO_CORE_EXTRA_LDFLAGS"
+
+  AC_CHECK_PROG([XMLTO],[xmlto],[yes],[])
+  AM_CONDITIONAL([HAS_XMLTO], [test x$XMLTO = xyes])
+
+  dnl Define where to look for cppunit includes and libs
+  dnl sets CPPUNIT_CFLAGS and CPPUNIT_LIBS
+  dnl Try using pkg-config first, then fall back to cppunit-config.
+  PKG_CHECK_EXISTS(cppunit,
+    [PKG_CHECK_MODULES(CPPUNIT, cppunit >= 1.9.14)],
+    [AM_PATH_CPPUNIT([1.9.14],[],
+		     [AC_MSG_ERROR([LIBVECTOR requires cppunit.  Stop])])])
+
+dnl  PKG_CHECK_MODULES(GNURADIO_CORE, gnuradio-core >= 3)
+dnl  LIBS="$LIBS $GNURADIO_CORE_LIBS"
+])
diff --git a/volk/config/mkstemp.m4 b/volk/config/mkstemp.m4
new file mode 100644
index 000000000..4af0f0a9b
--- /dev/null
+++ b/volk/config/mkstemp.m4
@@ -0,0 +1,89 @@
+#serial 4
+
+# On some hosts (e.g., HP-UX 10.20, SunOS 4.1.4, Solaris 2.5.1), mkstemp has a
+# silly limit that it can create no more than 26 files from a given template.
+# Other systems lack mkstemp altogether.
+# On OSF1/Tru64 V4.0F, the system-provided mkstemp function can create
+# only 32 files per process.
+# On systems like the above, arrange to use the replacement function.
+AC_DEFUN([UTILS_FUNC_MKSTEMP],
+[dnl
+  AC_REPLACE_FUNCS(mkstemp)
+  if test $ac_cv_func_mkstemp = no; then
+    utils_cv_func_mkstemp_limitations=yes
+  else
+    AC_CACHE_CHECK([for mkstemp limitations],
+      utils_cv_func_mkstemp_limitations,
+      [
+	AC_TRY_RUN([
+#         include <stdlib.h>
+	  int main ()
+	  {
+	    int i;
+	    for (i = 0; i < 70; i++)
+	      {
+		char template[] = "conftestXXXXXX";
+		int fd = mkstemp (template);
+		if (fd == -1)
+		  exit (1);
+		close (fd);
+	      }
+	    exit (0);
+	  }
+	  ],
+	utils_cv_func_mkstemp_limitations=no,
+	utils_cv_func_mkstemp_limitations=yes,
+	utils_cv_func_mkstemp_limitations=yes
+	)
+      ]
+    )
+  fi
+
+  if test $utils_cv_func_mkstemp_limitations = yes; then
+    AC_LIBOBJ(mkstemp)
+    AC_LIBOBJ(tempname)
+    AC_DEFINE(mkstemp, rpl_mkstemp,
+      [Define to rpl_mkstemp if the replacement function should be used.])
+    gl_PREREQ_MKSTEMP
+    jm_PREREQ_TEMPNAME
+  fi
+])
+
+# Prerequisites of lib/mkstemp.c.
+AC_DEFUN([gl_PREREQ_MKSTEMP],
+[
+  AH_BOTTOM(
+  [
+  #ifndef HAVE_MKSTEMP
+  #ifdef  __cplusplus
+  extern "C" {
+  #endif
+  int rpl_mkstemp (char *templ);
+  #ifdef  __cplusplus
+  }
+  #endif
+  #endif
+  ])
+])
+
+# Prerequisites of lib/tempname.c.
+AC_DEFUN([jm_PREREQ_TEMPNAME],
+[
+  AC_REQUIRE([AC_HEADER_STAT])
+  AC_CHECK_HEADERS_ONCE(fcntl.h sys/time.h unistd.h)
+  AC_CHECK_HEADERS(stdint.h)
+  AC_CHECK_FUNCS(__secure_getenv gettimeofday lstat)
+  AC_CHECK_DECLS_ONCE(getenv)
+  # AC_REQUIRE([jm_AC_TYPE_UINTMAX_T])
+
+  dnl Under Win32, mkdir prototype in io.h has only one arg
+  AC_MSG_CHECKING(whether mkdir accepts only one arg)
+  AC_TRY_COMPILE([#include <sys/types.h>
+       #include <sys/stat.h>
+       #include <fcntl.h>], [
+       mkdir("")
+  ], [ AC_MSG_RESULT(yes)
+     AC_DEFINE(MKDIR_TAKES_ONE_ARG,[],[Define if mkdir accepts only one arg]) ],
+   [ AC_MSG_RESULT(no)
+     ])
+])
diff --git a/volk/config/onceonly.m4 b/volk/config/onceonly.m4
new file mode 100644
index 000000000..f6fec37cb
--- /dev/null
+++ b/volk/config/onceonly.m4
@@ -0,0 +1,63 @@
+# onceonly.m4 serial 3
+dnl Copyright (C) 2002, 2003 Free Software Foundation, Inc.
+dnl This file is free software, distributed under the terms of the GNU
+dnl General Public License.  As a special exception to the GNU General
+dnl Public License, this file may be distributed as part of a program
+dnl that contains a configuration script generated by Autoconf, under
+dnl the same distribution terms as the rest of that program.
+
+dnl This file defines some "once only" variants of standard autoconf macros.
+dnl   AC_CHECK_HEADERS_ONCE         like  AC_CHECK_HEADERS
+dnl   AC_CHECK_FUNCS_ONCE           like  AC_CHECK_FUNCS
+dnl   AC_CHECK_DECLS_ONCE           like  AC_CHECK_DECLS
+dnl   AC_REQUIRE([AC_HEADER_STDC])  like  AC_HEADER_STDC
+dnl The advantage is that the check for each of the headers/functions/decls
+dnl will be put only once into the 'configure' file. It keeps the size of
+dnl the 'configure' file down, and avoids redundant output when 'configure'
+dnl is run.
+dnl The drawback is that the checks cannot be conditionalized. If you write
+dnl   if some_condition; then gl_CHECK_HEADERS(stdlib.h); fi
+dnl inside an AC_DEFUNed function, the gl_CHECK_HEADERS macro call expands to
+dnl empty, and the check will be inserted before the body of the AC_DEFUNed
+dnl function.
+
+dnl Autoconf version 2.57 or newer is recommended.
+AC_PREREQ(2.54)
+
+# AC_CHECK_HEADERS_ONCE(HEADER1 HEADER2 ...) is a once-only variant of
+# AC_CHECK_HEADERS(HEADER1 HEADER2 ...).
+AC_DEFUN([AC_CHECK_HEADERS_ONCE], [
+  :
+  AC_FOREACH([gl_HEADER_NAME], [$1], [
+    AC_DEFUN([gl_CHECK_HEADER_]m4_quote(translit(defn([gl_HEADER_NAME]),
+                                                 [-./], [___])), [
+      AC_CHECK_HEADERS(gl_HEADER_NAME)
+    ])
+    AC_REQUIRE([gl_CHECK_HEADER_]m4_quote(translit(gl_HEADER_NAME,
+                                                   [-./], [___])))
+  ])
+])
+
+# AC_CHECK_FUNCS_ONCE(FUNC1 FUNC2 ...) is a once-only variant of
+# AC_CHECK_FUNCS(FUNC1 FUNC2 ...).
+AC_DEFUN([AC_CHECK_FUNCS_ONCE], [
+  :
+  AC_FOREACH([gl_FUNC_NAME], [$1], [
+    AC_DEFUN([gl_CHECK_FUNC_]defn([gl_FUNC_NAME]), [
+      AC_CHECK_FUNCS(defn([gl_FUNC_NAME]))
+    ])
+    AC_REQUIRE([gl_CHECK_FUNC_]defn([gl_FUNC_NAME]))
+  ])
+])
+
+# AC_CHECK_DECLS_ONCE(DECL1 DECL2 ...) is a once-only variant of
+# AC_CHECK_DECLS(DECL1, DECL2, ...).
+AC_DEFUN([AC_CHECK_DECLS_ONCE], [
+  :
+  AC_FOREACH([gl_DECL_NAME], [$1], [
+    AC_DEFUN([gl_CHECK_DECL_]defn([gl_DECL_NAME]), [
+      AC_CHECK_DECLS(defn([gl_DECL_NAME]))
+    ])
+    AC_REQUIRE([gl_CHECK_DECL_]defn([gl_DECL_NAME]))
+  ])
+])
diff --git a/volk/config/pkg.m4 b/volk/config/pkg.m4
new file mode 100644
index 000000000..2d4d96109
--- /dev/null
+++ b/volk/config/pkg.m4
@@ -0,0 +1,201 @@
+# pkg.m4 - Macros to locate and utilise pkg-config.            -*- Autoconf -*-
+# 
+# Copyright © 2004 Scott James Remnant <scott@netsplit.com>.
+# Copyright © 2008 Free Software Foundation, Inc.
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+# General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+#
+# As a special exception to the GNU General Public License, if you
+# distribute this file as part of a program that contains a
+# configuration script generated by Autoconf, you may include it under
+# the same distribution terms that you use for the rest of that program.
+
+# PKG_PROG_PKG_CONFIG([MIN-VERSION])
+# ----------------------------------
+AC_DEFUN([PKG_PROG_PKG_CONFIG],
+[m4_pattern_forbid([^_?PKG_[A-Z_]+$])
+m4_pattern_allow([^PKG_CONFIG(_PATH)?$])
+AC_ARG_VAR([PKG_CONFIG], [path to pkg-config utility])dnl
+if test "x$ac_cv_env_PKG_CONFIG_set" != "xset"; then
+	AC_PATH_TOOL([PKG_CONFIG], [pkg-config])
+fi
+if test -n "$PKG_CONFIG"; then
+	_pkg_min_version=m4_default([$1], [0.18])
+	AC_MSG_CHECKING([pkg-config is at least version $_pkg_min_version])
+	if $PKG_CONFIG --atleast-pkgconfig-version $_pkg_min_version; then
+		AC_MSG_RESULT([yes])
+	else
+		AC_MSG_RESULT([no])
+		PKG_CONFIG=""
+	fi
+		
+fi[]dnl
+])# PKG_PROG_PKG_CONFIG
+
+# PKG_CHECK_EXISTS(MODULES, [ACTION-IF-FOUND], [ACTION-IF-NOT-FOUND])
+#
+# Check to see whether a particular set of modules exists.  Similar
+# to PKG_CHECK_MODULES(), but does not set variables or print errors.
+#
+#
+# Similar to PKG_CHECK_MODULES, make sure that the first instance of
+# this or PKG_CHECK_MODULES is called, or make sure to call
+# PKG_CHECK_EXISTS manually
+# --------------------------------------------------------------
+AC_DEFUN([PKG_CHECK_EXISTS],
+[AC_REQUIRE([PKG_PROG_PKG_CONFIG])dnl
+if test -n "$PKG_CONFIG" && \
+    AC_RUN_LOG([$PKG_CONFIG --exists --print-errors "$1"]); then
+  m4_ifval([$2], [$2], [:])
+m4_ifvaln([$3], [else
+  $3])dnl
+fi])
+
+
+# _PKG_CONFIG([VARIABLE], [COMMAND], [MODULES])
+# ---------------------------------------------
+m4_define([_PKG_CONFIG],
+[if test -n "$PKG_CONFIG"; then
+    if test -n "$$1"; then
+        pkg_cv_[]$1="$$1"
+    else
+        PKG_CHECK_EXISTS([$3],
+                         [pkg_cv_[]$1=`$PKG_CONFIG --[]$2 "$3" 2>/dev/null`],
+			 [pkg_failed=yes])
+    fi
+else
+	pkg_failed=untried
+fi[]dnl
+])# _PKG_CONFIG
+
+# _PKG_SHORT_ERRORS_SUPPORTED
+# -----------------------------
+AC_DEFUN([_PKG_SHORT_ERRORS_SUPPORTED],
+[AC_REQUIRE([PKG_PROG_PKG_CONFIG])
+if $PKG_CONFIG --atleast-pkgconfig-version 0.20; then
+        _pkg_short_errors_supported=yes
+else
+        _pkg_short_errors_supported=no
+fi[]dnl
+])# _PKG_SHORT_ERRORS_SUPPORTED
+
+
+# PKG_CHECK_MODULES(VARIABLE-PREFIX, MODULES, [ACTION-IF-FOUND],
+# [ACTION-IF-NOT-FOUND])
+#
+# E.g.,
+#   PKG_CHECK_MODULES(GSTUFF, gtk+-2.0 >= 1.3 glib = 1.3.4, action-if, action-not)
+#   defines:
+#
+#     GSTUFF_LIBS
+#     GSTUFF_CFLAGS
+#     GSTUFF_INCLUDEDIR
+#     GSTUFF_CPPFLAGS    # the -I, -D and -U's out of CFLAGS
+#
+# see pkg-config man page also defines GSTUFF_PKG_ERRORS on error
+#
+# Note that if there is a possibility the first call to
+# PKG_CHECK_MODULES might not happen, you should be sure to include an
+# explicit call to PKG_PROG_PKG_CONFIG in your configure.ac
+#
+# --------------------------------------------------------------
+AC_DEFUN([PKG_CHECK_MODULES],[
+AC_REQUIRE([PKG_PROG_PKG_CONFIG])dnl
+AC_REQUIRE([AC_CANONICAL_HOST])dnl
+AC_REQUIRE([AC_CANONICAL_BUILD])dnl
+
+AC_ARG_VAR([$1][_CFLAGS], [C compiler flags for $1, overriding pkg-config])dnl
+AC_ARG_VAR([$1][_LIBS], [linker flags for $1, overriding pkg-config])dnl
+AC_ARG_VAR([$1][_INCLUDEDIR], [includedir for $1, overriding pkg-config])dnl
+
+pkg_failed=no
+AC_MSG_CHECKING([for $1])
+
+_PKG_CONFIG([$1][_CFLAGS], [cflags], [$2])
+
+if test x$cross_compiling = xyes
+then
+  dnl _PKG_CONFIG([$1][_LIBS], [libs-only-l --static], [$2])
+  _PKG_CONFIG([$1][_LIBS], [libs --static], [$2])
+  dnl prune out any -L/lib or -L/usr/lib since they're pointing to the wrong filesystem root
+  _pkg_tmp=
+  for flag in [$]pkg_cv_[$1][_LIBS]; do
+    case $flag in
+    (-L/lib* | -L/usr/lib* )	   ;; # ignore
+    (*) _pkg_tmp="$_pkg_tmp $flag" ;;
+    esac
+  done
+  pkg_cv_[$1][_LIBS]="$_pkg_tmp"
+else
+  _PKG_CONFIG([$1][_LIBS], [libs --static], [$2])
+fi
+
+_PKG_CONFIG([$1][_INCLUDEDIR], [variable=includedir], [$2])
+
+
+m4_define([_PKG_TEXT], [Alternatively, you may set the environment variables $1[]_CFLAGS
+and $1[]_LIBS to avoid the need to call pkg-config.
+See the pkg-config man page for more details.])
+
+if test $pkg_failed = yes; then
+        _PKG_SHORT_ERRORS_SUPPORTED
+        if test $_pkg_short_errors_supported = yes; then
+	        $1[]_PKG_ERRORS=`$PKG_CONFIG --short-errors --errors-to-stdout --print-errors "$2"`
+        else 
+	        $1[]_PKG_ERRORS=`$PKG_CONFIG --errors-to-stdout --print-errors "$2"`
+        fi
+	# Put the nasty error message in config.log where it belongs
+	echo "$$1[]_PKG_ERRORS" >&AS_MESSAGE_LOG_FD
+
+	ifelse([$4], , [AC_MSG_ERROR(dnl
+[Package requirements ($2) were not met:
+
+$$1_PKG_ERRORS
+
+Consider adjusting the PKG_CONFIG_PATH environment variable if you
+installed software in a non-standard prefix.
+
+_PKG_TEXT
+])],
+		[AC_MSG_RESULT([no])
+                $4])
+elif test $pkg_failed = untried; then
+	ifelse([$4], , [AC_MSG_FAILURE(dnl
+[The pkg-config script could not be found or is too old.  Make sure it
+is in your PATH or set the PKG_CONFIG environment variable to the full
+path to pkg-config.
+
+_PKG_TEXT
+
+To get pkg-config, see <http://pkg-config.freedesktop.org/>.])],
+		[$4])
+else
+	$1[]_CFLAGS=$pkg_cv_[]$1[]_CFLAGS
+	$1[]_LIBS=$pkg_cv_[]$1[]_LIBS
+	$1[]_INCLUDEDIR=$pkg_cv_[]$1[]_INCLUDEDIR
+
+	$1[]_CPPFLAGS=""
+	for flag in $$1[]_CFLAGS; do
+	  case $flag in
+          -I* | -D* | -U*) $1[]_CPPFLAGS="$$1[]_CPPFLAGS $flag" ;;
+          esac
+        done
+	pkg_cv_[]$1[]_CPPFLAGS=$$1[]_CPPFLAGS
+	AC_SUBST($1[]_CPPFLAGS)
+
+        AC_MSG_RESULT([yes])
+	ifelse([$3], , :, [$3])
+fi[]dnl
+])# PKG_CHECK_MODULES
diff --git a/volk/configure.ac b/volk/configure.ac
new file mode 100644
index 000000000..eb9fbdc55
--- /dev/null
+++ b/volk/configure.ac
@@ -0,0 +1,62 @@
+AC_INIT(libvector,0.0svn)
+AC_PREREQ(2.57)
+AC_CONFIG_AUX_DIR([.])	
+
+
+dnl This is kind of non-standard, but it sure shortens up this file :-)
+
+m4_include([config/lv_configure.m4])
+
+LV_CONFIGURE
+
+dnl Check for any libraries you need
+dnl AC_CHECK_LIBRARY
+
+dnl Check for header files you need
+dnl AC_CHECK_HEADERS(fcntl.h limits.h strings.h sys/ioctl.h sys/time.h unistd.h)
+dnl AC_CHECK_HEADERS(sys/mman.h)
+
+dnl Checks for library functions.
+dnl AC_CHECK_FUNCS([])
+
+dnl We pick up the boost cppflags, cxxflags and thread lib via GNURADIO_CORE
+dnl
+dnl If you need additional boost libraries, you'll need to
+dnl uncomment AX_BOOST_BASE, plus some of the following:
+dnl
+dnl calls AC_SUBST(BOOST_CPPFLAGS), AC_SUBST(BOOST_LDFLAGS) and defines HAVE_BOOST
+dnl AX_BOOST_BASE([1.35])
+dnl
+dnl All the rest of these call AC_SUBST(BOOST_<foo>_LIB) and define HAVE_BOOST_<foo>
+dnl
+dnl AX_BOOST_DATE_TIME
+dnl AX_BOOST_FILESYSTEM
+dnl AX_BOOST_IOSTREAMS
+dnl AX_BOOST_PROGRAM_OPTIONS
+dnl AX_BOOST_REGEX
+dnl AX_BOOST_SERIALIZATION
+dnl AX_BOOST_SIGNALS
+dnl AX_BOOST_SYSTEM
+dnl AX_BOOST_TEST_EXEC_MONITOR
+dnl AX_BOOST_UNIT_TEST_FRAMEWORK
+dnl AX_BOOST_WSERIALIZATION
+
+AC_CONFIG_HEADERS([volk_config.h])
+LV_SET_SIMD_FLAGS
+
+AC_CONFIG_FILES([\
+	  Makefile \
+	  config/Makefile \
+	  include/Makefile \
+	  include/volk/Makefile \
+	  lib/Makefile \
+	  volk.pc \
+	])
+
+
+
+
+
+AC_CONFIG_COMMANDS([run_system_cleanup], [chmod +x system_cleanup.sh && ./system_cleanup.sh $MYCPU $MYSUBCPU], [MYCPU=$MD_CPU MYSUBCPU=$MD_SUBCPU])
+
+AC_OUTPUT
diff --git a/volk/data/Makefile.am b/volk/data/Makefile.am
new file mode 100644
index 000000000..d0ef08bc9
--- /dev/null
+++ b/volk/data/Makefile.am
@@ -0,0 +1,30 @@
+#
+# Copyright 2008 Free Software Foundation, Inc.
+# 
+# This file is part of GNU Radio
+# 
+# GNU Radio is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 3, or (at your option)
+# any later version.
+# 
+# GNU Radio is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+# 
+# You should have received a copy of the GNU General Public License along
+# with this program; if not, write to the Free Software Foundation, Inc.,
+# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+#
+
+include $(top_srcdir)/Makefile.common
+
+# List your data files here.  Please keep it under ~ 100KB.
+#
+# EXTRA_DIST = \
+#     foo.dat \
+#     bar.dat \
+#     baz.dat
+
+EXTRA_DIST = 
diff --git a/volk/doc/Makefile.am b/volk/doc/Makefile.am
new file mode 100644
index 000000000..cddd72435
--- /dev/null
+++ b/volk/doc/Makefile.am
@@ -0,0 +1,32 @@
+#
+# Copyright 2008 Free Software Foundation, Inc.
+# 
+# This file is part of GNU Radio
+# 
+# GNU Radio is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 3, or (at your option)
+# any later version.
+# 
+# GNU Radio is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+# 
+# You should have received a copy of the GNU General Public License along
+# with this program; if not, write to the Free Software Foundation, Inc.,
+# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+#
+
+include $(top_srcdir)/Makefile.common
+
+# List your doc files here
+#
+# EXTRA_DIST = \
+#     on-the-meaning-of-life \
+#     whatever-doesnt-make-you-stronger-kills-you
+#     
+
+EXTRA_DIST = 
+
+# and/or run doxygen, xmlto, etc
diff --git a/volk/include/Makefile.am b/volk/include/Makefile.am
new file mode 100644
index 000000000..375d1a7d5
--- /dev/null
+++ b/volk/include/Makefile.am
@@ -0,0 +1,23 @@
+#
+# Copyright 2008 Free Software Foundation, Inc.
+# 
+# This file is part of GNU Radio
+# 
+# GNU Radio is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 3, or (at your option)
+# any later version.
+# 
+# GNU Radio is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+# 
+# You should have received a copy of the GNU General Public License along
+# with this program; if not, write to the Free Software Foundation, Inc.,
+# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+#
+
+include $(top_srcdir)/Makefile.common
+
+SUBDIRS = volk
diff --git a/volk/include/volk/Makefile.am b/volk/include/volk/Makefile.am
new file mode 100644
index 000000000..fd59ab795
--- /dev/null
+++ b/volk/include/volk/Makefile.am
@@ -0,0 +1,133 @@
+#
+# Copyright 2008 Free Software Foundation, Inc.
+# 
+# This file is part of GNU Radio
+# 
+# GNU Radio is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 3, or (at your option)
+# any later version.
+# 
+# GNU Radio is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+# 
+# You should have received a copy of the GNU General Public License along
+# with this program; if not, write to the Free Software Foundation, Inc.,
+# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+#
+
+include $(top_srcdir)/Makefile.common
+
+ourinclude_HEADERS = \
+	volk_complex.h \
+	volk_common.h \
+	volk_config_fixed.h \
+	volk_runtime.h \
+	volk_config.h \
+	volk_tables.h \
+	volk_typedefs.h \
+	volk.h \
+	volk_cpu.h \
+	volk_environment_init.h \
+	volk_16s_add_quad_aligned16.h \
+	volk_16s_branch_4_state_8_aligned16.h \
+	volk_16sc_deinterleave_16s_aligned16.h \
+	volk_16sc_deinterleave_32f_aligned16.h \
+	volk_16sc_deinterleave_real_16s_aligned16.h \
+	volk_16sc_deinterleave_real_32f_aligned16.h \
+	volk_16sc_deinterleave_real_8s_aligned16.h \
+	volk_16sc_magnitude_16s_aligned16.h \
+	volk_16sc_magnitude_32f_aligned16.h \
+	volk_16s_convert_32f_aligned16.h \
+	volk_16s_convert_32f_unaligned16.h \
+	volk_16s_convert_8s_aligned16.h \
+	volk_16s_convert_8s_unaligned16.h \
+	volk_16s_max_star_aligned16.h \
+	volk_16s_max_star_horizontal_aligned16.h \
+	volk_16s_permute_and_scalar_add_aligned16.h \
+	volk_16s_quad_max_star_aligned16.h \
+	volk_16u_byteswap_aligned16.h \
+	volk_32f_accumulator_aligned16.h \
+	volk_32f_add_aligned16.h \
+	volk_32fc_32f_multiply_aligned16.h \
+	volk_32fc_32f_power_32fc_aligned16.h \
+	volk_32f_calc_spectral_noise_floor_aligned16.h \
+	volk_32fc_atan2_32f_aligned16.h \
+	volk_32fc_conjugate_dot_prod_aligned16.h \
+	volk_32fc_deinterleave_32f_aligned16.h \
+	volk_32fc_deinterleave_64f_aligned16.h \
+	volk_32fc_deinterleave_real_16s_aligned16.h \
+	volk_32fc_deinterleave_real_32f_aligned16.h \
+	volk_32fc_deinterleave_real_64f_aligned16.h \
+	volk_32fc_dot_prod_aligned16.h \
+	volk_32fc_index_max_aligned16.h \
+	volk_32fc_magnitude_16s_aligned16.h \
+	volk_32fc_magnitude_32f_aligned16.h \
+	volk_32fc_multiply_aligned16.h \
+	volk_32f_convert_16s_aligned16.h \
+	volk_32f_convert_16s_unaligned16.h \
+	volk_32f_convert_32s_aligned16.h \
+	volk_32f_convert_32s_unaligned16.h \
+	volk_32f_convert_64f_aligned16.h \
+	volk_32f_convert_64f_unaligned16.h \
+	volk_32f_convert_8s_aligned16.h \
+	volk_32f_convert_8s_unaligned16.h \
+	volk_32fc_power_spectral_density_32f_aligned16.h \
+	volk_32fc_power_spectrum_32f_aligned16.h \
+	volk_32fc_square_dist_aligned16.h \
+	volk_32fc_square_dist_scalar_mult_aligned16.h \
+	volk_32f_divide_aligned16.h \
+	volk_32f_dot_prod_aligned16.h \
+	volk_32f_dot_prod_unaligned16.h \
+	volk_32f_fm_detect_aligned16.h \
+	volk_32f_index_max_aligned16.h \
+	volk_32f_interleave_16sc_aligned16.h \
+	volk_32f_interleave_32fc_aligned16.h \
+	volk_32f_max_aligned16.h \
+	volk_32f_min_aligned16.h \
+	volk_32f_multiply_aligned16.h \
+	volk_32f_normalize_aligned16.h \
+	volk_32f_power_aligned16.h \
+	volk_32f_sqrt_aligned16.h \
+	volk_32f_stddev_aligned16.h \
+	volk_32f_stddev_and_mean_aligned16.h \
+	volk_32f_subtract_aligned16.h \
+	volk_32f_sum_of_poly_aligned16.h \
+	volk_32s_and_aligned16.h \
+	volk_32s_convert_32f_aligned16.h \
+	volk_32s_convert_32f_unaligned16.h \
+	volk_32s_or_aligned16.h \
+	volk_32u_byteswap_aligned16.h \
+	volk_32u_popcnt_aligned16.h \
+	volk_64f_convert_32f_aligned16.h \
+	volk_64f_convert_32f_unaligned16.h \
+	volk_64f_max_aligned16.h \
+	volk_64f_min_aligned16.h \
+	volk_64u_byteswap_aligned16.h \
+	volk_64u_popcnt_aligned16.h \
+	volk_8sc_deinterleave_16s_aligned16.h \
+	volk_8sc_deinterleave_32f_aligned16.h \
+	volk_8sc_deinterleave_real_16s_aligned16.h \
+	volk_8sc_deinterleave_real_32f_aligned16.h \
+	volk_8sc_deinterleave_real_8s_aligned16.h \
+	volk_8sc_multiply_conjugate_16sc_aligned16.h \
+	volk_8sc_multiply_conjugate_32fc_aligned16.h \
+	volk_8s_convert_16s_aligned16.h \
+	volk_8s_convert_16s_unaligned16.h \
+	volk_8s_convert_32f_aligned16.h \
+	volk_8s_convert_32f_unaligned16.h 
+
+distclean-local: 
+	rm -f volk_config_fixed.h 
+	rm -f volk_config.h
+	rm -f volk_cpu.h
+	rm -f volk.h
+	rm -f volk_registry.h
+	rm -f volk_runtime.h
+	rm -f volk_typedefs.h
+	rm -f volk_tables.h
+	rm -f *.pyc
+	rm -f Makefile.in
+	rm -f volk_environment_init.h
+\ No newline at end of file
diff --git a/volk/include/volk/archs.xml b/volk/include/volk/archs.xml
new file mode 100644
index 000000000..c9c441008
--- /dev/null
+++ b/volk/include/volk/archs.xml
@@ -0,0 +1,133 @@
+<!-- archs appear in order of significance for blind, de-facto version ordering -->
+<grammar>
+
+<arch name="generic" type="all">
+  <flag>none</flag>
+</arch>
+
+<arch name="altivec" type="powerpc">
+  <flag>maltivec</flag>
+</arch>
+
+<arch name="32" type="x86">
+  <op>0x80000001</op>
+  <flag>m32</flag>
+  <reg>d</reg>
+  <shift>29</shift>
+  <val>0</val>
+</arch> 
+
+<arch name="64" type="x86">
+  <op>0x80000001</op>
+  <reg>d</reg>
+  <shift>29</shift>
+  <flag>m64</flag>
+  <val>1</val>
+</arch>
+
+<arch name="3dnow" type="x86">
+  <op>0x80000001</op>
+  <reg>d</reg>
+  <shift>31</shift>
+  <flag>m3dnow</flag>
+  <val>1</val>
+</arch>
+
+<arch name="abm" type="x86">
+  <val>1</val>
+  <op>0x80000001</op>
+  <reg>d</reg>
+  <shift>5</shift>
+  <flag>sse4.2</flag>
+</arch>
+
+<arch name="popcount" type="x86">
+  <val>1</val>
+  <op>1</op>
+  <reg>c</reg>
+  <shift>23</shift>
+  <flag>mpopcnt</flag>
+</arch>
+
+<arch name="mmx" type="x86">
+  <val>1</val>
+  <op>1</op>
+  <reg>d</reg>
+  <shift>23</shift>
+  <flag>mmmx</flag>
+</arch>
+
+
+<arch name="sse" type="x86">
+  <val>1</val>
+  <op>1</op>
+  <reg>d</reg>
+  <shift>25</shift>
+  <flag>msse</flag>
+  <environment>_MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_ON);</environment>
+  <include>xmmintrin.h</include>
+</arch>
+
+
+<arch name="sse2" type="x86">
+  <val>1</val>
+  <op>1</op>
+  <reg>d</reg>
+  <shift>26</shift>
+  <flag>msse2</flag>
+</arch>
+
+<arch name="sse3" type="x86">
+  <val>1</val>
+  <op>1</op>
+  <reg>c</reg>
+  <shift>0</shift>
+  <flag>msse3</flag>
+  <environment>_MM_SET_DENORMALS_ZERO_MODE(_MM_DENORMALS_ZERO_ON);</environment>
+  <include>pmmintrin.h</include>
+</arch>
+
+<arch name="ssse3" type="x86">
+  <val>1</val>
+  <op>1</op>
+  <reg>c</reg>
+  <shift>9</shift>
+  <flag>mssse3</flag>
+</arch>
+
+<arch name="sse4_a" type="x86">
+  <val>1</val>
+  <op>0x80000001</op>
+  <reg>c</reg>
+  <shift>6</shift>
+  <flag>msse4a</flag>
+</arch>
+
+
+<arch name="sse4_1" type="x86">
+  <val>1</val>
+  <op>1</op>
+  <reg>c</reg>
+  <shift>19</shift>
+  <flag>msse4.1</flag>
+</arch>
+
+<arch name="sse4_2" type="x86">
+  <val>1</val>
+  <op>1</op>
+  <reg>c</reg>
+  <shift>20</shift>
+  <flag>msse4.2</flag>
+</arch>
+
+
+<arch name="avx" type="x86">
+  <val>1</val>
+  <op>1</op>
+  <reg>c</reg>
+  <shift>28</shift>
+  <flag>mavx</flag>
+</arch>
+
+
+</grammar>
diff --git a/volk/include/volk/emit_omnilog.py b/volk/include/volk/emit_omnilog.py
new file mode 100644
index 000000000..309d7e578
--- /dev/null
+++ b/volk/include/volk/emit_omnilog.py
@@ -0,0 +1,13 @@
+def emit_prolog():
+    tempstring = "";
+    tempstring = tempstring + '#ifdef __cplusplus\n';
+    tempstring = tempstring + 'extern "C" {\n';
+    tempstring = tempstring + '#endif\n';
+    return tempstring;
+def emit_epilog():
+    tempstring = "";
+    tempstring = tempstring + '#ifdef __cplusplus\n';
+    tempstring = tempstring + '}\n';
+    tempstring = tempstring + '#endif\n';
+    return tempstring;
+
diff --git a/volk/include/volk/make_c.py b/volk/include/volk/make_c.py
new file mode 100644
index 000000000..08d6d949c
--- /dev/null
+++ b/volk/include/volk/make_c.py
@@ -0,0 +1,76 @@
+from xml.dom import minidom
+import string
+from volk_regexp import *
+
+
+def make_c(funclist, taglist, arched_arglist, retlist, my_arglist, fcountlist) :
+    tempstring = "";
+    tempstring = tempstring + '/*this file is auto generated by volk_register.py*/';
+    
+    tempstring = tempstring + '/*this file is auto generated by volk_register.py*/';
+    tempstring = tempstring + '\n\n#include<volk/volk.h>\n';
+    tempstring = tempstring + '#include<volk/volk_tables.h>\n';
+    tempstring = tempstring + '#include<volk/volk_typedefs.h>\n';
+    tempstring = tempstring + '#include<volk/volk_registry.h>\n';
+    tempstring = tempstring + '#include<string.h>\n';
+    for func in funclist:
+        tempstring = tempstring + "#include<volk/" + func + ".h>\n" ;
+    tempstring = tempstring + '\n';
+
+    tempstring = tempstring + "static inline unsigned int volk_get_index(const char** indices, const char* arch, const int* arch_defs) {\n";
+    tempstring = tempstring + "    int i = 1;\n"
+    tempstring = tempstring + "    for(;i<arch_defs[0];++i){\n"
+    tempstring = tempstring + "        if (strcmp(arch, indices[i]) == 0) {\n"
+    tempstring = tempstring + "            return i;\n"
+    tempstring = tempstring + "        }\n"
+    tempstring = tempstring + "    }\n"
+    tempstring = tempstring + "    return 0;\n"
+    tempstring = tempstring + "}\n"
+
+
+    for i in range(len(funclist)): 
+        tempstring = tempstring + "static const " + replace_volk.sub("p", funclist[i]) + " " + funclist[i] + "_archs[] = {\n";
+        
+        tags_counter = 0;    
+        for arch_list in fcountlist[i]:
+            tempstring = tempstring + "#if LV_HAVE_"
+            for ind in range(len(arch_list)):
+                
+                tempstring = tempstring + arch_list[ind];
+                if ind < len(arch_list) - 1:
+                    tempstring = tempstring + " && LV_HAVE_";
+                
+            tempstring = tempstring + "\n " + funclist[i] + "_" + str(taglist[i][tags_counter]) + ",\n#endif\n";
+            tags_counter = tags_counter + 1;
+                
+        lindex = tempstring.rfind(",");
+        tempstring = tempstring[0:lindex] + string.replace(tempstring[lindex:len(tempstring)], ",", "");
+        tempstring = tempstring + "};\n\n";
+
+        tempstring = tempstring + "static const char* " + funclist[i] + "_indices[] = {\n";
+        
+        tags_counter = 0;    
+        for arch_list in fcountlist[i]:
+            tempstring = tempstring + "#if LV_HAVE_"
+            for ind in range(len(arch_list)):
+                
+                tempstring = tempstring + arch_list[ind];
+                if ind < len(arch_list) - 1:
+                    tempstring = tempstring + " && LV_HAVE_";
+                
+            tempstring = tempstring + "\n \"" + str(taglist[i][tags_counter]) + "\",\n#endif\n";
+            tags_counter = tags_counter + 1;
+                
+        lindex = tempstring.rfind(",");
+        tempstring = tempstring[0:lindex] + string.replace(tempstring[lindex:len(tempstring)], ",", "");
+        tempstring = tempstring + "};\n\n";
+                                                                          
+        tempstring = tempstring + retlist[i] + "inline " + funclist[i] + "_manual" + arched_arglist[i] + '\n';
+        tempstring = tempstring + "return " + funclist[i] + "_archs[volk_get_index(" + funclist[i] + "_indices, arch, " + funclist[i] + "_arch_defs)](" + my_arglist[i] + ");" + "\n}\n";
+    
+        tempstring = tempstring + retlist[i] + "inline " + funclist[i] + replace_arch.sub("", arched_arglist[i]) + '\n';
+    
+        tempstring = tempstring + funclist[i] + "_archs[" + funclist[i] + "_func_table](" + my_arglist[i] + ");" + '\n';
+        tempstring = tempstring + "}\n\n";                                       
+    
+    return tempstring;
diff --git a/volk/include/volk/make_config_fixed.py b/volk/include/volk/make_config_fixed.py
new file mode 100644
index 000000000..3fd1bdf0a
--- /dev/null
+++ b/volk/include/volk/make_config_fixed.py
@@ -0,0 +1,21 @@
+from xml.dom import minidom
+
+def make_config_fixed(dom) :
+    tempstring = "";
+    tempstring = tempstring +'/*this file is auto generated by volk_register.py*/';
+    tempstring = tempstring + '\n#ifndef INCLUDED_VOLK_CONFIG_FIXED_H';
+    tempstring = tempstring + '\n#define INCLUDED_VOLK_CONFIG_FIXED_H';
+    tempstring = tempstring + '\n\n';
+    enum_counter = 0;
+    for domarch in dom:
+        arch = str(domarch.attributes["name"].value);
+        tempstring = tempstring + '#define LV_' + arch.swapcase() + " " + str(enum_counter) + '\n';
+        enum_counter = enum_counter + 1;
+    tempstring = tempstring + '\n\n';
+
+    tempstring = tempstring + "#endif /*INCLUDED_VOLK_CONFIG_FIXED*/\n"
+
+    return tempstring;
+
+
+    
diff --git a/volk/include/volk/make_config_in.py b/volk/include/volk/make_config_in.py
new file mode 100644
index 000000000..d29680af2
--- /dev/null
+++ b/volk/include/volk/make_config_in.py
@@ -0,0 +1,13 @@
+from xml.dom import minidom
+
+def make_config_in(dom) :
+    tempstring = "";
+    tempstring = tempstring +'/*this file is auto generated by volk_register.py*/';
+    tempstring = tempstring + '\n\n';
+
+    for domarch in dom:
+        arch = str(domarch.attributes["name"].value);
+        tempstring = tempstring + '#undef LV_HAVE_' + arch.swapcase() + '\n';
+    tempstring = tempstring + '\n';
+
+    return tempstring
diff --git a/volk/include/volk/make_cpuid_generic_c.py b/volk/include/volk/make_cpuid_generic_c.py
new file mode 100644
index 000000000..3ba225fca
--- /dev/null
+++ b/volk/include/volk/make_cpuid_generic_c.py
@@ -0,0 +1,38 @@
+from xml.dom import minidom
+
+def make_cpuid_generic_c(dom) :
+    tempstring = "";
+    tempstring = tempstring + "/*this file is auto_generated by volk_register.py*/\n\n";
+    tempstring = tempstring + "#include <volk/volk_cpu.h>\n"
+    tempstring = tempstring + "#include <volk/volk_config_fixed.h>\n"
+    tempstring = tempstring + "\n\n"
+    
+    for domarch in dom:
+        if str(domarch.attributes["type"].value) == "all":
+            arch = str(domarch.attributes["name"].value);
+            tempstring = tempstring + "int i_can_has_" + arch + " () {\n"
+            tempstring = tempstring + "    return 1;\n"
+            tempstring = tempstring + "}\n\n"
+    
+        else:
+            arch = str(domarch.attributes["name"].value);
+            tempstring = tempstring + "int i_can_has_" + arch + " () {\n"
+            tempstring = tempstring + "  return 0;\n"
+            tempstring = tempstring + "}\n\n"
+
+    tempstring = tempstring + "void volk_cpu_init() {\n";
+    for domarch in dom:
+        arch = str(domarch.attributes["name"].value);
+        tempstring = tempstring + "    volk_cpu.has_" + arch + " = &i_can_has_" + arch + ";\n"
+    tempstring = tempstring + "}\n\n"
+
+    tempstring = tempstring + "unsigned int volk_get_lvarch() {\n";
+    tempstring = tempstring + "    unsigned int retval = 0;\n"
+    tempstring = tempstring + "    volk_cpu_init();\n"
+    for domarch in dom:
+        arch = str(domarch.attributes["name"].value);
+        tempstring = tempstring + "    retval += volk_cpu.has_" + arch + "() << LV_" + arch.swapcase() + ";\n"
+    tempstring = tempstring + "    return retval;\n"
+    tempstring = tempstring + "}\n\n"
+
+    return tempstring;
diff --git a/volk/include/volk/make_cpuid_h.py b/volk/include/volk/make_cpuid_h.py
new file mode 100644
index 000000000..823e3b2c0
--- /dev/null
+++ b/volk/include/volk/make_cpuid_h.py
@@ -0,0 +1,25 @@
+from xml.dom import minidom
+from emit_omnilog import *
+
+def make_cpuid_h(dom) :
+    tempstring = "";
+    tempstring = tempstring +'/*this file is auto generated by volk_register.py*/';
+    tempstring = tempstring +'\n#ifndef INCLUDED_VOLK_CPU_H';
+    tempstring = tempstring +'\n#define INCLUDED_VOLK_CPU_H\n\n';
+    tempstring = tempstring + emit_prolog();
+    tempstring = tempstring + '\n'
+
+    tempstring = tempstring + "struct VOLK_CPU {\n"
+    for domarch in dom:
+        arch = str(domarch.attributes["name"].value);
+        tempstring = tempstring + "    int (*has_" + arch + ") ();\n";
+    tempstring = tempstring + "}volk_cpu;\n\n";
+
+    tempstring = tempstring + "void volk_cpu_init ();\n"
+    tempstring = tempstring + "unsigned int volk_get_lvarch ();\n"
+
+    tempstring = tempstring + "\n";
+    tempstring = tempstring + emit_epilog();
+    tempstring = tempstring + "#endif /*INCLUDED_VOLK_CPU_H*/\n"
+    
+    return tempstring;
diff --git a/volk/include/volk/make_cpuid_powerpc_c.py b/volk/include/volk/make_cpuid_powerpc_c.py
new file mode 100644
index 000000000..443a58488
--- /dev/null
+++ b/volk/include/volk/make_cpuid_powerpc_c.py
@@ -0,0 +1,45 @@
+from xml.dom import minidom
+
+def make_cpuid_powerpc_c(dom) :
+    tempstring = "";
+    tempstring = tempstring + "/*this file is auto_generated by volk_register.py*/\n\n";
+    tempstring = tempstring + "#include <volk/volk_cpu.h>\n"
+    tempstring = tempstring + "#include <volk/volk_config_fixed.h>\n"
+    tempstring = tempstring + "\n\n"
+
+    #just assume it has them for powerpc
+    for domarch in dom:
+        if str(domarch.attributes["type"].value) == "powerpc":
+            arch = str(domarch.attributes["name"].value);
+            tempstring = tempstring + "int i_can_has_" + arch + " () {\n"
+            tempstring = tempstring + "    return 1;\n"
+            tempstring = tempstring + "}\n\n"
+        elif str(domarch.attributes["type"].value) == "all":
+            arch = str(domarch.attributes["name"].value);
+            tempstring = tempstring + "int i_can_has_" + arch + " () {\n"
+            tempstring = tempstring + "    return 1;\n"
+            tempstring = tempstring + "}\n\n"
+        else:
+            arch = str(domarch.attributes["name"].value);
+            tempstring = tempstring + "int i_can_has_" + arch + " () {\n"
+            tempstring = tempstring + "    return 0;\n"
+            tempstring = tempstring + "}\n\n"
+
+
+    tempstring = tempstring + "void volk_cpu_init() {\n";
+    for domarch in dom:
+        arch = str(domarch.attributes["name"].value);
+        tempstring = tempstring + "    volk_cpu.has_" + arch + " = &i_can_has_" + arch + ";\n"
+
+    tempstring = tempstring + "}\n\n"
+    tempstring = tempstring + "unsigned int volk_get_lvarch() {\n";
+    tempstring = tempstring + "    unsigned int retval = 0;\n"
+    tempstring = tempstring + "    volk_cpu_init();\n"
+    for domarch in dom:
+        arch = str(domarch.attributes["name"].value);
+        tempstring = tempstring + "    retval += volk_cpu.has_" + arch + "() << LV_" + arch.swapcase() + ";\n"
+    tempstring = tempstring + "    return retval;\n"
+    tempstring = tempstring + "}\n\n"
+
+    return tempstring;
+            
diff --git a/volk/include/volk/make_cpuid_x86_c.py b/volk/include/volk/make_cpuid_x86_c.py
new file mode 100644
index 000000000..fd83e5e5c
--- /dev/null
+++ b/volk/include/volk/make_cpuid_x86_c.py
@@ -0,0 +1,93 @@
+from xml.dom import minidom
+
+def make_cpuid_x86_c(dom) :
+    tempstring = "";
+    tempstring = tempstring + "/*this file is auto_generated by volk_register.py*/\n\n";
+    tempstring = tempstring + "#include <volk/volk_cpu.h>\n"
+    tempstring = tempstring + "#include <volk/volk_config_fixed.h>\n"
+    tempstring = tempstring + "\n\n"
+    tempstring = tempstring + "extern void cpuid_x86 (unsigned int op, unsigned int result[4]);\n\n"
+    tempstring = tempstring + "static inline unsigned int cpuid_eax(unsigned int op) {\n";
+    tempstring = tempstring + "    unsigned int regs[4];\n"
+    tempstring = tempstring + "    cpuid_x86 (op, regs);\n"
+    tempstring = tempstring + "    return regs[0];\n"
+    tempstring = tempstring + "}\n\n";
+
+    tempstring = tempstring + "static inline unsigned int cpuid_ebx(unsigned int op) {\n";
+    tempstring = tempstring + "    unsigned int regs[4];\n"
+    tempstring = tempstring + "    cpuid_x86 (op, regs);\n"
+    tempstring = tempstring + "    return regs[1];\n"
+    tempstring = tempstring + "}\n\n";
+
+    tempstring = tempstring + "static inline unsigned int cpuid_ecx(unsigned int op) {\n";
+    tempstring = tempstring + "    unsigned int regs[4];\n"
+    tempstring = tempstring + "    cpuid_x86 (op, regs);\n"
+    tempstring = tempstring + "    return regs[2];\n"
+    tempstring = tempstring + "}\n\n";
+    
+    tempstring = tempstring + "static inline unsigned int cpuid_edx(unsigned int op) {\n";
+    tempstring = tempstring + "    unsigned int regs[4];\n"
+    tempstring = tempstring + "    cpuid_x86 (op, regs);\n"
+    tempstring = tempstring + "    return regs[3];\n"
+    tempstring = tempstring + "}\n\n";
+    
+    for domarch in dom:
+        if str(domarch.attributes["type"].value) == "x86":
+            arch = str(domarch.attributes["name"].value);
+            op = domarch.getElementsByTagName("op");
+            op = str(op[0].firstChild.data);
+            reg = domarch.getElementsByTagName("reg");
+            reg = str(reg[0].firstChild.data);
+            shift = domarch.getElementsByTagName("shift");
+            shift = str(shift[0].firstChild.data);
+            val = domarch.getElementsByTagName("val");
+            val = str(val[0].firstChild.data);
+            
+            if op == "1":
+                tempstring = tempstring + "int i_can_has_" + arch + " () {\n"
+                tempstring = tempstring + "    unsigned int e" + reg + "x = cpuid_e" + reg + "x (" + op + ");\n"
+                tempstring = tempstring + "    return ((e" + reg + "x >> " + shift + ") & 1) == " + val + ";\n"
+                tempstring = tempstring + "}\n\n";
+
+            elif op == "0x80000001":
+                tempstring = tempstring + "int i_can_has_" + arch + " () {\n"
+                tempstring = tempstring + "    unsigned int extended_fct_count = cpuid_eax(0x80000000);\n";
+                tempstring = tempstring + "    if (extended_fct_count < 0x80000001)\n";
+                tempstring = tempstring + "        return "+ val + "^1;\n\n"
+                tempstring = tempstring + "    unsigned int extended_features = cpuid_e" + reg + "x (" + op + ");\n";
+                tempstring = tempstring + "    return ((extended_features >> " + shift + ") & 1) == " + val + ";\n"
+                tempstring = tempstring + "}\n\n";
+        elif str(domarch.attributes["type"].value) == "all":
+            arch = str(domarch.attributes["name"].value);
+            tempstring = tempstring + "int i_can_has_" + arch + " () {\n"
+            tempstring = tempstring + "    return 1;\n"
+            tempstring = tempstring + "}\n\n"
+        else:
+            arch = str(domarch.attributes["name"].value);
+            tempstring = tempstring + "int i_can_has_" + arch + " () {\n"
+            tempstring = tempstring + "    return 0;\n"
+            tempstring = tempstring + "}\n\n"
+    
+    tempstring = tempstring + "void volk_cpu_init() {\n";
+    for domarch in dom:
+        arch = str(domarch.attributes["name"].value);
+        tempstring = tempstring + "    volk_cpu.has_" + arch + " = &i_can_has_" + arch + ";\n"
+    tempstring = tempstring + "}\n\n"
+
+    tempstring = tempstring + "unsigned int volk_get_lvarch() {\n";
+    tempstring = tempstring + "    unsigned int retval = 0;\n"
+    tempstring = tempstring + "    volk_cpu_init();\n"
+    for domarch in dom:
+        arch = str(domarch.attributes["name"].value);
+        tempstring = tempstring + "    retval += volk_cpu.has_" + arch + "() << LV_" + arch.swapcase() + ";\n"
+    tempstring = tempstring + "    return retval;\n"
+    tempstring = tempstring + "}\n\n"
+
+    return tempstring;
+
+    
+            
+        
+
+                
+                
diff --git a/volk/include/volk/make_environment_init_c.py b/volk/include/volk/make_environment_init_c.py
new file mode 100644
index 000000000..e06c7f246
--- /dev/null
+++ b/volk/include/volk/make_environment_init_c.py
@@ -0,0 +1,33 @@
+from xml.dom import minidom
+
+def make_environment_init_c(dom) :
+    tempstring = "";
+    tempstring = tempstring + "/*this file is auto_generated by volk_register.py*/\n\n";
+    tempstring = tempstring + "#include<volk/volk_environment_init.h>\n"
+    tempstring = tempstring + "#include<volk/volk_config.h>\n"
+    for domarch in dom:
+        arch = str(domarch.attributes["name"].value);
+        incs = domarch.getElementsByTagName("include");
+        for inc in incs:
+            my_inc = str(inc.firstChild.data);
+            tempstring = tempstring + "#if LV_HAVE_" + arch.swapcase() + "\n";
+            tempstring = tempstring + "#include<" + my_inc + ">\n";
+            tempstring = tempstring + "#endif\n"
+    tempstring = tempstring + '\n\n';
+    tempstring = tempstring + "void volk_environment_init(){\n"
+    
+    for domarch in dom:
+        arch = str(domarch.attributes["name"].value);
+        envs = domarch.getElementsByTagName("environment");
+        for env in envs:
+            cmd = str(env.firstChild.data);
+            tempstring = tempstring + "#if LV_HAVE_" + arch.swapcase() + "\n";
+            tempstring = tempstring + "        " + cmd + "\n";
+            tempstring = tempstring + "#endif\n"     
+    
+    tempstring = tempstring + "}\n";
+    return tempstring;
+            
+
+
+    
diff --git a/volk/include/volk/make_environment_init_h.py b/volk/include/volk/make_environment_init_h.py
new file mode 100644
index 000000000..77a841a24
--- /dev/null
+++ b/volk/include/volk/make_environment_init_h.py
@@ -0,0 +1,18 @@
+from xml.dom import minidom
+from emit_omnilog import *
+
+def make_environment_init_h() :
+    tempstring = "";
+    tempstring = tempstring + "/*this file is auto_generated by volk_register.py*/\n\n";
+    tempstring = tempstring + "#ifndef INCLUDE_LIBVECTOR_ENVIRONMENT_INIT_H\n";
+    tempstring = tempstring + "#define INCLUDE_LIBVECTOR_ENVIRONMENT_INIT_H\n";
+    tempstring = tempstring + "\n";
+    tempstring = tempstring + emit_prolog();
+    tempstring = tempstring + "void volk_environment_init();\n";
+    tempstring = tempstring + emit_epilog();
+    tempstring = tempstring + "#endif\n"
+    return tempstring;
+            
+
+
+    
diff --git a/volk/include/volk/make_h.py b/volk/include/volk/make_h.py
new file mode 100644
index 000000000..81d9ad401
--- /dev/null
+++ b/volk/include/volk/make_h.py
@@ -0,0 +1,28 @@
+from xml.dom import minidom
+from emit_omnilog import *
+from volk_regexp import *
+
+
+
+def make_h(funclist, arched_arglist, retlist) :
+    tempstring = "";
+    tempstring = tempstring + '/*this file is auto generated by volk_register.py*/';
+    tempstring = tempstring + '\n#ifndef INCLUDED_VOLK_H';
+    tempstring = tempstring + '\n#define INCLUDED_VOLK_H';
+    tempstring = tempstring + '\n\n#include<inttypes.h>\n';
+    tempstring = tempstring + '#include<volk/volk_complex.h>\n';
+    tempstring = tempstring + '#include<volk/volk_config.h>\n';
+    tempstring = tempstring + '#include<volk/volk_config_fixed.h>\n';
+    tempstring = tempstring + '#include<volk/volk_environment_init.h>\n'
+    tempstring = tempstring + emit_prolog()
+    tempstring = tempstring + '\n';               
+
+    for i in range(len(retlist)):
+        tempstring = tempstring + retlist[i] + funclist[i] + replace_bracket.sub(";", replace_arch.sub("", arched_arglist[i])) + '\n';
+        tempstring = tempstring + retlist[i] + funclist[i] + "_manual" + replace_bracket.sub(";", arched_arglist[i]) + '\n';
+
+    tempstring = tempstring + emit_epilog();
+
+    tempstring = tempstring + "#endif /*INCLUDED_VOLK_H*/\n";
+    
+    return tempstring;
diff --git a/volk/include/volk/make_init_c.py b/volk/include/volk/make_init_c.py
new file mode 100644
index 000000000..330e19592
--- /dev/null
+++ b/volk/include/volk/make_init_c.py
@@ -0,0 +1,42 @@
+from xml.dom import minidom
+
+def make_init_c(funclist, dom) :
+    tempstring = "";
+    tempstring = tempstring + '/*this file is auto generated by volk_register.py*/';
+
+    tempstring = tempstring + '\n\n#include<volk/volk_runtime.h>\n';
+    tempstring = tempstring + '#include<volk/volk_cpu.h>\n';
+    tempstring = tempstring + '#include<volk_init.h>\n';
+    for domarch in dom:
+        arch = str(domarch.attributes["name"].value);
+        incs = domarch.getElementsByTagName("include");
+        for inc in incs:
+            my_inc = str(inc.firstChild.data);
+            tempstring = tempstring + "#if LV_HAVE_" + arch.swapcase() + "\n";
+            tempstring = tempstring + "#include<" + my_inc + ">\n";
+            tempstring = tempstring + "#endif\n"
+    tempstring = tempstring + '\n\n';
+
+    tempstring = tempstring + "extern struct VOLK_RUNTIME volk_runtime;\n\n";
+    tempstring = tempstring + "struct VOLK_RUNTIME* get_volk_runtime(){\n";
+    tempstring = tempstring + "    return &volk_runtime;\n";
+    tempstring = tempstring + "}\n\n"
+    tempstring = tempstring + "    void volk_runtime_init() {\nvolk_cpu_init();\n";
+
+    for func in funclist:
+        tempstring = tempstring + "    volk_runtime." + func + " = default_acquire_" + func + ";\n";
+
+    for domarch in dom:
+        arch = str(domarch.attributes["name"].value);
+        envs = domarch.getElementsByTagName("environment");
+        for env in envs:
+            cmd = str(env.firstChild.data);
+            tempstring = tempstring + "    if(volk_cpu.has_" + arch + "()){\n";
+            tempstring = tempstring + "#if LV_HAVE_" + arch.swapcase() + "\n";
+            tempstring = tempstring + "        " + cmd + "\n";
+            tempstring = tempstring + "#endif\n"     
+            tempstring = tempstring + "    }\n";
+            
+    tempstring = tempstring + "}\n";
+
+    return tempstring
diff --git a/volk/include/volk/make_init_h.py b/volk/include/volk/make_init_h.py
new file mode 100644
index 000000000..6dbe1c585
--- /dev/null
+++ b/volk/include/volk/make_init_h.py
@@ -0,0 +1,26 @@
+from xml.dom import minidom
+from emit_omnilog import *
+from volk_regexp import *
+
+
+
+def make_init_h(funclist, arched_arglist, retlist) :
+    tempstring = "";
+    tempstring = tempstring + '/*this file is auto generated by volk_register.py*/';
+
+    tempstring = tempstring + '\n#ifndef INCLUDED_VOLK_INIT_H';
+    tempstring = tempstring + '\n#define INCLUDED_VOLK_INIT_H';
+    tempstring = tempstring + '\n\n#include<inttypes.h>\n';
+    tempstring = tempstring + '#include<volk/volk_complex.h>\n';
+
+    tempstring = tempstring + '\n';
+
+    tempstring = tempstring + emit_prolog();
+    
+    for i in range(len(retlist)):
+        tempstring = tempstring + retlist[i] + " default_acquire_" + funclist[i] +  replace_bracket.sub(";", replace_arch.sub("", arched_arglist[i])) + '\n';
+
+    tempstring= tempstring + emit_epilog();
+    tempstring = tempstring + "#endif /*INCLUDED_VOLK_INIT_H*/\n";
+    
+    return tempstring;
diff --git a/volk/include/volk/make_mktables.py b/volk/include/volk/make_mktables.py
new file mode 100644
index 000000000..051ac268d
--- /dev/null
+++ b/volk/include/volk/make_mktables.py
@@ -0,0 +1,33 @@
+
+
+def make_mktables(funclist) :
+    tempstring = "";
+    tempstring = tempstring + '/*this file is auto generated by volk_register.py*/\n';
+
+    tempstring = tempstring + '#include<stdio.h>\n';
+    tempstring = tempstring + '#include<volk/volk_registry.h>\n';
+    tempstring = tempstring + '#include<volk_rank_archs.h>\n';
+    tempstrgin = tempstring + '#include<volk/volk_cpu.h>\n';
+    tempstring = tempstring + "\n\n";
+
+    tempstring = tempstring + 'int main() {\n';
+    tempstring = tempstring + '  int i = 0;\n';
+    tempstring = tempstring + '  FILE* output;\n';
+    tempstring = tempstring + '  output = fopen("volk_tables.h", "w");\n';
+    tempstring = tempstring + '  fprintf(output, "#ifndef INCLUDED_VOLK_TABLES_H\\n");\n';
+    tempstring = tempstring + '  fprintf(output, "#define INCLUDED_VOLK_TABLES_H\\n\\n");\n';
+    
+    for func in funclist:
+        tempstring = tempstring + '  fprintf(output, "static const ' + func + '_func_table = %u;\\n", volk_rank_archs(' + func + '_arch_defs, volk_get_lvarch()));\n';
+    tempstring = tempstring + '  fprintf(output, "#endif /*INCLUDED_VOLK_TABLES_H*/\\n");\n';
+    tempstring = tempstring + '  fclose(output);\n'
+    tempstring = tempstring + '}\n';
+    return tempstring;
+        
+
+    
+
+    
+
+
+
diff --git a/volk/include/volk/make_proccpu_sim.py b/volk/include/volk/make_proccpu_sim.py
new file mode 100644
index 000000000..c75a4d5fb
--- /dev/null
+++ b/volk/include/volk/make_proccpu_sim.py
@@ -0,0 +1,25 @@
+from xml.dom import minidom
+
+def make_proccpu_sim(dom) :
+    tempstring = "";
+    tempstring = tempstring + "/*this file is auto_generated by volk_register.py*/\n\n";
+    tempstring = tempstring + "#include <volk/volk_cpu.h>\n"
+    tempstring = tempstring + "#include <stdio.h>\n"
+    tempstring = tempstring + "\n\n"
+
+    tempstring = tempstring + "void test_append(char* buf,  int val, char* newkey){\n";
+    tempstring = tempstring + "    if(val==1){\n";
+    tempstring = tempstring + "        sprintf(buf, \"%s %s\", buf, newkey);\n";
+    tempstring = tempstring + "    }\n";
+    tempstring = tempstring + "}\n";
+    tempstring = tempstring + "\n\n";
+    
+    tempstring = tempstring + "int main() {\n";
+    tempstring = tempstring + "    volk_cpu_init();\n";
+    tempstring = tempstring + "    char buf[2048];\n";
+    for domarch in dom:
+        arch = str(domarch.attributes["name"].value);
+        tempstring = tempstring + "    test_append(buf, volk_cpu.has_" + arch + "(), \"" + arch + "\");\n"
+    tempstring = tempstring + "    printf(\"%s\\n\", buf);\n"
+    tempstring = tempstring + "}\n"
+    return tempstring;
diff --git a/volk/include/volk/make_registry.py b/volk/include/volk/make_registry.py
new file mode 100644
index 000000000..8457d61f3
--- /dev/null
+++ b/volk/include/volk/make_registry.py
@@ -0,0 +1,62 @@
+from xml.dom import minidom
+from emit_omnilog import *
+import string
+
+def make_registry(dom, funclist, fcountlist) :
+    tempstring = "";
+    tempstring = tempstring + "/*this file is auto_generated by volk_register.py*/\n\n";
+    tempstring = tempstring +'\n#ifndef INCLUDED_VOLK_REGISTRY_H';
+    tempstring = tempstring +'\n#define INCLUDED_VOLK_REGISTRY_H\n\n';
+    tempstring = tempstring +'#include<volk/volk_config.h>\n';
+    tempstring = tempstring +'#include<volk/volk_config_fixed.h>\n';
+    tempstring = tempstring + emit_prolog();
+    tempstring = tempstring + '\n'
+
+    
+
+    
+    for domarch in dom:
+        arch = str(domarch.attributes["name"].value);
+        tempstring = tempstring +"#if LV_HAVE_" + arch.swapcase() + "\n";
+        tempstring = tempstring +"#define LV_" + arch.swapcase() + "_CNT 1\n";
+        tempstring = tempstring +"#else\n";
+        tempstring = tempstring +"#define LV_" + arch.swapcase() + "_CNT 0\n";
+        tempstring = tempstring +"#endif /*LV_HAVE_" + arch.swapcase() + "*/\n\n";
+
+    counter = 0;
+    for fcount in fcountlist:
+        tempstring = tempstring + "static const int " + funclist[counter] + "_arch_defs[] = {\n";
+        counter = counter + 1;
+        for arch_list in fcount:
+            tempstring = tempstring + " (LV_"
+            for ind in range(len(arch_list)): 
+                tempstring = tempstring + arch_list[ind] + "_CNT";
+                if ind < len(arch_list) - 1:
+                    tempstring = tempstring + " * LV_";
+            tempstring = tempstring + ") + ";
+        lindex = tempstring.rfind(" + ");
+        tempstring = tempstring[0:lindex] + string.replace(tempstring[lindex:len(tempstring)], " + ", "");
+        tempstring = tempstring + ",\n"
+        for arch_list in fcount:
+            tempstring = tempstring + "#if LV_HAVE_"
+            for ind in range(len(arch_list)): 
+                tempstring = tempstring + arch_list[ind];
+                if ind < len(arch_list) - 1:
+                    tempstring = tempstring + " && LV_HAVE_";
+            tempstring = tempstring + "\n"
+            tempstring = tempstring + " (1 << LV_"
+            for ind in range(len(arch_list)):
+                tempstring = tempstring + arch_list[ind];
+                if ind < len(arch_list) - 1:
+                    tempstring = tempstring + ") + (1 << LV_"
+            tempstring = tempstring + "),\n#endif\n"
+        lindex = tempstring.rfind(",");
+        tempstring = tempstring[0:lindex] + string.replace(tempstring[lindex:len(tempstring)], ",", "");
+        tempstring = tempstring + "};\n\n"
+
+    
+    tempstring = tempstring + emit_epilog();
+    tempstring = tempstring +"#endif /*INCLUDED_VOLK_REGISTRY_H*/\n";
+
+    return tempstring;
+    
diff --git a/volk/include/volk/make_runtime.py b/volk/include/volk/make_runtime.py
new file mode 100644
index 000000000..645b3aaee
--- /dev/null
+++ b/volk/include/volk/make_runtime.py
@@ -0,0 +1,34 @@
+from xml.dom import minidom
+from emit_omnilog import *
+from volk_regexp import *
+
+
+
+def make_runtime(funclist) :
+    tempstring = "";
+    tempstring = tempstring + '/*this file is auto generated by volk_register.py*/\n';
+
+    tempstring = tempstring + '\n#ifndef INCLUDED_VOLK_RUNTIME';
+    tempstring = tempstring + '\n#define INCLUDED_VOLK_RUNTIME';
+    tempstring = tempstring + '\n\n#include<volk/volk_typedefs.h>\n';
+    tempstring = tempstring + '#include<volk/volk_config.h>\n';
+    tempstring = tempstring + '#include<volk/volk_config_fixed.h>\n';
+    tempstring = tempstring + '#include<volk/volk_complex.h>\n';
+    tempstring = tempstring + emit_prolog();
+
+    tempstring = tempstring + '\n';
+
+    tempstring = tempstring + "struct VOLK_RUNTIME {\n";
+
+    for i in range(len(funclist)):
+        tempstring = tempstring + replace_volk.sub("p", funclist[i]) + " " + funclist[i] + ";\n";
+    tempstring = tempstring + "};\n\n";
+
+    tempstring = tempstring + "struct VOLK_RUNTIME* get_volk_runtime();\n\n"
+    tempstring = tempstring + "\nvoid volk_runtime_init();\n";
+
+    tempstring = tempstring + emit_epilog();
+    tempstring = tempstring + "#endif /*INCLUDED_VOLK_RUNTIME*/\n";
+
+    return tempstring;
+    
diff --git a/volk/include/volk/make_runtime_c.py b/volk/include/volk/make_runtime_c.py
new file mode 100644
index 000000000..070df9ba7
--- /dev/null
+++ b/volk/include/volk/make_runtime_c.py
@@ -0,0 +1,47 @@
+from xml.dom import minidom
+import string
+from volk_regexp import *
+
+
+def make_runtime_c(funclist, taglist, arched_arglist, retlist, my_arglist, fcountlist) :
+    tempstring = "";
+    tempstring = tempstring + '/*this file is auto generated by volk_register.py*/';
+
+    
+    tempstring = tempstring + '\n\n#include<volk/volk_runtime.h>\n';
+    tempstring = tempstring + '#include<volk/volk_config.h>\n';
+    tempstring = tempstring + "#include<volk/volk_config_fixed.h>\n";
+    tempstring = tempstring + '#include<volk/volk_cpu.h>\n';
+    tempstring = tempstring + '#include<volk_init.h>\n';
+    tempstring = tempstring + '#include<volk/volk_registry.h>\n';
+
+    for func in funclist:
+        tempstring = tempstring + "#include<volk/" + func + ".h>\n" ;
+    tempstring = tempstring + '\n';
+
+    tempstring = tempstring + "struct VOLK_RUNTIME volk_runtime;\n";
+    
+    for i in range(len(funclist)): 
+        tempstring = tempstring + "static const " + replace_volk.sub("p", funclist[i]) + " " + funclist[i] + "_archs[] = {\n";
+        
+        tags_counter = 0;    
+        for arch_list in fcountlist[i]:
+            tempstring = tempstring + "#if LV_HAVE_"
+            for ind in range(len(arch_list)):
+                
+                tempstring = tempstring + arch_list[ind];
+                if ind < len(arch_list) - 1:
+                    tempstring = tempstring + " && LV_HAVE_";
+                
+            tempstring = tempstring + "\n " + funclist[i] + "_" + str(taglist[i][tags_counter]) + ",\n#endif\n";
+            tags_counter = tags_counter + 1;
+                
+        lindex = tempstring.rfind(",");
+        tempstring = tempstring[0:lindex] + string.replace(tempstring[lindex:len(tempstring)], ",", "");
+        tempstring = tempstring + "};\n\n";
+
+
+        tempstring = tempstring + retlist[i] + "default_acquire_" + funclist[i] + replace_arch.sub("", arched_arglist[i]) + '\n';
+        tempstring = tempstring + "volk_runtime." + funclist[i] + " = " + funclist[i] + "_archs[volk_rank_archs(" + funclist[i] + "_arch_defs, volk_get_lvarch())];\n" + "return " + funclist[i] + "_archs[volk_rank_archs(" + funclist[i] + "_arch_defs, volk_get_lvarch())](" + my_arglist[i] + ");" + '\n}\n';
+    
+    return tempstring;
diff --git a/volk/include/volk/make_set_simd.py b/volk/include/volk/make_set_simd.py
new file mode 100644
index 000000000..e320bc748
--- /dev/null
+++ b/volk/include/volk/make_set_simd.py
@@ -0,0 +1,202 @@
+from xml.dom import minidom
+
+def make_set_simd(dom) :
+    tempstring = "";
+    tempstring = tempstring +'dnl this file is auto generated by volk_register.py\n\n';
+
+    tempstring = tempstring + "AC_DEFUN([_MAKE_FAKE_PROCCPU],\n";
+    tempstring = tempstring + "[\n";
+    tempstring = tempstring + "  AC_REQUIRE([GR_SET_MD_CPU])\n";
+    tempstring = tempstring + "  AC_MSG_CHECKING([proccpu])\n";
+    tempstring = tempstring + "  case \"$MD_CPU\" in\n";
+    tempstring = tempstring + "  (x86)\n";
+    tempstring = tempstring + "    case \"$MD_SUBCPU\" in\n";
+    tempstring = tempstring + "    (x86)\n";
+    tempstring = tempstring + "      if test -z \"`${CC} -o proccpu -I ./include/ -I./lib lib/volk_proccpu_sim.c lib/volk_cpu_x86.c lib/cpuid_x86.S 2>&1`\"\n";
+    tempstring = tempstring + "        then\n";
+    tempstring = tempstring + "          AC_MSG_RESULT(yes)\n";
+    tempstring = tempstring + "          lv_PROCCPU=\"`./proccpu`\"\n";
+    tempstring = tempstring + "          rm -f proccpu\n";
+    tempstring = tempstring + "        else\n";
+    tempstring = tempstring + "          AC_MSG_RESULT(no)\n";
+    tempstring = tempstring + "          lv_PROCCPU=no\n";
+    tempstring = tempstring + "      fi\n"
+    tempstring = tempstring + "    ;;\n"
+    tempstring = tempstring + "    (*)\n"
+    tempstring = tempstring + "      if test -z \"`${CC} -o proccpu -I ./include/ -I./lib lib/volk_proccpu_sim.c lib/volk_cpu_x86.c lib/cpuid_x86_64.S 2>&1`\"\n";
+    tempstring = tempstring + "        then\n";
+    tempstring = tempstring + "          AC_MSG_RESULT(yes)\n";
+    tempstring = tempstring + "          lv_PROCCPU=\"`./proccpu`\"\n";
+    tempstring = tempstring + "          rm -f proccpu\n";
+    tempstring = tempstring + "        else\n";
+    tempstring = tempstring + "          AC_MSG_RESULT(no)\n";
+    tempstring = tempstring + "          lv_PROCCPU=no\n";
+    tempstring = tempstring + "      fi\n"
+    tempstring = tempstring + "    ;;\n"
+    tempstring = tempstring + "    esac\n"
+    tempstring = tempstring + "  ;;\n";
+    tempstring = tempstring + "  (powerpc)\n";
+    tempstring = tempstring + "    if test -z \"`${CC} -o proccpu -I ./include/ lib/volk_proccpu_sim.c lib/volk_cpu_powerpc.c 2>&1`\"\n";
+    tempstring = tempstring + "      then\n";
+    tempstring = tempstring + "        AC_MSG_RESULT(yes)\n";
+    tempstring = tempstring + "        lv_PROCCPU=\"`./proccpu`\"\n";
+    tempstring = tempstring + "        rm -f proccpu\n";
+    tempstring = tempstring + "      else\n";
+    tempstring = tempstring + "        AC_MSG_RESULT(no)\n";
+    tempstring = tempstring + "        lv_PROCCPU=no\n";
+    tempstring = tempstring + "    fi\n"
+    tempstring = tempstring + "  ;;\n";
+    tempstring = tempstring + "  (*)\n";
+    tempstring = tempstring + "    if test -z \"`${CC} -o proccpu -I ./include/ lib/volk_proccpu_sim.c lib/volk_cpu_generic.c 2>&1`\"\n";
+    tempstring = tempstring + "      then\n";
+    tempstring = tempstring + "        AC_MSG_RESULT(yes)\n";
+    tempstring = tempstring + "        lv_PROCCPU=\"`./proccpu`\"\n";
+    tempstring = tempstring + "        rm -f proccpu\n";
+    tempstring = tempstring + "      else\n";
+    tempstring = tempstring + "        AC_MSG_RESULT(no)\n";
+    tempstring = tempstring + "        lv_PROCCPU=no\n";
+    tempstring = tempstring + "    fi\n"
+    tempstring = tempstring + "  ;;\n";
+    tempstring = tempstring + "  esac\n";
+    tempstring = tempstring + "])\n"
+
+    for domarch in dom:
+        if str(domarch.attributes["type"].value) != "all":
+            arch = str(domarch.attributes["name"].value);
+            flag = domarch.getElementsByTagName("flag");
+            flag = str(flag[0].firstChild.data);
+            tempstring = tempstring + "AC_DEFUN([_TRY_ADD_" + arch.swapcase() + "],\n";
+            tempstring = tempstring + "[\n";
+            tempstring = tempstring + "  LF_CHECK_CC_FLAG([-" + flag + "])\n";
+            tempstring = tempstring + "  LF_CHECK_CXX_FLAG([-" + flag + "])\n";
+            tempstring = tempstring + "])\n";
+
+    tempstring = tempstring + "AC_DEFUN([LV_SET_SIMD_FLAGS],\n";
+    tempstring = tempstring + "[\n";
+    tempstring = tempstring + "  AC_REQUIRE([GR_SET_MD_CPU])\n";
+    tempstring = tempstring + "  AC_SUBST(LV_CXXFLAGS)\n";
+    tempstring = tempstring + "  indCC=no\n";
+    tempstring = tempstring + "  indCXX=no\n";
+    tempstring = tempstring + "  indLV_ARCH=no\n";
+    tempstring = tempstring + "  AC_ARG_WITH(lv_arch,\n";
+    tempstring = tempstring + "    AC_HELP_STRING([--with-lv_arch=ARCH],[set volk hardware speedups as space separated string with elements from the following list(";
+    
+    for domarch in dom:
+        arch = str(domarch.attributes["name"].value);
+        tempstring = tempstring + arch + ", "
+    tempstring = tempstring[0:len(tempstring) - 2];
+        
+    tempstring = tempstring + ")]),\n";
+    tempstring = tempstring + "      [cf_with_lv_arch=\"$withval\"],\n";
+    tempstring = tempstring + "      [cf_with_lv_arch=\"\"])\n";
+    if str(domarch.attributes["type"].value) == "all":
+        arch = str(domarch.attributes["name"].value);    
+        tempstring = tempstring + "  AC_DEFINE(LV_HAVE_" + arch.swapcase() + ", 1, [always set "+ arch + "!])\n";
+    tempstring = tempstring + "  ADDONS=\"\"\n";
+    tempstring = tempstring + "  _MAKE_FAKE_PROCCPU\n";
+    tempstring = tempstring + "  if test -z \"$cf_with_lv_arch\"; then\n";
+    tempstring = tempstring + "    cf_with_lv_arch=$lv_PROCCPU\n";
+    
+    tempstring = tempstring + "  fi\n";
+    tempstring = tempstring + "  echo $cf_with_lv_arch\n";
+    for domarch in dom:
+        if str(domarch.attributes["type"].value) != "all":
+            arch = str(domarch.attributes["name"].value);
+            tempstring = tempstring + "  LV_HAVE_" + arch.swapcase() + "=no\n";
+
+    tempstring = tempstring + "  case \"$MD_CPU\" in\n";
+    tempstring = tempstring + "  (x86)\n"
+    for domarch in dom:
+        arch = str(domarch.attributes["name"].value);
+        atype = str(domarch.attributes["type"].value);
+        if atype == "x86":
+            tempstring = tempstring + "    _TRY_ADD_" + arch.swapcase() + "\n";
+
+    for domarch in dom:
+        arch = str(domarch.attributes["name"].value);
+        atype = str(domarch.attributes["type"].value);
+        flag = domarch.getElementsByTagName("flag");
+        flag = str(flag[0].firstChild.data);
+        if atype == "x86":
+            tempstring = tempstring + "    for i in $lf_CXXFLAGS\n"
+            tempstring = tempstring + "    do\n"
+            tempstring = tempstring + "      if test \"X$i\" = X-" + flag +"; then\n";
+            tempstring = tempstring + "        indCXX=yes\n";
+            tempstring = tempstring + "      fi\n"
+            tempstring = tempstring + "    done\n"
+            tempstring = tempstring + "    for i in $lf_CFLAGS\n"
+            tempstring = tempstring + "    do\n"
+            tempstring = tempstring + "      if test \"X$i\" = X-" + flag +"; then\n";
+            tempstring = tempstring + "        indCC=yes\n";
+            tempstring = tempstring + "      fi\n"
+            tempstring = tempstring + "    done\n"
+            tempstring = tempstring + "    for i in $cf_with_lv_arch\n"
+            tempstring = tempstring + "    do\n"
+            tempstring = tempstring + "      if test \"X$i\" = X" + arch + "; then\n";
+            tempstring = tempstring + "        indLV_ARCH=yes\n"
+            tempstring = tempstring + "      fi\n"
+            tempstring = tempstring + "    done\n"
+            tempstring = tempstring + "    if test \"$indCC\" == \"yes\" && test \"$indCXX\" == \"yes\" && test \"$indLV_ARCH\" == \"yes\"; then\n"
+            tempstring = tempstring + "      AC_DEFINE(LV_HAVE_" + arch.swapcase() + ", 1, [" + arch + " flag set])\n";
+            tempstring = tempstring + "      ADDONS=\"${ADDONS} -" + flag + "\"\n";
+            tempstring = tempstring + "      LV_HAVE_" + arch.swapcase() + "=yes\n";
+            tempstring = tempstring + "    fi\n"
+            tempstring = tempstring + "    indCC=no\n"
+            tempstring = tempstring + "    indCXX=no\n"
+            tempstring = tempstring + "    indLV_ARCH=no\n"
+        elif atype == "all":
+            tempstring = tempstring + "      AC_DEFINE(LV_HAVE_" + arch.swapcase() + ", 1, [" + arch + " flag set])\n";
+            tempstring = tempstring + "      LV_HAVE_" + arch.swapcase() + "=yes\n";
+    tempstring = tempstring + "  ;;\n"
+        
+    tempstring = tempstring + "  (powerpc)\n"
+    for domarch in dom:
+        arch = str(domarch.attributes["name"].value);
+        atype = str(domarch.attributes["type"].value);
+        if atype == "powerpc":
+            tempstring = tempstring + "    _TRY_ADD_" + arch.swapcase() + "\n";
+
+    for domarch in dom:
+        arch = str(domarch.attributes["name"].value);
+        atype = str(domarch.attributes["type"].value);
+        flag = domarch.getElementsByTagName("flag");
+        flag = str(flag[0].firstChild.data);
+        if atype == "powerpc":
+            tempstring = tempstring + "    for i in $lf_CXXFLAGS\n"
+            tempstring = tempstring + "    do\n"
+            tempstring = tempstring + "      if test \"X$i\" = X-" + flag +"; then\n";
+            tempstring = tempstring + "        indCXX=yes\n";
+            tempstring = tempstring + "      fi\n"
+            tempstring = tempstring + "    done\n"
+            tempstring = tempstring + "    for i in $lf_CFLAGS\n"
+            tempstring = tempstring + "    do\n"
+            tempstring = tempstring + "      if test \"X$i\" = X-" + flag +"; then\n";
+            tempstring = tempstring + "        indCC=yes\n";
+            tempstring = tempstring + "      fi\n"
+            tempstring = tempstring + "    done\n"
+            tempstring = tempstring + "    for i in $cf_with_lv_arch\n"
+            tempstring = tempstring + "    do\n"
+            tempstring = tempstring + "      if test \"X$i\" = X" + arch + "; then\n";
+            tempstring = tempstring + "        indLV_ARCH=yes\n"
+            tempstring = tempstring + "      fi\n"
+            tempstring = tempstring + "    done\n"
+            tempstring = tempstring + "    if test \"$indCC\" = yes && test \"indCXX\" = yes && \"indLV_ARCH\" = yes; then\n"
+            tempstring = tempstring + "      AC_DEFINE(LV_HAVE_" + arch.swapcase() + ", 1, [" + arch + " flag set])\n";
+            tempstring = tempstring + "      ADDONS=\"${ADDONS} -" + flag + "\"\n";
+            tempstring = tempstring + "      LV_HAVE_" + arch.swapcase() + "=yes\n";
+            tempstring = tempstring + "    fi\n"
+            tempstring = tempstring + "    indCC=no\n"
+            tempstring = tempstring + "    indCXX=no\n"
+            tempstring = tempstring + "    indLV_ARCH=no\n"
+        elif atype == "all":
+            tempstring = tempstring + "      AC_DEFINE(LV_HAVE_" + arch.swapcase() + ", 1, [" + arch + " flag set])\n";
+            tempstring = tempstring + "      LV_HAVE_" + arch.swapcase() + "=yes\n";
+    tempstring = tempstring + "  ;;\n"
+    tempstring = tempstring + "  esac\n"
+    tempstring = tempstring + "  LV_CXXFLAGS=\"${LV_CXXFLAGS} ${ADDONS}\"\n"
+    tempstring = tempstring + "])\n"
+   
+    return tempstring;
+                
+                
+        
diff --git a/volk/include/volk/make_typedefs.py b/volk/include/volk/make_typedefs.py
new file mode 100644
index 000000000..fe81cb2b0
--- /dev/null
+++ b/volk/include/volk/make_typedefs.py
@@ -0,0 +1,23 @@
+from xml.dom import minidom
+import string
+from volk_regexp import *
+
+
+
+def make_typedefs(funclist, retlist, my_argtypelist) :
+    tempstring = "";
+    tempstring = tempstring + '/*this file is auto generated by volk_register.py*/';
+    tempstring = tempstring + '/*this file is auto generated by volk_register.py*/';
+    tempstring = tempstring + '\n#ifndef INCLUDED_VOLK_TYPEDEFS';
+    tempstring = tempstring + '\n#define INCLUDED_VOLK_TYPEDEFS\n';
+    tempstring = tempstring + '\n\n#include<inttypes.h>\n';
+    tempstring = tempstring + '#include<volk/volk_complex.h>\n';
+
+    tempstring = tempstring + '\n';
+
+    for i in range(len(funclist)): 
+        tempstring = tempstring + "typedef " + retlist[i] +" (*" + replace_volk.sub("p", funclist[i]) + ")(" + my_argtypelist[i] + ");\n\n";
+
+    tempstring = tempstring + "#endif /*INCLUDED_VOLK_TYPEDEFS*/\n";
+
+    return tempstring;
diff --git a/volk/include/volk/volk_16s_add_quad_aligned16.h b/volk/include/volk/volk_16s_add_quad_aligned16.h
new file mode 100644
index 000000000..63042bef1
--- /dev/null
+++ b/volk/include/volk/volk_16s_add_quad_aligned16.h
@@ -0,0 +1,136 @@
+#ifndef INCLUDED_VOLK_16s_ADD_QUAD_ALIGNED16_H
+#define INCLUDED_VOLK_16s_ADD_QUAD_ALIGNED16_H
+
+
+#include<inttypes.h>
+#include<stdio.h>	
+
+
+
+
+
+#if LV_HAVE_SSE2
+#include<xmmintrin.h>
+#include<emmintrin.h>
+
+static inline  void volk_16s_add_quad_aligned16_sse2(short* target0, short* target1, short* target2, short* target3, short* src0, short* src1, short* src2, short* src3, short* src4, unsigned int num_bytes) {
+  
+  __m128i xmm0, xmm1, xmm2, xmm3, xmm4;
+  __m128i *p_target0, *p_target1, *p_target2, *p_target3,  *p_src0, *p_src1, *p_src2, *p_src3, *p_src4;
+  p_target0 = (__m128i*)target0;
+  p_target1 = (__m128i*)target1;
+  p_target2 = (__m128i*)target2;
+  p_target3 = (__m128i*)target3;
+
+  p_src0 = (__m128i*)src0;
+  p_src1 = (__m128i*)src1;
+  p_src2 = (__m128i*)src2;
+  p_src3 = (__m128i*)src3;
+  p_src4 = (__m128i*)src4;
+
+  int i = 0;
+
+  int bound = (num_bytes >> 4);
+  int leftovers = (num_bytes >> 1) & 7;
+
+  for(; i < bound; ++i) {
+    xmm0 = _mm_load_si128(p_src0);
+    xmm1 = _mm_load_si128(p_src1);
+    xmm2 = _mm_load_si128(p_src2);
+    xmm3 = _mm_load_si128(p_src3);
+    xmm4 = _mm_load_si128(p_src4);
+    
+    p_src0 += 1;
+    p_src1 += 1;
+    
+    xmm1 = _mm_add_epi16(xmm0, xmm1);
+    xmm2 = _mm_add_epi16(xmm0, xmm2);
+    xmm3 = _mm_add_epi16(xmm0, xmm3);
+    xmm4 = _mm_add_epi16(xmm0, xmm4);
+    
+    
+    p_src2 += 1;
+    p_src3 += 1;
+    p_src4 += 1;
+
+    _mm_store_si128(p_target0, xmm1);
+    _mm_store_si128(p_target1, xmm2);
+    _mm_store_si128(p_target2, xmm3);
+    _mm_store_si128(p_target3, xmm4);
+    
+    p_target0 += 1;
+    p_target1 += 1;
+    p_target2 += 1;
+    p_target3 += 1;
+  }
+    /*asm volatile
+		(
+		 ".%=volk_16s_add_quad_aligned16_sse2_L1:\n\t"
+		 "cmp $0, %[bound]\n\t"
+		 "je .%=volk_16s_add_quad_aligned16_sse2_END\n\t"
+		 "movaps (%[src0]), %%xmm1\n\t"
+		 "movaps (%[src1]), %%xmm2\n\t"
+		 "movaps (%[src2]), %%xmm3\n\t"
+		 "movaps (%[src3]), %%xmm4\n\t"
+		 "movaps (%[src4]), %%xmm5\n\t"
+		 "add $16, %[src0]\n\t"
+		 "add $16, %[src1]\n\t"
+		 "add $16, %[src2]\n\t"
+		 "add $16, %[src3]\n\t"
+		 "add $16, %[src4]\n\t"
+		 "paddw %%xmm1, %%xmm2\n\t"
+		 "paddw %%xmm1, %%xmm3\n\t"
+		 "paddw %%xmm1, %%xmm4\n\t"
+		 "paddw %%xmm1, %%xmm5\n\t"
+		 "add $-1, %[bound]\n\t"
+		 "movaps %%xmm2, (%[target0])\n\t"
+		 "movaps %%xmm3, (%[target1])\n\t"
+		 "movaps %%xmm4, (%[target2])\n\t"
+		 "movaps %%xmm5, (%[target3])\n\t"
+		 "add $16, %[target0]\n\t"
+		 "add $16, %[target1]\n\t"
+		 "add $16, %[target2]\n\t"
+		 "add $16, %[target3]\n\t"
+		 "jmp .%=volk_16s_add_quad_aligned16_sse2_L1\n\t"
+		 ".%=volk_16s_add_quad_aligned16_sse2_END:\n\t"
+		 :
+		 :[bound]"r"(bound), [src0]"r"(src0), [src1]"r"(src1), [src2]"r"(src2), [src3]"r"(src3), [src4]"r"(src4), [target0]"r"(target0), [target1]"r"(target1), [target2]"r"(target2), [target3]"r"(target3)
+		 :"xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+		 );
+		 
+    */
+	 
+
+  for(i = bound * 8; i < (bound * 8) + leftovers; ++i) {
+    target0[i] = src0[i] + src1[i];
+    target1[i] = src0[i] + src2[i];
+    target2[i] = src0[i] + src3[i];
+    target3[i] = src0[i] + src4[i];
+  }
+}
+#endif /*LV_HAVE_SSE2*/
+
+
+#if LV_HAVE_GENERIC
+
+static inline void volk_16s_add_quad_aligned16_generic(short* target0, short* target1, short* target2, short* target3, short* src0, short* src1, short* src2, short* src3, short* src4, unsigned int num_bytes) {
+	
+	int i = 0;
+	
+	int bound = num_bytes >> 1;
+
+	for(i = 0; i < bound; ++i) {
+		target0[i] = src0[i] + src1[i];
+		target1[i] = src0[i] + src2[i];
+		target2[i] = src0[i] + src3[i];
+		target3[i] = src0[i] + src4[i];
+	}
+}
+
+#endif /* LV_HAVE_GENERIC */
+
+
+
+
+
+#endif /*INCLUDED_VOLK_16s_ADD_QUAD_ALIGNED16_H*/
diff --git a/volk/include/volk/volk_16s_branch_4_state_8_aligned16.h b/volk/include/volk/volk_16s_branch_4_state_8_aligned16.h
new file mode 100644
index 000000000..fb9d7cb87
--- /dev/null
+++ b/volk/include/volk/volk_16s_branch_4_state_8_aligned16.h
@@ -0,0 +1,194 @@
+#ifndef INCLUDED_VOLK_16s_BRANCH_4_STATE_8_ALIGNED16_H
+#define INCLUDED_VOLK_16s_BRANCH_4_STATE_8_ALIGNED16_H
+
+
+#include<inttypes.h>
+#include<stdio.h>	
+
+
+
+
+#if LV_HAVE_SSSE3
+
+#include<xmmintrin.h>
+#include<emmintrin.h>
+#include<tmmintrin.h>
+
+static inline  void volk_16s_branch_4_state_8_aligned16_ssse3(short* target,  short* src0, char** permuters, short* cntl2, short* cntl3, short* scalars) {
+	
+  
+  __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11;
+
+  __m128i *p_target, *p_src0, *p_cntl2, *p_cntl3, *p_scalars;
+
+  
+  
+  p_target = (__m128i*)target;
+  p_src0 = (__m128i*)src0;
+  p_cntl2 = (__m128i*)cntl2;
+  p_cntl3 = (__m128i*)cntl3;
+  p_scalars = (__m128i*)scalars;
+  
+  int i = 0;
+  
+  int bound = 1;
+  
+  
+  xmm0 = _mm_load_si128(p_scalars);
+  
+  xmm1 = _mm_shufflelo_epi16(xmm0, 0);
+  xmm2 = _mm_shufflelo_epi16(xmm0, 0x55);
+  xmm3 = _mm_shufflelo_epi16(xmm0, 0xaa);
+  xmm4 = _mm_shufflelo_epi16(xmm0, 0xff);
+  
+  xmm1 = _mm_shuffle_epi32(xmm1, 0x00);
+  xmm2 = _mm_shuffle_epi32(xmm2, 0x00);
+  xmm3 = _mm_shuffle_epi32(xmm3, 0x00);
+  xmm4 = _mm_shuffle_epi32(xmm4, 0x00);
+
+  xmm0 = _mm_load_si128((__m128i*)permuters[0]);
+  xmm6 = _mm_load_si128((__m128i*)permuters[1]);
+  xmm8 = _mm_load_si128((__m128i*)permuters[2]);
+  xmm10 = _mm_load_si128((__m128i*)permuters[3]);
+
+  for(; i < bound; ++i) {
+    
+    xmm5 = _mm_load_si128(p_src0);
+    
+    
+    
+    
+    
+    
+    
+
+
+    xmm0 = _mm_shuffle_epi8(xmm5, xmm0);
+    xmm6 = _mm_shuffle_epi8(xmm5, xmm6);
+    xmm8 = _mm_shuffle_epi8(xmm5, xmm8);
+    xmm10 = _mm_shuffle_epi8(xmm5, xmm10);
+    
+    p_src0 += 4;
+   
+    
+    xmm5 = _mm_add_epi16(xmm1, xmm2);
+    
+    xmm6 = _mm_add_epi16(xmm2, xmm6);
+    xmm8 = _mm_add_epi16(xmm1, xmm8);
+   
+     
+    xmm7 = _mm_load_si128(p_cntl2);
+    xmm9 = _mm_load_si128(p_cntl3);
+    
+    xmm0 = _mm_add_epi16(xmm5, xmm0);
+    
+    
+    xmm7 = _mm_and_si128(xmm7, xmm3);
+    xmm9 = _mm_and_si128(xmm9, xmm4);
+    
+    xmm5 = _mm_load_si128(&p_cntl2[1]);
+    xmm11 = _mm_load_si128(&p_cntl3[1]);
+
+    xmm7 = _mm_add_epi16(xmm7, xmm9);
+
+    xmm5 = _mm_and_si128(xmm5, xmm3);
+    xmm11 = _mm_and_si128(xmm11, xmm4);
+
+    xmm0 = _mm_add_epi16(xmm0, xmm7);
+   
+   
+ 
+    xmm7 = _mm_load_si128(&p_cntl2[2]);
+    xmm9 = _mm_load_si128(&p_cntl3[2]);
+    
+    xmm5 = _mm_add_epi16(xmm5, xmm11);
+    
+    xmm7 = _mm_and_si128(xmm7, xmm3);
+    xmm9 = _mm_and_si128(xmm9, xmm4);
+    
+    xmm6 = _mm_add_epi16(xmm6, xmm5);
+   
+    
+    xmm5 = _mm_load_si128(&p_cntl2[3]);
+    xmm11 = _mm_load_si128(&p_cntl3[3]);
+    
+    xmm7 = _mm_add_epi16(xmm7, xmm9);
+    
+    xmm5 = _mm_and_si128(xmm5, xmm3);
+    xmm11 = _mm_and_si128(xmm11, xmm4);
+    
+    xmm8 = _mm_add_epi16(xmm8, xmm7);
+    
+    xmm5 = _mm_add_epi16(xmm5, xmm11);
+    
+    _mm_store_si128(p_target, xmm0);
+    _mm_store_si128(&p_target[1], xmm6);
+
+    xmm10 = _mm_add_epi16(xmm5, xmm10);
+    
+    _mm_store_si128(&p_target[2], xmm8);
+    
+    _mm_store_si128(&p_target[3], xmm10);
+    
+    p_target += 3;   
+  }
+}
+	
+	
+#endif /*LV_HAVE_SSEs*/
+
+#if LV_HAVE_GENERIC
+static inline  void volk_16s_branch_4_state_8_aligned16_generic(short* target,  short* src0, char** permuters, short* cntl2, short* cntl3, short* scalars) {
+	int i = 0;
+	
+	int bound = 4;
+	
+	for(; i < bound; ++i) {
+	  target[i* 8] = src0[((char)permuters[i][0])/2] 
+	    + ((i + 1)%2  * scalars[0])
+	    + (((i >> 1)^1) * scalars[1])
+	    + (cntl2[i * 8] & scalars[2])
+	    + (cntl3[i * 8] & scalars[3]);
+	  target[i* 8 + 1] = src0[((char)permuters[i][1 * 2])/2] 
+	    + ((i + 1)%2  * scalars[0])
+	    + (((i >> 1)^1) * scalars[1])
+	    + (cntl2[i * 8 + 1] & scalars[2])
+	    + (cntl3[i * 8 + 1] & scalars[3]);
+	  target[i* 8 + 2] = src0[((char)permuters[i][2 * 2])/2] 
+	    + ((i + 1)%2  * scalars[0])
+	    + (((i >> 1)^1) * scalars[1])
+	    + (cntl2[i * 8 + 2] & scalars[2])
+	    + (cntl3[i * 8 + 2] & scalars[3]);
+	  target[i* 8 + 3] = src0[((char)permuters[i][3 * 2])/2] 
+	    + ((i + 1)%2  * scalars[0])
+	    + (((i >> 1)^1) * scalars[1])
+	    + (cntl2[i * 8 + 3] & scalars[2])
+	    + (cntl3[i * 8 + 3] & scalars[3]);
+	  target[i* 8 + 4] = src0[((char)permuters[i][4 * 2])/2] 
+	    + ((i + 1)%2  * scalars[0])
+	    + (((i >> 1)^1) * scalars[1])
+	    + (cntl2[i * 8 + 4] & scalars[2])
+	    + (cntl3[i * 8 + 4] & scalars[3]);
+	  target[i* 8 + 5] = src0[((char)permuters[i][5 * 2])/2] 
+	    + ((i + 1)%2  * scalars[0])
+	    + (((i >> 1)^1) * scalars[1])
+	    + (cntl2[i * 8 + 5] & scalars[2])
+	    + (cntl3[i * 8 + 5] & scalars[3]);
+	  target[i* 8 + 6] = src0[((char)permuters[i][6 * 2])/2] 
+	    + ((i + 1)%2  * scalars[0])
+	    + (((i >> 1)^1) * scalars[1])
+	    + (cntl2[i * 8 + 6] & scalars[2])
+	    + (cntl3[i * 8 + 6] & scalars[3]);
+	  target[i* 8 + 7] = src0[((char)permuters[i][7 * 2])/2] 
+	    + ((i + 1)%2  * scalars[0])
+	    + (((i >> 1)^1) * scalars[1])
+	    + (cntl2[i * 8 + 7] & scalars[2])
+	    + (cntl3[i * 8 + 7] & scalars[3]);
+	  
+	}
+}
+
+#endif /*LV_HAVE_GENERIC*/
+
+
+#endif /*INCLUDED_VOLK_16s_BRANCH_4_STATE_8_ALIGNED16_H*/
diff --git a/volk/include/volk/volk_16s_convert_32f_aligned16.h b/volk/include/volk/volk_16s_convert_32f_aligned16.h
new file mode 100644
index 000000000..126ce1528
--- /dev/null
+++ b/volk/include/volk/volk_16s_convert_32f_aligned16.h
@@ -0,0 +1,119 @@
+#ifndef INCLUDED_VOLK_16s_CONVERT_32f_ALIGNED16_H
+#define INCLUDED_VOLK_16s_CONVERT_32f_ALIGNED16_H
+
+#include <inttypes.h>
+#include <stdio.h>
+
+#if LV_HAVE_SSE4_1
+#include <smmintrin.h>
+
+  /*!
+    \brief Converts the input 16 bit integer data into floating point data, and divides the each floating point output data point by the scalar value
+    \param inputVector The 16 bit input data buffer
+    \param outputVector The floating point output data buffer
+    \param scalar The value divided against each point in the output buffer
+    \param num_points The number of data values to be converted
+  */
+static inline void volk_16s_convert_32f_aligned16_sse4_1(float* outputVector, const int16_t* inputVector, const float scalar, unsigned int num_points){
+    unsigned int number = 0;
+    const unsigned int eighthPoints = num_points / 8;
+    
+     float* outputVectorPtr = outputVector;
+    __m128 invScalar = _mm_set_ps1(1.0/scalar);
+    int16_t* inputPtr = (int16_t*)inputVector;
+    __m128i inputVal;
+    __m128i inputVal2;
+    __m128 ret;
+
+    for(;number < eighthPoints; number++){
+
+      // Load the 8 values
+      inputVal = _mm_loadu_si128((__m128i*)inputPtr);
+
+      // Shift the input data to the right by 64 bits ( 8 bytes )
+      inputVal2 = _mm_srli_si128(inputVal, 8);
+
+      // Convert the lower 4 values into 32 bit words
+      inputVal = _mm_cvtepi16_epi32(inputVal);
+      inputVal2 = _mm_cvtepi16_epi32(inputVal2);
+      
+      ret = _mm_cvtepi32_ps(inputVal);
+      ret = _mm_mul_ps(ret, invScalar);
+      _mm_storeu_ps(outputVectorPtr, ret);
+      outputVectorPtr += 4;
+
+      ret = _mm_cvtepi32_ps(inputVal2);
+      ret = _mm_mul_ps(ret, invScalar);
+      _mm_storeu_ps(outputVectorPtr, ret);
+
+      outputVectorPtr += 4;
+
+      inputPtr += 8;
+    }
+
+    number = eighthPoints * 8;
+    for(; number < num_points; number++){
+      outputVector[number] =((float)(inputVector[number])) / scalar;
+    }
+}
+#endif /* LV_HAVE_SSE4_1 */
+
+#if LV_HAVE_SSE
+#include <xmmintrin.h>
+
+  /*!
+    \brief Converts the input 16 bit integer data into floating point data, and divides the each floating point output data point by the scalar value
+    \param inputVector The 16 bit input data buffer
+    \param outputVector The floating point output data buffer
+    \param scalar The value divided against each point in the output buffer
+    \param num_points The number of data values to be converted
+  */
+static inline void volk_16s_convert_32f_aligned16_sse(float* outputVector, const int16_t* inputVector, const float scalar, unsigned int num_points){
+    unsigned int number = 0;
+    const unsigned int quarterPoints = num_points / 4;
+    
+    float* outputVectorPtr = outputVector;
+    __m128 invScalar = _mm_set_ps1(1.0/scalar);
+    int16_t* inputPtr = (int16_t*)inputVector;
+    __m128 ret;
+
+    for(;number < quarterPoints; number++){
+      ret = _mm_set_ps((float)(inputPtr[3]), (float)(inputPtr[2]), (float)(inputPtr[1]), (float)(inputPtr[0]));
+      
+      ret = _mm_mul_ps(ret, invScalar);
+      _mm_storeu_ps(outputVectorPtr, ret);
+
+      inputPtr += 4;
+      outputVectorPtr += 4;
+    }
+
+    number = quarterPoints * 4;
+    for(; number < num_points; number++){
+      outputVector[number] = (float)(inputVector[number]) / scalar;
+    }
+}
+#endif /* LV_HAVE_SSE */
+
+#if LV_HAVE_GENERIC
+  /*!
+    \brief Converts the input 16 bit integer data into floating point data, and divides the each floating point output data point by the scalar value
+    \param inputVector The 16 bit input data buffer
+    \param outputVector The floating point output data buffer
+    \param scalar The value divided against each point in the output buffer
+    \param num_points The number of data values to be converted
+  */
+static inline void volk_16s_convert_32f_aligned16_generic(float* outputVector, const int16_t* inputVector, const float scalar, unsigned int num_points){
+  float* outputVectorPtr = outputVector;
+  const int16_t* inputVectorPtr = inputVector;
+  unsigned int number = 0;
+
+  for(number = 0; number < num_points; number++){
+    *outputVectorPtr++ = ((float)(*inputVectorPtr++)) / scalar;
+  }
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+
+
+#endif /* INCLUDED_VOLK_16s_CONVERT_32f_ALIGNED16_H */
diff --git a/volk/include/volk/volk_16s_convert_32f_unaligned16.h b/volk/include/volk/volk_16s_convert_32f_unaligned16.h
new file mode 100644
index 000000000..d6212fba5
--- /dev/null
+++ b/volk/include/volk/volk_16s_convert_32f_unaligned16.h
@@ -0,0 +1,122 @@
+#ifndef INCLUDED_VOLK_16s_CONVERT_32f_UNALIGNED16_H
+#define INCLUDED_VOLK_16s_CONVERT_32f_UNALIGNED16_H
+
+#include <inttypes.h>
+#include <stdio.h>
+
+#if LV_HAVE_SSE4_1
+#include <smmintrin.h>
+
+  /*!
+    \brief Converts the input 16 bit integer data into floating point data, and divides the each floating point output data point by the scalar value
+    \param inputVector The 16 bit input data buffer
+    \param outputVector The floating point output data buffer
+    \param scalar The value divided against each point in the output buffer
+    \param num_points The number of data values to be converted
+    \note Output buffer does NOT need to be properly aligned
+  */
+static inline void volk_16s_convert_32f_unaligned16_sse4_1(float* outputVector, const int16_t* inputVector, const float scalar, unsigned int num_points){
+    unsigned int number = 0;
+    const unsigned int eighthPoints = num_points / 8;
+    
+     float* outputVectorPtr = outputVector;
+    __m128 invScalar = _mm_set_ps1(1.0/scalar);
+    int16_t* inputPtr = (int16_t*)inputVector;
+    __m128i inputVal;
+    __m128i inputVal2;
+    __m128 ret;
+
+    for(;number < eighthPoints; number++){
+
+      // Load the 8 values
+      inputVal = _mm_loadu_si128((__m128i*)inputPtr);
+
+      // Shift the input data to the right by 64 bits ( 8 bytes )
+      inputVal2 = _mm_srli_si128(inputVal, 8);
+
+      // Convert the lower 4 values into 32 bit words
+      inputVal = _mm_cvtepi16_epi32(inputVal);
+      inputVal2 = _mm_cvtepi16_epi32(inputVal2);
+      
+      ret = _mm_cvtepi32_ps(inputVal);
+      ret = _mm_mul_ps(ret, invScalar);
+      _mm_storeu_ps(outputVectorPtr, ret);
+      outputVectorPtr += 4;
+
+      ret = _mm_cvtepi32_ps(inputVal2);
+      ret = _mm_mul_ps(ret, invScalar);
+      _mm_storeu_ps(outputVectorPtr, ret);
+
+      outputVectorPtr += 4;
+
+      inputPtr += 8;
+    }
+
+    number = eighthPoints * 8;
+    for(; number < num_points; number++){
+      outputVector[number] =((float)(inputVector[number])) / scalar;
+    }
+}
+#endif /* LV_HAVE_SSE4_1 */
+
+#if LV_HAVE_SSE
+#include <xmmintrin.h>
+
+  /*!
+    \brief Converts the input 16 bit integer data into floating point data, and divides the each floating point output data point by the scalar value
+    \param inputVector The 16 bit input data buffer
+    \param outputVector The floating point output data buffer
+    \param scalar The value divided against each point in the output buffer
+    \param num_points The number of data values to be converted
+    \note Output buffer does NOT need to be properly aligned
+  */
+static inline void volk_16s_convert_32f_unaligned16_sse(float* outputVector, const int16_t* inputVector, const float scalar, unsigned int num_points){
+    unsigned int number = 0;
+    const unsigned int quarterPoints = num_points / 4;
+    
+    float* outputVectorPtr = outputVector;
+    __m128 invScalar = _mm_set_ps1(1.0/scalar);
+    int16_t* inputPtr = (int16_t*)inputVector;
+    __m128 ret;
+
+    for(;number < quarterPoints; number++){
+      ret = _mm_set_ps((float)(inputPtr[3]), (float)(inputPtr[2]), (float)(inputPtr[1]), (float)(inputPtr[0]));
+      
+      ret = _mm_mul_ps(ret, invScalar);
+      _mm_storeu_ps(outputVectorPtr, ret);
+
+      inputPtr += 4;
+      outputVectorPtr += 4;
+    }
+
+    number = quarterPoints * 4;
+    for(; number < num_points; number++){
+      outputVector[number] = (float)(inputVector[number]) / scalar;
+    }
+}
+#endif /* LV_HAVE_SSE */
+
+#if LV_HAVE_GENERIC
+  /*!
+    \brief Converts the input 16 bit integer data into floating point data, and divides the each floating point output data point by the scalar value
+    \param inputVector The 16 bit input data buffer
+    \param outputVector The floating point output data buffer
+    \param scalar The value divided against each point in the output buffer
+    \param num_points The number of data values to be converted
+    \note Output buffer does NOT need to be properly aligned
+  */
+static inline void volk_16s_convert_32f_unaligned16_generic(float* outputVector, const int16_t* inputVector, const float scalar, unsigned int num_points){
+  float* outputVectorPtr = outputVector;
+  const int16_t* inputVectorPtr = inputVector;
+  unsigned int number = 0;
+
+  for(number = 0; number < num_points; number++){
+    *outputVectorPtr++ = ((float)(*inputVectorPtr++)) / scalar;
+  }
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+
+
+#endif /* INCLUDED_VOLK_16s_CONVERT_32f_UNALIGNED16_H */
diff --git a/volk/include/volk/volk_16s_convert_8s_aligned16.h b/volk/include/volk/volk_16s_convert_8s_aligned16.h
new file mode 100644
index 000000000..64c368688
--- /dev/null
+++ b/volk/include/volk/volk_16s_convert_8s_aligned16.h
@@ -0,0 +1,69 @@
+#ifndef INCLUDED_VOLK_16s_CONVERT_8s_ALIGNED16_H
+#define INCLUDED_VOLK_16s_CONVERT_8s_ALIGNED16_H
+
+#include <inttypes.h>
+#include <stdio.h>
+
+#if LV_HAVE_SSE2
+#include <emmintrin.h>
+/*!
+  \brief Converts the input 16 bit integer data into 8 bit integer data
+  \param inputVector The 16 bit input data buffer
+  \param outputVector The 8 bit output data buffer
+  \param num_points The number of data values to be converted
+*/
+static inline void volk_16s_convert_8s_aligned16_sse2(int8_t* outputVector, const int16_t* inputVector, unsigned int num_points){
+    unsigned int number = 0;
+    const unsigned int sixteenthPoints = num_points / 16;
+    
+     int8_t* outputVectorPtr = outputVector;
+    int16_t* inputPtr = (int16_t*)inputVector;
+    __m128i inputVal1;
+    __m128i inputVal2;
+    __m128i ret;
+
+    for(;number < sixteenthPoints; number++){
+
+      // Load the 16 values
+      inputVal1 = _mm_load_si128((__m128i*)inputPtr); inputPtr += 8;
+      inputVal2 = _mm_load_si128((__m128i*)inputPtr); inputPtr += 8;
+
+      inputVal1 = _mm_srai_epi16(inputVal1, 8);
+      inputVal2 = _mm_srai_epi16(inputVal2, 8);
+      
+      ret = _mm_packs_epi16(inputVal1, inputVal2);
+
+      _mm_store_si128((__m128i*)outputVectorPtr, ret);
+
+      outputVectorPtr += 16;
+    }
+
+    number = sixteenthPoints * 16;
+    for(; number < num_points; number++){
+      outputVector[number] =(int8_t)(inputVector[number] >> 8);
+    }
+}
+#endif /* LV_HAVE_SSE2 */
+
+#ifdef LV_HAVE_GENERIC
+/*!
+  \brief Converts the input 16 bit integer data into 8 bit integer data
+  \param inputVector The 16 bit input data buffer
+  \param outputVector The 8 bit output data buffer
+  \param num_points The number of data values to be converted
+*/
+static inline void volk_16s_convert_8s_aligned16_generic(int8_t* outputVector, const int16_t* inputVector, unsigned int num_points){
+  int8_t* outputVectorPtr = outputVector;
+  const int16_t* inputVectorPtr = inputVector;
+  unsigned int number = 0;
+
+  for(number = 0; number < num_points; number++){
+    *outputVectorPtr++ = ((int8_t)(*inputVectorPtr++ >> 8));
+  }
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+
+
+#endif /* INCLUDED_VOLK_16s_CONVERT_8s_ALIGNED16_H */
diff --git a/volk/include/volk/volk_16s_convert_8s_unaligned16.h b/volk/include/volk/volk_16s_convert_8s_unaligned16.h
new file mode 100644
index 000000000..ca925de86
--- /dev/null
+++ b/volk/include/volk/volk_16s_convert_8s_unaligned16.h
@@ -0,0 +1,71 @@
+#ifndef INCLUDED_VOLK_16s_CONVERT_8s_UNALIGNED16_H
+#define INCLUDED_VOLK_16s_CONVERT_8s_UNALIGNED16_H
+
+#include <inttypes.h>
+#include <stdio.h>
+
+#if LV_HAVE_SSE2
+#include <emmintrin.h>
+/*!
+  \brief Converts the input 16 bit integer data into 8 bit integer data
+  \param inputVector The 16 bit input data buffer
+  \param outputVector The 8 bit output data buffer
+  \param num_points The number of data values to be converted
+  \note Input and output buffers do NOT need to be properly aligned
+*/
+static inline void volk_16s_convert_8s_unaligned16_sse2(int8_t* outputVector, const int16_t* inputVector, unsigned int num_points){
+    unsigned int number = 0;
+    const unsigned int sixteenthPoints = num_points / 16;
+    
+     int8_t* outputVectorPtr = outputVector;
+    int16_t* inputPtr = (int16_t*)inputVector;
+    __m128i inputVal1;
+    __m128i inputVal2;
+    __m128i ret;
+
+    for(;number < sixteenthPoints; number++){
+
+      // Load the 16 values
+      inputVal1 = _mm_loadu_si128((__m128i*)inputPtr); inputPtr += 8;
+      inputVal2 = _mm_loadu_si128((__m128i*)inputPtr); inputPtr += 8;
+
+      inputVal1 = _mm_srai_epi16(inputVal1, 8);
+      inputVal2 = _mm_srai_epi16(inputVal2, 8);
+      
+      ret = _mm_packs_epi16(inputVal1, inputVal2);
+
+      _mm_storeu_si128((__m128i*)outputVectorPtr, ret);
+
+      outputVectorPtr += 16;
+    }
+
+    number = sixteenthPoints * 16;
+    for(; number < num_points; number++){
+      outputVector[number] =(int8_t)(inputVector[number] >> 8);
+    }
+}
+#endif /* LV_HAVE_SSE2 */
+
+#ifdef LV_HAVE_GENERIC
+/*!
+  \brief Converts the input 16 bit integer data into 8 bit integer data
+  \param inputVector The 16 bit input data buffer
+  \param outputVector The 8 bit output data buffer
+  \param num_points The number of data values to be converted
+  \note Input and output buffers do NOT need to be properly aligned
+*/
+static inline void volk_16s_convert_8s_unaligned16_generic(int8_t* outputVector, const int16_t* inputVector, unsigned int num_points){
+  int8_t* outputVectorPtr = outputVector;
+  const int16_t* inputVectorPtr = inputVector;
+  unsigned int number = 0;
+
+  for(number = 0; number < num_points; number++){
+    *outputVectorPtr++ = ((int8_t)(*inputVectorPtr++  >> 8));
+  }
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+
+
+#endif /* INCLUDED_VOLK_16s_CONVERT_8s_UNALIGNED16_H */
diff --git a/volk/include/volk/volk_16s_max_star_aligned16.h b/volk/include/volk/volk_16s_max_star_aligned16.h
new file mode 100644
index 000000000..ba4e979ec
--- /dev/null
+++ b/volk/include/volk/volk_16s_max_star_aligned16.h
@@ -0,0 +1,108 @@
+#ifndef INCLUDED_VOLK_16s_MAX_STAR_ALIGNED16_H
+#define INCLUDED_VOLK_16s_MAX_STAR_ALIGNED16_H
+
+
+#include<inttypes.h>
+#include<stdio.h>	
+
+
+#if LV_HAVE_SSSE3
+
+#include<xmmintrin.h>
+#include<emmintrin.h>
+#include<tmmintrin.h>
+
+static inline  void volk_16s_max_star_aligned16_ssse3(short* target, short* src0, unsigned int num_bytes) {
+
+
+  
+  short candidate = src0[0];
+  short cands[8];
+  __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6;
+  
+
+  __m128i *p_src0;
+  
+  p_src0 = (__m128i*)src0;
+
+  int bound = num_bytes >> 4;
+  int leftovers = (num_bytes >> 1) & 7;
+  
+  int i = 0;
+  
+  
+  xmm1 = _mm_setzero_si128();
+  xmm0 = _mm_setzero_si128();
+  //_mm_insert_epi16(xmm0, candidate, 0);
+  
+  xmm0 = _mm_shuffle_epi8(xmm0, xmm1); 
+
+  
+  for(i = 0; i < bound; ++i) {
+    xmm1 = _mm_load_si128(p_src0);
+    p_src0 += 1;
+    xmm2 = _mm_sub_epi16(xmm1, xmm0);
+  
+
+    
+  
+  
+  
+    xmm3 = _mm_cmpgt_epi16(xmm0, xmm1);
+    xmm4 = _mm_cmpeq_epi16(xmm0, xmm1);
+    xmm5 = _mm_cmpgt_epi16(xmm1, xmm0);
+
+    xmm6 = _mm_xor_si128(xmm4, xmm5);
+    
+    xmm3 = _mm_and_si128(xmm3, xmm0);
+    xmm4 = _mm_and_si128(xmm6, xmm1);
+    
+    xmm0 = _mm_add_epi16(xmm3, xmm4);
+    
+  
+  }
+  
+  _mm_store_si128((__m128i*)cands, xmm0);
+  
+  for(i = 0; i < 8; ++i) {
+    candidate = ((short)(candidate - cands[i]) > 0) ? candidate : cands[i];
+  }
+  
+ 
+  
+  for(i = 0; i < leftovers; ++i) {
+  
+    candidate = ((short)(candidate - src0[(bound << 3) + i]) > 0) ? candidate : src0[(bound << 3) + i];
+  }
+
+  target[0] = candidate;
+  
+    
+    
+ 
+
+}   
+ 
+#endif /*LV_HAVE_SSSE3*/
+
+#if LV_HAVE_GENERIC
+
+static inline void volk_16s_max_star_aligned16_generic(short* target, short* src0, unsigned int num_bytes) {
+	
+	int i = 0;
+	
+	int bound = num_bytes >> 1;
+
+	short candidate = src0[0];
+	for(i = 1; i < bound; ++i) {
+	  candidate = ((short)(candidate - src0[i]) > 0) ? candidate : src0[i];
+	}
+	target[0] = candidate;
+	  
+}
+
+
+#endif /*LV_HAVE_GENERIC*/
+
+
+#endif /*INCLUDED_VOLK_16s_MAX_STAR_ALIGNED16_H*/
diff --git a/volk/include/volk/volk_16s_max_star_horizontal_aligned16.h b/volk/include/volk/volk_16s_max_star_horizontal_aligned16.h
new file mode 100644
index 000000000..82d011677
--- /dev/null
+++ b/volk/include/volk/volk_16s_max_star_horizontal_aligned16.h
@@ -0,0 +1,130 @@
+#ifndef INCLUDED_VOLK_16s_MAX_STAR_HORIZONTAL_ALIGNED16_H
+#define INCLUDED_VOLK_16s_MAX_STAR_HORIZONTAL_ALIGNED16_H
+
+
+#include<inttypes.h>
+#include<stdio.h>	
+
+
+#if LV_HAVE_SSSE3
+
+#include<xmmintrin.h>
+#include<emmintrin.h>
+#include<tmmintrin.h>
+
+static inline  void volk_16s_max_star_horizontal_aligned16_ssse3(int16_t* target, int16_t* src0, unsigned int num_bytes) {
+
+  const static uint8_t shufmask0[16] = {0x00, 0x01, 0x04, 0x05, 0x08, 0x09, 0x0c, 0x0d, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff};
+  const static uint8_t shufmask1[16] = {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x01, 0x04, 0x05, 0x08, 0x09, 0x0c, 0x0d};
+  const static uint8_t andmask0[16] = {0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02,0x02, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00};
+  const static uint8_t andmask1[16] = {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02};
+
+  
+  
+  volatile __m128i xmm0, xmm1, xmm2, xmm3, xmm4; 
+  __m128i  xmm5, xmm6, xmm7, xmm8;
+  
+  xmm4 = _mm_load_si128((__m128i*)shufmask0);
+  xmm5 = _mm_load_si128((__m128i*)shufmask1);
+  xmm6 = _mm_load_si128((__m128i*)andmask0);
+  xmm7 = _mm_load_si128((__m128i*)andmask1);
+  
+  __m128i *p_target, *p_src0;
+  
+  p_target = (__m128i*)target;
+  p_src0 = (__m128i*)src0;
+
+  int bound = num_bytes >> 5;
+  int intermediate = (num_bytes >> 4) & 1;
+  int leftovers = (num_bytes >> 1) & 7;
+  
+  int i = 0;
+  
+  
+  for(i = 0; i < bound; ++i) {
+     
+    xmm0 = _mm_load_si128(p_src0);
+    xmm1 = _mm_load_si128(&p_src0[1]);
+    
+    
+
+    xmm2 = _mm_xor_si128(xmm2, xmm2);
+    p_src0 += 2;
+    
+    xmm3 = _mm_hsub_epi16(xmm0, xmm1);
+    
+    xmm2 = _mm_cmpgt_epi16(xmm2, xmm3);   
+
+    xmm8 = _mm_and_si128(xmm2, xmm6);
+    xmm3 = _mm_and_si128(xmm2, xmm7);
+    
+
+    xmm8 = _mm_add_epi8(xmm8, xmm4);
+    xmm3 = _mm_add_epi8(xmm3, xmm5);
+
+    xmm0 = _mm_shuffle_epi8(xmm0, xmm8);
+    xmm1 = _mm_shuffle_epi8(xmm1, xmm3);
+    
+    
+    xmm3 = _mm_add_epi16(xmm0, xmm1);
+
+    
+    _mm_store_si128(p_target, xmm3);
+    
+    p_target += 1;
+  
+  }
+
+  for(i = 0; i < intermediate; ++i) {
+    
+    xmm0 = _mm_load_si128(p_src0);
+    
+    
+    xmm2 = _mm_xor_si128(xmm2, xmm2);
+    p_src0 += 1;
+    
+    xmm3 = _mm_hsub_epi16(xmm0, xmm1);
+    xmm2 = _mm_cmpgt_epi16(xmm2, xmm3);
+
+    xmm8 = _mm_and_si128(xmm2, xmm6);
+    
+    xmm3 = _mm_add_epi8(xmm8, xmm4);
+    
+    xmm0 = _mm_shuffle_epi8(xmm0, xmm3);
+    
+
+    _mm_storel_pd((double*)p_target, (__m128d)xmm0);
+    
+    p_target = (__m128i*)((int8_t*)p_target + 8);
+
+  }
+    
+  for(i = (bound << 4) + (intermediate << 3); i < (bound << 4) + (intermediate << 3) + leftovers ; i += 2) { 
+    target[i>>1] = ((int16_t)(src0[i] - src0[i + 1]) > 0) ? src0[i] : src0[i + 1];
+  }
+  
+
+}   
+ 
+#endif /*LV_HAVE_SSSE3*/
+
+
+#if LV_HAVE_GENERIC
+static inline void volk_16s_max_star_horizontal_aligned16_generic(int16_t* target, int16_t* src0, unsigned int num_bytes) {
+	
+	int i = 0;
+	
+	int bound = num_bytes >> 1;
+
+      
+	for(i = 0; i < bound; i += 2) {
+	  target[i >> 1] = ((int16_t) (src0[i] - src0[i + 1]) > 0) ? src0[i] : src0[i+1];
+	}
+		
+}
+
+
+
+#endif /*LV_HAVE_GENERIC*/
+
+#endif /*INCLUDED_VOLK_16s_MAX_STAR_HORIZONTAL_ALIGNED16_H*/
diff --git a/volk/include/volk/volk_16s_permute_and_scalar_add_aligned16.h b/volk/include/volk/volk_16s_permute_and_scalar_add_aligned16.h
new file mode 100644
index 000000000..452d05c4f
--- /dev/null
+++ b/volk/include/volk/volk_16s_permute_and_scalar_add_aligned16.h
@@ -0,0 +1,139 @@
+#ifndef INCLUDED_VOLK_16s_PERMUTE_AND_SCALAR_ADD_ALIGNED16_H
+#define INCLUDED_VOLK_16s_PERMUTE_AND_SCALAR_ADD_ALIGNED16_H
+
+
+#include<inttypes.h>
+#include<stdio.h>	
+
+
+
+
+#if LV_HAVE_SSE2
+
+#include<xmmintrin.h>
+#include<emmintrin.h>
+
+static inline  void volk_16s_permute_and_scalar_add_aligned16_sse2(short* target,  short* src0, short* permute_indexes, short* cntl0, short* cntl1, short* cntl2, short* cntl3, short* scalars, unsigned int num_bytes) {
+	
+
+  __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
+
+  __m128i *p_target, *p_cntl0, *p_cntl1, *p_cntl2, *p_cntl3, *p_scalars;
+
+  short* p_permute_indexes = permute_indexes;
+  
+  p_target = (__m128i*)target;
+  p_cntl0 = (__m128i*)cntl0;
+  p_cntl1 = (__m128i*)cntl1;
+  p_cntl2 = (__m128i*)cntl2;
+  p_cntl3 = (__m128i*)cntl3;
+  p_scalars = (__m128i*)scalars;
+  
+  int i = 0;
+  
+  int bound = (num_bytes >> 4);
+  int leftovers = (num_bytes >> 1) & 7;
+  
+  xmm0 = _mm_load_si128(p_scalars);
+  
+  xmm1 = _mm_shufflelo_epi16(xmm0, 0);
+  xmm2 = _mm_shufflelo_epi16(xmm0, 0x55);
+  xmm3 = _mm_shufflelo_epi16(xmm0, 0xaa);
+  xmm4 = _mm_shufflelo_epi16(xmm0, 0xff);
+  
+  xmm1 = _mm_shuffle_epi32(xmm1, 0x00);
+  xmm2 = _mm_shuffle_epi32(xmm2, 0x00);
+  xmm3 = _mm_shuffle_epi32(xmm3, 0x00);
+  xmm4 = _mm_shuffle_epi32(xmm4, 0x00);
+
+
+  for(; i < bound; ++i) {
+    xmm0 = _mm_setzero_si128();
+    xmm5 = _mm_setzero_si128();
+    xmm6 = _mm_setzero_si128();
+    xmm7 = _mm_setzero_si128();
+
+    xmm0 = _mm_insert_epi16(xmm0, src0[p_permute_indexes[0]], 0);
+    xmm5 = _mm_insert_epi16(xmm5, src0[p_permute_indexes[1]], 1);
+    xmm6 = _mm_insert_epi16(xmm6, src0[p_permute_indexes[2]], 2);
+    xmm7 = _mm_insert_epi16(xmm7, src0[p_permute_indexes[3]], 3);
+    xmm0 = _mm_insert_epi16(xmm0, src0[p_permute_indexes[4]], 4);
+    xmm5 = _mm_insert_epi16(xmm5, src0[p_permute_indexes[5]], 5);
+    xmm6 = _mm_insert_epi16(xmm6, src0[p_permute_indexes[6]], 6);
+    xmm7 = _mm_insert_epi16(xmm7, src0[p_permute_indexes[7]], 7);
+
+    xmm0 = _mm_add_epi16(xmm0, xmm5);
+    xmm6 = _mm_add_epi16(xmm6, xmm7);
+    
+    p_permute_indexes += 8;
+    
+    xmm0 = _mm_add_epi16(xmm0, xmm6);
+    
+    xmm5 = _mm_load_si128(p_cntl0);
+    xmm6 = _mm_load_si128(p_cntl1);
+    xmm7 = _mm_load_si128(p_cntl2);
+    
+    xmm5 = _mm_and_si128(xmm5, xmm1);
+    xmm6 = _mm_and_si128(xmm6, xmm2);
+    xmm7 = _mm_and_si128(xmm7, xmm3);
+    
+    xmm0 = _mm_add_epi16(xmm0, xmm5);
+    
+    xmm5 = _mm_load_si128(p_cntl3);
+    
+    xmm6 = _mm_add_epi16(xmm6, xmm7);
+
+    p_cntl0 += 1;
+    
+    xmm5 = _mm_and_si128(xmm5, xmm4);
+    
+    xmm0 = _mm_add_epi16(xmm0, xmm6);
+    
+    p_cntl1 += 1;
+    p_cntl2 += 1;
+    
+    xmm0 = _mm_add_epi16(xmm0, xmm5); 
+    
+    p_cntl3 += 1;
+
+    _mm_store_si128(p_target, xmm0);
+    
+    p_target += 1;
+  }
+	
+	
+	
+	
+
+  for(i = bound * 8; i < (bound * 8) + leftovers; ++i) {
+    target[i] = src0[permute_indexes[i]] 
+      + (cntl0[i] & scalars[0])
+      + (cntl1[i] & scalars[1])
+      + (cntl2[i] & scalars[2])
+      + (cntl3[i] & scalars[3]);
+  }
+}
+#endif /*LV_HAVE_SSEs*/
+
+
+#if LV_HAVE_GENERIC
+static inline void volk_16s_permute_and_scalar_add_aligned16_generic(short* target, short* src0, short* permute_indexes, short* cntl0, short* cntl1, short* cntl2, short* cntl3, short* scalars, unsigned int num_bytes) {
+	
+	int i = 0;
+	
+	int bound = num_bytes >> 1;
+
+	for(i = 0; i < bound; ++i) {
+		target[i] = src0[permute_indexes[i]] 
+			+ (cntl0[i] & scalars[0])
+			+ (cntl1[i] & scalars[1])
+			+ (cntl2[i] & scalars[2])
+			+ (cntl3[i] & scalars[3]);
+		
+	}
+}
+
+#endif /*LV_HAVE_GENERIC*/
+
+
+#endif /*INCLUDED_VOLK_16s_PERMUTE_AND_SCALAR_ADD_ALIGNED16_H*/
diff --git a/volk/include/volk/volk_16s_quad_max_star_aligned16.h b/volk/include/volk/volk_16s_quad_max_star_aligned16.h
new file mode 100644
index 000000000..1004c4d23
--- /dev/null
+++ b/volk/include/volk/volk_16s_quad_max_star_aligned16.h
@@ -0,0 +1,191 @@
+#ifndef INCLUDED_VOLK_16s_QUAD_MAX_STAR_ALIGNED16_H
+#define INCLUDED_VOLK_16s_QUAD_MAX_STAR_ALIGNED16_H
+
+
+#include<inttypes.h>
+#include<stdio.h>	
+
+
+
+
+
+#if LV_HAVE_SSE2
+
+#include<emmintrin.h>
+
+static inline  void volk_16s_quad_max_star_aligned16_sse2(short* target, short* src0, short* src1, short* src2, short* src3, unsigned int num_bytes) {
+	
+
+
+
+	int i = 0;
+
+	int bound = (num_bytes >> 4);
+	int bound_copy = bound;
+	int leftovers = (num_bytes >> 1) & 7;
+	
+	__m128i *p_target, *p_src0, *p_src1, *p_src2, *p_src3;
+	p_target = (__m128i*) target;
+	p_src0 =  (__m128i*)src0;
+	p_src1 =  (__m128i*)src1;
+	p_src2 =  (__m128i*)src2;
+	p_src3 =  (__m128i*)src3;
+	
+	
+
+	__m128i xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
+
+	while(bound_copy > 0) {
+	 
+	  xmm1 = _mm_load_si128(p_src0);
+	  xmm2 = _mm_load_si128(p_src1);
+	  xmm3 = _mm_load_si128(p_src2);
+	  xmm4 = _mm_load_si128(p_src3);
+	  
+	  xmm5 = _mm_setzero_si128();
+	  xmm6 = _mm_setzero_si128();
+	  xmm7 = xmm1;
+	  xmm8 = xmm3;
+	  
+	  
+	  xmm1 = _mm_sub_epi16(xmm2, xmm1);
+
+	  
+
+	  xmm3 = _mm_sub_epi16(xmm4, xmm3);
+
+	  xmm5 = _mm_cmpgt_epi16(xmm1, xmm5);
+	  xmm6 = _mm_cmpgt_epi16(xmm3, xmm6);
+
+	  
+
+	  xmm2 = _mm_and_si128(xmm5, xmm2);
+	  xmm4 = _mm_and_si128(xmm6, xmm4);
+	  xmm5 = _mm_andnot_si128(xmm5, xmm7);
+	  xmm6 = _mm_andnot_si128(xmm6, xmm8);
+
+	  xmm5 = _mm_add_epi16(xmm2, xmm5);
+	  xmm6 = _mm_add_epi16(xmm4, xmm6);
+
+	  
+	  xmm1 = _mm_xor_si128(xmm1, xmm1);
+	  xmm2 = xmm5;
+	  xmm5 = _mm_sub_epi16(xmm6, xmm5);
+	  p_src0 += 1;
+	  bound_copy -= 1;
+
+	  xmm1 = _mm_cmpgt_epi16(xmm5, xmm1);
+	  p_src1 += 1;
+	  
+	  xmm6 = _mm_and_si128(xmm1, xmm6);
+	  	  
+	  xmm1 = _mm_andnot_si128(xmm1, xmm2);
+	  p_src2 += 1;
+
+
+	  
+	  xmm1 = _mm_add_epi16(xmm6, xmm1);
+	  p_src3 += 1;
+
+	  
+	  _mm_store_si128(p_target, xmm1);
+	  p_target += 1;
+      
+	}
+	
+
+	/*asm volatile
+		(
+		 "volk_16s_quad_max_star_aligned16_sse2_L1:\n\t"
+		 "cmp $0, %[bound]\n\t"
+		 "je volk_16s_quad_max_star_aligned16_sse2_END\n\t"
+
+		 "movaps (%[src0]), %%xmm1\n\t"
+		 "movaps (%[src1]), %%xmm2\n\t"
+		 "movaps (%[src2]), %%xmm3\n\t"
+		 "movaps (%[src3]), %%xmm4\n\t"
+
+		 "pxor %%xmm5, %%xmm5\n\t"
+		 "pxor %%xmm6, %%xmm6\n\t"
+		 "movaps %%xmm1, %%xmm7\n\t"
+		 "movaps %%xmm3, %%xmm8\n\t"
+		 "psubw %%xmm2, %%xmm1\n\t"
+		 "psubw %%xmm4, %%xmm3\n\t"
+		 
+		 "pcmpgtw %%xmm1, %%xmm5\n\t"
+		 "pcmpgtw %%xmm3, %%xmm6\n\t"
+		 
+		 "pand %%xmm5, %%xmm2\n\t"
+		 "pand %%xmm6, %%xmm4\n\t"
+		 "pandn %%xmm7, %%xmm5\n\t"
+		 "pandn %%xmm8, %%xmm6\n\t"
+		 
+		 "paddw %%xmm2, %%xmm5\n\t"
+		 "paddw %%xmm4, %%xmm6\n\t"
+
+		 "pxor %%xmm1, %%xmm1\n\t"
+		 "movaps %%xmm5, %%xmm2\n\t"
+		 
+		 "psubw %%xmm6, %%xmm5\n\t"
+		 "add $16, %[src0]\n\t"
+		 "add $-1, %[bound]\n\t"
+	  
+		 "pcmpgtw %%xmm5, %%xmm1\n\t"
+		 "add $16, %[src1]\n\t"
+
+		 "pand %%xmm1, %%xmm6\n\t"
+
+		 "pandn %%xmm2, %%xmm1\n\t"
+		 "add $16, %[src2]\n\t"
+
+		 "paddw %%xmm6, %%xmm1\n\t"
+		 "add $16, %[src3]\n\t"
+
+		 "movaps %%xmm1, (%[target])\n\t"
+		 "addw $16, %[target]\n\t"
+		 "jmp volk_16s_quad_max_star_aligned16_sse2_L1\n\t"
+		 
+		 "volk_16s_quad_max_star_aligned16_sse2_END:\n\t"
+		 :
+		 :[bound]"r"(bound), [src0]"r"(src0), [src1]"r"(src1), [src2]"r"(src2), [src3]"r"(src3), [target]"r"(target)
+		 :
+		 );
+	*/ 
+
+	short temp0 = 0;
+	short temp1 = 0;
+	for(i = bound * 8; i < (bound * 8) + leftovers; ++i) {
+	  temp0 = ((short)(src0[i] - src1[i]) > 0) ? src0[i] : src1[i];
+	  temp1 = ((short)(src2[i] - src3[i])>0) ? src2[i] : src3[i];
+	  target[i] = ((short)(temp0 - temp1)>0) ? temp0 : temp1;
+	}
+	return;
+
+
+}
+
+#endif /*LV_HAVE_SSE2*/
+
+
+#if LV_HAVE_GENERIC
+static inline void volk_16s_quad_max_star_aligned16_generic(short* target, short* src0, short* src1, short* src2, short* src3, unsigned int num_bytes) {
+	
+	int i = 0;
+	
+	int bound = num_bytes >> 1;
+	
+	short temp0 = 0;
+	short temp1 = 0;
+	for(i = 0; i < bound; ++i) {
+	  temp0 = ((short)(src0[i] - src1[i]) > 0) ? src0[i] : src1[i];
+	  temp1 = ((short)(src2[i] - src3[i])>0) ? src2[i] : src3[i];
+	  target[i] = ((short)(temp0 - temp1)>0) ? temp0 : temp1;
+	}
+}
+
+
+
+
+#endif /*LV_HAVE_GENERIC*/
+
+#endif /*INCLUDED_VOLK_16s_QUAD_MAX_STAR_ALIGNED16_H*/
diff --git a/volk/include/volk/volk_16sc_deinterleave_16s_aligned16.h b/volk/include/volk/volk_16sc_deinterleave_16s_aligned16.h
new file mode 100644
index 000000000..32e13df98
--- /dev/null
+++ b/volk/include/volk/volk_16sc_deinterleave_16s_aligned16.h
@@ -0,0 +1,146 @@
+#ifndef INCLUDED_VOLK_16sc_DEINTERLEAVE_16S_ALIGNED16_H
+#define INCLUDED_VOLK_16sc_DEINTERLEAVE_16S_ALIGNED16_H
+
+#include <inttypes.h>
+#include <stdio.h>
+
+#if LV_HAVE_SSSE3
+#include <tmmintrin.h>
+/*!
+  \brief Deinterleaves the complex 16 bit vector into I & Q vector data
+  \param complexVector The complex input vector
+  \param iBuffer The I buffer output data
+  \param qBuffer The Q buffer output data
+  \param num_points The number of complex data values to be deinterleaved
+*/
+static inline void volk_16sc_deinterleave_16s_aligned16_ssse3(int16_t* iBuffer, int16_t* qBuffer, const lv_16sc_t* complexVector, unsigned int num_points){
+  unsigned int number = 0;
+  const int8_t* complexVectorPtr = (int8_t*)complexVector;
+  int16_t* iBufferPtr = iBuffer;
+  int16_t* qBufferPtr = qBuffer;
+
+  __m128i iMoveMask1 = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 13, 12, 9, 8, 5, 4, 1, 0);
+  __m128i iMoveMask2 = _mm_set_epi8(13, 12, 9, 8, 5, 4, 1, 0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80);
+
+  __m128i qMoveMask1 = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 15, 14, 11, 10, 7, 6, 3, 2);
+  __m128i qMoveMask2 = _mm_set_epi8(15, 14, 11, 10, 7, 6, 3, 2, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80);
+
+  __m128i complexVal1, complexVal2, iOutputVal, qOutputVal;
+
+  unsigned int eighthPoints = num_points / 8;
+
+  for(number = 0; number < eighthPoints; number++){
+    complexVal1 = _mm_load_si128((__m128i*)complexVectorPtr);  complexVectorPtr += 16;
+    complexVal2 = _mm_load_si128((__m128i*)complexVectorPtr);  complexVectorPtr += 16;
+
+    iOutputVal = _mm_or_si128( _mm_shuffle_epi8(complexVal1, iMoveMask1) , _mm_shuffle_epi8(complexVal2, iMoveMask2));
+    qOutputVal = _mm_or_si128( _mm_shuffle_epi8(complexVal1, qMoveMask1) , _mm_shuffle_epi8(complexVal2, qMoveMask2));
+
+    _mm_store_si128((__m128i*)iBufferPtr, iOutputVal);
+    _mm_store_si128((__m128i*)qBufferPtr, qOutputVal);
+
+    iBufferPtr += 8;
+    qBufferPtr += 8;
+  }
+
+  number = eighthPoints * 8;
+  int16_t* int16ComplexVectorPtr = (int16_t*)complexVectorPtr;
+  for(; number < num_points; number++){
+    *iBufferPtr++ = *int16ComplexVectorPtr++;
+    *qBufferPtr++ = *int16ComplexVectorPtr++;
+  }
+}
+#endif /* LV_HAVE_SSSE3 */
+
+#if LV_HAVE_SSE2
+#include <emmintrin.h>
+/*!
+  \brief Deinterleaves the complex 16 bit vector into I & Q vector data
+  \param complexVector The complex input vector
+  \param iBuffer The I buffer output data
+  \param qBuffer The Q buffer output data
+  \param num_points The number of complex data values to be deinterleaved
+*/
+static inline void volk_16sc_deinterleave_16s_aligned16_sse2(int16_t* iBuffer, int16_t* qBuffer, const lv_16sc_t* complexVector, unsigned int num_points){
+  unsigned int number = 0;
+  const int16_t* complexVectorPtr = (int16_t*)complexVector;
+  int16_t* iBufferPtr = iBuffer;
+  int16_t* qBufferPtr = qBuffer;
+  __m128i complexVal1, complexVal2, iComplexVal1, iComplexVal2, qComplexVal1, qComplexVal2, iOutputVal, qOutputVal;
+  __m128i lowMask = _mm_set_epi32(0x0, 0x0, 0xFFFFFFFF, 0xFFFFFFFF);
+  __m128i highMask = _mm_set_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0x0, 0x0);
+
+  unsigned int eighthPoints = num_points / 8;
+ 
+  for(number = 0; number < eighthPoints; number++){
+    complexVal1 = _mm_load_si128((__m128i*)complexVectorPtr);  complexVectorPtr += 8;
+    complexVal2 = _mm_load_si128((__m128i*)complexVectorPtr);  complexVectorPtr += 8;
+
+    iComplexVal1 = _mm_shufflelo_epi16(complexVal1, _MM_SHUFFLE(3,1,2,0));
+
+    iComplexVal1 = _mm_shufflehi_epi16(iComplexVal1, _MM_SHUFFLE(3,1,2,0));
+
+    iComplexVal1 = _mm_shuffle_epi32(iComplexVal1, _MM_SHUFFLE(3,1,2,0));
+
+    iComplexVal2 = _mm_shufflelo_epi16(complexVal2, _MM_SHUFFLE(3,1,2,0));
+
+    iComplexVal2 = _mm_shufflehi_epi16(iComplexVal2, _MM_SHUFFLE(3,1,2,0));
+
+    iComplexVal2 = _mm_shuffle_epi32(iComplexVal2, _MM_SHUFFLE(2,0,3,1));
+
+    iOutputVal = _mm_or_si128(_mm_and_si128(iComplexVal1, lowMask), _mm_and_si128(iComplexVal2, highMask));
+
+    _mm_store_si128((__m128i*)iBufferPtr, iOutputVal);
+
+    qComplexVal1 = _mm_shufflelo_epi16(complexVal1, _MM_SHUFFLE(2,0,3,1));
+
+    qComplexVal1 = _mm_shufflehi_epi16(qComplexVal1, _MM_SHUFFLE(2,0,3,1));
+
+    qComplexVal1 = _mm_shuffle_epi32(qComplexVal1, _MM_SHUFFLE(3,1,2,0));
+
+    qComplexVal2 = _mm_shufflelo_epi16(complexVal2, _MM_SHUFFLE(2,0,3,1));
+
+    qComplexVal2 = _mm_shufflehi_epi16(qComplexVal2, _MM_SHUFFLE(2,0,3,1));
+
+    qComplexVal2 = _mm_shuffle_epi32(qComplexVal2, _MM_SHUFFLE(2,0,3,1));
+
+    qOutputVal = _mm_or_si128(_mm_and_si128(qComplexVal1, lowMask), _mm_and_si128(qComplexVal2, highMask));
+
+    _mm_store_si128((__m128i*)qBufferPtr, qOutputVal);
+
+    iBufferPtr += 8;
+    qBufferPtr += 8;
+  }
+
+  number = eighthPoints * 8;
+  for(; number < num_points; number++){
+    *iBufferPtr++ = *complexVectorPtr++;
+    *qBufferPtr++ = *complexVectorPtr++;
+  }
+}
+#endif /* LV_HAVE_SSE2 */
+
+#if LV_HAVE_GENERIC
+/*!
+  \brief Deinterleaves the complex 16 bit vector into I & Q vector data
+  \param complexVector The complex input vector
+  \param iBuffer The I buffer output data
+  \param qBuffer The Q buffer output data
+  \param num_points The number of complex data values to be deinterleaved
+*/
+static inline void volk_16sc_deinterleave_16s_aligned16_generic(int16_t* iBuffer, int16_t* qBuffer, const lv_16sc_t* complexVector, unsigned int num_points){
+  const int16_t* complexVectorPtr = (const int16_t*)complexVector;
+  int16_t* iBufferPtr = iBuffer;
+  int16_t* qBufferPtr = qBuffer;
+  unsigned int number;
+  for(number = 0; number < num_points; number++){
+    *iBufferPtr++ = *complexVectorPtr++;
+    *qBufferPtr++ = *complexVectorPtr++;
+  }
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+
+
+#endif /* INCLUDED_VOLK_16sc_DEINTERLEAVE_16S_ALIGNED16_H */
diff --git a/volk/include/volk/volk_16sc_deinterleave_32f_aligned16.h b/volk/include/volk/volk_16sc_deinterleave_32f_aligned16.h
new file mode 100644
index 000000000..86f67437d
--- /dev/null
+++ b/volk/include/volk/volk_16sc_deinterleave_32f_aligned16.h
@@ -0,0 +1,95 @@
+#ifndef INCLUDED_VOLK_16sc_DEINTERLEAVE_32F_ALIGNED16_H
+#define INCLUDED_VOLK_16sc_DEINTERLEAVE_32F_ALIGNED16_H
+
+#include <inttypes.h>
+#include <stdio.h>
+
+#if LV_HAVE_SSE
+#include <xmmintrin.h>
+  /*!
+    \brief Converts the complex 16 bit vector into floats,scales each data point, and deinterleaves into I & Q vector data
+    \param complexVector The complex input vector
+    \param iBuffer The I buffer output data
+    \param qBuffer The Q buffer output data
+    \param scalar The data value to be divided against each input data value of the input complex vector
+    \param num_points The number of complex data values to be deinterleaved
+  */
+static inline void volk_16sc_deinterleave_32f_aligned16_sse(float* iBuffer, float* qBuffer, const lv_16sc_t* complexVector, const float scalar, unsigned int num_points){
+    float* iBufferPtr = iBuffer;
+    float* qBufferPtr = qBuffer;
+
+    uint64_t number = 0;
+    const uint64_t quarterPoints = num_points / 4;    
+    __m128 cplxValue1, cplxValue2, iValue, qValue;
+
+    __m128 invScalar = _mm_set_ps1(1.0/scalar);
+    int16_t* complexVectorPtr = (int16_t*)complexVector;
+
+    float floatBuffer[8] __attribute__((aligned(128)));
+
+    for(;number < quarterPoints; number++){
+      
+      floatBuffer[0] = (float)(complexVectorPtr[0]);
+      floatBuffer[1] = (float)(complexVectorPtr[1]);
+      floatBuffer[2] = (float)(complexVectorPtr[2]);
+      floatBuffer[3] = (float)(complexVectorPtr[3]);
+      
+      floatBuffer[4] = (float)(complexVectorPtr[4]);
+      floatBuffer[5] = (float)(complexVectorPtr[5]);
+      floatBuffer[6] = (float)(complexVectorPtr[6]);
+      floatBuffer[7] = (float)(complexVectorPtr[7]);
+
+      cplxValue1 = _mm_load_ps(&floatBuffer[0]);
+      cplxValue2 = _mm_load_ps(&floatBuffer[4]);
+
+      complexVectorPtr += 8;
+
+      cplxValue1 = _mm_mul_ps(cplxValue1, invScalar);
+      cplxValue2 = _mm_mul_ps(cplxValue2, invScalar);
+
+      // Arrange in i1i2i3i4 format
+      iValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2,0,2,0));
+      // Arrange in q1q2q3q4 format
+      qValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(3,1,3,1));
+
+      _mm_store_ps(iBufferPtr, iValue);
+      _mm_store_ps(qBufferPtr, qValue);
+
+      iBufferPtr += 4;
+      qBufferPtr += 4;
+    }
+
+    number = quarterPoints * 4;
+    complexVectorPtr = (int16_t*)&complexVector[number];
+    for(; number < num_points; number++){
+      *iBufferPtr++ = (float)(*complexVectorPtr++) / scalar;
+      *qBufferPtr++ = (float)(*complexVectorPtr++) / scalar;
+    }
+}
+#endif /* LV_HAVE_SSE */
+
+#if LV_HAVE_GENERIC
+  /*!
+    \brief Converts the complex 16 bit vector into floats,scales each data point, and deinterleaves into I & Q vector data
+    \param complexVector The complex input vector
+    \param iBuffer The I buffer output data
+    \param qBuffer The Q buffer output data
+    \param scalar The data value to be divided against each input data value of the input complex vector
+    \param num_points The number of complex data values to be deinterleaved
+  */
+static inline void volk_16sc_deinterleave_32f_aligned16_generic(float* iBuffer, float* qBuffer, const lv_16sc_t* complexVector, const float scalar, unsigned int num_points){
+  const int16_t* complexVectorPtr = (const int16_t*)complexVector;
+  float* iBufferPtr = iBuffer;
+  float* qBufferPtr = qBuffer;
+  unsigned int number;
+  for(number = 0; number < num_points; number++){
+    *iBufferPtr++ = (float)(*complexVectorPtr++) / scalar;
+    *qBufferPtr++ = (float)(*complexVectorPtr++) / scalar;
+  }
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+
+
+#endif /* INCLUDED_VOLK_16sc_DEINTERLEAVE_32F_ALIGNED16_H */
diff --git a/volk/include/volk/volk_16sc_deinterleave_real_16s_aligned16.h b/volk/include/volk/volk_16sc_deinterleave_real_16s_aligned16.h
new file mode 100644
index 000000000..b594c85b8
--- /dev/null
+++ b/volk/include/volk/volk_16sc_deinterleave_real_16s_aligned16.h
@@ -0,0 +1,120 @@
+#ifndef INCLUDED_VOLK_16sc_DEINTERLEAVE_REAL_16s_ALIGNED16_H
+#define INCLUDED_VOLK_16sc_DEINTERLEAVE_REAL_16s_ALIGNED16_H
+
+#include <inttypes.h>
+#include <stdio.h>
+
+#if LV_HAVE_SSSE3
+#include <tmmintrin.h>
+/*!
+  \brief Deinterleaves the complex 16 bit vector into I vector data
+  \param complexVector The complex input vector
+  \param iBuffer The I buffer output data
+  \param num_points The number of complex data values to be deinterleaved
+*/
+static inline void volk_16sc_deinterleave_real_16s_aligned16_ssse3(int16_t* iBuffer, const lv_16sc_t* complexVector, unsigned int num_points){
+  unsigned int number = 0;
+  const int16_t* complexVectorPtr = (int16_t*)complexVector;
+  int16_t* iBufferPtr = iBuffer;
+
+  __m128i iMoveMask1 = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 13, 12, 9, 8, 5, 4, 1, 0);
+  __m128i iMoveMask2 = _mm_set_epi8(13, 12, 9, 8, 5, 4, 1, 0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80);
+
+  __m128i complexVal1, complexVal2, iOutputVal;
+
+  unsigned int eighthPoints = num_points / 8;
+
+  for(number = 0; number < eighthPoints; number++){
+    complexVal1 = _mm_load_si128((__m128i*)complexVectorPtr);  complexVectorPtr += 8;
+    complexVal2 = _mm_load_si128((__m128i*)complexVectorPtr);  complexVectorPtr += 8;
+
+    complexVal1 = _mm_shuffle_epi8(complexVal1, iMoveMask1);
+    complexVal2 = _mm_shuffle_epi8(complexVal2, iMoveMask2);
+
+    iOutputVal = _mm_or_si128(complexVal1, complexVal2);
+
+    _mm_store_si128((__m128i*)iBufferPtr, iOutputVal);
+
+    iBufferPtr += 8;
+  }
+
+  number = eighthPoints * 8;
+  for(; number < num_points; number++){
+    *iBufferPtr++ = *complexVectorPtr++;
+    complexVectorPtr++;
+  }
+}
+#endif /* LV_HAVE_SSSE3 */
+
+
+#if LV_HAVE_SSE2
+#include <emmintrin.h>
+/*!
+  \brief Deinterleaves the complex 16 bit vector into I vector data
+  \param complexVector The complex input vector
+  \param iBuffer The I buffer output data
+  \param num_points The number of complex data values to be deinterleaved
+*/
+static inline void volk_16sc_deinterleave_real_16s_aligned16_sse2(int16_t* iBuffer, const lv_16sc_t* complexVector, unsigned int num_points){
+  unsigned int number = 0;
+  const int16_t* complexVectorPtr = (int16_t*)complexVector;
+  int16_t* iBufferPtr = iBuffer;
+  __m128i complexVal1, complexVal2, iOutputVal;
+  __m128i lowMask = _mm_set_epi32(0x0, 0x0, 0xFFFFFFFF, 0xFFFFFFFF);
+  __m128i highMask = _mm_set_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0x0, 0x0);
+
+  unsigned int eighthPoints = num_points / 8;
+ 
+  for(number = 0; number < eighthPoints; number++){
+    complexVal1 = _mm_load_si128((__m128i*)complexVectorPtr);  complexVectorPtr += 8;
+    complexVal2 = _mm_load_si128((__m128i*)complexVectorPtr);  complexVectorPtr += 8;
+
+    complexVal1 = _mm_shufflelo_epi16(complexVal1, _MM_SHUFFLE(3,1,2,0));
+
+    complexVal1 = _mm_shufflehi_epi16(complexVal1, _MM_SHUFFLE(3,1,2,0));
+
+    complexVal1 = _mm_shuffle_epi32(complexVal1, _MM_SHUFFLE(3,1,2,0));
+
+    complexVal2 = _mm_shufflelo_epi16(complexVal2, _MM_SHUFFLE(3,1,2,0));
+
+    complexVal2 = _mm_shufflehi_epi16(complexVal2, _MM_SHUFFLE(3,1,2,0));
+
+    complexVal2 = _mm_shuffle_epi32(complexVal2, _MM_SHUFFLE(2,0,3,1));
+
+    iOutputVal = _mm_or_si128(_mm_and_si128(complexVal1, lowMask), _mm_and_si128(complexVal2, highMask));
+
+    _mm_store_si128((__m128i*)iBufferPtr, iOutputVal);
+
+    iBufferPtr += 8;
+  }
+
+  number = eighthPoints * 8;
+  for(; number < num_points; number++){
+    *iBufferPtr++ = *complexVectorPtr++;
+    complexVectorPtr++;
+  }
+}
+#endif /* LV_HAVE_SSE2 */
+
+#if LV_HAVE_GENERIC
+/*!
+  \brief Deinterleaves the complex 16 bit vector into I vector data
+  \param complexVector The complex input vector
+  \param iBuffer The I buffer output data
+  \param num_points The number of complex data values to be deinterleaved
+*/
+static inline void volk_16sc_deinterleave_real_16s_aligned16_generic(int16_t* iBuffer, const lv_16sc_t* complexVector, unsigned int num_points){
+  unsigned int number = 0;
+  const int16_t* complexVectorPtr = (int16_t*)complexVector;
+  int16_t* iBufferPtr = iBuffer;
+  for(number = 0; number < num_points; number++){
+    *iBufferPtr++ = *complexVectorPtr++;
+    complexVectorPtr++;
+  }
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+
+
+#endif /* INCLUDED_VOLK_16sc_DEINTERLEAVE_REAL_16s_ALIGNED16_H */
diff --git a/volk/include/volk/volk_16sc_deinterleave_real_32f_aligned16.h b/volk/include/volk/volk_16sc_deinterleave_real_32f_aligned16.h
new file mode 100644
index 000000000..3e7be1e64
--- /dev/null
+++ b/volk/include/volk/volk_16sc_deinterleave_real_32f_aligned16.h
@@ -0,0 +1,125 @@
+#ifndef INCLUDED_VOLK_16sc_DEINTERLEAVE_REAL_32f_ALIGNED16_H
+#define INCLUDED_VOLK_16sc_DEINTERLEAVE_REAL_32f_ALIGNED16_H
+
+#include <inttypes.h>
+#include <stdio.h>
+
+#if LV_HAVE_SSE4_1
+#include <smmintrin.h>
+/*!
+  \brief Deinterleaves the complex 16 bit vector into I float vector data
+  \param complexVector The complex input vector
+  \param iBuffer The I buffer output data
+  \param scalar The scaling value being multiplied against each data point
+  \param num_points The number of complex data values to be deinterleaved
+*/
+static inline void volk_16sc_deinterleave_real_32f_aligned16_sse4_1(float* iBuffer, const lv_16sc_t* complexVector, const float scalar, unsigned int num_points){
+  float* iBufferPtr = iBuffer;
+
+  unsigned int number = 0;
+  const unsigned int quarterPoints = num_points / 4;    
+
+  __m128 iFloatValue;
+
+  const float iScalar= 1.0 / scalar;
+  __m128 invScalar = _mm_set_ps1(iScalar);
+  __m128i complexVal, iIntVal;
+  int8_t* complexVectorPtr = (int8_t*)complexVector;
+
+  __m128i moveMask = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 13, 12, 9, 8, 5, 4, 1, 0);
+
+  for(;number < quarterPoints; number++){
+    complexVal = _mm_load_si128((__m128i*)complexVectorPtr); complexVectorPtr += 16;
+    complexVal = _mm_shuffle_epi8(complexVal, moveMask);
+
+    iIntVal = _mm_cvtepi16_epi32(complexVal);
+    iFloatValue = _mm_cvtepi32_ps(iIntVal);
+
+    iFloatValue = _mm_mul_ps(iFloatValue, invScalar);
+
+    _mm_store_ps(iBufferPtr, iFloatValue);
+
+    iBufferPtr += 4;
+  }
+
+  number = quarterPoints * 4;
+  int16_t* sixteenTComplexVectorPtr = (int16_t*)&complexVector[number];
+  for(; number < num_points; number++){
+    *iBufferPtr++ = ((float)(*sixteenTComplexVectorPtr++)) * iScalar;
+    sixteenTComplexVectorPtr++;
+  }
+    
+}
+#endif /* LV_HAVE_SSE4_1 */
+
+#if LV_HAVE_SSE
+#include <xmmintrin.h>
+/*!
+  \brief Deinterleaves the complex 16 bit vector into I float vector data
+  \param complexVector The complex input vector
+  \param iBuffer The I buffer output data
+  \param scalar The scaling value being multiplied against each data point
+  \param num_points The number of complex data values to be deinterleaved
+*/
+static inline void volk_16sc_deinterleave_real_32f_aligned16_sse(float* iBuffer, const lv_16sc_t* complexVector, const float scalar, unsigned int num_points){
+  float* iBufferPtr = iBuffer;
+
+  unsigned int number = 0;
+  const unsigned int quarterPoints = num_points / 4;    
+  __m128 iValue;
+
+  const float iScalar = 1.0/scalar;
+  __m128 invScalar = _mm_set_ps1(iScalar);
+  int16_t* complexVectorPtr = (int16_t*)complexVector;
+
+  float floatBuffer[4] __attribute__((aligned(128)));
+
+  for(;number < quarterPoints; number++){
+    floatBuffer[0] = (float)(*complexVectorPtr); complexVectorPtr += 2;
+    floatBuffer[1] = (float)(*complexVectorPtr); complexVectorPtr += 2; 
+    floatBuffer[2] = (float)(*complexVectorPtr); complexVectorPtr += 2;
+    floatBuffer[3] = (float)(*complexVectorPtr); complexVectorPtr += 2;
+
+    iValue = _mm_load_ps(floatBuffer);
+
+    iValue = _mm_mul_ps(iValue, invScalar);
+
+    _mm_store_ps(iBufferPtr, iValue);
+
+    iBufferPtr += 4;
+  }
+
+  number = quarterPoints * 4;
+  complexVectorPtr = (int16_t*)&complexVector[number];
+  for(; number < num_points; number++){
+    *iBufferPtr++ = ((float)(*complexVectorPtr++)) * iScalar;
+    complexVectorPtr++;
+  }
+    
+}
+#endif /* LV_HAVE_SSE */
+
+#if LV_HAVE_GENERIC
+/*!
+  \brief Deinterleaves the complex 16 bit vector into I float vector data
+  \param complexVector The complex input vector
+  \param iBuffer The I buffer output data
+  \param scalar The scaling value being multiplied against each data point
+  \param num_points The number of complex data values to be deinterleaved
+*/
+static inline void volk_16sc_deinterleave_real_32f_aligned16_generic(float* iBuffer, const lv_16sc_t* complexVector, const float scalar, unsigned int num_points){
+  unsigned int number = 0;
+  const int16_t* complexVectorPtr = (const int16_t*)complexVector;
+  float* iBufferPtr = iBuffer;
+  const float invScalar = 1.0 / scalar;
+  for(number = 0; number < num_points; number++){
+    *iBufferPtr++ = ((float)(*complexVectorPtr++)) * invScalar;
+    complexVectorPtr++;
+  }
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+
+
+#endif /* INCLUDED_VOLK_16sc_DEINTERLEAVE_REAL_32f_ALIGNED16_H */
diff --git a/volk/include/volk/volk_16sc_deinterleave_real_8s_aligned16.h b/volk/include/volk/volk_16sc_deinterleave_real_8s_aligned16.h
new file mode 100644
index 000000000..c0d1e941a
--- /dev/null
+++ b/volk/include/volk/volk_16sc_deinterleave_real_8s_aligned16.h
@@ -0,0 +1,83 @@
+#ifndef INCLUDED_VOLK_16sc_DEINTERLEAVE_REAL_8s_ALIGNED16_H
+#define INCLUDED_VOLK_16sc_DEINTERLEAVE_REAL_8s_ALIGNED16_H
+
+#include <inttypes.h>
+#include <stdio.h>
+
+#if LV_HAVE_SSSE3
+#include <tmmintrin.h>
+/*!
+  \brief Deinterleaves the complex 16 bit vector into 8 bit I vector data
+  \param complexVector The complex input vector
+  \param iBuffer The I buffer output data
+  \param num_points The number of complex data values to be deinterleaved
+*/
+static inline void volk_16sc_deinterleave_real_8s_aligned16_ssse3(int8_t* iBuffer, const lv_16sc_t* complexVector, unsigned int num_points){
+  unsigned int number = 0;
+  const int8_t* complexVectorPtr = (int8_t*)complexVector;
+  int8_t* iBufferPtr = iBuffer;
+  __m128i iMoveMask1 = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 13, 12, 9, 8, 5, 4, 1, 0);
+  __m128i iMoveMask2 = _mm_set_epi8(13, 12, 9, 8, 5, 4, 1, 0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80);
+  __m128i complexVal1, complexVal2, complexVal3, complexVal4, iOutputVal;
+
+  unsigned int sixteenthPoints = num_points / 16;
+
+  for(number = 0; number < sixteenthPoints; number++){
+    complexVal1 = _mm_load_si128((__m128i*)complexVectorPtr);  complexVectorPtr += 16;
+    complexVal2 = _mm_load_si128((__m128i*)complexVectorPtr);  complexVectorPtr += 16;
+
+    complexVal3 = _mm_load_si128((__m128i*)complexVectorPtr);  complexVectorPtr += 16;
+    complexVal4 = _mm_load_si128((__m128i*)complexVectorPtr);  complexVectorPtr += 16;
+
+    complexVal1 = _mm_shuffle_epi8(complexVal1, iMoveMask1);
+    complexVal2 = _mm_shuffle_epi8(complexVal2, iMoveMask2);
+
+    complexVal1 = _mm_or_si128(complexVal1, complexVal2);
+
+    complexVal3 = _mm_shuffle_epi8(complexVal3, iMoveMask1);
+    complexVal4 = _mm_shuffle_epi8(complexVal4, iMoveMask2);
+
+    complexVal3 = _mm_or_si128(complexVal3, complexVal4);
+
+
+    complexVal1 = _mm_srai_epi16(complexVal1, 8);
+    complexVal3 = _mm_srai_epi16(complexVal3, 8);
+
+    iOutputVal = _mm_packs_epi16(complexVal1, complexVal3);
+
+    _mm_store_si128((__m128i*)iBufferPtr, iOutputVal);
+
+    iBufferPtr += 16;
+  }
+
+  number = sixteenthPoints * 16;
+  int16_t* int16ComplexVectorPtr = (int16_t*)complexVectorPtr;
+  for(; number < num_points; number++){
+    *iBufferPtr++ = ((int8_t)(*int16ComplexVectorPtr++ / 256));
+    int16ComplexVectorPtr++;
+  }
+}
+#endif /* LV_HAVE_SSSE3 */
+
+#if LV_HAVE_GENERIC
+/*!
+  \brief Deinterleaves the complex 16 bit vector into 8 bit I vector data
+  \param complexVector The complex input vector
+  \param iBuffer The I buffer output data
+  \param num_points The number of complex data values to be deinterleaved
+*/
+static inline void volk_16sc_deinterleave_real_8s_aligned16_generic(int8_t* iBuffer, const lv_16sc_t* complexVector, unsigned int num_points){
+  unsigned int number = 0;
+  const int16_t* complexVectorPtr = (int16_t*)complexVector;
+  int8_t* iBufferPtr = iBuffer;
+  for(number = 0; number < num_points; number++){
+    *iBufferPtr++ = (int8_t)(*complexVectorPtr++ / 256);
+    complexVectorPtr++;
+  }
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+
+
+#endif /* INCLUDED_VOLK_16sc_DEINTERLEAVE_REAL_8s_ALIGNED16_H */
diff --git a/volk/include/volk/volk_16sc_magnitude_16s_aligned16.h b/volk/include/volk/volk_16sc_magnitude_16s_aligned16.h
new file mode 100644
index 000000000..1482ab82e
--- /dev/null
+++ b/volk/include/volk/volk_16sc_magnitude_16s_aligned16.h
@@ -0,0 +1,179 @@
+#ifndef INCLUDED_VOLK_16sc_MAGNITUDE_16s_ALIGNED16_H
+#define INCLUDED_VOLK_16sc_MAGNITUDE_16s_ALIGNED16_H
+
+#include <inttypes.h>
+#include <stdio.h>
+#include <math.h>
+
+#if LV_HAVE_SSE3
+#include <pmmintrin.h>
+/*!
+  \brief Calculates the magnitude of the complexVector and stores the results in the magnitudeVector
+  \param complexVector The vector containing the complex input values
+  \param magnitudeVector The vector containing the real output values
+  \param num_points The number of complex values in complexVector to be calculated and stored into cVector
+*/
+static inline void volk_16sc_magnitude_16s_aligned16_sse3(int16_t* magnitudeVector, const lv_16sc_t* complexVector, unsigned int num_points){
+  unsigned int number = 0;
+  const unsigned int quarterPoints = num_points / 4;
+   
+  const int16_t* complexVectorPtr = (const int16_t*)complexVector;
+  int16_t* magnitudeVectorPtr = magnitudeVector;
+
+  __m128 vScalar = _mm_set_ps1(32768.0);
+  __m128 invScalar = _mm_set_ps1(1.0/32768.0);
+
+  __m128 cplxValue1, cplxValue2, result;
+
+  float inputFloatBuffer[8] __attribute__((aligned(128)));
+  float outputFloatBuffer[4] __attribute__((aligned(128)));
+
+  for(;number < quarterPoints; number++){
+
+    inputFloatBuffer[0] = (float)(complexVectorPtr[0]);
+    inputFloatBuffer[1] = (float)(complexVectorPtr[1]);
+    inputFloatBuffer[2] = (float)(complexVectorPtr[2]);
+    inputFloatBuffer[3] = (float)(complexVectorPtr[3]);
+      
+    inputFloatBuffer[4] = (float)(complexVectorPtr[4]);
+    inputFloatBuffer[5] = (float)(complexVectorPtr[5]);
+    inputFloatBuffer[6] = (float)(complexVectorPtr[6]);
+    inputFloatBuffer[7] = (float)(complexVectorPtr[7]);
+
+    cplxValue1 = _mm_load_ps(&inputFloatBuffer[0]);
+    cplxValue2 = _mm_load_ps(&inputFloatBuffer[4]);
+
+    complexVectorPtr += 8;
+
+    cplxValue1 = _mm_mul_ps(cplxValue1, invScalar);
+    cplxValue2 = _mm_mul_ps(cplxValue2, invScalar);
+
+    cplxValue1 = _mm_mul_ps(cplxValue1, cplxValue1); // Square the values
+    cplxValue2 = _mm_mul_ps(cplxValue2, cplxValue2); // Square the Values
+
+    result = _mm_hadd_ps(cplxValue1, cplxValue2); // Add the I2 and Q2 values
+
+    result = _mm_sqrt_ps(result); // Square root the values
+
+    result = _mm_mul_ps(result, vScalar); // Scale the results
+
+    _mm_store_ps(outputFloatBuffer, result);
+    *magnitudeVectorPtr++ = (int16_t)(outputFloatBuffer[0]);
+    *magnitudeVectorPtr++ = (int16_t)(outputFloatBuffer[1]);
+    *magnitudeVectorPtr++ = (int16_t)(outputFloatBuffer[2]);
+    *magnitudeVectorPtr++ = (int16_t)(outputFloatBuffer[3]);
+  }
+
+  number = quarterPoints * 4;
+  magnitudeVectorPtr = &magnitudeVector[number];
+  complexVectorPtr = (const int16_t*)&complexVector[number];
+  for(; number < num_points; number++){
+    const float val1Real = (float)(*complexVectorPtr++) / 32768.0;
+    const float val1Imag = (float)(*complexVectorPtr++) / 32768.0;
+    const float val1Result = sqrtf((val1Real * val1Real) + (val1Imag * val1Imag)) * 32768.0;
+    *magnitudeVectorPtr++ = (int16_t)(val1Result);
+  }
+}
+#endif /* LV_HAVE_SSE3 */
+
+#if LV_HAVE_SSE
+#include <xmmintrin.h>
+/*!
+  \brief Calculates the magnitude of the complexVector and stores the results in the magnitudeVector
+  \param complexVector The vector containing the complex input values
+  \param magnitudeVector The vector containing the real output values
+  \param num_points The number of complex values in complexVector to be calculated and stored into cVector
+*/
+static inline void volk_16sc_magnitude_16s_aligned16_sse(int16_t* magnitudeVector, const lv_16sc_t* complexVector, unsigned int num_points){
+  unsigned int number = 0;
+  const unsigned int quarterPoints = num_points / 4;
+
+  const int16_t* complexVectorPtr = (const int16_t*)complexVector;
+  int16_t* magnitudeVectorPtr = magnitudeVector;
+
+  __m128 vScalar = _mm_set_ps1(32768.0);
+  __m128 invScalar = _mm_set_ps1(1.0/32768.0);
+
+  __m128 cplxValue1, cplxValue2, iValue, qValue, result;
+
+  float inputFloatBuffer[4] __attribute__((aligned(128)));
+  float outputFloatBuffer[4] __attribute__((aligned(128)));
+
+  for(;number < quarterPoints; number++){
+
+    inputFloatBuffer[0] = (float)(complexVectorPtr[0]);
+    inputFloatBuffer[1] = (float)(complexVectorPtr[1]);
+    inputFloatBuffer[2] = (float)(complexVectorPtr[2]);
+    inputFloatBuffer[3] = (float)(complexVectorPtr[3]);
+      
+    cplxValue1 = _mm_load_ps(inputFloatBuffer);
+    complexVectorPtr += 4;
+
+    inputFloatBuffer[0] = (float)(complexVectorPtr[0]);
+    inputFloatBuffer[1] = (float)(complexVectorPtr[1]);
+    inputFloatBuffer[2] = (float)(complexVectorPtr[2]);
+    inputFloatBuffer[3] = (float)(complexVectorPtr[3]);
+
+    cplxValue2 = _mm_load_ps(inputFloatBuffer);
+    complexVectorPtr += 4;
+
+    cplxValue1 = _mm_mul_ps(cplxValue1, invScalar);
+    cplxValue2 = _mm_mul_ps(cplxValue2, invScalar);
+
+    // Arrange in i1i2i3i4 format
+    iValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2,0,2,0));
+    // Arrange in q1q2q3q4 format
+    qValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(3,1,3,1));
+
+    iValue = _mm_mul_ps(iValue, iValue); // Square the I values
+    qValue = _mm_mul_ps(qValue, qValue); // Square the Q Values
+
+    result = _mm_add_ps(iValue, qValue); // Add the I2 and Q2 values
+
+    result = _mm_sqrt_ps(result); // Square root the values
+
+    result = _mm_mul_ps(result, vScalar); // Scale the results
+
+    _mm_store_ps(outputFloatBuffer, result);
+    *magnitudeVectorPtr++ = (int16_t)(outputFloatBuffer[0]);
+    *magnitudeVectorPtr++ = (int16_t)(outputFloatBuffer[1]);
+    *magnitudeVectorPtr++ = (int16_t)(outputFloatBuffer[2]);
+    *magnitudeVectorPtr++ = (int16_t)(outputFloatBuffer[3]);
+  }
+
+  number = quarterPoints * 4;
+  magnitudeVectorPtr = &magnitudeVector[number];
+  complexVectorPtr = (const int16_t*)&complexVector[number];
+  for(; number < num_points; number++){
+    const float val1Real = (float)(*complexVectorPtr++) / 32768.0;
+    const float val1Imag = (float)(*complexVectorPtr++) / 32768.0;
+    const float val1Result = sqrtf((val1Real * val1Real) + (val1Imag * val1Imag)) * 32768.0;
+    *magnitudeVectorPtr++ = (int16_t)(val1Result);
+  }
+}
+#endif /* LV_HAVE_SSE */
+
+#if LV_HAVE_GENERIC
+/*!
+  \brief Calculates the magnitude of the complexVector and stores the results in the magnitudeVector
+  \param complexVector The vector containing the complex input values
+  \param magnitudeVector The vector containing the real output values
+  \param num_points The number of complex values in complexVector to be calculated and stored into cVector
+*/
+static inline void volk_16sc_magnitude_16s_aligned16_generic(int16_t* magnitudeVector, const lv_16sc_t* complexVector, unsigned int num_points){
+  const int16_t* complexVectorPtr = (const int16_t*)complexVector;
+  int16_t* magnitudeVectorPtr = magnitudeVector;
+  unsigned int number = 0;
+  const float scalar = 32786.0;
+  for(number = 0; number < num_points; number++){
+    float real = ((float)(*complexVectorPtr++)) / scalar;
+    float imag = ((float)(*complexVectorPtr++)) / scalar;
+    *magnitudeVectorPtr++ = (int16_t)(sqrtf((real*real) + (imag*imag)) * scalar);
+  }
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+
+
+#endif /* INCLUDED_VOLK_16sc_MAGNITUDE_16s_ALIGNED16_H */
diff --git a/volk/include/volk/volk_16sc_magnitude_32f_aligned16.h b/volk/include/volk/volk_16sc_magnitude_32f_aligned16.h
new file mode 100644
index 000000000..29c58ceb8
--- /dev/null
+++ b/volk/include/volk/volk_16sc_magnitude_32f_aligned16.h
@@ -0,0 +1,163 @@
+#ifndef INCLUDED_VOLK_16sc_MAGNITUDE_32f_ALIGNED16_H
+#define INCLUDED_VOLK_16sc_MAGNITUDE_32f_ALIGNED16_H
+
+#include <inttypes.h>
+#include <stdio.h>
+#include <math.h>
+
+#if LV_HAVE_SSE3
+#include <pmmintrin.h>
+/*!
+  \brief Calculates the magnitude of the complexVector and stores the results in the magnitudeVector
+  \param complexVector The vector containing the complex input values
+  \param magnitudeVector The vector containing the real output values
+  \param scalar The data value to be divided against each input data value of the input complex vector
+  \param num_points The number of complex values in complexVector to be calculated and stored into cVector
+*/
+static inline void volk_16sc_magnitude_32f_aligned16_sse3(float* magnitudeVector, const lv_16sc_t* complexVector, const float scalar, unsigned int num_points){
+  unsigned int number = 0;
+  const unsigned int quarterPoints = num_points / 4;
+  
+  const int16_t* complexVectorPtr = (const int16_t*)complexVector;
+  float* magnitudeVectorPtr = magnitudeVector;
+
+  __m128 invScalar = _mm_set_ps1(1.0/scalar);
+
+  __m128 cplxValue1, cplxValue2, result;
+
+  float inputFloatBuffer[8] __attribute__((aligned(128)));
+
+  for(;number < quarterPoints; number++){
+
+    inputFloatBuffer[0] = (float)(complexVectorPtr[0]);
+    inputFloatBuffer[1] = (float)(complexVectorPtr[1]);
+    inputFloatBuffer[2] = (float)(complexVectorPtr[2]);
+    inputFloatBuffer[3] = (float)(complexVectorPtr[3]);
+      
+    inputFloatBuffer[4] = (float)(complexVectorPtr[4]);
+    inputFloatBuffer[5] = (float)(complexVectorPtr[5]);
+    inputFloatBuffer[6] = (float)(complexVectorPtr[6]);
+    inputFloatBuffer[7] = (float)(complexVectorPtr[7]);
+
+    cplxValue1 = _mm_load_ps(&inputFloatBuffer[0]);
+    cplxValue2 = _mm_load_ps(&inputFloatBuffer[4]);
+
+    complexVectorPtr += 8;
+
+    cplxValue1 = _mm_mul_ps(cplxValue1, invScalar);
+    cplxValue2 = _mm_mul_ps(cplxValue2, invScalar);
+
+    cplxValue1 = _mm_mul_ps(cplxValue1, cplxValue1); // Square the values
+    cplxValue2 = _mm_mul_ps(cplxValue2, cplxValue2); // Square the Values
+
+    result = _mm_hadd_ps(cplxValue1, cplxValue2); // Add the I2 and Q2 values
+
+    result = _mm_sqrt_ps(result); // Square root the values
+
+    _mm_store_ps(magnitudeVectorPtr, result);
+      
+    magnitudeVectorPtr += 4;
+  }
+
+  number = quarterPoints * 4;
+  magnitudeVectorPtr = &magnitudeVector[number];
+  complexVectorPtr = (const int16_t*)&complexVector[number];
+  for(; number < num_points; number++){
+    float val1Real = (float)(*complexVectorPtr++) / scalar;
+    float val1Imag = (float)(*complexVectorPtr++) / scalar;
+    *magnitudeVectorPtr++ = sqrtf((val1Real * val1Real) + (val1Imag * val1Imag));
+  }
+}
+#endif /* LV_HAVE_SSE3 */
+
+#if LV_HAVE_SSE
+#include <xmmintrin.h>
+/*!
+  \brief Calculates the magnitude of the complexVector and stores the results in the magnitudeVector
+  \param complexVector The vector containing the complex input values
+  \param magnitudeVector The vector containing the real output values
+  \param scalar The data value to be divided against each input data value of the input complex vector
+  \param num_points The number of complex values in complexVector to be calculated and stored into cVector
+*/
+static inline void volk_16sc_magnitude_32f_aligned16_sse(float* magnitudeVector, const lv_16sc_t* complexVector, const float scalar, unsigned int num_points){
+  unsigned int number = 0;
+  const unsigned int quarterPoints = num_points / 4;
+
+  const int16_t* complexVectorPtr = (const int16_t*)complexVector;
+  float* magnitudeVectorPtr = magnitudeVector;
+
+  const float iScalar = 1.0 / scalar;
+  __m128 invScalar = _mm_set_ps1(iScalar);
+
+  __m128 cplxValue1, cplxValue2, result;
+
+  float inputFloatBuffer[8] __attribute__((aligned(128)));
+
+  for(;number < quarterPoints; number++){
+
+    inputFloatBuffer[0] = (float)(complexVectorPtr[0]);
+    inputFloatBuffer[1] = (float)(complexVectorPtr[1]);
+    inputFloatBuffer[2] = (float)(complexVectorPtr[2]);
+    inputFloatBuffer[3] = (float)(complexVectorPtr[3]);
+      
+    inputFloatBuffer[4] = (float)(complexVectorPtr[4]);
+    inputFloatBuffer[5] = (float)(complexVectorPtr[5]);
+    inputFloatBuffer[6] = (float)(complexVectorPtr[6]);
+    inputFloatBuffer[7] = (float)(complexVectorPtr[7]);
+
+    cplxValue1 = _mm_load_ps(&inputFloatBuffer[0]);
+    cplxValue2 = _mm_load_ps(&inputFloatBuffer[4]);
+
+    complexVectorPtr += 8;
+
+    cplxValue1 = _mm_mul_ps(cplxValue1, invScalar);
+    cplxValue2 = _mm_mul_ps(cplxValue2, invScalar);
+
+    cplxValue1 = _mm_mul_ps(cplxValue1, cplxValue1); // Square the values
+    cplxValue2 = _mm_mul_ps(cplxValue2, cplxValue2); // Square the Values
+
+    result = _mm_hadd_ps(cplxValue1, cplxValue2); // Add the I2 and Q2 values
+
+    result = _mm_sqrt_ps(result); // Square root the values
+
+    _mm_store_ps(magnitudeVectorPtr, result);
+      
+    magnitudeVectorPtr += 4;
+  }
+
+  number = quarterPoints * 4;
+  magnitudeVectorPtr = &magnitudeVector[number];
+  complexVectorPtr = (const int16_t*)&complexVector[number];
+  for(; number < num_points; number++){
+    float val1Real = (float)(*complexVectorPtr++) * iScalar;
+    float val1Imag = (float)(*complexVectorPtr++) * iScalar;
+    *magnitudeVectorPtr++ = sqrtf((val1Real * val1Real) + (val1Imag * val1Imag));
+  }
+}
+#endif /* LV_HAVE_SSE */
+
+#if LV_HAVE_GENERIC
+/*!
+  \brief Calculates the magnitude of the complexVector and stores the results in the magnitudeVector
+  \param complexVector The vector containing the complex input values
+  \param magnitudeVector The vector containing the real output values
+  \param scalar The data value to be divided against each input data value of the input complex vector
+  \param num_points The number of complex values in complexVector to be calculated and stored into cVector
+*/
+static inline void volk_16sc_magnitude_32f_aligned16_generic(float* magnitudeVector, const lv_16sc_t* complexVector, const float scalar, unsigned int num_points){
+  const int16_t* complexVectorPtr = (const int16_t*)complexVector;
+  float* magnitudeVectorPtr = magnitudeVector;
+  unsigned int number = 0;
+  const float invScalar = 1.0 / scalar;
+  for(number = 0; number < num_points; number++){
+    float real = ( (float) (*complexVectorPtr++)) * invScalar;
+    float imag = ( (float) (*complexVectorPtr++)) * invScalar;
+    *magnitudeVectorPtr++ = sqrtf((real*real) + (imag*imag));
+  }
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+
+
+#endif /* INCLUDED_VOLK_16sc_MAGNITUDE_32f_ALIGNED16_H */
diff --git a/volk/include/volk/volk_16u_byteswap_aligned16.h b/volk/include/volk/volk_16u_byteswap_aligned16.h
new file mode 100644
index 000000000..698e958e4
--- /dev/null
+++ b/volk/include/volk/volk_16u_byteswap_aligned16.h
@@ -0,0 +1,65 @@
+#ifndef INCLUDED_VOLK_16u_BYTESWAP_ALIGNED16_H
+#define INCLUDED_VOLK_16u_BYTESWAP_ALIGNED16_H
+
+#include <inttypes.h>
+#include <stdio.h>
+
+#if LV_HAVE_SSE2
+#include <emmintrin.h>
+
+/*!
+  \brief Byteswaps (in-place) an aligned vector of int16_t's.
+  \param intsToSwap The vector of data to byte swap
+  \param numDataPoints The number of data points
+*/
+static inline void volk_16u_byteswap_aligned16_sse2(uint16_t* intsToSwap, unsigned int num_points){
+  unsigned int number = 0;
+  uint16_t* inputPtr = intsToSwap;
+  __m128i input, left, right, output;
+
+  const unsigned int eighthPoints = num_points / 8;
+  for(;number < eighthPoints; number++){
+    // Load the 16t values, increment inputPtr later since we're doing it in-place.
+    input = _mm_load_si128((__m128i*)inputPtr);
+    // Do the two shifts
+    left = _mm_slli_epi16(input, 8);
+    right = _mm_srli_epi16(input, 8);
+    // Or the left and right halves together
+    output = _mm_or_si128(left, right);
+    // Store the results
+    _mm_store_si128((__m128i*)inputPtr, output);
+    inputPtr += 8;
+  }
+
+  
+  // Byteswap any remaining points:
+  number = eighthPoints*8;  
+  for(; number < num_points; number++){
+    uint16_t outputVal = *inputPtr;
+    outputVal = (((outputVal >> 8) & 0xff) | ((outputVal << 8) & 0xff00));
+    *inputPtr = outputVal;
+    inputPtr++;
+  }
+}
+#endif /* LV_HAVE_SSE2 */
+
+#if LV_HAVE_GENERIC
+/*!
+  \brief Byteswaps (in-place) an aligned vector of int16_t's.
+  \param intsToSwap The vector of data to byte swap
+  \param numDataPoints The number of data points
+*/
+static inline void volk_16u_byteswap_aligned16_generic(uint16_t* intsToSwap, unsigned int num_points){
+  unsigned int point;
+  uint16_t* inputPtr = intsToSwap;
+  for(point = 0; point < num_points; point++){
+    uint16_t output = *inputPtr;
+    output = (((output >> 8) & 0xff) | ((output << 8) & 0xff00));
+    *inputPtr = output;
+    inputPtr++;
+  }
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+#endif /* INCLUDED_VOLK_16u_BYTESWAP_ALIGNED16_H */
diff --git a/volk/include/volk/volk_32f_accumulator_aligned16.h b/volk/include/volk/volk_32f_accumulator_aligned16.h
new file mode 100644
index 000000000..7e395cf50
--- /dev/null
+++ b/volk/include/volk/volk_32f_accumulator_aligned16.h
@@ -0,0 +1,67 @@
+#ifndef INCLUDED_VOLK_32f_ACCUMULATOR_ALIGNED16_H
+#define INCLUDED_VOLK_32f_ACCUMULATOR_ALIGNED16_H
+
+#include <inttypes.h>
+#include <stdio.h>
+
+#if LV_HAVE_SSE
+#include <xmmintrin.h>
+/*!
+  \brief Accumulates the values in the input buffer
+  \param result The accumulated result
+  \param inputBuffer The buffer of data to be accumulated
+  \param num_points The number of values in inputBuffer to be accumulated
+*/
+static inline void volk_32f_accumulator_aligned16_sse(float* result, const float* inputBuffer, unsigned int num_points){
+  float returnValue = 0;
+  unsigned int number = 0;
+  const unsigned int quarterPoints = num_points / 4;
+
+  const float* aPtr = inputBuffer;
+  float tempBuffer[4] __attribute__((aligned(128)));
+  
+  __m128 accumulator = _mm_setzero_ps();
+  __m128 aVal = _mm_setzero_ps();
+
+  for(;number < quarterPoints; number++){
+    aVal = _mm_load_ps(aPtr);
+    accumulator = _mm_add_ps(accumulator, aVal); 
+    aPtr += 4;
+  }
+  _mm_store_ps(tempBuffer,accumulator); // Store the results back into the C container
+  returnValue = tempBuffer[0];
+  returnValue += tempBuffer[1];
+  returnValue += tempBuffer[2];
+  returnValue += tempBuffer[3];
+  
+  number = quarterPoints * 4;
+  for(;number < num_points; number++){
+    returnValue += (*aPtr++);
+  }
+  *result = returnValue;
+}
+#endif /* LV_HAVE_SSE */
+
+#if LV_HAVE_GENERIC
+/*!
+  \brief Accumulates the values in the input buffer
+  \param result The accumulated result
+  \param inputBuffer The buffer of data to be accumulated
+  \param num_points The number of values in inputBuffer to be accumulated
+*/
+static inline void volk_32f_accumulator_aligned16_generic(float* result, const float* inputBuffer, unsigned int num_points){
+  const float* aPtr = inputBuffer;
+  unsigned int number = 0;
+  float returnValue = 0;
+
+  for(;number < num_points; number++){
+    returnValue += (*aPtr++);
+  }
+  *result = returnValue;
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+
+
+#endif /* INCLUDED_VOLK_32f_ACCUMULATOR_ALIGNED16_H */
diff --git a/volk/include/volk/volk_32f_add_aligned16.h b/volk/include/volk/volk_32f_add_aligned16.h
new file mode 100644
index 000000000..721c60fd6
--- /dev/null
+++ b/volk/include/volk/volk_32f_add_aligned16.h
@@ -0,0 +1,69 @@
+#ifndef INCLUDED_VOLK_32f_ADD_ALIGNED16_H
+#define INCLUDED_VOLK_32f_ADD_ALIGNED16_H
+
+#include <inttypes.h>
+#include <stdio.h>
+
+#if LV_HAVE_SSE
+#include <xmmintrin.h>
+/*!
+  \brief Adds the two input vectors and store their results in the third vector
+  \param cVector The vector where the results will be stored
+  \param aVector One of the vectors to be added
+  \param bVector One of the vectors to be added
+  \param num_points The number of values in aVector and bVector to be added together and stored into cVector
+*/
+static inline void volk_32f_add_aligned16_sse(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){
+    unsigned int number = 0;
+    const unsigned int quarterPoints = num_points / 4;
+
+    float* cPtr = cVector;
+    const float* aPtr = aVector;
+    const float* bPtr=  bVector;
+
+    __m128 aVal, bVal, cVal;
+    for(;number < quarterPoints; number++){
+      
+      aVal = _mm_load_ps(aPtr); 
+      bVal = _mm_load_ps(bPtr);
+      
+      cVal = _mm_add_ps(aVal, bVal); 
+      
+      _mm_store_ps(cPtr,cVal); // Store the results back into the C container
+
+      aPtr += 4;
+      bPtr += 4;
+      cPtr += 4;
+    }
+
+    number = quarterPoints * 4;
+    for(;number < num_points; number++){
+      *cPtr++ = (*aPtr++) + (*bPtr++);
+    }
+}
+#endif /* LV_HAVE_SSE */
+
+#if LV_HAVE_GENERIC
+/*!
+  \brief Adds the two input vectors and store their results in the third vector
+  \param cVector The vector where the results will be stored
+  \param aVector One of the vectors to be added
+  \param bVector One of the vectors to be added
+  \param num_points The number of values in aVector and bVector to be added together and stored into cVector
+*/
+static inline void volk_32f_add_aligned16_generic(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){
+    float* cPtr = cVector;
+    const float* aPtr = aVector;
+    const float* bPtr=  bVector;
+    unsigned int number = 0;
+
+    for(number = 0; number < num_points; number++){
+      *cPtr++ = (*aPtr++) + (*bPtr++);
+    }
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+
+
+#endif /* INCLUDED_VOLK_32f_ADD_ALIGNED16_H */
diff --git a/volk/include/volk/volk_32f_calc_spectral_noise_floor_aligned16.h b/volk/include/volk/volk_32f_calc_spectral_noise_floor_aligned16.h
new file mode 100644
index 000000000..ff917525f
--- /dev/null
+++ b/volk/include/volk/volk_32f_calc_spectral_noise_floor_aligned16.h
@@ -0,0 +1,167 @@
+#ifndef INCLUDED_VOLK_32f_CALC_SPECTRAL_NOISE_FLOOR_ALIGNED16_H
+#define INCLUDED_VOLK_32f_CALC_SPECTRAL_NOISE_FLOOR_ALIGNED16_H
+
+#include <inttypes.h>
+#include <stdio.h>
+
+#if LV_HAVE_SSE
+#include <xmmintrin.h>
+/*!
+  \brief Calculates the spectral noise floor of an input power spectrum
+
+  Calculates the spectral noise floor of an input power spectrum by determining the mean of the input power spectrum, then recalculating the mean excluding any power spectrum values that exceed the mean by the spectralExclusionValue (in dB).  Provides a rough estimation of the signal noise floor.
+
+  \param realDataPoints The input power spectrum
+  \param num_points The number of data points in the input power spectrum vector
+  \param spectralExclusionValue The number of dB above the noise floor that a data point must be to be excluded from the noise floor calculation - default value is 20
+  \param noiseFloorAmplitude The noise floor of the input spectrum, in dB
+*/
+static inline void volk_32f_calc_spectral_noise_floor_aligned16_sse(float* noiseFloorAmplitude, const float* realDataPoints, const float spectralExclusionValue, const unsigned int num_points){
+  unsigned int number = 0;
+  const unsigned int quarterPoints = num_points / 4;
+
+  const float* dataPointsPtr = realDataPoints;
+  float avgPointsVector[4] __attribute__((aligned(128)));
+    
+  __m128 dataPointsVal;
+  __m128 avgPointsVal = _mm_setzero_ps();
+  // Calculate the sum (for mean) for all points
+  for(; number < quarterPoints; number++){
+
+    dataPointsVal = _mm_load_ps(dataPointsPtr);
+
+    dataPointsPtr += 4;
+
+    avgPointsVal = _mm_add_ps(avgPointsVal, dataPointsVal);
+  }
+
+  _mm_store_ps(avgPointsVector, avgPointsVal);
+
+  float sumMean = 0.0;
+  sumMean += avgPointsVector[0];
+  sumMean += avgPointsVector[1];
+  sumMean += avgPointsVector[2];
+  sumMean += avgPointsVector[3];
+
+  number = quarterPoints * 4;
+  for(;number < num_points; number++){
+    sumMean += realDataPoints[number];
+  }
+
+  // calculate the spectral mean
+  // +20 because for the comparison below we only want to throw out bins
+  // that are significantly higher (and would, thus, affect the mean more
+  const float meanAmplitude = (sumMean / ((float)num_points)) + spectralExclusionValue;
+
+  dataPointsPtr = realDataPoints; // Reset the dataPointsPtr
+  __m128 vMeanAmplitudeVector = _mm_set_ps1(meanAmplitude);
+  __m128 vOnesVector = _mm_set_ps1(1.0);
+  __m128 vValidBinCount = _mm_setzero_ps();
+  avgPointsVal = _mm_setzero_ps();
+  __m128 compareMask;
+  number = 0;
+  // Calculate the sum (for mean) for any points which do NOT exceed the mean amplitude
+  for(; number < quarterPoints; number++){
+
+    dataPointsVal = _mm_load_ps(dataPointsPtr);
+
+    dataPointsPtr += 4;
+
+    // Identify which items do not exceed the mean amplitude
+    compareMask = _mm_cmple_ps(dataPointsVal, vMeanAmplitudeVector);
+
+    // Mask off the items that exceed the mean amplitude and add the avg Points that do not exceed the mean amplitude
+    avgPointsVal = _mm_add_ps(avgPointsVal, _mm_and_ps(compareMask, dataPointsVal));
+      
+    // Count the number of bins which do not exceed the mean amplitude
+    vValidBinCount = _mm_add_ps(vValidBinCount, _mm_and_ps(compareMask, vOnesVector));
+  }
+    
+  // Calculate the mean from the remaining data points
+  _mm_store_ps(avgPointsVector, avgPointsVal);
+
+  sumMean = 0.0;
+  sumMean += avgPointsVector[0];
+  sumMean += avgPointsVector[1];
+  sumMean += avgPointsVector[2];
+  sumMean += avgPointsVector[3];
+
+  // Calculate the number of valid bins from the remaning count
+  float validBinCountVector[4] __attribute__((aligned(128)));
+  _mm_store_ps(validBinCountVector, vValidBinCount);
+
+  float validBinCount = 0;
+  validBinCount += validBinCountVector[0];
+  validBinCount += validBinCountVector[1];
+  validBinCount += validBinCountVector[2];
+  validBinCount += validBinCountVector[3];
+
+  number = quarterPoints * 4;
+  for(;number < num_points; number++){
+    if(realDataPoints[number] <= meanAmplitude){
+      sumMean += realDataPoints[number];
+      validBinCount += 1.0;
+    }
+  }
+    
+  float localNoiseFloorAmplitude = 0;
+  if(validBinCount > 0.0){
+    localNoiseFloorAmplitude = sumMean / validBinCount;
+  }
+  else{
+    localNoiseFloorAmplitude = meanAmplitude; // For the odd case that all the amplitudes are equal...
+  }
+
+  *noiseFloorAmplitude = localNoiseFloorAmplitude;
+}
+#endif /* LV_HAVE_SSE */
+
+#if LV_HAVE_GENERIC
+/*!
+  \brief Calculates the spectral noise floor of an input power spectrum
+
+  Calculates the spectral noise floor of an input power spectrum by determining the mean of the input power spectrum, then recalculating the mean excluding any power spectrum values that exceed the mean by the spectralExclusionValue (in dB).  Provides a rough estimation of the signal noise floor.
+
+  \param realDataPoints The input power spectrum
+  \param num_points The number of data points in the input power spectrum vector
+  \param spectralExclusionValue The number of dB above the noise floor that a data point must be to be excluded from the noise floor calculation - default value is 20
+  \param noiseFloorAmplitude The noise floor of the input spectrum, in dB
+*/
+static inline void volk_32f_calc_spectral_noise_floor_aligned16_generic(float* noiseFloorAmplitude, const float* realDataPoints, const float spectralExclusionValue, const unsigned int num_points){
+  float sumMean = 0.0;
+  unsigned int number;
+  // find the sum (for mean), etc
+  for(number = 0; number < num_points; number++){
+    // sum (for mean)
+    sumMean += realDataPoints[number];
+  }
+
+  // calculate the spectral mean
+  // +20 because for the comparison below we only want to throw out bins
+  // that are significantly higher (and would, thus, affect the mean more)
+  const float meanAmplitude = (sumMean / num_points) + spectralExclusionValue;
+
+  // now throw out any bins higher than the mean
+  sumMean = 0.0;
+  unsigned int newNumDataPoints = num_points;
+  for(number = 0; number < num_points; number++){
+    if (realDataPoints[number] <= meanAmplitude)
+      sumMean += realDataPoints[number];
+    else
+      newNumDataPoints--;
+  }
+
+  float localNoiseFloorAmplitude = 0.0;
+  if (newNumDataPoints == 0)             // in the odd case that all
+    localNoiseFloorAmplitude = meanAmplitude; // amplitudes are equal!
+  else
+    localNoiseFloorAmplitude = sumMean / ((float)newNumDataPoints);
+
+  *noiseFloorAmplitude = localNoiseFloorAmplitude;
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+
+
+#endif /* INCLUDED_VOLK_32f_CALC_SPECTRAL_NOISE_FLOOR_ALIGNED16_H */
diff --git a/volk/include/volk/volk_32f_convert_16s_aligned16.h b/volk/include/volk/volk_32f_convert_16s_aligned16.h
new file mode 100644
index 000000000..7fbabd9c3
--- /dev/null
+++ b/volk/include/volk/volk_32f_convert_16s_aligned16.h
@@ -0,0 +1,110 @@
+#ifndef INCLUDED_VOLK_32f_CONVERT_16s_ALIGNED16_H
+#define INCLUDED_VOLK_32f_CONVERT_16s_ALIGNED16_H
+
+#include <inttypes.h>
+#include <stdio.h>
+
+#if LV_HAVE_SSE2
+#include <emmintrin.h>
+  /*!
+    \brief Multiplies each point in the input buffer by the scalar value, then converts the result into a 16 bit integer value
+    \param inputVector The floating point input data buffer
+    \param outputVector The 16 bit output data buffer
+    \param scalar The value multiplied against each point in the input buffer
+    \param num_points The number of data values to be converted
+  */
+static inline void volk_32f_convert_16s_aligned16_sse2(int16_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){
+  unsigned int number = 0;
+
+  const unsigned int eighthPoints = num_points / 8;
+    
+  const float* inputVectorPtr = (const float*)inputVector;
+  int16_t* outputVectorPtr = outputVector;
+  __m128 vScalar = _mm_set_ps1(scalar);
+  __m128 inputVal1, inputVal2;
+  __m128i intInputVal1, intInputVal2;
+
+  for(;number < eighthPoints; number++){
+    inputVal1 = _mm_load_ps(inputVectorPtr); inputVectorPtr += 4;
+    inputVal2 = _mm_load_ps(inputVectorPtr); inputVectorPtr += 4;
+
+    intInputVal1 = _mm_cvtps_epi32(_mm_mul_ps(inputVal1, vScalar));
+    intInputVal2 = _mm_cvtps_epi32(_mm_mul_ps(inputVal2, vScalar));
+    
+    intInputVal1 = _mm_packs_epi32(intInputVal1, intInputVal2);
+
+    _mm_store_si128((__m128i*)outputVectorPtr, intInputVal1);
+    outputVectorPtr += 8;
+  }
+
+  number = eighthPoints * 8;    
+  for(; number < num_points; number++){
+    *outputVectorPtr++ = (int16_t)(*inputVectorPtr++ * scalar);
+  }
+}
+#endif /* LV_HAVE_SSE2 */
+
+#if LV_HAVE_SSE
+#include <xmmintrin.h>
+  /*!
+    \brief Multiplies each point in the input buffer by the scalar value, then converts the result into a 16 bit integer value
+    \param inputVector The floating point input data buffer
+    \param outputVector The 16 bit output data buffer
+    \param scalar The value multiplied against each point in the input buffer
+    \param num_points The number of data values to be converted
+  */
+static inline void volk_32f_convert_16s_aligned16_sse(int16_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){
+  unsigned int number = 0;
+
+  const unsigned int quarterPoints = num_points / 4;
+    
+  const float* inputVectorPtr = (const float*)inputVector;
+  int16_t* outputVectorPtr = outputVector;
+  __m128 vScalar = _mm_set_ps1(scalar);
+  __m128 ret;
+
+  float outputFloatBuffer[4] __attribute__((aligned(128)));
+
+  for(;number < quarterPoints; number++){
+    ret = _mm_load_ps(inputVectorPtr);
+    inputVectorPtr += 4;
+
+    ret = _mm_mul_ps(ret, vScalar);
+
+    _mm_store_ps(outputFloatBuffer, ret);
+    *outputVectorPtr++ = (int16_t)(outputFloatBuffer[0]);
+    *outputVectorPtr++ = (int16_t)(outputFloatBuffer[1]);
+    *outputVectorPtr++ = (int16_t)(outputFloatBuffer[2]);
+    *outputVectorPtr++ = (int16_t)(outputFloatBuffer[3]);
+  }
+
+  number = quarterPoints * 4;    
+  for(; number < num_points; number++){
+    *outputVectorPtr++ = (int16_t)(*inputVectorPtr++ * scalar);
+  }
+}
+#endif /* LV_HAVE_SSE */
+
+#ifdef LV_HAVE_GENERIC
+  /*!
+    \brief Multiplies each point in the input buffer by the scalar value, then converts the result into a 16 bit integer value
+    \param inputVector The floating point input data buffer
+    \param outputVector The 16 bit output data buffer
+    \param scalar The value multiplied against each point in the input buffer
+    \param num_points The number of data values to be converted
+  */
+static inline void volk_32f_convert_16s_aligned16_generic(int16_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){
+  int16_t* outputVectorPtr = outputVector;
+  const float* inputVectorPtr = inputVector;
+  unsigned int number = 0;
+
+  for(number = 0; number < num_points; number++){
+    *outputVectorPtr++ = ((int16_t)(*inputVectorPtr++ * scalar));
+  }
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+
+
+#endif /* INCLUDED_VOLK_32f_CONVERT_16s_ALIGNED16_H */
diff --git a/volk/include/volk/volk_32f_convert_16s_unaligned16.h b/volk/include/volk/volk_32f_convert_16s_unaligned16.h
new file mode 100644
index 000000000..d2bbdf13a
--- /dev/null
+++ b/volk/include/volk/volk_32f_convert_16s_unaligned16.h
@@ -0,0 +1,113 @@
+#ifndef INCLUDED_VOLK_32f_CONVERT_16s_UNALIGNED16_H
+#define INCLUDED_VOLK_32f_CONVERT_16s_UNALIGNED16_H
+
+#include <inttypes.h>
+#include <stdio.h>
+
+#if LV_HAVE_SSE2
+#include <emmintrin.h>
+  /*!
+    \brief Multiplies each point in the input buffer by the scalar value, then converts the result into a 16 bit integer value
+    \param inputVector The floating point input data buffer
+    \param outputVector The 16 bit output data buffer
+    \param scalar The value multiplied against each point in the input buffer
+    \param num_points The number of data values to be converted
+    \note Input buffer does NOT need to be properly aligned
+  */
+static inline void volk_32f_convert_16s_unaligned16_sse2(int16_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){
+  unsigned int number = 0;
+
+  const unsigned int eighthPoints = num_points / 8;
+    
+  const float* inputVectorPtr = (const float*)inputVector;
+  int16_t* outputVectorPtr = outputVector;
+  __m128 vScalar = _mm_set_ps1(scalar);
+  __m128 inputVal1, inputVal2;
+  __m128i intInputVal1, intInputVal2;
+
+  for(;number < eighthPoints; number++){
+    inputVal1 = _mm_loadu_ps(inputVectorPtr); inputVectorPtr += 4;
+    inputVal2 = _mm_loadu_ps(inputVectorPtr); inputVectorPtr += 4;
+
+    intInputVal1 = _mm_cvtps_epi32(_mm_mul_ps(inputVal1, vScalar));
+    intInputVal2 = _mm_cvtps_epi32(_mm_mul_ps(inputVal2, vScalar));
+    
+    intInputVal1 = _mm_packs_epi32(intInputVal1, intInputVal2);
+
+    _mm_storeu_si128((__m128i*)outputVectorPtr, intInputVal1);
+    outputVectorPtr += 8;
+  }
+
+  number = eighthPoints * 8;    
+  for(; number < num_points; number++){
+    outputVector[number] = (int16_t)(inputVector[number] * scalar);
+  }
+}
+#endif /* LV_HAVE_SSE2 */
+
+#if LV_HAVE_SSE
+#include <xmmintrin.h>
+  /*!
+    \brief Multiplies each point in the input buffer by the scalar value, then converts the result into a 16 bit integer value
+    \param inputVector The floating point input data buffer
+    \param outputVector The 16 bit output data buffer
+    \param scalar The value multiplied against each point in the input buffer
+    \param num_points The number of data values to be converted
+    \note Input buffer does NOT need to be properly aligned
+  */
+static inline void volk_32f_convert_16s_unaligned16_sse(int16_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){
+  unsigned int number = 0;
+
+  const unsigned int quarterPoints = num_points / 4;
+    
+  const float* inputVectorPtr = (const float*)inputVector;
+  int16_t* outputVectorPtr = outputVector;
+  __m128 vScalar = _mm_set_ps1(scalar);
+  __m128 ret;
+
+  float outputFloatBuffer[4] __attribute__((aligned(128)));
+
+  for(;number < quarterPoints; number++){
+    ret = _mm_loadu_ps(inputVectorPtr);
+    inputVectorPtr += 4;
+
+    ret = _mm_mul_ps(ret, vScalar);
+
+    _mm_store_ps(outputFloatBuffer, ret);
+    *outputVectorPtr++ = (int16_t)(outputFloatBuffer[0]);
+    *outputVectorPtr++ = (int16_t)(outputFloatBuffer[1]);
+    *outputVectorPtr++ = (int16_t)(outputFloatBuffer[2]);
+    *outputVectorPtr++ = (int16_t)(outputFloatBuffer[3]);
+  }
+
+  number = quarterPoints * 4;    
+  for(; number < num_points; number++){
+    outputVector[number] = (int16_t)(inputVector[number] * scalar);
+  }
+}
+#endif /* LV_HAVE_SSE */
+
+#ifdef LV_HAVE_GENERIC
+  /*!
+    \brief Multiplies each point in the input buffer by the scalar value, then converts the result into a 16 bit integer value
+    \param inputVector The floating point input data buffer
+    \param outputVector The 16 bit output data buffer
+    \param scalar The value multiplied against each point in the input buffer
+    \param num_points The number of data values to be converted
+    \note Input buffer does NOT need to be properly aligned
+  */
+static inline void volk_32f_convert_16s_unaligned16_generic(int16_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){
+  int16_t* outputVectorPtr = outputVector;
+  const float* inputVectorPtr = inputVector;
+  unsigned int number = 0;
+
+  for(number = 0; number < num_points; number++){
+    *outputVectorPtr++ = ((int16_t)(*inputVectorPtr++  * scalar));
+  }
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+
+
+#endif /* INCLUDED_VOLK_32f_CONVERT_16s_UNALIGNED16_H */
diff --git a/volk/include/volk/volk_32f_convert_32s_aligned16.h b/volk/include/volk/volk_32f_convert_32s_aligned16.h
new file mode 100644
index 000000000..011ef5d0e
--- /dev/null
+++ b/volk/include/volk/volk_32f_convert_32s_aligned16.h
@@ -0,0 +1,106 @@
+#ifndef INCLUDED_VOLK_32f_CONVERT_32s_ALIGNED16_H
+#define INCLUDED_VOLK_32f_CONVERT_32s_ALIGNED16_H
+
+#include <inttypes.h>
+#include <stdio.h>
+
+#if LV_HAVE_SSE2
+#include <emmintrin.h>
+  /*!
+    \brief Multiplies each point in the input buffer by the scalar value, then converts the result into a 32 bit integer value
+    \param inputVector The floating point input data buffer
+    \param outputVector The 32 bit output data buffer
+    \param scalar The value multiplied against each point in the input buffer
+    \param num_points The number of data values to be converted
+  */
+static inline void volk_32f_convert_32s_aligned16_sse2(int32_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){
+  unsigned int number = 0;
+
+  const unsigned int quarterPoints = num_points / 4;
+    
+  const float* inputVectorPtr = (const float*)inputVector;
+  int32_t* outputVectorPtr = outputVector;
+  __m128 vScalar = _mm_set_ps1(scalar);
+  __m128 inputVal1;
+  __m128i intInputVal1;
+
+  for(;number < quarterPoints; number++){
+    inputVal1 = _mm_load_ps(inputVectorPtr); inputVectorPtr += 4;
+
+    intInputVal1 = _mm_cvtps_epi32(_mm_mul_ps(inputVal1, vScalar));
+
+    _mm_store_si128((__m128i*)outputVectorPtr, intInputVal1);
+    outputVectorPtr += 4;
+  }
+
+  number = quarterPoints * 4;    
+  for(; number < num_points; number++){
+    outputVector[number] = (int32_t)(inputVector[number] * scalar);
+  }
+}
+#endif /* LV_HAVE_SSE2 */
+
+#if LV_HAVE_SSE
+#include <xmmintrin.h>
+  /*!
+    \brief Multiplies each point in the input buffer by the scalar value, then converts the result into a 32 bit integer value
+    \param inputVector The floating point input data buffer
+    \param outputVector The 32 bit output data buffer
+    \param scalar The value multiplied against each point in the input buffer
+    \param num_points The number of data values to be converted
+  */
+static inline void volk_32f_convert_32s_aligned16_sse(int32_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){
+  unsigned int number = 0;
+
+  const unsigned int quarterPoints = num_points / 4;
+    
+  const float* inputVectorPtr = (const float*)inputVector;
+  int32_t* outputVectorPtr = outputVector;
+  __m128 vScalar = _mm_set_ps1(scalar);
+  __m128 ret;
+
+  float outputFloatBuffer[4] __attribute__((aligned(128)));
+
+  for(;number < quarterPoints; number++){
+    ret = _mm_load_ps(inputVectorPtr);
+    inputVectorPtr += 4;
+
+    ret = _mm_mul_ps(ret, vScalar);
+
+    _mm_store_ps(outputFloatBuffer, ret);
+    *outputVectorPtr++ = (int32_t)(outputFloatBuffer[0]);
+    *outputVectorPtr++ = (int32_t)(outputFloatBuffer[1]);
+    *outputVectorPtr++ = (int32_t)(outputFloatBuffer[2]);
+    *outputVectorPtr++ = (int32_t)(outputFloatBuffer[3]);
+  }
+
+  number = quarterPoints * 4;    
+  for(; number < num_points; number++){
+    outputVector[number] = (int32_t)(inputVector[number] * scalar);
+  }
+}
+#endif /* LV_HAVE_SSE */
+
+#ifdef LV_HAVE_GENERIC
+  /*!
+    \brief Multiplies each point in the input buffer by the scalar value, then converts the result into a 32 bit integer value
+    \param inputVector The floating point input data buffer
+    \param outputVector The 32 bit output data buffer
+    \param scalar The value multiplied against each point in the input buffer
+    \param num_points The number of data values to be converted
+  */
+static inline void volk_32f_convert_32s_aligned16_generic(int32_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){
+  int32_t* outputVectorPtr = outputVector;
+  const float* inputVectorPtr = inputVector;
+  unsigned int number = 0;
+
+  for(number = 0; number < num_points; number++){
+    *outputVectorPtr++ = ((int32_t)(*inputVectorPtr++  * scalar));
+  }
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+
+
+#endif /* INCLUDED_VOLK_32f_CONVERT_32s_ALIGNED16_H */
diff --git a/volk/include/volk/volk_32f_convert_32s_unaligned16.h b/volk/include/volk/volk_32f_convert_32s_unaligned16.h
new file mode 100644
index 000000000..a6df826c7
--- /dev/null
+++ b/volk/include/volk/volk_32f_convert_32s_unaligned16.h
@@ -0,0 +1,109 @@
+#ifndef INCLUDED_VOLK_32f_CONVERT_32s_UNALIGNED16_H
+#define INCLUDED_VOLK_32f_CONVERT_32s_UNALIGNED16_H
+
+#include <inttypes.h>
+#include <stdio.h>
+
+#if LV_HAVE_SSE2
+#include <emmintrin.h>
+  /*!
+    \brief Multiplies each point in the input buffer by the scalar value, then converts the result into a 32 bit integer value
+    \param inputVector The floating point input data buffer
+    \param outputVector The 32 bit output data buffer
+    \param scalar The value multiplied against each point in the input buffer
+    \param num_points The number of data values to be converted
+    \note Input buffer does NOT need to be properly aligned
+  */
+static inline void volk_32f_convert_32s_unaligned16_sse2(int32_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){
+  unsigned int number = 0;
+
+  const unsigned int quarterPoints = num_points / 4;
+    
+  const float* inputVectorPtr = (const float*)inputVector;
+  int32_t* outputVectorPtr = outputVector;
+  __m128 vScalar = _mm_set_ps1(scalar);
+  __m128 inputVal1;
+  __m128i intInputVal1;
+
+  for(;number < quarterPoints; number++){
+    inputVal1 = _mm_loadu_ps(inputVectorPtr); inputVectorPtr += 4;
+
+    intInputVal1 = _mm_cvtps_epi32(_mm_mul_ps(inputVal1, vScalar));
+
+    _mm_storeu_si128((__m128i*)outputVectorPtr, intInputVal1);
+    outputVectorPtr += 4;
+  }
+
+  number = quarterPoints * 4;    
+  for(; number < num_points; number++){
+    outputVector[number] = (int32_t)(inputVector[number] * scalar);
+  }
+}
+#endif /* LV_HAVE_SSE2 */
+
+#if LV_HAVE_SSE
+#include <xmmintrin.h>
+  /*!
+    \brief Multiplies each point in the input buffer by the scalar value, then converts the result into a 32 bit integer value
+    \param inputVector The floating point input data buffer
+    \param outputVector The 32 bit output data buffer
+    \param scalar The value multiplied against each point in the input buffer
+    \param num_points The number of data values to be converted
+    \note Input buffer does NOT need to be properly aligned
+  */
+static inline void volk_32f_convert_32s_unaligned16_sse(int32_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){
+  unsigned int number = 0;
+
+  const unsigned int quarterPoints = num_points / 4;
+    
+  const float* inputVectorPtr = (const float*)inputVector;
+  int32_t* outputVectorPtr = outputVector;
+  __m128 vScalar = _mm_set_ps1(scalar);
+  __m128 ret;
+
+  float outputFloatBuffer[4] __attribute__((aligned(128)));
+
+  for(;number < quarterPoints; number++){
+    ret = _mm_loadu_ps(inputVectorPtr);
+    inputVectorPtr += 4;
+
+    ret = _mm_mul_ps(ret, vScalar);
+
+    _mm_store_ps(outputFloatBuffer, ret);
+    *outputVectorPtr++ = (int32_t)(outputFloatBuffer[0]);
+    *outputVectorPtr++ = (int32_t)(outputFloatBuffer[1]);
+    *outputVectorPtr++ = (int32_t)(outputFloatBuffer[2]);
+    *outputVectorPtr++ = (int32_t)(outputFloatBuffer[3]);
+  }
+
+  number = quarterPoints * 4;    
+  for(; number < num_points; number++){
+    outputVector[number] = (int32_t)(inputVector[number] * scalar);
+  }
+}
+#endif /* LV_HAVE_SSE */
+
+#ifdef LV_HAVE_GENERIC
+  /*!
+    \brief Multiplies each point in the input buffer by the scalar value, then converts the result into a 32 bit integer value
+    \param inputVector The floating point input data buffer
+    \param outputVector The 32 bit output data buffer
+    \param scalar The value multiplied against each point in the input buffer
+    \param num_points The number of data values to be converted
+    \note Input buffer does NOT need to be properly aligned
+  */
+static inline void volk_32f_convert_32s_unaligned16_generic(int32_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){
+  int32_t* outputVectorPtr = outputVector;
+  const float* inputVectorPtr = inputVector;
+  unsigned int number = 0;
+
+  for(number = 0; number < num_points; number++){
+    *outputVectorPtr++ = ((int32_t)(*inputVectorPtr++  * scalar));
+  }
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+
+
+#endif /* INCLUDED_VOLK_32f_CONVERT_32s_UNALIGNED16_H */
diff --git a/volk/include/volk/volk_32f_convert_64f_aligned16.h b/volk/include/volk/volk_32f_convert_64f_aligned16.h
new file mode 100644
index 000000000..91a855813
--- /dev/null
+++ b/volk/include/volk/volk_32f_convert_64f_aligned16.h
@@ -0,0 +1,70 @@
+#ifndef INCLUDED_VOLK_32f_CONVERT_64f_ALIGNED16_H
+#define INCLUDED_VOLK_32f_CONVERT_64f_ALIGNED16_H
+
+#include <inttypes.h>
+#include <stdio.h>
+
+#if LV_HAVE_SSE2
+#include <emmintrin.h>
+  /*!
+    \brief Converts the float values into double values
+    \param dVector The converted double vector values
+    \param fVector The float vector values to be converted
+    \param num_points The number of points in the two vectors to be converted
+  */
+static inline void volk_32f_convert_64f_aligned16_sse2(double* outputVector, const float* inputVector, unsigned int num_points){
+  unsigned int number = 0;
+
+  const unsigned int quarterPoints = num_points / 4;
+    
+  const float* inputVectorPtr = (const float*)inputVector;
+  double* outputVectorPtr = outputVector;
+  __m128d ret;
+  __m128 inputVal;
+
+  for(;number < quarterPoints; number++){
+    inputVal = _mm_load_ps(inputVectorPtr); inputVectorPtr += 4;
+ 
+    ret = _mm_cvtps_pd(inputVal);
+
+    _mm_store_pd(outputVectorPtr, ret);
+    outputVectorPtr += 2;
+
+    inputVal = _mm_movehl_ps(inputVal, inputVal);
+
+    ret = _mm_cvtps_pd(inputVal);
+
+    _mm_store_pd(outputVectorPtr, ret);
+    outputVectorPtr += 2;
+  }
+
+  number = quarterPoints * 4;    
+  for(; number < num_points; number++){
+    outputVector[number] = (double)(inputVector[number]);
+  }
+}
+#endif /* LV_HAVE_SSE2 */
+
+
+#ifdef LV_HAVE_GENERIC
+/*!
+  \brief Converts the float values into double values
+  \param dVector The converted double vector values
+  \param fVector The float vector values to be converted
+  \param num_points The number of points in the two vectors to be converted
+*/
+static inline void volk_32f_convert_64f_aligned16_generic(double* outputVector, const float* inputVector, unsigned int num_points){
+  double* outputVectorPtr = outputVector;
+  const float* inputVectorPtr = inputVector;
+  unsigned int number = 0;
+
+  for(number = 0; number < num_points; number++){
+    *outputVectorPtr++ = ((double)(*inputVectorPtr++));
+  }
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+
+
+#endif /* INCLUDED_VOLK_32f_CONVERT_64f_ALIGNED16_H */
diff --git a/volk/include/volk/volk_32f_convert_64f_unaligned16.h b/volk/include/volk/volk_32f_convert_64f_unaligned16.h
new file mode 100644
index 000000000..698e0d446
--- /dev/null
+++ b/volk/include/volk/volk_32f_convert_64f_unaligned16.h
@@ -0,0 +1,70 @@
+#ifndef INCLUDED_VOLK_32f_CONVERT_64f_UNALIGNED16_H
+#define INCLUDED_VOLK_32f_CONVERT_64f_UNALIGNED16_H
+
+#include <inttypes.h>
+#include <stdio.h>
+
+#if LV_HAVE_SSE2
+#include <emmintrin.h>
+  /*!
+    \brief Converts the float values into double values
+    \param dVector The converted double vector values
+    \param fVector The float vector values to be converted
+    \param num_points The number of points in the two vectors to be converted
+  */
+static inline void volk_32f_convert_64f_unaligned16_sse2(double* outputVector, const float* inputVector, unsigned int num_points){
+  unsigned int number = 0;
+
+  const unsigned int quarterPoints = num_points / 4;
+    
+  const float* inputVectorPtr = (const float*)inputVector;
+  double* outputVectorPtr = outputVector;
+  __m128d ret;
+  __m128 inputVal;
+
+  for(;number < quarterPoints; number++){
+    inputVal = _mm_loadu_ps(inputVectorPtr); inputVectorPtr += 4;
+ 
+    ret = _mm_cvtps_pd(inputVal);
+
+    _mm_storeu_pd(outputVectorPtr, ret);
+    outputVectorPtr += 2;
+
+    inputVal = _mm_movehl_ps(inputVal, inputVal);
+
+    ret = _mm_cvtps_pd(inputVal);
+
+    _mm_storeu_pd(outputVectorPtr, ret);
+    outputVectorPtr += 2;
+  }
+
+  number = quarterPoints * 4;    
+  for(; number < num_points; number++){
+    outputVector[number] = (double)(inputVector[number]);
+  }
+}
+#endif /* LV_HAVE_SSE2 */
+
+
+#ifdef LV_HAVE_GENERIC
+/*!
+  \brief Converts the float values into double values
+  \param dVector The converted double vector values
+  \param fVector The float vector values to be converted
+  \param num_points The number of points in the two vectors to be converted
+*/
+static inline void volk_32f_convert_64f_unaligned16_generic(double* outputVector, const float* inputVector, unsigned int num_points){
+  double* outputVectorPtr = outputVector;
+  const float* inputVectorPtr = inputVector;
+  unsigned int number = 0;
+
+  for(number = 0; number < num_points; number++){
+    *outputVectorPtr++ = ((double)(*inputVectorPtr++));
+  }
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+
+
+#endif /* INCLUDED_VOLK_32f_CONVERT_64f_UNALIGNED16_H */
diff --git a/volk/include/volk/volk_32f_convert_8s_aligned16.h b/volk/include/volk/volk_32f_convert_8s_aligned16.h
new file mode 100644
index 000000000..b9487b622
--- /dev/null
+++ b/volk/include/volk/volk_32f_convert_8s_aligned16.h
@@ -0,0 +1,117 @@
+#ifndef INCLUDED_VOLK_32f_CONVERT_8s_ALIGNED16_H
+#define INCLUDED_VOLK_32f_CONVERT_8s_ALIGNED16_H
+
+#include <inttypes.h>
+#include <stdio.h>
+
+#if LV_HAVE_SSE2
+#include <emmintrin.h>
+  /*!
+    \brief Multiplies each point in the input buffer by the scalar value, then converts the result into a 8 bit integer value
+    \param inputVector The floating point input data buffer
+    \param outputVector The 8 bit output data buffer
+    \param scalar The value multiplied against each point in the input buffer
+    \param num_points The number of data values to be converted
+  */
+static inline void volk_32f_convert_8s_aligned16_sse2(int8_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){
+  unsigned int number = 0;
+
+  const unsigned int sixteenthPoints = num_points / 16;
+    
+  const float* inputVectorPtr = (const float*)inputVector;
+  int8_t* outputVectorPtr = outputVector;
+  __m128 vScalar = _mm_set_ps1(scalar);
+  __m128 inputVal1, inputVal2, inputVal3, inputVal4;
+  __m128i intInputVal1, intInputVal2, intInputVal3, intInputVal4;
+
+  for(;number < sixteenthPoints; number++){
+    inputVal1 = _mm_load_ps(inputVectorPtr); inputVectorPtr += 4;
+    inputVal2 = _mm_load_ps(inputVectorPtr); inputVectorPtr += 4;
+    inputVal3 = _mm_load_ps(inputVectorPtr); inputVectorPtr += 4;
+    inputVal4 = _mm_load_ps(inputVectorPtr); inputVectorPtr += 4;
+
+    intInputVal1 = _mm_cvtps_epi32(_mm_mul_ps(inputVal1, vScalar));
+    intInputVal2 = _mm_cvtps_epi32(_mm_mul_ps(inputVal2, vScalar));
+    intInputVal3 = _mm_cvtps_epi32(_mm_mul_ps(inputVal3, vScalar));
+    intInputVal4 = _mm_cvtps_epi32(_mm_mul_ps(inputVal4, vScalar));
+    
+    intInputVal1 = _mm_packs_epi32(intInputVal1, intInputVal2);
+    intInputVal3 = _mm_packs_epi32(intInputVal3, intInputVal4);
+
+    intInputVal1 = _mm_packs_epi16(intInputVal1, intInputVal3);
+
+    _mm_store_si128((__m128i*)outputVectorPtr, intInputVal1);
+    outputVectorPtr += 16;
+  }
+
+  number = sixteenthPoints * 16;    
+  for(; number < num_points; number++){
+    outputVector[number] = (int8_t)(inputVector[number] * scalar);
+  }
+}
+#endif /* LV_HAVE_SSE2 */
+
+#if LV_HAVE_SSE
+#include <xmmintrin.h>
+  /*!
+    \brief Multiplies each point in the input buffer by the scalar value, then converts the result into a 8 bit integer value
+    \param inputVector The floating point input data buffer
+    \param outputVector The 8 bit output data buffer
+    \param scalar The value multiplied against each point in the input buffer
+    \param num_points The number of data values to be converted
+  */
+static inline void volk_32f_convert_8s_aligned16_sse(int8_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){
+  unsigned int number = 0;
+
+  const unsigned int quarterPoints = num_points / 4;
+    
+  const float* inputVectorPtr = (const float*)inputVector;
+  int8_t* outputVectorPtr = outputVector;
+  __m128 vScalar = _mm_set_ps1(scalar);
+  __m128 ret;
+
+  float outputFloatBuffer[4] __attribute__((aligned(128)));
+
+  for(;number < quarterPoints; number++){
+    ret = _mm_load_ps(inputVectorPtr);
+    inputVectorPtr += 4;
+
+    ret = _mm_mul_ps(ret, vScalar);
+
+    _mm_store_ps(outputFloatBuffer, ret);
+    *outputVectorPtr++ = (int8_t)(outputFloatBuffer[0]);
+    *outputVectorPtr++ = (int8_t)(outputFloatBuffer[1]);
+    *outputVectorPtr++ = (int8_t)(outputFloatBuffer[2]);
+    *outputVectorPtr++ = (int8_t)(outputFloatBuffer[3]);
+  }
+
+  number = quarterPoints * 4;    
+  for(; number < num_points; number++){
+    outputVector[number] = (int8_t)(inputVector[number] * scalar);
+  }
+}
+#endif /* LV_HAVE_SSE */
+
+#ifdef LV_HAVE_GENERIC
+  /*!
+    \brief Multiplies each point in the input buffer by the scalar value, then converts the result into a 8 bit integer value
+    \param inputVector The floating point input data buffer
+    \param outputVector The 8 bit output data buffer
+    \param scalar The value multiplied against each point in the input buffer
+    \param num_points The number of data values to be converted
+  */
+static inline void volk_32f_convert_8s_aligned16_generic(int8_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){
+  int8_t* outputVectorPtr = outputVector;
+  const float* inputVectorPtr = inputVector;
+  unsigned int number = 0;
+
+  for(number = 0; number < num_points; number++){
+    *outputVectorPtr++ = ((int8_t)(*inputVectorPtr++  * scalar));
+  }
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+
+
+#endif /* INCLUDED_VOLK_32f_CONVERT_8s_ALIGNED16_H */
diff --git a/volk/include/volk/volk_32f_convert_8s_unaligned16.h b/volk/include/volk/volk_32f_convert_8s_unaligned16.h
new file mode 100644
index 000000000..e986dbc87
--- /dev/null
+++ b/volk/include/volk/volk_32f_convert_8s_unaligned16.h
@@ -0,0 +1,120 @@
+#ifndef INCLUDED_VOLK_32f_CONVERT_8s_UNALIGNED16_H
+#define INCLUDED_VOLK_32f_CONVERT_8s_UNALIGNED16_H
+
+#include <inttypes.h>
+#include <stdio.h>
+
+#if LV_HAVE_SSE2
+#include <emmintrin.h>
+  /*!
+    \brief Multiplies each point in the input buffer by the scalar value, then converts the result into a 8 bit integer value
+    \param inputVector The floating point input data buffer
+    \param outputVector The 8 bit output data buffer
+    \param scalar The value multiplied against each point in the input buffer
+    \param num_points The number of data values to be converted
+    \note Input buffer does NOT need to be properly aligned
+  */
+static inline void volk_32f_convert_8s_unaligned16_sse2(int8_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){
+  unsigned int number = 0;
+
+  const unsigned int sixteenthPoints = num_points / 16;
+    
+  const float* inputVectorPtr = (const float*)inputVector;
+  int8_t* outputVectorPtr = outputVector;
+  __m128 vScalar = _mm_set_ps1(scalar);
+  __m128 inputVal1, inputVal2, inputVal3, inputVal4;
+  __m128i intInputVal1, intInputVal2, intInputVal3, intInputVal4;
+
+  for(;number < sixteenthPoints; number++){
+    inputVal1 = _mm_loadu_ps(inputVectorPtr); inputVectorPtr += 4;
+    inputVal2 = _mm_loadu_ps(inputVectorPtr); inputVectorPtr += 4;
+    inputVal3 = _mm_loadu_ps(inputVectorPtr); inputVectorPtr += 4;
+    inputVal4 = _mm_loadu_ps(inputVectorPtr); inputVectorPtr += 4;
+
+    intInputVal1 = _mm_cvtps_epi32(_mm_mul_ps(inputVal1, vScalar));
+    intInputVal2 = _mm_cvtps_epi32(_mm_mul_ps(inputVal2, vScalar));
+    intInputVal3 = _mm_cvtps_epi32(_mm_mul_ps(inputVal3, vScalar));
+    intInputVal4 = _mm_cvtps_epi32(_mm_mul_ps(inputVal4, vScalar));
+    
+    intInputVal1 = _mm_packs_epi32(intInputVal1, intInputVal2);
+    intInputVal3 = _mm_packs_epi32(intInputVal3, intInputVal4);
+
+    intInputVal1 = _mm_packs_epi16(intInputVal1, intInputVal3);
+
+    _mm_storeu_si128((__m128i*)outputVectorPtr, intInputVal1);
+    outputVectorPtr += 16;
+  }
+
+  number = sixteenthPoints * 16;    
+  for(; number < num_points; number++){
+    outputVector[number] = (int8_t)(inputVector[number] * scalar);
+  }
+}
+#endif /* LV_HAVE_SSE2 */
+
+#if LV_HAVE_SSE
+#include <xmmintrin.h>
+  /*!
+    \brief Multiplies each point in the input buffer by the scalar value, then converts the result into a 8 bit integer value
+    \param inputVector The floating point input data buffer
+    \param outputVector The 8 bit output data buffer
+    \param scalar The value multiplied against each point in the input buffer
+    \param num_points The number of data values to be converted
+    \note Input buffer does NOT need to be properly aligned
+  */
+static inline void volk_32f_convert_8s_unaligned16_sse(int8_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){
+  unsigned int number = 0;
+
+  const unsigned int quarterPoints = num_points / 4;
+    
+  const float* inputVectorPtr = (const float*)inputVector;
+  int8_t* outputVectorPtr = outputVector;
+  __m128 vScalar = _mm_set_ps1(scalar);
+  __m128 ret;
+
+  float outputFloatBuffer[4] __attribute__((aligned(128)));
+
+  for(;number < quarterPoints; number++){
+    ret = _mm_loadu_ps(inputVectorPtr);
+    inputVectorPtr += 4;
+
+    ret = _mm_mul_ps(ret, vScalar);
+
+    _mm_store_ps(outputFloatBuffer, ret);
+    *outputVectorPtr++ = (int8_t)(outputFloatBuffer[0]);
+    *outputVectorPtr++ = (int8_t)(outputFloatBuffer[1]);
+    *outputVectorPtr++ = (int8_t)(outputFloatBuffer[2]);
+    *outputVectorPtr++ = (int8_t)(outputFloatBuffer[3]);
+  }
+
+  number = quarterPoints * 4;    
+  for(; number < num_points; number++){
+    outputVector[number] = (int8_t)(inputVector[number] * scalar);
+  }
+}
+#endif /* LV_HAVE_SSE */
+
+#ifdef LV_HAVE_GENERIC
+  /*!
+    \brief Multiplies each point in the input buffer by the scalar value, then converts the result into a 8 bit integer value
+    \param inputVector The floating point input data buffer
+    \param outputVector The 8 bit output data buffer
+    \param scalar The value multiplied against each point in the input buffer
+    \param num_points The number of data values to be converted
+    \note Input buffer does NOT need to be properly aligned
+  */
+static inline void volk_32f_convert_8s_unaligned16_generic(int8_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){
+  int8_t* outputVectorPtr = outputVector;
+  const float* inputVectorPtr = inputVector;
+  unsigned int number = 0;
+
+  for(number = 0; number < num_points; number++){
+    *outputVectorPtr++ = ((int8_t)(*inputVectorPtr++  * scalar));
+  }
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+
+
+#endif /* INCLUDED_VOLK_32f_CONVERT_8s_UNALIGNED16_H */
diff --git a/volk/include/volk/volk_32f_divide_aligned16.h b/volk/include/volk/volk_32f_divide_aligned16.h
new file mode 100644
index 000000000..c00700cd8
--- /dev/null
+++ b/volk/include/volk/volk_32f_divide_aligned16.h
@@ -0,0 +1,69 @@
+#ifndef INCLUDED_VOLK_32f_DIVIDE_ALIGNED16_H
+#define INCLUDED_VOLK_32f_DIVIDE_ALIGNED16_H
+
+#include <inttypes.h>
+#include <stdio.h>
+
+#if LV_HAVE_SSE
+#include <xmmintrin.h>
+/*!
+  \brief Divides the two input vectors and store their results in the third vector
+  \param cVector The vector where the results will be stored
+  \param aVector The vector to be divideed
+  \param bVector The divisor vector
+  \param num_points The number of values in aVector and bVector to be divideed together and stored into cVector
+*/
+static inline void volk_32f_divide_aligned16_sse(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){
+    unsigned int number = 0;
+    const unsigned int quarterPoints = num_points / 4;
+
+    float* cPtr = cVector;
+    const float* aPtr = aVector;
+    const float* bPtr=  bVector;
+
+    __m128 aVal, bVal, cVal;
+    for(;number < quarterPoints; number++){
+      
+      aVal = _mm_load_ps(aPtr); 
+      bVal = _mm_load_ps(bPtr);
+      
+      cVal = _mm_div_ps(aVal, bVal); 
+      
+      _mm_store_ps(cPtr,cVal); // Store the results back into the C container
+
+      aPtr += 4;
+      bPtr += 4;
+      cPtr += 4;
+    }
+
+    number = quarterPoints * 4;
+    for(;number < num_points; number++){
+      *cPtr++ = (*aPtr++) / (*bPtr++);
+    }
+}
+#endif /* LV_HAVE_SSE */
+
+#if LV_HAVE_GENERIC
+/*!
+  \brief Divides the two input vectors and store their results in the third vector
+  \param cVector The vector where the results will be stored
+  \param aVector The vector to be divideed
+  \param bVector The divisor vector
+  \param num_points The number of values in aVector and bVector to be divideed together and stored into cVector
+*/
+static inline void volk_32f_divide_aligned16_generic(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){
+    float* cPtr = cVector;
+    const float* aPtr = aVector;
+    const float* bPtr=  bVector;
+    unsigned int number = 0;
+
+    for(number = 0; number < num_points; number++){
+      *cPtr++ = (*aPtr++) / (*bPtr++);
+    }
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+
+
+#endif /* INCLUDED_VOLK_32f_DIVIDE_ALIGNED16_H */
diff --git a/volk/include/volk/volk_32f_dot_prod_aligned16.h b/volk/include/volk/volk_32f_dot_prod_aligned16.h
new file mode 100644
index 000000000..3aee1136a
--- /dev/null
+++ b/volk/include/volk/volk_32f_dot_prod_aligned16.h
@@ -0,0 +1,184 @@
+#ifndef INCLUDED_VOLK_32f_DOT_PROD_ALIGNED16_H
+#define INCLUDED_VOLK_32f_DOT_PROD_ALIGNED16_H
+
+#include<stdio.h>
+
+
+#if LV_HAVE_GENERIC
+
+
+static inline void volk_32f_dot_prod_aligned16_generic(float * result, const float * input, const float * taps, unsigned int num_points) {
+
+  float dotProduct = 0;
+  const float* aPtr = input;
+  const float* bPtr=  taps;
+  unsigned int number = 0;
+
+  for(number = 0; number < num_points; number++){
+    dotProduct += ((*aPtr++) * (*bPtr++));
+  }
+  
+  *result = dotProduct;
+}
+
+#endif /*LV_HAVE_GENERIC*/
+
+
+#if LV_HAVE_SSE
+
+
+static inline void volk_32f_dot_prod_aligned16_sse( float* result, const  float* input, const  float* taps, unsigned int num_points) {
+  
+  unsigned int number = 0;
+  const unsigned int quarterPoints = num_points / 4;
+
+  float dotProduct = 0;
+  const float* aPtr = input;
+  const float* bPtr = taps;
+
+  __m128 aVal, bVal, cVal;
+
+  __m128 dotProdVal = _mm_setzero_ps();
+
+  for(;number < quarterPoints; number++){
+      
+    aVal = _mm_load_ps(aPtr); 
+    bVal = _mm_load_ps(bPtr);
+      
+    cVal = _mm_mul_ps(aVal, bVal); 
+
+    dotProdVal = _mm_add_ps(cVal, dotProdVal);
+
+    aPtr += 4;
+    bPtr += 4;
+  }
+
+  float dotProductVector[4] __attribute__((aligned(16)));
+
+  _mm_store_ps(dotProductVector,dotProdVal); // Store the results back into the dot product vector
+
+  dotProduct = dotProductVector[0];
+  dotProduct += dotProductVector[1];
+  dotProduct += dotProductVector[2];
+  dotProduct += dotProductVector[3];
+
+  number = quarterPoints * 4;
+  for(;number < num_points; number++){
+    dotProduct += ((*aPtr++) * (*bPtr++));
+  }
+
+  *result = dotProduct;
+  
+}
+
+#endif /*LV_HAVE_SSE*/  
+
+#if LV_HAVE_SSE3
+
+#include <pmmintrin.h>
+
+static inline void volk_32f_dot_prod_aligned16_sse3(float * result, const float * input, const float * taps, unsigned int num_points) {
+  unsigned int number = 0;
+  const unsigned int quarterPoints = num_points / 4;
+
+  float dotProduct = 0;
+  const float* aPtr = input;
+  const float* bPtr = taps;
+
+  __m128 aVal, bVal, cVal;
+
+  __m128 dotProdVal = _mm_setzero_ps();
+
+  for(;number < quarterPoints; number++){
+      
+    aVal = _mm_load_ps(aPtr); 
+    bVal = _mm_load_ps(bPtr);
+      
+    cVal = _mm_mul_ps(aVal, bVal); 
+
+    dotProdVal = _mm_hadd_ps(dotProdVal, cVal);
+
+    aPtr += 4;
+    bPtr += 4;
+  }
+
+  float dotProductVector[4] __attribute__((aligned(16)));
+  dotProdVal = _mm_hadd_ps(dotProdVal, dotProdVal);
+
+  _mm_store_ps(dotProductVector,dotProdVal); // Store the results back into the dot product vector
+
+  dotProduct = dotProductVector[0];
+  dotProduct += dotProductVector[1];
+
+  number = quarterPoints * 4;
+  for(;number < num_points; number++){
+    dotProduct += ((*aPtr++) * (*bPtr++));
+  }
+
+  *result = dotProduct;
+}  
+
+#endif /*LV_HAVE_SSE3*/
+
+#if LV_HAVE_SSE4_1
+
+#include <smmintrin.h>
+
+static inline void volk_32f_dot_prod_aligned16_sse4_1(float * result, const float * input, const float* taps, unsigned int num_points) {
+  unsigned int number = 0;
+  const unsigned int sixteenthPoints = num_points / 16;
+
+  float dotProduct = 0;
+  const float* aPtr = input;
+  const float* bPtr = taps;
+
+  __m128 aVal1, bVal1, cVal1;
+  __m128 aVal2, bVal2, cVal2;
+  __m128 aVal3, bVal3, cVal3;
+  __m128 aVal4, bVal4, cVal4;
+
+  __m128 dotProdVal = _mm_setzero_ps();
+
+  for(;number < sixteenthPoints; number++){      
+
+    aVal1 = _mm_load_ps(aPtr); aPtr += 4;
+    aVal2 = _mm_load_ps(aPtr); aPtr += 4;
+    aVal3 = _mm_load_ps(aPtr); aPtr += 4;
+    aVal4 = _mm_load_ps(aPtr); aPtr += 4;
+
+    bVal1 = _mm_load_ps(bPtr); bPtr += 4;
+    bVal2 = _mm_load_ps(bPtr); bPtr += 4;
+    bVal3 = _mm_load_ps(bPtr); bPtr += 4;
+    bVal4 = _mm_load_ps(bPtr); bPtr += 4;
+    
+    cVal1 = _mm_dp_ps(aVal1, bVal1, 0xF1);
+    cVal2 = _mm_dp_ps(aVal2, bVal2, 0xF2);
+    cVal3 = _mm_dp_ps(aVal3, bVal3, 0xF4);
+    cVal4 = _mm_dp_ps(aVal4, bVal4, 0xF8);
+
+    cVal1 = _mm_or_ps(cVal1, cVal2);
+    cVal3 = _mm_or_ps(cVal3, cVal4);
+    cVal1 = _mm_or_ps(cVal1, cVal3);
+
+    dotProdVal = _mm_add_ps(dotProdVal, cVal1);
+  }
+
+  float dotProductVector[4] __attribute__((aligned(16)));
+  _mm_store_ps(dotProductVector, dotProdVal); // Store the results back into the dot product vector
+
+  dotProduct = dotProductVector[0];
+  dotProduct += dotProductVector[1];
+  dotProduct += dotProductVector[2];
+  dotProduct += dotProductVector[3];
+
+  number = sixteenthPoints * 16;
+  for(;number < num_points; number++){
+    dotProduct += ((*aPtr++) * (*bPtr++));
+  }
+
+  *result = dotProduct;
+}  
+
+#endif /*LV_HAVE_SSE4_1*/
+
+#endif /*INCLUDED_VOLK_32f_DOT_PROD_ALIGNED16_H*/
diff --git a/volk/include/volk/volk_32f_dot_prod_unaligned16.h b/volk/include/volk/volk_32f_dot_prod_unaligned16.h
new file mode 100644
index 000000000..bce6aa15f
--- /dev/null
+++ b/volk/include/volk/volk_32f_dot_prod_unaligned16.h
@@ -0,0 +1,184 @@
+#ifndef INCLUDED_VOLK_32f_DOT_PROD_UNALIGNED16_H
+#define INCLUDED_VOLK_32f_DOT_PROD_UNALIGNED16_H
+
+#include<stdio.h>
+
+
+#if LV_HAVE_GENERIC
+
+
+static inline void volk_32f_dot_prod_unaligned16_generic(float * result, const float * input, const float * taps, unsigned int num_points) {
+
+  float dotProduct = 0;
+  const float* aPtr = input;
+  const float* bPtr=  taps;
+  unsigned int number = 0;
+
+  for(number = 0; number < num_points; number++){
+    dotProduct += ((*aPtr++) * (*bPtr++));
+  }
+  
+  *result = dotProduct;
+}
+
+#endif /*LV_HAVE_GENERIC*/
+
+
+#if LV_HAVE_SSE
+
+
+static inline void volk_32f_dot_prod_unaligned16_sse( float* result, const  float* input, const  float* taps, unsigned int num_points) {
+  
+  unsigned int number = 0;
+  const unsigned int quarterPoints = num_points / 4;
+
+  float dotProduct = 0;
+  const float* aPtr = input;
+  const float* bPtr = taps;
+
+  __m128 aVal, bVal, cVal;
+
+  __m128 dotProdVal = _mm_setzero_ps();
+
+  for(;number < quarterPoints; number++){
+      
+    aVal = _mm_loadu_ps(aPtr); 
+    bVal = _mm_loadu_ps(bPtr);
+      
+    cVal = _mm_mul_ps(aVal, bVal); 
+
+    dotProdVal = _mm_add_ps(cVal, dotProdVal);
+
+    aPtr += 4;
+    bPtr += 4;
+  }
+
+  float dotProductVector[4] __attribute__((aligned(16)));
+
+  _mm_store_ps(dotProductVector,dotProdVal); // Store the results back into the dot product vector
+
+  dotProduct = dotProductVector[0];
+  dotProduct += dotProductVector[1];
+  dotProduct += dotProductVector[2];
+  dotProduct += dotProductVector[3];
+
+  number = quarterPoints * 4;
+  for(;number < num_points; number++){
+    dotProduct += ((*aPtr++) * (*bPtr++));
+  }
+
+  *result = dotProduct;
+  
+}
+
+#endif /*LV_HAVE_SSE*/  
+
+#if LV_HAVE_SSE3
+
+#include <pmmintrin.h>
+
+static inline void volk_32f_dot_prod_unaligned16_sse3(float * result, const float * input, const float * taps, unsigned int num_points) {
+  unsigned int number = 0;
+  const unsigned int quarterPoints = num_points / 4;
+
+  float dotProduct = 0;
+  const float* aPtr = input;
+  const float* bPtr = taps;
+
+  __m128 aVal, bVal, cVal;
+
+  __m128 dotProdVal = _mm_setzero_ps();
+
+  for(;number < quarterPoints; number++){
+      
+    aVal = _mm_loadu_ps(aPtr); 
+    bVal = _mm_loadu_ps(bPtr);
+      
+    cVal = _mm_mul_ps(aVal, bVal); 
+
+    dotProdVal = _mm_hadd_ps(dotProdVal, cVal);
+
+    aPtr += 4;
+    bPtr += 4;
+  }
+
+  float dotProductVector[4] __attribute__((aligned(16)));
+  dotProdVal = _mm_hadd_ps(dotProdVal, dotProdVal);
+
+  _mm_store_ps(dotProductVector,dotProdVal); // Store the results back into the dot product vector
+
+  dotProduct = dotProductVector[0];
+  dotProduct += dotProductVector[1];
+
+  number = quarterPoints * 4;
+  for(;number < num_points; number++){
+    dotProduct += ((*aPtr++) * (*bPtr++));
+  }
+
+  *result = dotProduct;
+}  
+
+#endif /*LV_HAVE_SSE3*/
+
+#if LV_HAVE_SSE4_1
+
+#include <smmintrin.h>
+
+static inline void volk_32f_dot_prod_unaligned16_sse4_1(float * result, const float * input, const float* taps, unsigned int num_points) {
+  unsigned int number = 0;
+  const unsigned int sixteenthPoints = num_points / 16;
+
+  float dotProduct = 0;
+  const float* aPtr = input;
+  const float* bPtr = taps;
+
+  __m128 aVal1, bVal1, cVal1;
+  __m128 aVal2, bVal2, cVal2;
+  __m128 aVal3, bVal3, cVal3;
+  __m128 aVal4, bVal4, cVal4;
+
+  __m128 dotProdVal = _mm_setzero_ps();
+
+  for(;number < sixteenthPoints; number++){
+      
+    aVal1 = _mm_loadu_ps(aPtr); aPtr += 4;
+    aVal2 = _mm_loadu_ps(aPtr); aPtr += 4;
+    aVal3 = _mm_loadu_ps(aPtr); aPtr += 4;
+    aVal4 = _mm_loadu_ps(aPtr); aPtr += 4;
+
+    bVal1 = _mm_loadu_ps(bPtr); bPtr += 4;
+    bVal2 = _mm_loadu_ps(bPtr); bPtr += 4;
+    bVal3 = _mm_loadu_ps(bPtr); bPtr += 4;
+    bVal4 = _mm_loadu_ps(bPtr); bPtr += 4;
+    
+    cVal1 = _mm_dp_ps(aVal1, bVal1, 0xF1);
+    cVal2 = _mm_dp_ps(aVal2, bVal2, 0xF2);
+    cVal3 = _mm_dp_ps(aVal3, bVal3, 0xF4);
+    cVal4 = _mm_dp_ps(aVal4, bVal4, 0xF8);
+
+    cVal1 = _mm_or_ps(cVal1, cVal2);
+    cVal3 = _mm_or_ps(cVal3, cVal4);
+    cVal1 = _mm_or_ps(cVal1, cVal3);
+
+    dotProdVal = _mm_add_ps(dotProdVal, cVal1);
+  }
+
+  float dotProductVector[4] __attribute__((aligned(16)));
+  _mm_store_ps(dotProductVector, dotProdVal); // Store the results back into the dot product vector
+
+  dotProduct = dotProductVector[0];
+  dotProduct += dotProductVector[1];
+  dotProduct += dotProductVector[2];
+  dotProduct += dotProductVector[3];
+
+  number = sixteenthPoints * 16;
+  for(;number < num_points; number++){
+    dotProduct += ((*aPtr++) * (*bPtr++));
+  }
+
+  *result = dotProduct;
+}  
+
+#endif /*LV_HAVE_SSE4_1*/
+
+#endif /*INCLUDED_VOLK_32f_DOT_PROD_UNALIGNED16_H*/
diff --git a/volk/include/volk/volk_32f_fm_detect_aligned16.h b/volk/include/volk/volk_32f_fm_detect_aligned16.h
new file mode 100644
index 000000000..c82239d74
--- /dev/null
+++ b/volk/include/volk/volk_32f_fm_detect_aligned16.h
@@ -0,0 +1,120 @@
+#ifndef INCLUDED_VOLK_32f_FM_DETECT_ALIGNED16_H
+#define INCLUDED_VOLK_32f_FM_DETECT_ALIGNED16_H
+
+#include <inttypes.h>
+#include <stdio.h>
+
+#if LV_HAVE_SSE
+#include <xmmintrin.h>
+/*!
+  \brief performs the FM-detect differentiation on the input vector and stores the results in the output vector.
+  \param outputVector The byte-aligned vector where the results will be stored.
+  \param inputVector The byte-aligned input vector containing phase data (must be on the interval (-bound,bound] )
+  \param bound The interval that the input phase data is in, which is used to modulo the differentiation
+  \param saveValue A pointer to a float which contains the phase value of the sample before the first input sample.
+  \param num_noints The number of real values in the input vector.
+*/
+static inline void volk_32f_fm_detect_aligned16_sse(float* outputVector, const float* inputVector, const float bound, float* saveValue, unsigned int num_points){
+  if (num_points < 1) {
+    return;
+  }
+  unsigned int number = 1;
+  unsigned int j = 0;
+  // num_points-1 keeps Fedora 7's gcc from crashing...
+  // num_points won't work.  :(
+  const unsigned int quarterPoints = (num_points-1) / 4;
+
+  float* outPtr = outputVector;
+  const float* inPtr = inputVector;
+  __m128 upperBound = _mm_set_ps1(bound);
+  __m128 lowerBound = _mm_set_ps1(-bound);
+  __m128 next3old1;
+  __m128 next4;
+  __m128 boundAdjust;
+  __m128 posBoundAdjust = _mm_set_ps1(-2*bound); // Subtract when we're above.
+  __m128 negBoundAdjust = _mm_set_ps1(2*bound); // Add when we're below.
+  // Do the first 4 by hand since we're going in from the saveValue:
+  *outPtr = *inPtr - *saveValue;
+  if (*outPtr >  bound) *outPtr -= 2*bound;
+  if (*outPtr < -bound) *outPtr += 2*bound;
+  inPtr++;
+  outPtr++;
+  for (j = 1; j < ( (4 < num_points) ? 4 : num_points); j++) {
+    *outPtr = *(inPtr) - *(inPtr-1);
+    if (*outPtr >  bound) *outPtr -= 2*bound;
+    if (*outPtr < -bound) *outPtr += 2*bound;
+    inPtr++;
+    outPtr++;
+  }
+    
+  for (; number < quarterPoints; number++) {
+    // Load data
+    next3old1 = _mm_loadu_ps((float*) (inPtr-1));
+    next4 = _mm_load_ps(inPtr);
+    inPtr += 4;
+    // Subtract and store:
+    next3old1 = _mm_sub_ps(next4, next3old1);
+    // Bound:
+    boundAdjust = _mm_cmpgt_ps(next3old1, upperBound);
+    boundAdjust = _mm_and_ps(boundAdjust, posBoundAdjust);
+    next4 = _mm_cmplt_ps(next3old1, lowerBound);
+    next4 = _mm_and_ps(next4, negBoundAdjust);
+    boundAdjust = _mm_or_ps(next4, boundAdjust);
+    // Make sure we're in the bounding interval:
+    next3old1 = _mm_add_ps(next3old1, boundAdjust);
+    _mm_store_ps(outPtr,next3old1); // Store the results back into the output
+    outPtr += 4;
+  }
+    
+  for (number = (4 > (quarterPoints*4) ? 4 : (4 * quarterPoints)); number < num_points; number++) {
+    *outPtr = *(inPtr) - *(inPtr-1);
+    if (*outPtr >  bound) *outPtr -= 2*bound;
+    if (*outPtr < -bound) *outPtr += 2*bound;
+    inPtr++;
+    outPtr++;
+  }
+    
+  *saveValue = inputVector[num_points-1];
+}
+#endif /* LV_HAVE_SSE */
+
+#if LV_HAVE_GENERIC
+/*!
+  \brief performs the FM-detect differentiation on the input vector and stores the results in the output vector.
+  \param outputVector The byte-aligned vector where the results will be stored.
+  \param inputVector The byte-aligned input vector containing phase data (must be on the interval (-bound,bound] )
+  \param bound The interval that the input phase data is in, which is used to modulo the differentiation
+  \param saveValue A pointer to a float which contains the phase value of the sample before the first input sample.
+  \param num_points The number of real values in the input vector.
+*/
+static inline void volk_32f_fm_detect_aligned16_generic(float* outputVector, const float* inputVector, const float bound, float* saveValue, unsigned int num_points){
+  if (num_points < 1) {
+    return;
+  }
+  unsigned int number = 0;
+  float* outPtr = outputVector;
+  const float* inPtr = inputVector;
+      
+  // Do the first 1 by hand since we're going in from the saveValue:
+  *outPtr = *inPtr - *saveValue;
+  if (*outPtr >  bound) *outPtr -= 2*bound;
+  if (*outPtr < -bound) *outPtr += 2*bound;
+  inPtr++;
+  outPtr++;
+    
+  for (number = 1; number < num_points; number++) {
+    *outPtr = *(inPtr) - *(inPtr-1);
+    if (*outPtr >  bound) *outPtr -= 2*bound;
+    if (*outPtr < -bound) *outPtr += 2*bound;
+    inPtr++;
+    outPtr++;
+  }
+    
+  *saveValue = inputVector[num_points-1];
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+
+
+#endif /* INCLUDED_VOLK_32f_FM_DETECT_ALIGNED16_H */
diff --git a/volk/include/volk/volk_32f_index_max_aligned16.h b/volk/include/volk/volk_32f_index_max_aligned16.h
new file mode 100644
index 000000000..26322bfa2
--- /dev/null
+++ b/volk/include/volk/volk_32f_index_max_aligned16.h
@@ -0,0 +1,148 @@
+#ifndef INCLUDED_VOLK_32F_INDEX_MAX_ALIGNED16_H
+#define INCLUDED_VOLK_32F_INDEX_MAX_ALIGNED16_H
+
+#include <volk/volk_common.h>
+#include <inttypes.h>
+#include <stdio.h>
+
+#if LV_HAVE_SSE4_1
+#include<smmintrin.h>
+
+static inline void volk_32f_index_max_aligned16_sse4_1(unsigned int* target, const float* src0, unsigned int num_points) {
+  if(num_points > 0){
+    unsigned int number = 0;
+    const unsigned int quarterPoints = num_points / 4;
+
+    float* inputPtr = (float*)src0;
+
+    __m128 indexIncrementValues = _mm_set1_ps(4);
+    __m128 currentIndexes = _mm_set_ps(-1,-2,-3,-4);
+
+    float max = src0[0];
+    float index = 0;
+    __m128 maxValues = _mm_set1_ps(max);
+    __m128 maxValuesIndex = _mm_setzero_ps();
+    __m128 compareResults;
+    __m128 currentValues;
+
+    float maxValuesBuffer[4] __attribute__((aligned(16)));
+    float maxIndexesBuffer[4] __attribute__((aligned(16)));
+
+    for(;number < quarterPoints; number++){
+
+      currentValues  = _mm_load_ps(inputPtr); inputPtr += 4;
+      currentIndexes = _mm_add_ps(currentIndexes, indexIncrementValues);
+
+      compareResults = _mm_cmpgt_ps(maxValues, currentValues);
+
+      maxValuesIndex = _mm_blendv_ps(currentIndexes, maxValuesIndex, compareResults);
+      maxValues      = _mm_blendv_ps(currentValues, maxValues, compareResults);
+    }
+
+    // Calculate the largest value from the remaining 4 points
+    _mm_store_ps(maxValuesBuffer, maxValues);
+    _mm_store_ps(maxIndexesBuffer, maxValuesIndex);
+
+    for(number = 0; number < 4; number++){
+      if(maxValuesBuffer[number] > max){
+	index = maxIndexesBuffer[number];
+	max = maxValuesBuffer[number];
+      }
+    }
+
+    number = quarterPoints * 4;
+    for(;number < num_points; number++){      
+      if(src0[number] > max){
+	index = number;
+	max = src0[number];
+      }
+    }
+    target[0] = (unsigned int)index;
+  }
+}
+
+#endif /*LV_HAVE_SSE4_1*/
+
+#if LV_HAVE_SSE
+#include<xmmintrin.h>
+
+static inline void volk_32f_index_max_aligned16_sse(unsigned int* target, const float* src0, unsigned int num_points) {
+  if(num_points > 0){
+    unsigned int number = 0;
+    const unsigned int quarterPoints = num_points / 4;
+
+    float* inputPtr = (float*)src0;
+
+    __m128 indexIncrementValues = _mm_set1_ps(4);
+    __m128 currentIndexes = _mm_set_ps(-1,-2,-3,-4);
+
+    float max = src0[0];
+    float index = 0;
+    __m128 maxValues = _mm_set1_ps(max);
+    __m128 maxValuesIndex = _mm_setzero_ps();
+    __m128 compareResults;
+    __m128 currentValues;
+
+    float maxValuesBuffer[4] __attribute__((aligned(16)));
+    float maxIndexesBuffer[4] __attribute__((aligned(16)));
+
+    for(;number < quarterPoints; number++){
+
+      currentValues  = _mm_load_ps(inputPtr); inputPtr += 4;
+      currentIndexes = _mm_add_ps(currentIndexes, indexIncrementValues);
+
+      compareResults = _mm_cmpgt_ps(maxValues, currentValues);
+
+      maxValuesIndex = _mm_or_ps(_mm_and_ps(compareResults, maxValuesIndex) , _mm_andnot_ps(compareResults, currentIndexes));
+
+      maxValues      = _mm_or_ps(_mm_and_ps(compareResults, maxValues) , _mm_andnot_ps(compareResults, currentValues));
+    }
+
+    // Calculate the largest value from the remaining 4 points
+    _mm_store_ps(maxValuesBuffer, maxValues);
+    _mm_store_ps(maxIndexesBuffer, maxValuesIndex);
+
+    for(number = 0; number < 4; number++){
+      if(maxValuesBuffer[number] > max){
+	index = maxIndexesBuffer[number];
+	max = maxValuesBuffer[number];
+      }
+    }
+
+    number = quarterPoints * 4;
+    for(;number < num_points; number++){      
+      if(src0[number] > max){
+	index = number;
+	max = src0[number];
+      }
+    }
+    target[0] = (unsigned int)index;
+  }
+}
+
+#endif /*LV_HAVE_SSE*/
+
+#if LV_HAVE_GENERIC
+static inline void volk_32f_index_max_aligned16_generic(unsigned int* target, const float* src0, unsigned int num_points) {
+  if(num_points > 0){
+    float max = src0[0];
+    unsigned int index = 0;
+    
+    int i = 1; 
+    
+    for(; i < num_points; ++i) {
+      
+      if(src0[i] > max){
+	index = i;
+	max = src0[i];
+      }
+
+    }
+    target[0] = index;
+  }
+}
+
+#endif /*LV_HAVE_GENERIC*/
+
+
+#endif /*INCLUDED_VOLK_32F_INDEX_MAX_ALIGNED16_H*/
diff --git a/volk/include/volk/volk_32f_interleave_16sc_aligned16.h b/volk/include/volk/volk_32f_interleave_16sc_aligned16.h
new file mode 100644
index 000000000..476946b88
--- /dev/null
+++ b/volk/include/volk/volk_32f_interleave_16sc_aligned16.h
@@ -0,0 +1,155 @@
+#ifndef INCLUDED_VOLK_32f_INTERLEAVE_16SC_ALIGNED16_H
+#define INCLUDED_VOLK_32f_INTERLEAVE_16SC_ALIGNED16_H
+
+#include <inttypes.h>
+#include <stdio.h>
+
+#if LV_HAVE_SSE2
+#include <emmintrin.h>
+  /*!
+    \brief Interleaves the I & Q vector data into the complex vector, scales the output values by the scalar, and converts to 16 bit data.
+    \param iBuffer The I buffer data to be interleaved
+    \param qBuffer The Q buffer data to be interleaved
+    \param complexVector The complex output vector
+    \param scalar The scaling value being multiplied against each data point
+    \param num_points The number of complex data values to be interleaved
+  */
+static inline void volk_32f_interleave_16sc_aligned16_sse2(lv_16sc_t* complexVector, const float* iBuffer, const float* qBuffer, const float scalar, unsigned int num_points){
+    unsigned int number = 0;
+    const float* iBufferPtr = iBuffer;
+    const float* qBufferPtr = qBuffer;
+
+    __m128 vScalar = _mm_set_ps1(scalar);
+
+    const unsigned int quarterPoints = num_points / 4;
+    
+    __m128 iValue, qValue, cplxValue1, cplxValue2;
+    __m128i intValue1, intValue2;
+
+    int16_t* complexVectorPtr = (int16_t*)complexVector;
+
+    for(;number < quarterPoints; number++){
+      iValue = _mm_load_ps(iBufferPtr);
+      qValue = _mm_load_ps(qBufferPtr);
+
+      // Interleaves the lower two values in the i and q variables into one buffer
+      cplxValue1 = _mm_unpacklo_ps(iValue, qValue);
+      cplxValue1 = _mm_mul_ps(cplxValue1, vScalar);
+
+      // Interleaves the upper two values in the i and q variables into one buffer
+      cplxValue2 = _mm_unpackhi_ps(iValue, qValue);
+      cplxValue2 = _mm_mul_ps(cplxValue2, vScalar);
+
+      intValue1 = _mm_cvtps_epi32(cplxValue1);
+      intValue2 = _mm_cvtps_epi32(cplxValue2);
+
+      intValue1 = _mm_packs_epi32(intValue1, intValue2);
+
+      _mm_store_si128((__m128i*)complexVectorPtr, intValue1);
+      complexVectorPtr += 8;
+
+      iBufferPtr += 4;
+      qBufferPtr += 4;
+    }
+
+    number = quarterPoints * 4;
+    complexVectorPtr = (int16_t*)(&complexVector[number]);
+    for(; number < num_points; number++){
+      *complexVectorPtr++ = (int16_t)(*iBufferPtr++ * scalar);
+      *complexVectorPtr++ = (int16_t)(*qBufferPtr++ * scalar);
+    }
+    
+}
+#endif /* LV_HAVE_SSE2 */
+
+#if LV_HAVE_SSE
+#include <xmmintrin.h>
+  /*!
+    \brief Interleaves the I & Q vector data into the complex vector, scales the output values by the scalar, and converts to 16 bit data.
+    \param iBuffer The I buffer data to be interleaved
+    \param qBuffer The Q buffer data to be interleaved
+    \param complexVector The complex output vector
+    \param scalar The scaling value being multiplied against each data point
+    \param num_points The number of complex data values to be interleaved
+  */
+static inline void volk_32f_interleave_16sc_aligned16_sse(lv_16sc_t* complexVector, const float* iBuffer, const float* qBuffer, const float scalar, unsigned int num_points){
+    unsigned int number = 0;
+    const float* iBufferPtr = iBuffer;
+    const float* qBufferPtr = qBuffer;
+
+    __m128 vScalar = _mm_set_ps1(scalar);
+
+    const unsigned int quarterPoints = num_points / 4;
+    
+    __m128 iValue, qValue, cplxValue;
+
+    int16_t* complexVectorPtr = (int16_t*)complexVector;
+
+    float floatBuffer[4] __attribute__((aligned(128)));
+
+    for(;number < quarterPoints; number++){
+      iValue = _mm_load_ps(iBufferPtr);
+      qValue = _mm_load_ps(qBufferPtr);
+
+      // Interleaves the lower two values in the i and q variables into one buffer
+      cplxValue = _mm_unpacklo_ps(iValue, qValue);
+      cplxValue = _mm_mul_ps(cplxValue, vScalar);
+
+      _mm_store_ps(floatBuffer, cplxValue);
+
+      *complexVectorPtr++ = (int16_t)(floatBuffer[0]);
+      *complexVectorPtr++ = (int16_t)(floatBuffer[1]);
+      *complexVectorPtr++ = (int16_t)(floatBuffer[2]);
+      *complexVectorPtr++ = (int16_t)(floatBuffer[3]);
+
+      // Interleaves the upper two values in the i and q variables into one buffer
+      cplxValue = _mm_unpackhi_ps(iValue, qValue);
+      cplxValue = _mm_mul_ps(cplxValue, vScalar);
+ 
+      _mm_store_ps(floatBuffer, cplxValue);
+      
+      *complexVectorPtr++ = (int16_t)(floatBuffer[0]);
+      *complexVectorPtr++ = (int16_t)(floatBuffer[1]);
+      *complexVectorPtr++ = (int16_t)(floatBuffer[2]);
+      *complexVectorPtr++ = (int16_t)(floatBuffer[3]);
+
+      iBufferPtr += 4;
+      qBufferPtr += 4;
+    }
+
+    number = quarterPoints * 4;
+    complexVectorPtr = (int16_t*)(&complexVector[number]);
+    for(; number < num_points; number++){
+      *complexVectorPtr++ = (int16_t)(*iBufferPtr++ * scalar);
+      *complexVectorPtr++ = (int16_t)(*qBufferPtr++ * scalar);
+    }
+    
+}
+#endif /* LV_HAVE_SSE */
+
+#if LV_HAVE_GENERIC
+  /*!
+    \brief Interleaves the I & Q vector data into the complex vector, scales the output values by the scalar, and converts to 16 bit data.
+    \param iBuffer The I buffer data to be interleaved
+    \param qBuffer The Q buffer data to be interleaved
+    \param complexVector The complex output vector
+    \param scalar The scaling value being multiplied against each data point
+    \param num_points The number of complex data values to be interleaved
+  */
+static inline void volk_32f_interleave_16sc_aligned16_generic(lv_16sc_t* complexVector, const float* iBuffer, const float* qBuffer, const float scalar, unsigned int num_points){
+  int16_t* complexVectorPtr = (int16_t*)complexVector;
+  const float* iBufferPtr = iBuffer;
+  const float* qBufferPtr = qBuffer;
+  unsigned int number = 0;
+
+  for(number = 0; number < num_points; number++){
+    *complexVectorPtr++ = (int16_t)(*iBufferPtr++ * scalar);
+    *complexVectorPtr++ = (int16_t)(*qBufferPtr++ * scalar);
+  }
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+
+
+#endif /* INCLUDED_VOLK_32f_INTERLEAVE_16SC_ALIGNED16_H */
diff --git a/volk/include/volk/volk_32f_interleave_32fc_aligned16.h b/volk/include/volk/volk_32f_interleave_32fc_aligned16.h
new file mode 100644
index 000000000..859c6a0ef
--- /dev/null
+++ b/volk/include/volk/volk_32f_interleave_32fc_aligned16.h
@@ -0,0 +1,75 @@
+#ifndef INCLUDED_VOLK_32f_INTERLEAVE_32FC_ALIGNED16_H
+#define INCLUDED_VOLK_32f_INTERLEAVE_32FC_ALIGNED16_H
+
+#include <inttypes.h>
+#include <stdio.h>
+
+#if LV_HAVE_SSE
+#include <xmmintrin.h>
+/*!
+  \brief Interleaves the I & Q vector data into the complex vector
+  \param iBuffer The I buffer data to be interleaved
+  \param qBuffer The Q buffer data to be interleaved
+  \param complexVector The complex output vector
+  \param num_points The number of complex data values to be interleaved
+*/
+static inline void volk_32f_interleave_32fc_aligned16_sse(lv_32fc_t* complexVector, const float* iBuffer, const float* qBuffer, unsigned int num_points){
+  unsigned int number = 0;
+  float* complexVectorPtr = (float*)complexVector;
+  const float* iBufferPtr = iBuffer;
+  const float* qBufferPtr = qBuffer;
+
+  const uint64_t quarterPoints = num_points / 4;
+    
+  __m128 iValue, qValue, cplxValue;
+  for(;number < quarterPoints; number++){
+    iValue = _mm_load_ps(iBufferPtr);
+    qValue = _mm_load_ps(qBufferPtr);
+
+    // Interleaves the lower two values in the i and q variables into one buffer
+    cplxValue = _mm_unpacklo_ps(iValue, qValue);
+    _mm_store_ps(complexVectorPtr, cplxValue);
+    complexVectorPtr += 4;
+
+    // Interleaves the upper two values in the i and q variables into one buffer
+    cplxValue = _mm_unpackhi_ps(iValue, qValue);
+    _mm_store_ps(complexVectorPtr, cplxValue);
+    complexVectorPtr += 4;
+
+    iBufferPtr += 4;
+    qBufferPtr += 4;
+  }
+
+  number = quarterPoints * 4;
+  for(; number < num_points; number++){
+    *complexVectorPtr++ = *iBufferPtr++;
+    *complexVectorPtr++ = *qBufferPtr++;
+  }
+}
+#endif /* LV_HAVE_SSE */
+
+#if LV_HAVE_GENERIC
+/*!
+  \brief Interleaves the I & Q vector data into the complex vector.
+  \param iBuffer The I buffer data to be interleaved
+  \param qBuffer The Q buffer data to be interleaved
+  \param complexVector The complex output vector
+  \param num_points The number of complex data values to be interleaved
+*/
+static inline void volk_32f_interleave_32fc_aligned16_generic(lv_32fc_t* complexVector, const float* iBuffer, const float* qBuffer, unsigned int num_points){
+  float* complexVectorPtr = (float*)complexVector;
+  const float* iBufferPtr = iBuffer;
+  const float* qBufferPtr = qBuffer;
+  unsigned int number;
+
+  for(number = 0; number < num_points; number++){
+    *complexVectorPtr++ = *iBufferPtr++;
+    *complexVectorPtr++ = *qBufferPtr++;
+  }
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+
+
+#endif /* INCLUDED_VOLK_32f_INTERLEAVE_32FC_ALIGNED16_H */
diff --git a/volk/include/volk/volk_32f_max_aligned16.h b/volk/include/volk/volk_32f_max_aligned16.h
new file mode 100644
index 000000000..96aafb2bf
--- /dev/null
+++ b/volk/include/volk/volk_32f_max_aligned16.h
@@ -0,0 +1,71 @@
+#ifndef INCLUDED_VOLK_32f_MAX_ALIGNED16_H
+#define INCLUDED_VOLK_32f_MAX_ALIGNED16_H
+
+#include <inttypes.h>
+#include <stdio.h>
+
+#if LV_HAVE_SSE
+#include <xmmintrin.h>
+/*!
+  \brief Selects maximum value from each entry between bVector and aVector and store their results in the cVector
+  \param cVector The vector where the results will be stored
+  \param aVector The vector to be checked
+  \param bVector The vector to be checked
+  \param num_points The number of values in aVector and bVector to be checked and stored into cVector
+*/
+static inline void volk_32f_max_aligned16_sse(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){
+    unsigned int number = 0;
+    const unsigned int quarterPoints = num_points / 4;
+
+    float* cPtr = cVector;
+    const float* aPtr = aVector;
+    const float* bPtr=  bVector;
+
+    __m128 aVal, bVal, cVal;
+    for(;number < quarterPoints; number++){
+      
+      aVal = _mm_load_ps(aPtr); 
+      bVal = _mm_load_ps(bPtr);
+      
+      cVal = _mm_max_ps(aVal, bVal); 
+      
+      _mm_store_ps(cPtr,cVal); // Store the results back into the C container
+
+      aPtr += 4;
+      bPtr += 4;
+      cPtr += 4;
+    }
+
+    number = quarterPoints * 4;
+    for(;number < num_points; number++){
+      const float a = *aPtr++;
+      const float b = *bPtr++;
+      *cPtr++ = ( a > b ? a : b);
+    }
+}
+#endif /* LV_HAVE_SSE */
+
+#if LV_HAVE_GENERIC
+/*!
+  \brief Selects maximum value from each entry between bVector and aVector and store their results in the cVector
+  \param cVector The vector where the results will be stored
+  \param aVector The vector to be checked
+  \param bVector The vector to be checked
+  \param num_points The number of values in aVector and bVector to be checked and stored into cVector
+*/
+static inline void volk_32f_max_aligned16_generic(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){
+    float* cPtr = cVector;
+    const float* aPtr = aVector;
+    const float* bPtr=  bVector;
+    unsigned int number = 0;
+
+    for(number = 0; number < num_points; number++){
+      const float a = *aPtr++;
+      const float b = *bPtr++;
+      *cPtr++ = ( a > b ? a : b);
+    }
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+#endif /* INCLUDED_VOLK_32f_MAX_ALIGNED16_H */
diff --git a/volk/include/volk/volk_32f_min_aligned16.h b/volk/include/volk/volk_32f_min_aligned16.h
new file mode 100644
index 000000000..e247f4213
--- /dev/null
+++ b/volk/include/volk/volk_32f_min_aligned16.h
@@ -0,0 +1,71 @@
+#ifndef INCLUDED_VOLK_32f_MIN_ALIGNED16_H
+#define INCLUDED_VOLK_32f_MIN_ALIGNED16_H
+
+#include <inttypes.h>
+#include <stdio.h>
+
+#if LV_HAVE_SSE
+#include <xmmintrin.h>
+/*!
+  \brief Selects minimum value from each entry between bVector and aVector and store their results in the cVector
+  \param cVector The vector where the results will be stored
+  \param aVector The vector to be checked
+  \param bVector The vector to be checked
+  \param num_points The number of values in aVector and bVector to be checked and stored into cVector
+*/
+static inline void volk_32f_min_aligned16_sse(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){
+    unsigned int number = 0;
+    const unsigned int quarterPoints = num_points / 4;
+
+    float* cPtr = cVector;
+    const float* aPtr = aVector;
+    const float* bPtr=  bVector;
+
+    __m128 aVal, bVal, cVal;
+    for(;number < quarterPoints; number++){
+      
+      aVal = _mm_load_ps(aPtr); 
+      bVal = _mm_load_ps(bPtr);
+      
+      cVal = _mm_min_ps(aVal, bVal); 
+      
+      _mm_store_ps(cPtr,cVal); // Store the results back into the C container
+
+      aPtr += 4;
+      bPtr += 4;
+      cPtr += 4;
+    }
+
+    number = quarterPoints * 4;
+    for(;number < num_points; number++){
+      const float a = *aPtr++;
+      const float b = *bPtr++;
+      *cPtr++ = ( a < b ? a : b);
+    }
+}
+#endif /* LV_HAVE_SSE */
+
+#if LV_HAVE_GENERIC
+/*!
+  \brief Selects minimum value from each entry between bVector and aVector and store their results in the cVector
+  \param cVector The vector where the results will be stored
+  \param aVector The vector to be checked
+  \param bVector The vector to be checked
+  \param num_points The number of values in aVector and bVector to be checked and stored into cVector
+*/
+static inline void volk_32f_min_aligned16_generic(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){
+    float* cPtr = cVector;
+    const float* aPtr = aVector;
+    const float* bPtr=  bVector;
+    unsigned int number = 0;
+
+    for(number = 0; number < num_points; number++){
+      const float a = *aPtr++;
+      const float b = *bPtr++;
+      *cPtr++ = ( a < b ? a : b);
+    }
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+#endif /* INCLUDED_VOLK_32f_MIN_ALIGNED16_H */
diff --git a/volk/include/volk/volk_32f_multiply_aligned16.h b/volk/include/volk/volk_32f_multiply_aligned16.h
new file mode 100644
index 000000000..b557580ab
--- /dev/null
+++ b/volk/include/volk/volk_32f_multiply_aligned16.h
@@ -0,0 +1,69 @@
+#ifndef INCLUDED_VOLK_32f_MULTIPLY_ALIGNED16_H
+#define INCLUDED_VOLK_32f_MULTIPLY_ALIGNED16_H
+
+#include <inttypes.h>
+#include <stdio.h>
+
+#if LV_HAVE_SSE
+#include <xmmintrin.h>
+/*!
+  \brief Multiplys the two input vectors and store their results in the third vector
+  \param cVector The vector where the results will be stored
+  \param aVector One of the vectors to be multiplied
+  \param bVector One of the vectors to be multiplied
+  \param num_points The number of values in aVector and bVector to be multiplied together and stored into cVector
+*/
+static inline void volk_32f_multiply_aligned16_sse(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){
+    unsigned int number = 0;
+    const unsigned int quarterPoints = num_points / 4;
+
+    float* cPtr = cVector;
+    const float* aPtr = aVector;
+    const float* bPtr=  bVector;
+
+    __m128 aVal, bVal, cVal;
+    for(;number < quarterPoints; number++){
+      
+      aVal = _mm_load_ps(aPtr); 
+      bVal = _mm_load_ps(bPtr);
+      
+      cVal = _mm_mul_ps(aVal, bVal); 
+      
+      _mm_store_ps(cPtr,cVal); // Store the results back into the C container
+
+      aPtr += 4;
+      bPtr += 4;
+      cPtr += 4;
+    }
+
+    number = quarterPoints * 4;
+    for(;number < num_points; number++){
+      *cPtr++ = (*aPtr++) * (*bPtr++);
+    }
+}
+#endif /* LV_HAVE_SSE */
+
+#if LV_HAVE_GENERIC
+/*!
+  \brief Multiplys the two input vectors and store their results in the third vector
+  \param cVector The vector where the results will be stored
+  \param aVector One of the vectors to be multiplied
+  \param bVector One of the vectors to be multiplied
+  \param num_points The number of values in aVector and bVector to be multiplied together and stored into cVector
+*/
+static inline void volk_32f_multiply_aligned16_generic(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){
+    float* cPtr = cVector;
+    const float* aPtr = aVector;
+    const float* bPtr=  bVector;
+    unsigned int number = 0;
+
+    for(number = 0; number < num_points; number++){
+      *cPtr++ = (*aPtr++) * (*bPtr++);
+    }
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+
+
+#endif /* INCLUDED_VOLK_32f_MULTIPLY_ALIGNED16_H */
diff --git a/volk/include/volk/volk_32f_normalize_aligned16.h b/volk/include/volk/volk_32f_normalize_aligned16.h
new file mode 100644
index 000000000..1aabb1d9d
--- /dev/null
+++ b/volk/include/volk/volk_32f_normalize_aligned16.h
@@ -0,0 +1,66 @@
+#ifndef INCLUDED_VOLK_32f_NORMALIZE_ALIGNED16_H
+#define INCLUDED_VOLK_32f_NORMALIZE_ALIGNED16_H
+
+#include <inttypes.h>
+#include <stdio.h>
+
+#if LV_HAVE_SSE
+#include <xmmintrin.h>
+/*!
+  \brief Normalizes all points in the buffer by the scalar value ( divides each data point by the scalar value )
+  \param vecBuffer The buffer of values to be vectorized
+  \param num_points The number of values in vecBuffer
+  \param scalar The scale value to be applied to each buffer value
+*/
+static inline void volk_32f_normalize_aligned16_sse(float* vecBuffer, const float scalar, unsigned int num_points){
+  unsigned int number = 0;
+  float* inputPtr = vecBuffer;
+
+  const float invScalar = 1.0 / scalar;
+  __m128 vecScalar = _mm_set_ps1(invScalar);
+
+  __m128 input1;
+
+  const uint64_t quarterPoints = num_points / 4;
+  for(;number < quarterPoints; number++){
+
+    input1 = _mm_load_ps(inputPtr);
+
+    input1 = _mm_mul_ps(input1, vecScalar);
+
+    _mm_store_ps(inputPtr, input1);
+
+    inputPtr += 4;
+  }
+
+  number = quarterPoints*4;
+  for(; number < num_points; number++){
+    *inputPtr *= invScalar;
+    inputPtr++;
+  }
+}
+#endif /* LV_HAVE_SSE */
+
+#if LV_HAVE_GENERIC
+/*!
+  \brief Normalizes the two input vectors and store their results in the third vector
+  \param cVector The vector where the results will be stored
+  \param aVector One of the vectors to be normalizeed
+  \param bVector One of the vectors to be normalizeed
+  \param num_points The number of values in aVector and bVector to be normalizeed together and stored into cVector
+*/
+static inline void volk_32f_normalize_aligned16_generic(float* vecBuffer, const float scalar, unsigned int num_points){
+  unsigned int number = 0;
+  float* inputPtr = vecBuffer;
+  const float invScalar = 1.0 / scalar;
+  for(number = 0; number < num_points; number++){
+    *inputPtr *= invScalar;
+    inputPtr++;
+  }
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+
+
+#endif /* INCLUDED_VOLK_32f_NORMALIZE_ALIGNED16_H */
diff --git a/volk/include/volk/volk_32f_power_aligned16.h b/volk/include/volk/volk_32f_power_aligned16.h
new file mode 100644
index 000000000..2ecd8eecb
--- /dev/null
+++ b/volk/include/volk/volk_32f_power_aligned16.h
@@ -0,0 +1,144 @@
+#ifndef INCLUDED_VOLK_32f_POWER_ALIGNED16_H
+#define INCLUDED_VOLK_32f_POWER_ALIGNED16_H
+
+#include <inttypes.h>
+#include <stdio.h>
+#include <math.h>
+
+#if LV_HAVE_SSE4_1
+#include <tmmintrin.h>
+
+#if LV_HAVE_LIB_SIMDMATH
+#include <simdmath.h>
+#endif /* LV_HAVE_LIB_SIMDMATH */
+
+/*!
+  \brief Takes each the input vector value to the specified power and stores the results in the return vector
+  \param cVector The vector where the results will be stored
+  \param aVector The vector of values to be taken to a power
+  \param power The power value to be applied to each data point
+  \param num_points The number of values in aVector to be taken to the specified power level and stored into cVector
+*/
+static inline void volk_32f_power_aligned16_sse4_1(float* cVector, const float* aVector, const float power, unsigned int num_points){
+  unsigned int number = 0;
+  const unsigned int quarterPoints = num_points / 4;
+  
+  float* cPtr = cVector;
+  const float* aPtr = aVector;
+
+#if LV_HAVE_LIB_SIMDMATH
+  __m128 vPower = _mm_set_ps1(power);
+  __m128 zeroValue = _mm_setzero_ps();
+  __m128 signMask;
+  __m128 negatedValues;
+  __m128 negativeOneToPower = _mm_set_ps1(powf(-1, power));
+  __m128 onesMask = _mm_set_ps1(1);
+  
+  __m128 aVal, cVal;
+  for(;number < quarterPoints; number++){
+    
+    aVal = _mm_load_ps(aPtr);
+    signMask = _mm_cmplt_ps(aVal, zeroValue);
+    negatedValues = _mm_sub_ps(zeroValue, aVal);
+    aVal = _mm_blendv_ps(aVal, negatedValues, signMask);
+    
+    // powf4 doesn't support negative values in the base, so we mask them off and then apply the negative after
+    cVal = powf4(aVal, vPower); // Takes each input value to the specified power
+
+    cVal = _mm_mul_ps( _mm_blendv_ps(onesMask, negativeOneToPower, signMask), cVal);
+
+    _mm_store_ps(cPtr,cVal); // Store the results back into the C container
+    
+    aPtr += 4;
+    cPtr += 4;
+  }
+
+  number = quarterPoints * 4;
+#endif /* LV_HAVE_LIB_SIMDMATH */
+
+  for(;number < num_points; number++){
+    *cPtr++ = powf((*aPtr++), power);
+  }
+}
+#endif /* LV_HAVE_SSE4_1 */
+
+#if LV_HAVE_SSE
+#include <xmmintrin.h>
+
+#if LV_HAVE_LIB_SIMDMATH
+#include <simdmath.h>
+#endif /* LV_HAVE_LIB_SIMDMATH */
+
+/*!
+  \brief Takes each the input vector value to the specified power and stores the results in the return vector
+  \param cVector The vector where the results will be stored
+  \param aVector The vector of values to be taken to a power
+  \param power The power value to be applied to each data point
+  \param num_points The number of values in aVector to be taken to the specified power level and stored into cVector
+*/
+static inline void volk_32f_power_aligned16_sse(float* cVector, const float* aVector, const float power, unsigned int num_points){
+  unsigned int number = 0;
+  const unsigned int quarterPoints = num_points / 4;
+  
+  float* cPtr = cVector;
+  const float* aPtr = aVector;
+
+#if LV_HAVE_LIB_SIMDMATH
+  __m128 vPower = _mm_set_ps1(power);
+  __m128 zeroValue = _mm_setzero_ps();
+  __m128 signMask;
+  __m128 negatedValues;
+  __m128 negativeOneToPower = _mm_set_ps1(powf(-1, power));
+  __m128 onesMask = _mm_set_ps1(1);
+  
+  __m128 aVal, cVal;
+  for(;number < quarterPoints; number++){
+    
+    aVal = _mm_load_ps(aPtr);
+    signMask = _mm_cmplt_ps(aVal, zeroValue);
+    negatedValues = _mm_sub_ps(zeroValue, aVal);
+    aVal = _mm_or_ps(_mm_andnot_ps(signMask, aVal), _mm_and_ps(signMask, negatedValues) );
+    
+    // powf4 doesn't support negative values in the base, so we mask them off and then apply the negative after
+    cVal = powf4(aVal, vPower); // Takes each input value to the specified power
+
+    cVal = _mm_mul_ps( _mm_or_ps( _mm_andnot_ps(signMask, onesMask), _mm_and_ps(signMask, negativeOneToPower) ), cVal);
+
+    _mm_store_ps(cPtr,cVal); // Store the results back into the C container
+    
+    aPtr += 4;
+    cPtr += 4;
+  }
+
+  number = quarterPoints * 4;
+#endif /* LV_HAVE_LIB_SIMDMATH */
+
+  for(;number < num_points; number++){
+    *cPtr++ = powf((*aPtr++), power);
+  }
+}
+#endif /* LV_HAVE_SSE */
+
+#if LV_HAVE_GENERIC
+  /*!
+    \brief Takes each the input vector value to the specified power and stores the results in the return vector
+    \param cVector The vector where the results will be stored
+    \param aVector The vector of values to be taken to a power
+    \param power The power value to be applied to each data point
+    \param num_points The number of values in aVector to be taken to the specified power level and stored into cVector
+  */
+static inline void volk_32f_power_aligned16_generic(float* cVector, const float* aVector, const float power, unsigned int num_points){
+  float* cPtr = cVector;
+  const float* aPtr = aVector;
+  unsigned int number = 0;
+
+  for(number = 0; number < num_points; number++){
+    *cPtr++ = powf((*aPtr++), power);
+  }
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+
+
+#endif /* INCLUDED_VOLK_32f_POWER_ALIGNED16_H */
diff --git a/volk/include/volk/volk_32f_sqrt_aligned16.h b/volk/include/volk/volk_32f_sqrt_aligned16.h
new file mode 100644
index 000000000..0b2eaf251
--- /dev/null
+++ b/volk/include/volk/volk_32f_sqrt_aligned16.h
@@ -0,0 +1,64 @@
+#ifndef INCLUDED_VOLK_32f_SQRT_ALIGNED16_H
+#define INCLUDED_VOLK_32f_SQRT_ALIGNED16_H
+
+#include <inttypes.h>
+#include <stdio.h>
+#include <math.h>
+
+#if LV_HAVE_SSE
+#include <xmmintrin.h>
+/*!
+  \brief Sqrts the two input vectors and store their results in the third vector
+  \param cVector The vector where the results will be stored
+  \param aVector One of the vectors to be sqrted
+  \param num_points The number of values in aVector and bVector to be sqrted together and stored into cVector
+*/
+static inline void volk_32f_sqrt_aligned16_sse(float* cVector, const float* aVector, unsigned int num_points){
+    unsigned int number = 0;
+    const unsigned int quarterPoints = num_points / 4;
+
+    float* cPtr = cVector;
+    const float* aPtr = aVector;
+
+    __m128 aVal, cVal;
+    for(;number < quarterPoints; number++){
+      
+      aVal = _mm_load_ps(aPtr); 
+      
+      cVal = _mm_sqrt_ps(aVal); 
+      
+      _mm_store_ps(cPtr,cVal); // Store the results back into the C container
+
+      aPtr += 4;
+      cPtr += 4;
+    }
+
+    number = quarterPoints * 4;
+    for(;number < num_points; number++){
+      *cPtr++ = sqrtf(*aPtr++);
+    }
+}
+#endif /* LV_HAVE_SSE */
+
+#if LV_HAVE_GENERIC
+/*!
+  \brief Sqrts the two input vectors and store their results in the third vector
+  \param cVector The vector where the results will be stored
+  \param aVector One of the vectors to be sqrted
+  \param num_points The number of values in aVector and bVector to be sqrted together and stored into cVector
+*/
+static inline void volk_32f_sqrt_aligned16_generic(float* cVector, const float* aVector, unsigned int num_points){
+    float* cPtr = cVector;
+    const float* aPtr = aVector;
+    unsigned int number = 0;
+
+    for(number = 0; number < num_points; number++){
+      *cPtr++ = sqrtf(*aPtr++);
+    }
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+
+
+#endif /* INCLUDED_VOLK_32f_SQRT_ALIGNED16_H */
diff --git a/volk/include/volk/volk_32f_stddev_aligned16.h b/volk/include/volk/volk_32f_stddev_aligned16.h
new file mode 100644
index 000000000..1c6a08437
--- /dev/null
+++ b/volk/include/volk/volk_32f_stddev_aligned16.h
@@ -0,0 +1,144 @@
+#ifndef INCLUDED_VOLK_32f_STDDEV_ALIGNED16_H
+#define INCLUDED_VOLK_32f_STDDEV_ALIGNED16_H
+
+#include <inttypes.h>
+#include <stdio.h>
+#include <math.h>
+
+#if LV_HAVE_SSE4_1
+#include <smmintrin.h>
+/*!
+  \brief Calculates the standard deviation of the input buffer using the supplied mean
+  \param stddev The calculated standard deviation
+  \param inputBuffer The buffer of points to calculate the std deviation for
+  \param mean The mean of the input buffer
+  \param num_points The number of values in input buffer to used in the stddev calculation
+*/
+static inline void volk_32f_stddev_aligned16_sse4_1(float* stddev, const float* inputBuffer, const float mean, unsigned int num_points){
+  float returnValue = 0;
+  if(num_points > 0){
+    unsigned int number = 0;
+    const unsigned int sixteenthPoints = num_points / 16;
+
+    const float* aPtr = inputBuffer;
+
+    float squareBuffer[4] __attribute__((aligned(128)));
+
+    __m128 squareAccumulator = _mm_setzero_ps();
+    __m128 aVal1, aVal2, aVal3, aVal4;
+    __m128 cVal1, cVal2, cVal3, cVal4;
+    for(;number < sixteenthPoints; number++) {
+      aVal1 = _mm_load_ps(aPtr); aPtr += 4;    
+      cVal1 = _mm_dp_ps(aVal1, aVal1, 0xF1);
+
+      aVal2 = _mm_load_ps(aPtr); aPtr += 4;
+      cVal2 = _mm_dp_ps(aVal2, aVal2, 0xF2);
+
+      aVal3 = _mm_load_ps(aPtr); aPtr += 4;
+      cVal3 = _mm_dp_ps(aVal3, aVal3, 0xF4);
+
+      aVal4 = _mm_load_ps(aPtr); aPtr += 4;
+      cVal4 = _mm_dp_ps(aVal4, aVal4, 0xF8);
+
+      cVal1 = _mm_or_ps(cVal1, cVal2);
+      cVal3 = _mm_or_ps(cVal3, cVal4);
+      cVal1 = _mm_or_ps(cVal1, cVal3);
+
+      squareAccumulator = _mm_add_ps(squareAccumulator, cVal1); // squareAccumulator += x^2
+    }
+    _mm_store_ps(squareBuffer,squareAccumulator); // Store the results back into the C container  
+    returnValue = squareBuffer[0];
+    returnValue += squareBuffer[1];
+    returnValue += squareBuffer[2];
+    returnValue += squareBuffer[3];
+  
+    number = sixteenthPoints * 16;
+    for(;number < num_points; number++){
+      returnValue += (*aPtr) * (*aPtr);
+      aPtr++;
+    }
+    returnValue /= num_points;
+    returnValue -= (mean * mean);
+    returnValue = sqrt(returnValue);
+  }
+  *stddev = returnValue;
+}
+#endif /* LV_HAVE_SSE4_1 */
+
+#if LV_HAVE_SSE
+#include <xmmintrin.h>
+/*!
+  \brief Calculates the standard deviation of the input buffer using the supplied mean
+  \param stddev The calculated standard deviation
+  \param inputBuffer The buffer of points to calculate the std deviation for
+  \param mean The mean of the input buffer
+  \param num_points The number of values in input buffer to used in the stddev calculation
+*/
+static inline void volk_32f_stddev_aligned16_sse(float* stddev, const float* inputBuffer, const float mean, unsigned int num_points){
+  float returnValue = 0;
+  if(num_points > 0){
+    unsigned int number = 0;
+    const unsigned int quarterPoints = num_points / 4;
+
+    const float* aPtr = inputBuffer;
+
+    float squareBuffer[4] __attribute__((aligned(128)));
+
+    __m128 squareAccumulator = _mm_setzero_ps();
+    __m128 aVal = _mm_setzero_ps();
+    for(;number < quarterPoints; number++) {
+      aVal = _mm_load_ps(aPtr);                     // aVal = x
+      aVal = _mm_mul_ps(aVal, aVal);                // squareAccumulator += x^2
+      squareAccumulator = _mm_add_ps(squareAccumulator, aVal);
+      aPtr += 4;
+    }
+    _mm_store_ps(squareBuffer,squareAccumulator); // Store the results back into the C container  
+    returnValue = squareBuffer[0];
+    returnValue += squareBuffer[1];
+    returnValue += squareBuffer[2];
+    returnValue += squareBuffer[3];
+  
+    number = quarterPoints * 4;
+    for(;number < num_points; number++){
+      returnValue += (*aPtr) * (*aPtr);
+      aPtr++;
+    }
+    returnValue /= num_points;
+    returnValue -= (mean * mean);
+    returnValue = sqrt(returnValue);
+  }
+  *stddev = returnValue;
+}
+#endif /* LV_HAVE_SSE */
+
+#if LV_HAVE_GENERIC
+/*!
+  \brief Calculates the standard deviation of the input buffer using the supplied mean
+  \param stddev The calculated standard deviation
+  \param inputBuffer The buffer of points to calculate the std deviation for
+  \param mean The mean of the input buffer
+  \param num_points The number of values in input buffer to used in the stddev calculation
+*/
+static inline void volk_32f_stddev_aligned16_generic(float* stddev, const float* inputBuffer, const float mean, unsigned int num_points){
+  float returnValue = 0;
+  if(num_points > 0){
+    const float* aPtr = inputBuffer;
+    unsigned int number = 0;
+      
+    for(number = 0; number < num_points; number++){
+      returnValue += (*aPtr) * (*aPtr);
+      aPtr++;
+    }
+
+    returnValue /= num_points;
+    returnValue -= (mean * mean);
+    returnValue = sqrt(returnValue);
+  }
+  *stddev = returnValue;
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+
+
+#endif /* INCLUDED_VOLK_32f_STDDEV_ALIGNED16_H */
diff --git a/volk/include/volk/volk_32f_stddev_and_mean_aligned16.h b/volk/include/volk/volk_32f_stddev_and_mean_aligned16.h
new file mode 100644
index 000000000..1cd502257
--- /dev/null
+++ b/volk/include/volk/volk_32f_stddev_and_mean_aligned16.h
@@ -0,0 +1,169 @@
+#ifndef INCLUDED_VOLK_32f_STDDEV_AND_MEAN_ALIGNED16_H
+#define INCLUDED_VOLK_32f_STDDEV_AND_MEAN_ALIGNED16_H
+
+#include <inttypes.h>
+#include <stdio.h>
+#include <math.h>
+
+#if LV_HAVE_SSE4_1
+#include <smmintrin.h>
+/*!
+  \brief Calculates the standard deviation and mean of the input buffer
+  \param stddev The calculated standard deviation
+  \param mean The mean of the input buffer
+  \param inputBuffer The buffer of points to calculate the std deviation for
+  \param num_points The number of values in input buffer to used in the stddev and mean calculations
+*/
+static inline void volk_32f_stddev_and_mean_aligned16_sse4_1(float* stddev, float* mean, const float* inputBuffer, unsigned int num_points){
+  float returnValue = 0;
+  float newMean = 0;
+  if(num_points > 0){
+    unsigned int number = 0;
+    const unsigned int sixteenthPoints = num_points / 16;
+
+    const float* aPtr = inputBuffer;
+    float meanBuffer[4] __attribute__((aligned(128)));
+    float squareBuffer[4] __attribute__((aligned(128)));
+
+    __m128 accumulator = _mm_setzero_ps();
+    __m128 squareAccumulator = _mm_setzero_ps();
+    __m128 aVal1, aVal2, aVal3, aVal4;
+    __m128 cVal1, cVal2, cVal3, cVal4;
+    for(;number < sixteenthPoints; number++) {
+      aVal1 = _mm_load_ps(aPtr); aPtr += 4;   
+      cVal1 = _mm_dp_ps(aVal1, aVal1, 0xF1);
+      accumulator = _mm_add_ps(accumulator, aVal1);  // accumulator += x
+
+      aVal2 = _mm_load_ps(aPtr); aPtr += 4;
+      cVal2 = _mm_dp_ps(aVal2, aVal2, 0xF2);
+      accumulator = _mm_add_ps(accumulator, aVal2);  // accumulator += x
+
+      aVal3 = _mm_load_ps(aPtr); aPtr += 4;
+      cVal3 = _mm_dp_ps(aVal3, aVal3, 0xF4);
+      accumulator = _mm_add_ps(accumulator, aVal3);  // accumulator += x
+
+      aVal4 = _mm_load_ps(aPtr); aPtr += 4;
+      cVal4 = _mm_dp_ps(aVal4, aVal4, 0xF8);
+      accumulator = _mm_add_ps(accumulator, aVal4);  // accumulator += x
+
+      cVal1 = _mm_or_ps(cVal1, cVal2);
+      cVal3 = _mm_or_ps(cVal3, cVal4);
+      cVal1 = _mm_or_ps(cVal1, cVal3);
+
+      squareAccumulator = _mm_add_ps(squareAccumulator, cVal1); // squareAccumulator += x^2
+    }
+    _mm_store_ps(meanBuffer,accumulator); // Store the results back into the C container
+    _mm_store_ps(squareBuffer,squareAccumulator); // Store the results back into the C container  
+    newMean = meanBuffer[0];
+    newMean += meanBuffer[1];
+    newMean += meanBuffer[2];
+    newMean += meanBuffer[3];
+    returnValue = squareBuffer[0];
+    returnValue += squareBuffer[1];
+    returnValue += squareBuffer[2];
+    returnValue += squareBuffer[3];
+  
+    number = sixteenthPoints * 16;
+    for(;number < num_points; number++){
+      returnValue += (*aPtr) * (*aPtr);
+      newMean += *aPtr++;
+    }
+    newMean /= num_points;
+    returnValue /= num_points;
+    returnValue -= (newMean * newMean);
+    returnValue = sqrt(returnValue);
+  }
+  *stddev = returnValue;
+  *mean = newMean;
+}
+#endif /* LV_HAVE_SSE4_1 */
+
+#if LV_HAVE_SSE
+#include <xmmintrin.h>
+/*!
+  \brief Calculates the standard deviation and mean of the input buffer
+  \param stddev The calculated standard deviation
+  \param mean The mean of the input buffer
+  \param inputBuffer The buffer of points to calculate the std deviation for
+  \param num_points The number of values in input buffer to used in the stddev and mean calculations
+*/
+static inline void volk_32f_stddev_and_mean_aligned16_sse(float* stddev, float* mean, const float* inputBuffer, unsigned int num_points){
+  float returnValue = 0;
+  float newMean = 0;
+  if(num_points > 0){
+    unsigned int number = 0;
+    const unsigned int quarterPoints = num_points / 4;
+
+    const float* aPtr = inputBuffer;
+    float meanBuffer[4] __attribute__((aligned(128)));
+    float squareBuffer[4] __attribute__((aligned(128)));
+
+    __m128 accumulator = _mm_setzero_ps();
+    __m128 squareAccumulator = _mm_setzero_ps();
+    __m128 aVal = _mm_setzero_ps();
+    for(;number < quarterPoints; number++) {
+      aVal = _mm_load_ps(aPtr);                     // aVal = x
+      accumulator = _mm_add_ps(accumulator, aVal);  // accumulator += x
+      aVal = _mm_mul_ps(aVal, aVal);                // squareAccumulator += x^2
+      squareAccumulator = _mm_add_ps(squareAccumulator, aVal);
+      aPtr += 4;
+    }
+    _mm_store_ps(meanBuffer,accumulator); // Store the results back into the C container
+    _mm_store_ps(squareBuffer,squareAccumulator); // Store the results back into the C container  
+    newMean = meanBuffer[0];
+    newMean += meanBuffer[1];
+    newMean += meanBuffer[2];
+    newMean += meanBuffer[3];
+    returnValue = squareBuffer[0];
+    returnValue += squareBuffer[1];
+    returnValue += squareBuffer[2];
+    returnValue += squareBuffer[3];
+  
+    number = quarterPoints * 4;
+    for(;number < num_points; number++){
+      returnValue += (*aPtr) * (*aPtr);
+      newMean += *aPtr++;
+    }
+    newMean /= num_points;
+    returnValue /= num_points;
+    returnValue -= (newMean * newMean);
+    returnValue = sqrt(returnValue);
+  }
+  *stddev = returnValue;
+  *mean = newMean;
+}
+#endif /* LV_HAVE_SSE */
+
+#if LV_HAVE_GENERIC
+/*!
+  \brief Calculates the standard deviation and mean of the input buffer
+  \param stddev The calculated standard deviation
+  \param mean The mean of the input buffer
+  \param inputBuffer The buffer of points to calculate the std deviation for
+  \param num_points The number of values in input buffer to used in the stddev and mean calculations
+*/
+static inline void volk_32f_stddev_and_mean_aligned16_generic(float* stddev, float* mean, const float* inputBuffer, unsigned int num_points){
+  float returnValue = 0;
+  float newMean = 0;
+  if(num_points > 0){
+    const float* aPtr = inputBuffer;
+    unsigned int number = 0;
+    
+    for(number = 0; number < num_points; number++){
+      returnValue += (*aPtr) * (*aPtr);
+      newMean += *aPtr++;
+    }
+    newMean /= num_points;
+    returnValue /= num_points;
+    returnValue -= (newMean * newMean);
+    returnValue = sqrt(returnValue);
+  }
+  *stddev = returnValue;
+  *mean = newMean;
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+
+
+#endif /* INCLUDED_VOLK_32f_STDDEV_AND_MEAN_ALIGNED16_H */
diff --git a/volk/include/volk/volk_32f_subtract_aligned16.h b/volk/include/volk/volk_32f_subtract_aligned16.h
new file mode 100644
index 000000000..ac3f5e5d1
--- /dev/null
+++ b/volk/include/volk/volk_32f_subtract_aligned16.h
@@ -0,0 +1,67 @@
+#ifndef INCLUDED_VOLK_32f_SUBTRACT_ALIGNED16_H
+#define INCLUDED_VOLK_32f_SUBTRACT_ALIGNED16_H
+
+#include <inttypes.h>
+#include <stdio.h>
+
+#if LV_HAVE_SSE
+#include <xmmintrin.h>
+/*!
+  \brief Subtracts bVector form aVector and store their results in the cVector
+  \param cVector The vector where the results will be stored
+  \param aVector The initial vector
+  \param bVector The vector to be subtracted
+  \param num_points The number of values in aVector and bVector to be subtracted together and stored into cVector
+*/
+static inline void volk_32f_subtract_aligned16_sse(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){
+    unsigned int number = 0;
+    const unsigned int quarterPoints = num_points / 4;
+
+    float* cPtr = cVector;
+    const float* aPtr = aVector;
+    const float* bPtr=  bVector;
+
+    __m128 aVal, bVal, cVal;
+    for(;number < quarterPoints; number++){
+      
+      aVal = _mm_load_ps(aPtr); 
+      bVal = _mm_load_ps(bPtr);
+      
+      cVal = _mm_sub_ps(aVal, bVal); 
+      
+      _mm_store_ps(cPtr,cVal); // Store the results back into the C container
+
+      aPtr += 4;
+      bPtr += 4;
+      cPtr += 4;
+    }
+
+    number = quarterPoints * 4;
+    for(;number < num_points; number++){
+      *cPtr++ = (*aPtr++) - (*bPtr++);
+    }
+}
+#endif /* LV_HAVE_SSE */
+
+#if LV_HAVE_GENERIC
+/*!
+  \brief Subtracts bVector form aVector and store their results in the cVector
+  \param cVector The vector where the results will be stored
+  \param aVector The initial vector
+  \param bVector The vector to be subtracted
+  \param num_points The number of values in aVector and bVector to be subtracted together and stored into cVector
+*/
+static inline void volk_32f_subtract_aligned16_generic(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){
+    float* cPtr = cVector;
+    const float* aPtr = aVector;
+    const float* bPtr=  bVector;
+    unsigned int number = 0;
+
+    for(number = 0; number < num_points; number++){
+      *cPtr++ = (*aPtr++) - (*bPtr++);
+    }
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+#endif /* INCLUDED_VOLK_32f_SUBTRACT_ALIGNED16_H */
diff --git a/volk/include/volk/volk_32f_sum_of_poly_aligned16.h b/volk/include/volk/volk_32f_sum_of_poly_aligned16.h
new file mode 100644
index 000000000..a326e62b1
--- /dev/null
+++ b/volk/include/volk/volk_32f_sum_of_poly_aligned16.h
@@ -0,0 +1,151 @@
+#ifndef INCLUDED_VOLK_32F_SUM_OF_POLY_ALIGNED16_H
+#define INCLUDED_VOLK_32F_SUM_OF_POLY_ALIGNED16_H
+
+#include<inttypes.h>
+#include<stdio.h>
+#include<volk/volk_complex.h>
+
+#ifndef MAX
+#define MAX(X,Y) ((X) > (Y)?(X):(Y))
+#endif
+
+#if LV_HAVE_SSE3
+#include<xmmintrin.h>
+#include<pmmintrin.h>
+
+static inline void volk_32f_sum_of_poly_aligned16_sse3(float* target, float* src0, float* center_point_array, float* cutoff, unsigned int num_bytes) {
+  
+  
+  float result = 0.0;
+  float fst = 0.0;
+  float sq = 0.0;
+  float thrd = 0.0;
+  float frth = 0.0;
+  //float fith = 0.0;
+  
+  
+  
+  __m128 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10;// xmm11, xmm12;
+
+  xmm9 = _mm_setzero_ps();
+  xmm1 = _mm_setzero_ps();
+  
+  xmm0 = _mm_load1_ps(&center_point_array[0]);
+  xmm6 = _mm_load1_ps(&center_point_array[1]);
+  xmm7 = _mm_load1_ps(&center_point_array[2]);
+  xmm8 = _mm_load1_ps(&center_point_array[3]);
+  //xmm11 = _mm_load1_ps(&center_point_array[4]);
+  xmm10 = _mm_load1_ps(cutoff);
+  
+  int bound = num_bytes >> 4;
+  int leftovers = (num_bytes >> 2) & 3;
+  int i = 0;
+  
+  for(; i < bound; ++i) {
+    xmm2 = _mm_load_ps(src0);
+    xmm2 = _mm_max_ps(xmm10, xmm2);
+    xmm3 = _mm_mul_ps(xmm2, xmm2);
+    xmm4 = _mm_mul_ps(xmm2, xmm3);
+    xmm5 = _mm_mul_ps(xmm3, xmm3);
+    //xmm12 = _mm_mul_ps(xmm3, xmm4);
+
+    xmm2 = _mm_mul_ps(xmm2, xmm0);
+    xmm3 = _mm_mul_ps(xmm3, xmm6);
+    xmm4 = _mm_mul_ps(xmm4, xmm7);
+    xmm5 = _mm_mul_ps(xmm5, xmm8);
+    //xmm12 = _mm_mul_ps(xmm12, xmm11);
+
+    xmm2 = _mm_add_ps(xmm2, xmm3);
+    xmm3 = _mm_add_ps(xmm4, xmm5);
+    
+    src0 += 4;
+    
+    xmm9 = _mm_add_ps(xmm2, xmm9);
+    
+    xmm1 = _mm_add_ps(xmm3, xmm1);
+
+    //xmm9 = _mm_add_ps(xmm12, xmm9);
+  }
+  
+  xmm2 = _mm_hadd_ps(xmm9, xmm1);
+  xmm3 = _mm_hadd_ps(xmm2, xmm2);
+  xmm4 = _mm_hadd_ps(xmm3, xmm3);
+
+  _mm_store_ss(&result, xmm4);
+    
+  
+
+  for(i = 0; i < leftovers; ++i) {
+    fst = src0[i];
+    fst = MAX(fst, *cutoff);
+    sq = fst * fst;
+    thrd = fst * sq;
+    frth = sq * sq;
+    //fith = sq * thrd;
+    
+    result += (center_point_array[0] * fst + 
+	       center_point_array[1] * sq + 
+	       center_point_array[2] * thrd + 
+	       center_point_array[3] * frth);// + 
+	       //center_point_array[4] * fith);
+  }
+
+  result += ((float)((bound * 4) + leftovers)) * center_point_array[4]; //center_point_array[5];
+
+  target[0] = result;
+}
+ 
+
+#endif /*LV_HAVE_SSE3*/
+
+#if LV_HAVE_GENERIC
+
+static inline void volk_32f_sum_of_poly_aligned16_generic(float* target, float* src0, float* center_point_array, float* cutoff, unsigned int num_bytes) {
+
+
+    
+  float result = 0.0;
+  float fst = 0.0;
+  float sq = 0.0;
+  float thrd = 0.0;
+  float frth = 0.0;
+  //float fith = 0.0;
+  
+
+
+  int i = 0; 
+  
+  for(; i < num_bytes >> 2; ++i) {
+    fst = src0[i];
+    fst = MAX(fst, *cutoff);
+    
+    sq = fst * fst;
+    thrd = fst * sq;
+    frth = sq * sq;
+    //fith = sq * thrd;
+    
+    result += (center_point_array[0] * fst + 
+	       center_point_array[1] * sq + 
+	       center_point_array[2] * thrd + 
+	       center_point_array[3] * frth); //+
+	       //center_point_array[4] * fith);
+    /*printf("%f12...%d\n", (center_point_array[0] * fst + 
+		  center_point_array[1] * sq + 
+		  center_point_array[2] * thrd + 
+			 center_point_array[3] * frth) +
+	   //center_point_array[4] * fith) + 
+	   (center_point_array[4]), i);
+    */
+  }
+
+  result += ((float)(num_bytes >> 2)) * (center_point_array[4]);//(center_point_array[5]);
+
+  
+  
+  *target = result;
+}
+
+#endif /*LV_HAVE_GENERIC*/
+
+
+#endif /*INCLUDED_VOLK_32F_SUM_OF_POLY_ALIGNED16_H*/
diff --git a/volk/include/volk/volk_32fc_32f_multiply_aligned16.h b/volk/include/volk/volk_32fc_32f_multiply_aligned16.h
new file mode 100644
index 000000000..436656ca0
--- /dev/null
+++ b/volk/include/volk/volk_32fc_32f_multiply_aligned16.h
@@ -0,0 +1,82 @@
+#ifndef INCLUDED_VOLK_32fc_32f_MULTIPLY_ALIGNED16_H
+#define INCLUDED_VOLK_32fc_32f_MULTIPLY_ALIGNED16_H
+
+#include <inttypes.h>
+#include <stdio.h>
+
+#if LV_HAVE_SSE
+#include <xmmintrin.h>
+  /*!
+    \brief Multiplies the input complex vector with the input float vector and store their results in the third vector
+    \param cVector The vector where the results will be stored
+    \param aVector The complex vector to be multiplied
+    \param bVector The vectors containing the float values to be multiplied against each complex value in aVector
+    \param num_points The number of values in aVector and bVector to be multiplied together and stored into cVector
+  */
+static inline void volk_32fc_32f_multiply_aligned16_sse(lv_32fc_t* cVector, const lv_32fc_t* aVector, const float* bVector, unsigned int num_points){
+    unsigned int number = 0;
+    const unsigned int quarterPoints = num_points / 4;
+
+    lv_32fc_t* cPtr = cVector;
+    const lv_32fc_t* aPtr = aVector;
+    const float* bPtr=  bVector;
+
+    __m128 aVal1, aVal2, bVal, bVal1, bVal2, cVal;
+    for(;number < quarterPoints; number++){
+      
+      aVal1 = _mm_load_ps((const float*)aPtr);
+      aPtr += 2;
+ 
+      aVal2 = _mm_load_ps((const float*)aPtr); 
+      aPtr += 2;
+
+      bVal = _mm_load_ps(bPtr);
+      bPtr += 4;
+
+      bVal1 = _mm_shuffle_ps(bVal, bVal, _MM_SHUFFLE(1,1,0,0));
+      bVal2 = _mm_shuffle_ps(bVal, bVal, _MM_SHUFFLE(3,3,2,2));
+
+      cVal = _mm_mul_ps(aVal1, bVal1); 
+      
+      _mm_store_ps((float*)cPtr,cVal); // Store the results back into the C container
+      cPtr += 2;
+
+      cVal = _mm_mul_ps(aVal2, bVal2); 
+      
+      _mm_store_ps((float*)cPtr,cVal); // Store the results back into the C container
+
+      cPtr += 2;
+    }
+
+    number = quarterPoints * 4;
+    for(;number < num_points; number++){
+      *cPtr++ = (*aPtr++) * (*bPtr);
+      bPtr++;
+    }
+}
+#endif /* LV_HAVE_SSE */
+
+#if LV_HAVE_GENERIC
+  /*!
+    \brief Multiplies the input complex vector with the input lv_32fc_t vector and store their results in the third vector
+    \param cVector The vector where the results will be stored
+    \param aVector The complex vector to be multiplied
+    \param bVector The vectors containing the lv_32fc_t values to be multiplied against each complex value in aVector
+    \param num_points The number of values in aVector and bVector to be multiplied together and stored into cVector
+  */
+static inline void volk_32fc_32f_multiply_aligned16_generic(lv_32fc_t* cVector, const lv_32fc_t* aVector, const float* bVector, unsigned int num_points){
+  lv_32fc_t* cPtr = cVector;
+  const lv_32fc_t* aPtr = aVector;
+  const float* bPtr=  bVector;
+  unsigned int number = 0;
+  
+  for(number = 0; number < num_points; number++){
+    *cPtr++ = (*aPtr++) * (*bPtr++);
+  }
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+
+
+#endif /* INCLUDED_VOLK_32fc_32f_MULTIPLY_ALIGNED16_H */
diff --git a/volk/include/volk/volk_32fc_32f_power_32fc_aligned16.h b/volk/include/volk/volk_32fc_32f_power_32fc_aligned16.h
new file mode 100644
index 000000000..2d71ee4f8
--- /dev/null
+++ b/volk/include/volk/volk_32fc_32f_power_32fc_aligned16.h
@@ -0,0 +1,109 @@
+#ifndef INCLUDED_VOLK_32fc_32f_POWER_32fc_ALIGNED16_H
+#define INCLUDED_VOLK_32fc_32f_POWER_32fc_ALIGNED16_H
+
+#include <inttypes.h>
+#include <stdio.h>
+
+#if LV_HAVE_SSE
+#include <xmmintrin.h>
+
+#if LV_HAVE_LIB_SIMDMATH
+#include <simdmath.h>
+#endif /* LV_HAVE_LIB_SIMDMATH */
+
+/*!
+  \brief Takes each the input complex vector value to the specified power and stores the results in the return vector
+  \param cVector The vector where the results will be stored
+  \param aVector The complex vector of values to be taken to a power
+  \param power The power value to be applied to each data point
+  \param num_points The number of values in aVector to be taken to the specified power level and stored into cVector
+*/
+static inline void volk_32fc_32f_power_32fc_aligned16_sse(lv_32fc_t* cVector, const lv_32fc_t* aVector, const float power, unsigned int num_points){
+  unsigned int number = 0;
+  const unsigned int quarterPoints = num_points / 4;
+  
+  lv_32fc_t* cPtr = cVector;
+  const lv_32fc_t* aPtr = aVector;
+
+#if LV_HAVE_LIB_SIMDMATH
+  __m128 vPower = _mm_set_ps1(power);
+  
+  __m128 cplxValue1, cplxValue2, magnitude, phase, iValue, qValue;
+  for(;number < quarterPoints; number++){
+    
+    cplxValue1 = _mm_load_ps((float*)aPtr); 
+    aPtr += 2;
+    
+    cplxValue2 = _mm_load_ps((float*)aPtr); 
+    aPtr += 2;
+    
+    // Convert to polar coordinates
+    
+    // Arrange in i1i2i3i4 format
+    iValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2,0,2,0));
+    // Arrange in q1q2q3q4 format
+    qValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(3,1,3,1));
+    
+    phase = atan2f4(qValue, iValue); // Calculate the Phase
+    
+    magnitude = _mm_sqrt_ps(_mm_add_ps(_mm_mul_ps(iValue, iValue), _mm_mul_ps(qValue, qValue))); // Calculate the magnitude by square rooting the added I2 and Q2 values
+    
+    // Now calculate the power of the polar coordinate data
+    magnitude = powf4(magnitude, vPower); // Take the magnitude to the specified power
+    
+    phase = _mm_mul_ps(phase, vPower); // Multiply the phase by the specified power
+    
+    // Convert back to cartesian coordinates
+    iValue = _mm_mul_ps( cosf4(phase), magnitude); // Multiply the cos of the phase by the magnitude
+    qValue = _mm_mul_ps( sinf4(phase), magnitude); // Multiply the sin of the phase by the magnitude
+    
+    cplxValue1 = _mm_unpacklo_ps(iValue, qValue); // Interleave the lower two i & q values
+    cplxValue2 = _mm_unpackhi_ps(iValue, qValue); // Interleave the upper two i & q values
+    
+    _mm_store_ps((float*)cPtr,cplxValue1); // Store the results back into the C container
+    
+    cPtr += 2;
+    
+    _mm_store_ps((float*)cPtr,cplxValue2); // Store the results back into the C container
+    
+    cPtr += 2;
+  }
+  
+  number = quarterPoints * 4;
+#endif /* LV_HAVE_LIB_SIMDMATH */
+
+  lv_32fc_t complexPower;
+  ((float*)&complexPower)[0] = power;
+  ((float*)&complexPower)[1] = 0;
+  for(;number < num_points; number++){
+    *cPtr++ = lv_cpow((*aPtr++), complexPower);
+  }
+}
+#endif /* LV_HAVE_SSE */
+
+#if LV_HAVE_GENERIC
+  /*!
+    \brief Takes each the input complex vector value to the specified power and stores the results in the return vector
+    \param cVector The vector where the results will be stored
+    \param aVector The complex vector of values to be taken to a power
+    \param power The power value to be applied to each data point
+    \param num_points The number of values in aVector to be taken to the specified power level and stored into cVector
+  */
+static inline void volk_32fc_32f_power_32fc_aligned16_generic(lv_32fc_t* cVector, const lv_32fc_t* aVector, const float power, unsigned int num_points){
+  lv_32fc_t* cPtr = cVector;
+  const lv_32fc_t* aPtr = aVector;
+  unsigned int number = 0;
+  lv_32fc_t complexPower;
+  ((float*)&complexPower)[0] = power;
+  ((float*)&complexPower)[1] = 0.0;
+
+  for(number = 0; number < num_points; number++){
+    *cPtr++ = lv_cpow((*aPtr++), complexPower);
+  }
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+
+
+#endif /* INCLUDED_VOLK_32fc_32f_POWER_32fc_ALIGNED16_H */
diff --git a/volk/include/volk/volk_32fc_atan2_32f_aligned16.h b/volk/include/volk/volk_32fc_atan2_32f_aligned16.h
new file mode 100644
index 000000000..df0ebb987
--- /dev/null
+++ b/volk/include/volk/volk_32fc_atan2_32f_aligned16.h
@@ -0,0 +1,158 @@
+#ifndef INCLUDED_VOLK_32fc_ATAN2_32f_ALIGNED16_H
+#define INCLUDED_VOLK_32fc_ATAN2_32f_ALIGNED16_H
+
+#include <inttypes.h>
+#include <stdio.h>
+#include <math.h>
+
+#if LV_HAVE_SSE4_1
+#include <smmintrin.h>
+
+#if LV_HAVE_LIB_SIMDMATH
+#include <simdmath.h>
+#endif /* LV_HAVE_LIB_SIMDMATH */
+
+/*!
+  \brief performs the atan2 on the input vector and stores the results in the output vector.
+  \param outputVector The byte-aligned vector where the results will be stored.
+  \param inputVector The byte-aligned input vector containing interleaved IQ data (I = cos, Q = sin).
+  \param normalizeFactor The atan2 results will be divided by this normalization factor.
+  \param num_points The number of complex values in the input vector.
+*/
+static inline void volk_32fc_atan2_32f_aligned16_sse4_1(float* outputVector,  const lv_32fc_t* complexVector, const float normalizeFactor, unsigned int num_points){
+  const float* complexVectorPtr = (float*)complexVector;
+  float* outPtr = outputVector;
+
+  unsigned int number = 0;
+  const unsigned int quarterPoints = num_points / 4;    
+  const float invNormalizeFactor = 1.0 / normalizeFactor;
+
+#if LV_HAVE_LIB_SIMDMATH
+  __m128 testVector = _mm_set_ps1(2*M_PI);
+  __m128 correctVector = _mm_set_ps1(M_PI);
+  __m128 vNormalizeFactor = _mm_set_ps1(invNormalizeFactor);
+  __m128 phase;
+  __m128 complex1, complex2, iValue, qValue;
+  __m128 keepMask;
+    
+  for (; number < quarterPoints; number++) {
+    // Load IQ data:
+    complex1 = _mm_load_ps(complexVectorPtr);
+    complexVectorPtr += 4;
+    complex2 = _mm_load_ps(complexVectorPtr);
+    complexVectorPtr += 4;
+    // Deinterleave IQ data:
+    iValue = _mm_shuffle_ps(complex1, complex2, _MM_SHUFFLE(2,0,2,0)); 
+    qValue = _mm_shuffle_ps(complex1, complex2, _MM_SHUFFLE(3,1,3,1)); 
+    // Arctan to get phase:
+    phase = atan2f4(qValue, iValue);
+    // When Q = 0 and I < 0, atan2f4 sucks and returns 2pi vice pi.
+    // Compare to 2pi:
+    keepMask = _mm_cmpneq_ps(phase,testVector);
+    phase = _mm_blendv_ps(correctVector, phase, keepMask);
+    // done with above correction.      
+    phase = _mm_mul_ps(phase, vNormalizeFactor);
+    _mm_store_ps((float*)outPtr, phase);
+    outPtr += 4;
+  }
+  number = quarterPoints * 4;
+#endif /* LV_HAVE_SIMDMATH_H */
+
+  for (; number < num_points; number++) {
+    const float real = *complexVectorPtr++;
+    const float imag = *complexVectorPtr++;
+    *outPtr++ = atan2f(imag, real) * invNormalizeFactor;
+  }
+}
+#endif /* LV_HAVE_SSE4_1 */
+
+
+#if LV_HAVE_SSE
+#include <xmmintrin.h>
+
+#if LV_HAVE_LIB_SIMDMATH
+#include <simdmath.h>
+#endif /* LV_HAVE_LIB_SIMDMATH */
+
+/*!
+  \brief performs the atan2 on the input vector and stores the results in the output vector.
+  \param outputVector The byte-aligned vector where the results will be stored.
+  \param inputVector The byte-aligned input vector containing interleaved IQ data (I = cos, Q = sin).
+  \param normalizeFactor The atan2 results will be divided by this normalization factor.
+  \param num_points The number of complex values in the input vector.
+*/
+static inline void volk_32fc_atan2_32f_aligned16_sse(float* outputVector,  const lv_32fc_t* complexVector, const float normalizeFactor, unsigned int num_points){
+  const float* complexVectorPtr = (float*)complexVector;
+  float* outPtr = outputVector;
+
+  unsigned int number = 0;
+  const unsigned int quarterPoints = num_points / 4;    
+  const float invNormalizeFactor = 1.0 / normalizeFactor;
+
+#if LV_HAVE_LIB_SIMDMATH
+  __m128 testVector = _mm_set_ps1(2*M_PI);
+  __m128 correctVector = _mm_set_ps1(M_PI);
+  __m128 vNormalizeFactor = _mm_set_ps1(invNormalizeFactor);
+  __m128 phase;
+  __m128 complex1, complex2, iValue, qValue;
+  __m128 mask;
+  __m128 keepMask;
+    
+  for (; number < quarterPoints; number++) {
+    // Load IQ data:
+    complex1 = _mm_load_ps(complexVectorPtr);
+    complexVectorPtr += 4;
+    complex2 = _mm_load_ps(complexVectorPtr);
+    complexVectorPtr += 4;
+    // Deinterleave IQ data:
+    iValue = _mm_shuffle_ps(complex1, complex2, _MM_SHUFFLE(2,0,2,0)); 
+    qValue = _mm_shuffle_ps(complex1, complex2, _MM_SHUFFLE(3,1,3,1)); 
+    // Arctan to get phase:
+    phase = atan2f4(qValue, iValue);
+    // When Q = 0 and I < 0, atan2f4 sucks and returns 2pi vice pi.
+    // Compare to 2pi:
+    keepMask = _mm_cmpneq_ps(phase,testVector);
+    phase = _mm_and_ps(phase, keepMask);
+    mask = _mm_andnot_ps(keepMask, correctVector);
+    phase = _mm_or_ps(phase, mask);
+    // done with above correction.      
+    phase = _mm_mul_ps(phase, vNormalizeFactor);
+    _mm_store_ps((float*)outPtr, phase);
+    outPtr += 4;
+  }
+  number = quarterPoints * 4;
+#endif /* LV_HAVE_SIMDMATH_H */
+
+  for (; number < num_points; number++) {
+    const float real = *complexVectorPtr++;
+    const float imag = *complexVectorPtr++;
+    *outPtr++ = atan2f(imag, real) * invNormalizeFactor;
+  }
+}
+#endif /* LV_HAVE_SSE */
+
+#if LV_HAVE_GENERIC
+/*!
+  \brief performs the atan2 on the input vector and stores the results in the output vector.
+  \param outputVector The vector where the results will be stored.
+  \param inputVector Input vector containing interleaved IQ data (I = cos, Q = sin).
+  \param normalizeFactor The atan2 results will be divided by this normalization factor.
+  \param num_points The number of complex values in the input vector.
+*/
+static inline void volk_32fc_atan2_32f_aligned16_generic(float* outputVector, const lv_32fc_t* inputVector, const float normalizeFactor, unsigned int num_points){
+  float* outPtr = outputVector;
+  const float* inPtr = (float*)inputVector;
+  const float invNormalizeFactor = 1.0 / normalizeFactor;
+  unsigned int number;
+  for ( number = 0; number < num_points; number++) {
+    const float real = *inPtr++;
+    const float imag = *inPtr++;
+    *outPtr++ = atan2f(imag, real) * invNormalizeFactor;
+  }
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+
+
+#endif /* INCLUDED_VOLK_32fc_ATAN2_32f_ALIGNED16_H */
diff --git a/volk/include/volk/volk_32fc_conjugate_dot_prod_aligned16.h b/volk/include/volk/volk_32fc_conjugate_dot_prod_aligned16.h
new file mode 100644
index 000000000..60103c1b5
--- /dev/null
+++ b/volk/include/volk/volk_32fc_conjugate_dot_prod_aligned16.h
@@ -0,0 +1,344 @@
+#ifndef INCLUDED_VOLK_32fc_CONJUGATE_DOT_PROD_ALIGNED16_H
+#define INCLUDED_VOLK_32fc_CONJUGATE_DOT_PROD_ALIGNED16_H
+
+#include<volk/volk_complex.h>
+#include<stdio.h>
+
+
+#if LV_HAVE_GENERIC
+
+
+static inline void volk_32fc_conjugate_dot_prod_aligned16_generic(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_bytes) {
+  
+  float * res = (float*) result;
+  float * in = (float*) input;
+  float * tp = (float*) taps;
+  unsigned int n_2_ccomplex_blocks = num_bytes >> 4;
+  unsigned int isodd = (num_bytes >> 3) &1;
+  
+  
+  
+  float sum0[2] = {0,0};
+  float sum1[2] = {0,0};
+  int i = 0;
+
+  
+  for(i = 0; i < n_2_ccomplex_blocks; ++i) {
+    
+
+    sum0[0] += in[0] * tp[0] + in[1] * tp[1];
+    sum0[1] += (-in[0] * tp[1]) + in[1] * tp[0];
+    sum1[0] += in[2] * tp[2] + in[3] * tp[3];
+    sum1[1] += (-in[2] * tp[3]) + in[3] * tp[2];
+    
+    
+    in += 4;
+    tp += 4;
+
+  }
+ 
+  
+  res[0] = sum0[0] + sum1[0];
+  res[1] = sum0[1] + sum1[1];
+  
+  
+  
+  for(i = 0; i < isodd; ++i) {
+
+
+    *result += input[(num_bytes >> 3) - 1] * lv_conj(taps[(num_bytes >> 3) - 1]);
+
+  }
+  /*
+  for(i = 0; i < num_bytes >> 3; ++i) {
+    *result += input[i] * conjf(taps[i]);
+  }
+  */
+}
+
+#endif /*LV_HAVE_GENERIC*/
+
+
+#if LV_HAVE_SSE && LV_HAVE_64
+
+
+static inline void volk_32fc_conjugate_dot_prod_aligned16_sse(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_bytes) {
+  
+  static const uint32_t conjugator[4] __attribute__((aligned(16)))= {0x00000000, 0x80000000, 0x00000000, 0x80000000};
+  
+
+
+
+  asm volatile 
+    (
+     "#  ccomplex_conjugate_dotprod_generic (float* result, const float *input,\n\t"
+     "#                         const float *taps, unsigned num_bytes)\n\t"
+     "#    float sum0 = 0;\n\t"
+     "#    float sum1 = 0;\n\t"
+     "#    float sum2 = 0;\n\t"
+     "#    float sum3 = 0;\n\t"
+     "#    do {\n\t"
+     "#      sum0 += input[0] * taps[0] - input[1] * taps[1];\n\t"
+     "#      sum1 += input[0] * taps[1] + input[1] * taps[0];\n\t"
+     "#      sum2 += input[2] * taps[2] - input[3] * taps[3];\n\t"
+     "#      sum3 += input[2] * taps[3] + input[3] * taps[2];\n\t"
+     "#      input += 4;\n\t"
+     "#      taps += 4;  \n\t"
+     "#    } while (--n_2_ccomplex_blocks != 0);\n\t"
+     "#    result[0] = sum0 + sum2;\n\t"
+     "#    result[1] = sum1 + sum3;\n\t"
+     "# TODO: prefetch and better scheduling\n\t"
+     "  xor    %%r9,  %%r9\n\t"
+     "  xor    %%r10, %%r10\n\t"
+     "  movq   %[conjugator], %%r9\n\t"
+     "  movq   %%rcx, %%rax\n\t"
+     "  movaps 0(%%r9), %%xmm8\n\t"
+     "  movq   %%rcx, %%r8\n\t"
+     "  movq   %[rsi],  %%r9\n\t"
+     "  movq   %[rdx], %%r10\n\t"
+     "	xorps	%%xmm6, %%xmm6		# zero accumulators\n\t"
+     "	movaps	0(%%r9), %%xmm0\n\t"
+     "	xorps	%%xmm7, %%xmm7		# zero accumulators\n\t"
+     "	movups	0(%%r10), %%xmm2\n\t"
+     "	shr	$5, %%rax		# rax = n_2_ccomplex_blocks / 2\n\t"
+     "  shr     $4, %%r8\n\t"
+     "  xorps  %%xmm8, %%xmm2\n\t"
+     "	jmp	.%=L1_test\n\t"
+     "	# 4 taps / loop\n\t"
+     "	# something like ?? cycles / loop\n\t"
+     ".%=Loop1:	\n\t"
+     "# complex prod: C += A * B,  w/ temp Z & Y (or B), xmmPN=$0x8000000080000000\n\t"
+     "#	movaps	(%%r9), %%xmmA\n\t"
+     "#	movaps	(%%r10), %%xmmB\n\t"
+     "#	movaps	%%xmmA, %%xmmZ\n\t"
+     "#	shufps	$0xb1, %%xmmZ, %%xmmZ	# swap internals\n\t"
+     "#	mulps	%%xmmB, %%xmmA\n\t"
+     "#	mulps	%%xmmZ, %%xmmB\n\t"
+     "#	# SSE replacement for: pfpnacc %%xmmB, %%xmmA\n\t"
+     "#	xorps	%%xmmPN, %%xmmA\n\t"
+     "#	movaps	%%xmmA, %%xmmZ\n\t"
+     "#	unpcklps %%xmmB, %%xmmA\n\t"
+     "#	unpckhps %%xmmB, %%xmmZ\n\t"
+     "#	movaps	%%xmmZ, %%xmmY\n\t"
+     "#	shufps	$0x44, %%xmmA, %%xmmZ	# b01000100\n\t"
+     "#	shufps	$0xee, %%xmmY, %%xmmA	# b11101110\n\t"
+     "#	addps	%%xmmZ, %%xmmA\n\t"
+     "#	addps	%%xmmA, %%xmmC\n\t"
+     "# A=xmm0, B=xmm2, Z=xmm4\n\t"
+     "# A'=xmm1, B'=xmm3, Z'=xmm5\n\t"
+     "	movaps	16(%%r9), %%xmm1\n\t"
+     "	movaps	%%xmm0, %%xmm4\n\t"
+     "	mulps	%%xmm2, %%xmm0\n\t"
+     "	shufps	$0xb1, %%xmm4, %%xmm4	# swap internals\n\t"
+     "	movaps	16(%%r10), %%xmm3\n\t"
+     "	movaps	%%xmm1, %%xmm5\n\t"
+     "  xorps   %%xmm8, %%xmm3\n\t"
+     "	addps	%%xmm0, %%xmm6\n\t"
+     "	mulps	%%xmm3, %%xmm1\n\t"
+     "	shufps	$0xb1, %%xmm5, %%xmm5	# swap internals\n\t"
+     "	addps	%%xmm1, %%xmm6\n\t"
+     "	mulps	%%xmm4, %%xmm2\n\t"
+     "	movaps	32(%%r9), %%xmm0\n\t"
+     "	addps	%%xmm2, %%xmm7\n\t"
+     "	mulps	%%xmm5, %%xmm3\n\t"
+     "	add	$32, %%r9\n\t"
+     "	movaps	32(%%r10), %%xmm2\n\t"
+     "	addps	%%xmm3, %%xmm7\n\t"
+     "	add	$32, %%r10\n\t"
+     "  xorps   %%xmm8, %%xmm2\n\t"
+     ".%=L1_test:\n\t"
+     "	dec	%%rax\n\t"
+     "	jge	.%=Loop1\n\t"
+     "	# We've handled the bulk of multiplies up to here.\n\t"
+     "	# Let's sse if original n_2_ccomplex_blocks was odd.\n\t"
+     "	# If so, we've got 2 more taps to do.\n\t"
+     "	and	$1, %%r8\n\t"
+     "	je	.%=Leven\n\t"
+     "	# The count was odd, do 2 more taps.\n\t"
+     "	# Note that we've already got mm0/mm2 preloaded\n\t"
+     "	# from the main loop.\n\t"
+     "	movaps	%%xmm0, %%xmm4\n\t"
+     "	mulps	%%xmm2, %%xmm0\n\t"
+     "	shufps	$0xb1, %%xmm4, %%xmm4	# swap internals\n\t"
+     "	addps	%%xmm0, %%xmm6\n\t"
+     "	mulps	%%xmm4, %%xmm2\n\t"
+     "	addps	%%xmm2, %%xmm7\n\t"
+     ".%=Leven:\n\t"
+     "	# neg inversor\n\t"
+     "	xorps	%%xmm1, %%xmm1\n\t"
+     "	mov	$0x80000000, %%r9\n\t"
+     "	movd	%%r9, %%xmm1\n\t"
+     "	shufps	$0x11, %%xmm1, %%xmm1	# b00010001 # 0 -0 0 -0\n\t"
+     "	# pfpnacc\n\t"
+     "	xorps	%%xmm1, %%xmm6\n\t"
+     "	movaps	%%xmm6, %%xmm2\n\t"
+     "	unpcklps %%xmm7, %%xmm6\n\t"
+     "	unpckhps %%xmm7, %%xmm2\n\t"
+     "	movaps	%%xmm2, %%xmm3\n\t"
+     "	shufps	$0x44, %%xmm6, %%xmm2	# b01000100\n\t"
+     "	shufps	$0xee, %%xmm3, %%xmm6	# b11101110\n\t"
+     "	addps	%%xmm2, %%xmm6\n\t"
+     "					# xmm6 = r1 i2 r3 i4\n\t"
+     "	movhlps	%%xmm6, %%xmm4		# xmm4 = r3 i4 ?? ??\n\t"
+     "	addps	%%xmm4, %%xmm6		# xmm6 = r1+r3 i2+i4 ?? ??\n\t"
+     "	movlps	%%xmm6, (%[rdi])		# store low 2x32 bits (complex) to memory\n\t"
+     :
+     :[rsi] "r" (input), [rdx] "r" (taps), "c" (num_bytes), [rdi] "r" (result), [conjugator] "r" (conjugator)
+     :"rax", "r8", "r9", "r10"
+     );
+  
+  
+  int getem = num_bytes % 16;
+  
+  
+  for(; getem > 0; getem -= 8) {
+  
+    
+    *result += (input[(num_bytes >> 3) - 1] * lv_conj(taps[(num_bytes >> 3) - 1]));
+  
+  }
+
+  return;
+}  
+#endif
+
+#if LV_HAVE_SSE && LV_HAVE_32
+static inline void volk_32fc_conjugate_dot_prod_aligned16_sse_32(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_bytes) {
+  
+  static const uint32_t conjugator[4] __attribute__((aligned(16)))= {0x00000000, 0x80000000, 0x00000000, 0x80000000};
+
+  int bound = num_bytes >> 4;
+  int leftovers = num_bytes % 16;
+
+  
+  asm volatile 
+    (
+     "	#pushl	%%ebp\n\t"
+     "	#movl	%%esp, %%ebp\n\t"
+     "	#movl	12(%%ebp), %%eax		# input\n\t"
+     "	#movl	16(%%ebp), %%edx		# taps\n\t"
+     "	#movl	20(%%ebp), %%ecx                # n_bytes\n\t"
+     "  movaps  0(%[conjugator]), %%xmm1\n\t"
+     "	xorps	%%xmm6, %%xmm6		# zero accumulators\n\t"
+     "	movaps	0(%[eax]), %%xmm0\n\t"
+     "	xorps	%%xmm7, %%xmm7		# zero accumulators\n\t"
+     "	movaps	0(%[edx]), %%xmm2\n\t"
+     "  movl    %[ecx], (%[out])\n\t"
+     "	shrl	$5, %[ecx]		# ecx = n_2_ccomplex_blocks / 2\n\t"
+     
+     "  xorps   %%xmm1, %%xmm2\n\t"
+     "	jmp	.%=L1_test\n\t"
+     "	# 4 taps / loop\n\t"
+     "	# something like ?? cycles / loop\n\t"
+     ".%=Loop1:	\n\t"
+     "# complex prod: C += A * B,  w/ temp Z & Y (or B), xmmPN=$0x8000000080000000\n\t"
+     "#	movaps	(%[eax]), %%xmmA\n\t"
+     "#	movaps	(%[edx]), %%xmmB\n\t"
+     "#	movaps	%%xmmA, %%xmmZ\n\t"
+     "#	shufps	$0xb1, %%xmmZ, %%xmmZ	# swap internals\n\t"
+     "#	mulps	%%xmmB, %%xmmA\n\t"
+     "#	mulps	%%xmmZ, %%xmmB\n\t"
+     "#	# SSE replacement for: pfpnacc %%xmmB, %%xmmA\n\t"
+     "#	xorps	%%xmmPN, %%xmmA\n\t"
+     "#	movaps	%%xmmA, %%xmmZ\n\t"
+     "#	unpcklps %%xmmB, %%xmmA\n\t"
+     "#	unpckhps %%xmmB, %%xmmZ\n\t"
+     "#	movaps	%%xmmZ, %%xmmY\n\t"
+     "#	shufps	$0x44, %%xmmA, %%xmmZ	# b01000100\n\t"
+     "#	shufps	$0xee, %%xmmY, %%xmmA	# b11101110\n\t"
+     "#	addps	%%xmmZ, %%xmmA\n\t"
+     "#	addps	%%xmmA, %%xmmC\n\t"
+     "# A=xmm0, B=xmm2, Z=xmm4\n\t"
+     "# A'=xmm1, B'=xmm3, Z'=xmm5\n\t"
+     "	movaps	16(%[edx]), %%xmm3\n\t"
+     "	movaps	%%xmm0, %%xmm4\n\t"
+     "  xorps   %%xmm1, %%xmm3\n\t"
+     "	mulps	%%xmm2, %%xmm0\n\t"
+     "	movaps	16(%[eax]), %%xmm1\n\t"
+     "	shufps	$0xb1, %%xmm4, %%xmm4	# swap internals\n\t"
+     "	movaps	%%xmm1, %%xmm5\n\t"
+     "	addps	%%xmm0, %%xmm6\n\t"
+     "	mulps	%%xmm3, %%xmm1\n\t"
+     "	shufps	$0xb1, %%xmm5, %%xmm5	# swap internals\n\t"
+     "	addps	%%xmm1, %%xmm6\n\t"
+     "  movaps  0(%[conjugator]), %%xmm1\n\t"
+     "	mulps	%%xmm4, %%xmm2\n\t"
+     "	movaps	32(%[eax]), %%xmm0\n\t"
+     "	addps	%%xmm2, %%xmm7\n\t"
+     "	mulps	%%xmm5, %%xmm3\n\t"
+     "	addl	$32, %[eax]\n\t"
+     "	movaps	32(%[edx]), %%xmm2\n\t"
+     "	addps	%%xmm3, %%xmm7\n\t"
+     "  xorps   %%xmm1, %%xmm2\n\t"
+     "	addl	$32, %[edx]\n\t"
+     ".%=L1_test:\n\t"
+     "	decl	%[ecx]\n\t"
+     "	jge	.%=Loop1\n\t"
+     "	# We've handled the bulk of multiplies up to here.\n\t"
+     "	# Let's sse if original n_2_ccomplex_blocks was odd.\n\t"
+     "	# If so, we've got 2 more taps to do.\n\t"
+     "	movl	0(%[out]), %[ecx]		# n_2_ccomplex_blocks\n\t"
+     "  shrl    $4, %[ecx]\n\t"
+     "	andl	$1, %[ecx]\n\t"
+     "	je	.%=Leven\n\t"
+     "	# The count was odd, do 2 more taps.\n\t"
+     "	# Note that we've already got mm0/mm2 preloaded\n\t"
+     "	# from the main loop.\n\t"
+     "	movaps	%%xmm0, %%xmm4\n\t"
+     "	mulps	%%xmm2, %%xmm0\n\t"
+     "	shufps	$0xb1, %%xmm4, %%xmm4	# swap internals\n\t"
+     "	addps	%%xmm0, %%xmm6\n\t"
+     "	mulps	%%xmm4, %%xmm2\n\t"
+     "	addps	%%xmm2, %%xmm7\n\t"
+     ".%=Leven:\n\t"
+     "	# neg inversor\n\t"
+     "  #movl 8(%%ebp), %[eax] \n\t"
+     "	xorps	%%xmm1, %%xmm1\n\t"
+     "  movl	$0x80000000, (%[out])\n\t"
+     "	movss	(%[out]), %%xmm1\n\t"
+     "	shufps	$0x11, %%xmm1, %%xmm1	# b00010001 # 0 -0 0 -0\n\t"
+     "	# pfpnacc\n\t"
+     "	xorps	%%xmm1, %%xmm6\n\t"
+     "	movaps	%%xmm6, %%xmm2\n\t"
+     "	unpcklps %%xmm7, %%xmm6\n\t"
+     "	unpckhps %%xmm7, %%xmm2\n\t"
+     "	movaps	%%xmm2, %%xmm3\n\t"
+     "	shufps	$0x44, %%xmm6, %%xmm2	# b01000100\n\t"
+     "	shufps	$0xee, %%xmm3, %%xmm6	# b11101110\n\t"
+     "	addps	%%xmm2, %%xmm6\n\t"
+     "					# xmm6 = r1 i2 r3 i4\n\t"
+     "	#movl	8(%%ebp), %[eax]		# @result\n\t"
+     "	movhlps	%%xmm6, %%xmm4		# xmm4 = r3 i4 ?? ??\n\t"
+     "	addps	%%xmm4, %%xmm6		# xmm6 = r1+r3 i2+i4 ?? ??\n\t"
+     "	movlps	%%xmm6, (%[out])		# store low 2x32 bits (complex) to memory\n\t"
+     "	#popl	%%ebp\n\t"
+     :
+     : [eax] "r" (input), [edx] "r" (taps), [ecx] "r" (num_bytes), [out] "r" (result), [conjugator] "r" (conjugator)
+     );
+
+  
+  
+  
+  printf("%d, %d\n", leftovers, bound);
+  
+  for(; leftovers > 0; leftovers -= 8) {
+    
+    
+    *result += (input[(bound << 1)] * lv_conj(taps[(bound << 1)]));
+    
+  }
+  
+  return;
+  
+  
+  
+
+  
+  
+}
+
+#endif /*LV_HAVE_SSE*/  
+
+
+
+#endif /*INCLUDED_VOLK_32fc_CONJUGATE_DOT_PROD_ALIGNED16_H*/
diff --git a/volk/include/volk/volk_32fc_deinterleave_32f_aligned16.h b/volk/include/volk/volk_32fc_deinterleave_32f_aligned16.h
new file mode 100644
index 000000000..02085cd1e
--- /dev/null
+++ b/volk/include/volk/volk_32fc_deinterleave_32f_aligned16.h
@@ -0,0 +1,75 @@
+#ifndef INCLUDED_VOLK_32fc_DEINTERLEAVE_32F_ALIGNED16_H
+#define INCLUDED_VOLK_32fc_DEINTERLEAVE_32F_ALIGNED16_H
+
+#include <inttypes.h>
+#include <stdio.h>
+
+#if LV_HAVE_SSE
+#include <xmmintrin.h>
+/*!
+  \brief Deinterleaves the complex vector into I & Q vector data
+  \param complexVector The complex input vector
+  \param iBuffer The I buffer output data
+  \param qBuffer The Q buffer output data
+  \param num_points The number of complex data values to be deinterleaved
+*/
+static inline void volk_32fc_deinterleave_32f_aligned16_sse(float* iBuffer, float* qBuffer, const lv_32fc_t* complexVector, unsigned int num_points){
+  const float* complexVectorPtr = (float*)complexVector;
+  float* iBufferPtr = iBuffer;
+  float* qBufferPtr = qBuffer;
+
+  unsigned int number = 0;
+  const unsigned int quarterPoints = num_points / 4;    
+  __m128 cplxValue1, cplxValue2, iValue, qValue;
+  for(;number < quarterPoints; number++){
+      
+    cplxValue1 = _mm_load_ps(complexVectorPtr);
+    complexVectorPtr += 4;
+
+    cplxValue2 = _mm_load_ps(complexVectorPtr);
+    complexVectorPtr += 4;
+
+    // Arrange in i1i2i3i4 format
+    iValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2,0,2,0));
+    // Arrange in q1q2q3q4 format
+    qValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(3,1,3,1));
+
+    _mm_store_ps(iBufferPtr, iValue);
+    _mm_store_ps(qBufferPtr, qValue);
+
+    iBufferPtr += 4;
+    qBufferPtr += 4;
+  }
+
+  number = quarterPoints * 4;
+  for(; number < num_points; number++){
+    *iBufferPtr++ = *complexVectorPtr++;
+    *qBufferPtr++ = *complexVectorPtr++;
+  }
+}
+#endif /* LV_HAVE_SSE */
+
+#if LV_HAVE_GENERIC
+/*!
+  \brief Deinterleaves the complex vector into I & Q vector data
+  \param complexVector The complex input vector
+  \param iBuffer The I buffer output data
+  \param qBuffer The Q buffer output data
+  \param num_points The number of complex data values to be deinterleaved
+*/
+static inline void volk_32fc_deinterleave_32f_aligned16_generic(float* iBuffer, float* qBuffer, const lv_32fc_t* complexVector, unsigned int num_points){
+  const float* complexVectorPtr = (float*)complexVector;
+  float* iBufferPtr = iBuffer;
+  float* qBufferPtr = qBuffer;
+  unsigned int number;
+  for(number = 0; number < num_points; number++){
+    *iBufferPtr++ = *complexVectorPtr++;
+    *qBufferPtr++ = *complexVectorPtr++;
+  }
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+
+
+#endif /* INCLUDED_VOLK_32fc_DEINTERLEAVE_32F_ALIGNED16_H */
diff --git a/volk/include/volk/volk_32fc_deinterleave_64f_aligned16.h b/volk/include/volk/volk_32fc_deinterleave_64f_aligned16.h
new file mode 100644
index 000000000..3d9ebccdd
--- /dev/null
+++ b/volk/include/volk/volk_32fc_deinterleave_64f_aligned16.h
@@ -0,0 +1,78 @@
+#ifndef INCLUDED_VOLK_32fc_DEINTERLEAVE_64F_ALIGNED16_H
+#define INCLUDED_VOLK_32fc_DEINTERLEAVE_64F_ALIGNED16_H
+
+#include <inttypes.h>
+#include <stdio.h>
+
+#if LV_HAVE_SSE2
+#include <emmintrin.h>
+/*!
+  \brief Deinterleaves the lv_32fc_t vector into double I & Q vector data
+  \param complexVector The complex input vector
+  \param iBuffer The I buffer output data
+  \param qBuffer The Q buffer output data
+  \param num_points The number of complex data values to be deinterleaved
+*/
+static inline void volk_32fc_deinterleave_64f_aligned16_sse2(double* iBuffer, double* qBuffer, const lv_32fc_t* complexVector, unsigned int num_points){
+  unsigned int number = 0;
+
+    const float* complexVectorPtr = (float*)complexVector;
+    double* iBufferPtr = iBuffer;
+    double* qBufferPtr = qBuffer;
+
+    const unsigned int halfPoints = num_points / 2;    
+    __m128 cplxValue, fVal;
+    __m128d dVal;
+
+    for(;number < halfPoints; number++){
+      
+      cplxValue = _mm_load_ps(complexVectorPtr);
+      complexVectorPtr += 4;
+
+      // Arrange in i1i2i1i2 format
+      fVal = _mm_shuffle_ps(cplxValue, cplxValue, _MM_SHUFFLE(2,0,2,0));
+      dVal = _mm_cvtps_pd(fVal); 
+      _mm_store_pd(iBufferPtr, dVal);
+
+      // Arrange in q1q2q1q2 format
+      fVal = _mm_shuffle_ps(cplxValue, cplxValue, _MM_SHUFFLE(3,1,3,1));
+      dVal = _mm_cvtps_pd(fVal); 
+      _mm_store_pd(qBufferPtr, dVal);
+
+      iBufferPtr += 2;
+      qBufferPtr += 2;
+    }
+
+    number = halfPoints * 2;
+    for(; number < num_points; number++){
+      *iBufferPtr++ = *complexVectorPtr++;
+      *qBufferPtr++ = *complexVectorPtr++;
+    }
+}
+#endif /* LV_HAVE_SSE */
+
+#if LV_HAVE_GENERIC
+/*!
+  \brief Deinterleaves the lv_32fc_t vector into double I & Q vector data
+  \param complexVector The complex input vector
+  \param iBuffer The I buffer output data
+  \param qBuffer The Q buffer output data
+  \param num_points The number of complex data values to be deinterleaved
+*/
+static inline void volk_32fc_deinterleave_64f_aligned16_generic(double* iBuffer, double* qBuffer, const lv_32fc_t* complexVector, unsigned int num_points){
+  unsigned int number = 0;
+  const float* complexVectorPtr = (float*)complexVector;
+  double* iBufferPtr = iBuffer;
+  double* qBufferPtr = qBuffer;
+
+  for(number = 0; number < num_points; number++){
+    *iBufferPtr++ = (double)*complexVectorPtr++;
+    *qBufferPtr++ = (double)*complexVectorPtr++;
+  }
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+
+
+#endif /* INCLUDED_VOLK_32fc_DEINTERLEAVE_64F_ALIGNED16_H */
diff --git a/volk/include/volk/volk_32fc_deinterleave_real_16s_aligned16.h b/volk/include/volk/volk_32fc_deinterleave_real_16s_aligned16.h
new file mode 100644
index 000000000..3026b2422
--- /dev/null
+++ b/volk/include/volk/volk_32fc_deinterleave_real_16s_aligned16.h
@@ -0,0 +1,80 @@
+#ifndef INCLUDED_VOLK_32fc_DEINTERLEAVE_REAL_16s_ALIGNED16_H
+#define INCLUDED_VOLK_32fc_DEINTERLEAVE_REAL_16s_ALIGNED16_H
+
+#include <inttypes.h>
+#include <stdio.h>
+
+#if LV_HAVE_SSE
+#include <xmmintrin.h>
+/*!
+  \brief Deinterleaves the complex vector, multiply the value by the scalar, convert to 16t, and in I vector data
+  \param complexVector The complex input vector
+  \param scalar The value to be multiply against each of the input values
+  \param iBuffer The I buffer output data
+  \param num_points The number of complex data values to be deinterleaved
+*/
+static inline void volk_32fc_deinterleave_real_16s_aligned16_sse(int16_t* iBuffer, const lv_32fc_t* complexVector, const float scalar, unsigned int num_points){
+  unsigned int number = 0;
+  const unsigned int quarterPoints = num_points / 4;
+
+  const float* complexVectorPtr = (float*)complexVector;
+  int16_t* iBufferPtr = iBuffer;
+
+  __m128 vScalar = _mm_set_ps1(scalar);
+
+  __m128 cplxValue1, cplxValue2, iValue;
+
+  float floatBuffer[4] __attribute__((aligned(128)));
+
+  for(;number < quarterPoints; number++){
+    cplxValue1 = _mm_load_ps(complexVectorPtr);
+    complexVectorPtr += 4;
+
+    cplxValue2 = _mm_load_ps(complexVectorPtr);
+    complexVectorPtr += 4;
+
+    // Arrange in i1i2i3i4 format
+    iValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2,0,2,0));
+
+    iValue = _mm_mul_ps(iValue, vScalar);
+
+    _mm_store_ps(floatBuffer, iValue);
+    *iBufferPtr++ = (int16_t)(floatBuffer[0]);
+    *iBufferPtr++ = (int16_t)(floatBuffer[1]);
+    *iBufferPtr++ = (int16_t)(floatBuffer[2]);
+    *iBufferPtr++ = (int16_t)(floatBuffer[3]);
+  }
+
+  number = quarterPoints * 4;
+  iBufferPtr = &iBuffer[number];
+  for(; number < num_points; number++){
+    *iBufferPtr++ = (int16_t)(*complexVectorPtr++ * scalar);
+    complexVectorPtr++;
+  }
+}
+#endif /* LV_HAVE_SSE */
+
+#if LV_HAVE_GENERIC
+/*!
+  \brief Deinterleaves the complex vector, multiply the value by the scalar, convert to 16t, and in I vector data
+  \param complexVector The complex input vector
+  \param scalar The value to be multiply against each of the input values
+  \param iBuffer The I buffer output data
+  \param num_points The number of complex data values to be deinterleaved
+*/
+static inline void volk_32fc_deinterleave_real_16s_aligned16_generic(int16_t* iBuffer, const lv_32fc_t* complexVector, const float scalar, unsigned int num_points){
+  const float* complexVectorPtr = (float*)complexVector;
+  int16_t* iBufferPtr = iBuffer;
+  unsigned int number = 0;
+  for(number = 0; number < num_points; number++){
+    *iBufferPtr++ = (int16_t)(*complexVectorPtr++ * scalar);
+    complexVectorPtr++;
+  }
+
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+
+
+#endif /* INCLUDED_VOLK_32fc_DEINTERLEAVE_REAL_16s_ALIGNED16_H */
diff --git a/volk/include/volk/volk_32fc_deinterleave_real_32f_aligned16.h b/volk/include/volk/volk_32fc_deinterleave_real_32f_aligned16.h
new file mode 100644
index 000000000..2af973bcc
--- /dev/null
+++ b/volk/include/volk/volk_32fc_deinterleave_real_32f_aligned16.h
@@ -0,0 +1,68 @@
+#ifndef INCLUDED_VOLK_32fc_DEINTERLEAVE_REAL_32F_ALIGNED16_H
+#define INCLUDED_VOLK_32fc_DEINTERLEAVE_REAL_32F_ALIGNED16_H
+
+#include <inttypes.h>
+#include <stdio.h>
+
+#if LV_HAVE_SSE
+#include <xmmintrin.h>
+/*!
+  \brief Deinterleaves the complex vector into I vector data
+  \param complexVector The complex input vector
+  \param iBuffer The I buffer output data
+  \param num_points The number of complex data values to be deinterleaved
+*/
+static inline void volk_32fc_deinterleave_real_32f_aligned16_sse(float* iBuffer, const lv_32fc_t* complexVector, unsigned int num_points){
+  unsigned int number = 0;
+  const unsigned int quarterPoints = num_points / 4;
+
+  const float* complexVectorPtr = (const float*)complexVector;
+  float* iBufferPtr = iBuffer;
+
+  __m128 cplxValue1, cplxValue2, iValue;
+  for(;number < quarterPoints; number++){
+      
+    cplxValue1 = _mm_load_ps(complexVectorPtr);
+    complexVectorPtr += 4;
+
+    cplxValue2 = _mm_load_ps(complexVectorPtr);
+    complexVectorPtr += 4;
+
+    // Arrange in i1i2i3i4 format
+    iValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2,0,2,0));
+
+    _mm_store_ps(iBufferPtr, iValue);
+
+    iBufferPtr += 4;
+  }
+
+  number = quarterPoints * 4;
+  for(; number < num_points; number++){
+    *iBufferPtr++ = *complexVectorPtr++;
+    complexVectorPtr++;
+  }
+}
+#endif /* LV_HAVE_SSE */
+
+#if LV_HAVE_GENERIC
+/*!
+  \brief Deinterleaves the complex vector into I vector data
+  \param complexVector The complex input vector
+  \param iBuffer The I buffer output data
+  \param num_points The number of complex data values to be deinterleaved
+*/
+static inline void volk_32fc_deinterleave_real_32f_aligned16_generic(float* iBuffer, const lv_32fc_t* complexVector, unsigned int num_points){
+  unsigned int number = 0;
+  const float* complexVectorPtr = (float*)complexVector;
+  float* iBufferPtr = iBuffer;
+  for(number = 0; number < num_points; number++){
+    *iBufferPtr++ = *complexVectorPtr++;
+    complexVectorPtr++;
+  }
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+
+
+#endif /* INCLUDED_VOLK_32fc_DEINTERLEAVE_REAL_32F_ALIGNED16_H */
diff --git a/volk/include/volk/volk_32fc_deinterleave_real_64f_aligned16.h b/volk/include/volk/volk_32fc_deinterleave_real_64f_aligned16.h
new file mode 100644
index 000000000..f408589c4
--- /dev/null
+++ b/volk/include/volk/volk_32fc_deinterleave_real_64f_aligned16.h
@@ -0,0 +1,66 @@
+#ifndef INCLUDED_VOLK_32fc_DEINTERLEAVE_REAL_64F_ALIGNED16_H
+#define INCLUDED_VOLK_32fc_DEINTERLEAVE_REAL_64F_ALIGNED16_H
+
+#include <inttypes.h>
+#include <stdio.h>
+
+#if LV_HAVE_SSE2
+#include <emmintrin.h>
+/*!
+  \brief Deinterleaves the complex vector into I vector data
+  \param complexVector The complex input vector
+  \param iBuffer The I buffer output data
+  \param num_points The number of complex data values to be deinterleaved
+*/
+static inline void volk_32fc_deinterleave_real_64f_aligned16_sse2(double* iBuffer, const lv_32fc_t* complexVector, unsigned int num_points){
+  unsigned int number = 0;
+
+  const float* complexVectorPtr = (float*)complexVector;
+  double* iBufferPtr = iBuffer;
+
+  const unsigned int halfPoints = num_points / 2;    
+  __m128 cplxValue, fVal;
+  __m128d dVal;
+  for(;number < halfPoints; number++){
+      
+    cplxValue = _mm_load_ps(complexVectorPtr);
+    complexVectorPtr += 4;
+
+    // Arrange in i1i2i1i2 format
+    fVal = _mm_shuffle_ps(cplxValue, cplxValue, _MM_SHUFFLE(2,0,2,0));
+    dVal = _mm_cvtps_pd(fVal); 
+    _mm_store_pd(iBufferPtr, dVal);
+
+    iBufferPtr += 2;
+  }
+
+  number = halfPoints * 2;
+  for(; number < num_points; number++){
+    *iBufferPtr++ = (double)*complexVectorPtr++;
+    complexVectorPtr++;
+  }
+}
+#endif /* LV_HAVE_SSE */
+
+#if LV_HAVE_GENERIC
+/*!
+  \brief Deinterleaves the complex vector into I vector data
+  \param complexVector The complex input vector
+  \param iBuffer The I buffer output data
+  \param num_points The number of complex data values to be deinterleaved
+*/
+static inline void volk_32fc_deinterleave_real_64f_aligned16_generic(double* iBuffer, const lv_32fc_t* complexVector, unsigned int num_points){
+  unsigned int number = 0;
+  const float* complexVectorPtr = (float*)complexVector;
+  double* iBufferPtr = iBuffer;
+  for(number = 0; number < num_points; number++){
+    *iBufferPtr++ = (double)*complexVectorPtr++;
+    complexVectorPtr++;
+  }
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+
+
+#endif /* INCLUDED_VOLK_32fc_DEINTERLEAVE_REAL_64F_ALIGNED16_H */
diff --git a/volk/include/volk/volk_32fc_dot_prod_aligned16.h b/volk/include/volk/volk_32fc_dot_prod_aligned16.h
new file mode 100644
index 000000000..1a834dc25
--- /dev/null
+++ b/volk/include/volk/volk_32fc_dot_prod_aligned16.h
@@ -0,0 +1,468 @@
+#ifndef INCLUDED_VOLK_32fc_DOT_PROD_ALIGNED16_H
+#define INCLUDED_VOLK_32fc_DOT_PROD_ALIGNED16_H
+
+#include <volk/volk_complex.h>
+#include <stdio.h>
+#include <string.h>
+
+
+#if LV_HAVE_GENERIC 
+
+
+static inline void volk_32fc_dot_prod_aligned16_generic(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_bytes) {
+  
+  float * res = (float*) result;
+  float * in = (float*) input;
+  float * tp = (float*) taps;
+  unsigned int n_2_ccomplex_blocks = num_bytes >> 4;
+  unsigned int isodd = (num_bytes >> 3) &1;
+  
+  
+  
+  float sum0[2] = {0,0};
+  float sum1[2] = {0,0};
+  int i = 0;
+
+  
+  for(i = 0; i < n_2_ccomplex_blocks; ++i) {
+    
+
+    sum0[0] += in[0] * tp[0] - in[1] * tp[1];
+    sum0[1] += in[0] * tp[1] + in[1] * tp[0];
+    sum1[0] += in[2] * tp[2] - in[3] * tp[3];
+    sum1[1] += in[2] * tp[3] + in[3] * tp[2];
+    
+    
+    in += 4;
+    tp += 4;
+
+  }
+
+  
+  res[0] = sum0[0] + sum1[0];
+  res[1] = sum0[1] + sum1[1];
+  
+  
+  
+  for(i = 0; i < isodd; ++i) {
+
+
+    *result += input[(num_bytes >> 3) - 1] * taps[(num_bytes >> 3) - 1];
+
+  }
+
+}
+
+#endif /*LV_HAVE_GENERIC*/
+
+
+#if LV_HAVE_SSE && LV_HAVE_64
+
+
+static inline void volk_32fc_dot_prod_aligned16_sse_64(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_bytes) {
+  
+
+  asm 
+    (
+     "#  ccomplex_dotprod_generic (float* result, const float *input,\n\t"
+     "#                         const float *taps, unsigned num_bytes)\n\t"
+     "#    float sum0 = 0;\n\t"
+     "#    float sum1 = 0;\n\t"
+     "#    float sum2 = 0;\n\t"
+     "#    float sum3 = 0;\n\t"
+     "#    do {\n\t"
+     "#      sum0 += input[0] * taps[0] - input[1] * taps[1];\n\t"
+     "#      sum1 += input[0] * taps[1] + input[1] * taps[0];\n\t"
+     "#      sum2 += input[2] * taps[2] - input[3] * taps[3];\n\t"
+     "#      sum3 += input[2] * taps[3] + input[3] * taps[2];\n\t"
+     "#      input += 4;\n\t"
+     "#      taps += 4;  \n\t"
+     "#    } while (--n_2_ccomplex_blocks != 0);\n\t"
+     "#    result[0] = sum0 + sum2;\n\t"
+     "#    result[1] = sum1 + sum3;\n\t"
+     "# TODO: prefetch and better scheduling\n\t"
+     "  xor    %%r9,  %%r9\n\t"
+     "  xor    %%r10, %%r10\n\t"
+     "  movq   %%rcx, %%rax\n\t"
+     "  movq   %%rcx, %%r8\n\t"
+     "  movq   %[rsi],  %%r9\n\t"
+     "  movq   %[rdx], %%r10\n\t"
+     "	xorps	%%xmm6, %%xmm6		# zero accumulators\n\t"
+     "	movaps	0(%%r9), %%xmm0\n\t"
+     "	xorps	%%xmm7, %%xmm7		# zero accumulators\n\t"
+     "	movaps	0(%%r10), %%xmm2\n\t"
+     "	shr	$5, %%rax		# rax = n_2_ccomplex_blocks / 2\n\t"
+     "  shr     $4, %%r8\n\t"
+     "	jmp	.%=L1_test\n\t"
+     "	# 4 taps / loop\n\t"
+     "	# something like ?? cycles / loop\n\t"
+     ".%=Loop1:	\n\t"
+     "# complex prod: C += A * B,  w/ temp Z & Y (or B), xmmPN=$0x8000000080000000\n\t"
+     "#	movaps	(%%r9), %%xmmA\n\t"
+     "#	movaps	(%%r10), %%xmmB\n\t"
+     "#	movaps	%%xmmA, %%xmmZ\n\t"
+     "#	shufps	$0xb1, %%xmmZ, %%xmmZ	# swap internals\n\t"
+     "#	mulps	%%xmmB, %%xmmA\n\t"
+     "#	mulps	%%xmmZ, %%xmmB\n\t"
+     "#	# SSE replacement for: pfpnacc %%xmmB, %%xmmA\n\t"
+     "#	xorps	%%xmmPN, %%xmmA\n\t"
+     "#	movaps	%%xmmA, %%xmmZ\n\t"
+     "#	unpcklps %%xmmB, %%xmmA\n\t"
+     "#	unpckhps %%xmmB, %%xmmZ\n\t"
+     "#	movaps	%%xmmZ, %%xmmY\n\t"
+     "#	shufps	$0x44, %%xmmA, %%xmmZ	# b01000100\n\t"
+     "#	shufps	$0xee, %%xmmY, %%xmmA	# b11101110\n\t"
+     "#	addps	%%xmmZ, %%xmmA\n\t"
+     "#	addps	%%xmmA, %%xmmC\n\t"
+     "# A=xmm0, B=xmm2, Z=xmm4\n\t"
+     "# A'=xmm1, B'=xmm3, Z'=xmm5\n\t"
+     "	movaps	16(%%r9), %%xmm1\n\t"
+     "	movaps	%%xmm0, %%xmm4\n\t"
+     "	mulps	%%xmm2, %%xmm0\n\t"
+     "	shufps	$0xb1, %%xmm4, %%xmm4	# swap internals\n\t"
+     "	movaps	16(%%r10), %%xmm3\n\t"
+     "	movaps	%%xmm1, %%xmm5\n\t"
+     "	addps	%%xmm0, %%xmm6\n\t"
+     "	mulps	%%xmm3, %%xmm1\n\t"
+     "	shufps	$0xb1, %%xmm5, %%xmm5	# swap internals\n\t"
+     "	addps	%%xmm1, %%xmm6\n\t"
+     "	mulps	%%xmm4, %%xmm2\n\t"
+     "	movaps	32(%%r9), %%xmm0\n\t"
+     "	addps	%%xmm2, %%xmm7\n\t"
+     "	mulps	%%xmm5, %%xmm3\n\t"
+     "	add	$32, %%r9\n\t"
+     "	movaps	32(%%r10), %%xmm2\n\t"
+     "	addps	%%xmm3, %%xmm7\n\t"
+     "	add	$32, %%r10\n\t"
+     ".%=L1_test:\n\t"
+     "	dec	%%rax\n\t"
+     "	jge	.%=Loop1\n\t"
+     "	# We've handled the bulk of multiplies up to here.\n\t"
+     "	# Let's sse if original n_2_ccomplex_blocks was odd.\n\t"
+     "	# If so, we've got 2 more taps to do.\n\t"
+     "	and	$1, %%r8\n\t"
+     "	je	.%=Leven\n\t"
+     "	# The count was odd, do 2 more taps.\n\t"
+     "	# Note that we've already got mm0/mm2 preloaded\n\t"
+     "	# from the main loop.\n\t"
+     "	movaps	%%xmm0, %%xmm4\n\t"
+     "	mulps	%%xmm2, %%xmm0\n\t"
+     "	shufps	$0xb1, %%xmm4, %%xmm4	# swap internals\n\t"
+     "	addps	%%xmm0, %%xmm6\n\t"
+     "	mulps	%%xmm4, %%xmm2\n\t"
+     "	addps	%%xmm2, %%xmm7\n\t"
+     ".%=Leven:\n\t"
+     "	# neg inversor\n\t"
+     "	xorps	%%xmm1, %%xmm1\n\t"
+     "	mov	$0x80000000, %%r9\n\t"
+     "	movd	%%r9, %%xmm1\n\t"
+     "	shufps	$0x11, %%xmm1, %%xmm1	# b00010001 # 0 -0 0 -0\n\t"
+     "	# pfpnacc\n\t"
+     "	xorps	%%xmm1, %%xmm6\n\t"
+     "	movaps	%%xmm6, %%xmm2\n\t"
+     "	unpcklps %%xmm7, %%xmm6\n\t"
+     "	unpckhps %%xmm7, %%xmm2\n\t"
+     "	movaps	%%xmm2, %%xmm3\n\t"
+     "	shufps	$0x44, %%xmm6, %%xmm2	# b01000100\n\t"
+     "	shufps	$0xee, %%xmm3, %%xmm6	# b11101110\n\t"
+     "	addps	%%xmm2, %%xmm6\n\t"
+     "					# xmm6 = r1 i2 r3 i4\n\t"
+     "	movhlps	%%xmm6, %%xmm4		# xmm4 = r3 i4 ?? ??\n\t"
+     "	addps	%%xmm4, %%xmm6		# xmm6 = r1+r3 i2+i4 ?? ??\n\t"
+     "	movlps	%%xmm6, (%[rdi])		# store low 2x32 bits (complex) to memory\n\t"
+     :
+     :[rsi] "r" (input), [rdx] "r" (taps), "c" (num_bytes), [rdi] "r" (result)
+     :"rax", "r8", "r9", "r10"
+     );
+  
+  
+  int getem = num_bytes % 16;
+  
+  
+  for(; getem > 0; getem -= 8) {
+  
+    
+    *result += (input[(num_bytes >> 3) - 1] * taps[(num_bytes >> 3) - 1]);
+  
+  }
+
+  return;
+  
+}
+
+#endif
+
+#if LV_HAVE_SSE && LV_HAVE_32
+
+static inline void volk_32fc_dot_prod_aligned16_sse_32(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_bytes) {
+  
+  asm volatile 
+    (
+     "	#pushl	%%ebp\n\t"
+     "	#movl	%%esp, %%ebp\n\t"
+     "	movl	12(%%ebp), %%eax		# input\n\t"
+     "	movl	16(%%ebp), %%edx		# taps\n\t"
+     "	movl	20(%%ebp), %%ecx                # n_bytes\n\t"
+     "	xorps	%%xmm6, %%xmm6		# zero accumulators\n\t"
+     "	movaps	0(%%eax), %%xmm0\n\t"
+     "	xorps	%%xmm7, %%xmm7		# zero accumulators\n\t"
+     "	movaps	0(%%edx), %%xmm2\n\t"
+     "	shrl	$5, %%ecx		# ecx = n_2_ccomplex_blocks / 2\n\t"
+     "	jmp	.%=L1_test\n\t"
+     "	# 4 taps / loop\n\t"
+     "	# something like ?? cycles / loop\n\t"
+     ".%=Loop1:	\n\t"
+     "# complex prod: C += A * B,  w/ temp Z & Y (or B), xmmPN=$0x8000000080000000\n\t"
+     "#	movaps	(%%eax), %%xmmA\n\t"
+     "#	movaps	(%%edx), %%xmmB\n\t"
+     "#	movaps	%%xmmA, %%xmmZ\n\t"
+     "#	shufps	$0xb1, %%xmmZ, %%xmmZ	# swap internals\n\t"
+     "#	mulps	%%xmmB, %%xmmA\n\t"
+     "#	mulps	%%xmmZ, %%xmmB\n\t"
+     "#	# SSE replacement for: pfpnacc %%xmmB, %%xmmA\n\t"
+     "#	xorps	%%xmmPN, %%xmmA\n\t"
+     "#	movaps	%%xmmA, %%xmmZ\n\t"
+     "#	unpcklps %%xmmB, %%xmmA\n\t"
+     "#	unpckhps %%xmmB, %%xmmZ\n\t"
+     "#	movaps	%%xmmZ, %%xmmY\n\t"
+     "#	shufps	$0x44, %%xmmA, %%xmmZ	# b01000100\n\t"
+     "#	shufps	$0xee, %%xmmY, %%xmmA	# b11101110\n\t"
+     "#	addps	%%xmmZ, %%xmmA\n\t"
+     "#	addps	%%xmmA, %%xmmC\n\t"
+     "# A=xmm0, B=xmm2, Z=xmm4\n\t"
+     "# A'=xmm1, B'=xmm3, Z'=xmm5\n\t"
+     "	movaps	16(%%eax), %%xmm1\n\t"
+     "	movaps	%%xmm0, %%xmm4\n\t"
+     "	mulps	%%xmm2, %%xmm0\n\t"
+     "	shufps	$0xb1, %%xmm4, %%xmm4	# swap internals\n\t"
+     "	movaps	16(%%edx), %%xmm3\n\t"
+     "	movaps	%%xmm1, %%xmm5\n\t"
+     "	addps	%%xmm0, %%xmm6\n\t"
+     "	mulps	%%xmm3, %%xmm1\n\t"
+     "	shufps	$0xb1, %%xmm5, %%xmm5	# swap internals\n\t"
+     "	addps	%%xmm1, %%xmm6\n\t"
+     "	mulps	%%xmm4, %%xmm2\n\t"
+     "	movaps	32(%%eax), %%xmm0\n\t"
+     "	addps	%%xmm2, %%xmm7\n\t"
+     "	mulps	%%xmm5, %%xmm3\n\t"
+     "	addl	$32, %%eax\n\t"
+     "	movaps	32(%%edx), %%xmm2\n\t"
+     "	addps	%%xmm3, %%xmm7\n\t"
+     "	addl	$32, %%edx\n\t"
+     ".%=L1_test:\n\t"
+     "	decl	%%ecx\n\t"
+     "	jge	.%=Loop1\n\t"
+     "	# We've handled the bulk of multiplies up to here.\n\t"
+     "	# Let's sse if original n_2_ccomplex_blocks was odd.\n\t"
+     "	# If so, we've got 2 more taps to do.\n\t"
+     "	movl	20(%%ebp), %%ecx		# n_2_ccomplex_blocks\n\t"
+     "  shrl    $4, %%ecx\n\t"
+     "	andl	$1, %%ecx\n\t"
+     "	je	.%=Leven\n\t"
+     "	# The count was odd, do 2 more taps.\n\t"
+     "	# Note that we've already got mm0/mm2 preloaded\n\t"
+     "	# from the main loop.\n\t"
+     "	movaps	%%xmm0, %%xmm4\n\t"
+     "	mulps	%%xmm2, %%xmm0\n\t"
+     "	shufps	$0xb1, %%xmm4, %%xmm4	# swap internals\n\t"
+     "	addps	%%xmm0, %%xmm6\n\t"
+     "	mulps	%%xmm4, %%xmm2\n\t"
+     "	addps	%%xmm2, %%xmm7\n\t"
+     ".%=Leven:\n\t"
+     "	# neg inversor\n\t"
+     "  movl 8(%%ebp), %%eax \n\t"
+     "	xorps	%%xmm1, %%xmm1\n\t"
+     "  movl	$0x80000000, (%%eax)\n\t"
+     "	movss	(%%eax), %%xmm1\n\t"
+     "	shufps	$0x11, %%xmm1, %%xmm1	# b00010001 # 0 -0 0 -0\n\t"
+     "	# pfpnacc\n\t"
+     "	xorps	%%xmm1, %%xmm6\n\t"
+     "	movaps	%%xmm6, %%xmm2\n\t"
+     "	unpcklps %%xmm7, %%xmm6\n\t"
+     "	unpckhps %%xmm7, %%xmm2\n\t"
+     "	movaps	%%xmm2, %%xmm3\n\t"
+     "	shufps	$0x44, %%xmm6, %%xmm2	# b01000100\n\t"
+     "	shufps	$0xee, %%xmm3, %%xmm6	# b11101110\n\t"
+     "	addps	%%xmm2, %%xmm6\n\t"
+     "					# xmm6 = r1 i2 r3 i4\n\t"
+     "	#movl	8(%%ebp), %%eax		# @result\n\t"
+     "	movhlps	%%xmm6, %%xmm4		# xmm4 = r3 i4 ?? ??\n\t"
+     "	addps	%%xmm4, %%xmm6		# xmm6 = r1+r3 i2+i4 ?? ??\n\t"
+     "	movlps	%%xmm6, (%%eax)		# store low 2x32 bits (complex) to memory\n\t"
+     "	#popl	%%ebp\n\t"
+     :
+     :
+     : "eax", "ecx", "edx"
+     );
+
+  
+  int getem = num_bytes % 16;
+  
+  for(; getem > 0; getem -= 8) {
+    
+    
+    *result += (input[(num_bytes >> 3) - 1] * taps[(num_bytes >> 3) - 1]);
+    
+  }
+  
+  return;
+  
+  
+  
+
+  
+  
+}
+
+#endif /*LV_HAVE_SSE*/  
+
+#if LV_HAVE_SSE3
+
+#include <pmmintrin.h>
+
+static inline void volk_32fc_dot_prod_aligned16_sse3(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_bytes) {
+  
+
+  lv_32fc_t dotProduct;
+  memset(&dotProduct, 0x0, 2*sizeof(float));
+
+  unsigned int number = 0;
+  const unsigned int halfPoints = num_bytes >> 4;
+
+  __m128 x, y, yl, yh, z, tmp1, tmp2, dotProdVal;
+
+  const lv_32fc_t* a = input;
+  const lv_32fc_t* b = taps;
+
+  dotProdVal = _mm_setzero_ps();
+
+  for(;number < halfPoints; number++){
+      
+    x = _mm_load_ps((float*)a); // Load the ar + ai, br + bi as ar,ai,br,bi
+    y = _mm_load_ps((float*)b); // Load the cr + ci, dr + di as cr,ci,dr,di
+      
+    yl = _mm_moveldup_ps(y); // Load yl with cr,cr,dr,dr
+    yh = _mm_movehdup_ps(y); // Load yh with ci,ci,di,di
+      
+    tmp1 = _mm_mul_ps(x,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
+      
+    x = _mm_shuffle_ps(x,x,0xB1); // Re-arrange x to be ai,ar,bi,br
+      
+    tmp2 = _mm_mul_ps(x,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
+      
+    z = _mm_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
+
+    dotProdVal = _mm_add_ps(dotProdVal, z); // Add the complex multiplication results together
+
+    a += 2;
+    b += 2;
+  }
+
+  lv_32fc_t dotProductVector[2] __attribute__((aligned(16)));
+
+  _mm_store_ps((float*)dotProductVector,dotProdVal); // Store the results back into the dot product vector
+
+  dotProduct += ( dotProductVector[0] + dotProductVector[1] );
+
+  if((num_bytes >> 2) != 0) {
+    dotProduct += (*a) * (*b);
+  }
+
+  *result = dotProduct;
+}  
+
+#endif /*LV_HAVE_SSE3*/
+
+#if LV_HAVE_SSE4_1
+
+#include <smmintrin.h>
+
+static inline void volk_32fc_dot_prod_aligned16_sse4_1(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_bytes) {
+  volk_32fc_dot_prod_aligned16_sse3(result, input, taps, num_bytes);
+  // SSE3 version runs twice as fast as the SSE4.1 version, so turning off SSE4 version for now
+   /* 
+    __m128 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, real0, real1, im0, im1;
+    float *p_input, *p_taps;
+    __m64 *p_result;
+
+    p_result = (__m64*)result;
+    p_input = (float*)input;
+    p_taps = (float*)taps;
+
+    static const __m128i neg = {0x000000000000000080000000};
+
+    int i = 0;
+  
+    int bound = (num_bytes >> 5);
+    int leftovers = (num_bytes & 24) >> 3;
+
+    real0 = _mm_sub_ps(real0, real0);
+    real1 = _mm_sub_ps(real1, real1);
+    im0 = _mm_sub_ps(im0, im0);
+    im1 = _mm_sub_ps(im1, im1);
+  
+    for(; i < bound; ++i) {
+  
+    
+    xmm0 = _mm_load_ps(p_input);
+    xmm1 = _mm_load_ps(p_taps);
+    
+    p_input += 4;
+    p_taps += 4;
+    
+    xmm2 = _mm_load_ps(p_input);
+    xmm3 = _mm_load_ps(p_taps);
+    
+    p_input += 4;
+    p_taps += 4;
+    
+    xmm4 = _mm_unpackhi_ps(xmm0, xmm2);
+    xmm5 = _mm_unpackhi_ps(xmm1, xmm3);
+    xmm0 = _mm_unpacklo_ps(xmm0, xmm2);
+    xmm2 = _mm_unpacklo_ps(xmm1, xmm3);
+    
+    //imaginary vector from input
+    xmm1 = _mm_unpackhi_ps(xmm0, xmm4);
+    //real vector from input
+    xmm3 = _mm_unpacklo_ps(xmm0, xmm4);
+    //imaginary vector from taps
+    xmm0 = _mm_unpackhi_ps(xmm2, xmm5);
+    //real vector from taps
+    xmm2 = _mm_unpacklo_ps(xmm2, xmm5);
+    
+    xmm4 = _mm_dp_ps(xmm3, xmm2, 0xf1);
+    xmm5 = _mm_dp_ps(xmm1, xmm0, 0xf1);
+    
+    xmm6 = _mm_dp_ps(xmm3, xmm0, 0xf2);
+    xmm7 = _mm_dp_ps(xmm1, xmm2, 0xf2);
+    
+    real0 = _mm_add_ps(xmm4, real0);
+    real1 = _mm_add_ps(xmm5, real1);
+    im0 = _mm_add_ps(xmm6, im0);
+    im1 = _mm_add_ps(xmm7, im1);
+    
+    }
+
+
+    
+    
+    real1 = _mm_xor_ps(real1, (__m128)neg);
+    
+  
+    im0 = _mm_add_ps(im0, im1);
+    real0 = _mm_add_ps(real0, real1);
+  
+    im0 = _mm_add_ps(im0, real0);
+  
+    _mm_storel_pi(p_result, im0);
+  
+    for(i = bound * 4; i < (bound * 4) + leftovers; ++i) {
+    
+    *result += input[i] * taps[i];
+    }
+  */
+}  
+
+#endif /*LV_HAVE_SSE4_1*/
+
+#endif /*INCLUDED_VOLK_32fc_DOT_PROD_ALIGNED16_H*/
diff --git a/volk/include/volk/volk_32fc_index_max_aligned16.h b/volk/include/volk/volk_32fc_index_max_aligned16.h
new file mode 100644
index 000000000..d77a95f90
--- /dev/null
+++ b/volk/include/volk/volk_32fc_index_max_aligned16.h
@@ -0,0 +1,215 @@
+#ifndef INCLUDED_VOLK_32FC_INDEX_MAX_ALIGNED16_H
+#define INCLUDED_VOLK_32FC_INDEX_MAX_ALIGNED16_H
+
+#include <volk/volk_common.h>
+#include<inttypes.h>
+#include<stdio.h>
+#include<volk/volk_complex.h>
+
+#if LV_HAVE_SSE3
+#include<xmmintrin.h>
+#include<pmmintrin.h>
+
+
+static inline void volk_32fc_index_max_aligned16_sse3(unsigned int* target, lv_32fc_t* src0, unsigned int num_bytes) {
+  
+  
+  
+  union bit128 holderf;
+  union bit128 holderi;
+  float sq_dist = 0.0;
+
+
+
+  
+  union bit128 xmm5, xmm4;
+  __m128 xmm1, xmm2, xmm3;
+  __m128i xmm8, xmm11, xmm12, xmmfive, xmmfour, xmm9, holder0, holder1, xmm10;
+
+  xmm5.int_vec = xmmfive = _mm_setzero_si128();
+  xmm4.int_vec = xmmfour = _mm_setzero_si128();
+  holderf.int_vec = holder0 = _mm_setzero_si128();
+  holderi.int_vec = holder1 = _mm_setzero_si128();
+ 
+  
+  int bound = num_bytes >> 5;
+  int leftovers0 = (num_bytes >> 4) & 1;
+  int leftovers1 = (num_bytes >> 3) & 1;
+  int i = 0;
+  
+  
+  xmm8 = _mm_set_epi32(3, 2, 1, 0);//remember the crazy reverse order!
+  xmm9 = xmm8 = _mm_setzero_si128();
+  xmm10 = _mm_set_epi32(4, 4, 4, 4);
+  xmm3 = _mm_setzero_ps();
+;
+  
+  //printf("%f, %f, %f, %f\n", ((float*)&xmm10)[0], ((float*)&xmm10)[1], ((float*)&xmm10)[2], ((float*)&xmm10)[3]);
+  
+  for(; i < bound; ++i) {
+  
+    xmm1 = _mm_load_ps((float*)src0);
+    xmm2 = _mm_load_ps((float*)&src0[2]);
+    
+
+    src0 += 4;
+  
+  
+    xmm1 = _mm_mul_ps(xmm1, xmm1);
+    xmm2 = _mm_mul_ps(xmm2, xmm2);
+    
+    
+    xmm1 = _mm_hadd_ps(xmm1, xmm2);
+
+    xmm3 = _mm_max_ps(xmm1, xmm3);
+  
+    xmm4.float_vec = _mm_cmplt_ps(xmm1, xmm3);
+    xmm5.float_vec = _mm_cmpeq_ps(xmm1, xmm3);
+    
+        
+    
+    xmm11 = _mm_and_si128(xmm8, xmm5.int_vec);
+    xmm12 = _mm_and_si128(xmm9, xmm4.int_vec);
+    
+    xmm9 = _mm_add_epi32(xmm11,  xmm12);
+
+    xmm8 = _mm_add_epi32(xmm8, xmm10);    
+
+    
+    //printf("%f, %f, %f, %f\n", ((float*)&xmm3)[0], ((float*)&xmm3)[1], ((float*)&xmm3)[2], ((float*)&xmm3)[3]);
+    //printf("%u, %u, %u, %u\n", ((uint32_t*)&xmm10)[0], ((uint32_t*)&xmm10)[1], ((uint32_t*)&xmm10)[2], ((uint32_t*)&xmm10)[3]);
+
+  }
+  
+  
+  for(i = 0; i < leftovers0; ++i) {
+
+
+    xmm2 = _mm_load_ps((float*)src0);
+    
+    xmm1 = _mm_movelh_ps((__m128)xmm8, (__m128)xmm8);
+    xmm8 = (__m128i)xmm1;
+
+    xmm2 = _mm_mul_ps(xmm2, xmm2);
+
+    src0 += 2;
+
+    xmm1 = _mm_hadd_ps(xmm2, xmm2);
+
+    xmm3 = _mm_max_ps(xmm1, xmm3);
+
+    xmm10 = _mm_set_epi32(2, 2, 2, 2);//load1_ps((float*)&init[2]);
+    
+    
+    xmm4.float_vec = _mm_cmplt_ps(xmm1, xmm3);
+    xmm5.float_vec = _mm_cmpeq_ps(xmm1, xmm3);
+    
+        
+    
+    xmm11 = _mm_and_si128(xmm8, xmm5.int_vec);
+    xmm12 = _mm_and_si128(xmm9, xmm4.int_vec);
+    
+    xmm9 = _mm_add_epi32(xmm11, xmm12);
+
+    xmm8 = _mm_add_epi32(xmm8, xmm10);    
+    //printf("egads%u, %u, %u, %u\n", ((uint32_t*)&xmm9)[0], ((uint32_t*)&xmm9)[1], ((uint32_t*)&xmm9)[2], ((uint32_t*)&xmm9)[3]);
+
+  }
+    
+  
+    
+
+  for(i = 0; i < leftovers1; ++i) {
+    //printf("%u, %u, %u, %u\n", ((uint32_t*)&xmm9)[0], ((uint32_t*)&xmm9)[1], ((uint32_t*)&xmm9)[2], ((uint32_t*)&xmm9)[3]);
+    
+
+    sq_dist = lv_creal(src0[0]) * lv_creal(src0[0]) + lv_cimag(src0[0]) * lv_cimag(src0[0]);
+    
+    xmm2 = _mm_load1_ps(&sq_dist);
+
+    xmm1 = xmm3;
+    
+    xmm3 = _mm_max_ss(xmm3, xmm2);
+
+    
+        
+    xmm4.float_vec = _mm_cmplt_ps(xmm1, xmm3);
+    xmm5.float_vec = _mm_cmpeq_ps(xmm1, xmm3);
+    
+        
+    xmm8 = _mm_shuffle_epi32(xmm8, 0x00);  
+    
+    xmm11 = _mm_and_si128(xmm8, xmm4.int_vec);
+    xmm12 = _mm_and_si128(xmm9, xmm5.int_vec);
+    
+
+    xmm9 = _mm_add_epi32(xmm11, xmm12);
+
+  }
+  
+  //printf("%f, %f, %f, %f\n", ((float*)&xmm3)[0], ((float*)&xmm3)[1], ((float*)&xmm3)[2], ((float*)&xmm3)[3]);
+
+  //printf("%u, %u, %u, %u\n", ((uint32_t*)&xmm9)[0], ((uint32_t*)&xmm9)[1], ((uint32_t*)&xmm9)[2], ((uint32_t*)&xmm9)[3]);
+
+  _mm_store_ps((float*)&(holderf.f), xmm3);
+  _mm_store_si128(&(holderi.int_vec), xmm9);
+  
+  target[0] = holderi.i[0];
+  sq_dist = holderf.f[0]; 
+  target[0] = (holderf.f[1] > sq_dist) ? holderi.i[1] : target[0];
+  sq_dist = (holderf.f[1] > sq_dist) ? holderf.f[1] : sq_dist;
+  target[0] = (holderf.f[2] > sq_dist) ? holderi.i[2] : target[0];
+  sq_dist = (holderf.f[2] > sq_dist) ? holderf.f[2] : sq_dist;
+  target[0] = (holderf.f[3] > sq_dist) ? holderi.i[3] : target[0];
+  sq_dist = (holderf.f[3] > sq_dist) ? holderf.f[3] : sq_dist;
+
+ 
+  
+  /*
+  float placeholder = 0.0;
+  uint32_t temp0, temp1; 
+  unsigned int g0 = (((float*)&xmm3)[0] > ((float*)&xmm3)[1]);
+  unsigned int l0 = g0 ^ 1;
+
+  unsigned int g1 = (((float*)&xmm3)[1] > ((float*)&xmm3)[2]);
+  unsigned int l1 = g1 ^ 1;
+  
+  temp0 = g0 * ((uint32_t*)&xmm9)[0] + l0 * ((uint32_t*)&xmm9)[1];
+  temp1 = g0 * ((uint32_t*)&xmm9)[2] + l0 * ((uint32_t*)&xmm9)[3];
+  sq_dist = g0 * ((float*)&xmm3)[0] + l0 * ((float*)&xmm3)[1]; 
+  placeholder = g0 * ((float*)&xmm3)[2] + l0 * ((float*)&xmm3)[3];
+  
+  g0 = (sq_dist > placeholder);
+  l0 = g0 ^ 1;
+  target[0] = g0 * temp0 + l0 * temp1;
+  */
+  
+}
+
+#endif /*LV_HAVE_SSE3*/
+
+#if LV_HAVE_GENERIC
+static inline void volk_32fc_index_max_aligned16_generic(unsigned int* target, lv_32fc_t* src0, unsigned int num_bytes) {
+  float sq_dist = 0.0;
+  float max = 0.0;
+  unsigned int index = 0;
+  
+  int i = 0; 
+  
+  for(; i < num_bytes >> 3; ++i) {
+
+    sq_dist = lv_creal(src0[i]) * lv_creal(src0[i]) + lv_cimag(src0[i]) * lv_cimag(src0[i]);
+    
+    index = sq_dist > max ? i : index;
+    max = sq_dist > max ? sq_dist : max;
+    
+    
+  }
+  target[0] = index;
+  
+}
+
+#endif /*LV_HAVE_GENERIC*/
+
+
+#endif /*INCLUDED_VOLK_32FC_INDEX_MAX_ALIGNED16_H*/
diff --git a/volk/include/volk/volk_32fc_magnitude_16s_aligned16.h b/volk/include/volk/volk_32fc_magnitude_16s_aligned16.h
new file mode 100644
index 000000000..4e64d8c22
--- /dev/null
+++ b/volk/include/volk/volk_32fc_magnitude_16s_aligned16.h
@@ -0,0 +1,146 @@
+#ifndef INCLUDED_VOLK_32fc_MAGNITUDE_16s_ALIGNED16_H
+#define INCLUDED_VOLK_32fc_MAGNITUDE_16s_ALIGNED16_H
+
+#include <inttypes.h>
+#include <stdio.h>
+#include <math.h>
+
+#if LV_HAVE_SSE3
+#include <pmmintrin.h>
+/*!
+  \brief Calculates the magnitude of the complexVector, scales the resulting value and stores the results in the magnitudeVector
+  \param complexVector The vector containing the complex input values
+  \param scalar The scale value multiplied to the magnitude of each complex vector
+  \param magnitudeVector The vector containing the real output values
+  \param num_points The number of complex values in complexVector to be calculated and stored into cVector
+*/
+static inline void volk_32fc_magnitude_16s_aligned16_sse3(int16_t* magnitudeVector, const lv_32fc_t* complexVector, const float scalar, unsigned int num_points){
+  unsigned int number = 0;
+  const unsigned int quarterPoints = num_points / 4;
+
+  const float* complexVectorPtr = (const float*)complexVector;
+  int16_t* magnitudeVectorPtr = magnitudeVector;
+
+  __m128 vScalar = _mm_set_ps1(scalar);
+
+  __m128 cplxValue1, cplxValue2, result;
+
+  float floatBuffer[4] __attribute__((aligned(128)));
+
+  for(;number < quarterPoints; number++){
+    cplxValue1 = _mm_load_ps(complexVectorPtr);
+    complexVectorPtr += 4;
+
+    cplxValue2 = _mm_load_ps(complexVectorPtr);
+    complexVectorPtr += 4;
+
+    cplxValue1 = _mm_mul_ps(cplxValue1, cplxValue1); // Square the values
+    cplxValue2 = _mm_mul_ps(cplxValue2, cplxValue2); // Square the Values
+
+    result = _mm_hadd_ps(cplxValue1, cplxValue2); // Add the I2 and Q2 values
+
+    result = _mm_sqrt_ps(result);
+
+    result = _mm_mul_ps(result, vScalar);
+
+    _mm_store_ps(floatBuffer, result);
+    *magnitudeVectorPtr++ = (int16_t)(floatBuffer[0]);
+    *magnitudeVectorPtr++ = (int16_t)(floatBuffer[1]);
+    *magnitudeVectorPtr++ = (int16_t)(floatBuffer[2]);
+    *magnitudeVectorPtr++ = (int16_t)(floatBuffer[3]);
+  }
+
+  number = quarterPoints * 4;
+  magnitudeVectorPtr = &magnitudeVector[number];
+  for(; number < num_points; number++){
+    float val1Real = *complexVectorPtr++;
+    float val1Imag = *complexVectorPtr++;
+    *magnitudeVectorPtr++ = (int16_t)(sqrtf((val1Real * val1Real) + (val1Imag * val1Imag)) * scalar);
+  }
+}
+#endif /* LV_HAVE_SSE3 */
+
+#if LV_HAVE_SSE
+#include <xmmintrin.h>
+/*!
+  \brief Calculates the magnitude of the complexVector, scales the resulting value and stores the results in the magnitudeVector
+  \param complexVector The vector containing the complex input values
+  \param scalar The scale value multiplied to the magnitude of each complex vector
+  \param magnitudeVector The vector containing the real output values
+  \param num_points The number of complex values in complexVector to be calculated and stored into cVector
+*/
+static inline void volk_32fc_magnitude_16s_aligned16_sse(int16_t* magnitudeVector, const lv_32fc_t* complexVector, const float scalar, unsigned int num_points){
+  unsigned int number = 0;
+  const unsigned int quarterPoints = num_points / 4;
+
+  const float* complexVectorPtr = (const float*)complexVector;
+  int16_t* magnitudeVectorPtr = magnitudeVector;
+
+  __m128 vScalar = _mm_set_ps1(scalar);
+
+  __m128 cplxValue1, cplxValue2, iValue, qValue, result;
+
+  float floatBuffer[4] __attribute__((aligned(128)));
+
+  for(;number < quarterPoints; number++){
+    cplxValue1 = _mm_load_ps(complexVectorPtr);
+    complexVectorPtr += 4;
+
+    cplxValue2 = _mm_load_ps(complexVectorPtr);
+    complexVectorPtr += 4;
+
+    // Arrange in i1i2i3i4 format
+    iValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2,0,2,0));
+    // Arrange in q1q2q3q4 format
+    qValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(3,1,3,1));
+
+    iValue = _mm_mul_ps(iValue, iValue); // Square the I values
+    qValue = _mm_mul_ps(qValue, qValue); // Square the Q Values
+
+    result = _mm_add_ps(iValue, qValue); // Add the I2 and Q2 values
+
+    result = _mm_sqrt_ps(result);
+
+    result = _mm_mul_ps(result, vScalar);
+
+    _mm_store_ps(floatBuffer, result);
+    *magnitudeVectorPtr++ = (int16_t)(floatBuffer[0]);
+    *magnitudeVectorPtr++ = (int16_t)(floatBuffer[1]);
+    *magnitudeVectorPtr++ = (int16_t)(floatBuffer[2]);
+    *magnitudeVectorPtr++ = (int16_t)(floatBuffer[3]);
+  }
+
+  number = quarterPoints * 4;
+  magnitudeVectorPtr = &magnitudeVector[number];
+  for(; number < num_points; number++){
+    float val1Real = *complexVectorPtr++;
+    float val1Imag = *complexVectorPtr++;
+    *magnitudeVectorPtr++ = (int16_t)(sqrtf((val1Real * val1Real) + (val1Imag * val1Imag)) * scalar);
+  }
+}
+#endif /* LV_HAVE_SSE */
+
+#if LV_HAVE_GENERIC
+/*!
+  \brief Calculates the magnitude of the complexVector, scales the resulting value and stores the results in the magnitudeVector
+  \param complexVector The vector containing the complex input values
+  \param scalar The scale value multiplied to the magnitude of each complex vector
+  \param magnitudeVector The vector containing the real output values
+  \param num_points The number of complex values in complexVector to be calculated and stored into cVector
+*/
+static inline void volk_32fc_magnitude_16s_aligned16_generic(int16_t* magnitudeVector, const lv_32fc_t* complexVector, const float scalar, unsigned int num_points){
+  const float* complexVectorPtr = (float*)complexVector;
+  int16_t* magnitudeVectorPtr = magnitudeVector;
+  unsigned int number = 0;
+  for(number = 0; number < num_points; number++){
+    const float real = *complexVectorPtr++;
+    const float imag = *complexVectorPtr++;
+    *magnitudeVectorPtr++ = (int16_t)(sqrtf((real*real) + (imag*imag)) * scalar);
+  }
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+
+
+#endif /* INCLUDED_VOLK_32fc_MAGNITUDE_16s_ALIGNED16_H */
diff --git a/volk/include/volk/volk_32fc_magnitude_32f_aligned16.h b/volk/include/volk/volk_32fc_magnitude_32f_aligned16.h
new file mode 100644
index 000000000..7a8fd1ef9
--- /dev/null
+++ b/volk/include/volk/volk_32fc_magnitude_32f_aligned16.h
@@ -0,0 +1,121 @@
+#ifndef INCLUDED_VOLK_32fc_MAGNITUDE_32f_ALIGNED16_H
+#define INCLUDED_VOLK_32fc_MAGNITUDE_32f_ALIGNED16_H
+
+#include <inttypes.h>
+#include <stdio.h>
+#include <math.h>
+
+#if LV_HAVE_SSE3
+#include <pmmintrin.h>
+  /*!
+    \brief Calculates the magnitude of the complexVector and stores the results in the magnitudeVector
+    \param complexVector The vector containing the complex input values
+    \param magnitudeVector The vector containing the real output values
+    \param num_points The number of complex values in complexVector to be calculated and stored into cVector
+  */
+static inline void volk_32fc_magnitude_32f_aligned16_sse3(float* magnitudeVector, const lv_32fc_t* complexVector, unsigned int num_points){
+    unsigned int number = 0;
+    const unsigned int quarterPoints = num_points / 4;
+
+    const float* complexVectorPtr = (float*)complexVector;
+    float* magnitudeVectorPtr = magnitudeVector;
+
+    __m128 cplxValue1, cplxValue2, result;
+    for(;number < quarterPoints; number++){
+      cplxValue1 = _mm_load_ps(complexVectorPtr);
+      complexVectorPtr += 4;
+
+      cplxValue2 = _mm_load_ps(complexVectorPtr);
+      complexVectorPtr += 4;
+
+      cplxValue1 = _mm_mul_ps(cplxValue1, cplxValue1); // Square the values
+      cplxValue2 = _mm_mul_ps(cplxValue2, cplxValue2); // Square the Values
+
+      result = _mm_hadd_ps(cplxValue1, cplxValue2); // Add the I2 and Q2 values
+
+      result = _mm_sqrt_ps(result);
+
+      _mm_store_ps(magnitudeVectorPtr, result);
+      magnitudeVectorPtr += 4;
+    }
+
+    number = quarterPoints * 4;
+    for(; number < num_points; number++){
+      float val1Real = *complexVectorPtr++;
+      float val1Imag = *complexVectorPtr++;
+      *magnitudeVectorPtr++ = sqrtf((val1Real * val1Real) + (val1Imag * val1Imag));
+    }
+}
+#endif /* LV_HAVE_SSE3 */
+
+#if LV_HAVE_SSE
+#include <xmmintrin.h>
+  /*!
+    \brief Calculates the magnitude of the complexVector and stores the results in the magnitudeVector
+    \param complexVector The vector containing the complex input values
+    \param magnitudeVector The vector containing the real output values
+    \param num_points The number of complex values in complexVector to be calculated and stored into cVector
+  */
+static inline void volk_32fc_magnitude_32f_aligned16_sse(float* magnitudeVector, const lv_32fc_t* complexVector, unsigned int num_points){
+    unsigned int number = 0;
+    const unsigned int quarterPoints = num_points / 4;
+    
+    const float* complexVectorPtr = (float*)complexVector;
+    float* magnitudeVectorPtr = magnitudeVector;
+
+    __m128 cplxValue1, cplxValue2, iValue, qValue, result;
+    for(;number < quarterPoints; number++){
+      cplxValue1 = _mm_load_ps(complexVectorPtr);
+      complexVectorPtr += 4;
+
+      cplxValue2 = _mm_load_ps(complexVectorPtr);
+      complexVectorPtr += 4;
+
+      // Arrange in i1i2i3i4 format
+      iValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2,0,2,0));
+      // Arrange in q1q2q3q4 format
+      qValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(3,1,3,1));
+
+      iValue = _mm_mul_ps(iValue, iValue); // Square the I values
+      qValue = _mm_mul_ps(qValue, qValue); // Square the Q Values
+
+      result = _mm_add_ps(iValue, qValue); // Add the I2 and Q2 values
+
+      result = _mm_sqrt_ps(result);
+
+      _mm_store_ps(magnitudeVectorPtr, result);
+      magnitudeVectorPtr += 4;
+    }
+
+    number = quarterPoints * 4;
+    for(; number < num_points; number++){
+       float val1Real = *complexVectorPtr++;
+       float val1Imag = *complexVectorPtr++;
+      *magnitudeVectorPtr++ = sqrtf((val1Real * val1Real) + (val1Imag * val1Imag));
+    }
+}
+#endif /* LV_HAVE_SSE */
+
+#if LV_HAVE_GENERIC
+  /*!
+    \brief Calculates the magnitude of the complexVector and stores the results in the magnitudeVector
+    \param complexVector The vector containing the complex input values
+    \param magnitudeVector The vector containing the real output values
+    \param num_points The number of complex values in complexVector to be calculated and stored into cVector
+  */
+static inline void volk_32fc_magnitude_32f_aligned16_generic(float* magnitudeVector, const lv_32fc_t* complexVector, unsigned int num_points){
+  const float* complexVectorPtr = (float*)complexVector;
+  float* magnitudeVectorPtr = magnitudeVector;
+  unsigned int number = 0;
+  for(number = 0; number < num_points; number++){
+    const float real = *complexVectorPtr++;
+    const float imag = *complexVectorPtr++;
+    *magnitudeVectorPtr++ = sqrtf((real*real) + (imag*imag));
+  }
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+
+
+#endif /* INCLUDED_VOLK_32fc_MAGNITUDE_32f_ALIGNED16_H */
diff --git a/volk/include/volk/volk_32fc_multiply_aligned16.h b/volk/include/volk/volk_32fc_multiply_aligned16.h
new file mode 100644
index 000000000..6a1649fdb
--- /dev/null
+++ b/volk/include/volk/volk_32fc_multiply_aligned16.h
@@ -0,0 +1,78 @@
+#ifndef INCLUDED_VOLK_32fc_MULTIPLY_ALIGNED16_H
+#define INCLUDED_VOLK_32fc_MULTIPLY_ALIGNED16_H
+
+#include <inttypes.h>
+#include <stdio.h>
+#include <volk/volk_complex.h>
+
+#if LV_HAVE_SSE3
+#include <pmmintrin.h>
+  /*!
+    \brief Multiplies the two input complex vectors and stores their results in the third vector
+    \param cVector The vector where the results will be stored
+    \param aVector One of the vectors to be multiplied
+    \param bVector One of the vectors to be multiplied
+    \param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector
+  */
+static inline void volk_32fc_multiply_aligned16_sse3(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t* bVector, unsigned int num_points){
+  unsigned int number = 0;
+    const unsigned int halfPoints = num_points / 2;
+
+    __m128 x, y, yl, yh, z, tmp1, tmp2;
+    lv_32fc_t* c = cVector;
+    const lv_32fc_t* a = aVector;
+    const lv_32fc_t* b = bVector;
+
+    for(;number < halfPoints; number++){
+      
+      x = _mm_load_ps((float*)a); // Load the ar + ai, br + bi as ar,ai,br,bi
+      y = _mm_load_ps((float*)b); // Load the cr + ci, dr + di as cr,ci,dr,di
+      
+      yl = _mm_moveldup_ps(y); // Load yl with cr,cr,dr,dr
+      yh = _mm_movehdup_ps(y); // Load yh with ci,ci,di,di
+      
+      tmp1 = _mm_mul_ps(x,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
+      
+      x = _mm_shuffle_ps(x,x,0xB1); // Re-arrange x to be ai,ar,bi,br
+      
+      tmp2 = _mm_mul_ps(x,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
+      
+      z = _mm_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
+    
+      _mm_store_ps((float*)c,z); // Store the results back into the C container
+
+      a += 2;
+      b += 2;
+      c += 2;
+    }
+
+    if((num_points % 2) != 0) {
+      *c = (*a) * (*b);
+    }
+}
+#endif /* LV_HAVE_SSE */
+
+#if LV_HAVE_GENERIC
+  /*!
+    \brief Multiplies the two input complex vectors and stores their results in the third vector
+    \param cVector The vector where the results will be stored
+    \param aVector One of the vectors to be multiplied
+    \param bVector One of the vectors to be multiplied
+    \param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector
+  */
+static inline void volk_32fc_multiply_aligned16_generic(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t* bVector, unsigned int num_points){
+    lv_32fc_t* cPtr = cVector;
+    const lv_32fc_t* aPtr = aVector;
+    const lv_32fc_t* bPtr=  bVector;
+    unsigned int number = 0;
+
+    for(number = 0; number < num_points; number++){
+      *cPtr++ = (*aPtr++) * (*bPtr++);
+    }
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+
+
+#endif /* INCLUDED_VOLK_32fc_MULTIPLY_ALIGNED16_H */
diff --git a/volk/include/volk/volk_32fc_power_spectral_density_32f_aligned16.h b/volk/include/volk/volk_32fc_power_spectral_density_32f_aligned16.h
new file mode 100644
index 000000000..52ec0f95b
--- /dev/null
+++ b/volk/include/volk/volk_32fc_power_spectral_density_32f_aligned16.h
@@ -0,0 +1,134 @@
+#ifndef INCLUDED_VOLK_32fc_POWER_SPECTRAL_DENSITY_32F_ALIGNED16_H
+#define INCLUDED_VOLK_32fc_POWER_SPECTRAL_DENSITY_32F_ALIGNED16_H
+
+#include <inttypes.h>
+#include <stdio.h>
+#include <math.h>
+
+#if LV_HAVE_SSE3
+#include <pmmintrin.h>
+
+#if LV_HAVE_LIB_SIMDMATH
+#include <simdmath.h>
+#endif /* LV_HAVE_LIB_SIMDMATH */
+
+/*!
+  \brief Calculates the log10 power value divided by the RBW for each input point
+  \param logPowerOutput The 10.0 * log10((r*r + i*i)/RBW) for each data point
+  \param complexFFTInput The complex data output from the FFT point
+  \param normalizationFactor This value is divided against all the input values before the power is calculated
+  \param rbw The resolution bandwith of the fft spectrum
+  \param num_points The number of fft data points
+*/
+static inline void volk_32fc_power_spectral_density_32f_aligned16_sse3(float* logPowerOutput, const lv_32fc_t* complexFFTInput, const float normalizationFactor, const float rbw, unsigned int num_points){
+  const float* inputPtr = (const float*)complexFFTInput;
+  float* destPtr = logPowerOutput;
+  uint64_t number = 0;
+  const float iRBW = 1.0 / rbw;
+  const float iNormalizationFactor = 1.0 / normalizationFactor;
+
+#if LV_HAVE_LIB_SIMDMATH
+  __m128 magScalar = _mm_set_ps1(10.0);
+  magScalar = _mm_div_ps(magScalar, logf4(magScalar));
+
+  __m128 invRBW = _mm_set_ps1(iRBW);
+  
+  __m128 invNormalizationFactor = _mm_set_ps1(iNormalizationFactor);
+
+  __m128 power;
+  __m128 input1, input2;
+  const uint64_t quarterPoints = num_points / 4;
+  for(;number < quarterPoints; number++){
+    // Load the complex values 
+    input1 =_mm_load_ps(inputPtr);
+    inputPtr += 4;
+    input2 =_mm_load_ps(inputPtr);
+    inputPtr += 4;
+    
+    // Apply the normalization factor
+    input1 = _mm_mul_ps(input1, invNormalizationFactor);
+    input2 = _mm_mul_ps(input2, invNormalizationFactor);
+
+    // Multiply each value by itself
+    // (r1*r1), (i1*i1), (r2*r2), (i2*i2)
+    input1 = _mm_mul_ps(input1, input1);
+    // (r3*r3), (i3*i3), (r4*r4), (i4*i4)
+    input2 = _mm_mul_ps(input2, input2);
+    
+    // Horizontal add, to add (r*r) + (i*i) for each complex value
+    // (r1*r1)+(i1*i1), (r2*r2) + (i2*i2), (r3*r3)+(i3*i3), (r4*r4)+(i4*i4)
+    power = _mm_hadd_ps(input1, input2);
+
+    // Divide by the rbw
+    power = _mm_mul_ps(power, invRBW);
+
+    // Calculate the natural log power
+    power = logf4(power);
+    
+    // Convert to log10 and multiply by 10.0
+    power = _mm_mul_ps(power, magScalar);
+    
+    // Store the floating point results
+    _mm_store_ps(destPtr, power);
+    
+    destPtr += 4;
+  }
+  
+  number = quarterPoints*4;  
+#endif /* LV_HAVE_LIB_SIMDMATH */
+  // Calculate the FFT for any remaining points
+  for(; number < num_points; number++){
+    // Calculate dBm
+    // 50 ohm load assumption
+    // 10 * log10 (v^2 / (2 * 50.0 * .001)) = 10 * log10( v^2 * 10)
+    // 75 ohm load assumption
+    // 10 * log10 (v^2 / (2 * 75.0 * .001)) = 10 * log10( v^2 * 15)
+    
+    const float real = *inputPtr++ * iNormalizationFactor;
+    const float imag = *inputPtr++ * iNormalizationFactor;
+
+    *destPtr = 10.0*log10f((((real * real) + (imag * imag)) + 1e-20) * iRBW);
+    destPtr++;
+  }
+  
+}
+#endif /* LV_HAVE_SSE3 */
+
+#if LV_HAVE_GENERIC
+/*!
+  \brief Calculates the log10 power value divided by the RBW for each input point
+  \param logPowerOutput The 10.0 * log10((r*r + i*i)/RBW) for each data point
+  \param complexFFTInput The complex data output from the FFT point
+  \param normalizationFactor This value is divided against all the input values before the power is calculated
+  \param rbw The resolution bandwith of the fft spectrum
+  \param num_points The number of fft data points
+*/
+static inline void volk_32fc_power_spectral_density_32f_aligned16_generic(float* logPowerOutput, const lv_32fc_t* complexFFTInput, const float normalizationFactor, const float rbw, unsigned int num_points){
+  // Calculate the Power of the complex point
+  const float* inputPtr = (float*)complexFFTInput;
+  float* realFFTDataPointsPtr = logPowerOutput;
+  unsigned int point;
+  const float invRBW = 1.0 / rbw;
+  const float iNormalizationFactor = 1.0 / normalizationFactor;
+
+  for(point = 0; point < num_points; point++){
+    // Calculate dBm
+    // 50 ohm load assumption
+    // 10 * log10 (v^2 / (2 * 50.0 * .001)) = 10 * log10( v^2 * 10)
+    // 75 ohm load assumption
+    // 10 * log10 (v^2 / (2 * 75.0 * .001)) = 10 * log10( v^2 * 15)
+
+    const float real = *inputPtr++ * iNormalizationFactor;
+    const float imag = *inputPtr++ * iNormalizationFactor;
+
+    *realFFTDataPointsPtr = 10.0*log10f((((real * real) + (imag * imag)) + 1e-20) * invRBW);
+    
+    realFFTDataPointsPtr++;
+  }
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+
+
+#endif /* INCLUDED_VOLK_32fc_POWER_SPECTRAL_DENSITY_32F_ALIGNED16_H */
diff --git a/volk/include/volk/volk_32fc_power_spectrum_32f_aligned16.h b/volk/include/volk/volk_32fc_power_spectrum_32f_aligned16.h
new file mode 100644
index 000000000..645629b9d
--- /dev/null
+++ b/volk/include/volk/volk_32fc_power_spectrum_32f_aligned16.h
@@ -0,0 +1,126 @@
+#ifndef INCLUDED_VOLK_32fc_POWER_SPECTRUM_32F_ALIGNED16_H
+#define INCLUDED_VOLK_32fc_POWER_SPECTRUM_32F_ALIGNED16_H
+
+#include <inttypes.h>
+#include <stdio.h>
+#include <math.h>
+
+#if LV_HAVE_SSE3
+#include <pmmintrin.h>
+
+#if LV_HAVE_LIB_SIMDMATH
+#include <simdmath.h>
+#endif /* LV_HAVE_LIB_SIMDMATH */
+
+/*!
+  \brief Calculates the log10 power value for each input point
+  \param logPowerOutput The 10.0 * log10(r*r + i*i) for each data point
+  \param complexFFTInput The complex data output from the FFT point
+  \param normalizationFactor This value is divided against all the input values before the power is calculated
+  \param num_points The number of fft data points
+*/
+static inline void volk_32fc_power_spectrum_32f_aligned16_sse3(float* logPowerOutput, const lv_32fc_t* complexFFTInput, const float normalizationFactor, unsigned int num_points){
+  const float* inputPtr = (const float*)complexFFTInput;
+  float* destPtr = logPowerOutput;
+  uint64_t number = 0;
+  const float iNormalizationFactor = 1.0 / normalizationFactor;
+#if LV_HAVE_LIB_SIMDMATH
+  __m128 magScalar = _mm_set_ps1(10.0);
+  magScalar = _mm_div_ps(magScalar, logf4(magScalar));
+
+  __m128 invNormalizationFactor = _mm_set_ps1(iNormalizationFactor);
+
+  __m128 power;
+  __m128 input1, input2;
+  const uint64_t quarterPoints = num_points / 4;
+  for(;number < quarterPoints; number++){
+    // Load the complex values 
+    input1 =_mm_load_ps(inputPtr);
+    inputPtr += 4;
+    input2 =_mm_load_ps(inputPtr);
+    inputPtr += 4;
+
+    // Apply the normalization factor
+    input1 = _mm_mul_ps(input1, invNormalizationFactor);
+    input2 = _mm_mul_ps(input2, invNormalizationFactor);
+    
+    // Multiply each value by itself
+    // (r1*r1), (i1*i1), (r2*r2), (i2*i2)
+    input1 = _mm_mul_ps(input1, input1);
+    // (r3*r3), (i3*i3), (r4*r4), (i4*i4)
+    input2 = _mm_mul_ps(input2, input2);
+    
+    // Horizontal add, to add (r*r) + (i*i) for each complex value
+    // (r1*r1)+(i1*i1), (r2*r2) + (i2*i2), (r3*r3)+(i3*i3), (r4*r4)+(i4*i4)
+    power = _mm_hadd_ps(input1, input2);
+    
+    // Calculate the natural log power
+    power = logf4(power);
+    
+    // Convert to log10 and multiply by 10.0
+    power = _mm_mul_ps(power, magScalar);
+    
+    // Store the floating point results
+    _mm_store_ps(destPtr, power);
+    
+    destPtr += 4;
+  }
+  
+  number = quarterPoints*4;  
+#endif /* LV_HAVE_LIB_SIMDMATH */
+  // Calculate the FFT for any remaining points
+
+  for(; number < num_points; number++){
+    // Calculate dBm
+    // 50 ohm load assumption
+    // 10 * log10 (v^2 / (2 * 50.0 * .001)) = 10 * log10( v^2 * 10)
+    // 75 ohm load assumption
+    // 10 * log10 (v^2 / (2 * 75.0 * .001)) = 10 * log10( v^2 * 15)
+
+    const float real = *inputPtr++ * iNormalizationFactor;
+    const float imag = *inputPtr++ * iNormalizationFactor;
+
+    *destPtr = 10.0*log10f(((real * real) + (imag * imag)) + 1e-20);
+    
+    destPtr++;
+  }
+  
+}
+#endif /* LV_HAVE_SSE3 */
+
+#if LV_HAVE_GENERIC
+/*!
+  \brief Calculates the log10 power value for each input point
+  \param logPowerOutput The 10.0 * log10(r*r + i*i) for each data point
+  \param complexFFTInput The complex data output from the FFT point
+  \param normalizationFactor This value is divided agains all the input values before the power is calculated
+  \param num_points The number of fft data points
+*/
+static inline void volk_32fc_power_spectrum_32f_aligned16_generic(float* logPowerOutput, const lv_32fc_t* complexFFTInput, const float normalizationFactor, unsigned int num_points){
+  // Calculate the Power of the complex point
+  const float* inputPtr = (float*)complexFFTInput;
+  float* realFFTDataPointsPtr = logPowerOutput;
+  const float iNormalizationFactor = 1.0 / normalizationFactor;
+  unsigned int point;
+  for(point = 0; point < num_points; point++){
+    // Calculate dBm
+    // 50 ohm load assumption
+    // 10 * log10 (v^2 / (2 * 50.0 * .001)) = 10 * log10( v^2 * 10)
+    // 75 ohm load assumption
+    // 10 * log10 (v^2 / (2 * 75.0 * .001)) = 10 * log10( v^2 * 15)
+
+    const float real = *inputPtr++ * iNormalizationFactor;
+    const float imag = *inputPtr++ * iNormalizationFactor;
+
+    *realFFTDataPointsPtr = 10.0*log10f(((real * real) + (imag * imag)) + 1e-20);
+
+    
+    realFFTDataPointsPtr++;
+  }
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+
+
+#endif /* INCLUDED_VOLK_32fc_POWER_SPECTRUM_32F_ALIGNED16_H */
diff --git a/volk/include/volk/volk_32fc_square_dist_aligned16.h b/volk/include/volk/volk_32fc_square_dist_aligned16.h
new file mode 100644
index 000000000..6458ea4dd
--- /dev/null
+++ b/volk/include/volk/volk_32fc_square_dist_aligned16.h
@@ -0,0 +1,112 @@
+#ifndef INCLUDED_VOLK_32FC_SQUARE_DIST_ALIGNED16_H
+#define INCLUDED_VOLK_32FC_SQUARE_DIST_ALIGNED16_H
+
+#include<inttypes.h>
+#include<stdio.h>
+#include<volk/volk_complex.h>
+
+#if LV_HAVE_SSE3
+#include<xmmintrin.h>
+#include<pmmintrin.h>
+
+static inline void volk_32fc_square_dist_aligned16_sse3(float* target, lv_32fc_t* src0, lv_32fc_t* points, unsigned int num_bytes) {
+  
+
+  __m128 xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
+
+  lv_32fc_t diff;
+  float sq_dist;
+  int bound = num_bytes >> 5;
+  int leftovers0 = (num_bytes >> 4) & 1;
+  int leftovers1 = (num_bytes >> 3) & 1;
+  int i = 0;
+
+  xmm1 = _mm_setzero_ps();
+  xmm1 = _mm_loadl_pi(xmm1, (__m64*)src0);  
+  xmm2 = _mm_load_ps((float*)&points[0]);
+  xmm1 = _mm_movelh_ps(xmm1, xmm1);
+  xmm3 = _mm_load_ps((float*)&points[2]);
+  
+
+  for(; i < bound - 1; ++i) {
+    xmm4 = _mm_sub_ps(xmm1, xmm2);
+    xmm5 = _mm_sub_ps(xmm1, xmm3);
+    points += 4;
+    xmm6 = _mm_mul_ps(xmm4, xmm4);
+    xmm7 = _mm_mul_ps(xmm5, xmm5);
+    
+    xmm2 = _mm_load_ps((float*)&points[0]);
+    
+    xmm4 = _mm_hadd_ps(xmm6, xmm7);
+
+    xmm3 = _mm_load_ps((float*)&points[2]);
+
+    _mm_store_ps(target, xmm4);
+
+    target += 4;
+
+  }
+  
+  xmm4 = _mm_sub_ps(xmm1, xmm2);
+  xmm5 = _mm_sub_ps(xmm1, xmm3);
+  
+  
+
+  points += 4;
+  xmm6 = _mm_mul_ps(xmm4, xmm4);
+  xmm7 = _mm_mul_ps(xmm5, xmm5);
+    
+  xmm4 = _mm_hadd_ps(xmm6, xmm7);
+   
+  _mm_store_ps(target, xmm4);
+  
+  target += 4;
+
+  for(i = 0; i < leftovers0; ++i) {
+    
+    xmm2 = _mm_load_ps((float*)&points[0]);
+    
+    xmm4 = _mm_sub_ps(xmm1, xmm2);
+    
+    points += 2;
+    
+    xmm6 = _mm_mul_ps(xmm4, xmm4);
+
+    xmm4 = _mm_hadd_ps(xmm6, xmm6);
+    
+    _mm_storeh_pi((__m64*)target, xmm4);
+
+    target += 2;
+  }
+
+  for(i = 0; i < leftovers1; ++i) {
+    
+    diff = src0[0] - points[0];
+
+    sq_dist = lv_creal(diff) * lv_creal(diff) + lv_cimag(diff) * lv_cimag(diff);
+
+    target[0] = sq_dist;
+  }
+}
+
+#endif /*LV_HAVE_SSE3*/
+
+#if LV_HAVE_GENERIC
+static inline void volk_32fc_square_dist_aligned16_generic(float* target, lv_32fc_t* src0, lv_32fc_t* points, unsigned int num_bytes) {
+  lv_32fc_t diff;
+  float sq_dist;
+  int i = 0; 
+  
+  for(; i < num_bytes >> 3; ++i) {
+    diff = src0[0] - points[i];
+
+    sq_dist = lv_creal(diff) * lv_creal(diff) + lv_cimag(diff) * lv_cimag(diff);
+    
+    target[i] = sq_dist;
+  }
+}
+
+#endif /*LV_HAVE_GENERIC*/
+
+
+#endif /*INCLUDED_VOLK_32FC_SQUARE_DIST_ALIGNED16_H*/
diff --git a/volk/include/volk/volk_32fc_square_dist_scalar_mult_aligned16.h b/volk/include/volk/volk_32fc_square_dist_scalar_mult_aligned16.h
new file mode 100644
index 000000000..0fcc86f1e
--- /dev/null
+++ b/volk/include/volk/volk_32fc_square_dist_scalar_mult_aligned16.h
@@ -0,0 +1,126 @@
+#ifndef INCLUDED_VOLK_32FC_SQUARE_DIST_SCALAR_MULT_ALIGNED16_H
+#define INCLUDED_VOLK_32FC_SQUARE_DIST_SCALAR_MULT_ALIGNED16_H
+
+#include<inttypes.h>
+#include<stdio.h>
+#include<volk/volk_complex.h>
+#include <string.h>
+
+#if LV_HAVE_SSE3
+#include<xmmintrin.h>
+#include<pmmintrin.h>
+
+static inline void volk_32fc_square_dist_scalar_mult_aligned16_sse3(float* target, lv_32fc_t* src0, lv_32fc_t* points, float scalar, unsigned int num_bytes) {
+  
+
+  __m128 xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
+
+  lv_32fc_t diff;
+  memset(&diff, 0x0, 2*sizeof(float));
+
+  float sq_dist = 0.0;
+  int bound = num_bytes >> 5;
+  int leftovers0 = (num_bytes >> 4) & 1;
+  int leftovers1 = (num_bytes >> 3) & 1;
+  int i = 0;
+  
+  
+  
+  xmm1 = _mm_setzero_ps();
+  xmm1 = _mm_loadl_pi(xmm1, (__m64*)src0);  
+  xmm2 = _mm_load_ps((float*)&points[0]);
+  xmm8 = _mm_load1_ps(&scalar);
+  xmm1 = _mm_movelh_ps(xmm1, xmm1);
+  xmm3 = _mm_load_ps((float*)&points[2]);
+  
+  
+  for(; i < bound - 1; ++i) {
+  
+    xmm4 = _mm_sub_ps(xmm1, xmm2);
+    xmm5 = _mm_sub_ps(xmm1, xmm3);
+    points += 4;
+    xmm6 = _mm_mul_ps(xmm4, xmm4);
+    xmm7 = _mm_mul_ps(xmm5, xmm5);
+    
+    xmm2 = _mm_load_ps((float*)&points[0]);
+    
+    xmm4 = _mm_hadd_ps(xmm6, xmm7);
+
+    xmm3 = _mm_load_ps((float*)&points[2]);
+    
+    xmm4 = _mm_mul_ps(xmm4, xmm8);
+
+    _mm_store_ps(target, xmm4);
+
+    target += 4;
+
+  }
+  
+  xmm4 = _mm_sub_ps(xmm1, xmm2);
+  xmm5 = _mm_sub_ps(xmm1, xmm3);
+  
+  
+
+  points += 4;
+  xmm6 = _mm_mul_ps(xmm4, xmm4);
+  xmm7 = _mm_mul_ps(xmm5, xmm5);
+    
+  xmm4 = _mm_hadd_ps(xmm6, xmm7);
+  
+  xmm4 = _mm_mul_ps(xmm4, xmm8);
+   
+  _mm_store_ps(target, xmm4);
+  
+  target += 4;
+  
+
+  for(i = 0; i < leftovers0; ++i) {
+    
+    xmm2 = _mm_load_ps((float*)&points[0]);
+    
+    xmm4 = _mm_sub_ps(xmm1, xmm2);
+    
+    points += 2;
+    
+    xmm6 = _mm_mul_ps(xmm4, xmm4);
+
+    xmm4 = _mm_hadd_ps(xmm6, xmm6);
+
+    xmm4 = _mm_mul_ps(xmm4, xmm8);
+    
+    _mm_storeh_pi((__m64*)target, xmm4);
+
+    target += 2;
+  }
+
+  for(i = 0; i < leftovers1; ++i) {
+    
+    diff = src0[0] - points[0];
+
+    sq_dist = scalar * (lv_creal(diff) * lv_creal(diff) + lv_cimag(diff) * lv_cimag(diff));
+
+    target[0] = sq_dist;
+  }
+}
+
+#endif /*LV_HAVE_SSE3*/
+
+#if LV_HAVE_GENERIC
+static inline void volk_32fc_square_dist_scalar_mult_aligned16_generic(float* target, lv_32fc_t* src0, lv_32fc_t* points, float scalar, unsigned int num_bytes) {
+  lv_32fc_t diff;
+  float sq_dist;
+  int i = 0; 
+  
+  for(; i < num_bytes >> 3; ++i) {
+    diff = src0[0] - points[i];
+
+    sq_dist = scalar * (lv_creal(diff) * lv_creal(diff) + lv_cimag(diff) * lv_cimag(diff));
+    
+    target[i] = sq_dist;
+  }
+}
+
+#endif /*LV_HAVE_GENERIC*/
+
+
+#endif /*INCLUDED_VOLK_32FC_SQUARE_DIST_SCALAR_MULT_ALIGNED16_H*/
diff --git a/volk/include/volk/volk_32s_and_aligned16.h b/volk/include/volk/volk_32s_and_aligned16.h
new file mode 100644
index 000000000..e9f1e3a43
--- /dev/null
+++ b/volk/include/volk/volk_32s_and_aligned16.h
@@ -0,0 +1,69 @@
+#ifndef INCLUDED_VOLK_32s_AND_ALIGNED16_H
+#define INCLUDED_VOLK_32s_AND_ALIGNED16_H
+
+#include <inttypes.h>
+#include <stdio.h>
+
+#if LV_HAVE_SSE
+#include <xmmintrin.h>
+/*!
+  \brief Ands the two input vectors and store their results in the third vector
+  \param cVector The vector where the results will be stored
+  \param aVector One of the vectors
+  \param bVector One of the vectors
+  \param num_points The number of values in aVector and bVector to be anded together and stored into cVector
+*/
+static inline void volk_32s_and_aligned16_sse(int32_t* cVector, const int32_t* aVector, const int32_t* bVector, unsigned int num_points){
+    unsigned int number = 0;
+    const unsigned int quarterPoints = num_points / 4;
+
+    float* cPtr = (float*)cVector;
+    const float* aPtr = (float*)aVector;
+    const float* bPtr = (float*)bVector;
+
+    __m128 aVal, bVal, cVal;
+    for(;number < quarterPoints; number++){
+      
+      aVal = _mm_load_ps(aPtr); 
+      bVal = _mm_load_ps(bPtr);
+      
+      cVal = _mm_and_ps(aVal, bVal); 
+      
+      _mm_store_ps(cPtr,cVal); // Store the results back into the C container
+
+      aPtr += 4;
+      bPtr += 4;
+      cPtr += 4;
+    }
+
+    number = quarterPoints * 4;
+    for(;number < num_points; number++){
+      cVector[number] = aVector[number] & bVector[number];
+    }
+}
+#endif /* LV_HAVE_SSE */
+
+#if LV_HAVE_GENERIC
+/*!
+  \brief Ands the two input vectors and store their results in the third vector
+  \param cVector The vector where the results will be stored
+  \param aVector One of the vectors
+  \param bVector One of the vectors
+  \param num_points The number of values in aVector and bVector to be anded together and stored into cVector
+*/
+static inline void volk_32s_and_aligned16_generic(int32_t* cVector, const int32_t* aVector, const int32_t* bVector, unsigned int num_points){
+    int32_t* cPtr = cVector;
+    const int32_t* aPtr = aVector;
+    const int32_t* bPtr=  bVector;
+    unsigned int number = 0;
+
+    for(number = 0; number < num_points; number++){
+      *cPtr++ = (*aPtr++) & (*bPtr++);
+    }
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+
+
+#endif /* INCLUDED_VOLK_32s_AND_ALIGNED16_H */
diff --git a/volk/include/volk/volk_32s_convert_32f_aligned16.h b/volk/include/volk/volk_32s_convert_32f_aligned16.h
new file mode 100644
index 000000000..a407e68bd
--- /dev/null
+++ b/volk/include/volk/volk_32s_convert_32f_aligned16.h
@@ -0,0 +1,73 @@
+#ifndef INCLUDED_VOLK_32s_CONVERT_32f_ALIGNED16_H
+#define INCLUDED_VOLK_32s_CONVERT_32f_ALIGNED16_H
+
+#include <inttypes.h>
+#include <stdio.h>
+
+#if LV_HAVE_SSE2
+#include <emmintrin.h>
+
+  /*!
+    \brief Converts the input 32 bit integer data into floating point data, and divides the each floating point output data point by the scalar value
+    \param inputVector The 32 bit input data buffer
+    \param outputVector The floating point output data buffer
+    \param scalar The value divided against each point in the output buffer
+    \param num_points The number of data values to be converted
+  */
+static inline void volk_32s_convert_32f_aligned16_sse2(float* outputVector, const int32_t* inputVector, const float scalar, unsigned int num_points){
+    unsigned int number = 0;
+    const unsigned int quarterPoints = num_points / 4;
+    
+     float* outputVectorPtr = outputVector;
+     const float iScalar = 1.0 / scalar;
+    __m128 invScalar = _mm_set_ps1(iScalar);
+    int32_t* inputPtr = (int32_t*)inputVector;
+    __m128i inputVal;
+    __m128 ret;
+
+    for(;number < quarterPoints; number++){
+
+      // Load the 4 values
+      inputVal = _mm_load_si128((__m128i*)inputPtr);
+
+      ret = _mm_cvtepi32_ps(inputVal);
+      ret = _mm_mul_ps(ret, invScalar);
+
+      _mm_store_ps(outputVectorPtr, ret);
+
+      outputVectorPtr += 4;
+      inputPtr += 4;
+    }
+
+    number = quarterPoints * 4;
+    for(; number < num_points; number++){
+      outputVector[number] =((float)(inputVector[number])) * iScalar;
+    }
+}
+#endif /* LV_HAVE_SSE2 */
+
+
+#if LV_HAVE_GENERIC
+  /*!
+    \brief Converts the input 32 bit integer data into floating point data, and divides the each floating point output data point by the scalar value
+    \param inputVector The 32 bit input data buffer
+    \param outputVector The floating point output data buffer
+    \param scalar The value divided against each point in the output buffer
+    \param num_points The number of data values to be converted
+  */
+static inline void volk_32s_convert_32f_aligned16_generic(float* outputVector, const int32_t* inputVector, const float scalar, unsigned int num_points){
+  float* outputVectorPtr = outputVector;
+  const int32_t* inputVectorPtr = inputVector;
+  unsigned int number = 0;
+  const float iScalar = 1.0 / scalar;
+
+  for(number = 0; number < num_points; number++){
+    *outputVectorPtr++ = ((float)(*inputVectorPtr++)) * iScalar;
+  }
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+
+
+#endif /* INCLUDED_VOLK_32s_CONVERT_32f_ALIGNED16_H */
diff --git a/volk/include/volk/volk_32s_convert_32f_unaligned16.h b/volk/include/volk/volk_32s_convert_32f_unaligned16.h
new file mode 100644
index 000000000..ad7d4eb17
--- /dev/null
+++ b/volk/include/volk/volk_32s_convert_32f_unaligned16.h
@@ -0,0 +1,75 @@
+#ifndef INCLUDED_VOLK_32s_CONVERT_32f_UNALIGNED16_H
+#define INCLUDED_VOLK_32s_CONVERT_32f_UNALIGNED16_H
+
+#include <inttypes.h>
+#include <stdio.h>
+
+#if LV_HAVE_SSE2
+#include <emmintrin.h>
+
+  /*!
+    \brief Converts the input 32 bit integer data into floating point data, and divides the each floating point output data point by the scalar value
+    \param inputVector The 32 bit input data buffer
+    \param outputVector The floating point output data buffer
+    \param scalar The value divided against each point in the output buffer
+    \param num_points The number of data values to be converted
+    \note Output buffer does NOT need to be properly aligned
+  */
+static inline void volk_32s_convert_32f_unaligned16_sse2(float* outputVector, const int32_t* inputVector, const float scalar, unsigned int num_points){
+    unsigned int number = 0;
+    const unsigned int quarterPoints = num_points / 4;
+    
+     float* outputVectorPtr = outputVector;
+     const float iScalar = 1.0 / scalar;
+    __m128 invScalar = _mm_set_ps1(iScalar);
+    int32_t* inputPtr = (int32_t*)inputVector;
+    __m128i inputVal;
+    __m128 ret;
+
+    for(;number < quarterPoints; number++){
+
+      // Load the 4 values
+      inputVal = _mm_loadu_si128((__m128i*)inputPtr);
+
+      ret = _mm_cvtepi32_ps(inputVal);
+      ret = _mm_mul_ps(ret, invScalar);
+
+      _mm_storeu_ps(outputVectorPtr, ret);
+
+      outputVectorPtr += 4;
+      inputPtr += 4;
+    }
+
+    number = quarterPoints * 4;
+    for(; number < num_points; number++){
+      outputVector[number] =((float)(inputVector[number])) * iScalar;
+    }
+}
+#endif /* LV_HAVE_SSE2 */
+
+
+#if LV_HAVE_GENERIC
+  /*!
+    \brief Converts the input 32 bit integer data into floating point data, and divides the each floating point output data point by the scalar value
+    \param inputVector The 32 bit input data buffer
+    \param outputVector The floating point output data buffer
+    \param scalar The value divided against each point in the output buffer
+    \param num_points The number of data values to be converted
+    \note Output buffer does NOT need to be properly aligned
+  */
+static inline void volk_32s_convert_32f_unaligned16_generic(float* outputVector, const int32_t* inputVector, const float scalar, unsigned int num_points){
+  float* outputVectorPtr = outputVector;
+  const int32_t* inputVectorPtr = inputVector;
+  unsigned int number = 0;
+  const float iScalar = 1.0 / scalar;
+
+  for(number = 0; number < num_points; number++){
+    *outputVectorPtr++ = ((float)(*inputVectorPtr++)) * iScalar;
+  }
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+
+
+#endif /* INCLUDED_VOLK_32s_CONVERT_32f_UNALIGNED16_H */
diff --git a/volk/include/volk/volk_32s_or_aligned16.h b/volk/include/volk/volk_32s_or_aligned16.h
new file mode 100644
index 000000000..f4c427c4d
--- /dev/null
+++ b/volk/include/volk/volk_32s_or_aligned16.h
@@ -0,0 +1,69 @@
+#ifndef INCLUDED_VOLK_32s_OR_ALIGNED16_H
+#define INCLUDED_VOLK_32s_OR_ALIGNED16_H
+
+#include <inttypes.h>
+#include <stdio.h>
+
+#if LV_HAVE_SSE
+#include <xmmintrin.h>
+/*!
+  \brief Ors the two input vectors and store their results in the third vector
+  \param cVector The vector where the results will be stored
+  \param aVector One of the vectors to be ored
+  \param bVector One of the vectors to be ored
+  \param num_points The number of values in aVector and bVector to be ored together and stored into cVector
+*/
+static inline void volk_32s_or_aligned16_sse(int32_t* cVector, const int32_t* aVector, const int32_t* bVector, unsigned int num_points){
+    unsigned int number = 0;
+    const unsigned int quarterPoints = num_points / 4;
+
+    float* cPtr = (float*)cVector;
+    const float* aPtr = (float*)aVector;
+    const float* bPtr = (float*)bVector;
+
+    __m128 aVal, bVal, cVal;
+    for(;number < quarterPoints; number++){
+      
+      aVal = _mm_load_ps(aPtr); 
+      bVal = _mm_load_ps(bPtr);
+      
+      cVal = _mm_or_ps(aVal, bVal); 
+      
+      _mm_store_ps(cPtr,cVal); // Store the results back into the C container
+
+      aPtr += 4;
+      bPtr += 4;
+      cPtr += 4;
+    }
+
+    number = quarterPoints * 4;
+    for(;number < num_points; number++){
+      cVector[number] = aVector[number] | bVector[number];
+    }
+}
+#endif /* LV_HAVE_SSE */
+
+#if LV_HAVE_GENERIC
+/*!
+  \brief Ors the two input vectors and store their results in the third vector
+  \param cVector The vector where the results will be stored
+  \param aVector One of the vectors to be ored
+  \param bVector One of the vectors to be ored
+  \param num_points The number of values in aVector and bVector to be ored together and stored into cVector
+*/
+static inline void volk_32s_or_aligned16_generic(int32_t* cVector, const int32_t* aVector, const int32_t* bVector, unsigned int num_points){
+    int32_t* cPtr = cVector;
+    const int32_t* aPtr = aVector;
+    const int32_t* bPtr=  bVector;
+    unsigned int number = 0;
+
+    for(number = 0; number < num_points; number++){
+      *cPtr++ = (*aPtr++) | (*bPtr++);
+    }
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+
+
+#endif /* INCLUDED_VOLK_32s_OR_ALIGNED16_H */
diff --git a/volk/include/volk/volk_32u_byteswap_aligned16.h b/volk/include/volk/volk_32u_byteswap_aligned16.h
new file mode 100644
index 000000000..09173a9d5
--- /dev/null
+++ b/volk/include/volk/volk_32u_byteswap_aligned16.h
@@ -0,0 +1,77 @@
+#ifndef INCLUDED_VOLK_32u_BYTESWAP_ALIGNED16_H
+#define INCLUDED_VOLK_32u_BYTESWAP_ALIGNED16_H
+
+#include <inttypes.h>
+#include <stdio.h>
+
+#if LV_HAVE_SSE2
+#include <emmintrin.h>
+
+/*!
+  \brief Byteswaps (in-place) an aligned vector of int32_t's.
+  \param intsToSwap The vector of data to byte swap
+  \param numDataPoints The number of data points
+*/
+static inline void volk_32u_byteswap_aligned16_sse2(uint32_t* intsToSwap, unsigned int num_points){
+  unsigned int number = 0;
+
+  uint32_t* inputPtr = intsToSwap;
+  __m128i input, byte1, byte2, byte3, byte4, output;
+  __m128i byte2mask = _mm_set1_epi32(0x00FF0000);
+  __m128i byte3mask = _mm_set1_epi32(0x0000FF00);
+
+  const uint64_t quarterPoints = num_points / 4;
+  for(;number < quarterPoints; number++){
+    // Load the 32t values, increment inputPtr later since we're doing it in-place.
+    input = _mm_load_si128((__m128i*)inputPtr);
+    // Do the four shifts
+    byte1 = _mm_slli_epi32(input, 24);
+    byte2 = _mm_slli_epi32(input, 8);
+    byte3 = _mm_srli_epi32(input, 8);
+    byte4 = _mm_srli_epi32(input, 24);
+    // Or bytes together
+    output = _mm_or_si128(byte1, byte4);
+    byte2 = _mm_and_si128(byte2, byte2mask);
+    output = _mm_or_si128(output, byte2);
+    byte3 = _mm_and_si128(byte3, byte3mask);
+    output = _mm_or_si128(output, byte3);
+    // Store the results
+    _mm_store_si128((__m128i*)inputPtr, output);
+    inputPtr += 4;
+  }
+  
+  // Byteswap any remaining points:
+  number = quarterPoints*4;  
+  for(; number < num_points; number++){
+    uint32_t outputVal = *inputPtr;
+    outputVal = (((outputVal >> 24) & 0xff) | ((outputVal >> 8) & 0x0000ff00) | ((outputVal << 8) & 0x00ff0000) | ((outputVal << 24) & 0xff000000));
+    *inputPtr = outputVal;
+    inputPtr++;
+  }
+}
+#endif /* LV_HAVE_SSE2 */
+
+#if LV_HAVE_GENERIC
+/*!
+  \brief Byteswaps (in-place) an aligned vector of int32_t's.
+  \param intsToSwap The vector of data to byte swap
+  \param numDataPoints The number of data points
+*/
+static inline void volk_32u_byteswap_aligned16_generic(uint32_t* intsToSwap, unsigned int num_points){
+  uint32_t* inputPtr = intsToSwap;
+
+  unsigned int point;
+  for(point = 0; point < num_points; point++){
+    uint32_t output = *inputPtr;
+    output = (((output >> 24) & 0xff) | ((output >> 8) & 0x0000ff00) | ((output << 8) & 0x00ff0000) | ((output << 24) & 0xff000000));
+    
+    *inputPtr = output;
+    inputPtr++;
+  }
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+
+
+#endif /* INCLUDED_VOLK_32u_BYTESWAP_ALIGNED16_H */
diff --git a/volk/include/volk/volk_32u_popcnt_aligned16.h b/volk/include/volk/volk_32u_popcnt_aligned16.h
new file mode 100644
index 000000000..37cfd112c
--- /dev/null
+++ b/volk/include/volk/volk_32u_popcnt_aligned16.h
@@ -0,0 +1,36 @@
+#ifndef INCLUDED_VOLK_32u_POPCNT_ALIGNED16_H
+#define INCLUDED_VOLK_32u_POPCNT_ALIGNED16_H
+
+#include <stdio.h>
+#include <inttypes.h>
+
+
+#if LV_HAVE_GENERIC
+
+static inline void volk_32u_popcnt_aligned16_generic(uint32_t* ret, const uint32_t value) {
+
+  // This is faster than a lookup table
+  uint32_t retVal = value;
+
+  retVal = (retVal & 0x55555555) + (retVal >> 1 & 0x55555555);
+  retVal = (retVal & 0x33333333) + (retVal >> 2 & 0x33333333);
+  retVal = (retVal + (retVal >> 4)) & 0x0F0F0F0F;
+  retVal = (retVal + (retVal >> 8));
+  retVal = (retVal + (retVal >> 16)) & 0x0000003F;
+
+  *ret = retVal;
+}
+
+#endif /*LV_HAVE_GENERIC*/
+
+#if LV_HAVE_SSE4_2
+
+#include <nmmintrin.h>
+
+static inline void volk_32u_popcnt_aligned16_sse4_2(uint32_t* ret, const uint32_t value) {
+  *ret = _mm_popcnt_u32(value);
+}
+
+#endif /*LV_HAVE_SSE4_2*/
+
+#endif /*INCLUDED_VOLK_32u_POPCNT_ALIGNED16_H*/
diff --git a/volk/include/volk/volk_64f_convert_32f_aligned16.h b/volk/include/volk/volk_64f_convert_32f_aligned16.h
new file mode 100644
index 000000000..44df66104
--- /dev/null
+++ b/volk/include/volk/volk_64f_convert_32f_aligned16.h
@@ -0,0 +1,67 @@
+#ifndef INCLUDED_VOLK_64f_CONVERT_32f_ALIGNED16_H
+#define INCLUDED_VOLK_64f_CONVERT_32f_ALIGNED16_H
+
+#include <inttypes.h>
+#include <stdio.h>
+
+#if LV_HAVE_SSE2
+#include <emmintrin.h>
+  /*!
+    \brief Converts the double values into float values
+    \param dVector The converted float vector values
+    \param fVector The double vector values to be converted
+    \param num_points The number of points in the two vectors to be converted
+  */
+static inline void volk_64f_convert_32f_aligned16_sse2(float* outputVector, const double* inputVector, unsigned int num_points){
+  unsigned int number = 0;
+
+  const unsigned int quarterPoints = num_points / 4;
+    
+  const double* inputVectorPtr = (const double*)inputVector;
+  float* outputVectorPtr = outputVector;
+  __m128 ret, ret2;
+  __m128d inputVal1, inputVal2;
+
+  for(;number < quarterPoints; number++){
+    inputVal1 = _mm_load_pd(inputVectorPtr); inputVectorPtr += 2;
+    inputVal2 = _mm_load_pd(inputVectorPtr); inputVectorPtr += 2;
+ 
+    ret = _mm_cvtpd_ps(inputVal1);
+    ret2 = _mm_cvtpd_ps(inputVal2);
+
+    ret = _mm_movelh_ps(ret, ret2);
+
+    _mm_store_ps(outputVectorPtr, ret);
+    outputVectorPtr += 4;
+  }
+
+  number = quarterPoints * 4;    
+  for(; number < num_points; number++){
+    outputVector[number] = (float)(inputVector[number]);
+  }
+}
+#endif /* LV_HAVE_SSE2 */
+
+
+#ifdef LV_HAVE_GENERIC
+/*!
+  \brief Converts the double values into float values
+  \param dVector The converted float vector values
+  \param fVector The double vector values to be converted
+  \param num_points The number of points in the two vectors to be converted
+*/
+static inline void volk_64f_convert_32f_aligned16_generic(float* outputVector, const double* inputVector, unsigned int num_points){
+  float* outputVectorPtr = outputVector;
+  const double* inputVectorPtr = inputVector;
+  unsigned int number = 0;
+
+  for(number = 0; number < num_points; number++){
+    *outputVectorPtr++ = ((float)(*inputVectorPtr++));
+  }
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+
+
+#endif /* INCLUDED_VOLK_64f_CONVERT_32f_ALIGNED16_H */
diff --git a/volk/include/volk/volk_64f_convert_32f_unaligned16.h b/volk/include/volk/volk_64f_convert_32f_unaligned16.h
new file mode 100644
index 000000000..08cfb6127
--- /dev/null
+++ b/volk/include/volk/volk_64f_convert_32f_unaligned16.h
@@ -0,0 +1,67 @@
+#ifndef INCLUDED_VOLK_64f_CONVERT_32f_UNALIGNED16_H
+#define INCLUDED_VOLK_64f_CONVERT_32f_UNALIGNED16_H
+
+#include <inttypes.h>
+#include <stdio.h>
+
+#if LV_HAVE_SSE2
+#include <emmintrin.h>
+  /*!
+    \brief Converts the double values into float values
+    \param dVector The converted float vector values
+    \param fVector The double vector values to be converted
+    \param num_points The number of points in the two vectors to be converted
+  */
+static inline void volk_64f_convert_32f_unaligned16_sse2(float* outputVector, const double* inputVector, unsigned int num_points){
+  unsigned int number = 0;
+
+  const unsigned int quarterPoints = num_points / 4;
+    
+  const double* inputVectorPtr = (const double*)inputVector;
+  float* outputVectorPtr = outputVector;
+  __m128 ret, ret2;
+  __m128d inputVal1, inputVal2;
+
+  for(;number < quarterPoints; number++){
+    inputVal1 = _mm_loadu_pd(inputVectorPtr); inputVectorPtr += 2;
+    inputVal2 = _mm_loadu_pd(inputVectorPtr); inputVectorPtr += 2;
+ 
+    ret = _mm_cvtpd_ps(inputVal1);
+    ret2 = _mm_cvtpd_ps(inputVal2);
+
+    ret = _mm_movelh_ps(ret, ret2);
+
+    _mm_storeu_ps(outputVectorPtr, ret);
+    outputVectorPtr += 4;
+  }
+
+  number = quarterPoints * 4;    
+  for(; number < num_points; number++){
+    outputVector[number] = (float)(inputVector[number]);
+  }
+}
+#endif /* LV_HAVE_SSE2 */
+
+
+#ifdef LV_HAVE_GENERIC
+/*!
+  \brief Converts the double values into float values
+  \param dVector The converted float vector values
+  \param fVector The double vector values to be converted
+  \param num_points The number of points in the two vectors to be converted
+*/
+static inline void volk_64f_convert_32f_unaligned16_generic(float* outputVector, const double* inputVector, unsigned int num_points){
+  float* outputVectorPtr = outputVector;
+  const double* inputVectorPtr = inputVector;
+  unsigned int number = 0;
+
+  for(number = 0; number < num_points; number++){
+    *outputVectorPtr++ = ((float)(*inputVectorPtr++));
+  }
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+
+
+#endif /* INCLUDED_VOLK_64f_CONVERT_32f_UNALIGNED16_H */
diff --git a/volk/include/volk/volk_64f_max_aligned16.h b/volk/include/volk/volk_64f_max_aligned16.h
new file mode 100644
index 000000000..ce4907a8c
--- /dev/null
+++ b/volk/include/volk/volk_64f_max_aligned16.h
@@ -0,0 +1,71 @@
+#ifndef INCLUDED_VOLK_64f_MAX_ALIGNED16_H
+#define INCLUDED_VOLK_64f_MAX_ALIGNED16_H
+
+#include <inttypes.h>
+#include <stdio.h>
+
+#if LV_HAVE_SSE2
+#include <emmintrin.h>
+/*!
+  \brief Selects maximum value from each entry between bVector and aVector and store their results in the cVector
+  \param cVector The vector where the results will be stored
+  \param aVector The vector to be checked
+  \param bVector The vector to be checked
+  \param num_points The number of values in aVector and bVector to be checked and stored into cVector
+*/
+static inline void volk_64f_max_aligned16_sse2(double* cVector, const double* aVector, const double* bVector, unsigned int num_points){
+    unsigned int number = 0;
+    const unsigned int halfPoints = num_points / 2;
+
+    double* cPtr = cVector;
+    const double* aPtr = aVector;
+    const double* bPtr=  bVector;
+
+    __m128d aVal, bVal, cVal;
+    for(;number < halfPoints; number++){
+      
+      aVal = _mm_load_pd(aPtr); 
+      bVal = _mm_load_pd(bPtr);
+      
+      cVal = _mm_max_pd(aVal, bVal); 
+      
+      _mm_store_pd(cPtr,cVal); // Store the results back into the C container
+
+      aPtr += 2;
+      bPtr += 2;
+      cPtr += 2;
+    }
+
+    number = halfPoints * 2;
+    for(;number < num_points; number++){
+      const double a = *aPtr++;
+      const double b = *bPtr++;
+      *cPtr++ = ( a > b ? a : b);
+    }
+}
+#endif /* LV_HAVE_SSE2 */
+
+#if LV_HAVE_GENERIC
+/*!
+  \brief Selects maximum value from each entry between bVector and aVector and store their results in the cVector
+  \param cVector The vector where the results will be stored
+  \param aVector The vector to be checked
+  \param bVector The vector to be checked
+  \param num_points The number of values in aVector and bVector to be checked and stored into cVector
+*/
+static inline void volk_64f_max_aligned16_generic(double* cVector, const double* aVector, const double* bVector, unsigned int num_points){
+    double* cPtr = cVector;
+    const double* aPtr = aVector;
+    const double* bPtr=  bVector;
+    unsigned int number = 0;
+
+    for(number = 0; number < num_points; number++){
+      const double a = *aPtr++;
+      const double b = *bPtr++;
+      *cPtr++ = ( a > b ? a : b);
+    }
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+#endif /* INCLUDED_VOLK_64f_MAX_ALIGNED16_H */
diff --git a/volk/include/volk/volk_64f_min_aligned16.h b/volk/include/volk/volk_64f_min_aligned16.h
new file mode 100644
index 000000000..acf4d6b2a
--- /dev/null
+++ b/volk/include/volk/volk_64f_min_aligned16.h
@@ -0,0 +1,71 @@
+#ifndef INCLUDED_VOLK_64f_MIN_ALIGNED16_H
+#define INCLUDED_VOLK_64f_MIN_ALIGNED16_H
+
+#include <inttypes.h>
+#include <stdio.h>
+
+#if LV_HAVE_SSE2
+#include <emmintrin.h>
+/*!
+  \brief Selects minimum value from each entry between bVector and aVector and store their results in the cVector
+  \param cVector The vector where the results will be stored
+  \param aVector The vector to be checked
+  \param bVector The vector to be checked
+  \param num_points The number of values in aVector and bVector to be checked and stored into cVector
+*/
+static inline void volk_64f_min_aligned16_sse2(double* cVector, const double* aVector, const double* bVector, unsigned int num_points){
+    unsigned int number = 0;
+    const unsigned int halfPoints = num_points / 2;
+
+    double* cPtr = cVector;
+    const double* aPtr = aVector;
+    const double* bPtr=  bVector;
+
+    __m128d aVal, bVal, cVal;
+    for(;number < halfPoints; number++){
+      
+      aVal = _mm_load_pd(aPtr); 
+      bVal = _mm_load_pd(bPtr);
+      
+      cVal = _mm_min_pd(aVal, bVal); 
+      
+      _mm_store_pd(cPtr,cVal); // Store the results back into the C container
+
+      aPtr += 2;
+      bPtr += 2;
+      cPtr += 2;
+    }
+
+    number = halfPoints * 2;
+    for(;number < num_points; number++){
+      const double a = *aPtr++;
+      const double b = *bPtr++;
+      *cPtr++ = ( a < b ? a : b);
+    }
+}
+#endif /* LV_HAVE_SSE2 */
+
+#if LV_HAVE_GENERIC
+/*!
+  \brief Selects minimum value from each entry between bVector and aVector and store their results in the cVector
+  \param cVector The vector where the results will be stored
+  \param aVector The vector to be checked
+  \param bVector The vector to be checked
+  \param num_points The number of values in aVector and bVector to be checked and stored into cVector
+*/
+static inline void volk_64f_min_aligned16_generic(double* cVector, const double* aVector, const double* bVector, unsigned int num_points){
+    double* cPtr = cVector;
+    const double* aPtr = aVector;
+    const double* bPtr=  bVector;
+    unsigned int number = 0;
+
+    for(number = 0; number < num_points; number++){
+      const double a = *aPtr++;
+      const double b = *bPtr++;
+      *cPtr++ = ( a < b ? a : b);
+    }
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+#endif /* INCLUDED_VOLK_64f_MIN_ALIGNED16_H */
diff --git a/volk/include/volk/volk_64u_byteswap_aligned16.h b/volk/include/volk/volk_64u_byteswap_aligned16.h
new file mode 100644
index 000000000..d5e1b6f30
--- /dev/null
+++ b/volk/include/volk/volk_64u_byteswap_aligned16.h
@@ -0,0 +1,88 @@
+#ifndef INCLUDED_VOLK_64u_BYTESWAP_ALIGNED16_H
+#define INCLUDED_VOLK_64u_BYTESWAP_ALIGNED16_H
+
+#include <inttypes.h>
+#include <stdio.h>
+
+#if LV_HAVE_SSE2
+#include <emmintrin.h>
+
+/*!
+  \brief Byteswaps (in-place) an aligned vector of int64_t's.
+  \param intsToSwap The vector of data to byte swap
+  \param numDataPoints The number of data points
+*/
+static inline void volk_64u_byteswap_aligned16_sse2(uint64_t* intsToSwap, unsigned int num_points){
+    uint32_t* inputPtr = (uint32_t*)intsToSwap;
+    __m128i input, byte1, byte2, byte3, byte4, output;
+    __m128i byte2mask = _mm_set1_epi32(0x00FF0000);
+    __m128i byte3mask = _mm_set1_epi32(0x0000FF00);
+    uint64_t number = 0;
+    const unsigned int halfPoints = num_points / 2;
+    for(;number < halfPoints; number++){
+      // Load the 32t values, increment inputPtr later since we're doing it in-place.
+      input = _mm_load_si128((__m128i*)inputPtr);
+
+      // Do the four shifts
+      byte1 = _mm_slli_epi32(input, 24);
+      byte2 = _mm_slli_epi32(input, 8);
+      byte3 = _mm_srli_epi32(input, 8);
+      byte4 = _mm_srli_epi32(input, 24);
+      // Or bytes together
+      output = _mm_or_si128(byte1, byte4);
+      byte2 = _mm_and_si128(byte2, byte2mask);
+      output = _mm_or_si128(output, byte2);
+      byte3 = _mm_and_si128(byte3, byte3mask);
+      output = _mm_or_si128(output, byte3);
+      
+      // Reorder the two words
+      output = _mm_shuffle_epi32(output, _MM_SHUFFLE(2, 3, 0, 1));
+
+      // Store the results
+      _mm_store_si128((__m128i*)inputPtr, output);
+      inputPtr += 4;
+    }
+  
+    // Byteswap any remaining points:
+    number = halfPoints*2;  
+    for(; number < num_points; number++){
+      uint32_t output1 = *inputPtr;
+      uint32_t output2 = inputPtr[1];
+      
+      output1 = (((output1 >> 24) & 0xff) | ((output1 >> 8) & 0x0000ff00) | ((output1 << 8) & 0x00ff0000) | ((output1 << 24) & 0xff000000));
+      
+      output2 = (((output2 >> 24) & 0xff) | ((output2 >> 8) & 0x0000ff00) | ((output2 << 8) & 0x00ff0000) | ((output2 << 24) & 0xff000000));
+      
+      *inputPtr++ = output2;
+      *inputPtr++ = output1;
+    }
+}
+#endif /* LV_HAVE_SSE2 */
+
+#if LV_HAVE_GENERIC
+/*!
+  \brief Byteswaps (in-place) an aligned vector of int64_t's.
+  \param intsToSwap The vector of data to byte swap
+  \param numDataPoints The number of data points
+*/
+static inline void volk_64u_byteswap_aligned16_generic(uint64_t* intsToSwap, unsigned int num_points){
+  uint32_t* inputPtr = (uint32_t*)intsToSwap;
+  unsigned int point;
+  for(point = 0; point < num_points; point++){
+    uint32_t output1 = *inputPtr;
+    uint32_t output2 = inputPtr[1];
+    
+    output1 = (((output1 >> 24) & 0xff) | ((output1 >> 8) & 0x0000ff00) | ((output1 << 8) & 0x00ff0000) | ((output1 << 24) & 0xff000000));
+    
+    output2 = (((output2 >> 24) & 0xff) | ((output2 >> 8) & 0x0000ff00) | ((output2 << 8) & 0x00ff0000) | ((output2 << 24) & 0xff000000));
+    
+    *inputPtr++ = output2;
+    *inputPtr++ = output1;
+  }
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+
+
+#endif /* INCLUDED_VOLK_64u_BYTESWAP_ALIGNED16_H */
diff --git a/volk/include/volk/volk_64u_popcnt_aligned16.h b/volk/include/volk/volk_64u_popcnt_aligned16.h
new file mode 100644
index 000000000..e8e1396e7
--- /dev/null
+++ b/volk/include/volk/volk_64u_popcnt_aligned16.h
@@ -0,0 +1,74 @@
+#ifndef INCLUDED_VOLK_64u_POPCNT_ALIGNED16_H
+#define INCLUDED_VOLK_64u_POPCNT_ALIGNED16_H
+
+#include <stdio.h>
+#include <inttypes.h>
+
+
+#if LV_HAVE_GENERIC
+
+
+static inline void volk_64u_popcnt_aligned16_generic(uint64_t* ret, const uint64_t value) {
+
+  const uint32_t* valueVector = (const uint32_t*)&value;
+  
+  // This is faster than a lookup table
+  uint32_t retVal = valueVector[0];
+
+  retVal = (retVal & 0x55555555) + (retVal >> 1 & 0x55555555);
+  retVal = (retVal & 0x33333333) + (retVal >> 2 & 0x33333333);
+  retVal = (retVal + (retVal >> 4)) & 0x0F0F0F0F;
+  retVal = (retVal + (retVal >> 8));
+  retVal = (retVal + (retVal >> 16)) & 0x0000003F;
+  uint64_t retVal64  = retVal;
+
+  retVal = valueVector[1];
+  retVal = (retVal & 0x55555555) + (retVal >> 1 & 0x55555555);
+  retVal = (retVal & 0x33333333) + (retVal >> 2 & 0x33333333);
+  retVal = (retVal + (retVal >> 4)) & 0x0F0F0F0F;
+  retVal = (retVal + (retVal >> 8));
+  retVal = (retVal + (retVal >> 16)) & 0x0000003F;
+  retVal64 += retVal;
+
+  *ret = retVal64;
+
+}
+
+#endif /*LV_HAVE_GENERIC*/
+
+#if LV_HAVE_SSE4_2
+
+#include <nmmintrin.h>
+
+static inline void volk_64u_popcnt_aligned16_sse4_2(uint64_t* ret, const uint64_t value) {
+#if LV_64
+  *ret = _mm_popcnt_u64(value);
+#else
+  const uint32_t* valueVector = (const uint32_t*)&value;
+  
+  // This is faster than a lookup table
+  uint32_t retVal = valueVector[0];
+
+  retVal = (retVal & 0x55555555) + (retVal >> 1 & 0x55555555);
+  retVal = (retVal & 0x33333333) + (retVal >> 2 & 0x33333333);
+  retVal = (retVal + (retVal >> 4)) & 0x0F0F0F0F;
+  retVal = (retVal + (retVal >> 8));
+  retVal = (retVal + (retVal >> 16)) & 0x0000003F;
+  uint64_t retVal64  = retVal;
+
+  retVal = valueVector[1];
+  retVal = (retVal & 0x55555555) + (retVal >> 1 & 0x55555555);
+  retVal = (retVal & 0x33333333) + (retVal >> 2 & 0x33333333);
+  retVal = (retVal + (retVal >> 4)) & 0x0F0F0F0F;
+  retVal = (retVal + (retVal >> 8));
+  retVal = (retVal + (retVal >> 16)) & 0x0000003F;
+  retVal64 += retVal;
+
+  *ret = retVal64;
+
+#endif
+}
+
+#endif /*LV_HAVE_SSE4_2*/
+
+#endif /*INCLUDED_VOLK_64u_POPCNT_ALIGNED16_H*/
diff --git a/volk/include/volk/volk_8s_convert_16s_aligned16.h b/volk/include/volk/volk_8s_convert_16s_aligned16.h
new file mode 100644
index 000000000..0efe3c6a1
--- /dev/null
+++ b/volk/include/volk/volk_8s_convert_16s_aligned16.h
@@ -0,0 +1,71 @@
+#ifndef INCLUDED_VOLK_8s_CONVERT_16s_ALIGNED16_H
+#define INCLUDED_VOLK_8s_CONVERT_16s_ALIGNED16_H
+
+#include <inttypes.h>
+#include <stdio.h>
+
+#if LV_HAVE_SSE4_1
+#include <smmintrin.h>
+
+  /*!
+    \brief Converts the input 8 bit integer data into 16 bit integer data
+    \param inputVector The 8 bit input data buffer
+    \param outputVector The 16 bit output data buffer
+    \param num_points The number of data values to be converted
+  */
+static inline void volk_8s_convert_16s_aligned16_sse4_1(int16_t* outputVector, const int8_t* inputVector, unsigned int num_points){
+    unsigned int number = 0;
+    const unsigned int sixteenthPoints = num_points / 16;
+
+    const __m128i* inputVectorPtr = (const __m128i*)inputVector;
+    __m128i* outputVectorPtr = (__m128i*)outputVector;
+    __m128i inputVal;
+    __m128i ret;
+
+    for(;number < sixteenthPoints; number++){
+      inputVal = _mm_load_si128(inputVectorPtr);
+      ret = _mm_cvtepi8_epi16(inputVal);
+      ret = _mm_slli_epi16(ret, 8); // Multiply by 256
+      _mm_store_si128(outputVectorPtr, ret);
+
+      outputVectorPtr++;
+
+      inputVal = _mm_srli_si128(inputVal, 8);
+      ret = _mm_cvtepi8_epi16(inputVal);
+      ret = _mm_slli_epi16(ret, 8); // Multiply by 256
+      _mm_store_si128(outputVectorPtr, ret);
+
+      outputVectorPtr++;
+
+      inputVectorPtr++;
+    }
+
+    number = sixteenthPoints * 16;
+    for(; number < num_points; number++){
+      outputVector[number] = (int16_t)(inputVector[number])*256;
+    }
+}
+#endif /* LV_HAVE_SSE4_1 */
+
+#if LV_HAVE_GENERIC
+  /*!
+    \brief Converts the input 8 bit integer data into 16 bit integer data
+    \param inputVector The 8 bit input data buffer
+    \param outputVector The 16 bit output data buffer
+    \param num_points The number of data values to be converted
+  */
+static inline void volk_8s_convert_16s_aligned16_generic(int16_t* outputVector, const int8_t* inputVector, unsigned int num_points){
+  int16_t* outputVectorPtr = outputVector;
+  const int8_t* inputVectorPtr = inputVector;
+  unsigned int number = 0;
+
+  for(number = 0; number < num_points; number++){
+    *outputVectorPtr++ = ((int16_t)(*inputVectorPtr++)) * 256;
+  }
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+
+
+#endif /* INCLUDED_VOLK_8s_CONVERT_16s_ALIGNED8_H */
diff --git a/volk/include/volk/volk_8s_convert_16s_unaligned16.h b/volk/include/volk/volk_8s_convert_16s_unaligned16.h
new file mode 100644
index 000000000..05b916cea
--- /dev/null
+++ b/volk/include/volk/volk_8s_convert_16s_unaligned16.h
@@ -0,0 +1,73 @@
+#ifndef INCLUDED_VOLK_8s_CONVERT_16s_UNALIGNED16_H
+#define INCLUDED_VOLK_8s_CONVERT_16s_UNALIGNED16_H
+
+#include <inttypes.h>
+#include <stdio.h>
+
+#if LV_HAVE_SSE4_1
+#include <smmintrin.h>
+
+  /*!
+    \brief Converts the input 8 bit integer data into 16 bit integer data
+    \param inputVector The 8 bit input data buffer
+    \param outputVector The 16 bit output data buffer
+    \param num_points The number of data values to be converted
+    \note Input and output buffers do NOT need to be properly aligned
+  */
+static inline void volk_8s_convert_16s_unaligned16_sse4_1(int16_t* outputVector, const int8_t* inputVector, unsigned int num_points){
+    unsigned int number = 0;
+    const unsigned int sixteenthPoints = num_points / 16;
+
+    const __m128i* inputVectorPtr = (const __m128i*)inputVector;
+    __m128i* outputVectorPtr = (__m128i*)outputVector;
+    __m128i inputVal;
+    __m128i ret;
+
+    for(;number < sixteenthPoints; number++){
+      inputVal = _mm_loadu_si128(inputVectorPtr);
+      ret = _mm_cvtepi8_epi16(inputVal);
+      ret = _mm_slli_epi16(ret, 8); // Multiply by 256
+      _mm_storeu_si128(outputVectorPtr, ret);
+
+      outputVectorPtr++;
+
+      inputVal = _mm_srli_si128(inputVal, 8);
+      ret = _mm_cvtepi8_epi16(inputVal);
+      ret = _mm_slli_epi16(ret, 8); // Multiply by 256
+      _mm_storeu_si128(outputVectorPtr, ret);
+
+      outputVectorPtr++;
+
+      inputVectorPtr++;
+    }
+
+    number = sixteenthPoints * 16;
+    for(; number < num_points; number++){
+      outputVector[number] = (int16_t)(inputVector[number])*256;
+    }
+}
+#endif /* LV_HAVE_SSE4_1 */
+
+#if LV_HAVE_GENERIC
+  /*!
+    \brief Converts the input 8 bit integer data into 16 bit integer data
+    \param inputVector The 8 bit input data buffer
+    \param outputVector The 16 bit output data buffer
+    \param num_points The number of data values to be converted
+    \note Input and output buffers do NOT need to be properly aligned
+  */
+static inline void volk_8s_convert_16s_unaligned16_generic(int16_t* outputVector, const int8_t* inputVector, unsigned int num_points){
+  int16_t* outputVectorPtr = outputVector;
+  const int8_t* inputVectorPtr = inputVector;
+  unsigned int number = 0;
+
+  for(number = 0; number < num_points; number++){
+    *outputVectorPtr++ = ((int16_t)(*inputVectorPtr++)) * 256;
+  }
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+
+
+#endif /* INCLUDED_VOLK_8s_CONVERT_16s_UNALIGNED8_H */
diff --git a/volk/include/volk/volk_8s_convert_32f_aligned16.h b/volk/include/volk/volk_8s_convert_32f_aligned16.h
new file mode 100644
index 000000000..54b66ef8f
--- /dev/null
+++ b/volk/include/volk/volk_8s_convert_32f_aligned16.h
@@ -0,0 +1,92 @@
+#ifndef INCLUDED_VOLK_8s_CONVERT_32f_ALIGNED16_H
+#define INCLUDED_VOLK_8s_CONVERT_32f_ALIGNED16_H
+
+#include <inttypes.h>
+#include <stdio.h>
+
+#if LV_HAVE_SSE4_1
+#include <smmintrin.h>
+
+  /*!
+    \brief Converts the input 8 bit integer data into floating point data, and divides the each floating point output data point by the scalar value
+    \param inputVector The 8 bit input data buffer
+    \param outputVector The floating point output data buffer
+    \param scalar The value divided against each point in the output buffer
+    \param num_points The number of data values to be converted
+  */
+static inline void volk_8s_convert_32f_aligned16_sse4_1(float* outputVector, const int8_t* inputVector, const float scalar, unsigned int num_points){
+    unsigned int number = 0;
+    const unsigned int sixteenthPoints = num_points / 16;
+    
+    float* outputVectorPtr = outputVector;
+    const float iScalar = 1.0 / scalar;
+    __m128 invScalar = _mm_set_ps1(iScalar);
+    const int8_t* inputVectorPtr = inputVector;
+    __m128 ret;
+    __m128i inputVal;
+    __m128i interimVal;
+
+    for(;number < sixteenthPoints; number++){
+      inputVal = _mm_load_si128((__m128i*)inputVectorPtr);
+
+      interimVal = _mm_cvtepi8_epi32(inputVal);
+      ret = _mm_cvtepi32_ps(interimVal);
+      ret = _mm_mul_ps(ret, invScalar);
+      _mm_store_ps(outputVectorPtr, ret);
+      outputVectorPtr += 4;
+
+      inputVal = _mm_srli_si128(inputVal, 4);
+      interimVal = _mm_cvtepi8_epi32(inputVal);
+      ret = _mm_cvtepi32_ps(interimVal);
+      ret = _mm_mul_ps(ret, invScalar);
+      _mm_store_ps(outputVectorPtr, ret);
+      outputVectorPtr += 4;
+
+      inputVal = _mm_srli_si128(inputVal, 4);
+      interimVal = _mm_cvtepi8_epi32(inputVal);
+      ret = _mm_cvtepi32_ps(interimVal);
+      ret = _mm_mul_ps(ret, invScalar);
+      _mm_store_ps(outputVectorPtr, ret);
+      outputVectorPtr += 4;
+
+      inputVal = _mm_srli_si128(inputVal, 4);
+      interimVal = _mm_cvtepi8_epi32(inputVal);
+      ret = _mm_cvtepi32_ps(interimVal);
+      ret = _mm_mul_ps(ret, invScalar);
+      _mm_store_ps(outputVectorPtr, ret);
+      outputVectorPtr += 4;
+
+      inputVectorPtr += 16;
+    }
+
+    number = sixteenthPoints * 16;
+    for(; number < num_points; number++){
+      outputVector[number] = (float)(inputVector[number]) * iScalar;
+    }
+}
+#endif /* LV_HAVE_SSE4_1 */
+
+#if LV_HAVE_GENERIC
+  /*!
+    \brief Converts the input 8 bit integer data into floating point data, and divides the each floating point output data point by the scalar value
+    \param inputVector The 8 bit input data buffer
+    \param outputVector The floating point output data buffer
+    \param scalar The value divided against each point in the output buffer
+    \param num_points The number of data values to be converted
+  */
+static inline void volk_8s_convert_32f_aligned16_generic(float* outputVector, const int8_t* inputVector, const float scalar, unsigned int num_points){
+  float* outputVectorPtr = outputVector;
+  const int8_t* inputVectorPtr = inputVector;
+  unsigned int number = 0;
+  const float iScalar = 1.0 / scalar;
+
+  for(number = 0; number < num_points; number++){
+    *outputVectorPtr++ = ((float)(*inputVectorPtr++)) * iScalar;
+  }
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+
+
+#endif /* INCLUDED_VOLK_8s_CONVERT_32f_ALIGNED8_H */
diff --git a/volk/include/volk/volk_8s_convert_32f_unaligned16.h b/volk/include/volk/volk_8s_convert_32f_unaligned16.h
new file mode 100644
index 000000000..8019aac9a
--- /dev/null
+++ b/volk/include/volk/volk_8s_convert_32f_unaligned16.h
@@ -0,0 +1,94 @@
+#ifndef INCLUDED_VOLK_8s_CONVERT_32f_UNALIGNED16_H
+#define INCLUDED_VOLK_8s_CONVERT_32f_UNALIGNED16_H
+
+#include <inttypes.h>
+#include <stdio.h>
+
+#if LV_HAVE_SSE4_1
+#include <smmintrin.h>
+
+  /*!
+    \brief Converts the input 8 bit integer data into floating point data, and divides the each floating point output data point by the scalar value
+    \param inputVector The 8 bit input data buffer
+    \param outputVector The floating point output data buffer
+    \param scalar The value divided against each point in the output buffer
+    \param num_points The number of data values to be converted
+    \note Output buffer does NOT need to be properly aligned
+  */
+static inline void volk_8s_convert_32f_unaligned16_sse4_1(float* outputVector, const int8_t* inputVector, const float scalar, unsigned int num_points){
+    unsigned int number = 0;
+    const unsigned int sixteenthPoints = num_points / 16;
+    
+    float* outputVectorPtr = outputVector;
+    const float iScalar = 1.0 / scalar;
+    __m128 invScalar = _mm_set_ps1( iScalar );
+    const int8_t* inputVectorPtr = inputVector;
+    __m128 ret;
+    __m128i inputVal;
+    __m128i interimVal;
+
+    for(;number < sixteenthPoints; number++){
+      inputVal = _mm_loadu_si128((__m128i*)inputVectorPtr);
+
+      interimVal = _mm_cvtepi8_epi32(inputVal);
+      ret = _mm_cvtepi32_ps(interimVal);
+      ret = _mm_mul_ps(ret, invScalar);
+      _mm_storeu_ps(outputVectorPtr, ret);
+      outputVectorPtr += 4;
+
+      inputVal = _mm_srli_si128(inputVal, 4);
+      interimVal = _mm_cvtepi8_epi32(inputVal);
+      ret = _mm_cvtepi32_ps(interimVal);
+      ret = _mm_mul_ps(ret, invScalar);
+      _mm_storeu_ps(outputVectorPtr, ret);
+      outputVectorPtr += 4;
+
+      inputVal = _mm_srli_si128(inputVal, 4);
+      interimVal = _mm_cvtepi8_epi32(inputVal);
+      ret = _mm_cvtepi32_ps(interimVal);
+      ret = _mm_mul_ps(ret, invScalar);
+      _mm_storeu_ps(outputVectorPtr, ret);
+      outputVectorPtr += 4;
+
+      inputVal = _mm_srli_si128(inputVal, 4);
+      interimVal = _mm_cvtepi8_epi32(inputVal);
+      ret = _mm_cvtepi32_ps(interimVal);
+      ret = _mm_mul_ps(ret, invScalar);
+      _mm_storeu_ps(outputVectorPtr, ret);
+      outputVectorPtr += 4;
+
+      inputVectorPtr += 16;
+    }
+
+    number = sixteenthPoints * 16;
+    for(; number < num_points; number++){
+      outputVector[number] = (float)(inputVector[number]) * iScalar;
+    }
+}
+#endif /* LV_HAVE_SSE4_1 */
+
+#if LV_HAVE_GENERIC
+  /*!
+    \brief Converts the input 8 bit integer data into floating point data, and divides the each floating point output data point by the scalar value
+    \param inputVector The 8 bit input data buffer
+    \param outputVector The floating point output data buffer
+    \param scalar The value divided against each point in the output buffer
+    \param num_points The number of data values to be converted
+    \note Output buffer does NOT need to be properly aligned
+  */
+static inline void volk_8s_convert_32f_unaligned16_generic(float* outputVector, const int8_t* inputVector, const float scalar, unsigned int num_points){
+  float* outputVectorPtr = outputVector;
+  const int8_t* inputVectorPtr = inputVector;
+  unsigned int number = 0;
+  const float iScalar = 1.0 / scalar;
+
+  for(number = 0; number < num_points; number++){
+    *outputVectorPtr++ = ((float)(*inputVectorPtr++)) * iScalar;
+  }
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+
+
+#endif /* INCLUDED_VOLK_8s_CONVERT_32f_UNALIGNED8_H */
diff --git a/volk/include/volk/volk_8sc_deinterleave_16s_aligned16.h b/volk/include/volk/volk_8sc_deinterleave_16s_aligned16.h
new file mode 100644
index 000000000..38eaa49ea
--- /dev/null
+++ b/volk/include/volk/volk_8sc_deinterleave_16s_aligned16.h
@@ -0,0 +1,77 @@
+#ifndef INCLUDED_VOLK_8sc_DEINTERLEAVE_16S_ALIGNED16_H
+#define INCLUDED_VOLK_8sc_DEINTERLEAVE_16S_ALIGNED16_H
+
+#include <inttypes.h>
+#include <stdio.h>
+
+#if LV_HAVE_SSE4_1
+#include <smmintrin.h>
+/*!
+  \brief Deinterleaves the complex 8 bit vector into I & Q 16 bit vector data
+  \param complexVector The complex input vector
+  \param iBuffer The I buffer output data
+  \param qBuffer The Q buffer output data
+  \param num_points The number of complex data values to be deinterleaved
+*/
+static inline void volk_8sc_deinterleave_16s_aligned16_sse4_1(int16_t* iBuffer, int16_t* qBuffer, const lv_8sc_t* complexVector, unsigned int num_points){
+  unsigned int number = 0;
+  const int8_t* complexVectorPtr = (int8_t*)complexVector;
+  int16_t* iBufferPtr = iBuffer;
+  int16_t* qBufferPtr = qBuffer;
+  __m128i iMoveMask = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 14, 12, 10, 8, 6, 4, 2, 0);
+  __m128i qMoveMask = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 15, 13, 11, 9, 7, 5, 3, 1);
+  __m128i complexVal, iOutputVal, qOutputVal;
+
+  unsigned int eighthPoints = num_points / 8;
+
+  for(number = 0; number < eighthPoints; number++){
+    complexVal = _mm_load_si128((__m128i*)complexVectorPtr);  complexVectorPtr += 16;
+
+    iOutputVal = _mm_shuffle_epi8(complexVal, iMoveMask);
+    qOutputVal = _mm_shuffle_epi8(complexVal, qMoveMask);
+
+    iOutputVal = _mm_cvtepi8_epi16(iOutputVal);
+    iOutputVal = _mm_slli_epi16(iOutputVal, 8);
+
+    qOutputVal = _mm_cvtepi8_epi16(qOutputVal);
+    qOutputVal = _mm_slli_epi16(qOutputVal, 8);
+
+    _mm_store_si128((__m128i*)iBufferPtr, iOutputVal);
+    _mm_store_si128((__m128i*)qBufferPtr, qOutputVal);
+
+    iBufferPtr += 8;
+    qBufferPtr += 8;
+  }
+
+  number = eighthPoints * 8;
+  for(; number < num_points; number++){
+    *iBufferPtr++ = ((int16_t)*complexVectorPtr++) * 256;
+    *qBufferPtr++ = ((int16_t)*complexVectorPtr++) * 256;
+  }
+}
+#endif /* LV_HAVE_SSE4_1 */
+
+#if LV_HAVE_GENERIC
+/*!
+  \brief Deinterleaves the complex 8 bit vector into I & Q 16 bit vector data
+  \param complexVector The complex input vector
+  \param iBuffer The I buffer output data
+  \param qBuffer The Q buffer output data
+  \param num_points The number of complex data values to be deinterleaved
+*/
+static inline void volk_8sc_deinterleave_16s_aligned16_generic(int16_t* iBuffer, int16_t* qBuffer, const lv_8sc_t* complexVector, unsigned int num_points){
+  const int8_t* complexVectorPtr = (const int8_t*)complexVector;
+  int16_t* iBufferPtr = iBuffer;
+  int16_t* qBufferPtr = qBuffer;
+  unsigned int number;
+  for(number = 0; number < num_points; number++){
+    *iBufferPtr++ = (int16_t)(*complexVectorPtr++)*256;
+    *qBufferPtr++ = (int16_t)(*complexVectorPtr++)*256;
+  }
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+
+
+#endif /* INCLUDED_VOLK_8sc_DEINTERLEAVE_16S_ALIGNED16_H */
diff --git a/volk/include/volk/volk_8sc_deinterleave_32f_aligned16.h b/volk/include/volk/volk_8sc_deinterleave_32f_aligned16.h
new file mode 100644
index 000000000..d0c118965
--- /dev/null
+++ b/volk/include/volk/volk_8sc_deinterleave_32f_aligned16.h
@@ -0,0 +1,164 @@
+#ifndef INCLUDED_VOLK_8sc_DEINTERLEAVE_32F_ALIGNED16_H
+#define INCLUDED_VOLK_8sc_DEINTERLEAVE_32F_ALIGNED16_H
+
+#include <inttypes.h>
+#include <stdio.h>
+
+#if LV_HAVE_SSE4_1
+#include <smmintrin.h>
+/*!
+  \brief Deinterleaves the complex 8 bit vector into I & Q floating point vector data
+  \param complexVector The complex input vector
+  \param iBuffer The I buffer output data
+  \param qBuffer The Q buffer output data
+  \param scalar The scaling value being multiplied against each data point
+  \param num_points The number of complex data values to be deinterleaved
+*/
+static inline void volk_8sc_deinterleave_32f_aligned16_sse4_1(float* iBuffer, float* qBuffer, const lv_8sc_t* complexVector, const float scalar, unsigned int num_points){
+  float* iBufferPtr = iBuffer;
+  float* qBufferPtr = qBuffer;
+
+  unsigned int number = 0;
+  const unsigned int eighthPoints = num_points / 8;    
+  __m128 iFloatValue, qFloatValue;
+
+  const float iScalar= 1.0 / scalar;
+  __m128 invScalar = _mm_set_ps1(iScalar);
+  __m128i complexVal, iIntVal, qIntVal, iComplexVal, qComplexVal;
+  int8_t* complexVectorPtr = (int8_t*)complexVector;
+
+  __m128i iMoveMask = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 14, 12, 10, 8, 6, 4, 2, 0);
+  __m128i qMoveMask = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 15, 13, 11, 9, 7, 5, 3, 1);
+
+  for(;number < eighthPoints; number++){
+    complexVal = _mm_load_si128((__m128i*)complexVectorPtr); complexVectorPtr += 16;
+    iComplexVal = _mm_shuffle_epi8(complexVal, iMoveMask);
+    qComplexVal = _mm_shuffle_epi8(complexVal, qMoveMask);
+
+    iIntVal = _mm_cvtepi8_epi32(iComplexVal);
+    iFloatValue = _mm_cvtepi32_ps(iIntVal);
+    iFloatValue = _mm_mul_ps(iFloatValue, invScalar);
+    _mm_store_ps(iBufferPtr, iFloatValue);
+    iBufferPtr += 4;
+
+    iComplexVal = _mm_srli_si128(iComplexVal, 4);
+
+    iIntVal = _mm_cvtepi8_epi32(iComplexVal);
+    iFloatValue = _mm_cvtepi32_ps(iIntVal);
+    iFloatValue = _mm_mul_ps(iFloatValue, invScalar);
+    _mm_store_ps(iBufferPtr, iFloatValue);
+    iBufferPtr += 4;
+
+    qIntVal = _mm_cvtepi8_epi32(qComplexVal);
+    qFloatValue = _mm_cvtepi32_ps(qIntVal);
+    qFloatValue = _mm_mul_ps(qFloatValue, invScalar);
+    _mm_store_ps(qBufferPtr, qFloatValue);
+    qBufferPtr += 4;
+
+    qComplexVal = _mm_srli_si128(qComplexVal, 4);
+
+    qIntVal = _mm_cvtepi8_epi32(qComplexVal);
+    qFloatValue = _mm_cvtepi32_ps(qIntVal);
+    qFloatValue = _mm_mul_ps(qFloatValue, invScalar);
+    _mm_store_ps(qBufferPtr, qFloatValue);
+
+    qBufferPtr += 4;
+  }
+
+  number = eighthPoints * 8;
+  for(; number < num_points; number++){
+    *iBufferPtr++ = (float)(*complexVectorPtr++) * iScalar;
+    *qBufferPtr++ = (float)(*complexVectorPtr++) * iScalar;
+  }
+    
+}
+#endif /* LV_HAVE_SSE4_1 */
+
+#if LV_HAVE_SSE
+#include <xmmintrin.h>
+/*!
+  \brief Deinterleaves the complex 8 bit vector into I & Q floating point vector data
+  \param complexVector The complex input vector
+  \param iBuffer The I buffer output data
+  \param qBuffer The Q buffer output data
+  \param scalar The scaling value being multiplied against each data point
+  \param num_points The number of complex data values to be deinterleaved
+*/
+static inline void volk_8sc_deinterleave_32f_aligned16_sse(float* iBuffer, float* qBuffer, const lv_8sc_t* complexVector, const float scalar, unsigned int num_points){
+  float* iBufferPtr = iBuffer;
+  float* qBufferPtr = qBuffer;
+
+  unsigned int number = 0;
+  const unsigned int quarterPoints = num_points / 4;    
+  __m128 cplxValue1, cplxValue2, iValue, qValue;
+
+  __m128 invScalar = _mm_set_ps1(1.0/scalar);
+  int8_t* complexVectorPtr = (int8_t*)complexVector;
+
+  float floatBuffer[8] __attribute__((aligned(128)));
+
+  for(;number < quarterPoints; number++){
+    floatBuffer[0] = (float)(complexVectorPtr[0]);
+    floatBuffer[1] = (float)(complexVectorPtr[1]);
+    floatBuffer[2] = (float)(complexVectorPtr[2]);
+    floatBuffer[3] = (float)(complexVectorPtr[3]);
+      
+    floatBuffer[4] = (float)(complexVectorPtr[4]);
+    floatBuffer[5] = (float)(complexVectorPtr[5]);
+    floatBuffer[6] = (float)(complexVectorPtr[6]);
+    floatBuffer[7] = (float)(complexVectorPtr[7]);
+
+    cplxValue1 = _mm_load_ps(&floatBuffer[0]);
+    cplxValue2 = _mm_load_ps(&floatBuffer[4]);
+
+    complexVectorPtr += 8;
+
+    cplxValue1 = _mm_mul_ps(cplxValue1, invScalar);
+    cplxValue2 = _mm_mul_ps(cplxValue2, invScalar);
+
+    // Arrange in i1i2i3i4 format
+    iValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2,0,2,0));
+    qValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(3,1,3,1));
+
+    _mm_store_ps(iBufferPtr, iValue);
+    _mm_store_ps(qBufferPtr, qValue);
+
+    iBufferPtr += 4;
+    qBufferPtr += 4;
+  }
+
+  number = quarterPoints * 4;
+  complexVectorPtr = (int8_t*)&complexVector[number];
+  for(; number < num_points; number++){
+    *iBufferPtr++ = (float)(*complexVectorPtr++) / scalar;
+    *qBufferPtr++ = (float)(*complexVectorPtr++) / scalar;
+  }
+}
+#endif /* LV_HAVE_SSE */
+
+#if LV_HAVE_GENERIC
+/*!
+  \brief Deinterleaves the complex 8 bit vector into I & Q floating point vector data
+  \param complexVector The complex input vector
+  \param iBuffer The I buffer output data
+  \param qBuffer The Q buffer output data
+  \param scalar The scaling value being multiplied against each data point
+  \param num_points The number of complex data values to be deinterleaved
+*/
+static inline void volk_8sc_deinterleave_32f_aligned16_generic(float* iBuffer, float* qBuffer, const lv_8sc_t* complexVector, const float scalar, unsigned int num_points){
+  const int8_t* complexVectorPtr = (const int8_t*)complexVector;
+  float* iBufferPtr = iBuffer;
+  float* qBufferPtr = qBuffer;
+  unsigned int number;
+  const float invScalar = 1.0 / scalar;
+  for(number = 0; number < num_points; number++){
+    *iBufferPtr++ = (float)(*complexVectorPtr++)*invScalar;
+    *qBufferPtr++ = (float)(*complexVectorPtr++)*invScalar;
+  }
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+
+
+#endif /* INCLUDED_VOLK_8sc_DEINTERLEAVE_32F_ALIGNED16_H */
diff --git a/volk/include/volk/volk_8sc_deinterleave_real_16s_aligned16.h b/volk/include/volk/volk_8sc_deinterleave_real_16s_aligned16.h
new file mode 100644
index 000000000..d0cb49494
--- /dev/null
+++ b/volk/include/volk/volk_8sc_deinterleave_real_16s_aligned16.h
@@ -0,0 +1,66 @@
+#ifndef INCLUDED_VOLK_8sc_DEINTERLEAVE_REAL_16s_ALIGNED16_H
+#define INCLUDED_VOLK_8sc_DEINTERLEAVE_REAL_16s_ALIGNED16_H
+
+#include <inttypes.h>
+#include <stdio.h>
+
+#if LV_HAVE_SSE4_1
+#include <smmintrin.h>
+/*!
+  \brief Deinterleaves the complex 8 bit vector into I 16 bit vector data
+  \param complexVector The complex input vector
+  \param iBuffer The I buffer output data
+  \param num_points The number of complex data values to be deinterleaved
+*/
+static inline void volk_8sc_deinterleave_real_16s_aligned16_sse4_1(int16_t* iBuffer, const lv_8sc_t* complexVector, unsigned int num_points){
+  unsigned int number = 0;
+  const int8_t* complexVectorPtr = (int8_t*)complexVector;
+  int16_t* iBufferPtr = iBuffer;
+  __m128i moveMask = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 14, 12, 10, 8, 6, 4, 2, 0);
+  __m128i complexVal, outputVal;
+
+  unsigned int eighthPoints = num_points / 8;
+
+  for(number = 0; number < eighthPoints; number++){
+    complexVal = _mm_load_si128((__m128i*)complexVectorPtr);  complexVectorPtr += 16;
+
+    complexVal = _mm_shuffle_epi8(complexVal, moveMask);
+
+    outputVal = _mm_cvtepi8_epi16(complexVal);
+    outputVal = _mm_slli_epi16(outputVal, 7);
+
+    _mm_store_si128((__m128i*)iBufferPtr, outputVal);
+    iBufferPtr += 8;
+  }
+
+  number = eighthPoints * 8;
+  for(; number < num_points; number++){
+    *iBufferPtr++ = ((int16_t)*complexVectorPtr++) * 128;
+    complexVectorPtr++;
+  }
+}
+#endif /* LV_HAVE_SSE4_1 */
+
+
+#if LV_HAVE_GENERIC
+/*!
+  \brief Deinterleaves the complex 8 bit vector into I 16 bit vector data
+  \param complexVector The complex input vector
+  \param iBuffer The I buffer output data
+  \param num_points The number of complex data values to be deinterleaved
+*/
+static inline void volk_8sc_deinterleave_real_16s_aligned16_generic(int16_t* iBuffer, const lv_8sc_t* complexVector, unsigned int num_points){
+  unsigned int number = 0;
+  const int8_t* complexVectorPtr = (const int8_t*)complexVector;
+  int16_t* iBufferPtr = iBuffer;
+  for(number = 0; number < num_points; number++){
+    *iBufferPtr++ = ((int16_t)(*complexVectorPtr++)) * 128;
+    complexVectorPtr++;
+  }
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+
+
+#endif /* INCLUDED_VOLK_8sc_DEINTERLEAVE_REAL_16s_ALIGNED16_H */
diff --git a/volk/include/volk/volk_8sc_deinterleave_real_32f_aligned16.h b/volk/include/volk/volk_8sc_deinterleave_real_32f_aligned16.h
new file mode 100644
index 000000000..c849448ea
--- /dev/null
+++ b/volk/include/volk/volk_8sc_deinterleave_real_32f_aligned16.h
@@ -0,0 +1,133 @@
+#ifndef INCLUDED_VOLK_8sc_DEINTERLEAVE_REAL_32f_ALIGNED16_H
+#define INCLUDED_VOLK_8sc_DEINTERLEAVE_REAL_32f_ALIGNED16_H
+
+#include <inttypes.h>
+#include <stdio.h>
+
+#if LV_HAVE_SSE4_1
+#include <smmintrin.h>
+/*!
+  \brief Deinterleaves the complex 8 bit vector into I float vector data
+  \param complexVector The complex input vector
+  \param iBuffer The I buffer output data
+  \param scalar The scaling value being multiplied against each data point
+  \param num_points The number of complex data values to be deinterleaved
+*/
+static inline void volk_8sc_deinterleave_real_32f_aligned16_sse4_1(float* iBuffer, const lv_8sc_t* complexVector, const float scalar, unsigned int num_points){
+  float* iBufferPtr = iBuffer;
+
+  unsigned int number = 0;
+  const unsigned int eighthPoints = num_points / 8;    
+  __m128 iFloatValue;
+
+  const float iScalar= 1.0 / scalar;
+  __m128 invScalar = _mm_set_ps1(iScalar);
+  __m128i complexVal, iIntVal;
+  int8_t* complexVectorPtr = (int8_t*)complexVector;
+
+  __m128i moveMask = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 14, 12, 10, 8, 6, 4, 2, 0);
+
+  for(;number < eighthPoints; number++){
+    complexVal = _mm_load_si128((__m128i*)complexVectorPtr); complexVectorPtr += 16;
+    complexVal = _mm_shuffle_epi8(complexVal, moveMask);
+
+    iIntVal = _mm_cvtepi8_epi32(complexVal);
+    iFloatValue = _mm_cvtepi32_ps(iIntVal);
+
+    iFloatValue = _mm_mul_ps(iFloatValue, invScalar);
+
+    _mm_store_ps(iBufferPtr, iFloatValue);
+
+    iBufferPtr += 4;
+
+    complexVal = _mm_srli_si128(complexVal, 4);
+    iIntVal = _mm_cvtepi8_epi32(complexVal);
+    iFloatValue = _mm_cvtepi32_ps(iIntVal);
+
+    iFloatValue = _mm_mul_ps(iFloatValue, invScalar);
+
+    _mm_store_ps(iBufferPtr, iFloatValue);
+
+    iBufferPtr += 4;
+  }
+
+  number = eighthPoints * 8;
+  for(; number < num_points; number++){
+    *iBufferPtr++ = (float)(*complexVectorPtr++) * iScalar;
+    complexVectorPtr++;
+  }
+    
+}
+#endif /* LV_HAVE_SSE4_1 */
+
+
+#if LV_HAVE_SSE
+#include <xmmintrin.h>
+/*!
+  \brief Deinterleaves the complex 8 bit vector into I float vector data
+  \param complexVector The complex input vector
+  \param iBuffer The I buffer output data
+  \param scalar The scaling value being multiplied against each data point
+  \param num_points The number of complex data values to be deinterleaved
+*/
+static inline void volk_8sc_deinterleave_real_32f_aligned16_sse(float* iBuffer, const lv_8sc_t* complexVector, const float scalar, unsigned int num_points){
+  float* iBufferPtr = iBuffer;
+
+  unsigned int number = 0;
+  const unsigned int quarterPoints = num_points / 4;    
+  __m128 iValue;
+
+  const float iScalar= 1.0 / scalar;
+  __m128 invScalar = _mm_set_ps1(iScalar);
+  int8_t* complexVectorPtr = (int8_t*)complexVector;
+
+  float floatBuffer[4] __attribute__((aligned(128)));
+
+  for(;number < quarterPoints; number++){
+    floatBuffer[0] = (float)(*complexVectorPtr); complexVectorPtr += 2;
+    floatBuffer[1] = (float)(*complexVectorPtr); complexVectorPtr += 2;
+    floatBuffer[2] = (float)(*complexVectorPtr); complexVectorPtr += 2;
+    floatBuffer[3] = (float)(*complexVectorPtr); complexVectorPtr += 2; 
+
+    iValue = _mm_load_ps(floatBuffer);
+
+    iValue = _mm_mul_ps(iValue, invScalar);
+
+    _mm_store_ps(iBufferPtr, iValue);
+
+    iBufferPtr += 4;
+  }
+
+  number = quarterPoints * 4;
+  for(; number < num_points; number++){
+    *iBufferPtr++ = (float)(*complexVectorPtr++) * iScalar;
+    complexVectorPtr++;
+  }
+    
+}
+#endif /* LV_HAVE_SSE */
+
+#if LV_HAVE_GENERIC
+/*!
+  \brief Deinterleaves the complex 8 bit vector into I float vector data
+  \param complexVector The complex input vector
+  \param iBuffer The I buffer output data
+  \param scalar The scaling value being multiplied against each data point
+  \param num_points The number of complex data values to be deinterleaved
+*/
+static inline void volk_8sc_deinterleave_real_32f_aligned16_generic(float* iBuffer, const lv_8sc_t* complexVector, const float scalar, unsigned int num_points){
+  unsigned int number = 0;
+  const int8_t* complexVectorPtr = (const int8_t*)complexVector;
+  float* iBufferPtr = iBuffer;
+  const float invScalar = 1.0 / scalar;
+  for(number = 0; number < num_points; number++){
+    *iBufferPtr++ = ((float)(*complexVectorPtr++)) * invScalar;
+    complexVectorPtr++;
+  }
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+
+
+#endif /* INCLUDED_VOLK_8sc_DEINTERLEAVE_REAL_32f_ALIGNED16_H */
diff --git a/volk/include/volk/volk_8sc_deinterleave_real_8s_aligned16.h b/volk/include/volk/volk_8sc_deinterleave_real_8s_aligned16.h
new file mode 100644
index 000000000..d84d64568
--- /dev/null
+++ b/volk/include/volk/volk_8sc_deinterleave_real_8s_aligned16.h
@@ -0,0 +1,67 @@
+#ifndef INCLUDED_VOLK_8sc_DEINTERLEAVE_REAL_8s_ALIGNED8_H
+#define INCLUDED_VOLK_8sc_DEINTERLEAVE_REAL_8s_ALIGNED8_H
+
+#include <inttypes.h>
+#include <stdio.h>
+
+#if LV_HAVE_SSSE3
+#include <tmmintrin.h>
+/*!
+  \brief Deinterleaves the complex 8 bit vector into I vector data
+  \param complexVector The complex input vector
+  \param iBuffer The I buffer output data
+  \param num_points The number of complex data values to be deinterleaved
+*/
+static inline void volk_8sc_deinterleave_real_8s_aligned16_ssse3(int8_t* iBuffer, const lv_8sc_t* complexVector, unsigned int num_points){
+  unsigned int number = 0;
+  const int8_t* complexVectorPtr = (int8_t*)complexVector;
+  int8_t* iBufferPtr = iBuffer;
+  __m128i moveMask1 = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 14, 12, 10, 8, 6, 4, 2, 0);
+  __m128i moveMask2 = _mm_set_epi8(14, 12, 10, 8, 6, 4, 2, 0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80);
+  __m128i complexVal1, complexVal2, outputVal;
+
+  unsigned int sixteenthPoints = num_points / 16;
+
+  for(number = 0; number < sixteenthPoints; number++){
+    complexVal1 = _mm_load_si128((__m128i*)complexVectorPtr);  complexVectorPtr += 16;
+    complexVal2 = _mm_load_si128((__m128i*)complexVectorPtr);  complexVectorPtr += 16;
+
+    complexVal1 = _mm_shuffle_epi8(complexVal1, moveMask1);
+    complexVal2 = _mm_shuffle_epi8(complexVal2, moveMask2);
+
+    outputVal = _mm_or_si128(complexVal1, complexVal2);
+
+    _mm_store_si128((__m128i*)iBufferPtr, outputVal);
+    iBufferPtr += 16;
+  }
+
+  number = sixteenthPoints * 16;
+  for(; number < num_points; number++){
+    *iBufferPtr++ = *complexVectorPtr++;
+    complexVectorPtr++;
+  }
+}
+#endif /* LV_HAVE_SSSE3 */
+
+#if LV_HAVE_GENERIC
+/*!
+  \brief Deinterleaves the complex 8 bit vector into I vector data
+  \param complexVector The complex input vector
+  \param iBuffer The I buffer output data
+  \param num_points The number of complex data values to be deinterleaved
+*/
+static inline void volk_8sc_deinterleave_real_8s_aligned16_generic(int8_t* iBuffer, const lv_8sc_t* complexVector, unsigned int num_points){
+  unsigned int number = 0;
+  const int8_t* complexVectorPtr = (int8_t*)complexVector;
+  int8_t* iBufferPtr = iBuffer;
+  for(number = 0; number < num_points; number++){
+    *iBufferPtr++ = *complexVectorPtr++;
+    complexVectorPtr++;
+  }
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+
+
+#endif /* INCLUDED_VOLK_8sc_DEINTERLEAVE_REAL_8s_ALIGNED8_H */
diff --git a/volk/include/volk/volk_8sc_multiply_conjugate_16sc_aligned16.h b/volk/include/volk/volk_8sc_multiply_conjugate_16sc_aligned16.h
new file mode 100644
index 000000000..470a67539
--- /dev/null
+++ b/volk/include/volk/volk_8sc_multiply_conjugate_16sc_aligned16.h
@@ -0,0 +1,102 @@
+#ifndef INCLUDED_VOLK_8sc_MULTIPLY_CONJUGATE_16sc_ALIGNED16_H
+#define INCLUDED_VOLK_8sc_MULTIPLY_CONJUGATE_16sc_ALIGNED16_H
+
+#include <inttypes.h>
+#include <stdio.h>
+#include <volk/volk_complex.h>
+
+#if LV_HAVE_SSE4_1
+#include <smmintrin.h>
+/*!
+  \brief Multiplys the one complex vector with the complex conjugate of the second complex vector and stores their results in the third vector
+  \param cVector The complex vector where the results will be stored
+  \param aVector One of the complex vectors to be multiplied
+  \param bVector The complex vector which will be converted to complex conjugate and multiplied
+  \param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector
+*/
+static inline void volk_8sc_multiply_conjugate_16sc_aligned16_sse4_1(lv_16sc_t* cVector, const lv_8sc_t* aVector, const lv_8sc_t* bVector, unsigned int num_points){
+  unsigned int number = 0;
+  const unsigned int quarterPoints = num_points / 4;
+
+  __m128i x, y, realz, imagz;
+  lv_16sc_t* c = cVector;
+  const lv_8sc_t* a = aVector;
+  const lv_8sc_t* b = bVector;
+  __m128i conjugateSign = _mm_set_epi16(-1, 1, -1, 1, -1, 1, -1, 1);
+  const int shuffleMask = _MM_SHUFFLE(2,3,0,1);
+    
+  for(;number < quarterPoints; number++){
+    // Convert into 8 bit values into 16 bit values
+    x = _mm_cvtepi8_epi16(_mm_movpi64_epi64(*(__m64*)a));
+    y = _mm_cvtepi8_epi16(_mm_movpi64_epi64(*(__m64*)b));
+      
+    // Calculate the ar*cr - ai*(-ci) portions
+    realz = _mm_madd_epi16(x,y);
+      
+    // Calculate the complex conjugate of the cr + ci j values
+    y = _mm_sign_epi16(y, conjugateSign);
+
+    // Shift the order of the cr and ci values
+    y = _mm_shufflehi_epi16(_mm_shufflelo_epi16(y, shuffleMask ), shuffleMask);
+
+    // Calculate the ar*(-ci) + cr*(ai)
+    imagz = _mm_madd_epi16(x,y);
+
+    _mm_store_si128((__m128i*)c, _mm_packs_epi32(_mm_unpacklo_epi32(realz, imagz), _mm_unpackhi_epi32(realz, imagz)));
+
+    a += 4;
+    b += 4;
+    c += 4;
+  }
+    
+  number = quarterPoints * 4;
+  int16_t* c16Ptr = (int16_t*)&cVector[number];
+  int8_t* a8Ptr = (int8_t*)&aVector[number];
+  int8_t* b8Ptr = (int8_t*)&bVector[number];
+  for(; number < num_points; number++){
+    float aReal =  (float)*a8Ptr++;
+    float aImag =  (float)*a8Ptr++;
+    lv_32fc_t aVal = lv_32fc_init(aReal, aImag );
+    float bReal = (float)*b8Ptr++;
+    float bImag = (float)*b8Ptr++;
+    lv_32fc_t bVal = lv_32fc_init( bReal, -bImag );
+    lv_32fc_t temp = aVal * bVal;
+
+    *c16Ptr++ = (int16_t)lv_creal(temp);
+    *c16Ptr++ = (int16_t)lv_cimag(temp);
+  }
+}
+#endif /* LV_HAVE_SSE4_1 */
+
+#if LV_HAVE_GENERIC
+/*!
+  \brief Multiplys the one complex vector with the complex conjugate of the second complex vector and stores their results in the third vector
+  \param cVector The complex vector where the results will be stored
+  \param aVector One of the complex vectors to be multiplied
+  \param bVector The complex vector which will be converted to complex conjugate and multiplied
+  \param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector
+*/
+static inline void volk_8sc_multiply_conjugate_16sc_aligned16_generic(lv_16sc_t* cVector, const lv_8sc_t* aVector, const lv_8sc_t* bVector, unsigned int num_points){
+  unsigned int number = 0;
+  int16_t* c16Ptr = (int16_t*)cVector;
+  int8_t* a8Ptr = (int8_t*)aVector;
+  int8_t* b8Ptr = (int8_t*)bVector;
+  for(number =0; number < num_points; number++){
+    float aReal =  (float)*a8Ptr++;
+    float aImag =  (float)*a8Ptr++;
+    lv_32fc_t aVal = lv_32fc_init(aReal, aImag );
+    float bReal = (float)*b8Ptr++;
+    float bImag = (float)*b8Ptr++;
+    lv_32fc_t bVal = lv_32fc_init( bReal, -bImag );
+    lv_32fc_t temp = aVal * bVal;
+
+    *c16Ptr++ = (int16_t)lv_creal(temp);
+    *c16Ptr++ = (int16_t)lv_cimag(temp);
+  }
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+
+
+#endif /* INCLUDED_VOLK_8sc_MULTIPLY_CONJUGATE_16sc_ALIGNED16_H */
diff --git a/volk/include/volk/volk_8sc_multiply_conjugate_32fc_aligned16.h b/volk/include/volk/volk_8sc_multiply_conjugate_32fc_aligned16.h
new file mode 100644
index 000000000..52b444cf7
--- /dev/null
+++ b/volk/include/volk/volk_8sc_multiply_conjugate_32fc_aligned16.h
@@ -0,0 +1,122 @@
+#ifndef INCLUDED_VOLK_8sc_MULTIPLY_CONJUGATE_32fc_ALIGNED16_H
+#define INCLUDED_VOLK_8sc_MULTIPLY_CONJUGATE_32fc_ALIGNED16_H
+
+#include <inttypes.h>
+#include <stdio.h>
+#include <volk/volk_complex.h>
+
+#if LV_HAVE_SSE4_1
+#include <smmintrin.h>
+/*!
+  \brief Multiplys the one complex vector with the complex conjugate of the second complex vector and stores their results in the third vector
+  \param cVector The complex vector where the results will be stored
+  \param aVector One of the complex vectors to be multiplied
+  \param bVector The complex vector which will be converted to complex conjugate and multiplied
+  \param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector
+*/
+static inline void volk_8sc_multiply_conjugate_32fc_aligned16_sse4_1(lv_32fc_t* cVector, const lv_8sc_t* aVector, const lv_8sc_t* bVector, const float scalar, unsigned int num_points){
+  unsigned int number = 0;
+  const unsigned int quarterPoints = num_points / 4;
+
+  __m128i x, y, realz, imagz;
+  __m128 ret;
+  lv_32fc_t* c = cVector;
+  const lv_8sc_t* a = aVector;
+  const lv_8sc_t* b = bVector;
+  __m128i conjugateSign = _mm_set_epi16(-1, 1, -1, 1, -1, 1, -1, 1);
+  const int shuffleMask = _MM_SHUFFLE(2,3,0,1);
+  __m128 invScalar = _mm_set_ps1(1.0/scalar);
+
+  for(;number < quarterPoints; number++){
+    // Convert into 8 bit values into 16 bit values
+    x = _mm_cvtepi8_epi16(_mm_movpi64_epi64(*(__m64*)a));
+    y = _mm_cvtepi8_epi16(_mm_movpi64_epi64(*(__m64*)b));
+
+    // Calculate the ar*cr - ai*(-ci) portions
+    realz = _mm_madd_epi16(x,y);
+
+    // Calculate the complex conjugate of the cr + ci j values
+    y = _mm_sign_epi16(y, conjugateSign);
+
+    // Shift the order of the cr and ci values
+    y = _mm_shufflehi_epi16(_mm_shufflelo_epi16(y, shuffleMask ), shuffleMask);
+
+    // Calculate the ar*(-ci) + cr*(ai)
+    imagz = _mm_madd_epi16(x,y);
+
+    // Interleave real and imaginary and then convert to float values
+    ret = _mm_cvtepi32_ps(_mm_unpacklo_epi32(realz, imagz));
+
+    // Normalize the floating point values
+    ret = _mm_mul_ps(ret, invScalar);
+
+    // Store the floating point values
+    _mm_store_ps((float*)c, ret);
+    c += 2;
+
+    // Interleave real and imaginary and then convert to float values
+    ret = _mm_cvtepi32_ps(_mm_unpackhi_epi32(realz, imagz));
+
+    // Normalize the floating point values
+    ret = _mm_mul_ps(ret, invScalar);
+
+    // Store the floating point values
+    _mm_store_ps((float*)c, ret);
+    c += 2;
+
+    a += 4;
+    b += 4;
+  }
+
+  number = quarterPoints * 4;
+  float* cFloatPtr = (float*)&cVector[number];
+  int8_t* a8Ptr = (int8_t*)&aVector[number];
+  int8_t* b8Ptr = (int8_t*)&bVector[number];
+  for(; number < num_points; number++){
+    float aReal =  (float)*a8Ptr++;
+    float aImag =  (float)*a8Ptr++;
+    lv_32fc_t aVal = lv_32fc_init(aReal, aImag );
+    float bReal = (float)*b8Ptr++;
+    float bImag = (float)*b8Ptr++;
+    lv_32fc_t bVal = lv_32fc_init( bReal, -bImag );
+    lv_32fc_t temp = aVal * bVal;
+    
+    *cFloatPtr++ = lv_creal(temp) / scalar;
+    *cFloatPtr++ = lv_cimag(temp) / scalar;
+  }
+}
+#endif /* LV_HAVE_SSE4_1 */
+
+#if LV_HAVE_GENERIC
+/*!
+  \brief Multiplys the one complex vector with the complex conjugate of the second complex vector and stores their results in the third vector
+  \param cVector The complex vector where the results will be stored
+  \param aVector One of the complex vectors to be multiplied
+  \param bVector The complex vector which will be converted to complex conjugate and multiplied
+  \param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector
+*/
+static inline void volk_8sc_multiply_conjugate_32fc_aligned16_generic(lv_32fc_t* cVector, const lv_8sc_t* aVector, const lv_8sc_t* bVector, const float scalar, unsigned int num_points){
+  unsigned int number = 0;
+  float* cPtr = (float*)cVector;
+  const float invScalar = 1.0 / scalar;
+  int8_t* a8Ptr = (int8_t*)aVector;
+  int8_t* b8Ptr = (int8_t*)bVector;
+  for(number = 0; number < num_points; number++){
+    float aReal =  (float)*a8Ptr++;
+    float aImag =  (float)*a8Ptr++;
+    lv_32fc_t aVal = lv_32fc_init(aReal, aImag );
+    float bReal = (float)*b8Ptr++;
+    float bImag = (float)*b8Ptr++;
+    lv_32fc_t bVal = lv_32fc_init( bReal, -bImag );
+    lv_32fc_t temp = aVal * bVal;
+    
+    *cPtr++ = (lv_creal(temp) * invScalar);
+    *cPtr++ = (lv_cimag(temp) * invScalar);
+  }
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+
+
+#endif /* INCLUDED_VOLK_8sc_MULTIPLY_CONJUGATE_32fc_ALIGNED16_H */
diff --git a/volk/include/volk/volk_common.h b/volk/include/volk/volk_common.h
new file mode 100644
index 000000000..6f444ad89
--- /dev/null
+++ b/volk/include/volk/volk_common.h
@@ -0,0 +1,18 @@
+#ifndef INCLUDED_LIBVECTOR_COMMON_H
+#define INCLUDED_LIBVECTOR_COMMON_H
+
+#include<inttypes.h>
+#if LV_HAVE_MMX
+#include<xmmintrin.h>
+union bit128{
+  uint16_t i16[8];
+  uint32_t i[4];
+  float f[4];
+  double d[2];
+  __m128i int_vec;
+  __m128 float_vec;
+  __m128d double_vec;
+};
+#endif /*LV_HAVE_MMX*/
+
+#endif /*INCLUDED_LIBVECTOR_COMMON_H*/
diff --git a/volk/include/volk/volk_complex.h b/volk/include/volk/volk_complex.h
new file mode 100644
index 000000000..b20b5cf47
--- /dev/null
+++ b/volk/include/volk/volk_complex.h
@@ -0,0 +1,71 @@
+#ifndef INCLUDE_VOLK_COMPLEX_H
+#define INCLUDE_VOLK_COMPLEX_H
+
+/*!
+  \brief This header file is to prevent issues with having <complex> and <complex.h> variables in the same code as the gcc compiler does not allow that
+*/
+#ifdef __cplusplus
+
+#include <complex>
+#include <stdint.h>
+
+typedef std::complex<int8_t>  lv_8sc_t;
+typedef std::complex<int16_t> lv_16sc_t;
+typedef std::complex<int32_t> lv_32sc_t;
+typedef std::complex<float>   lv_32fc_t;
+typedef std::complex<double>  lv_64fc_t;
+
+static inline float lv_creal(const lv_32fc_t x){
+  return x.real();
+}
+
+static inline float lv_cimag(const lv_32fc_t x){
+  return x.imag();
+}
+
+static inline lv_32fc_t lv_conj(const lv_32fc_t x){
+  return std::conj(x);
+}
+
+static inline lv_32fc_t lv_cpow(const lv_32fc_t x, const lv_32fc_t y){
+  return std::pow(x, y);
+}
+
+static inline lv_32fc_t lv_32fc_init(const float x, const float y){
+  return std::complex<float>(x,y);
+}
+
+#else
+
+#include <complex.h>
+
+typedef char complex         lv_8sc_t;
+typedef short complex        lv_16sc_t;
+typedef int complex          lv_32sc_t;
+typedef float complex        lv_32fc_t;
+typedef double complex       lv_64fc_t;
+
+static inline float lv_creal(const lv_32fc_t x){
+  return creal(x);
+}
+
+static inline float lv_cimag(const lv_32fc_t x){
+  return cimag(x);
+}
+
+static inline lv_32fc_t lv_conj(const lv_32fc_t x){
+  return conj(x);
+}
+
+static inline lv_32fc_t lv_cpow(const lv_32fc_t x, const lv_32fc_t y){
+  return cpow(x, y);
+}
+
+static inline lv_32fc_t lv_32fc_init(const float x, const float y){
+  return x + I*y;
+}
+
+#endif
+
+
+#endif /* INCLUDE_VOLK_COMPLEX_H */
diff --git a/volk/include/volk/volk_regexp.py b/volk/include/volk/volk_regexp.py
new file mode 100644
index 000000000..7b695cb3b
--- /dev/null
+++ b/volk/include/volk/volk_regexp.py
@@ -0,0 +1,8 @@
+import re
+
+remove_after_underscore = re.compile("_.*");
+space_remove = re.compile(" ");
+leading_space_remove = re.compile("^ *");
+replace_arch = re.compile(", const char\* arch");
+replace_bracket = re.compile(" {");
+replace_volk = re.compile("volk");
diff --git a/volk/include/volk/volk_register.py b/volk/include/volk/volk_register.py
new file mode 100755
index 000000000..d00501a93
--- /dev/null
+++ b/volk/include/volk/volk_register.py
@@ -0,0 +1,279 @@
+#! /usr/bin/python
+
+import sys
+import re
+import string
+from xml.dom import minidom
+from volk_regexp import *
+from make_cpuid_x86_c import make_cpuid_x86_c
+from make_cpuid_h import make_cpuid_h
+from make_proccpu_sim import make_proccpu_sim
+from make_set_simd import make_set_simd
+from make_cpuid_generic_c import make_cpuid_generic_c
+from make_cpuid_powerpc_c import make_cpuid_powerpc_c
+from make_registry import make_registry
+from make_h import make_h
+from make_init_h import make_init_h
+from make_config_fixed import make_config_fixed
+from make_config_in import make_config_in
+from make_c import make_c
+from make_runtime_c import make_runtime_c
+from make_init_c import make_init_c
+from make_runtime import make_runtime
+from make_typedefs import make_typedefs
+from make_mktables import make_mktables
+from make_environment_init_c import make_environment_init_c
+from make_environment_init_h import make_environment_init_h
+
+outfile_set_simd = open("../../config/lv_set_simd_flags.m4", "w");
+outfile_reg = open("volk_registry.h", "w");
+outfile_h = open("volk.h", "w");
+outfile_c = open("../../lib/volk.c", "w");
+outfile_runtime = open("volk_runtime.h", "w");
+outfile_runtime_c = open("../../lib/volk_runtime.c", "w");
+outfile_typedefs = open("volk_typedefs.h", "w");
+outfile_init_h = open("../../lib/volk_init.h", "w");
+outfile_init_c = open("../../lib/volk_init.c", "w");
+outfile_cpu_h = open("volk_cpu.h", "w");
+outfile_cpu_x86_c = open("../../lib/volk_cpu_x86.c", "w");
+outfile_cpu_generic_c = open("../../lib/volk_cpu_generic.c", "w");
+outfile_cpu_powerpc_c = open("../../lib/volk_cpu_powerpc.c", "w");
+outfile_proccpu_sim = open("../../lib/volk_proccpu_sim.c", "w");
+outfile_config_in = open("../../volk_config.h.in", "w");
+outfile_config_fixed = open("volk_config_fixed.h", "w");
+outfile_mktables = open("../../lib/volk_mktables.c", "w");
+outfile_environment_c = open("../../lib/volk_environment_init.c", "w");
+outfile_environment_h = open("volk_environment_init.h", "w");
+infile = open("Makefile.am", "r");
+
+
+mfile = infile.readlines();
+
+datatypes = [];
+functions = [];
+
+
+
+for line in mfile:
+    subline = re.search(".*(aligned).*", line);
+    if subline:
+        subsubline = re.search("(?<=volk_).*", subline.group(0));
+        if subsubline:
+            dtype = remove_after_underscore.sub("", subsubline.group(0));
+            subdtype = re.search("[0-9]+[A-z]+", dtype);
+            if subdtype:
+                datatypes.append(subdtype.group(0));
+                
+
+datatypes = set(datatypes);
+
+for line in mfile:
+    for dt in datatypes:
+        if dt in line:
+            subline = re.search("(volk_" + dt +"_.*(aligned).*\.h)", line);
+            if subline:
+                
+                subsubline = re.search(".+(?=\.h)", subline.group(0));
+                
+                functions.append(subsubline.group(0));
+
+archs = [];
+afile = minidom.parse("archs.xml");
+filearchs = afile.getElementsByTagName("arch");
+for filearch in filearchs:
+    archs.append(str(filearch.attributes["name"].value));
+                 
+for arch in archs:
+    a_var = re.search("^\$", arch);
+    if a_var:
+        archs.remove(arch);
+
+
+
+archs_or = "("
+for arch in archs:
+    archs_or = archs_or + string.upper(arch) + "|";
+archs_or = archs_or[0:len(archs_or)-1];
+archs_or = archs_or + ")";
+
+    
+taglist = [];
+fcountlist = [];
+arched_arglist = [];
+retlist = [];
+my_arglist = [];
+my_argtypelist = [];
+for func in functions:
+    tags = [];
+    fcount = [];
+    infile_source = open(func + ".h");
+    begun_name = 0;
+    begun_paren = 0;
+    sourcefile = infile_source.readlines();
+    infile_source.close();
+    for line in sourcefile:
+
+        archline = re.search("^\#if.*?LV_HAVE_" + archs_or + ".*", line);
+        if archline:
+            arch = archline.group(0);
+            archline = re.findall(archs_or + "(?=( |\n|&))", line);
+            if archline:
+                archsublist = [];
+                for tup in archline:
+                    archsublist.append(tup[0]);
+                fcount.append(archsublist);
+        testline = re.search("static inline.*?" + func, line);
+        if (not testline):
+            continue
+        tagline = re.search(func + "_.+", line);
+        if tagline:
+            tag = re.search("(?<=" + func + "_)\w+(?= *\()",line);
+            if tag:
+                tag = re.search("\w+", tag.group(0));
+                if tag:
+                    tags.append(tag.group(0));
+               
+    
+        if begun_name == 0:
+            retline = re.search(".+(?=" + func + ")", line);
+            if retline:
+                ret = retline.group(0);
+                
+                
+                
+        
+            subline = re.search(func + ".*", line);
+            if subline:
+                subsubline = re.search("\(.*?\)", subline.group(0));
+                if subsubline:
+                    args = subsubline.group(0);
+                    
+                else:
+                    begun_name = 1;
+                    subsubline = re.search("\(.*", subline.group(0));
+                    if subsubline:
+                        args = subsubline.group(0);
+                        begun_paren = 1;
+        else:
+            if begun_paren == 1:
+                subline = re.search(".*?\)", line);
+                if subline:
+                    args = args + subline.group(0);
+                    begun_name = 0;
+                    begun_paren = 0;
+                else:
+                    subline = re.search(".*", line);
+                    args = args + subline.group(0);
+            else:
+                subline = re.search("\(.*?\)", line);
+                if subline:
+                    args = subline.group(0);
+                    begun_name = 0;
+                else: 
+                    subline = re.search("\(.*", line);
+                    if subline:
+                        args = subline.group(0);
+                        begun_paren = 1;
+    
+    replace = re.compile("static ");
+    ret = replace.sub("", ret);
+    replace = re.compile("inline ");
+    ret = replace.sub("", ret);
+    replace = re.compile("\)");
+    arched_args = replace.sub(", const char* arch) {", args);
+    
+    remove = re.compile('\)|\(|{');
+    rargs = remove.sub("", args);
+    sargs = rargs.split(',');
+    
+    
+    
+    margs = [];
+    atypes = [];
+    for arg in sargs:
+        temp = arg.split(" ");
+        margs.append(temp[-1]);
+        replace = re.compile(" " + temp[-1]);
+        atypes.append(replace.sub("", arg));
+
+    
+    my_args = ""
+    arg_types = ""
+    for arg in range(0, len(margs) - 1):
+        this_arg = leading_space_remove.sub("", margs[arg]);
+        my_args = my_args + this_arg + ", ";
+        this_type = leading_space_remove.sub("", atypes[arg]);
+        arg_types = arg_types + this_type + ", ";
+
+    this_arg = leading_space_remove.sub("", margs[-1]);
+    my_args = my_args + this_arg;
+    this_type = leading_space_remove.sub("", atypes[-1]);
+    arg_types = arg_types + this_type;
+    my_argtypelist.append(arg_types);
+    
+    if(ret[-1] != ' '):
+        ret = ret + ' ';
+
+    arched_arglist.append(arched_args); #!!!!!!!!!!!
+    my_arglist.append(my_args) #!!!!!!!!!!!!!!!!!
+    retlist.append(ret);
+    fcountlist.append(fcount);
+    taglist.append(tags);               
+
+outfile_mktables.write(make_mktables(functions));
+outfile_mktables.close();
+
+
+outfile_cpu_h.write(make_cpuid_h(filearchs));
+outfile_cpu_h.close();
+
+outfile_cpu_x86_c.write(make_cpuid_x86_c(filearchs));
+outfile_cpu_x86_c.close();
+
+outfile_proccpu_sim.write(make_proccpu_sim(filearchs));
+outfile_proccpu_sim.close();
+
+outfile_set_simd.write(make_set_simd(filearchs));
+outfile_set_simd.close();
+
+outfile_cpu_generic_c.write(make_cpuid_generic_c(filearchs)); 
+outfile_cpu_generic_c.close();
+
+outfile_cpu_powerpc_c.write(make_cpuid_powerpc_c(filearchs));
+outfile_cpu_powerpc_c.close();
+
+outfile_config_in.write(make_config_in(filearchs));
+outfile_config_in.close();
+    
+outfile_reg.write(make_registry(filearchs, functions, fcountlist));
+outfile_reg.close();
+
+outfile_h.write(make_h(functions, arched_arglist, retlist));
+outfile_h.close();  
+
+outfile_init_h.write(make_init_h(functions, arched_arglist, retlist));
+outfile_init_h.close();
+
+outfile_config_fixed.write(make_config_fixed(filearchs));
+outfile_config_fixed.close();
+
+outfile_c.write( make_c(functions, taglist, arched_arglist, retlist, my_arglist, fcountlist));
+outfile_c.close();
+
+outfile_runtime_c.write(make_runtime_c(functions, taglist, arched_arglist, retlist, my_arglist, fcountlist));
+outfile_runtime_c.close();
+
+outfile_init_c.write(make_init_c(functions, filearchs));
+outfile_init_c.close();
+
+outfile_runtime.write(make_runtime(functions));
+outfile_runtime.close();
+
+outfile_typedefs.write(make_typedefs(functions, retlist, my_argtypelist));
+outfile_typedefs.close();
+
+outfile_environment_c.write(make_environment_init_c(filearchs));
+outfile_environment_c.close();
+
+outfile_environment_h.write(make_environment_init_h());
+outfile_environment_h.close();
diff --git a/volk/lib/Makefile.am b/volk/lib/Makefile.am
new file mode 100644
index 000000000..97eb75680
--- /dev/null
+++ b/volk/lib/Makefile.am
@@ -0,0 +1,361 @@
+#
+# Copyright 2008 Free Software Foundation, Inc.
+# 
+# This file is part of GNU Radio
+# 
+# GNU Radio is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 3, or (at your option)
+# any later version.
+# 
+# GNU Radio is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+# 
+# You should have received a copy of the GNU General Public License along
+# with this program; if not, write to the Free Software Foundation, Inc.,
+# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+#
+
+include $(top_srcdir)/Makefile.common
+
+AM_CPPFLAGS = $(STD_DEFINES_AND_INCLUDES) $(CPPUNIT_CPPFLAGS) $(LV_CXXFLAGS)
+
+
+# We build 2 libraries and 1 executable here.  One library contains
+# everything except the libcppunit QA code, and one contains only the
+# libcppunit-based QA code.  The C++ QA code is especially recommended
+# when you have general purpose C or C++ code that may not get
+# thoroughly exercised by building and running a GR block.  The
+# executable runs the QA code at "make check" time.
+#
+# N.B., If there's a SWIG generated shared library and associated
+# python code, it will be contained in ../python, not here.  (That
+# code is conditionally built depending on the state of the
+# --without-python configure option.)  However, the .i should be here
+# next to the .h that it's based on.
+
+
+# list of programs run by "make check" and "make distcheck"
+TESTS = test_all
+
+
+lib_LTLIBRARIES = \
+	libvolk.la \
+	libvolk_runtime.la \
+	libvolk_qa.la
+
+
+# ----------------------------------------------------------------
+#                      The main library
+# ----------------------------------------------------------------
+
+universal_runtime_CODE = 	\
+	volk_runtime.c	\
+	volk_init.c \
+	volk_rank_archs.c	
+
+universal_CODE = 		\
+	volk.c 			\
+	volk_environment_init.c
+
+generic_CODE = 		\
+	volk_cpu_generic.cc
+
+x86_CODE = 		\
+	volk_cpu_x86.c
+
+x86_SUBCODE = 		\
+	cpuid_x86.S
+
+x86_64_SUBCODE = 	\
+	cpuid_x86_64.S
+
+powerpc_CODE = \
+	volk_cpu_powerpc.cc
+
+
+if MD_CPU_generic
+libvolk_la_SOURCES =	\
+	$(generic_CODE)		\
+	$(universal_CODE)
+libvolk_runtime_la_SOURCES =	\
+	$(generic_CODE)		\
+	$(universal_runtime_CODE)
+
+endif
+
+if MD_CPU_x86
+if MD_SUBCPU_x86_64
+libvolk_la_SOURCES =	\
+	$(x86_CODE)		\
+	$(x86_64_SUBCODE)		\
+	$(universal_CODE) 		
+
+libvolk_runtime_la_SOURCES =	\
+	$(x86_CODE)		\
+	$(x86_64_SUBCODE)		\
+	$(universal_runtime_CODE) 		
+else
+libvolk_la_SOURCES =	\
+	$(x86_CODE)		\
+	$(x86_SUBCODE)	\
+	$(universal_CODE)
+
+libvolk_runtime_la_SOURCES =	\
+	$(x86_CODE)		\
+	$(x86_SUBCODE)	\
+	$(universal_runtime_CODE)
+endif
+endif
+
+
+if MD_CPU_powerpc
+libvolk_la_SOURCES =	\
+	$(powerpc_CODE)		\
+	$(universal_CODE)
+
+libvolk_runtime_la_SOURCES =	\
+	$(powerpc_CODE)		\
+	$(universal_runtime_CODE)
+endif
+
+
+
+libvolk_la_LDFLAGS = $(NO_UNDEFINED) -version-info 0:0:0 
+libvolk_runtime_la_LDFLAGS = $(NO_UNDEFINED) -version-info 0:0:0
+
+libvolk_la_LIBADD =
+
+
+
+# ----------------------------------------------------------------
+#        The QA library.  Note libvolk.la in LIBADD
+# ----------------------------------------------------------------
+libvolk_qa_la_SOURCES = \
+	qa_volk.cc \
+	qa_16s_quad_max_star_aligned16.cc \
+	qa_32fc_dot_prod_aligned16.cc \
+	qa_32fc_square_dist_aligned16.cc \
+	qa_32fc_square_dist_scalar_mult_aligned16.cc \
+	qa_32f_sum_of_poly_aligned16.cc \
+	qa_32fc_index_max_aligned16.cc \
+	qa_32f_index_max_aligned16.cc \
+	qa_32fc_conjugate_dot_prod_aligned16.cc \
+	qa_16s_permute_and_scalar_add_aligned16.cc \
+	qa_16s_branch_4_state_8_aligned16.cc \
+	qa_16s_max_star_horizontal_aligned16.cc \
+	qa_16s_max_star_aligned16.cc \
+	qa_16s_add_quad_aligned16.cc \
+	qa_32f_add_aligned16.cc \
+	qa_32f_subtract_aligned16.cc \
+	qa_32f_max_aligned16.cc \
+	qa_32f_min_aligned16.cc \
+	qa_64f_max_aligned16.cc \
+	qa_64f_min_aligned16.cc \
+	qa_32s_and_aligned16.cc \
+	qa_32s_or_aligned16.cc \
+	qa_32f_dot_prod_aligned16.cc \
+	qa_32f_dot_prod_unaligned16.cc \
+	qa_32f_fm_detect_aligned16.cc \
+	qa_32fc_32f_multiply_aligned16.cc \
+	qa_32fc_multiply_aligned16.cc \
+	qa_32f_divide_aligned16.cc \
+	qa_32f_multiply_aligned16.cc \
+	qa_32f_sqrt_aligned16.cc \
+	qa_8sc_multiply_conjugate_16sc_aligned16.cc \
+	qa_8sc_multiply_conjugate_32fc_aligned16.cc \
+	qa_32u_popcnt_aligned16.cc \
+	qa_64u_popcnt_aligned16.cc \
+	qa_64u_byteswap_aligned16.cc \
+	qa_8sc_deinterleave_32f_aligned16.cc \
+	qa_16sc_deinterleave_32f_aligned16.cc \
+	qa_8sc_deinterleave_16s_aligned16.cc \
+	qa_32f_interleave_32fc_aligned16.cc \
+	qa_16u_byteswap_aligned16.cc \
+	qa_16sc_deinterleave_16s_aligned16.cc \
+	qa_32fc_deinterleave_real_32f_aligned16.cc \
+	qa_32fc_magnitude_32f_aligned16.cc \
+	qa_32fc_deinterleave_real_64f_aligned16.cc \
+	qa_32fc_deinterleave_real_16s_aligned16.cc \
+	qa_32fc_magnitude_16s_aligned16.cc \
+	qa_32fc_deinterleave_32f_aligned16.cc \
+	qa_8sc_deinterleave_real_8s_aligned16.cc \
+	qa_32fc_deinterleave_64f_aligned16.cc \
+	qa_32f_interleave_16sc_aligned16.cc \
+	qa_16sc_deinterleave_real_8s_aligned16.cc \
+	qa_16sc_deinterleave_real_32f_aligned16.cc \
+	qa_16sc_magnitude_32f_aligned16.cc \
+	qa_32u_byteswap_aligned16.cc \
+	qa_16sc_deinterleave_real_16s_aligned16.cc \
+	qa_8sc_deinterleave_real_32f_aligned16.cc \
+	qa_16sc_magnitude_16s_aligned16.cc \
+	qa_32f_normalize_aligned16.cc \
+	qa_8sc_deinterleave_real_16s_aligned16.cc \
+	qa_16s_convert_32f_aligned16.cc \
+	qa_16s_convert_32f_unaligned16.cc \
+	qa_16s_convert_8s_aligned16.cc \
+	qa_16s_convert_8s_unaligned16.cc \
+	qa_32f_convert_16s_aligned16.cc \
+	qa_32f_convert_16s_unaligned16.cc \
+	qa_32f_convert_32s_aligned16.cc \
+	qa_32f_convert_32s_unaligned16.cc \
+	qa_32f_convert_64f_aligned16.cc \
+	qa_32f_convert_64f_unaligned16.cc \
+	qa_32f_convert_8s_aligned16.cc \
+	qa_32f_convert_8s_unaligned16.cc \
+	qa_32s_convert_32f_aligned16.cc \
+	qa_32s_convert_32f_unaligned16.cc \
+	qa_64f_convert_32f_aligned16.cc \
+	qa_64f_convert_32f_unaligned16.cc \
+	qa_8s_convert_16s_aligned16.cc \
+	qa_8s_convert_16s_unaligned16.cc \
+	qa_8s_convert_32f_aligned16.cc \
+	qa_8s_convert_32f_unaligned16.cc \
+	qa_32fc_32f_power_32fc_aligned16.cc \
+	qa_32f_power_aligned16.cc \
+	qa_32fc_atan2_32f_aligned16.cc \
+	qa_32fc_power_spectral_density_32f_aligned16.cc \
+	qa_32fc_power_spectrum_32f_aligned16.cc \
+	qa_32f_calc_spectral_noise_floor_aligned16.cc \
+	qa_32f_accumulator_aligned16.cc \
+	qa_32f_stddev_aligned16.cc \
+	qa_32f_stddev_and_mean_aligned16.cc
+
+libvolk_qa_la_LDFLAGS = $(NO_UNDEFINED) -version-info 0:0:0 
+
+libvolk_qa_la_LIBADD = \
+	libvolk.la \
+	libvolk_runtime.la \
+	$(CPPUNIT_LIBS)
+
+# ----------------------------------------------------------------
+# headers that don't get installed
+# ----------------------------------------------------------------
+noinst_HEADERS = \
+	volk_init.h \
+	qa_volk.h \
+	qa_16s_quad_max_star_aligned16.h \
+	qa_32fc_dot_prod_aligned16.h \
+	qa_32fc_square_dist_aligned16.h \
+	qa_32fc_square_dist_scalar_mult_aligned16.h \
+	qa_32f_sum_of_poly_aligned16.h \
+	qa_32fc_index_max_aligned16.h \
+	qa_32f_index_max_aligned16.h \
+	qa_32fc_conjugate_dot_prod_aligned16.h \
+	qa_16s_permute_and_scalar_add_aligned16.h \
+	qa_16s_branch_4_state_8_aligned16.h \
+	qa_16s_max_star_horizontal_aligned16.h \
+	qa_16s_max_star_aligned16.h \
+	qa_16s_add_quad_aligned16.h \
+	qa_32f_add_aligned16.h \
+	qa_32f_subtract_aligned16.h \
+	qa_32f_max_aligned16.h \
+	qa_32f_min_aligned16.h \
+	qa_64f_max_aligned16.h \
+	qa_64f_min_aligned16.h \
+	qa_32s_and_aligned16.h \
+	qa_32s_or_aligned16.h \
+	qa_32f_dot_prod_aligned16.h \
+	qa_32f_dot_prod_unaligned16.h \
+	qa_32f_fm_detect_aligned16.h \
+	qa_32fc_32f_multiply_aligned16.h \
+	qa_32fc_multiply_aligned16.h \
+	qa_32f_divide_aligned16.h \
+	qa_32f_multiply_aligned16.h \
+	qa_32f_sqrt_aligned16.h \
+	qa_8sc_multiply_conjugate_16sc_aligned16.h \
+	qa_8sc_multiply_conjugate_32fc_aligned16.h \
+	qa_32u_popcnt_aligned16.h \
+	qa_64u_popcnt_aligned16.h \
+	qa_64u_byteswap_aligned16.h \
+	qa_8sc_deinterleave_32f_aligned16.h \
+	qa_16sc_deinterleave_32f_aligned16.h \
+	qa_8sc_deinterleave_16s_aligned16.h \
+	qa_32f_interleave_32fc_aligned16.h \
+	qa_16u_byteswap_aligned16.h \
+	qa_16sc_deinterleave_16s_aligned16.h \
+	qa_32fc_deinterleave_real_32f_aligned16.h \
+	qa_32fc_magnitude_32f_aligned16.h \
+	qa_32fc_deinterleave_real_64f_aligned16.h \
+	qa_32fc_deinterleave_real_16s_aligned16.h \
+	qa_32fc_magnitude_16s_aligned16.h \
+	qa_32fc_deinterleave_32f_aligned16.h \
+	qa_8sc_deinterleave_real_8s_aligned16.h \
+	qa_32fc_deinterleave_64f_aligned16.h \
+	qa_32f_interleave_16sc_aligned16.h \
+	qa_16sc_deinterleave_real_8s_aligned16.h \
+	qa_16sc_deinterleave_real_32f_aligned16.h \
+	qa_16sc_magnitude_32f_aligned16.h \
+	qa_32u_byteswap_aligned16.h \
+	qa_16sc_deinterleave_real_16s_aligned16.h \
+	qa_8sc_deinterleave_real_32f_aligned16.h \
+	qa_16sc_magnitude_16s_aligned16.h \
+	qa_32f_normalize_aligned16.h \
+	qa_8sc_deinterleave_real_16s_aligned16.h \
+	qa_16s_convert_32f_aligned16.h \
+	qa_16s_convert_32f_unaligned16.h \
+	qa_16s_convert_8s_aligned16.h \
+	qa_16s_convert_8s_unaligned16.h \
+	qa_32f_convert_16s_aligned16.h \
+	qa_32f_convert_16s_unaligned16.h \
+	qa_32f_convert_32s_aligned16.h \
+	qa_32f_convert_32s_unaligned16.h \
+	qa_32f_convert_64f_aligned16.h \
+	qa_32f_convert_64f_unaligned16.h \
+	qa_32f_convert_8s_aligned16.h \
+	qa_32f_convert_8s_unaligned16.h \
+	qa_32s_convert_32f_aligned16.h \
+	qa_32s_convert_32f_unaligned16.h \
+	qa_64f_convert_32f_aligned16.h \
+	qa_64f_convert_32f_unaligned16.h \
+	qa_8s_convert_16s_aligned16.h \
+	qa_8s_convert_16s_unaligned16.h \
+	qa_8s_convert_32f_aligned16.h \
+	qa_8s_convert_32f_unaligned16.h \
+	qa_32fc_32f_power_32fc_aligned16.h \
+	qa_32f_power_aligned16.h \
+	qa_32fc_atan2_32f_aligned16.h \
+	qa_32fc_power_spectral_density_32f_aligned16.h \
+	qa_32fc_power_spectrum_32f_aligned16.h \
+	qa_32f_calc_spectral_noise_floor_aligned16.h \
+	qa_32f_accumulator_aligned16.h \
+	qa_32f_stddev_aligned16.h \
+	qa_32f_stddev_and_mean_aligned16.h
+
+
+# ----------------------------------------------------------------
+# Our test program
+# ----------------------------------------------------------------
+noinst_PROGRAMS = \
+	test_all
+
+test_all_SOURCES = test_all.cc
+test_all_LDADD   = libvolk_qa.la
+
+
+distclean-local: 
+	rm -f volk.c
+	rm -f volk_cpu_generic.c
+	rm -f volk_cpu_powerpc.c
+	rm -f volk_cpu_x86.c
+	rm -f volk_init.c
+	rm -f volk_init.h
+	rm -f volk_mktables
+	rm -f volk_mktables.c
+	rm -f volk_proccpu_sim.c
+	rm -f volk_runtime.c
+	rm -f volk_tables.h
+	rm -f volk_environment_init.c
+#SUBDIRS = 
+
+#ifdef BUILD_SSE
+#SUBDIRS += sse
+#elif BUILD_SPU
+#SUBDIRS += spu
+#else
+#SUBDIRS += port
+#endif
+
+
diff --git a/volk/lib/assembly.h b/volk/lib/assembly.h
new file mode 100644
index 000000000..8a99aa07c
--- /dev/null
+++ b/volk/lib/assembly.h
@@ -0,0 +1,67 @@
+/* -*- c++ -*- */
+/*
+ * Copyright 2002 Free Software Foundation, Inc.
+ * 
+ * This file is part of GNU Radio
+ * 
+ * GNU Radio is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 3, or (at your option)
+ * any later version.
+ * 
+ * GNU Radio is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ * 
+ * You should have received a copy of the GNU General Public License
+ * along with GNU Radio; see the file COPYING.  If not, write to
+ * the Free Software Foundation, Inc., 51 Franklin Street,
+ * Boston, MA 02110-1301, USA.
+ */
+
+#ifndef _ASSEMBLY_H_
+#define _ASSEMBLY_H_
+
+#if defined (__APPLE__) && defined (__APPLE_CC__)
+
+// XCode ignores the .scl and .type functions in XCode 2.2.1 and 2.3,
+// but creates an error in XCode 2.4.  Just ignore them.
+
+#define GLOB_SYMB(f)    _ ## f
+
+#define DEF_FUNC_HEAD(f)  /* none */
+
+#define FUNC_TAIL(f)    /* none*/
+
+#elif !defined (__ELF__)
+
+/*
+ * Too bad, the following define does not work as expected --SF
+ * 	#define GLOB_SYMB(f)	__USER_LABEL_PREFIX__ ## f
+ */
+#define GLOB_SYMB(f)	_ ## f
+
+#define DEF_FUNC_HEAD(f)	\
+	.def	GLOB_SYMB(f); .scl 2; .type 32; .endef
+
+#define FUNC_TAIL(f)	/* none */
+
+
+#else	/* !__ELF__ */
+
+
+#define GLOB_SYMB(f)	f
+
+#define DEF_FUNC_HEAD(f)	\
+	.type	GLOB_SYMB(f),@function	\
+
+#define FUNC_TAIL(f)	\
+  .Lfe1:		\
+	.size	GLOB_SYMB(f),.Lfe1-GLOB_SYMB(f)
+
+
+#endif	/* !__ELF__ */
+
+
+#endif /* _ASSEMBLY_H_ */
diff --git a/volk/lib/cpuid_x86.S b/volk/lib/cpuid_x86.S
new file mode 100644
index 000000000..4e1a9404f
--- /dev/null
+++ b/volk/lib/cpuid_x86.S
@@ -0,0 +1,60 @@
+#	
+# Copyright 2003 Free Software Foundation, Inc.
+# 
+# This file is part of GNU Radio
+# 
+# GNU Radio is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 3, or (at your option)
+# any later version.
+# 
+# GNU Radio is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+# 
+# You should have received a copy of the GNU General Public License
+# along with GNU Radio; see the file COPYING.  If not, write to
+# the Free Software Foundation, Inc., 51 Franklin Street,
+# Boston, MA 02110-1301, USA.
+# 
+
+#
+# execute CPUID instruction, return EAX, EBX, ECX and EDX values in result
+#
+#  void cpuid_x86 (unsigned int op, unsigned int result[4]);
+#
+
+#include "assembly.h"
+
+.file "cpuid_x86.S"
+	.version	"01.01"
+.text
+.globl	GLOB_SYMB(cpuid_x86)
+	DEF_FUNC_HEAD(cpuid_x86)
+GLOB_SYMB(cpuid_x86):
+	pushl	%ebp
+	movl	%esp, %ebp
+	pushl	%ebx		# must save in PIC mode, holds GOT pointer
+	pushl	%esi
+	
+	movl	8(%ebp), %eax	# op
+	movl	12(%ebp), %esi	# result
+	cpuid
+	movl	%eax, 0(%esi)
+	movl	%ebx, 4(%esi)
+	movl	%ecx, 8(%esi)
+	movl	%edx, 12(%esi)
+	
+	popl	%esi
+	popl	%ebx
+	popl	%ebp
+	ret
+
+FUNC_TAIL(cpuid_x86)
+	.ident	"Hand coded cpuid assembly"
+	
+
+#if defined(__linux__) && defined(__ELF__)
+.section .note.GNU-stack,"",%progbits
+#endif
diff --git a/volk/lib/cpuid_x86_64.S b/volk/lib/cpuid_x86_64.S
new file mode 100644
index 000000000..32b1847cd
--- /dev/null
+++ b/volk/lib/cpuid_x86_64.S
@@ -0,0 +1,54 @@
+#	
+# Copyright 2003,2005 Free Software Foundation, Inc.
+# 
+# This file is part of GNU Radio
+# 
+# GNU Radio is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 3, or (at your option)
+# any later version.
+# 
+# GNU Radio is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+# 
+# You should have received a copy of the GNU General Public License
+# along with GNU Radio; see the file COPYING.  If not, write to
+# the Free Software Foundation, Inc., 51 Franklin Street,
+# Boston, MA 02110-1301, USA.
+# 
+
+#
+# execute CPUID instruction, return EAX, EBX, ECX and EDX values in result
+#
+#  void cpuid_x86 (unsigned int op, unsigned int result[4]);
+#
+
+#include "assembly.h"
+
+.file "cpuid_x86_64.S"
+	.version	"01.01"
+.text
+.globl	GLOB_SYMB(cpuid_x86)
+	DEF_FUNC_HEAD(cpuid_x86)
+GLOB_SYMB(cpuid_x86):
+	mov	%rbx, %r11	# must save in PIC mode, holds GOT pointer
+	
+	mov	%rdi, %rax	# op
+	cpuid
+	movl	%eax, 0(%rsi)	# result
+	movl	%ebx, 4(%rsi)
+	movl	%ecx, 8(%rsi)
+	movl	%edx, 12(%rsi)
+	
+	mov	%r11, %rbx
+	retq
+
+FUNC_TAIL(cpuid_x86)
+	.ident	"Hand coded cpuid64 assembly"
+	
+
+#if defined(__linux__) && defined(__ELF__)
+.section .note.GNU-stack,"",%progbits
+#endif
diff --git a/volk/lib/qa_16s_add_quad_aligned16.cc b/volk/lib/qa_16s_add_quad_aligned16.cc
new file mode 100644
index 000000000..c3005c1be
--- /dev/null
+++ b/volk/lib/qa_16s_add_quad_aligned16.cc
@@ -0,0 +1,89 @@
+#include <volk/volk.h>
+#include <qa_16s_add_quad_aligned16.h>
+#include <volk/volk_16s_add_quad_aligned16.h>
+#include <cstdlib>
+#include <time.h>
+//test for sse2
+
+#ifndef LV_HAVE_SSE2
+
+void qa_16s_add_quad_aligned16::t1() {
+  printf("sse2 not available... no test performed\n");
+}
+
+#else
+
+
+
+void qa_16s_add_quad_aligned16::t1() {
+  
+  volk_environment_init();
+  clock_t start, end;
+  double total;
+  const int vlen = 3200;
+  const int ITERS = 100000;
+  short input0[vlen] __attribute__ ((aligned (16)));
+  short input1[vlen] __attribute__ ((aligned (16)));
+  short input2[vlen] __attribute__ ((aligned (16)));
+  short input3[vlen] __attribute__ ((aligned (16)));
+  short input4[vlen] __attribute__ ((aligned (16)));
+  
+  short output0[vlen] __attribute__ ((aligned (16)));
+  short output1[vlen] __attribute__ ((aligned (16)));
+  short output2[vlen] __attribute__ ((aligned (16)));
+  short output3[vlen] __attribute__ ((aligned (16)));
+  short output01[vlen] __attribute__ ((aligned (16)));
+  short output11[vlen] __attribute__ ((aligned (16)));
+  short output21[vlen] __attribute__ ((aligned (16)));
+  short output31[vlen] __attribute__ ((aligned (16)));
+
+  for(int i = 0; i < vlen; ++i) {
+    short plus0 = ((short) (rand() - (RAND_MAX/2))) >> 2;
+    short minus0 = ((short) (rand() - (RAND_MAX/2))) >> 2;
+    short plus1 = ((short) (rand() - (RAND_MAX/2))) >> 2;
+    short minus1 = ((short) (rand() - (RAND_MAX/2))) >> 2;
+    short plus2 = ((short) (rand() - (RAND_MAX/2))) >> 2;
+    short minus2 = ((short) (rand() - (RAND_MAX/2))) >> 2;
+    short plus3 = ((short) (rand() - (RAND_MAX/2))) >> 2;
+    short minus3 = ((short) (rand() - (RAND_MAX/2))) >> 2;
+    short plus4 = ((short) (rand() - (RAND_MAX/2))) >> 2;
+    short minus4 = ((short) (rand() - (RAND_MAX/2))) >> 2;
+    
+    input0[i] = plus0 - minus0;
+    input1[i] = plus1 - minus1;
+    input2[i] = plus2 - minus2;
+    input3[i] = plus3 - minus3;
+    input4[i] = plus4 - minus4;
+    
+  }
+  printf("16s_add_quad_aligned\n");
+
+  start = clock();
+  for(int count = 0; count < ITERS; ++count) {
+    volk_16s_add_quad_aligned16_manual(output0, output1, output2, output3, input0, input1, input2, input3, input4, vlen << 1 , "generic");
+  }
+  end = clock();
+  total = (double)(end-start)/(double)CLOCKS_PER_SEC;
+  printf("generic_time: %f\n", total);
+  start = clock();
+  for(int count = 0; count < ITERS; ++count) {
+    volk_16s_add_quad_aligned16_manual(output01, output11, output21, output31, input0, input1, input2, input3, input4, vlen << 1 , "sse2");
+  }
+  end = clock();
+  total = (double)(end-start)/(double)CLOCKS_PER_SEC;
+  printf("sse2_time: %f\n", total);
+  for(int i = 0; i < 1; ++i) {
+    //printf("inputs: %d, %d\n", input0[i*2], input0[i*2 + 1]);
+    //printf("generic... %d, ssse3... %d\n", output0[i], output1[i]);
+  }
+  
+  for(int i = 0; i < vlen; ++i) {
+    //printf("%d...%d\n", output0[i], output01[i]);
+    CPPUNIT_ASSERT_EQUAL(output0[i], output01[i]);
+    CPPUNIT_ASSERT_EQUAL(output1[i], output11[i]);
+    CPPUNIT_ASSERT_EQUAL(output2[i], output21[i]);
+    CPPUNIT_ASSERT_EQUAL(output3[i], output31[i]);
+  }
+}
+
+#endif
diff --git a/volk/lib/qa_16s_add_quad_aligned16.h b/volk/lib/qa_16s_add_quad_aligned16.h
new file mode 100644
index 000000000..3c1ae978b
--- /dev/null
+++ b/volk/lib/qa_16s_add_quad_aligned16.h
@@ -0,0 +1,18 @@
+#ifndef INCLUDED_QA_16S_ADD_QUAD_ALIGNED16_H
+#define INCLUDED_QA_16S_ADD_QUAD_ALIGNED16_H
+
+#include <cppunit/extensions/HelperMacros.h>
+#include <cppunit/TestCase.h>
+
+class qa_16s_add_quad_aligned16 : public CppUnit::TestCase {
+
+  CPPUNIT_TEST_SUITE (qa_16s_add_quad_aligned16);
+  CPPUNIT_TEST (t1);
+  CPPUNIT_TEST_SUITE_END ();
+
+ private:
+  void t1 ();
+};
+
+
+#endif /* INCLUDED_QA_16S_ADD_QUAD_ALIGNED16_H */
diff --git a/volk/lib/qa_16s_branch_4_state_8_aligned16.cc b/volk/lib/qa_16s_branch_4_state_8_aligned16.cc
new file mode 100644
index 000000000..ba5e8ed93
--- /dev/null
+++ b/volk/lib/qa_16s_branch_4_state_8_aligned16.cc
@@ -0,0 +1,106 @@
+#include <volk/volk.h>
+#include <qa_16s_branch_4_state_8_aligned16.h>
+#include <cstdlib>
+#include <time.h>
+
+//test for ssse3
+
+#ifndef LV_HAVE_SSSE3
+
+void qa_16s_branch_4_state_8_aligned16::t1() {
+  printf("ssse3 not available... no test performed\n");
+}
+
+#else
+
+void qa_16s_branch_4_state_8_aligned16::t1() {
+  const int num_iters = 1000000;
+  const int vlen = 32;
+
+  static char permute0[16]__attribute__((aligned(16))) = {0x0e, 0x0f, 0x0a, 0x0b, 0x04, 0x05, 0x00, 0x01, 0x0c, 0x0d, 0x08, 0x09, 0x06, 0x07, 0x02, 0x03};
+  static char permute1[16]__attribute__((aligned(16))) = {0x0c, 0x0d, 0x08, 0x09, 0x06, 0x07, 0x02, 0x03, 0x0e, 0x0f, 0x0a, 0x0b, 0x04, 0x05, 0x00, 0x01};
+  static char permute2[16]__attribute__((aligned(16))) = {0x02, 0x03, 0x06, 0x07, 0x08, 0x09, 0x0c, 0x0d, 0x00, 0x01, 0x04, 0x05, 0x0a, 0x0b, 0x0e, 0x0f};
+  static char permute3[16]__attribute__((aligned(16))) = {0x00, 0x01, 0x04, 0x05, 0x0a, 0x0b, 0x0e, 0x0f, 0x02, 0x03, 0x06, 0x07, 0x08, 0x09, 0x0c, 0x0d};
+  static char* permuters[4] = {permute0, permute1, permute2, permute3};
+  
+  unsigned int num_bytes = vlen << 1;
+
+  volk_environment_init();
+  clock_t start, end;
+  double total;
+  
+  short target[vlen] __attribute__ ((aligned (16)));
+  short target2[vlen] __attribute__ ((aligned (16)));
+  short target3[vlen] __attribute__ ((aligned (16)));
+  
+  short src0[vlen] __attribute__ ((aligned (16)));
+  short permute_indexes[vlen] __attribute__ ((aligned (16))) =  {
+7, 5, 2, 0, 6, 4, 3, 1, 6, 4, 3, 1, 7, 5, 2, 0, 1, 3, 4, 6, 0, 2, 5, 7, 0, 2, 5, 7, 1, 3, 4, 6 };
+  short cntl0[vlen] __attribute__ ((aligned (16))) = {
+    0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000 };
+  short cntl1[vlen] __attribute__ ((aligned (16))) = {
+    0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000 };
+  short cntl2[vlen] __attribute__ ((aligned (16))) = {
+    0x0000, 0xffff, 0xffff, 0x0000, 0x0000, 0xffff, 0xffff, 0x0000, 0xffff, 0x0000, 0x0000, 0xffff, 0xffff, 0x0000, 0x0000, 0xffff, 0xffff, 0x0000, 0x0000, 0xffff, 0xffff, 0x0000, 0x0000, 0xffff, 0x0000, 0xffff, 0xffff, 0x0000, 0x0000, 0xffff, 0xffff, 0x0000 };
+  short cntl3[vlen] __attribute__ ((aligned (16))) =  {
+    0xffff, 0xffff, 0x0000, 0x0000, 0xffff, 0xffff, 0x0000, 0x0000, 0x0000, 0x0000, 0xffff, 0xffff, 0x0000, 0x0000, 0xffff, 0xffff, 0xffff, 0xffff, 0x0000, 0x0000, 0xffff, 0xffff, 0x0000, 0x0000, 0x0000, 0x0000, 0xffff, 0xffff, 0x0000, 0x0000, 0xffff, 0xffff };
+  short scalars[4] __attribute__ ((aligned (16))) = {1, 2, 3, 4};
+  
+  
+
+  for(int i = 0; i < vlen; ++i) {
+    src0[i] = i;
+    
+  }
+  
+
+  printf("16s_branch_4_state_8_aligned\n");
+  
+  
+  start = clock();
+  for(int i = 0; i < num_iters; ++i) {
+    volk_16s_permute_and_scalar_add_aligned16_manual(target, src0, permute_indexes, cntl0, cntl1, cntl2, cntl3, scalars, num_bytes, "sse2");
+  }
+  end = clock();
+  
+  total = (double)(end-start)/(double)CLOCKS_PER_SEC;
+
+  printf("permute_and_scalar_add_time: %f\n", total);
+  
+  
+
+  start = clock();
+  for(int i = 0; i < num_iters; ++i) {
+    volk_16s_branch_4_state_8_aligned16_manual(target2, src0, permuters, cntl2, cntl3, scalars, "ssse3");
+  }
+  end = clock();
+
+  total = (double)(end-start)/(double)CLOCKS_PER_SEC;
+
+  printf("branch_4_state_8_time, ssse3: %f\n", total);
+  
+  start = clock();
+  for(int i = 0; i < num_iters; ++i) {
+    volk_16s_branch_4_state_8_aligned16_manual(target3, src0, permuters, cntl2, cntl3, scalars, "generic");
+  }
+  end = clock();
+  
+  total = (double)(end-start)/(double)CLOCKS_PER_SEC;
+
+  printf("permute_and_scalar_add_time, generic: %f\n", total);
+  
+  
+  
+  for(int i = 0; i < vlen; ++i) {
+    printf("psa... %d, b4s8... %d\n", target[i], target3[i]);
+  }
+  
+  for(int i = 0; i < vlen; ++i) {
+    
+    CPPUNIT_ASSERT(target[i] == target2[i]);
+    CPPUNIT_ASSERT(target[i] == target3[i]);
+  }
+}
+
+
+#endif
diff --git a/volk/lib/qa_16s_branch_4_state_8_aligned16.h b/volk/lib/qa_16s_branch_4_state_8_aligned16.h
new file mode 100644
index 000000000..41ab073e0
--- /dev/null
+++ b/volk/lib/qa_16s_branch_4_state_8_aligned16.h
@@ -0,0 +1,18 @@
+#ifndef INCLUDED_QA_16S_BRANCH_4_STATE_8_ALIGNED16_H
+#define INCLUDED_QA_16S_BRANCH_4_STATE_8_ALIGNED16_H
+
+#include <cppunit/extensions/HelperMacros.h>
+#include <cppunit/TestCase.h>
+
+class qa_16s_branch_4_state_8_aligned16 : public CppUnit::TestCase {
+
+  CPPUNIT_TEST_SUITE (qa_16s_branch_4_state_8_aligned16);
+  CPPUNIT_TEST (t1);
+  CPPUNIT_TEST_SUITE_END ();
+
+ private:
+  void t1 ();
+};
+
+
+#endif /* INCLUDED_QA_16S_BRANCH_4_STATE_8_ALIGNED16_H */
diff --git a/volk/lib/qa_16s_convert_32f_aligned16.cc b/volk/lib/qa_16s_convert_32f_aligned16.cc
new file mode 100644
index 000000000..7878d4737
--- /dev/null
+++ b/volk/lib/qa_16s_convert_32f_aligned16.cc
@@ -0,0 +1,73 @@
+#include <volk/volk_runtime.h>
+#include <volk/volk.h>
+#include <qa_16s_convert_32f_aligned16.h>
+#include <volk/volk_16s_convert_32f_aligned16.h>
+#include <cstdlib>
+
+//test for sse2
+
+#ifndef LV_HAVE_SSE
+
+void qa_16s_convert_32f_aligned16::t1() {
+  printf("sse not available... no test performed\n");
+}
+
+#else
+
+void qa_16s_convert_32f_aligned16::t1() {
+  
+  volk_runtime_init();
+
+  volk_environment_init();
+  clock_t start, end;
+  double total;
+  const int vlen = 3201;
+  const int ITERS = 100000;
+  int16_t input0[vlen] __attribute__ ((aligned (16)));
+  
+  float output_generic[vlen] __attribute__ ((aligned (16)));
+  float output_sse[vlen] __attribute__ ((aligned (16)));
+  float output_sse4_1[vlen] __attribute__ ((aligned (16)));
+
+  for(int i = 0; i < vlen; ++i) {   
+    input0[i] = ((int16_t)(((float) (rand() - (RAND_MAX/2))) / static_cast<float>((RAND_MAX/2)) * 32768.0));
+  }
+  printf("16s_convert_32f_aligned\n");
+
+  start = clock();
+  for(int count = 0; count < ITERS; ++count) {
+    volk_16s_convert_32f_aligned16_manual(output_generic, input0, 32768.0, vlen, "generic");
+  }
+  end = clock();
+  total = (double)(end-start)/(double)CLOCKS_PER_SEC;
+  printf("generic_time: %f\n", total);
+
+  start = clock();
+  for(int count = 0; count < ITERS; ++count) {
+    volk_16s_convert_32f_aligned16_manual(output_sse, input0, 32768.0, vlen, "sse");
+  }
+  end = clock();
+  total = (double)(end-start)/(double)CLOCKS_PER_SEC;
+  printf("sse_time: %f\n", total);
+
+  start = clock();
+  for(int count = 0; count < ITERS; ++count) {
+    get_volk_runtime()->volk_16s_convert_32f_aligned16(output_sse4_1, input0, 32768.0, vlen);
+  }
+  end = clock();
+  total = (double)(end-start)/(double)CLOCKS_PER_SEC;
+  printf("sse4_1_time: %f\n", total);
+
+  for(int i = 0; i < 1; ++i) {
+    //printf("inputs: %d, %d\n", input0[i*2], input0[i*2 + 1]);
+    //printf("generic... %d, ssse3... %d\n", output0[i], output1[i]);
+  }
+  
+  for(int i = 0; i < vlen; ++i) {
+    //printf("%d...%d\n", output0[i], output01[i]);
+    CPPUNIT_ASSERT_EQUAL(output_generic[i], output_sse[i]);
+    CPPUNIT_ASSERT_EQUAL(output_generic[i], output_sse4_1[i]);
+  }
+}
+
+#endif
diff --git a/volk/lib/qa_16s_convert_32f_aligned16.h b/volk/lib/qa_16s_convert_32f_aligned16.h
new file mode 100644
index 000000000..ef813d96f
--- /dev/null
+++ b/volk/lib/qa_16s_convert_32f_aligned16.h
@@ -0,0 +1,18 @@
+#ifndef INCLUDED_QA_16S_CONVERT_32F_ALIGNED16_H
+#define INCLUDED_QA_16S_CONVERT_32F_ALIGNED16_H
+
+#include <cppunit/extensions/HelperMacros.h>
+#include <cppunit/TestCase.h>
+
+class qa_16s_convert_32f_aligned16 : public CppUnit::TestCase {
+
+  CPPUNIT_TEST_SUITE (qa_16s_convert_32f_aligned16);
+  CPPUNIT_TEST (t1);
+  CPPUNIT_TEST_SUITE_END ();
+
+ private:
+  void t1 ();
+};
+
+
+#endif /* INCLUDED_QA_16S_CONVERT_32F_ALIGNED16_H */
diff --git a/volk/lib/qa_16s_convert_32f_unaligned16.cc b/volk/lib/qa_16s_convert_32f_unaligned16.cc
new file mode 100644
index 000000000..8c3121e5c
--- /dev/null
+++ b/volk/lib/qa_16s_convert_32f_unaligned16.cc
@@ -0,0 +1,73 @@
+#include <volk/volk_runtime.h>
+#include <volk/volk.h>
+#include <qa_16s_convert_32f_unaligned16.h>
+#include <volk/volk_16s_convert_32f_unaligned16.h>
+#include <cstdlib>
+
+//test for sse2
+
+#ifndef LV_HAVE_SSE
+
+void qa_16s_convert_32f_unaligned16::t1() {
+  printf("sse not available... no test performed\n");
+}
+
+#else
+
+void qa_16s_convert_32f_unaligned16::t1() {
+  
+  volk_runtime_init();
+
+  volk_environment_init();
+  clock_t start, end;
+  double total;
+  const int vlen = 3201;
+  const int ITERS = 100000;
+  int16_t input0[vlen] __attribute__ ((aligned (16)));
+  
+  float output_generic[vlen] __attribute__ ((aligned (16)));
+  float output_sse[vlen] __attribute__ ((aligned (16)));
+  float output_sse4_1[vlen] __attribute__ ((aligned (16)));
+
+  for(int i = 0; i < vlen; ++i) {   
+    input0[i] = ((int16_t)(((float) (rand() - (RAND_MAX/2))) / static_cast<float>((RAND_MAX/2)) * 32768.0));
+  }
+  printf("16s_convert_32f_unaligned\n");
+
+  start = clock();
+  for(int count = 0; count < ITERS; ++count) {
+    volk_16s_convert_32f_unaligned16_manual(output_generic, input0, 32768.0, vlen, "generic");
+  }
+  end = clock();
+  total = (double)(end-start)/(double)CLOCKS_PER_SEC;
+  printf("generic_time: %f\n", total);
+
+  start = clock();
+  for(int count = 0; count < ITERS; ++count) {
+    volk_16s_convert_32f_unaligned16_manual(output_sse, input0, 32768.0, vlen, "sse");
+  }
+  end = clock();
+  total = (double)(end-start)/(double)CLOCKS_PER_SEC;
+  printf("sse_time: %f\n", total);
+
+  start = clock();
+  for(int count = 0; count < ITERS; ++count) {
+    get_volk_runtime()->volk_16s_convert_32f_unaligned16(output_sse4_1, input0, 32768.0, vlen);
+  }
+  end = clock();
+  total = (double)(end-start)/(double)CLOCKS_PER_SEC;
+  printf("sse4_1_time: %f\n", total);
+
+  for(int i = 0; i < 1; ++i) {
+    //printf("inputs: %d, %d\n", input0[i*2], input0[i*2 + 1]);
+    //printf("generic... %d, ssse3... %d\n", output0[i], output1[i]);
+  }
+  
+  for(int i = 0; i < vlen; ++i) {
+    //printf("%d...%d\n", output0[i], output01[i]);
+    CPPUNIT_ASSERT_EQUAL(output_generic[i], output_sse[i]);
+    CPPUNIT_ASSERT_EQUAL(output_generic[i], output_sse4_1[i]);
+  }
+}
+
+#endif
diff --git a/volk/lib/qa_16s_convert_32f_unaligned16.h b/volk/lib/qa_16s_convert_32f_unaligned16.h
new file mode 100644
index 000000000..aeb04f770
--- /dev/null
+++ b/volk/lib/qa_16s_convert_32f_unaligned16.h
@@ -0,0 +1,18 @@
+#ifndef INCLUDED_QA_16S_CONVERT_32F_UNALIGNED16_H
+#define INCLUDED_QA_16S_CONVERT_32F_UNALIGNED16_H
+
+#include <cppunit/extensions/HelperMacros.h>
+#include <cppunit/TestCase.h>
+
+class qa_16s_convert_32f_unaligned16 : public CppUnit::TestCase {
+
+  CPPUNIT_TEST_SUITE (qa_16s_convert_32f_unaligned16);
+  CPPUNIT_TEST (t1);
+  CPPUNIT_TEST_SUITE_END ();
+
+ private:
+  void t1 ();
+};
+
+
+#endif /* INCLUDED_QA_16S_CONVERT_32F_UNALIGNED16_H */
diff --git a/volk/lib/qa_16s_convert_8s_aligned16.cc b/volk/lib/qa_16s_convert_8s_aligned16.cc
new file mode 100644
index 000000000..734b7784e
--- /dev/null
+++ b/volk/lib/qa_16s_convert_8s_aligned16.cc
@@ -0,0 +1,60 @@
+#include <volk/volk.h>
+#include <qa_16s_convert_8s_aligned16.h>
+#include <volk/volk_16s_convert_8s_aligned16.h>
+#include <cstdlib>
+
+//test for sse2
+
+#ifndef LV_HAVE_SSE2
+
+void qa_16s_convert_8s_aligned16::t1() {
+  printf("sse2 not available... no test performed\n");
+}
+
+#else
+
+void qa_16s_convert_8s_aligned16::t1() {
+  
+  volk_environment_init();
+  clock_t start, end;
+  double total;
+  const int vlen = 3201;
+  const int ITERS = 100000;
+  int16_t input0[vlen] __attribute__ ((aligned (16)));
+  
+  int8_t output_generic[vlen] __attribute__ ((aligned (16)));
+  int8_t output_sse2[vlen] __attribute__ ((aligned (16)));
+
+  for(int i = 0; i < vlen; ++i) {   
+    input0[i] = ((int16_t)(((float) (rand() - (RAND_MAX/2))) / static_cast<float>((RAND_MAX/2)) * 32768.0));
+  }
+  printf("16s_convert_8s_aligned\n");
+
+  start = clock();
+  for(int count = 0; count < ITERS; ++count) {
+    volk_16s_convert_8s_aligned16_manual(output_generic, input0, vlen, "generic");
+  }
+  end = clock();
+  total = (double)(end-start)/(double)CLOCKS_PER_SEC;
+  printf("generic_time: %f\n", total);
+
+  start = clock();
+  for(int count = 0; count < ITERS; ++count) {
+    volk_16s_convert_8s_aligned16_manual(output_sse2, input0, vlen, "sse2");
+  }
+  end = clock();
+  total = (double)(end-start)/(double)CLOCKS_PER_SEC;
+  printf("sse2_time: %f\n", total);
+
+  for(int i = 0; i < 1; ++i) {
+    //printf("inputs: %d, %d\n", input0[i*2], input0[i*2 + 1]);
+    //printf("generic... %d, ssse3... %d\n", output0[i], output1[i]);
+  }
+  
+  for(int i = 0; i < vlen; ++i) {
+    //printf("%d -> %d...%d\n", input0[i], output_generic[i], output_sse2[i]);
+    CPPUNIT_ASSERT_EQUAL(output_generic[i], output_sse2[i]);
+  }
+}
+
+#endif
diff --git a/volk/lib/qa_16s_convert_8s_aligned16.h b/volk/lib/qa_16s_convert_8s_aligned16.h
new file mode 100644
index 000000000..2e409d0cc
--- /dev/null
+++ b/volk/lib/qa_16s_convert_8s_aligned16.h
@@ -0,0 +1,18 @@
+#ifndef INCLUDED_QA_16S_CONVERT_8S_ALIGNED16_H
+#define INCLUDED_QA_16S_CONVERT_8S_ALIGNED16_H
+
+#include <cppunit/extensions/HelperMacros.h>
+#include <cppunit/TestCase.h>
+
+class qa_16s_convert_8s_aligned16 : public CppUnit::TestCase {
+
+  CPPUNIT_TEST_SUITE (qa_16s_convert_8s_aligned16);
+  CPPUNIT_TEST (t1);
+  CPPUNIT_TEST_SUITE_END ();
+
+ private:
+  void t1 ();
+};
+
+
+#endif /* INCLUDED_QA_16S_CONVERT_8S_ALIGNED16_H */
diff --git a/volk/lib/qa_16s_convert_8s_unaligned16.cc b/volk/lib/qa_16s_convert_8s_unaligned16.cc
new file mode 100644
index 000000000..275ab7668
--- /dev/null
+++ b/volk/lib/qa_16s_convert_8s_unaligned16.cc
@@ -0,0 +1,60 @@
+#include <volk/volk.h>
+#include <qa_16s_convert_8s_unaligned16.h>
+#include <volk/volk_16s_convert_8s_unaligned16.h>
+#include <cstdlib>
+
+//test for sse2
+
+#ifndef LV_HAVE_SSE2
+
+void qa_16s_convert_8s_unaligned16::t1() {
+  printf("sse2 not available... no test performed\n");
+}
+
+#else
+
+void qa_16s_convert_8s_unaligned16::t1() {
+  
+  volk_environment_init();
+  clock_t start, end;
+  double total;
+  const int vlen = 3201;
+  const int ITERS = 100000;
+  int16_t input0[vlen] __attribute__ ((aligned (16)));
+  
+  int8_t output_generic[vlen] __attribute__ ((aligned (16)));
+  int8_t output_sse2[vlen] __attribute__ ((aligned (16)));
+
+  for(int i = 0; i < vlen; ++i) {   
+    input0[i] = ((int16_t)(((float) (rand() - (RAND_MAX/2))) / static_cast<float>((RAND_MAX/2)) * 32768.0));
+  }
+  printf("16s_convert_8s_unaligned\n");
+
+  start = clock();
+  for(int count = 0; count < ITERS; ++count) {
+    volk_16s_convert_8s_unaligned16_manual(output_generic, input0, vlen, "generic");
+  }
+  end = clock();
+  total = (double)(end-start)/(double)CLOCKS_PER_SEC;
+  printf("generic_time: %f\n", total);
+
+  start = clock();
+  for(int count = 0; count < ITERS; ++count) {
+    volk_16s_convert_8s_unaligned16_manual(output_sse2, input0, vlen, "sse2");
+  }
+  end = clock();
+  total = (double)(end-start)/(double)CLOCKS_PER_SEC;
+  printf("sse2_time: %f\n", total);
+
+  for(int i = 0; i < 1; ++i) {
+    //printf("inputs: %d, %d\n", input0[i*2], input0[i*2 + 1]);
+    //printf("generic... %d, ssse3... %d\n", output0[i], output1[i]);
+  }
+  
+  for(int i = 0; i < vlen; ++i) {
+    //printf("%d...%d\n", output0[i], output01[i]);
+    CPPUNIT_ASSERT_EQUAL(output_generic[i], output_sse2[i]);
+  }
+}
+
+#endif
diff --git a/volk/lib/qa_16s_convert_8s_unaligned16.h b/volk/lib/qa_16s_convert_8s_unaligned16.h
new file mode 100644
index 000000000..4b2fe9e42
--- /dev/null
+++ b/volk/lib/qa_16s_convert_8s_unaligned16.h
@@ -0,0 +1,18 @@
+#ifndef INCLUDED_QA_16S_CONVERT_8S_UNALIGNED16_H
+#define INCLUDED_QA_16S_CONVERT_8S_UNALIGNED16_H
+
+#include <cppunit/extensions/HelperMacros.h>
+#include <cppunit/TestCase.h>
+
+class qa_16s_convert_8s_unaligned16 : public CppUnit::TestCase {
+
+  CPPUNIT_TEST_SUITE (qa_16s_convert_8s_unaligned16);
+  CPPUNIT_TEST (t1);
+  CPPUNIT_TEST_SUITE_END ();
+
+ private:
+  void t1 ();
+};
+
+
+#endif /* INCLUDED_QA_16S_CONVERT_8S_UNALIGNED16_H */
diff --git a/volk/lib/qa_16s_max_star_aligned16.cc b/volk/lib/qa_16s_max_star_aligned16.cc
new file mode 100644
index 000000000..b46b9ae8e
--- /dev/null
+++ b/volk/lib/qa_16s_max_star_aligned16.cc
@@ -0,0 +1,65 @@
+#include <volk/volk.h>
+#include <qa_16s_max_star_aligned16.h>
+#include <volk/volk_16s_max_star_aligned16.h>
+#include <cstdlib>
+#include <time.h>
+//test for ssse3
+
+#ifndef LV_HAVE_SSSE3
+
+void qa_16s_max_star_aligned16::t1() {
+  printf("ssse3 not available... no test performed\n");
+}
+
+#else
+
+
+
+void qa_16s_max_star_aligned16::t1() {
+  
+  volk_environment_init();
+  clock_t start, end;
+  double total;
+  const int vlen = 6400;
+  const int ITERS = 100000;
+  short input0[vlen] __attribute__ ((aligned (16)));
+  short output0[1] __attribute__ ((aligned (16)));
+
+  short output1[1] __attribute__ ((aligned (16)));
+
+  for(int i = 0; i < vlen; ++i) {
+    short plus0 = ((short) (rand() - (RAND_MAX/2))) >> 2;
+
+    short minus0 = ((short) (rand() - (RAND_MAX/2))) >> 2;
+    
+    input0[i] = plus0 - minus0;
+    
+  }
+  printf("16s_max_star_aligned\n");
+
+  start = clock();
+  for(int count = 0; count < ITERS; ++count) {
+    volk_16s_max_star_aligned16_manual(output0, input0, vlen << 1, "generic");
+  }
+  end = clock();
+  total = (double)(end-start)/(double)CLOCKS_PER_SEC;
+  printf("generic_time: %f\n", total);
+  start = clock();
+  for(int count = 0; count < ITERS; ++count) {
+    volk_16s_max_star_aligned16_manual(output1, input0, vlen << 1, "ssse3");
+  }
+  end = clock();
+  total = (double)(end-start)/(double)CLOCKS_PER_SEC;
+  printf("ssse3_time: %f\n", total);
+  for(int i = 0; i < 1; ++i) {
+    //printf("inputs: %d, %d\n", input0[i*2], input0[i*2 + 1]);
+    //printf("generic... %d, ssse3... %d\n", output0[i], output1[i]);
+  }
+  
+  for(int i = 0; i < 1; ++i) {
+    
+    CPPUNIT_ASSERT_EQUAL(output0[i], output1[i]);
+  }
+}
+
+#endif
diff --git a/volk/lib/qa_16s_max_star_aligned16.h b/volk/lib/qa_16s_max_star_aligned16.h
new file mode 100644
index 000000000..119f87c4d
--- /dev/null
+++ b/volk/lib/qa_16s_max_star_aligned16.h
@@ -0,0 +1,18 @@
+#ifndef INCLUDED_QA_16S_MAX_STAR_ALIGNED16_H
+#define INCLUDED_QA_16S_MAX_STAR_ALIGNED16_H
+
+#include <cppunit/extensions/HelperMacros.h>
+#include <cppunit/TestCase.h>
+
+class qa_16s_max_star_aligned16 : public CppUnit::TestCase {
+
+  CPPUNIT_TEST_SUITE (qa_16s_max_star_aligned16);
+  CPPUNIT_TEST (t1);
+  CPPUNIT_TEST_SUITE_END ();
+
+ private:
+  void t1 ();
+};
+
+
+#endif /* INCLUDED_QA_16S_MAX_STAR_ALIGNED16_H */
diff --git a/volk/lib/qa_16s_max_star_horizontal_aligned16.cc b/volk/lib/qa_16s_max_star_horizontal_aligned16.cc
new file mode 100644
index 000000000..4d44735df
--- /dev/null
+++ b/volk/lib/qa_16s_max_star_horizontal_aligned16.cc
@@ -0,0 +1,79 @@
+#include <volk/volk_runtime.h>
+#include <volk/volk.h>
+#include <qa_16s_max_star_horizontal_aligned16.h>
+#include <volk/volk_16s_max_star_horizontal_aligned16.h>
+#include <cstdlib>
+#include <time.h>
+//test for ssse3
+
+#ifndef LV_HAVE_SSSE3
+
+void qa_16s_max_star_horizontal_aligned16::t1() {
+  printf("ssse3 not available... no test performed\n");
+}
+
+#else
+
+
+void qa_16s_max_star_horizontal_aligned16::t1() {
+
+  
+  volk_runtime_init();
+  
+  volk_environment_init();
+  clock_t start, end;
+  double total;
+  const int vlen = 32;
+  const int ITERS = 1;
+  short input0[vlen] __attribute__ ((aligned (16)));
+  short output0[vlen>>1] __attribute__ ((aligned (16)));
+
+  short output1[vlen>>1] __attribute__ ((aligned (16)));
+
+  for(int i = 0; i < vlen; ++i) {
+    short plus0 = ((short) (rand() - (RAND_MAX/2)));
+    
+    short minus0 = ((short) (rand() - (RAND_MAX/2)));
+    
+    input0[i] = plus0 - minus0;
+    
+  }
+  printf("16s_max_star_horizontal_aligned\n");
+  
+  start = clock();
+  for(int count = 0; count < ITERS; ++count) {
+    volk_16s_max_star_horizontal_aligned16_manual(output0, input0, 2*vlen, "generic");
+    volk_16s_max_star_horizontal_aligned16_manual(output0, output0, vlen, "generic");
+    volk_16s_max_star_horizontal_aligned16_manual(output0, output0, vlen/2, "generic");
+  }
+  end = clock();
+  total = (double)(end-start)/(double)CLOCKS_PER_SEC;
+  printf("generic_time: %f\n", total);
+  start = clock();
+  for(int count = 0; count < ITERS; ++count) {
+
+    get_volk_runtime()->volk_16s_max_star_horizontal_aligned16(output1, input0, 2*vlen);
+    get_volk_runtime()->volk_16s_max_star_horizontal_aligned16(output1, output1, vlen);
+    get_volk_runtime()->volk_16s_max_star_horizontal_aligned16(output1, output1, vlen);
+    /*    volk_16s_max_star_horizontal_aligned16(output1, input0, 2*vlen, "ssse3");
+    volk_16s_max_star_horizontal_aligned16(output1, output1, vlen, "ssse3");
+    volk_16s_max_star_horizontal_aligned16(output1, output1, vlen, "ssse3");*/
+  }
+  end = clock();
+  total = (double)(end-start)/(double)CLOCKS_PER_SEC;
+  printf("ssse3_time: %f\n", total);
+  
+  for(int i = 0; i < (vlen >> 1); ++i) {
+    //    printf("inputs: %d, %d\n", input0[i*2], input0[i*2 + 1]);
+    //printf("generic... %d, ssse3... %d\n", output0[i], output1[i]);
+    
+  }
+  for(int i = 0; i < (vlen >> 1); ++i) {
+      
+      CPPUNIT_ASSERT_EQUAL(output0[i], output1[i]);
+    }
+	}
+   
+  
+#endif
+	
diff --git a/volk/lib/qa_16s_max_star_horizontal_aligned16.h b/volk/lib/qa_16s_max_star_horizontal_aligned16.h
new file mode 100644
index 000000000..9f9757253
--- /dev/null
+++ b/volk/lib/qa_16s_max_star_horizontal_aligned16.h
@@ -0,0 +1,18 @@
+#ifndef INCLUDED_QA_16S_MAX_STAR_HORIZONTAL_ALIGNED16_H
+#define INCLUDED_QA_16S_MAX_STAR_HORIZONTAL_ALIGNED16_H
+
+#include <cppunit/extensions/HelperMacros.h>
+#include <cppunit/TestCase.h>
+
+class qa_16s_max_star_horizontal_aligned16 : public CppUnit::TestCase {
+
+  CPPUNIT_TEST_SUITE (qa_16s_max_star_horizontal_aligned16);
+  CPPUNIT_TEST (t1);
+  CPPUNIT_TEST_SUITE_END ();
+
+ private:
+  void t1 ();
+};
+
+
+#endif /* INCLUDED_QA_16S_MAX_STAR_HORIZONTAL_ALIGNED16_H */
diff --git a/volk/lib/qa_16s_permute_and_scalar_add_aligned16.cc b/volk/lib/qa_16s_permute_and_scalar_add_aligned16.cc
new file mode 100644
index 000000000..3c4f5c6cc
--- /dev/null
+++ b/volk/lib/qa_16s_permute_and_scalar_add_aligned16.cc
@@ -0,0 +1,78 @@
+#include <volk/volk.h>
+#include <qa_16s_permute_and_scalar_add_aligned16.h>
+#include <volk/volk_16s_permute_and_scalar_add_aligned16.h>
+#include <cstdlib>
+#include <time.h>
+
+//test for sse2
+
+#ifndef LV_HAVE_SSE2
+
+void qa_16s_permute_and_scalar_add_aligned16::t1() {
+  printf("sse2 not available... no test performed\n");
+}
+
+#else
+
+void qa_16s_permute_and_scalar_add_aligned16::t1() {
+  const int vlen = 64;
+  
+  unsigned int num_bytes = vlen << 1;
+
+  volk_environment_init();
+  clock_t start, end;
+  double total;
+  
+  short target[vlen] __attribute__ ((aligned (16)));
+  short target2[vlen] __attribute__ ((aligned (16)));
+  short src0[vlen] __attribute__ ((aligned (16)));
+  short permute_indexes[vlen] __attribute__ ((aligned (16)));
+  short cntl0[vlen] __attribute__ ((aligned (16)));
+  short cntl1[vlen] __attribute__ ((aligned (16)));
+  short cntl2[vlen] __attribute__ ((aligned (16)));
+  short cntl3[vlen] __attribute__ ((aligned (16)));
+  short scalars[4] __attribute__ ((aligned (16))) = {1, 2, 3, 4};
+
+  for(int i = 0; i < vlen; ++i) {
+    src0[i] = i;
+    permute_indexes[i] = (3 * i)%vlen;
+    cntl0[i] = 0xff;
+    cntl1[i] = 0xff * (i%2);
+    cntl2[i] = 0xff * ((i>>1)%2);
+    cntl3[i] = 0xff * ((i%4) == 3);
+  }
+
+  printf("16s_permute_and_scalar_add_aligned\n");
+  
+  start = clock();
+  for(int i = 0; i < 100000; ++i) {
+    volk_16s_permute_and_scalar_add_aligned16_manual(target, src0, permute_indexes, cntl0, cntl1, cntl2, cntl3, scalars, num_bytes, "generic");
+  }
+  end = clock();
+
+  total = (double)(end-start)/(double)CLOCKS_PER_SEC;
+
+  printf("generic_time: %f\n", total);
+  
+  start = clock();
+  for(int i = 0; i < 100000; ++i) {
+    volk_16s_permute_and_scalar_add_aligned16_manual(target2, src0, permute_indexes, cntl0, cntl1, cntl2, cntl3, scalars, num_bytes, "sse2");
+  }
+  end = clock();
+  
+  total = (double)(end-start)/(double)CLOCKS_PER_SEC;
+
+  printf("sse2_time: %f\n", total);
+  
+  
+  for(int i = 0; i < vlen; ++i) {
+    //printf("generic... %d, sse2... %d\n", target[i], target2[i]);
+  }
+  
+  for(int i = 0; i < vlen; ++i) {
+    
+    CPPUNIT_ASSERT(target[i] == target2[i]);
+  }
+}
+
+#endif
diff --git a/volk/lib/qa_16s_permute_and_scalar_add_aligned16.h b/volk/lib/qa_16s_permute_and_scalar_add_aligned16.h
new file mode 100644
index 000000000..3643aeef6
--- /dev/null
+++ b/volk/lib/qa_16s_permute_and_scalar_add_aligned16.h
@@ -0,0 +1,18 @@
+#ifndef INCLUDED_QA_16S_PERMUTE_AND_SCALAR_ADD_ALIGNED16_H
+#define INCLUDED_QA_16S_PERMUTE_AND_SCALAR_ADD_ALIGNED16_H
+
+#include <cppunit/extensions/HelperMacros.h>
+#include <cppunit/TestCase.h>
+
+class qa_16s_permute_and_scalar_add_aligned16 : public CppUnit::TestCase {
+
+  CPPUNIT_TEST_SUITE (qa_16s_permute_and_scalar_add_aligned16);
+  CPPUNIT_TEST (t1);
+  CPPUNIT_TEST_SUITE_END ();
+
+ private:
+  void t1 ();
+};
+
+
+#endif /* INCLUDED_QA_16S_PERMUTE_AND_SCALAR_ADD_ALIGNED16_H */
diff --git a/volk/lib/qa_16s_quad_max_star_aligned16.cc b/volk/lib/qa_16s_quad_max_star_aligned16.cc
new file mode 100644
index 000000000..80a220c93
--- /dev/null
+++ b/volk/lib/qa_16s_quad_max_star_aligned16.cc
@@ -0,0 +1,59 @@
+#include <volk/volk.h>
+#include <qa_16s_quad_max_star_aligned16.h>
+#include <volk/volk_16s_quad_max_star_aligned16.h>
+#include <cstdlib>
+
+//test for sse2
+
+#ifndef LV_HAVE_SSE2
+
+void qa_16s_quad_max_star_aligned16::t1() {
+  printf("sse2 not available... no test performed\n");
+}
+
+#else
+
+void qa_16s_quad_max_star_aligned16::t1() {
+  const int vlen = 34;
+  
+  short input0[vlen] __attribute__ ((aligned (16)));
+  short input1[vlen] __attribute__ ((aligned (16)));
+  short input2[vlen] __attribute__ ((aligned (16)));
+  short input3[vlen] __attribute__ ((aligned (16)));
+
+  short output0[vlen] __attribute__ ((aligned (16)));
+  short output1[vlen] __attribute__ ((aligned (16)));
+
+  for(int i = 0; i < vlen; ++i) {
+    short plus0 = (short) (rand() - (RAND_MAX/2));
+    short plus1 = (short) (rand() - (RAND_MAX/2));
+    short plus2 = (short) (rand() - (RAND_MAX/2));
+    short plus3 = (short) (rand() - (RAND_MAX/2));
+
+    short minus0 = (short) (rand() - (RAND_MAX/2));
+    short minus1 = (short) (rand() - (RAND_MAX/2));
+    short minus2 = (short) (rand() - (RAND_MAX/2));
+    short minus3 = (short) (rand() - (RAND_MAX/2));
+
+    input0[i] = plus0 - minus0;
+    input1[i] = plus1 - minus1;
+    input2[i] = plus2 - minus2;
+    input3[i] = plus3 - minus3;
+  }
+
+  volk_16s_quad_max_star_aligned16_manual(output0, input0, input1, input2, input3, 2*vlen, "generic");
+
+  volk_16s_quad_max_star_aligned16_manual(output1, input0, input1, input2, input3, 2*vlen, "sse2");
+
+  printf("16s_quad_max_star_aligned\n");
+  for(int i = 0; i < vlen; ++i) {
+    printf("generic... %d, sse2... %d, inputs: %d, %d, %d, %d\n", output0[i], output1[i], input0[i], input1[i], input2[i], input3[i]);
+  }
+  
+  for(int i = 0; i < vlen; ++i) {
+    
+    CPPUNIT_ASSERT_EQUAL(output0[i], output1[i]);
+  }
+}
+
+#endif
diff --git a/volk/lib/qa_16s_quad_max_star_aligned16.h b/volk/lib/qa_16s_quad_max_star_aligned16.h
new file mode 100644
index 000000000..51e77081a
--- /dev/null
+++ b/volk/lib/qa_16s_quad_max_star_aligned16.h
@@ -0,0 +1,18 @@
+#ifndef INCLUDED_QA_16S_QUAD_MAX_STAR_ALIGNED16_H
+#define INCLUDED_QA_16S_QUAD_MAX_STAR_ALIGNED16_H
+
+#include <cppunit/extensions/HelperMacros.h>
+#include <cppunit/TestCase.h>
+
+class qa_16s_quad_max_star_aligned16 : public CppUnit::TestCase {
+
+  CPPUNIT_TEST_SUITE (qa_16s_quad_max_star_aligned16);
+  CPPUNIT_TEST (t1);
+  CPPUNIT_TEST_SUITE_END ();
+
+ private:
+  void t1 ();
+};
+
+
+#endif /* INCLUDED_QA_16S_QUAD_MAX_STAR_ALIGNED16_H */
diff --git a/volk/lib/qa_16sc_deinterleave_16s_aligned16.cc b/volk/lib/qa_16sc_deinterleave_16s_aligned16.cc
new file mode 100644
index 000000000..e700ac72c
--- /dev/null
+++ b/volk/lib/qa_16sc_deinterleave_16s_aligned16.cc
@@ -0,0 +1,76 @@
+#include <volk/volk.h>
+#include <qa_16sc_deinterleave_16s_aligned16.h>
+#include <volk/volk_16sc_deinterleave_16s_aligned16.h>
+#include <cstdlib>
+
+//test for sse
+
+#ifndef LV_HAVE_SSSE3
+
+void qa_16sc_deinterleave_16s_aligned16::t1() {
+  printf("ssse3 not available... no test performed\n");
+}
+
+#else
+
+void qa_16sc_deinterleave_16s_aligned16::t1() {
+  
+  volk_environment_init();
+  clock_t start, end;
+  double total;
+  const int vlen = 3201;
+  const int ITERS = 100000;
+  std::complex<int16_t> input0[vlen] __attribute__ ((aligned (16)));
+  
+  int16_t output_generic[vlen] __attribute__ ((aligned (16)));
+  int16_t output_generic1[vlen] __attribute__ ((aligned (16)));
+  int16_t output_sse2[vlen] __attribute__ ((aligned (16)));
+  int16_t output_sse21[vlen] __attribute__ ((aligned (16)));
+  int16_t output_ssse3[vlen] __attribute__ ((aligned (16)));
+  int16_t output_ssse31[vlen] __attribute__ ((aligned (16)));
+
+  int16_t* loadInput = (int16_t*)input0;
+  for(int i = 0; i < vlen*2; ++i) {   
+    loadInput[i] = ((int16_t)((((float) (rand() - (RAND_MAX/2))) / static_cast<float>((RAND_MAX/2))) * 32678.0));
+  }
+  printf("16sc_deinterleave_16s_aligned\n");
+
+  start = clock();
+  for(int count = 0; count < ITERS; ++count) {
+    volk_16sc_deinterleave_16s_aligned16_manual(output_generic, output_generic1, input0, vlen, "generic");
+  }
+  end = clock();
+  total = (double)(end-start)/(double)CLOCKS_PER_SEC;
+  printf("generic_time: %f\n", total);
+  start = clock();
+  for(int count = 0; count < ITERS; ++count) {
+    volk_16sc_deinterleave_16s_aligned16_manual(output_sse2, output_sse21, input0, vlen, "sse2");
+  }
+  end = clock();
+  total = (double)(end-start)/(double)CLOCKS_PER_SEC;
+  printf("sse2_time: %f\n", total);
+
+  start = clock();
+  for(int count = 0; count < ITERS; ++count) {
+    volk_16sc_deinterleave_16s_aligned16_manual(output_ssse3, output_ssse31, input0, vlen, "ssse3");
+  }
+  end = clock();
+  total = (double)(end-start)/(double)CLOCKS_PER_SEC;
+  printf("ssse3_time: %f\n", total);
+
+  for(int i = 0; i < 1; ++i) {
+    //printf("inputs: %d, %d\n", input0[i*2], input0[i*2 + 1]);
+    //printf("generic... %d, ssse3... %d\n", output0[i], output1[i]);
+  }
+  
+  for(int i = 0; i < vlen; ++i) {
+    //printf("%d...%d\n", output0[i], output01[i]);
+    CPPUNIT_ASSERT_EQUAL(output_generic[i],  output_sse2[i]);
+    CPPUNIT_ASSERT_EQUAL(output_generic1[i],  output_sse21[i]);
+
+    CPPUNIT_ASSERT_EQUAL(output_generic[i],  output_ssse3[i]);
+    CPPUNIT_ASSERT_EQUAL(output_generic1[i],  output_ssse31[i]);
+  }
+}
+
+#endif
diff --git a/volk/lib/qa_16sc_deinterleave_16s_aligned16.h b/volk/lib/qa_16sc_deinterleave_16s_aligned16.h
new file mode 100644
index 000000000..995ab5b34
--- /dev/null
+++ b/volk/lib/qa_16sc_deinterleave_16s_aligned16.h
@@ -0,0 +1,18 @@
+#ifndef INCLUDED_QA_16SC_DEINTERLEAVE_16S_ALIGNED16_H
+#define INCLUDED_QA_16SC_DEINTERLEAVE_16S_ALIGNED16_H
+
+#include <cppunit/extensions/HelperMacros.h>
+#include <cppunit/TestCase.h>
+
+class qa_16sc_deinterleave_16s_aligned16 : public CppUnit::TestCase {
+
+  CPPUNIT_TEST_SUITE (qa_16sc_deinterleave_16s_aligned16);
+  CPPUNIT_TEST (t1);
+  CPPUNIT_TEST_SUITE_END ();
+
+ private:
+  void t1 ();
+};
+
+
+#endif /* INCLUDED_QA_16SC_DEINTERLEAVE_16S_ALIGNED16_H */
diff --git a/volk/lib/qa_16sc_deinterleave_32f_aligned16.cc b/volk/lib/qa_16sc_deinterleave_32f_aligned16.cc
new file mode 100644
index 000000000..6ee076998
--- /dev/null
+++ b/volk/lib/qa_16sc_deinterleave_32f_aligned16.cc
@@ -0,0 +1,63 @@
+#include <volk/volk.h>
+#include <qa_16sc_deinterleave_32f_aligned16.h>
+#include <volk/volk_16sc_deinterleave_32f_aligned16.h>
+#include <cstdlib>
+
+//test for sse
+
+#ifndef LV_HAVE_SSE2
+
+void qa_16sc_deinterleave_32f_aligned16::t1() {
+  printf("sse2 not available... no test performed\n");
+}
+
+#else
+
+void qa_16sc_deinterleave_32f_aligned16::t1() {
+  
+  volk_environment_init();
+  clock_t start, end;
+  double total;
+  const int vlen = 3201;
+  const int ITERS = 100000;
+  std::complex<int16_t> input0[vlen] __attribute__ ((aligned (16)));
+  
+  float output_generic[vlen] __attribute__ ((aligned (16)));
+  float output_generic1[vlen] __attribute__ ((aligned (16)));
+  float output_sse2[vlen] __attribute__ ((aligned (16)));
+  float output_sse21[vlen] __attribute__ ((aligned (16)));
+
+  int16_t* loadInput = (int16_t*)input0;
+  for(int i = 0; i < vlen*2; ++i) {   
+    loadInput[i] =((int16_t)((((float) (rand() - (RAND_MAX/2))) / static_cast<float>((RAND_MAX/2))) * 32768.0));
+  }
+  printf("16sc_deinterleave_32f_aligned\n");
+
+  start = clock();
+  for(int count = 0; count < ITERS; ++count) {
+    volk_16sc_deinterleave_32f_aligned16_manual(output_generic, output_generic1, input0, 32768.0, vlen, "generic");
+  }
+  end = clock();
+  total = (double)(end-start)/(double)CLOCKS_PER_SEC;
+  printf("generic_time: %f\n", total);
+  start = clock();
+  for(int count = 0; count < ITERS; ++count) {
+    volk_16sc_deinterleave_32f_aligned16_manual(output_sse2, output_sse21, input0, 32768.0, vlen, "sse");
+  }
+  end = clock();
+  total = (double)(end-start)/(double)CLOCKS_PER_SEC;
+  printf("sse_time: %f\n", total);
+
+  for(int i = 0; i < 1; ++i) {
+    //printf("inputs: %d, %d\n", input0[i*2], input0[i*2 + 1]);
+    //printf("generic... %d, ssse3... %d\n", output0[i], output1[i]);
+  }
+  
+  for(int i = 0; i < vlen; ++i) {
+    //printf("%d...%d\n", output0[i], output01[i]);
+    CPPUNIT_ASSERT_DOUBLES_EQUAL(output_generic[i], output_sse2[i], fabs(output_generic[i])*1e-4);
+    CPPUNIT_ASSERT_DOUBLES_EQUAL(output_generic1[i],  output_sse21[i], fabs(output_generic1[i])*1e-4);
+  }
+}
+
+#endif
diff --git a/volk/lib/qa_16sc_deinterleave_32f_aligned16.h b/volk/lib/qa_16sc_deinterleave_32f_aligned16.h
new file mode 100644
index 000000000..fea3b6c2d
--- /dev/null
+++ b/volk/lib/qa_16sc_deinterleave_32f_aligned16.h
@@ -0,0 +1,18 @@
+#ifndef INCLUDED_QA_16SC_DEINTERLEAVE_32F_ALIGNED16_H
+#define INCLUDED_QA_16SC_DEINTERLEAVE_32F_ALIGNED16_H
+
+#include <cppunit/extensions/HelperMacros.h>
+#include <cppunit/TestCase.h>
+
+class qa_16sc_deinterleave_32f_aligned16 : public CppUnit::TestCase {
+
+  CPPUNIT_TEST_SUITE (qa_16sc_deinterleave_32f_aligned16);
+  CPPUNIT_TEST (t1);
+  CPPUNIT_TEST_SUITE_END ();
+
+ private:
+  void t1 ();
+};
+
+
+#endif /* INCLUDED_QA_16SC_DEINTERLEAVE_32F_ALIGNED16_H */
diff --git a/volk/lib/qa_16sc_deinterleave_real_16s_aligned16.cc b/volk/lib/qa_16sc_deinterleave_real_16s_aligned16.cc
new file mode 100644
index 000000000..ca048ea67
--- /dev/null
+++ b/volk/lib/qa_16sc_deinterleave_real_16s_aligned16.cc
@@ -0,0 +1,71 @@
+#include <volk/volk.h>
+#include <qa_16sc_deinterleave_real_16s_aligned16.h>
+#include <volk/volk_16sc_deinterleave_real_16s_aligned16.h>
+#include <cstdlib>
+
+//test for sse
+
+#ifndef LV_HAVE_SSSE3
+
+void qa_16sc_deinterleave_real_16s_aligned16::t1() {
+  printf("ssse3 not available... no test performed\n");
+}
+
+#else
+
+void qa_16sc_deinterleave_real_16s_aligned16::t1() {
+  
+  volk_environment_init();
+  clock_t start, end;
+  double total;
+  const int vlen = 3201;
+  const int ITERS = 100000;
+  std::complex<int16_t> input0[vlen] __attribute__ ((aligned (16)));
+  
+  int16_t output_generic[vlen] __attribute__ ((aligned (16)));
+  int16_t output_sse2[vlen] __attribute__ ((aligned (16)));
+  int16_t output_ssse3[vlen] __attribute__ ((aligned (16)));
+
+  int16_t* loadInput = (int16_t*)input0;
+  for(int i = 0; i < vlen*2; ++i) {   
+    loadInput[i] = ((int16_t)((((float) (rand() - (RAND_MAX/2))) / static_cast<float>((RAND_MAX/2))) * 32678.0));
+  }
+  printf("16sc_deinterleave_real_16s_aligned\n");
+
+  start = clock();
+  for(int count = 0; count < ITERS; ++count) {
+    volk_16sc_deinterleave_real_16s_aligned16_manual(output_generic, input0, vlen, "generic");
+  }
+  end = clock();
+  total = (double)(end-start)/(double)CLOCKS_PER_SEC;
+  printf("generic_time: %f\n", total);
+
+  start = clock();
+  for(int count = 0; count < ITERS; ++count) {
+    volk_16sc_deinterleave_real_16s_aligned16_manual(output_sse2, input0, vlen, "sse2");
+  }
+  end = clock();
+  total = (double)(end-start)/(double)CLOCKS_PER_SEC;
+  printf("sse2_time: %f\n", total);
+
+  start = clock();
+  for(int count = 0; count < ITERS; ++count) {
+    volk_16sc_deinterleave_real_16s_aligned16_manual(output_ssse3, input0, vlen, "ssse3");
+  }
+  end = clock();
+  total = (double)(end-start)/(double)CLOCKS_PER_SEC;
+  printf("ssse3_time: %f\n", total);
+
+  for(int i = 0; i < vlen; ++i) {
+    //printf("inputs: %d, %d\n", input0[i*2], input0[i*2 + 1]);
+    // printf("%d = generic... %d, sse2... %d, ssse3... %d\n", i, output_generic[i], output_sse2[i], output_ssse3[i]);
+  }
+  
+  for(int i = 0; i < vlen; ++i) {
+    //printf("%d...%d\n", output0[i], output01[i]);
+    CPPUNIT_ASSERT_DOUBLES_EQUAL(output_generic[i], output_sse2[i], fabs(output_generic[i])*1e-4);
+    CPPUNIT_ASSERT_DOUBLES_EQUAL(output_generic[i], output_ssse3[i], fabs(output_generic[i])*1e-4);
+  }
+}
+
+#endif
diff --git a/volk/lib/qa_16sc_deinterleave_real_16s_aligned16.h b/volk/lib/qa_16sc_deinterleave_real_16s_aligned16.h
new file mode 100644
index 000000000..ebb70b97a
--- /dev/null
+++ b/volk/lib/qa_16sc_deinterleave_real_16s_aligned16.h
@@ -0,0 +1,18 @@
+#ifndef INCLUDED_QA_16SC_DEINTERLEAVE_REAL_16S_ALIGNED16_H
+#define INCLUDED_QA_16SC_DEINTERLEAVE_REAL_16S_ALIGNED16_H
+
+#include <cppunit/extensions/HelperMacros.h>
+#include <cppunit/TestCase.h>
+
+class qa_16sc_deinterleave_real_16s_aligned16 : public CppUnit::TestCase {
+
+  CPPUNIT_TEST_SUITE (qa_16sc_deinterleave_real_16s_aligned16);
+  CPPUNIT_TEST (t1);
+  CPPUNIT_TEST_SUITE_END ();
+
+ private:
+  void t1 ();
+};
+
+
+#endif /* INCLUDED_QA_16SC_DEINTERLEAVE_REAL_16S_ALIGNED16_H */
diff --git a/volk/lib/qa_16sc_deinterleave_real_32f_aligned16.cc b/volk/lib/qa_16sc_deinterleave_real_32f_aligned16.cc
new file mode 100644
index 000000000..0f4ba6923
--- /dev/null
+++ b/volk/lib/qa_16sc_deinterleave_real_32f_aligned16.cc
@@ -0,0 +1,123 @@
+#include <volk/volk_runtime.h>
+#include <volk/volk.h>
+#include <qa_16sc_deinterleave_real_32f_aligned16.h>
+#include <volk/volk_16sc_deinterleave_real_32f_aligned16.h>
+#include <cstdlib>
+
+//test for sse
+
+#ifndef LV_HAVE_SSE4_1
+
+#ifndef LV_HAVE_SSE
+
+void qa_16sc_deinterleave_real_32f_aligned16::t1() {
+  printf("sse not available... no test performed\n");
+}
+
+#else
+
+void qa_16sc_deinterleave_real_32f_aligned16::t1() {
+  
+  volk_environment_init();
+  clock_t start, end;
+  double total;
+  const int vlen = 3201;
+  const int ITERS = 100000;
+  std::complex<int16_t> input0[vlen] __attribute__ ((aligned (16)));
+  
+  float output_generic[vlen] __attribute__ ((aligned (16)));
+  float output_sse[vlen] __attribute__ ((aligned (16)));
+
+  int16_t* loadInput = (int16_t*)input0;
+  for(int i = 0; i < vlen*2; ++i) {   
+    loadInput[i] =((int16_t)((((float) (rand() - (RAND_MAX/2))) / static_cast<float>((RAND_MAX/2))) * 32768.0));
+  }
+  printf("16sc_deinterleave_real_32f_aligned\n");
+
+  start = clock();
+  for(int count = 0; count < ITERS; ++count) {
+    volk_16sc_deinterleave_real_32f_aligned16_manual(output_generic, input0, 32768.0, vlen, "generic");
+  }
+  end = clock();
+  total = (double)(end-start)/(double)CLOCKS_PER_SEC;
+  printf("generic_time: %f\n", total);
+  start = clock();
+  for(int count = 0; count < ITERS; ++count) {
+    volk_16sc_deinterleave_real_32f_aligned16_manual(output_sse, input0, 32768.0, vlen, "sse");
+  }
+  end = clock();
+  total = (double)(end-start)/(double)CLOCKS_PER_SEC;
+  printf("sse_time: %f\n", total);
+
+  for(int i = 0; i < 1; ++i) {
+    //printf("inputs: %d, %d\n", input0[i*2], input0[i*2 + 1]);
+    //printf("generic... %d, ssse3... %d\n", output0[i], output1[i]);
+  }
+  
+  for(int i = 0; i < vlen; ++i) {
+    //printf("%d...%d\n", output0[i], output01[i]);
+    CPPUNIT_ASSERT_DOUBLES_EQUAL(output_generic[i], output_sse[i], fabs(output_generic[i])*1e-4);
+  }
+}
+
+#endif /* SSE */
+
+#else
+
+void qa_16sc_deinterleave_real_32f_aligned16::t1() {
+  
+  volk_runtime_init();
+
+  volk_environment_init();
+  clock_t start, end;
+  double total;
+  const int vlen = 3201;
+  const int ITERS = 100000;
+  std::complex<int16_t> input0[vlen] __attribute__ ((aligned (16)));
+  
+  float output_generic[vlen] __attribute__ ((aligned (16)));
+  float output_sse[vlen] __attribute__ ((aligned (16)));
+  float output_sse4_1[vlen] __attribute__ ((aligned (16)));
+
+  int16_t* loadInput = (int16_t*)input0;
+  for(int i = 0; i < vlen*2; ++i) {   
+    loadInput[i] =((int16_t)(((float) (rand() - (RAND_MAX/2))) / static_cast<float>((RAND_MAX/2))) * 32768.0);
+  }
+  printf("16sc_deinterleave_real_32f_aligned\n");
+
+  start = clock();
+  for(int count = 0; count < ITERS; ++count) {
+    volk_16sc_deinterleave_real_32f_aligned16_manual(output_generic, input0, 32768.0, vlen, "generic");
+  }
+  end = clock();
+  total = (double)(end-start)/(double)CLOCKS_PER_SEC;
+  printf("generic_time: %f\n", total);
+  start = clock();
+  for(int count = 0; count < ITERS; ++count) {
+    volk_16sc_deinterleave_real_32f_aligned16_manual(output_sse, input0, 32768.0, vlen, "sse");
+  }
+  end = clock();
+  total = (double)(end-start)/(double)CLOCKS_PER_SEC;
+  printf("sse_time: %f\n", total);
+
+  start = clock();
+  for(int count = 0; count < ITERS; ++count) {
+    get_volk_runtime()->volk_16sc_deinterleave_real_32f_aligned16(output_sse4_1, input0, 32768.0, vlen);
+  }
+  end = clock();
+  total = (double)(end-start)/(double)CLOCKS_PER_SEC;
+  printf("sse4_1_time: %f\n", total);
+
+  for(int i = 0; i < 1; ++i) {
+    //printf("inputs: %d, %d\n", input0[i*2], input0[i*2 + 1]);
+    //printf("generic... %d, ssse3... %d\n", output0[i], output1[i]);
+  }
+  
+  for(int i = 0; i < vlen; ++i) {
+    //printf("%d...%d\n", output0[i], output01[i]);
+    CPPUNIT_ASSERT_DOUBLES_EQUAL(output_generic[i], output_sse[i], fabs(output_generic[i])*1e-4);
+    CPPUNIT_ASSERT_DOUBLES_EQUAL(output_generic[i], output_sse4_1[i], fabs(output_generic[i])*1e-4);
+  }
+}
+
+#endif /* SSE4_1 */
diff --git a/volk/lib/qa_16sc_deinterleave_real_32f_aligned16.h b/volk/lib/qa_16sc_deinterleave_real_32f_aligned16.h
new file mode 100644
index 000000000..e83426473
--- /dev/null
+++ b/volk/lib/qa_16sc_deinterleave_real_32f_aligned16.h
@@ -0,0 +1,18 @@
+#ifndef INCLUDED_QA_16SC_DEINTERLEAVE_REAL_32F_ALIGNED16_H
+#define INCLUDED_QA_16SC_DEINTERLEAVE_REAL_32F_ALIGNED16_H
+
+#include <cppunit/extensions/HelperMacros.h>
+#include <cppunit/TestCase.h>
+
+class qa_16sc_deinterleave_real_32f_aligned16 : public CppUnit::TestCase {
+
+  CPPUNIT_TEST_SUITE (qa_16sc_deinterleave_real_32f_aligned16);
+  CPPUNIT_TEST (t1);
+  CPPUNIT_TEST_SUITE_END ();
+
+ private:
+  void t1 ();
+};
+
+
+#endif /* INCLUDED_QA_16SC_DEINTERLEAVE_REAL_32F_ALIGNED16_H */
diff --git a/volk/lib/qa_16sc_deinterleave_real_8s_aligned16.cc b/volk/lib/qa_16sc_deinterleave_real_8s_aligned16.cc
new file mode 100644
index 000000000..5ab458bc9
--- /dev/null
+++ b/volk/lib/qa_16sc_deinterleave_real_8s_aligned16.cc
@@ -0,0 +1,60 @@
+#include <volk/volk.h>
+#include <qa_16sc_deinterleave_real_8s_aligned16.h>
+#include <volk/volk_16sc_deinterleave_real_8s_aligned16.h>
+#include <cstdlib>
+
+//test for sse
+
+#ifndef LV_HAVE_SSSE3
+
+void qa_16sc_deinterleave_real_8s_aligned16::t1() {
+  printf("ssse3 not available... no test performed\n");
+}
+
+#else
+
+void qa_16sc_deinterleave_real_8s_aligned16::t1() {
+  
+  volk_environment_init();
+  clock_t start, end;
+  double total;
+  const int vlen = 3201;
+  const int ITERS = 100000;
+  std::complex<int16_t> input0[vlen] __attribute__ ((aligned (16)));
+  
+  int8_t output_generic[vlen] __attribute__ ((aligned (16)));
+  int8_t output_ssse3[vlen] __attribute__ ((aligned (16)));
+
+  int16_t* loadInput = (int16_t*)input0;
+  for(int i = 0; i < vlen*2; ++i) {   
+    loadInput[i] =((int16_t)(((float) (rand() - (RAND_MAX/2))) / static_cast<float>((RAND_MAX/2))) * 32768.0);
+  }
+  printf("16sc_deinterleave_real_8s_aligned\n");
+
+  start = clock();
+  for(int count = 0; count < ITERS; ++count) {
+    volk_16sc_deinterleave_real_8s_aligned16_manual(output_generic, input0, vlen, "generic");
+  }
+  end = clock();
+  total = (double)(end-start)/(double)CLOCKS_PER_SEC;
+  printf("generic_time: %f\n", total);
+  start = clock();
+  for(int count = 0; count < ITERS; ++count) {
+    volk_16sc_deinterleave_real_8s_aligned16_manual(output_ssse3, input0, vlen, "ssse3");
+  }
+  end = clock();
+  total = (double)(end-start)/(double)CLOCKS_PER_SEC;
+  printf("ssse3_time: %f\n", total);
+
+  for(int i = 0; i < 1; ++i) {
+    //printf("inputs: %d, %d\n", input0[i*2], input0[i*2 + 1]);
+    //printf("generic... %d, ssse3... %d\n", output0[i], output1[i]);
+  }
+  
+  for(int i = 0; i < vlen; ++i) {
+    //printf("%d...%d\n", output0[i], output01[i]);
+    CPPUNIT_ASSERT_EQUAL(output_generic[i], output_ssse3[i]);
+  }
+}
+
+#endif
diff --git a/volk/lib/qa_16sc_deinterleave_real_8s_aligned16.h b/volk/lib/qa_16sc_deinterleave_real_8s_aligned16.h
new file mode 100644
index 000000000..04e5511e5
--- /dev/null
+++ b/volk/lib/qa_16sc_deinterleave_real_8s_aligned16.h
@@ -0,0 +1,18 @@
+#ifndef INCLUDED_QA_16SC_DEINTERLEAVE_REAL_8S_ALIGNED16_H
+#define INCLUDED_QA_16SC_DEINTERLEAVE_REAL_8S_ALIGNED16_H
+
+#include <cppunit/extensions/HelperMacros.h>
+#include <cppunit/TestCase.h>
+
+class qa_16sc_deinterleave_real_8s_aligned16 : public CppUnit::TestCase {
+
+  CPPUNIT_TEST_SUITE (qa_16sc_deinterleave_real_8s_aligned16);
+  CPPUNIT_TEST (t1);
+  CPPUNIT_TEST_SUITE_END ();
+
+ private:
+  void t1 ();
+};
+
+
+#endif /* INCLUDED_QA_16SC_DEINTERLEAVE_REAL_8S_ALIGNED16_H */
diff --git a/volk/lib/qa_16sc_magnitude_16s_aligned16.cc b/volk/lib/qa_16sc_magnitude_16s_aligned16.cc
new file mode 100644
index 000000000..b14610757
--- /dev/null
+++ b/volk/lib/qa_16sc_magnitude_16s_aligned16.cc
@@ -0,0 +1,70 @@
+#include <volk/volk.h>
+#include <qa_16sc_magnitude_16s_aligned16.h>
+#include <volk/volk_16sc_magnitude_16s_aligned16.h>
+#include <cstdlib>
+
+//test for sse
+
+#ifndef LV_HAVE_SSE3
+
+void qa_16sc_magnitude_16s_aligned16::t1() {
+  printf("sse3 not available... no test performed\n");
+}
+
+#else
+
+void qa_16sc_magnitude_16s_aligned16::t1() {
+  
+  volk_environment_init();
+  clock_t start, end;
+  double total;
+  const int vlen = 3201;
+  const int ITERS = 100000;
+  std::complex<int16_t> input0[vlen] __attribute__ ((aligned (16)));
+  
+  int16_t output_generic[vlen] __attribute__ ((aligned (16)));
+  int16_t output_sse[vlen] __attribute__ ((aligned (16)));
+  int16_t output_sse3[vlen] __attribute__ ((aligned (16)));
+
+  int16_t* loadInput = (int16_t*)input0;
+  for(int i = 0; i < vlen*2; ++i) {   
+    loadInput[i] =((int16_t)((((float) (rand() - (RAND_MAX/2))) / static_cast<float>((RAND_MAX/2))) * 32768.0));
+  }
+  printf("16sc_magnitude_16s_aligned\n");
+
+  start = clock();
+  for(int count = 0; count < ITERS; ++count) {
+    volk_16sc_magnitude_16s_aligned16_manual(output_generic, input0, vlen, "generic");
+  }
+  end = clock();
+  total = (double)(end-start)/(double)CLOCKS_PER_SEC;
+  printf("generic_time: %f\n", total);
+  start = clock();
+  for(int count = 0; count < ITERS; ++count) {
+    volk_16sc_magnitude_16s_aligned16_manual(output_sse, input0, vlen, "sse");
+  }
+  end = clock();
+  total = (double)(end-start)/(double)CLOCKS_PER_SEC;
+  printf("sse_time: %f\n", total);
+
+  start = clock();
+  for(int count = 0; count < ITERS; ++count) {
+    volk_16sc_magnitude_16s_aligned16_manual(output_sse3, input0, vlen, "sse3");
+  }
+  end = clock();
+  total = (double)(end-start)/(double)CLOCKS_PER_SEC;
+  printf("sse3_time: %f\n", total);
+
+  for(int i = 0; i < 1; ++i) {
+    //printf("inputs: %d, %d\n", input0[i*2], input0[i*2 + 1]);
+    //printf("generic... %d, ssse3... %d\n", output0[i], output1[i]);
+  }
+  
+  for(int i = 0; i < vlen; ++i) {
+    //printf("%d...%d\n", output0[i], output01[i]);
+    CPPUNIT_ASSERT_DOUBLES_EQUAL(output_generic[i], output_sse[i], 1.1);
+    CPPUNIT_ASSERT_DOUBLES_EQUAL(output_generic[i], output_sse3[i], 1.1);
+  }
+}
+
+#endif
diff --git a/volk/lib/qa_16sc_magnitude_16s_aligned16.h b/volk/lib/qa_16sc_magnitude_16s_aligned16.h
new file mode 100644
index 000000000..4664b70f4
--- /dev/null
+++ b/volk/lib/qa_16sc_magnitude_16s_aligned16.h
@@ -0,0 +1,18 @@
+#ifndef INCLUDED_QA_16SC_MAGNITUDE_16S_ALIGNED16_H
+#define INCLUDED_QA_16SC_MAGNITUDE_16S_ALIGNED16_H
+
+#include <cppunit/extensions/HelperMacros.h>
+#include <cppunit/TestCase.h>
+
+class qa_16sc_magnitude_16s_aligned16 : public CppUnit::TestCase {
+
+  CPPUNIT_TEST_SUITE (qa_16sc_magnitude_16s_aligned16);
+  CPPUNIT_TEST (t1);
+  CPPUNIT_TEST_SUITE_END ();
+
+ private:
+  void t1 ();
+};
+
+
+#endif /* INCLUDED_QA_16SC_MAGNITUDE_16S_ALIGNED16_H */
diff --git a/volk/lib/qa_16sc_magnitude_32f_aligned16.cc b/volk/lib/qa_16sc_magnitude_32f_aligned16.cc
new file mode 100644
index 000000000..06dff2fd5
--- /dev/null
+++ b/volk/lib/qa_16sc_magnitude_32f_aligned16.cc
@@ -0,0 +1,70 @@
+#include <volk/volk.h>
+#include <qa_16sc_magnitude_32f_aligned16.h>
+#include <volk/volk_16sc_magnitude_32f_aligned16.h>
+#include <cstdlib>
+
+//test for sse
+
+#ifndef LV_HAVE_SSE3
+
+void qa_16sc_magnitude_32f_aligned16::t1() {
+  printf("sse3 not available... no test performed\n");
+}
+
+#else
+
+void qa_16sc_magnitude_32f_aligned16::t1() {
+  
+  volk_environment_init();
+  clock_t start, end;
+  double total;
+  const int vlen = 3201;
+  const int ITERS = 100000;
+  std::complex<int16_t> input0[vlen] __attribute__ ((aligned (16)));
+  
+  float output_generic[vlen] __attribute__ ((aligned (16)));
+  float output_sse[vlen] __attribute__ ((aligned (16)));
+  float output_sse3[vlen] __attribute__ ((aligned (16)));
+
+  int16_t* inputLoad = (int16_t*)input0;
+  for(int i = 0; i < 2*vlen; ++i) {   
+    inputLoad[i] = (int16_t)(((float) (rand() - (RAND_MAX/2))) / static_cast<float>((RAND_MAX/2)));
+  }
+  printf("16sc_magnitude_32f_aligned\n");
+
+  start = clock();
+  for(int count = 0; count < ITERS; ++count) {
+    volk_16sc_magnitude_32f_aligned16_manual(output_generic, input0, 32768.0, vlen, "generic");
+  }
+  end = clock();
+  total = (double)(end-start)/(double)CLOCKS_PER_SEC;
+  printf("generic_time: %f\n", total);
+  start = clock();
+  for(int count = 0; count < ITERS; ++count) {
+    volk_16sc_magnitude_32f_aligned16_manual(output_sse, input0, 32768.0, vlen, "sse");
+  }
+  end = clock();
+  total = (double)(end-start)/(double)CLOCKS_PER_SEC;
+  printf("sse_time: %f\n", total);
+
+  start = clock();
+  for(int count = 0; count < ITERS; ++count) {
+    volk_16sc_magnitude_32f_aligned16_manual(output_sse3, input0, 32768.0, vlen, "sse3");
+  }
+  end = clock();
+  total = (double)(end-start)/(double)CLOCKS_PER_SEC;
+  printf("sse3_time: %f\n", total);
+
+  for(int i = 0; i < 1; ++i) {
+    //printf("inputs: %d, %d\n", input0[i*2], input0[i*2 + 1]);
+    //printf("generic... %d, ssse3... %d\n", output0[i], output1[i]);
+  }
+  
+  for(int i = 0; i < vlen; ++i) {
+    //printf("%d...%d\n", output0[i], output01[i]);
+    CPPUNIT_ASSERT_DOUBLES_EQUAL(output_generic[i], output_sse[i], fabs(output_generic[i])*1e-4);
+    CPPUNIT_ASSERT_DOUBLES_EQUAL(output_generic[i], output_sse3[i], fabs(output_generic[i])*1e-4);
+  }
+}
+
+#endif
diff --git a/volk/lib/qa_16sc_magnitude_32f_aligned16.h b/volk/lib/qa_16sc_magnitude_32f_aligned16.h
new file mode 100644
index 000000000..0c25673ea
--- /dev/null
+++ b/volk/lib/qa_16sc_magnitude_32f_aligned16.h
@@ -0,0 +1,18 @@
+#ifndef INCLUDED_QA_16SC_MAGNITUDE_32F_ALIGNED16_H
+#define INCLUDED_QA_16SC_MAGNITUDE_32F_ALIGNED16_H
+
+#include <cppunit/extensions/HelperMacros.h>
+#include <cppunit/TestCase.h>
+
+class qa_16sc_magnitude_32f_aligned16 : public CppUnit::TestCase {
+
+  CPPUNIT_TEST_SUITE (qa_16sc_magnitude_32f_aligned16);
+  CPPUNIT_TEST (t1);
+  CPPUNIT_TEST_SUITE_END ();
+
+ private:
+  void t1 ();
+};
+
+
+#endif /* INCLUDED_QA_16SC_MAGNITUDE_32F_ALIGNED16_H */
diff --git a/volk/lib/qa_16u_byteswap_aligned16.cc b/volk/lib/qa_16u_byteswap_aligned16.cc
new file mode 100644
index 000000000..6b19828a4
--- /dev/null
+++ b/volk/lib/qa_16u_byteswap_aligned16.cc
@@ -0,0 +1,60 @@
+#include <volk/volk.h>
+#include <qa_16u_byteswap_aligned16.h>
+#include <volk/volk_16u_byteswap_aligned16.h>
+#include <cstdlib>
+#include <cstring>
+
+//test for sse
+
+#ifndef LV_HAVE_SSE2
+
+void qa_16u_byteswap_aligned16::t1() {
+  printf("sse2 not available... no test performed\n");
+}
+
+#else
+
+void qa_16u_byteswap_aligned16::t1() {
+  
+  volk_environment_init();
+  clock_t start, end;
+  double total;
+  const int vlen = 3201;
+  const int ITERS = 100001;
+  
+  uint16_t output0[vlen] __attribute__ ((aligned (16)));
+  uint16_t output01[vlen] __attribute__ ((aligned (16)));
+
+  for(int i = 0; i < vlen; ++i) {   
+    output0[i] = (uint16_t) ((rand() - (RAND_MAX/2)) / (RAND_MAX/2));
+  }
+  memcpy(output01, output0, vlen*sizeof(uint16_t));
+
+  printf("16u_byteswap_aligned\n");
+
+  start = clock();
+  for(int count = 0; count < ITERS; ++count) {
+    volk_16u_byteswap_aligned16_manual(output0, vlen, "generic");
+  }
+  end = clock();
+  total = (double)(end-start)/(double)CLOCKS_PER_SEC;
+  printf("generic_time: %f\n", total);
+  start = clock();
+  for(int count = 0; count < ITERS; ++count) {
+    volk_16u_byteswap_aligned16_manual(output01, vlen, "sse2");
+  }
+  end = clock();
+  total = (double)(end-start)/(double)CLOCKS_PER_SEC;
+  printf("sse2_time: %f\n", total);
+  for(int i = 0; i < 1; ++i) {
+    //printf("inputs: %d, %d\n", input0[i*2], input0[i*2 + 1]);
+    //printf("generic... %d, ssse3... %d\n", output0[i], output1[i]);
+  }
+  
+  for(int i = 0; i < vlen; ++i) {
+    //printf("%d...%d\n", output0[i], output01[i]);
+    CPPUNIT_ASSERT_EQUAL(output0[i], output01[i]);
+  }
+}
+
+#endif
diff --git a/volk/lib/qa_16u_byteswap_aligned16.h b/volk/lib/qa_16u_byteswap_aligned16.h
new file mode 100644
index 000000000..e11b23e3f
--- /dev/null
+++ b/volk/lib/qa_16u_byteswap_aligned16.h
@@ -0,0 +1,18 @@
+#ifndef INCLUDED_QA_16U_BYTESWAP_ALIGNED16_H
+#define INCLUDED_QA_16U_BYTESWAP_ALIGNED16_H
+
+#include <cppunit/extensions/HelperMacros.h>
+#include <cppunit/TestCase.h>
+
+class qa_16u_byteswap_aligned16 : public CppUnit::TestCase {
+
+  CPPUNIT_TEST_SUITE (qa_16u_byteswap_aligned16);
+  CPPUNIT_TEST (t1);
+  CPPUNIT_TEST_SUITE_END ();
+
+ private:
+  void t1 ();
+};
+
+
+#endif /* INCLUDED_QA_16U_BYTESWAP_ALIGNED16_H */
diff --git a/volk/lib/qa_32f_accumulator_aligned16.cc b/volk/lib/qa_32f_accumulator_aligned16.cc
new file mode 100644
index 000000000..ea637d600
--- /dev/null
+++ b/volk/lib/qa_32f_accumulator_aligned16.cc
@@ -0,0 +1,56 @@
+#include <volk/volk.h>
+#include <qa_32f_accumulator_aligned16.h>
+#include <volk/volk_32f_accumulator_aligned16.h>
+#include <cstdlib>
+
+//test for sse
+
+#ifndef LV_HAVE_SSE
+
+void qa_32f_accumulator_aligned16::t1() {
+  printf("sse not available... no test performed\n");
+}
+
+#else
+
+void qa_32f_accumulator_aligned16::t1() {
+  
+  volk_environment_init();
+  clock_t start, end;
+  double total;
+  const int vlen = 3201;
+  const int ITERS = 100000;
+  float input0[vlen] __attribute__ ((aligned (16)));
+  
+  float accumulator_generic;
+  float accumulator_sse;
+
+  for(int i = 0; i < vlen; ++i) {   
+    input0[i] = ((float) (rand() - (RAND_MAX/2))) / static_cast<float>((RAND_MAX/2));
+  }
+  printf("32f_accumulator_aligned\n");
+
+  start = clock();
+  for(int count = 0; count < ITERS; ++count) {
+    volk_32f_accumulator_aligned16_manual(&accumulator_generic, input0, vlen, "generic");
+  }
+  end = clock();
+  total = (double)(end-start)/(double)CLOCKS_PER_SEC;
+  printf("generic_time: %f\n", total);
+  start = clock();
+  for(int count = 0; count < ITERS; ++count) {
+    volk_32f_accumulator_aligned16_manual(&accumulator_sse, input0, vlen, "sse");
+  }
+  end = clock();
+  total = (double)(end-start)/(double)CLOCKS_PER_SEC;
+  printf("sse_time: %f\n", total);
+  for(int i = 0; i < 1; ++i) {
+    //printf("inputs: %d, %d\n", input0[i*2], input0[i*2 + 1]);
+    //printf("generic... %d, ssse3... %d\n", output0[i], output1[i]);
+  }
+  
+  //printf("%d...%d\n", output0[i], output01[i]);
+  CPPUNIT_ASSERT_DOUBLES_EQUAL(accumulator_generic, accumulator_sse, fabs(accumulator_generic)*1e-4);
+}
+
+#endif
diff --git a/volk/lib/qa_32f_accumulator_aligned16.h b/volk/lib/qa_32f_accumulator_aligned16.h
new file mode 100644
index 000000000..0004d3ff0
--- /dev/null
+++ b/volk/lib/qa_32f_accumulator_aligned16.h
@@ -0,0 +1,18 @@
+#ifndef INCLUDED_QA_32F_ACCUMULATOR_ALIGNED16_H
+#define INCLUDED_QA_32F_ACCUMULATOR_ALIGNED16_H
+
+#include <cppunit/extensions/HelperMacros.h>
+#include <cppunit/TestCase.h>
+
+class qa_32f_accumulator_aligned16 : public CppUnit::TestCase {
+
+  CPPUNIT_TEST_SUITE (qa_32f_accumulator_aligned16);
+  CPPUNIT_TEST (t1);
+  CPPUNIT_TEST_SUITE_END ();
+
+ private:
+  void t1 ();
+};
+
+
+#endif /* INCLUDED_QA_32F_ACCUMULATOR_ALIGNED16_H */
diff --git a/volk/lib/qa_32f_add_aligned16.cc b/volk/lib/qa_32f_add_aligned16.cc
new file mode 100644
index 000000000..92f35c7ec
--- /dev/null
+++ b/volk/lib/qa_32f_add_aligned16.cc
@@ -0,0 +1,60 @@
+#include <volk/volk.h>
+#include <qa_32f_add_aligned16.h>
+#include <volk/volk_32f_add_aligned16.h>
+#include <cstdlib>
+
+//test for sse
+
+#ifndef LV_HAVE_SSE
+
+void qa_32f_add_aligned16::t1() {
+  printf("sse not available... no test performed\n");
+}
+
+#else
+
+void qa_32f_add_aligned16::t1() {
+  
+  volk_environment_init();
+  clock_t start, end;
+  double total;
+  const int vlen = 3201;
+  const int ITERS = 100000;
+  float input0[vlen] __attribute__ ((aligned (16)));
+  float input1[vlen] __attribute__ ((aligned (16)));
+  
+  float output0[vlen] __attribute__ ((aligned (16)));
+  float output01[vlen] __attribute__ ((aligned (16)));
+
+  for(int i = 0; i < vlen; ++i) {   
+    input0[i] = ((float) (rand() - (RAND_MAX/2))) / static_cast<float>((RAND_MAX/2));
+    input1[i] = ((float) (rand() - (RAND_MAX/2))) / static_cast<float>((RAND_MAX/2));
+  }
+  printf("32f_add_aligned\n");
+
+  start = clock();
+  for(int count = 0; count < ITERS; ++count) {
+    volk_32f_add_aligned16_manual(output0, input0, input1, vlen, "generic");
+  }
+  end = clock();
+  total = (double)(end-start)/(double)CLOCKS_PER_SEC;
+  printf("generic_time: %f\n", total);
+  start = clock();
+  for(int count = 0; count < ITERS; ++count) {
+    volk_32f_add_aligned16_manual(output01, input0, input1, vlen, "sse");
+  }
+  end = clock();
+  total = (double)(end-start)/(double)CLOCKS_PER_SEC;
+  printf("sse_time: %f\n", total);
+  for(int i = 0; i < 1; ++i) {
+    //printf("inputs: %d, %d\n", input0[i*2], input0[i*2 + 1]);
+    //printf("generic... %d, ssse3... %d\n", output0[i], output1[i]);
+  }
+  
+  for(int i = 0; i < vlen; ++i) {
+    //printf("%d...%d\n", output0[i], output01[i]);
+    CPPUNIT_ASSERT_EQUAL(output0[i], output01[i]);
+  }
+}
+
+#endif
diff --git a/volk/lib/qa_32f_add_aligned16.h b/volk/lib/qa_32f_add_aligned16.h
new file mode 100644
index 000000000..58e2a151c
--- /dev/null
+++ b/volk/lib/qa_32f_add_aligned16.h
@@ -0,0 +1,18 @@
+#ifndef INCLUDED_QA_32F_ADD_ALIGNED16_H
+#define INCLUDED_QA_32F_ADD_ALIGNED16_H
+
+#include <cppunit/extensions/HelperMacros.h>
+#include <cppunit/TestCase.h>
+
+class qa_32f_add_aligned16 : public CppUnit::TestCase {
+
+  CPPUNIT_TEST_SUITE (qa_32f_add_aligned16);
+  CPPUNIT_TEST (t1);
+  CPPUNIT_TEST_SUITE_END ();
+
+ private:
+  void t1 ();
+};
+
+
+#endif /* INCLUDED_QA_32F_ADD_ALIGNED16_H */
diff --git a/volk/lib/qa_32f_calc_spectral_noise_floor_aligned16.cc b/volk/lib/qa_32f_calc_spectral_noise_floor_aligned16.cc
new file mode 100644
index 000000000..3c8137004
--- /dev/null
+++ b/volk/lib/qa_32f_calc_spectral_noise_floor_aligned16.cc
@@ -0,0 +1,59 @@
+#include <volk/volk.h>
+#include <qa_32f_calc_spectral_noise_floor_aligned16.h>
+#include <volk/volk_32f_calc_spectral_noise_floor_aligned16.h>
+#include <cstdlib>
+#include <math.h>
+
+//test for sse
+
+#ifndef LV_HAVE_SSE
+
+void qa_32f_calc_spectral_noise_floor_aligned16::t1() {
+  printf("sse not available... no test performed\n");
+}
+
+#else
+
+void qa_32f_calc_spectral_noise_floor_aligned16::t1() {
+  
+  volk_environment_init();
+  clock_t start, end;
+  double total;
+  const int vlen = 3201;
+  const int ITERS = 100000;
+  float input0[vlen] __attribute__ ((aligned (16)));
+  
+  float output0[1] __attribute__ ((aligned (16)));
+  float output01[1] __attribute__ ((aligned (16)));
+
+  for(int i = 0; i < vlen; ++i) {   
+    input0[i] = ((float) (rand() - (RAND_MAX/2))) / static_cast<float>((RAND_MAX/2));
+  }
+  printf("32f_calc_spectral_noise_floor_aligned\n");
+
+  start = clock();
+  for(int count = 0; count < ITERS; ++count) {
+    volk_32f_calc_spectral_noise_floor_aligned16_manual(output0, input0, 20, vlen, "generic");
+  }
+  end = clock();
+  total = (double)(end-start)/(double)CLOCKS_PER_SEC;
+  printf("generic_time: %f\n", total);
+  start = clock();
+  for(int count = 0; count < ITERS; ++count) {
+    volk_32f_calc_spectral_noise_floor_aligned16_manual(output01, input0, 20, vlen, "sse");
+  }
+  end = clock();
+  total = (double)(end-start)/(double)CLOCKS_PER_SEC;
+  printf("sse_time: %f\n", total);
+  for(int i = 0; i < 1; ++i) {
+    //printf("inputs: %d, %d\n", input0[i*2], input0[i*2 + 1]);
+    //printf("generic... %d, ssse3... %d\n", output0[i], output1[i]);
+  }
+  
+  for(int i = 0; i < 1; ++i) {
+    //printf("%d...%d\n", output0[i], output01[i]);
+    CPPUNIT_ASSERT_DOUBLES_EQUAL(output0[i], output01[i], fabs(output0[i])*1e-4);
+  }
+}
+
+#endif
diff --git a/volk/lib/qa_32f_calc_spectral_noise_floor_aligned16.h b/volk/lib/qa_32f_calc_spectral_noise_floor_aligned16.h
new file mode 100644
index 000000000..c5dce2c4b
--- /dev/null
+++ b/volk/lib/qa_32f_calc_spectral_noise_floor_aligned16.h
@@ -0,0 +1,18 @@
+#ifndef INCLUDED_QA_32F_CALC_SPECTRAL_NOISE_FLOOR_ALIGNED16_H
+#define INCLUDED_QA_32F_CALC_SPECTRAL_NOISE_FLOOR_ALIGNED16_H
+
+#include <cppunit/extensions/HelperMacros.h>
+#include <cppunit/TestCase.h>
+
+class qa_32f_calc_spectral_noise_floor_aligned16 : public CppUnit::TestCase {
+
+  CPPUNIT_TEST_SUITE (qa_32f_calc_spectral_noise_floor_aligned16);
+  CPPUNIT_TEST (t1);
+  CPPUNIT_TEST_SUITE_END ();
+
+ private:
+  void t1 ();
+};
+
+
+#endif /* INCLUDED_QA_32F_CALC_SPECTRAL_NOISE_FLOOR_ALIGNED16_H */
diff --git a/volk/lib/qa_32f_convert_16s_aligned16.cc b/volk/lib/qa_32f_convert_16s_aligned16.cc
new file mode 100644
index 000000000..84a4c40c4
--- /dev/null
+++ b/volk/lib/qa_32f_convert_16s_aligned16.cc
@@ -0,0 +1,70 @@
+#include <volk/volk.h>
+#include <qa_32f_convert_16s_aligned16.h>
+#include <volk/volk_32f_convert_16s_aligned16.h>
+#include <cstdlib>
+
+//test for sse2
+
+#ifndef LV_HAVE_SSE2
+
+void qa_32f_convert_16s_aligned16::t1() {
+  printf("sse2 not available... no test performed\n");
+}
+
+#else
+
+void qa_32f_convert_16s_aligned16::t1() {
+  
+  volk_environment_init();
+  clock_t start, end;
+  double total;
+  const int vlen = 3201;
+  const int ITERS = 100000;
+  float input0[vlen] __attribute__ ((aligned (16)));
+  
+  int16_t output_generic[vlen] __attribute__ ((aligned (16)));
+  int16_t output_sse[vlen] __attribute__ ((aligned (16)));
+  int16_t output_sse2[vlen] __attribute__ ((aligned (16)));
+
+  for(int i = 0; i < vlen; ++i) {   
+    input0[i] = ((float) (rand() - (RAND_MAX/2))) / static_cast<float>((RAND_MAX/2));
+  }
+  printf("32f_convert_16s_aligned\n");
+
+  start = clock();
+  for(int count = 0; count < ITERS; ++count) {
+    volk_32f_convert_16s_aligned16_manual(output_generic, input0, 32768.0, vlen, "generic");
+  }
+  end = clock();
+  total = (double)(end-start)/(double)CLOCKS_PER_SEC;
+  printf("generic_time: %f\n", total);
+
+  start = clock();
+  for(int count = 0; count < ITERS; ++count) {
+    volk_32f_convert_16s_aligned16_manual(output_sse, input0, 32768.0, vlen, "sse");
+  }
+  end = clock();
+  total = (double)(end-start)/(double)CLOCKS_PER_SEC;
+  printf("sse_time: %f\n", total);
+
+  start = clock();
+  for(int count = 0; count < ITERS; ++count) {
+    volk_32f_convert_16s_aligned16_manual(output_sse2, input0, 32768.0, vlen, "sse2");
+  }
+  end = clock();
+  total = (double)(end-start)/(double)CLOCKS_PER_SEC;
+  printf("sse2_time: %f\n", total);
+
+  for(int i = 0; i < vlen; ++i) {
+    //printf("inputs: %d, %d\n", input0[i*2], input0[i*2 + 1]);
+    //printf("%d generic... %d, sse... %d sse2... %d\n", i, output_generic[i], output_sse[i], output_sse2[i]);
+  }
+  
+  for(int i = 0; i < vlen; ++i) {
+    //printf("%d...%d\n", output0[i], output01[i]);
+    CPPUNIT_ASSERT(abs(output_generic[i] - output_sse[i]) <= 1);
+    CPPUNIT_ASSERT(abs(output_generic[i] - output_sse2[i]) <= 1);
+  }
+}
+
+#endif
diff --git a/volk/lib/qa_32f_convert_16s_aligned16.h b/volk/lib/qa_32f_convert_16s_aligned16.h
new file mode 100644
index 000000000..fce1eb417
--- /dev/null
+++ b/volk/lib/qa_32f_convert_16s_aligned16.h
@@ -0,0 +1,18 @@
+#ifndef INCLUDED_QA_32F_CONVERT_16S_ALIGNED16_H
+#define INCLUDED_QA_32F_CONVERT_16S_ALIGNED16_H
+
+#include <cppunit/extensions/HelperMacros.h>
+#include <cppunit/TestCase.h>
+
+class qa_32f_convert_16s_aligned16 : public CppUnit::TestCase {
+
+  CPPUNIT_TEST_SUITE (qa_32f_convert_16s_aligned16);
+  CPPUNIT_TEST (t1);
+  CPPUNIT_TEST_SUITE_END ();
+
+ private:
+  void t1 ();
+};
+
+
+#endif /* INCLUDED_QA_32F_CONVERT_16S_ALIGNED16_H */
diff --git a/volk/lib/qa_32f_convert_16s_unaligned16.cc b/volk/lib/qa_32f_convert_16s_unaligned16.cc
new file mode 100644
index 000000000..9469daed2
--- /dev/null
+++ b/volk/lib/qa_32f_convert_16s_unaligned16.cc
@@ -0,0 +1,70 @@
+#include <volk/volk.h>
+#include <qa_32f_convert_16s_unaligned16.h>
+#include <volk/volk_32f_convert_16s_unaligned16.h>
+#include <cstdlib>
+
+//test for sse2
+
+#ifndef LV_HAVE_SSE2
+
+void qa_32f_convert_16s_unaligned16::t1() {
+  printf("sse2 not available... no test performed\n");
+}
+
+#else
+
+void qa_32f_convert_16s_unaligned16::t1() {
+  
+  volk_environment_init();
+  clock_t start, end;
+  double total;
+  const int vlen = 3201;
+  const int ITERS = 100000;
+  float input0[vlen] __attribute__ ((aligned (16)));
+  
+  int16_t output_generic[vlen] __attribute__ ((aligned (16)));
+  int16_t output_sse[vlen] __attribute__ ((aligned (16)));
+  int16_t output_sse2[vlen] __attribute__ ((aligned (16)));
+
+  for(int i = 0; i < vlen; ++i) {   
+    input0[i] = ((float) (rand() - (RAND_MAX/2))) / static_cast<float>((RAND_MAX/2));
+  }
+  printf("32f_convert_16s_unaligned\n");
+
+  start = clock();
+  for(int count = 0; count < ITERS; ++count) {
+    volk_32f_convert_16s_unaligned16_manual(output_generic, input0, 32768.0, vlen, "generic");
+  }
+  end = clock();
+  total = (double)(end-start)/(double)CLOCKS_PER_SEC;
+  printf("generic_time: %f\n", total);
+
+  start = clock();
+  for(int count = 0; count < ITERS; ++count) {
+    volk_32f_convert_16s_unaligned16_manual(output_sse, input0, 32768.0, vlen, "sse");
+  }
+  end = clock();
+  total = (double)(end-start)/(double)CLOCKS_PER_SEC;
+  printf("sse_time: %f\n", total);
+
+  start = clock();
+  for(int count = 0; count < ITERS; ++count) {
+    volk_32f_convert_16s_unaligned16_manual(output_sse2, input0, 32768.0, vlen, "sse2");
+  }
+  end = clock();
+  total = (double)(end-start)/(double)CLOCKS_PER_SEC;
+  printf("sse2_time: %f\n", total);
+
+  for(int i = 0; i < 1; ++i) {
+    //printf("inputs: %d, %d\n", input0[i*2], input0[i*2 + 1]);
+    //printf("generic... %d, ssse3... %d\n", output0[i], output1[i]);
+  }
+  
+  for(int i = 0; i < vlen; ++i) {
+    //printf("%d...%d\n", output0[i], output01[i]);
+    CPPUNIT_ASSERT(abs(output_generic[i] - output_sse[i]) <= 1);
+    CPPUNIT_ASSERT(abs(output_generic[i] - output_sse2[i]) <= 1);
+  }
+}
+
+#endif
diff --git a/volk/lib/qa_32f_convert_16s_unaligned16.h b/volk/lib/qa_32f_convert_16s_unaligned16.h
new file mode 100644
index 000000000..492bc80e6
--- /dev/null
+++ b/volk/lib/qa_32f_convert_16s_unaligned16.h
@@ -0,0 +1,18 @@
+#ifndef INCLUDED_QA_32F_CONVERT_16S_UNALIGNED16_H
+#define INCLUDED_QA_32F_CONVERT_16S_UNALIGNED16_H
+
+#include <cppunit/extensions/HelperMacros.h>
+#include <cppunit/TestCase.h>
+
+class qa_32f_convert_16s_unaligned16 : public CppUnit::TestCase {
+
+  CPPUNIT_TEST_SUITE (qa_32f_convert_16s_unaligned16);
+  CPPUNIT_TEST (t1);
+  CPPUNIT_TEST_SUITE_END ();
+
+ private:
+  void t1 ();
+};
+
+
+#endif /* INCLUDED_QA_32F_CONVERT_16S_UNALIGNED16_H */
diff --git a/volk/lib/qa_32f_convert_32s_aligned16.cc b/volk/lib/qa_32f_convert_32s_aligned16.cc
new file mode 100644
index 000000000..ff24c7b0d
--- /dev/null
+++ b/volk/lib/qa_32f_convert_32s_aligned16.cc
@@ -0,0 +1,70 @@
+#include <volk/volk.h>
+#include <qa_32f_convert_32s_aligned16.h>
+#include <volk/volk_32f_convert_32s_aligned16.h>
+#include <cstdlib>
+
+//test for sse2
+
+#ifndef LV_HAVE_SSE2
+
+void qa_32f_convert_32s_aligned16::t1() {
+  printf("sse2 not available... no test performed\n");
+}
+
+#else
+
+void qa_32f_convert_32s_aligned16::t1() {
+  
+  volk_environment_init();
+  clock_t start, end;
+  double total;
+  const int vlen = 3201;
+  const int ITERS = 100000;
+  float input0[vlen] __attribute__ ((aligned (16)));
+  
+  int32_t output_generic[vlen] __attribute__ ((aligned (16)));
+  int32_t output_sse[vlen] __attribute__ ((aligned (16)));
+  int32_t output_sse2[vlen] __attribute__ ((aligned (16)));
+
+  for(int i = 0; i < vlen; ++i) {   
+    input0[i] = ((float) (rand() - (RAND_MAX/2))) / static_cast<float>((RAND_MAX/2));
+  }
+  printf("32f_convert_32s_aligned\n");
+
+  start = clock();
+  for(int count = 0; count < ITERS; ++count) {
+    volk_32f_convert_32s_aligned16_manual(output_generic, input0, 32768.0, vlen, "generic");
+  }
+  end = clock();
+  total = (double)(end-start)/(double)CLOCKS_PER_SEC;
+  printf("generic_time: %f\n", total);
+
+  start = clock();
+  for(int count = 0; count < ITERS; ++count) {
+    volk_32f_convert_32s_aligned16_manual(output_sse, input0, 32768.0, vlen, "sse");
+  }
+  end = clock();
+  total = (double)(end-start)/(double)CLOCKS_PER_SEC;
+  printf("sse_time: %f\n", total);
+
+  start = clock();
+  for(int count = 0; count < ITERS; ++count) {
+    volk_32f_convert_32s_aligned16_manual(output_sse2, input0, 32768.0, vlen, "sse2");
+  }
+  end = clock();
+  total = (double)(end-start)/(double)CLOCKS_PER_SEC;
+  printf("sse2_time: %f\n", total);
+
+  for(int i = 0; i < 1; ++i) {
+    //printf("inputs: %d, %d\n", input0[i*2], input0[i*2 + 1]);
+    //printf("generic... %d, ssse3... %d\n", output0[i], output1[i]);
+  }
+  
+  for(int i = 0; i < vlen; ++i) {
+    //printf("%d...%d\n", output0[i], output01[i]);
+    CPPUNIT_ASSERT(abs(output_generic[i] - output_sse[i]) <= 1);
+    CPPUNIT_ASSERT(abs(output_generic[i] - output_sse2[i]) <= 1);
+  }
+}
+
+#endif
diff --git a/volk/lib/qa_32f_convert_32s_aligned16.h b/volk/lib/qa_32f_convert_32s_aligned16.h
new file mode 100644
index 000000000..97d854463
--- /dev/null
+++ b/volk/lib/qa_32f_convert_32s_aligned16.h
@@ -0,0 +1,18 @@
+#ifndef INCLUDED_QA_32F_CONVERT_32S_ALIGNED16_H
+#define INCLUDED_QA_32F_CONVERT_32S_ALIGNED16_H
+
+#include <cppunit/extensions/HelperMacros.h>
+#include <cppunit/TestCase.h>
+
+class qa_32f_convert_32s_aligned16 : public CppUnit::TestCase {
+
+  CPPUNIT_TEST_SUITE (qa_32f_convert_32s_aligned16);
+  CPPUNIT_TEST (t1);
+  CPPUNIT_TEST_SUITE_END ();
+
+ private:
+  void t1 ();
+};
+
+
+#endif /* INCLUDED_QA_32F_CONVERT_32S_ALIGNED16_H */
diff --git a/volk/lib/qa_32f_convert_32s_unaligned16.cc b/volk/lib/qa_32f_convert_32s_unaligned16.cc
new file mode 100644
index 000000000..e63b17994
--- /dev/null
+++ b/volk/lib/qa_32f_convert_32s_unaligned16.cc
@@ -0,0 +1,70 @@
+#include <volk/volk.h>
+#include <qa_32f_convert_32s_unaligned16.h>
+#include <volk/volk_32f_convert_32s_unaligned16.h>
+#include <cstdlib>
+
+//test for sse2
+
+#ifndef LV_HAVE_SSE2
+
+void qa_32f_convert_32s_unaligned16::t1() {
+  printf("sse2 not available... no test performed\n");
+}
+
+#else
+
+void qa_32f_convert_32s_unaligned16::t1() {
+  
+  volk_environment_init();
+  clock_t start, end;
+  double total;
+  const int vlen = 3201;
+  const int ITERS = 100000;
+  float input0[vlen] __attribute__ ((aligned (16)));
+  
+  int32_t output_generic[vlen] __attribute__ ((aligned (16)));
+  int32_t output_sse[vlen] __attribute__ ((aligned (16)));
+  int32_t output_sse2[vlen] __attribute__ ((aligned (16)));
+
+  for(int i = 0; i < vlen; ++i) {   
+    input0[i] = ((float) (rand() - (RAND_MAX/2))) / static_cast<float>((RAND_MAX/2));
+  }
+  printf("32f_convert_32s_unaligned\n");
+
+  start = clock();
+  for(int count = 0; count < ITERS; ++count) {
+    volk_32f_convert_32s_unaligned16_manual(output_generic, input0, 32768.0, vlen, "generic");
+  }
+  end = clock();
+  total = (double)(end-start)/(double)CLOCKS_PER_SEC;
+  printf("generic_time: %f\n", total);
+
+  start = clock();
+  for(int count = 0; count < ITERS; ++count) {
+    volk_32f_convert_32s_unaligned16_manual(output_sse, input0, 32768.0, vlen, "sse");
+  }
+  end = clock();
+  total = (double)(end-start)/(double)CLOCKS_PER_SEC;
+  printf("sse_time: %f\n", total);
+
+  start = clock();
+  for(int count = 0; count < ITERS; ++count) {
+    volk_32f_convert_32s_unaligned16_manual(output_sse2, input0, 32768.0, vlen, "sse2");
+  }
+  end = clock();
+  total = (double)(end-start)/(double)CLOCKS_PER_SEC;
+  printf("sse2_time: %f\n", total);
+
+  for(int i = 0; i < 1; ++i) {
+    //printf("inputs: %d, %d\n", input0[i*2], input0[i*2 + 1]);
+    //printf("generic... %d, ssse3... %d\n", output0[i], output1[i]);
+  }
+  
+  for(int i = 0; i < vlen; ++i) {
+    //printf("%d...%d\n", output0[i], output01[i]);
+    CPPUNIT_ASSERT(abs(output_generic[i] - output_sse[i]) <= 1);
+    CPPUNIT_ASSERT(abs(output_generic[i] - output_sse2[i]) <= 1);
+  }
+}
+
+#endif
diff --git a/volk/lib/qa_32f_convert_32s_unaligned16.h b/volk/lib/qa_32f_convert_32s_unaligned16.h
new file mode 100644
index 000000000..5d662d86d
--- /dev/null
+++ b/volk/lib/qa_32f_convert_32s_unaligned16.h
@@ -0,0 +1,18 @@
+#ifndef INCLUDED_QA_32F_CONVERT_32S_UNALIGNED16_H
+#define INCLUDED_QA_32F_CONVERT_32S_UNALIGNED16_H
+
+#include <cppunit/extensions/HelperMacros.h>
+#include <cppunit/TestCase.h>
+
+class qa_32f_convert_32s_unaligned16 : public CppUnit::TestCase {
+
+  CPPUNIT_TEST_SUITE (qa_32f_convert_32s_unaligned16);
+  CPPUNIT_TEST (t1);
+  CPPUNIT_TEST_SUITE_END ();
+
+ private:
+  void t1 ();
+};
+
+
+#endif /* INCLUDED_QA_32F_CONVERT_32S_UNALIGNED16_H */
diff --git a/volk/lib/qa_32f_convert_64f_aligned16.cc b/volk/lib/qa_32f_convert_64f_aligned16.cc
new file mode 100644
index 000000000..c546e47de
--- /dev/null
+++ b/volk/lib/qa_32f_convert_64f_aligned16.cc
@@ -0,0 +1,60 @@
+#include <volk/volk.h>
+#include <qa_32f_convert_64f_aligned16.h>
+#include <volk/volk_32f_convert_64f_aligned16.h>
+#include <cstdlib>
+
+//test for sse2
+
+#ifndef LV_HAVE_SSE2
+
+void qa_32f_convert_64f_aligned16::t1() {
+  printf("sse2 not available... no test performed\n");
+}
+
+#else
+
+void qa_32f_convert_64f_aligned16::t1() {
+  
+  volk_environment_init();
+  clock_t start, end;
+  double total;
+  const int vlen = 3201;
+  const int ITERS = 100000;
+  float input0[vlen] __attribute__ ((aligned (16)));
+  
+  double output_generic[vlen] __attribute__ ((aligned (16)));
+  double output_sse2[vlen] __attribute__ ((aligned (16)));
+
+  for(int i = 0; i < vlen; ++i) {   
+    input0[i] = ((float) (rand() - (RAND_MAX/2))) / static_cast<float>((RAND_MAX/2));
+  }
+  printf("32f_convert_64f_aligned\n");
+
+  start = clock();
+  for(int count = 0; count < ITERS; ++count) {
+    volk_32f_convert_64f_aligned16_manual(output_generic, input0, vlen, "generic");
+  }
+  end = clock();
+  total = (double)(end-start)/(double)CLOCKS_PER_SEC;
+  printf("generic_time: %f\n", total);
+
+  start = clock();
+  for(int count = 0; count < ITERS; ++count) {
+    volk_32f_convert_64f_aligned16_manual(output_sse2, input0, vlen, "sse2");
+  }
+  end = clock();
+  total = (double)(end-start)/(double)CLOCKS_PER_SEC;
+  printf("sse2_time: %f\n", total);
+
+  for(int i = 0; i < 1; ++i) {
+    //printf("inputs: %d, %d\n", input0[i*2], input0[i*2 + 1]);
+    //printf("generic... %d, ssse3... %d\n", output0[i], output1[i]);
+  }
+  
+  for(int i = 0; i < vlen; ++i) {
+    //printf("%d...%d\n", output0[i], output01[i]);
+    CPPUNIT_ASSERT_DOUBLES_EQUAL(output_generic[i] ,output_sse2[i], fabs(output_generic[i])*1e-6);
+  }
+}
+
+#endif
diff --git a/volk/lib/qa_32f_convert_64f_aligned16.h b/volk/lib/qa_32f_convert_64f_aligned16.h
new file mode 100644
index 000000000..41eb3e094
--- /dev/null
+++ b/volk/lib/qa_32f_convert_64f_aligned16.h
@@ -0,0 +1,18 @@
+#ifndef INCLUDED_QA_32F_CONVERT_64F_ALIGNED16_H
+#define INCLUDED_QA_32F_CONVERT_64F_ALIGNED16_H
+
+#include <cppunit/extensions/HelperMacros.h>
+#include <cppunit/TestCase.h>
+
+class qa_32f_convert_64f_aligned16 : public CppUnit::TestCase {
+
+  CPPUNIT_TEST_SUITE (qa_32f_convert_64f_aligned16);
+  CPPUNIT_TEST (t1);
+  CPPUNIT_TEST_SUITE_END ();
+
+ private:
+  void t1 ();
+};
+
+
+#endif /* INCLUDED_QA_32F_CONVERT_64F_ALIGNED16_H */
diff --git a/volk/lib/qa_32f_convert_64f_unaligned16.cc b/volk/lib/qa_32f_convert_64f_unaligned16.cc
new file mode 100644
index 000000000..24b51f9af
--- /dev/null
+++ b/volk/lib/qa_32f_convert_64f_unaligned16.cc
@@ -0,0 +1,60 @@
+#include <volk/volk.h>
+#include <qa_32f_convert_64f_unaligned16.h>
+#include <volk/volk_32f_convert_64f_unaligned16.h>
+#include <cstdlib>
+
+//test for sse2
+
+#ifndef LV_HAVE_SSE2
+
+void qa_32f_convert_64f_unaligned16::t1() {
+  printf("sse2 not available... no test performed\n");
+}
+
+#else
+
+void qa_32f_convert_64f_unaligned16::t1() {
+  
+  volk_environment_init();
+  clock_t start, end;
+  double total;
+  const int vlen = 3201;
+  const int ITERS = 100000;
+  float input0[vlen] __attribute__ ((aligned (16)));
+  
+  double output_generic[vlen] __attribute__ ((aligned (16)));
+  double output_sse2[vlen] __attribute__ ((aligned (16)));
+
+  for(int i = 0; i < vlen; ++i) {   
+    input0[i] = ((float) (rand() - (RAND_MAX/2))) / static_cast<float>((RAND_MAX/2));
+  }
+  printf("32f_convert_64f_unaligned\n");
+
+  start = clock();
+  for(int count = 0; count < ITERS; ++count) {
+    volk_32f_convert_64f_unaligned16_manual(output_generic, input0, vlen, "generic");
+  }
+  end = clock();
+  total = (double)(end-start)/(double)CLOCKS_PER_SEC;
+  printf("generic_time: %f\n", total);
+
+  start = clock();
+  for(int count = 0; count < ITERS; ++count) {
+    volk_32f_convert_64f_unaligned16_manual(output_sse2, input0, vlen, "sse2");
+  }
+  end = clock();
+  total = (double)(end-start)/(double)CLOCKS_PER_SEC;
+  printf("sse2_time: %f\n", total);
+
+  for(int i = 0; i < 1; ++i) {
+    //printf("inputs: %d, %d\n", input0[i*2], input0[i*2 + 1]);
+    //printf("generic... %d, ssse3... %d\n", output0[i], output1[i]);
+  }
+  
+  for(int i = 0; i < vlen; ++i) {
+    //printf("%d...%d\n", output0[i], output01[i]);
+    CPPUNIT_ASSERT_EQUAL(output_generic[i], output_sse2[i]);
+  }
+}
+
+#endif
diff --git a/volk/lib/qa_32f_convert_64f_unaligned16.h b/volk/lib/qa_32f_convert_64f_unaligned16.h
new file mode 100644
index 000000000..4b144f033
--- /dev/null
+++ b/volk/lib/qa_32f_convert_64f_unaligned16.h
@@ -0,0 +1,18 @@
+#ifndef INCLUDED_QA_32F_CONVERT_64F_UNALIGNED16_H
+#define INCLUDED_QA_32F_CONVERT_64F_UNALIGNED16_H
+
+#include <cppunit/extensions/HelperMacros.h>
+#include <cppunit/TestCase.h>
+
+class qa_32f_convert_64f_unaligned16 : public CppUnit::TestCase {
+
+  CPPUNIT_TEST_SUITE (qa_32f_convert_64f_unaligned16);
+  CPPUNIT_TEST (t1);
+  CPPUNIT_TEST_SUITE_END ();
+
+ private:
+  void t1 ();
+};
+
+
+#endif /* INCLUDED_QA_32F_CONVERT_64F_UNALIGNED16_H */
diff --git a/volk/lib/qa_32f_convert_8s_aligned16.cc b/volk/lib/qa_32f_convert_8s_aligned16.cc
new file mode 100644
index 000000000..a3d4d6567
--- /dev/null
+++ b/volk/lib/qa_32f_convert_8s_aligned16.cc
@@ -0,0 +1,70 @@
+#include <volk/volk.h>
+#include <qa_32f_convert_8s_aligned16.h>
+#include <volk/volk_32f_convert_8s_aligned16.h>
+#include <cstdlib>
+
+//test for sse2
+
+#ifndef LV_HAVE_SSE2
+
+void qa_32f_convert_8s_aligned16::t1() {
+  printf("sse2 not available... no test performed\n");
+}
+
+#else
+
+void qa_32f_convert_8s_aligned16::t1() {
+  
+  volk_environment_init();
+  clock_t start, end;
+  double total;
+  const int vlen = 3201;
+  const int ITERS = 100000;
+  float input0[vlen] __attribute__ ((aligned (16)));
+  
+  int8_t output_generic[vlen] __attribute__ ((aligned (16)));
+  int8_t output_sse[vlen] __attribute__ ((aligned (16)));
+  int8_t output_sse2[vlen] __attribute__ ((aligned (16)));
+
+  for(int i = 0; i < vlen; ++i) {   
+    input0[i] = ((float) (rand() - (RAND_MAX/2))) / static_cast<float>((RAND_MAX/2));
+  }
+  printf("32f_convert_8s_aligned\n");
+
+  start = clock();
+  for(int count = 0; count < ITERS; ++count) {
+    volk_32f_convert_8s_aligned16_manual(output_generic, input0, 128.0, vlen, "generic");
+  }
+  end = clock();
+  total = (double)(end-start)/(double)CLOCKS_PER_SEC;
+  printf("generic_time: %f\n", total);
+
+  start = clock();
+  for(int count = 0; count < ITERS; ++count) {
+    volk_32f_convert_8s_aligned16_manual(output_sse, input0, 128.0, vlen, "sse");
+  }
+  end = clock();
+  total = (double)(end-start)/(double)CLOCKS_PER_SEC;
+  printf("sse_time: %f\n", total);
+
+  start = clock();
+  for(int count = 0; count < ITERS; ++count) {
+    volk_32f_convert_8s_aligned16_manual(output_sse2, input0, 128.0, vlen, "sse2");
+  }
+  end = clock();
+  total = (double)(end-start)/(double)CLOCKS_PER_SEC;
+  printf("sse2_time: %f\n", total);
+
+  for(int i = 0; i < 1; ++i) {
+    //printf("inputs: %d, %d\n", input0[i*2], input0[i*2 + 1]);
+    //printf("generic... %d, ssse3... %d\n", output0[i], output1[i]);
+  }
+  
+  for(int i = 0; i < vlen; ++i) {
+    //printf("%d...%d\n", output0[i], output01[i]);
+    CPPUNIT_ASSERT(abs(output_generic[i] - output_sse[i]) <= 1);
+    CPPUNIT_ASSERT(abs(output_generic[i] - output_sse2[i]) <= 1);
+  }
+}
+
+#endif
diff --git a/volk/lib/qa_32f_convert_8s_aligned16.h b/volk/lib/qa_32f_convert_8s_aligned16.h
new file mode 100644
index 000000000..68a523f34
--- /dev/null
+++ b/volk/lib/qa_32f_convert_8s_aligned16.h
@@ -0,0 +1,18 @@
+#ifndef INCLUDED_QA_32F_CONVERT_8S_ALIGNED16_H
+#define INCLUDED_QA_32F_CONVERT_8S_ALIGNED16_H
+
+#include <cppunit/extensions/HelperMacros.h>
+#include <cppunit/TestCase.h>
+
+class qa_32f_convert_8s_aligned16 : public CppUnit::TestCase {
+
+  CPPUNIT_TEST_SUITE (qa_32f_convert_8s_aligned16);
+  CPPUNIT_TEST (t1);
+  CPPUNIT_TEST_SUITE_END ();
+
+ private:
+  void t1 ();
+};
+
+
+#endif /* INCLUDED_QA_32F_CONVERT_8S_ALIGNED16_H */
diff --git a/volk/lib/qa_32f_convert_8s_unaligned16.cc b/volk/lib/qa_32f_convert_8s_unaligned16.cc
new file mode 100644
index 000000000..d885fd6bb
--- /dev/null
+++ b/volk/lib/qa_32f_convert_8s_unaligned16.cc
@@ -0,0 +1,70 @@
+#include <volk/volk.h>
+#include <qa_32f_convert_8s_unaligned16.h>
+#include <volk/volk_32f_convert_8s_unaligned16.h>
+#include <cstdlib>
+
+//test for sse2
+
+#ifndef LV_HAVE_SSE2
+
+void qa_32f_convert_8s_unaligned16::t1() {
+  printf("sse2 not available... no test performed\n");
+}
+
+#else
+
+void qa_32f_convert_8s_unaligned16::t1() {
+  
+  volk_environment_init();
+  clock_t start, end;
+  double total;
+  const int vlen = 3201;
+  const int ITERS = 100000;
+  float input0[vlen] __attribute__ ((aligned (16)));
+  
+  int8_t output_generic[vlen] __attribute__ ((aligned (16)));
+  int8_t output_sse[vlen] __attribute__ ((aligned (16)));
+  int8_t output_sse2[vlen] __attribute__ ((aligned (16)));
+
+  for(int i = 0; i < vlen; ++i) {   
+    input0[i] = ((float) (rand() - (RAND_MAX/2))) / static_cast<float>((RAND_MAX/2));
+  }
+  printf("32f_convert_8s_unaligned\n");
+
+  start = clock();
+  for(int count = 0; count < ITERS; ++count) {
+    volk_32f_convert_8s_unaligned16_manual(output_generic, input0, 128.0, vlen, "generic");
+  }
+  end = clock();
+  total = (double)(end-start)/(double)CLOCKS_PER_SEC;
+  printf("generic_time: %f\n", total);
+
+  start = clock();
+  for(int count = 0; count < ITERS; ++count) {
+    volk_32f_convert_8s_unaligned16_manual(output_sse, input0, 128.0, vlen, "sse");
+  }
+  end = clock();
+  total = (double)(end-start)/(double)CLOCKS_PER_SEC;
+  printf("sse_time: %f\n", total);
+
+  start = clock();
+  for(int count = 0; count < ITERS; ++count) {
+    volk_32f_convert_8s_unaligned16_manual(output_sse2, input0, 128.0, vlen, "sse2");
+  }
+  end = clock();
+  total = (double)(end-start)/(double)CLOCKS_PER_SEC;
+  printf("sse2_time: %f\n", total);
+
+  for(int i = 0; i < 1; ++i) {
+    //printf("inputs: %d, %d\n", input0[i*2], input0[i*2 + 1]);
+    //printf("generic... %d, ssse3... %d\n", output0[i], output1[i]);
+  }
+  
+  for(int i = 0; i < vlen; ++i) {
+    //printf("%d...%d\n", output0[i], output01[i]);
+    CPPUNIT_ASSERT(abs(output_generic[i] - output_sse[i]) <= 1);
+    CPPUNIT_ASSERT(abs(output_generic[i] - output_sse2[i]) <= 1);
+  }
+}
+
+#endif
diff --git a/volk/lib/qa_32f_convert_8s_unaligned16.h b/volk/lib/qa_32f_convert_8s_unaligned16.h
new file mode 100644
index 000000000..88d4ff42a
--- /dev/null
+++ b/volk/lib/qa_32f_convert_8s_unaligned16.h
@@ -0,0 +1,18 @@
+#ifndef INCLUDED_QA_32F_CONVERT_8S_UNALIGNED16_H
+#define INCLUDED_QA_32F_CONVERT_8S_UNALIGNED16_H
+
+#include <cppunit/extensions/HelperMacros.h>
+#include <cppunit/TestCase.h>
+
+class qa_32f_convert_8s_unaligned16 : public CppUnit::TestCase {
+
+  CPPUNIT_TEST_SUITE (qa_32f_convert_8s_unaligned16);
+  CPPUNIT_TEST (t1);
+  CPPUNIT_TEST_SUITE_END ();
+
+ private:
+  void t1 ();
+};
+
+
+#endif /* INCLUDED_QA_32F_CONVERT_8S_UNALIGNED16_H */
diff --git a/volk/lib/qa_32f_divide_aligned16.cc b/volk/lib/qa_32f_divide_aligned16.cc
new file mode 100644
index 000000000..b20999beb
--- /dev/null
+++ b/volk/lib/qa_32f_divide_aligned16.cc
@@ -0,0 +1,60 @@
+#include <volk/volk.h>
+#include <qa_32f_divide_aligned16.h>
+#include <volk/volk_32f_divide_aligned16.h>
+#include <cstdlib>
+
+//test for sse
+
+#ifndef LV_HAVE_SSE
+
+void qa_32f_divide_aligned16::t1() {
+  printf("sse not available... no test performed\n");
+}
+
+#else
+
+void qa_32f_divide_aligned16::t1() {
+  
+  volk_environment_init();
+  clock_t start, end;
+  double total;
+  const int vlen = 3201;
+  const int ITERS = 100000;
+  float input0[vlen] __attribute__ ((aligned (16)));
+  float input1[vlen] __attribute__ ((aligned (16)));
+  
+  float output0[vlen] __attribute__ ((aligned (16)));
+  float output01[vlen] __attribute__ ((aligned (16)));
+
+  for(int i = 0; i < vlen; ++i) {   
+    input0[i] = ((float) (rand() - (RAND_MAX/2))) / static_cast<float>((RAND_MAX/2));
+    input1[i] = ((float) (rand() - (RAND_MAX/2))) / static_cast<float>((RAND_MAX/2));
+  }
+  printf("32f_divide_aligned\n");
+
+  start = clock();
+  for(int count = 0; count < ITERS; ++count) {
+    volk_32f_divide_aligned16_manual(output0, input0, input1, vlen, "generic");
+  }
+  end = clock();
+  total = (double)(end-start)/(double)CLOCKS_PER_SEC;
+  printf("generic_time: %f\n", total);
+  start = clock();
+  for(int count = 0; count < ITERS; ++count) {
+    volk_32f_divide_aligned16_manual(output01, input0, input1, vlen, "sse");
+  }
+  end = clock();
+  total = (double)(end-start)/(double)CLOCKS_PER_SEC;
+  printf("sse_time: %f\n", total);
+  for(int i = 0; i < 1; ++i) {
+    //printf("inputs: %d, %d\n", input0[i*2], input0[i*2 + 1]);
+    //printf("generic... %d, ssse3... %d\n", output0[i], output1[i]);
+  }
+  
+  for(int i = 0; i < vlen; ++i) {
+    //printf("%d...%d\n", output0[i], output01[i]);
+    CPPUNIT_ASSERT_EQUAL(output0[i], output01[i]);
+  }
+}
+
+#endif
diff --git a/volk/lib/qa_32f_divide_aligned16.h b/volk/lib/qa_32f_divide_aligned16.h
new file mode 100644
index 000000000..79d5ae4b8
--- /dev/null
+++ b/volk/lib/qa_32f_divide_aligned16.h
@@ -0,0 +1,18 @@
+#ifndef INCLUDED_QA_32F_DIVIDE_ALIGNED16_H
+#define INCLUDED_QA_32F_DIVIDE_ALIGNED16_H
+
+#include <cppunit/extensions/HelperMacros.h>
+#include <cppunit/TestCase.h>
+
+class qa_32f_divide_aligned16 : public CppUnit::TestCase {
+
+  CPPUNIT_TEST_SUITE (qa_32f_divide_aligned16);
+  CPPUNIT_TEST (t1);
+  CPPUNIT_TEST_SUITE_END ();
+
+ private:
+  void t1 ();
+};
+
+
+#endif /* INCLUDED_QA_32F_DIVIDE_ALIGNED16_H */
diff --git a/volk/lib/qa_32f_dot_prod_aligned16.cc b/volk/lib/qa_32f_dot_prod_aligned16.cc
new file mode 100644
index 000000000..98c1f2d99
--- /dev/null
+++ b/volk/lib/qa_32f_dot_prod_aligned16.cc
@@ -0,0 +1,183 @@
+#include <volk/volk_runtime.h>
+#include <volk/volk.h>
+#include <qa_32f_dot_prod_aligned16.h>
+#include <stdlib.h>
+#include <math.h>
+#include <time.h>
+
+#define	ERR_DELTA	(1e-4)
+
+//test for sse
+static float uniform() {
+  return 2.0 * ((float) rand() / RAND_MAX - 0.5);	// uniformly (-1, 1)
+}
+
+static void
+random_floats (float *buf, unsigned n)
+{
+  for (unsigned i = 0; i < n; i++)
+    buf[i] = uniform ();
+}
+
+#ifndef LV_HAVE_SSE4_1
+
+#ifdef LV_HAVE_SSE3
+void qa_32f_dot_prod_aligned16::t1() {
+  const int vlen = 2046;
+  const int ITER = 100000;
+
+  int i;
+
+  volk_environment_init();
+  int ret;
+  clock_t start, end;
+  double total;
+  float * input;
+  float * taps;
+  
+  float * result_generic;
+  float * result_sse;
+  float * result_sse3;
+
+  ret = posix_memalign((void**)&input, 16, vlen* sizeof(float));
+  ret = posix_memalign((void**)&taps, 16, vlen *sizeof(float));
+  ret = posix_memalign((void**)&result_generic, 16, ITER*sizeof(float));
+  ret = posix_memalign((void**)&result_sse, 16, ITER*sizeof(float));
+  ret = posix_memalign((void**)&result_sse3, 16, ITER*sizeof(float));
+
+  random_floats((float*)input, vlen);
+  random_floats((float*)taps, vlen);
+  
+  
+  printf("32f_dot_prod_aligned16\n");
+
+  start = clock();
+  for(i = 0; i < ITER; i++){
+    volk_32f_dot_prod_aligned16_manual(&result_generic[i], input, taps, vlen,  "generic");
+  }
+  end = clock();
+  total = (double)(end-start)/(double)CLOCKS_PER_SEC;
+  printf("generic_time: %f\n", total);
+
+  start = clock();
+  for(i = 0; i < ITER; i++){
+    volk_32f_dot_prod_aligned16_manual(&result_sse[i], input, taps, vlen, "sse");
+  }
+  end = clock();
+  total = (double)(end-start)/(double)CLOCKS_PER_SEC;
+  printf("sse_time: %f\n", total);
+
+  start = clock();
+  for(i = 0; i < ITER; i++){
+    volk_32f_dot_prod_aligned16_manual(&result_sse3[i], input, taps, vlen, "sse3");
+  }
+  end = clock();
+  total = (double)(end-start)/(double)CLOCKS_PER_SEC;
+  printf("sse3_time: %f\n", total);
+
+  printf("generic: %f ... sse: %f  ... sse3 %f \n", result_generic[0], result_sse[0], result_sse3[0]);
+
+  for(i = 0; i < ITER; i++){
+    CPPUNIT_ASSERT_DOUBLES_EQUAL (result_generic[i], result_sse[i], fabs(result_generic[i])*ERR_DELTA);
+    CPPUNIT_ASSERT_DOUBLES_EQUAL (result_generic[i], result_sse3[i], fabs(result_generic[i])*ERR_DELTA);
+  }
+
+  free(input);
+  free(taps);
+  free(result_generic);
+  free(result_sse);
+  free(result_sse3);
+  
+}
+#else
+void qa_32f_dot_prod_aligned16::t1() {
+  printf("sse3 not available... no test performed\n");
+}
+
+#endif /* LV_HAVE_SSE3 */
+
+#else
+
+void qa_32f_dot_prod_aligned16::t1() {
+  
+  
+  volk_runtime_init();
+
+  const int vlen = 4095;
+  const int ITER = 100000;
+
+  int i;
+
+  volk_environment_init();
+  int ret;
+  clock_t start, end;
+  double total;
+  float * input;
+  float * taps;
+  
+  float * result_generic;
+  float * result_sse;
+  float * result_sse3;
+  float * result_sse4_1;
+
+  ret = posix_memalign((void**)&input, 16, vlen * sizeof(float));
+  ret = posix_memalign((void**)&taps, 16, vlen * sizeof(float));
+  ret = posix_memalign((void**)&result_generic, 16, ITER*sizeof(float));
+  ret = posix_memalign((void**)&result_sse, 16, ITER*sizeof(float));
+  ret = posix_memalign((void**)&result_sse3, 16, ITER*sizeof(float));
+  ret = posix_memalign((void**)&result_sse4_1, 16, ITER*sizeof(float));
+
+  random_floats((float*)input, vlen);
+  random_floats((float*)taps, vlen);
+  
+  printf("32f_dot_prod_aligned16\n");
+
+  start = clock();
+  for(i = 0; i < ITER; i++){
+    volk_32f_dot_prod_aligned16_manual(&result_generic[i], input, taps, vlen,  "generic");
+  }
+  end = clock();
+  total = (double)(end-start)/(double)CLOCKS_PER_SEC;
+  printf("generic_time: %f\n", total);
+
+  start = clock();
+  for(i = 0; i < ITER; i++){
+    volk_32f_dot_prod_aligned16_manual(&result_sse[i], input, taps, vlen, "sse");
+  }
+  end = clock();
+  total = (double)(end-start)/(double)CLOCKS_PER_SEC;
+  printf("sse_time: %f\n", total);
+
+  start = clock();
+  for(i = 0; i < ITER; i++){
+    volk_32f_dot_prod_aligned16_manual(&result_sse3[i], input, taps, vlen, "sse3");
+  }
+  end = clock();
+  total = (double)(end-start)/(double)CLOCKS_PER_SEC;
+  printf("sse3_time: %f\n", total);
+
+  start = clock();
+  for(i = 0; i < ITER; i++){
+    get_volk_runtime()->volk_32f_dot_prod_aligned16(&result_sse4_1[i], input, taps, vlen);
+  }
+  end = clock();
+  total = (double)(end-start)/(double)CLOCKS_PER_SEC;
+  printf("sse4_1_time: %f\n", total);
+
+  //printf("generic: %f ... sse: %f  ... sse3 %f  ... sse4_1 %f \n", result_generic[0], result_sse[0], result_sse3[0], result_sse4_1[0]);
+  for(i =0; i < ITER; i++){
+    CPPUNIT_ASSERT_DOUBLES_EQUAL (result_generic[i], result_sse[i], fabs(result_generic[i])*ERR_DELTA);
+    CPPUNIT_ASSERT_DOUBLES_EQUAL (result_generic[i], result_sse3[i], fabs(result_generic[i])*ERR_DELTA);
+    CPPUNIT_ASSERT_DOUBLES_EQUAL (result_generic[i], result_sse4_1[i], fabs(result_generic[i])*ERR_DELTA);
+  }
+
+  free(input);
+  free(taps);
+  free(result_generic);
+  free(result_sse);
+  free(result_sse3);
+  free(result_sse4_1);
+  
+}
+
+#endif /*LV_HAVE_SSE*/
diff --git a/volk/lib/qa_32f_dot_prod_aligned16.h b/volk/lib/qa_32f_dot_prod_aligned16.h
new file mode 100644
index 000000000..6931a9e98
--- /dev/null
+++ b/volk/lib/qa_32f_dot_prod_aligned16.h
@@ -0,0 +1,18 @@
+#ifndef INCLUDED_QA_32F_DOT_PROD_ALIGNED16_H
+#define INCLUDED_QA_32F_DOT_PROD_ALIGNED16_H
+
+#include <cppunit/extensions/HelperMacros.h>
+#include <cppunit/TestCase.h>
+
+class qa_32f_dot_prod_aligned16 : public CppUnit::TestCase {
+
+  CPPUNIT_TEST_SUITE (qa_32f_dot_prod_aligned16);
+  CPPUNIT_TEST (t1);
+  CPPUNIT_TEST_SUITE_END ();
+
+ private:
+  void t1 ();
+};
+
+
+#endif /* INCLUDED_QA_32F_DOT_PROD_ALIGNED16_H */
diff --git a/volk/lib/qa_32f_dot_prod_unaligned16.cc b/volk/lib/qa_32f_dot_prod_unaligned16.cc
new file mode 100644
index 000000000..8e97d4249
--- /dev/null
+++ b/volk/lib/qa_32f_dot_prod_unaligned16.cc
@@ -0,0 +1,190 @@
+#include <volk/volk_runtime.h>
+#include <volk/volk.h>
+#include <qa_32f_dot_prod_unaligned16.h>
+#include <stdlib.h>
+#include <math.h>
+#include <time.h>
+
+#define	ERR_DELTA	(1e-4)
+
+//test for sse
+static float uniform() {
+  return 2.0 * ((float) rand() / RAND_MAX - 0.5);	// uniformly (-1, 1)
+}
+
+static void
+random_floats (float *buf, unsigned n)
+{
+  for (unsigned i = 0; i < n; i++)
+    buf[i] = uniform ();
+}
+
+#ifndef LV_HAVE_SSE4_1
+
+#ifdef LV_HAVE_SSE3
+void qa_32f_dot_prod_unaligned16::t1() {
+  
+  
+  volk_runtime_init();
+
+  const int vlen = 2046;
+  const int ITER = 100000;
+
+  int i;
+
+  volk_environment_init();
+  int ret;
+  clock_t start, end;
+  double total;
+  float * input;
+  float * taps;
+  
+  float * result_generic;
+  float * result_sse;
+  float * result_sse3;
+
+  ret = posix_memalign((void**)&input, 16, vlen* sizeof(float));
+  ret = posix_memalign((void**)&taps, 16, vlen *sizeof(float));
+  ret = posix_memalign((void**)&result_generic, 16, ITER*sizeof(float));
+  ret = posix_memalign((void**)&result_sse, 16, ITER*sizeof(float));
+  ret = posix_memalign((void**)&result_sse3, 16, ITER*sizeof(float));
+
+  random_floats((float*)input, vlen);
+  random_floats((float*)taps, vlen);
+  
+  
+  printf("32f_dot_prod_unaligned16\n");
+
+  start = clock();
+  for(i = 0; i < ITER; i++){
+    volk_32f_dot_prod_unaligned16_manual(&result_generic[i], input, taps, vlen,  "generic");
+  }
+  end = clock();
+  total = (double)(end-start)/(double)CLOCKS_PER_SEC;
+  printf("generic_time: %f\n", total);
+
+  start = clock();
+  for(i = 0; i < ITER; i++){
+    volk_32f_dot_prod_unaligned16_manual(&result_sse[i], input, taps, vlen, "sse");
+  }
+  end = clock();
+  total = (double)(end-start)/(double)CLOCKS_PER_SEC;
+  printf("sse_time: %f\n", total);
+
+  start = clock();
+  for(i = 0; i < ITER; i++){
+    volk_32f_dot_prod_unaligned16_manual(&result_sse3[i], input, taps, vlen, "sse3");
+  }
+  end = clock();
+  total = (double)(end-start)/(double)CLOCKS_PER_SEC;
+  printf("sse3_time: %f\n", total);
+
+  printf("generic: %f ... sse: %f  ... sse3 %f \n", result_generic[0], result_sse[0], result_sse3[0]);
+
+  for(i = 0; i < ITER; i++){
+    CPPUNIT_ASSERT_DOUBLES_EQUAL (result_generic[i], result_sse[i], fabs(result_generic[i])*ERR_DELTA);
+    CPPUNIT_ASSERT_DOUBLES_EQUAL (result_generic[i], result_sse3[i], fabs(result_generic[i])*ERR_DELTA);
+  }
+
+  free(input);
+  free(taps);
+  free(result_generic);
+  free(result_sse);
+  free(result_sse3);
+  
+}
+#else
+void qa_32f_dot_prod_unaligned16::t1() {
+  printf("sse3 not available... no test performed\n");
+}
+
+#endif /* LV_HAVE_SSE3 */
+
+#else
+
+void qa_32f_dot_prod_unaligned16::t1() {
+  
+  
+  volk_runtime_init();
+
+  const int vlen = 4095;
+  const int ITER = 100000;
+
+  int i;
+
+  volk_environment_init();
+  int ret;
+  clock_t start, end;
+  double total;
+  float * input;
+  float * taps;
+  
+  float * result_generic;
+  float * result_sse;
+  float * result_sse3;
+  float * result_sse4_1;
+
+  ret = posix_memalign((void**)&input, 16, (vlen+1) * sizeof(float));
+  ret = posix_memalign((void**)&taps, 16, (vlen+1) * sizeof(float));
+  ret = posix_memalign((void**)&result_generic, 16, ITER*sizeof(float));
+  ret = posix_memalign((void**)&result_sse, 16, ITER*sizeof(float));
+  ret = posix_memalign((void**)&result_sse3, 16, ITER*sizeof(float));
+  ret = posix_memalign((void**)&result_sse4_1, 16, ITER*sizeof(float));
+
+  input = &input[1]; // Make sure the buffer is unaligned
+  taps = &taps[1]; // Make sure the buffer is unaligned
+
+  random_floats((float*)input, vlen);
+  random_floats((float*)taps, vlen);
+  
+  printf("32f_dot_prod_unaligned16\n");
+  
+  start = clock();
+  for(i = 0; i < ITER; i++){
+    volk_32f_dot_prod_unaligned16_manual(&result_generic[i], input, taps, vlen,  "generic");
+  }
+  end = clock();
+  total = (double)(end-start)/(double)CLOCKS_PER_SEC;
+  printf("generic_time: %f\n", total);
+
+  start = clock();
+  for(i = 0; i < ITER; i++){
+    volk_32f_dot_prod_unaligned16_manual(&result_sse[i], input, taps, vlen, "sse");
+  }
+  end = clock();
+  total = (double)(end-start)/(double)CLOCKS_PER_SEC;
+  printf("sse_time: %f\n", total);
+
+  start = clock();
+  for(i = 0; i < ITER; i++){
+    volk_32f_dot_prod_unaligned16_manual(&result_sse3[i], input, taps, vlen, "sse3");
+  }
+  end = clock();
+  total = (double)(end-start)/(double)CLOCKS_PER_SEC;
+  printf("sse3_time: %f\n", total);
+
+  start = clock();
+  for(i = 0; i < ITER; i++){
+    get_volk_runtime()->volk_32f_dot_prod_unaligned16(&result_sse4_1[i], input, taps, vlen);
+  }
+  end = clock();
+  total = (double)(end-start)/(double)CLOCKS_PER_SEC;
+  printf("sse4_1_time: %f\n", total);
+
+  //printf("generic: %f ... sse: %f  ... sse3 %f  ... sse4_1 %f \n", result_generic[0], result_sse[0], result_sse3[0], result_sse4_1[0]);
+  for(i =0; i < ITER; i++){
+    CPPUNIT_ASSERT_DOUBLES_EQUAL (result_generic[i], result_sse[i], fabs(result_generic[i])*ERR_DELTA);
+    CPPUNIT_ASSERT_DOUBLES_EQUAL (result_generic[i], result_sse3[i], fabs(result_generic[i])*ERR_DELTA);
+    CPPUNIT_ASSERT_DOUBLES_EQUAL (result_generic[i], result_sse4_1[i], fabs(result_generic[i])*ERR_DELTA);
+  }
+
+  free(&input[-1]);
+  free(&taps[-1]);
+  free(result_generic);
+  free(result_sse);
+  free(result_sse3);
+  free(result_sse4_1);
+  
+}
+
+#endif /*LV_HAVE_SSE*/
diff --git a/volk/lib/qa_32f_dot_prod_unaligned16.h b/volk/lib/qa_32f_dot_prod_unaligned16.h
new file mode 100644
index 000000000..e8bad07fe
--- /dev/null
+++ b/volk/lib/qa_32f_dot_prod_unaligned16.h
@@ -0,0 +1,18 @@
+#ifndef INCLUDED_QA_32F_DOT_PROD_UNALIGNED16_H
+#define INCLUDED_QA_32F_DOT_PROD_UNALIGNED16_H
+
+#include <cppunit/extensions/HelperMacros.h>
+#include <cppunit/TestCase.h>
+
+class qa_32f_dot_prod_unaligned16 : public CppUnit::TestCase {
+
+  CPPUNIT_TEST_SUITE (qa_32f_dot_prod_unaligned16);
+  CPPUNIT_TEST (t1);
+  CPPUNIT_TEST_SUITE_END ();
+
+ private:
+  void t1 ();
+};
+
+
+#endif /* INCLUDED_QA_32F_DOT_PROD_UNALIGNED16_H */
diff --git a/volk/lib/qa_32f_fm_detect_aligned16.cc b/volk/lib/qa_32f_fm_detect_aligned16.cc
new file mode 100644
index 000000000..ca65add28
--- /dev/null
+++ b/volk/lib/qa_32f_fm_detect_aligned16.cc
@@ -0,0 +1,60 @@
+#include <volk/volk.h>
+#include <qa_32f_fm_detect_aligned16.h>
+#include <volk/volk_32f_fm_detect_aligned16.h>
+#include <cstdlib>
+
+//test for sse
+
+#ifndef LV_HAVE_SSE
+
+void qa_32f_fm_detect_aligned16::t1() {
+  printf("sse not available... no test performed\n");
+}
+
+#else
+
+void qa_32f_fm_detect_aligned16::t1() {
+  
+  volk_environment_init();
+  clock_t start, end;
+  double total;
+  const int vlen = 3201;
+  const int ITERS = 10000;
+  float input0[vlen] __attribute__ ((aligned (16)));
+  
+  float output0[vlen] __attribute__ ((aligned (16)));
+  float output01[vlen] __attribute__ ((aligned (16)));
+
+  for(int i = 0; i < vlen; ++i) {   
+    input0[i] = ((float) (rand() - (RAND_MAX/2))) / static_cast<float>((RAND_MAX/2));
+  }
+  printf("32f_fm_detect_aligned\n");
+
+  start = clock();
+  float save = 0.1;
+  for(int count = 0; count < ITERS; ++count) {
+    volk_32f_fm_detect_aligned16_manual(output0, input0, 1.0, &save, vlen, "generic");
+  }
+  end = clock();
+  total = (double)(end-start)/(double)CLOCKS_PER_SEC;
+  printf("generic_time: %f\n", total);
+  start = clock();
+  save = 0.1;
+  for(int count = 0; count < ITERS; ++count) {
+    volk_32f_fm_detect_aligned16_manual(output01, input0, 1.0, &save, vlen, "sse");
+  }
+  end = clock();
+  total = (double)(end-start)/(double)CLOCKS_PER_SEC;
+  printf("sse_time: %f\n", total);
+  for(int i = 0; i < 1; ++i) {
+    //printf("inputs: %d, %d\n", input0[i*2], input0[i*2 + 1]);
+    //printf("generic... %d, ssse3... %d\n", output0[i], output1[i]);
+  }
+  
+  for(int i = 0; i < vlen; ++i) {
+    //printf("%d...%d\n", output0[i], output01[i]);
+    CPPUNIT_ASSERT_DOUBLES_EQUAL(output0[i], output01[i], fabs(output0[i]) * 1e-4);
+  }
+}
+
+#endif
diff --git a/volk/lib/qa_32f_fm_detect_aligned16.h b/volk/lib/qa_32f_fm_detect_aligned16.h
new file mode 100644
index 000000000..a2680c524
--- /dev/null
+++ b/volk/lib/qa_32f_fm_detect_aligned16.h
@@ -0,0 +1,18 @@
+#ifndef INCLUDED_QA_32F_FM_DETECT_ALIGNED16_H
+#define INCLUDED_QA_32F_FM_DETECT_ALIGNED16_H
+
+#include <cppunit/extensions/HelperMacros.h>
+#include <cppunit/TestCase.h>
+
+class qa_32f_fm_detect_aligned16 : public CppUnit::TestCase {
+
+  CPPUNIT_TEST_SUITE (qa_32f_fm_detect_aligned16);
+  CPPUNIT_TEST (t1);
+  CPPUNIT_TEST_SUITE_END ();
+
+ private:
+  void t1 ();
+};
+
+
+#endif /* INCLUDED_QA_32F_FM_DETECT_ALIGNED16_H */
diff --git a/volk/lib/qa_32f_index_max_aligned16.cc b/volk/lib/qa_32f_index_max_aligned16.cc
new file mode 100644
index 000000000..a1c3d4cd1
--- /dev/null
+++ b/volk/lib/qa_32f_index_max_aligned16.cc
@@ -0,0 +1,103 @@
+#include <volk/volk_runtime.h>
+#include <volk/volk.h>
+#include <qa_32f_index_max_aligned16.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <time.h>
+
+#define ERR_DELTA (1e-4)
+#define NUM_ITERS 1000000
+#define VEC_LEN 3097
+static float uniform() {
+  return 2.0 * ((float) rand() / RAND_MAX - 0.5);	// uniformly (-1, 1)
+}
+
+static void
+random_floats (float *buf, unsigned n)
+{
+  unsigned int i = 0;
+  for (; i < n; i++) {
+
+    buf[i] = uniform () * 32767;
+
+  }
+}
+
+
+#ifndef LV_HAVE_SSE
+
+void qa_32f_index_max_aligned16::t1(){
+  printf("sse not available... no test performed\n");
+}
+
+#else
+
+
+void qa_32f_index_max_aligned16::t1(){
+ 
+  const int vlen = VEC_LEN;
+
+  
+  volk_runtime_init();
+  
+  volk_environment_init();
+  int ret;
+
+  unsigned int* target_sse4_1;
+  unsigned int* target_sse;
+  unsigned int* target_generic;
+  float* src0 ;
+  
+  
+  unsigned int i_target_sse4_1;
+  target_sse4_1 = &i_target_sse4_1;
+  unsigned int i_target_sse;
+  target_sse = &i_target_sse;
+  unsigned int i_target_generic;
+  target_generic = &i_target_generic;
+
+  ret = posix_memalign((void**)&src0, 16, vlen *sizeof(float));
+  
+  random_floats((float*)src0, vlen);
+  
+  printf("32f_index_max_aligned16\n");
+
+  clock_t start, end;
+  double total;
+  
+  
+  start = clock();
+  for(int k = 0; k < NUM_ITERS; ++k) {
+    volk_32f_index_max_aligned16_manual(target_generic, src0, vlen, "generic");
+  }
+  end = clock();  
+  total = (double)(end-start)/(double)CLOCKS_PER_SEC;
+  printf("generic time: %f\n", total);
+
+  start = clock();
+  for(int k = 0; k < NUM_ITERS; ++k) {
+    volk_32f_index_max_aligned16_manual(target_sse, src0, vlen, "sse2");
+  }
+  
+  end = clock();  
+  total = (double)(end-start)/(double)CLOCKS_PER_SEC;
+  printf("sse time: %f\n", total);
+  
+  start = clock();
+  for(int k = 0; k < NUM_ITERS; ++k) {
+    get_volk_runtime()->volk_32f_index_max_aligned16(target_sse4_1, src0, vlen);
+  }
+  
+  end = clock();  
+  total = (double)(end-start)/(double)CLOCKS_PER_SEC;
+  printf("sse4.1 time: %f\n", total);
+  
+  
+  printf("generic: %u, sse: %u, sse4.1: %u\n", target_generic[0], target_sse[0], target_sse4_1[0]);
+  CPPUNIT_ASSERT_EQUAL(target_generic[0], target_sse[0]);
+  CPPUNIT_ASSERT_EQUAL(target_generic[0], target_sse4_1[0]);
+  
+  free(src0);
+}
+
+#endif /*LV_HAVE_SSE3*/
diff --git a/volk/lib/qa_32f_index_max_aligned16.h b/volk/lib/qa_32f_index_max_aligned16.h
new file mode 100644
index 000000000..8cadffa47
--- /dev/null
+++ b/volk/lib/qa_32f_index_max_aligned16.h
@@ -0,0 +1,18 @@
+#ifndef INCLUDED_QA_32F_INDEX_MAX_ALIGNED16_H
+#define INCLUDED_QA_32F_INDEX_MAX_ALIGNED16_H
+
+#include <cppunit/extensions/HelperMacros.h>
+#include <cppunit/TestCase.h>
+
+class qa_32f_index_max_aligned16 : public CppUnit::TestCase {
+
+  CPPUNIT_TEST_SUITE (qa_32f_index_max_aligned16);
+  CPPUNIT_TEST (t1);
+  CPPUNIT_TEST_SUITE_END ();
+
+ private:
+  void t1 ();
+};
+
+
+#endif /* INCLUDED_QA_32F_INDEX_MAX_ALIGNED16_H */
diff --git a/volk/lib/qa_32f_interleave_16sc_aligned16.cc b/volk/lib/qa_32f_interleave_16sc_aligned16.cc
new file mode 100644
index 000000000..2a937637f
--- /dev/null
+++ b/volk/lib/qa_32f_interleave_16sc_aligned16.cc
@@ -0,0 +1,75 @@
+#include <volk/volk.h>
+#include <qa_32f_interleave_16sc_aligned16.h>
+#include <volk/volk_32f_interleave_16sc_aligned16.h>
+#include <cstdlib>
+
+//test for sse
+
+#ifndef LV_HAVE_SSE2
+
+void qa_32f_interleave_16sc_aligned16::t1() {
+  printf("sse2 not available... no test performed\n");
+}
+
+#else
+
+void qa_32f_interleave_16sc_aligned16::t1() {
+  
+  volk_environment_init();
+  clock_t start, end;
+  double total;
+  const int vlen = 3201;
+  const int ITERS = 100000;
+  float input0[vlen] __attribute__ ((aligned (16)));
+  float input1[vlen] __attribute__ ((aligned (16)));
+  
+  std::complex<int16_t> output_generic[vlen] __attribute__ ((aligned (16)));
+  std::complex<int16_t> output_sse[vlen] __attribute__ ((aligned (16)));
+  std::complex<int16_t> output_sse2[vlen] __attribute__ ((aligned (16)));
+
+  for(int i = 0; i < vlen; ++i) {   
+    input0[i] = ((float) (rand() - (RAND_MAX/2))) / static_cast<float>((RAND_MAX/2)); 
+    input1[i] = ((float) (rand() - (RAND_MAX/2))) / static_cast<float>((RAND_MAX/2)); 
+  }
+  printf("32f_interleave_16sc_aligned\n");
+
+  start = clock();
+  for(int count = 0; count < ITERS; ++count) {
+    volk_32f_interleave_16sc_aligned16_manual(output_generic, input0, input1, 32768.0, vlen, "generic");
+  }
+  end = clock();
+  total = (double)(end-start)/(double)CLOCKS_PER_SEC;
+  printf("generic_time: %f\n", total);
+
+  start = clock();
+  for(int count = 0; count < ITERS; ++count) {
+    volk_32f_interleave_16sc_aligned16_manual(output_sse, input0, input1, 32768.0, vlen, "sse");
+  }
+  end = clock();
+  total = (double)(end-start)/(double)CLOCKS_PER_SEC;
+  printf("sse_time: %f\n", total);
+
+  start = clock();
+  for(int count = 0; count < ITERS; ++count) {
+    volk_32f_interleave_16sc_aligned16_manual(output_sse2, input0, input1, 32768.0, vlen, "sse2");
+  }
+  end = clock();
+  total = (double)(end-start)/(double)CLOCKS_PER_SEC;
+  printf("sse2_time: %f\n", total);
+
+  for(int i = 0; i < 1; ++i) {
+    //printf("inputs: %d, %d\n", input0[i*2], input0[i*2 + 1]);
+    //printf("generic... %d, ssse3... %d\n", output0[i], output1[i]);
+  }
+  
+  for(int i = 0; i < vlen; ++i) {
+    //printf("%d...%d\n", output0[i], output01[i]);
+    CPPUNIT_ASSERT_DOUBLES_EQUAL(std::real(output_generic[i]), std::real(output_sse[i]), 1.01);
+    CPPUNIT_ASSERT_DOUBLES_EQUAL(std::imag(output_generic[i]), std::imag(output_sse[i]), 1.01);
+
+    CPPUNIT_ASSERT_DOUBLES_EQUAL(std::real(output_generic[i]), std::real(output_sse2[i]), 1.01);
+    CPPUNIT_ASSERT_DOUBLES_EQUAL(std::imag(output_generic[i]), std::imag(output_sse2[i]), 1.01);
+  }
+}
+
+#endif
diff --git a/volk/lib/qa_32f_interleave_16sc_aligned16.h b/volk/lib/qa_32f_interleave_16sc_aligned16.h
new file mode 100644
index 000000000..8d2914817
--- /dev/null
+++ b/volk/lib/qa_32f_interleave_16sc_aligned16.h
@@ -0,0 +1,18 @@
+#ifndef INCLUDED_QA_32F_INTERLEAVE_16SC_ALIGNED16_H
+#define INCLUDED_QA_32F_INTERLEAVE_16SC_ALIGNED16_H
+
+#include <cppunit/extensions/HelperMacros.h>
+#include <cppunit/TestCase.h>
+
+class qa_32f_interleave_16sc_aligned16 : public CppUnit::TestCase {
+
+  CPPUNIT_TEST_SUITE (qa_32f_interleave_16sc_aligned16);
+  CPPUNIT_TEST (t1);
+  CPPUNIT_TEST_SUITE_END ();
+
+ private:
+  void t1 ();
+};
+
+
+#endif /* INCLUDED_QA_32F_INTERLEAVE_16SC_ALIGNED16_H */
diff --git a/volk/lib/qa_32f_interleave_32fc_aligned16.cc b/volk/lib/qa_32f_interleave_32fc_aligned16.cc
new file mode 100644
index 000000000..c22dd1046
--- /dev/null
+++ b/volk/lib/qa_32f_interleave_32fc_aligned16.cc
@@ -0,0 +1,62 @@
+#include <volk/volk.h>
+#include <qa_32f_interleave_32fc_aligned16.h>
+#include <volk/volk_32f_interleave_32fc_aligned16.h>
+#include <cstdlib>
+
+//test for sse
+
+#ifndef LV_HAVE_SSE
+
+void qa_32f_interleave_32fc_aligned16::t1() {
+  printf("sse not available... no test performed\n");
+}
+
+#else
+
+void qa_32f_interleave_32fc_aligned16::t1() {
+  
+  volk_environment_init();
+  clock_t start, end;
+  double total;
+  const int vlen = 3201;
+  const int ITERS = 100000;
+  float input0[vlen] __attribute__ ((aligned (16)));
+  float input1[vlen] __attribute__ ((aligned (16)));
+  
+  std::complex<float> output_generic[vlen] __attribute__ ((aligned (16)));
+  std::complex<float> output_sse[vlen] __attribute__ ((aligned (16)));
+
+  for(int i = 0; i < vlen; ++i) {   
+    input0[i] = ((float) (rand() - (RAND_MAX/2))) / static_cast<float>((RAND_MAX/2)); 
+    input1[i] = ((float) (rand() - (RAND_MAX/2))) / static_cast<float>((RAND_MAX/2)); 
+  }
+  printf("32f_interleave_32fc_aligned\n");
+
+  start = clock();
+  for(int count = 0; count < ITERS; ++count) {
+    volk_32f_interleave_32fc_aligned16_manual(output_generic, input0, input1, vlen, "generic");
+  }
+  end = clock();
+  total = (double)(end-start)/(double)CLOCKS_PER_SEC;
+  printf("generic_time: %f\n", total);
+  start = clock();
+  for(int count = 0; count < ITERS; ++count) {
+    volk_32f_interleave_32fc_aligned16_manual(output_sse, input0, input1, vlen, "sse");
+  }
+  end = clock();
+  total = (double)(end-start)/(double)CLOCKS_PER_SEC;
+  printf("sse_time: %f\n", total);
+
+  for(int i = 0; i < 1; ++i) {
+    //printf("inputs: %d, %d\n", input0[i*2], input0[i*2 + 1]);
+    //printf("generic... %d, ssse3... %d\n", output0[i], output1[i]);
+  }
+  
+  for(int i = 0; i < vlen; ++i) {
+    //printf("%d...%d\n", output0[i], output01[i]);
+    CPPUNIT_ASSERT_DOUBLES_EQUAL(std::real(output_generic[i]), std::real(output_sse[i]), fabs(std::real(output_generic[i]))*1e-4);
+    CPPUNIT_ASSERT_DOUBLES_EQUAL(std::imag(output_generic[i]), std::imag(output_sse[i]), fabs(std::imag(output_generic[i]))*1e-4);
+  }
+}
+
+#endif
diff --git a/volk/lib/qa_32f_interleave_32fc_aligned16.h b/volk/lib/qa_32f_interleave_32fc_aligned16.h
new file mode 100644
index 000000000..cba518d37
--- /dev/null
+++ b/volk/lib/qa_32f_interleave_32fc_aligned16.h
@@ -0,0 +1,18 @@
+#ifndef INCLUDED_QA_32F_INTERLEAVE_32FC_ALIGNED16_H
+#define INCLUDED_QA_32F_INTERLEAVE_32FC_ALIGNED16_H
+
+#include <cppunit/extensions/HelperMacros.h>
+#include <cppunit/TestCase.h>
+
+class qa_32f_interleave_32fc_aligned16 : public CppUnit::TestCase {
+
+  CPPUNIT_TEST_SUITE (qa_32f_interleave_32fc_aligned16);
+  CPPUNIT_TEST (t1);
+  CPPUNIT_TEST_SUITE_END ();
+
+ private:
+  void t1 ();
+};
+
+
+#endif /* INCLUDED_QA_32F_INTERLEAVE_32FC_ALIGNED16_H */
diff --git a/volk/lib/qa_32f_max_aligned16.cc b/volk/lib/qa_32f_max_aligned16.cc
new file mode 100644
index 000000000..3ef375176
--- /dev/null
+++ b/volk/lib/qa_32f_max_aligned16.cc
@@ -0,0 +1,60 @@
+#include <volk/volk.h>
+#include <qa_32f_max_aligned16.h>
+#include <volk/volk_32f_max_aligned16.h>
+#include <cstdlib>
+
+//test for sse
+
+#ifndef LV_HAVE_SSE
+
+void qa_32f_max_aligned16::t1() {
+  printf("sse not available... no test performed\n");
+}
+
+#else
+
+void qa_32f_max_aligned16::t1() {
+  
+  volk_environment_init();
+  clock_t start, end;
+  double total;
+  const int vlen = 3201;
+  const int ITERS = 100000;
+  float input0[vlen] __attribute__ ((aligned (16)));
+  float input1[vlen] __attribute__ ((aligned (16)));
+  
+  float output0[vlen] __attribute__ ((aligned (16)));
+  float output01[vlen] __attribute__ ((aligned (16)));
+
+  for(int i = 0; i < vlen; ++i) {   
+    input0[i] = ((float) (rand() - (RAND_MAX/2))) / static_cast<float>((RAND_MAX/2));
+    input1[i] = ((float) (rand() - (RAND_MAX/2))) / static_cast<float>((RAND_MAX/2));
+  }
+  printf("32f_max_aligned\n");
+
+  start = clock();
+  for(int count = 0; count < ITERS; ++count) {
+    volk_32f_max_aligned16_manual(output0, input0, input1, vlen, "generic");
+  }
+  end = clock();
+  total = (double)(end-start)/(double)CLOCKS_PER_SEC;
+  printf("generic_time: %f\n", total);
+  start = clock();
+  for(int count = 0; count < ITERS; ++count) {
+    volk_32f_max_aligned16_manual(output01, input0, input1, vlen, "sse");
+  }
+  end = clock();
+  total = (double)(end-start)/(double)CLOCKS_PER_SEC;
+  printf("sse_time: %f\n", total);
+  for(int i = 0; i < 1; ++i) {
+    //printf("inputs: %d, %d\n", input0[i*2], input0[i*2 + 1]);
+    //printf("generic... %d, ssse3... %d\n", output0[i], output1[i]);
+  }
+  
+  for(int i = 0; i < vlen; ++i) {
+    //printf("%d...%d\n", output0[i], output01[i]);
+    CPPUNIT_ASSERT_EQUAL(output0[i], output01[i]);
+  }
+}
+
+#endif
diff --git a/volk/lib/qa_32f_max_aligned16.h b/volk/lib/qa_32f_max_aligned16.h
new file mode 100644
index 000000000..d535479f4
--- /dev/null
+++ b/volk/lib/qa_32f_max_aligned16.h
@@ -0,0 +1,18 @@
+#ifndef INCLUDED_QA_32F_MAX_ALIGNED16_H
+#define INCLUDED_QA_32F_MAX_ALIGNED16_H
+
+#include <cppunit/extensions/HelperMacros.h>
+#include <cppunit/TestCase.h>
+
+class qa_32f_max_aligned16 : public CppUnit::TestCase {
+
+  CPPUNIT_TEST_SUITE (qa_32f_max_aligned16);
+  CPPUNIT_TEST (t1);
+  CPPUNIT_TEST_SUITE_END ();
+
+ private:
+  void t1 ();
+};
+
+
+#endif /* INCLUDED_QA_32F_MAX_ALIGNED16_H */
diff --git a/volk/lib/qa_32f_min_aligned16.cc b/volk/lib/qa_32f_min_aligned16.cc
new file mode 100644
index 000000000..617e18b24
--- /dev/null
+++ b/volk/lib/qa_32f_min_aligned16.cc
@@ -0,0 +1,60 @@
+#include <volk/volk.h>
+#include <qa_32f_min_aligned16.h>
+#include <volk/volk_32f_min_aligned16.h>
+#include <cstdlib>
+
+//test for sse
+
+#ifndef LV_HAVE_SSE
+
+void qa_32f_min_aligned16::t1() {
+  printf("sse not available... no test performed\n");
+}
+
+#else
+
+void qa_32f_min_aligned16::t1() {
+  
+  volk_environment_init();
+  clock_t start, end;
+  double total;
+  const int vlen = 3201;
+  const int ITERS = 100000;
+  float input0[vlen] __attribute__ ((aligned (16)));
+  float input1[vlen] __attribute__ ((aligned (16)));
+  
+  float output0[vlen] __attribute__ ((aligned (16)));
+  float output01[vlen] __attribute__ ((aligned (16)));
+
+  for(int i = 0; i < vlen; ++i) {   
+    input0[i] = ((float) (rand() - (RAND_MAX/2))) / static_cast<float>((RAND_MAX/2));
+    input1[i] = ((float) (rand() - (RAND_MAX/2))) / static_cast<float>((RAND_MAX/2));
+  }
+  printf("32f_min_aligned\n");
+
+  start = clock();
+  for(int count = 0; count < ITERS; ++count) {
+    volk_32f_min_aligned16_manual(output0, input0, input1, vlen, "generic");
+  }
+  end = clock();
+  total = (double)(end-start)/(double)CLOCKS_PER_SEC;
+  printf("generic_time: %f\n", total);
+  start = clock();
+  for(int count = 0; count < ITERS; ++count) {
+    volk_32f_min_aligned16_manual(output01, input0, input1, vlen, "sse");
+  }
+  end = clock();
+  total = (double)(end-start)/(double)CLOCKS_PER_SEC;
+  printf("sse_time: %f\n", total);
+  for(int i = 0; i < 1; ++i) {
+    //printf("inputs: %d, %d\n", input0[i*2], input0[i*2 + 1]);
+    //printf("generic... %d, ssse3... %d\n", output0[i], output1[i]);
+  }
+  
+  for(int i = 0; i < vlen; ++i) {
+    //printf("%d...%d\n", output0[i], output01[i]);
+    CPPUNIT_ASSERT_EQUAL(output0[i], output01[i]);
+  }
+}
+
+#endif
diff --git a/volk/lib/qa_32f_min_aligned16.h b/volk/lib/qa_32f_min_aligned16.h
new file mode 100644
index 000000000..90961ac92
--- /dev/null
+++ b/volk/lib/qa_32f_min_aligned16.h
@@ -0,0 +1,18 @@
+#ifndef INCLUDED_QA_32F_MIN_ALIGNED16_H
+#define INCLUDED_QA_32F_MIN_ALIGNED16_H
+
+#include <cppunit/extensions/HelperMacros.h>
+#include <cppunit/TestCase.h>
+
+class qa_32f_min_aligned16 : public CppUnit::TestCase {
+
+  CPPUNIT_TEST_SUITE (qa_32f_min_aligned16);
+  CPPUNIT_TEST (t1);
+  CPPUNIT_TEST_SUITE_END ();
+
+ private:
+  void t1 ();
+};
+
+
+#endif /* INCLUDED_QA_32F_MIN_ALIGNED16_H */
diff --git a/volk/lib/qa_32f_multiply_aligned16.cc b/volk/lib/qa_32f_multiply_aligned16.cc
new file mode 100644
index 000000000..c77fe97da
--- /dev/null
+++ b/volk/lib/qa_32f_multiply_aligned16.cc
@@ -0,0 +1,60 @@
+#include <volk/volk.h>
+#include <qa_32f_multiply_aligned16.h>
+#include <volk/volk_32f_multiply_aligned16.h>
+#include <cstdlib>
+
+//test for sse
+
+#ifndef LV_HAVE_SSE
+
+void qa_32f_multiply_aligned16::t1() {
+  printf("sse not available... no test performed\n");
+}
+
+#else
+
+void qa_32f_multiply_aligned16::t1() {
+  
+  volk_environment_init();
+  clock_t start, end;
+  double total;
+  const int vlen = 3201;
+  const int ITERS = 100000;
+  float input0[vlen] __attribute__ ((aligned (16)));
+  float input1[vlen] __attribute__ ((aligned (16)));
+  
+  float output0[vlen] __attribute__ ((aligned (16)));
+  float output01[vlen] __attribute__ ((aligned (16)));
+
+  for(int i = 0; i < vlen; ++i) {   
+    input0[i] = ((float) (rand() - (RAND_MAX/2))) / static_cast<float>((RAND_MAX/2));
+    input1[i] = ((float) (rand() - (RAND_MAX/2))) / static_cast<float>((RAND_MAX/2));
+  }
+  printf("32f_multiply_aligned\n");
+
+  start = clock();
+  for(int count = 0; count < ITERS; ++count) {
+    volk_32f_multiply_aligned16_manual(output0, input0, input1, vlen, "generic");
+  }
+  end = clock();
+  total = (double)(end-start)/(double)CLOCKS_PER_SEC;
+  printf("generic_time: %f\n", total);
+  start = clock();
+  for(int count = 0; count < ITERS; ++count) {
+    volk_32f_multiply_aligned16_manual(output01, input0, input1, vlen, "sse");
+  }
+  end = clock();
+  total = (double)(end-start)/(double)CLOCKS_PER_SEC;
+  printf("sse_time: %f\n", total);
+  for(int i = 0; i < 1; ++i) {
+    //printf("inputs: %d, %d\n", input0[i*2], input0[i*2 + 1]);
+    //printf("generic... %d, ssse3... %d\n", output0[i], output1[i]);
+  }
+  
+  for(int i = 0; i < vlen; ++i) {
+    //printf("%d...%d\n", output0[i], output01[i]);
+    CPPUNIT_ASSERT_EQUAL(output0[i], output01[i]);
+  }
+}
+
+#endif
diff --git a/volk/lib/qa_32f_multiply_aligned16.h b/volk/lib/qa_32f_multiply_aligned16.h
new file mode 100644
index 000000000..7032a2ad4
--- /dev/null
+++ b/volk/lib/qa_32f_multiply_aligned16.h
@@ -0,0 +1,18 @@
+#ifndef INCLUDED_QA_32F_MULTIPLY_ALIGNED16_H
+#define INCLUDED_QA_32F_MULTIPLY_ALIGNED16_H
+
+#include <cppunit/extensions/HelperMacros.h>
+#include <cppunit/TestCase.h>
+
+class qa_32f_multiply_aligned16 : public CppUnit::TestCase {
+
+  CPPUNIT_TEST_SUITE (qa_32f_multiply_aligned16);
+  CPPUNIT_TEST (t1);
+  CPPUNIT_TEST_SUITE_END ();
+
+ private:
+  void t1 ();
+};
+
+
+#endif /* INCLUDED_QA_32F_MULTIPLY_ALIGNED16_H */
diff --git a/volk/lib/qa_32f_normalize_aligned16.cc b/volk/lib/qa_32f_normalize_aligned16.cc
new file mode 100644
index 000000000..2954fc3ae
--- /dev/null
+++ b/volk/lib/qa_32f_normalize_aligned16.cc
@@ -0,0 +1,65 @@
+#include <volk/volk.h>
+#include <qa_32f_normalize_aligned16.h>
+#include <volk/volk_32f_normalize_aligned16.h>
+#include <cstdlib>
+#include <cstring>
+
+//test for sse
+
+#ifndef LV_HAVE_SSE
+
+void qa_32f_normalize_aligned16::t1() {
+  printf("sse not available... no test performed\n");
+}
+
+#else
+
+void qa_32f_normalize_aligned16::t1() {
+  
+  volk_environment_init();
+  int ret;
+  clock_t start, end;
+  double total;
+  const int vlen = 320001;
+  const int ITERS = 100;
+
+  float* output0;
+  float* output01;
+  ret = posix_memalign((void**)&output0, 16, vlen*sizeof(float));
+  ret = posix_memalign((void**)&output01, 16, vlen*sizeof(float));
+
+  for(int i = 0; i < vlen; ++i) {   
+    output0[i] = ((float) (rand() - (RAND_MAX/2))) / static_cast<float>((RAND_MAX/2));
+  }
+  memcpy(output01, output0, vlen*sizeof(float));
+  printf("32f_normalize_aligned\n");
+
+  start = clock();
+  for(int count = 0; count < ITERS; ++count) {
+    volk_32f_normalize_aligned16_manual(output0, 1.15, vlen, "generic");
+  }
+  end = clock();
+  total = (double)(end-start)/(double)CLOCKS_PER_SEC;
+  printf("generic_time: %f\n", total);
+  start = clock();
+  for(int count = 0; count < ITERS; ++count) {
+    volk_32f_normalize_aligned16_manual(output01, 1.15, vlen, "sse");
+  }
+  end = clock();
+  total = (double)(end-start)/(double)CLOCKS_PER_SEC;
+  printf("sse_time: %f\n", total);
+  for(int i = 0; i < 1; ++i) {
+    //printf("inputs: %d, %d\n", input0[i*2], input0[i*2 + 1]);
+    //printf("generic... %d, ssse3... %d\n", output0[i], output1[i]);
+  }
+  
+  for(int i = 0; i < vlen; ++i) {
+    // printf("%e...%e\n", output0[i], output01[i]);
+    CPPUNIT_ASSERT_DOUBLES_EQUAL(output0[i], output01[i], fabs(output0[i])*1e-4);
+  }
+
+  free(output0);
+  free(output01);
+}
+
+#endif
diff --git a/volk/lib/qa_32f_normalize_aligned16.h b/volk/lib/qa_32f_normalize_aligned16.h
new file mode 100644
index 000000000..7c421eb82
--- /dev/null
+++ b/volk/lib/qa_32f_normalize_aligned16.h
@@ -0,0 +1,18 @@
+#ifndef INCLUDED_QA_32F_NORMALIZE_ALIGNED16_H
+#define INCLUDED_QA_32F_NORMALIZE_ALIGNED16_H
+
+#include <cppunit/extensions/HelperMacros.h>
+#include <cppunit/TestCase.h>
+
+class qa_32f_normalize_aligned16 : public CppUnit::TestCase {
+
+  CPPUNIT_TEST_SUITE (qa_32f_normalize_aligned16);
+  CPPUNIT_TEST (t1);
+  CPPUNIT_TEST_SUITE_END ();
+
+ private:
+  void t1 ();
+};
+
+
+#endif /* INCLUDED_QA_32F_NORMALIZE_ALIGNED16_H */
diff --git a/volk/lib/qa_32f_power_aligned16.cc b/volk/lib/qa_32f_power_aligned16.cc
new file mode 100644
index 000000000..1b331daeb
--- /dev/null
+++ b/volk/lib/qa_32f_power_aligned16.cc
@@ -0,0 +1,95 @@
+#include <volk/volk_runtime.h>
+#include <volk/volk.h>
+#include <qa_32f_power_aligned16.h>
+#include <stdlib.h>
+#include <math.h>
+#include <time.h>
+
+#define	ERR_DELTA	(1e-4)
+
+//test for sse
+static float uniform() {
+  return 2.0 * ((float) rand() / RAND_MAX - 0.5);	// uniformly (-1, 1)
+}
+
+static void
+random_floats (float *buf, unsigned n)
+{
+  for (unsigned i = 0; i < n; i++)
+    buf[i] = uniform ();
+}
+
+#ifdef LV_HAVE_SSE
+void qa_32f_power_aligned16::t1() {
+
+  
+  volk_runtime_init();
+
+  const int vlen = 2046;
+  const int ITERS = 10000;
+
+  volk_environment_init();
+  int ret;
+  clock_t start, end;
+  double total;
+  float* input;
+  int i;
+  
+  float* result_generic;
+  float* result_sse;
+  float* result_sse4_1;
+
+  ret = posix_memalign((void**)&input, 16, vlen *  sizeof(float));
+  ret = posix_memalign((void**)&result_generic, 16, vlen * sizeof(float));
+  ret = posix_memalign((void**)&result_sse, 16, vlen * sizeof(float));
+  ret = posix_memalign((void**)&result_sse4_1, 16, vlen * sizeof(float));
+
+  random_floats((float*)input, vlen);
+
+  const float power = 3;
+  
+  printf("32f_power_aligned16\n");
+
+  start = clock();
+  for(int count = 0; count < ITERS; ++count) {
+    volk_32f_power_aligned16_manual(result_generic, input, power, vlen,  "generic");
+  }
+  end = clock();
+  total = (double)(end-start)/(double)CLOCKS_PER_SEC;
+  printf("generic_time: %f\n", total);
+
+  start = clock();
+  for(int count = 0; count < ITERS; ++count) {
+    volk_32f_power_aligned16_manual(result_sse, input, power, vlen,  "sse");
+  }
+  end = clock();
+  total = (double)(end-start)/(double)CLOCKS_PER_SEC;
+  printf("sse_time: %f\n", total);
+
+  start = clock();
+  for(int count = 0; count < ITERS; ++count) {
+    get_volk_runtime()->volk_32f_power_aligned16(result_sse4_1, input, power, vlen);
+  }
+  end = clock();
+  total = (double)(end-start)/(double)CLOCKS_PER_SEC;
+  printf("sse4.1_time: %f\n", total);
+
+
+  for(i = 0; i < vlen; i++){
+    //printf("%d %e -> %e %e %e\n", i, input[i], result_generic[i], result_sse[i], result_sse4_1[i]);
+    CPPUNIT_ASSERT_DOUBLES_EQUAL(result_generic[i], result_sse[i], fabs(result_generic[i])* ERR_DELTA);
+    CPPUNIT_ASSERT_DOUBLES_EQUAL(result_generic[i], result_sse4_1[i], fabs(result_generic[i])* ERR_DELTA);
+  }
+
+  free(input);
+  free(result_generic);
+  free(result_sse);
+  
+}
+#else
+void qa_32f_power_aligned16::t1() {
+  printf("sse not available... no test performed\n");
+}
+
+#endif /* LV_HAVE_SSE */
+
diff --git a/volk/lib/qa_32f_power_aligned16.h b/volk/lib/qa_32f_power_aligned16.h
new file mode 100644
index 000000000..d45df4e56
--- /dev/null
+++ b/volk/lib/qa_32f_power_aligned16.h
@@ -0,0 +1,18 @@
+#ifndef INCLUDED_QA_32F_POWER_ALIGNED16_H
+#define INCLUDED_QA_32F_POWER_ALIGNED16_H
+
+#include <cppunit/extensions/HelperMacros.h>
+#include <cppunit/TestCase.h>
+
+class qa_32f_power_aligned16 : public CppUnit::TestCase {
+
+  CPPUNIT_TEST_SUITE (qa_32f_power_aligned16);
+  CPPUNIT_TEST (t1);
+  CPPUNIT_TEST_SUITE_END ();
+
+ private:
+  void t1 ();
+};
+
+
+#endif /* INCLUDED_QA_32F_POWER_ALIGNED16_H */
diff --git a/volk/lib/qa_32f_sqrt_aligned16.cc b/volk/lib/qa_32f_sqrt_aligned16.cc
new file mode 100644
index 000000000..a3e6abc18
--- /dev/null
+++ b/volk/lib/qa_32f_sqrt_aligned16.cc
@@ -0,0 +1,59 @@
+#include <volk/volk.h>
+#include <qa_32f_sqrt_aligned16.h>
+#include <volk/volk_32f_sqrt_aligned16.h>
+#include <cstdlib>
+
+//test for sse
+
+#ifndef LV_HAVE_SSE
+
+void qa_32f_sqrt_aligned16::t1() {
+  printf("sse not available... no test performed\n");
+}
+
+#else
+
+void qa_32f_sqrt_aligned16::t1() {
+  
+  volk_environment_init();
+  clock_t start, end;
+  double total;
+  const int vlen = 3201;
+  const int ITERS = 100000;
+  float input0[vlen] __attribute__ ((aligned (16)));
+  
+  float output0[vlen] __attribute__ ((aligned (16)));
+  float output01[vlen] __attribute__ ((aligned (16)));
+
+  // No reason to test negative numbers because they result in NaN.
+  for(int i = 0; i < vlen; ++i) {   
+    input0[i] = ((float) (rand()) / static_cast<float>(RAND_MAX));
+  }
+  printf("32f_sqrt_aligned\n");
+
+  start = clock();
+  for(int count = 0; count < ITERS; ++count) {
+    volk_32f_sqrt_aligned16_manual(output0, input0, vlen, "generic");
+  }
+  end = clock();
+  total = (double)(end-start)/(double)CLOCKS_PER_SEC;
+  printf("generic_time: %f\n", total);
+  start = clock();
+  for(int count = 0; count < ITERS; ++count) {
+    volk_32f_sqrt_aligned16_manual(output01, input0, vlen, "sse");
+  }
+  end = clock();
+  total = (double)(end-start)/(double)CLOCKS_PER_SEC;
+  printf("sse_time: %f\n", total);
+  for(int i = 0; i < 1; ++i) {
+    //printf("inputs: %d, %d\n", input0[i*2], input0[i*2 + 1]);
+    //printf("generic... %d, ssse3... %d\n", output0[i], output1[i]);
+  }
+  
+  for(int i = 0; i < vlen; ++i) {
+    //printf("%d...%d\n", output0[i], output01[i]);
+    CPPUNIT_ASSERT_DOUBLES_EQUAL(output0[i], output01[i], fabs(output0[i])*1e-4);
+  }
+}
+
+#endif
diff --git a/volk/lib/qa_32f_sqrt_aligned16.h b/volk/lib/qa_32f_sqrt_aligned16.h
new file mode 100644
index 000000000..e4b99d981
--- /dev/null
+++ b/volk/lib/qa_32f_sqrt_aligned16.h
@@ -0,0 +1,18 @@
+#ifndef INCLUDED_QA_32F_SQRT_ALIGNED16_H
+#define INCLUDED_QA_32F_SQRT_ALIGNED16_H
+
+#include <cppunit/extensions/HelperMacros.h>
+#include <cppunit/TestCase.h>
+
+class qa_32f_sqrt_aligned16 : public CppUnit::TestCase {
+
+  CPPUNIT_TEST_SUITE (qa_32f_sqrt_aligned16);
+  CPPUNIT_TEST (t1);
+  CPPUNIT_TEST_SUITE_END ();
+
+ private:
+  void t1 ();
+};
+
+
+#endif /* INCLUDED_QA_32F_SQRT_ALIGNED16_H */
diff --git a/volk/lib/qa_32f_stddev_aligned16.cc b/volk/lib/qa_32f_stddev_aligned16.cc
new file mode 100644
index 000000000..c0f22cdea
--- /dev/null
+++ b/volk/lib/qa_32f_stddev_aligned16.cc
@@ -0,0 +1,74 @@
+#include <volk/volk_runtime.h>
+#include <volk/volk.h>
+#include <qa_32f_stddev_aligned16.h>
+#include <volk/volk_32f_stddev_aligned16.h>
+#include <cstdlib>
+
+//test for sse
+
+#ifndef LV_HAVE_SSE
+
+void qa_32f_stddev_aligned16::t1() {
+  printf("sse not available... no test performed\n");
+}
+
+#else
+
+void qa_32f_stddev_aligned16::t1() {
+  volk_runtime_init();  
+
+  volk_environment_init();
+  clock_t start, end;
+  double total;
+  const int vlen = 3201;
+  const int ITERS = 100000;
+  float input0[vlen] __attribute__ ((aligned (16)));
+
+  float stddev_generic;
+  float stddev_sse;
+  float stddev_sse4_1;
+  float mean = 0;
+
+  for(int i = 0; i < vlen; ++i) {   
+    input0[i] = ((float) (rand() - (RAND_MAX/2))) / static_cast<float>((RAND_MAX/2));
+    mean += input0[i];
+  }
+  mean /= static_cast<float>(vlen);
+
+  printf("32f_stddev_aligned\n");
+
+  start = clock();
+  for(int count = 0; count < ITERS; ++count) {
+    volk_32f_stddev_aligned16_manual(&stddev_generic, input0, mean, vlen, "generic");
+  }
+  end = clock();
+  total = (double)(end-start)/(double)CLOCKS_PER_SEC;
+  printf("generic_time: %f\n", total);
+  start = clock();
+  for(int count = 0; count < ITERS; ++count) {
+    volk_32f_stddev_aligned16_manual(&stddev_sse, input0, mean, vlen, "sse");
+  }
+  end = clock();
+  total = (double)(end-start)/(double)CLOCKS_PER_SEC;
+  printf("sse_time: %f\n", total);
+
+  start = clock();
+  for(int count = 0; count < ITERS; ++count) {
+    get_volk_runtime()->volk_32f_stddev_aligned16(&stddev_sse4_1, input0, mean, vlen);
+  }
+  end = clock();
+  total = (double)(end-start)/(double)CLOCKS_PER_SEC;
+  printf("sse4_1_time: %f\n", total);
+
+  for(int i = 0; i < 1; ++i) {
+    //printf("inputs: %d, %d\n", input0[i*2], input0[i*2 + 1]);
+    //printf("generic... %d, ssse3... %d\n", output0[i], output1[i]);
+  }
+  
+  //printf("%d...%d\n", output0[i], output01[i]);
+  CPPUNIT_ASSERT_DOUBLES_EQUAL(stddev_generic, stddev_sse, fabs(stddev_generic)*1e-4);
+  CPPUNIT_ASSERT_DOUBLES_EQUAL(stddev_generic, stddev_sse4_1, fabs(stddev_generic)*1e-4);
+
+}
+
+#endif
diff --git a/volk/lib/qa_32f_stddev_aligned16.h b/volk/lib/qa_32f_stddev_aligned16.h
new file mode 100644
index 000000000..7f8d7a5fc
--- /dev/null
+++ b/volk/lib/qa_32f_stddev_aligned16.h
@@ -0,0 +1,18 @@
+#ifndef INCLUDED_QA_32F_STDDEV_ALIGNED16_H
+#define INCLUDED_QA_32F_STDDEV_ALIGNED16_H
+
+#include <cppunit/extensions/HelperMacros.h>
+#include <cppunit/TestCase.h>
+
+class qa_32f_stddev_aligned16 : public CppUnit::TestCase {
+
+  CPPUNIT_TEST_SUITE (qa_32f_stddev_aligned16);
+  CPPUNIT_TEST (t1);
+  CPPUNIT_TEST_SUITE_END ();
+
+ private:
+  void t1 ();
+};
+
+
+#endif /* INCLUDED_QA_32F_STDDEV_ALIGNED16_H */
diff --git a/volk/lib/qa_32f_stddev_and_mean_aligned16.cc b/volk/lib/qa_32f_stddev_and_mean_aligned16.cc
new file mode 100644
index 000000000..dcad8bcf3
--- /dev/null
+++ b/volk/lib/qa_32f_stddev_and_mean_aligned16.cc
@@ -0,0 +1,75 @@
+#include <volk/volk_runtime.h>
+#include <volk/volk.h>
+#include <qa_32f_stddev_and_mean_aligned16.h>
+#include <volk/volk_32f_stddev_and_mean_aligned16.h>
+#include <cstdlib>
+
+//test for sse
+
+#ifndef LV_HAVE_SSE
+
+void qa_32f_stddev_and_mean_aligned16::t1() {
+  printf("sse not available... no test performed\n");
+}
+
+#else
+
+void qa_32f_stddev_and_mean_aligned16::t1() {
+  volk_runtime_init();  
+
+  volk_environment_init();
+  clock_t start, end;
+  double total;
+  const int vlen = 3201;
+  const int ITERS = 100000;
+  float input0[vlen] __attribute__ ((aligned (16)));
+  
+  float stddev_generic;
+  float stddev_sse;
+  float stddev_sse4_1;
+  float mean_generic;
+  float mean_sse;
+  float mean_sse4_1;
+
+  for(int i = 0; i < vlen; ++i) {   
+    input0[i] = ((float) (rand() - (RAND_MAX/2))) / static_cast<float>((RAND_MAX/2));
+  }
+  printf("32f_stddev_and_mean_aligned\n");
+
+  start = clock();
+  for(int count = 0; count < ITERS; ++count) {
+    volk_32f_stddev_and_mean_aligned16_manual(&stddev_generic, &mean_generic, input0,vlen, "generic");
+  }
+  end = clock();
+  total = (double)(end-start)/(double)CLOCKS_PER_SEC;
+  printf("generic_time: %f\n", total);
+  start = clock();
+  for(int count = 0; count < ITERS; ++count) {
+    volk_32f_stddev_and_mean_aligned16_manual(&stddev_sse, &mean_sse, input0,vlen, "sse");
+  }
+  end = clock();
+  total = (double)(end-start)/(double)CLOCKS_PER_SEC;
+  printf("sse_time: %f\n", total);
+
+  start = clock();
+  for(int count = 0; count < ITERS; ++count) {
+    get_volk_runtime()->volk_32f_stddev_and_mean_aligned16(&stddev_sse4_1, &mean_sse4_1, input0, vlen);
+  }
+  end = clock();
+  total = (double)(end-start)/(double)CLOCKS_PER_SEC;
+  printf("sse4_1_time: %f\n", total);
+
+  for(int i = 0; i < 1; ++i) {
+    //printf("inputs: %d, %d\n", input0[i*2], input0[i*2 + 1]);
+    //printf("generic... %d, ssse3... %d\n", output0[i], output1[i]);
+  }
+  
+  CPPUNIT_ASSERT_DOUBLES_EQUAL(stddev_generic, stddev_sse, fabs(stddev_generic)*1e-4);
+  CPPUNIT_ASSERT_DOUBLES_EQUAL(mean_generic, mean_sse, fabs(mean_generic)*1e-4);
+
+  CPPUNIT_ASSERT_DOUBLES_EQUAL(stddev_generic, stddev_sse4_1, fabs(stddev_generic)*1e-4);
+  CPPUNIT_ASSERT_DOUBLES_EQUAL(mean_generic, mean_sse4_1, fabs(mean_generic)*1e-4);
+
+}
+
+#endif
diff --git a/volk/lib/qa_32f_stddev_and_mean_aligned16.h b/volk/lib/qa_32f_stddev_and_mean_aligned16.h
new file mode 100644
index 000000000..e08bd249a
--- /dev/null
+++ b/volk/lib/qa_32f_stddev_and_mean_aligned16.h
@@ -0,0 +1,18 @@
+#ifndef INCLUDED_QA_32F_STDDEV_AND_MEAN_ALIGNED16_H
+#define INCLUDED_QA_32F_STDDEV_AND_MEAN_ALIGNED16_H
+
+#include <cppunit/extensions/HelperMacros.h>
+#include <cppunit/TestCase.h>
+
+class qa_32f_stddev_and_mean_aligned16 : public CppUnit::TestCase {
+
+  CPPUNIT_TEST_SUITE (qa_32f_stddev_and_mean_aligned16);
+  CPPUNIT_TEST (t1);
+  CPPUNIT_TEST_SUITE_END ();
+
+ private:
+  void t1 ();
+};
+
+
+#endif /* INCLUDED_QA_32F_STDDEV_AND_MEAN_ALIGNED16_H */
diff --git a/volk/lib/qa_32f_subtract_aligned16.cc b/volk/lib/qa_32f_subtract_aligned16.cc
new file mode 100644
index 000000000..a7e1b5ae3
--- /dev/null
+++ b/volk/lib/qa_32f_subtract_aligned16.cc
@@ -0,0 +1,60 @@
+#include <volk/volk.h>
+#include <qa_32f_subtract_aligned16.h>
+#include <volk/volk_32f_subtract_aligned16.h>
+#include <cstdlib>
+
+//test for sse
+
+#ifndef LV_HAVE_SSE
+
+void qa_32f_subtract_aligned16::t1() {
+  printf("sse not available... no test performed\n");
+}
+
+#else
+
+void qa_32f_subtract_aligned16::t1() {
+  
+  volk_environment_init();
+  clock_t start, end;
+  double total;
+  const int vlen = 3201;
+  const int ITERS = 100000;
+  float input0[vlen] __attribute__ ((aligned (16)));
+  float input1[vlen] __attribute__ ((aligned (16)));
+  
+  float output0[vlen] __attribute__ ((aligned (16)));
+  float output01[vlen] __attribute__ ((aligned (16)));
+
+  for(int i = 0; i < vlen; ++i) {   
+    input0[i] = ((float) (rand() - (RAND_MAX/2))) / static_cast<float>((RAND_MAX/2));
+    input1[i] = ((float) (rand() - (RAND_MAX/2))) / static_cast<float>((RAND_MAX/2));
+  }
+  printf("32f_subtract_aligned\n");
+
+  start = clock();
+  for(int count = 0; count < ITERS; ++count) {
+    volk_32f_subtract_aligned16_manual(output0, input0, input1, vlen, "generic");
+  }
+  end = clock();
+  total = (double)(end-start)/(double)CLOCKS_PER_SEC;
+  printf("generic_time: %f\n", total);
+  start = clock();
+  for(int count = 0; count < ITERS; ++count) {
+    volk_32f_subtract_aligned16_manual(output01, input0, input1, vlen, "sse");
+  }
+  end = clock();
+  total = (double)(end-start)/(double)CLOCKS_PER_SEC;
+  printf("sse_time: %f\n", total);
+  for(int i = 0; i < 1; ++i) {
+    //printf("inputs: %d, %d\n", input0[i*2], input0[i*2 + 1]);
+    //printf("generic... %d, ssse3... %d\n", output0[i], output1[i]);
+  }
+  
+  for(int i = 0; i < vlen; ++i) {
+    //printf("%d...%d\n", output0[i], output01[i]);
+    CPPUNIT_ASSERT_EQUAL(output0[i], output01[i]);
+  }
+}
+
+#endif
diff --git a/volk/lib/qa_32f_subtract_aligned16.h b/volk/lib/qa_32f_subtract_aligned16.h
new file mode 100644
index 000000000..97c14f129
--- /dev/null
+++ b/volk/lib/qa_32f_subtract_aligned16.h
@@ -0,0 +1,18 @@
+#ifndef INCLUDED_QA_32F_SUBTRACT_ALIGNED16_H
+#define INCLUDED_QA_32F_SUBTRACT_ALIGNED16_H
+
+#include <cppunit/extensions/HelperMacros.h>
+#include <cppunit/TestCase.h>
+
+class qa_32f_subtract_aligned16 : public CppUnit::TestCase {
+
+  CPPUNIT_TEST_SUITE (qa_32f_subtract_aligned16);
+  CPPUNIT_TEST (t1);
+  CPPUNIT_TEST_SUITE_END ();
+
+ private:
+  void t1 ();
+};
+
+
+#endif /* INCLUDED_QA_32F_SUBTRACT_ALIGNED16_H */
diff --git a/volk/lib/qa_32f_sum_of_poly_aligned16.cc b/volk/lib/qa_32f_sum_of_poly_aligned16.cc
new file mode 100644
index 000000000..494776357
--- /dev/null
+++ b/volk/lib/qa_32f_sum_of_poly_aligned16.cc
@@ -0,0 +1,142 @@
+#include <volk/volk.h>
+#include <qa_32f_sum_of_poly_aligned16.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <time.h>
+#include <math.h>
+
+#define SNR 30.0
+#define CENTER -4.0
+#define CUTOFF -5.595
+#define ERR_DELTA (1e-4)
+#define NUM_ITERS 100000
+#define VEC_LEN 64
+static float uniform() {
+  return ((float) rand() / RAND_MAX);	// uniformly (0, 1)
+}
+
+static void
+random_floats (float *buf, unsigned n)
+{
+  unsigned int i = 0;
+  for (; i < n; i++) {
+
+    buf[i] =  uniform () * -SNR/2.0;
+
+  }
+}
+
+
+#ifndef LV_HAVE_SSE3
+
+void qa_32f_sum_of_poly_aligned16::t1(){
+  printf("sse3 not available... no test performed\n");
+}
+
+#else
+
+
+void qa_32f_sum_of_poly_aligned16::t1(){
+  int i = 0;
+  
+  volk_environment_init();
+  int ret;
+
+  const int vlen = VEC_LEN;
+  float cutoff = CUTOFF;
+  
+  float* center_point_array;
+  float* target;
+  float* target_generic;
+  float* src0 ;
+
+
+  ret = posix_memalign((void**)&center_point_array, 16, 24);
+  ret = posix_memalign((void**)&target, 16, 4);
+  ret = posix_memalign((void**)&target_generic, 16, 4);
+  ret = posix_memalign((void**)&src0, 16, (vlen << 2));
+  
+ 
+  random_floats((float*)src0, vlen);
+ 
+  float a = (float)CENTER;
+  float etoa = expf(a);
+  center_point_array[0] = (//(5.0 * a * a * a * a)/120.0 +
+			   (-4.0 * a * a * a)/24.0 + 
+			   (3.0 * a * a)/6.0 +
+			   (-2.0 * a)/2.0 +
+			   (1.0)) * etoa;
+  center_point_array[1] = (//(-10.0 * a * a * a)/120.0 +
+			   (6.0 * a * a)/24.0 + 
+			   (-3.0 * a)/6.0 +
+			   (1.0/2.0)) * etoa;
+  center_point_array[2] = (//(10.0 * a * a)/120.0 +
+			   (-4.0 * a)/24.0 +
+			   (1.0/6.0)) * etoa;
+  center_point_array[3] = (//(-5.0 * a)/120.0 +
+			   (1.0/24.0)) * etoa;
+  //center_point_array[4] = ((1.0)/120.0) * etoa;
+  center_point_array[4] = (//(a * a * a * a * a)/120.0 +
+			   (a * a * a * a)/24.0 +
+			   (a * a * a)/-6.0 +
+			   (a * a)/2.0 +
+			   -a + 1.0) * etoa;
+  
+  printf("32f_sum_of_poly_aligned16\n");
+
+  clock_t start, end;
+  double total;
+  
+  float my_sum = 0.0;
+  start = clock();
+  for(int k = 0; k < NUM_ITERS; ++k) {
+    float sum = 0.0;
+    for(int l = 0; l < vlen; ++l) {
+      
+      sum += expf(src0[l]);
+      
+    }
+    my_sum = sum;
+  }
+  
+  
+  end = clock();  
+  total = (double)(end-start)/(double)CLOCKS_PER_SEC;
+  printf("exp time: %f\n", total);
+  
+  start = clock();
+  for(int k = 0; k < NUM_ITERS; ++k) {
+    
+    volk_32f_sum_of_poly_aligned16_manual(target_generic, src0, center_point_array, &cutoff, vlen << 2, "generic");
+  
+  }
+  
+  
+  end = clock();  
+  total = (double)(end-start)/(double)CLOCKS_PER_SEC;
+  printf("generic time: %f\n", total);
+  
+  start = clock();
+  for(int k = 0; k < NUM_ITERS; ++k) {
+    volk_32f_sum_of_poly_aligned16_manual(target, src0, center_point_array, &cutoff, vlen << 2, "sse3");
+  }
+  
+  end = clock();  
+  total = (double)(end-start)/(double)CLOCKS_PER_SEC;
+  printf("sse3 approx time: %f\n", total);
+
+
+  
+  printf("exp: %f, sse3: %f\n", my_sum, target[i]);
+  CPPUNIT_ASSERT_DOUBLES_EQUAL(target_generic[0], target[0], fabs(target_generic[0]) * ERR_DELTA);
+  
+
+  free(center_point_array);
+  free(target);
+  free(target_generic);
+  free(src0);
+
+  
+}
+
+#endif /*LV_HAVE_SSE3*/
diff --git a/volk/lib/qa_32f_sum_of_poly_aligned16.h b/volk/lib/qa_32f_sum_of_poly_aligned16.h
new file mode 100644
index 000000000..67a347f9a
--- /dev/null
+++ b/volk/lib/qa_32f_sum_of_poly_aligned16.h
@@ -0,0 +1,18 @@
+#ifndef INCLUDED_QA_32F_SUM_OF_POLY_ALIGNED16_H
+#define INCLUDED_QA_32F_SUM_OF_POLY_ALIGNED16_H
+
+#include <cppunit/extensions/HelperMacros.h>
+#include <cppunit/TestCase.h>
+
+class qa_32f_sum_of_poly_aligned16 : public CppUnit::TestCase {
+
+  CPPUNIT_TEST_SUITE (qa_32f_sum_of_poly_aligned16);
+  CPPUNIT_TEST (t1);
+  CPPUNIT_TEST_SUITE_END ();
+
+ private:
+  void t1 ();
+};
+
+
+#endif /* INCLUDED_QA_32F_SUM_OF_POLY_ALIGNED16_H */
diff --git a/volk/lib/qa_32fc_32f_multiply_aligned16.cc b/volk/lib/qa_32fc_32f_multiply_aligned16.cc
new file mode 100644
index 000000000..4eba0a3cd
--- /dev/null
+++ b/volk/lib/qa_32fc_32f_multiply_aligned16.cc
@@ -0,0 +1,85 @@
+#include <volk/volk_runtime.h>
+#include <volk/volk.h>
+#include <qa_32fc_32f_multiply_aligned16.h>
+#include <stdlib.h>
+#include <math.h>
+#include <time.h>
+
+#define assertcomplexEqual(expected, actual, delta)			\
+  CPPUNIT_ASSERT_DOUBLES_EQUAL (std::real(expected), std::real(actual), fabs(std::real(expected)) * delta); \
+  CPPUNIT_ASSERT_DOUBLES_EQUAL (std::imag(expected), std::imag(actual), fabs(std::imag(expected))* delta);	
+
+#define	ERR_DELTA	(1e-4)
+
+//test for sse
+static float uniform() {
+  return 2.0 * ((float) rand() / RAND_MAX - 0.5);	// uniformly (-1, 1)
+}
+
+static void
+random_floats (float *buf, unsigned n)
+{
+  for (unsigned i = 0; i < n; i++)
+    buf[i] = uniform ();
+}
+
+#ifdef LV_HAVE_SSE3
+void qa_32fc_32f_multiply_aligned16::t1() {
+
+  const int vlen = 2046;
+  const int ITERS = 100000;
+
+  volk_environment_init();
+  int ret;
+  clock_t start, end;
+  double total;
+  std::complex<float>* input;
+  float * taps;
+  int i;
+  
+  std::complex<float>* result_generic;
+  std::complex<float>* result_sse3;
+
+  ret = posix_memalign((void**)&input, 16, vlen * 2 * sizeof(float));
+  ret = posix_memalign((void**)&taps, 16, vlen * sizeof(float));
+  ret = posix_memalign((void**)&result_generic, 16, vlen * 2 * sizeof(float));
+  ret = posix_memalign((void**)&result_sse3, 16, vlen * 2 * sizeof(float));
+
+  random_floats((float*)input, vlen * 2);
+  random_floats(taps, vlen);
+  
+  printf("32fc_32f_multiply_aligned16\n");
+
+  start = clock();
+  for(int count = 0; count < ITERS; ++count) {
+    volk_32fc_32f_multiply_aligned16_manual(result_generic, input, taps, vlen,  "generic");
+  }
+  end = clock();
+  total = (double)(end-start)/(double)CLOCKS_PER_SEC;
+  printf("generic_time: %f\n", total);
+
+  start = clock();
+  for(int count = 0; count < ITERS; ++count) {
+    volk_32fc_32f_multiply_aligned16_manual(result_sse3, input, taps, vlen, "sse3");
+  }
+  end = clock();
+  total = (double)(end-start)/(double)CLOCKS_PER_SEC;
+  printf("sse3_time: %f\n", total);
+
+  for(i = 0; i < vlen; i++){
+    assertcomplexEqual(result_generic[i], result_sse3[i], ERR_DELTA);
+  }
+
+  free(input);
+  free(taps);
+  free(result_generic);
+  free(result_sse3);
+  
+}
+#else
+void qa_32fc_32f_multiply_aligned16::t1() {
+  printf("sse3 not available... no test performed\n");
+}
+
+#endif /* LV_HAVE_SSE3 */
+
diff --git a/volk/lib/qa_32fc_32f_multiply_aligned16.h b/volk/lib/qa_32fc_32f_multiply_aligned16.h
new file mode 100644
index 000000000..fc3b3eeb2
--- /dev/null
+++ b/volk/lib/qa_32fc_32f_multiply_aligned16.h
@@ -0,0 +1,18 @@
+#ifndef INCLUDED_QA_32FC_32F_MULTIPLY_ALIGNED16_H
+#define INCLUDED_QA_32FC_32F_MULTIPLY_ALIGNED16_H
+
+#include <cppunit/extensions/HelperMacros.h>
+#include <cppunit/TestCase.h>
+
+class qa_32fc_32f_multiply_aligned16 : public CppUnit::TestCase {
+
+  CPPUNIT_TEST_SUITE (qa_32fc_32f_multiply_aligned16);
+  CPPUNIT_TEST (t1);
+  CPPUNIT_TEST_SUITE_END ();
+
+ private:
+  void t1 ();
+};
+
+
+#endif /* INCLUDED_QA_32FC_32F_MULTIPLY_ALIGNED16_H */
diff --git a/volk/lib/qa_32fc_32f_power_32fc_aligned16.cc b/volk/lib/qa_32fc_32f_power_32fc_aligned16.cc
new file mode 100644
index 000000000..64ea65da9
--- /dev/null
+++ b/volk/lib/qa_32fc_32f_power_32fc_aligned16.cc
@@ -0,0 +1,83 @@
+#include <volk/volk_runtime.h>
+#include <volk/volk.h>
+#include <qa_32fc_32f_power_32fc_aligned16.h>
+#include <stdlib.h>
+#include <math.h>
+#include <time.h>
+
+#define assertcomplexEqual(expected, actual, delta)			\
+  CPPUNIT_ASSERT_DOUBLES_EQUAL (std::real(expected), std::real(actual), fabs(std::real(expected)) * delta); \
+  CPPUNIT_ASSERT_DOUBLES_EQUAL (std::imag(expected), std::imag(actual), fabs(std::imag(expected))* delta);	
+
+#define	ERR_DELTA	(1.5e-3)
+
+//test for sse
+static float uniform() {
+  return 2.0 * ((float) rand() / RAND_MAX - 0.5);	// uniformly (-1, 1)
+}
+
+static void
+random_floats (float *buf, unsigned n)
+{
+  for (unsigned i = 0; i < n; i++)
+    buf[i] = uniform ();
+}
+
+#ifdef LV_HAVE_SSE
+void qa_32fc_32f_power_32fc_aligned16::t1() {
+
+  const int vlen = 2046;
+  const int ITERS = 10000;
+
+  volk_environment_init();
+  int ret;
+  clock_t start, end;
+  double total;
+  std::complex<float>* input;
+  int i;
+  
+  std::complex<float>* result_generic;
+  std::complex<float>* result_sse;
+
+  ret = posix_memalign((void**)&input, 16, vlen * 2 * sizeof(float));
+  ret = posix_memalign((void**)&result_generic, 16, vlen * 2 * sizeof(float));
+  ret = posix_memalign((void**)&result_sse, 16, vlen * 2 * sizeof(float));
+
+  random_floats((float*)input, vlen * 2);
+
+  const float power = 3.2;
+  
+  printf("32fc_32f_power_32fc_aligned16\n");
+
+  start = clock();
+  for(int count = 0; count < ITERS; ++count) {
+    volk_32fc_32f_power_32fc_aligned16_manual(result_generic, input, power, vlen,  "generic");
+  }
+  end = clock();
+  total = (double)(end-start)/(double)CLOCKS_PER_SEC;
+  printf("generic_time: %f\n", total);
+
+  start = clock();
+  for(int count = 0; count < ITERS; ++count) {
+    volk_32fc_32f_power_32fc_aligned16_manual(result_sse, input, power, vlen,  "sse");
+  }
+  end = clock();
+  total = (double)(end-start)/(double)CLOCKS_PER_SEC;
+  printf("sse_time: %f\n", total);
+
+  for(i = 0; i < vlen; i++){
+    assertcomplexEqual(result_generic[i], result_sse[i], ERR_DELTA);
+  }
+
+  free(input);
+  free(result_generic);
+  free(result_sse);
+  
+}
+#else
+void qa_32fc_32f_power_32fc_aligned16::t1() {
+  printf("sse not available... no test performed\n");
+}
+
+#endif /* LV_HAVE_SSE */
+
diff --git a/volk/lib/qa_32fc_32f_power_32fc_aligned16.h b/volk/lib/qa_32fc_32f_power_32fc_aligned16.h
new file mode 100644
index 000000000..464b7b7cc
--- /dev/null
+++ b/volk/lib/qa_32fc_32f_power_32fc_aligned16.h
@@ -0,0 +1,18 @@
+#ifndef INCLUDED_QA_32FC_32F_POWER_32FC_ALIGNED16_H
+#define INCLUDED_QA_32FC_32F_POWER_32FC_ALIGNED16_H
+
+#include <cppunit/extensions/HelperMacros.h>
+#include <cppunit/TestCase.h>
+
+class qa_32fc_32f_power_32fc_aligned16 : public CppUnit::TestCase {
+
+  CPPUNIT_TEST_SUITE (qa_32fc_32f_power_32fc_aligned16);
+  CPPUNIT_TEST (t1);
+  CPPUNIT_TEST_SUITE_END ();
+
+ private:
+  void t1 ();
+};
+
+
+#endif /* INCLUDED_QA_32FC_32F_POWER_32FC_ALIGNED16_H */
diff --git a/volk/lib/qa_32fc_atan2_32f_aligned16.cc b/volk/lib/qa_32fc_atan2_32f_aligned16.cc
new file mode 100644
index 000000000..a24382d71
--- /dev/null
+++ b/volk/lib/qa_32fc_atan2_32f_aligned16.cc
@@ -0,0 +1,75 @@
+#include <volk/volk_runtime.h>
+#include <volk/volk.h>
+#include <qa_32fc_atan2_32f_aligned16.h>
+#include <volk/volk_32fc_atan2_32f_aligned16.h>
+#include <cstdlib>
+
+//test for sse
+
+#ifndef LV_HAVE_SSE
+
+void qa_32fc_atan2_32f_aligned16::t1() {
+  printf("sse not available... no test performed\n");
+}
+
+#else
+
+void qa_32fc_atan2_32f_aligned16::t1() {
+  
+  
+  volk_runtime_init();
+
+  volk_environment_init();
+  clock_t start, end;
+  double total;
+  const int vlen = 3201;
+  const int ITERS = 10000;
+  std::complex<float> input0[vlen] __attribute__ ((aligned (16)));
+  
+  float output_generic[vlen] __attribute__ ((aligned (16)));
+  float output_sse[vlen] __attribute__ ((aligned (16)));
+  float output_sse4_1[vlen] __attribute__ ((aligned (16)));
+
+  float* inputLoad = (float*)input0;
+  for(int i = 0; i < 2*vlen; ++i) {   
+    inputLoad[i] = (((float) (rand() - (RAND_MAX/2))) / static_cast<float>((RAND_MAX/2)));
+  }
+  printf("32fc_atan2_32f_aligned\n");
+
+  start = clock();
+  for(int count = 0; count < ITERS; ++count) {
+    volk_32fc_atan2_32f_aligned16_manual(output_generic, input0, 32768.0, vlen, "generic");
+  }
+  end = clock();
+  total = (double)(end-start)/(double)CLOCKS_PER_SEC;
+  printf("generic_time: %f\n", total);
+
+  start = clock();
+  for(int count = 0; count < ITERS; ++count) {
+    volk_32fc_atan2_32f_aligned16_manual(output_sse, input0, 32768.0, vlen, "sse");
+  }
+  end = clock();
+  total = (double)(end-start)/(double)CLOCKS_PER_SEC;
+  printf("sse_time: %f\n", total);
+
+  start = clock();
+  for(int count = 0; count < ITERS; ++count) {
+    get_volk_runtime()->volk_32fc_atan2_32f_aligned16(output_sse4_1, input0, 32768.0, vlen);
+  }
+  end = clock();
+  total = (double)(end-start)/(double)CLOCKS_PER_SEC;
+  printf("sse4_1_time: %f\n", total);
+
+  for(int i = 0; i < 1; ++i) {
+    //printf("inputs: %d, %d\n", input0[i*2], input0[i*2 + 1]);
+    //printf("generic... %d, ssse3... %d\n", output0[i], output1[i]);
+  }
+  
+  for(int i = 0; i < vlen; ++i) {
+    //printf("%d...%d\n", output0[i], output01[i]);
+    CPPUNIT_ASSERT_DOUBLES_EQUAL(output_generic[i], output_sse[i], fabs(output_generic[i])*1e-4);
+    CPPUNIT_ASSERT_DOUBLES_EQUAL(output_generic[i], output_sse4_1[i], fabs(output_generic[i])*1e-4);
+  }
+}
+
+#endif
diff --git a/volk/lib/qa_32fc_atan2_32f_aligned16.h b/volk/lib/qa_32fc_atan2_32f_aligned16.h
new file mode 100644
index 000000000..9c4dc209a
--- /dev/null
+++ b/volk/lib/qa_32fc_atan2_32f_aligned16.h
@@ -0,0 +1,18 @@
+#ifndef INCLUDED_QA_32FC_ATAN2_32F_ALIGNED16_H
+#define INCLUDED_QA_32FC_ATAN2_32F_ALIGNED16_H
+
+#include <cppunit/extensions/HelperMacros.h>
+#include <cppunit/TestCase.h>
+
+class qa_32fc_atan2_32f_aligned16 : public CppUnit::TestCase {
+
+  CPPUNIT_TEST_SUITE (qa_32fc_atan2_32f_aligned16);
+  CPPUNIT_TEST (t1);
+  CPPUNIT_TEST_SUITE_END ();
+
+ private:
+  void t1 ();
+};
+
+
+#endif /* INCLUDED_QA_32FC_ATAN2_32F_ALIGNED16_H */
diff --git a/volk/lib/qa_32fc_conjugate_dot_prod_aligned16.cc b/volk/lib/qa_32fc_conjugate_dot_prod_aligned16.cc
new file mode 100644
index 000000000..497914e0a
--- /dev/null
+++ b/volk/lib/qa_32fc_conjugate_dot_prod_aligned16.cc
@@ -0,0 +1,137 @@
+#include <volk/volk.h>
+#include <qa_32fc_conjugate_dot_prod_aligned16.h>
+#include <stdlib.h>
+#include <math.h>
+
+
+#define assertcomplexEqual(expected, actual, delta)			\
+  CPPUNIT_ASSERT_DOUBLES_EQUAL (std::real(expected), std::real(actual), fabs(std::real(expected)) * delta); \
+  CPPUNIT_ASSERT_DOUBLES_EQUAL (std::imag(expected), std::imag(actual), fabs(std::imag(expected))* delta);	
+
+#define	ERR_DELTA	(1e-4)
+
+//test for sse
+
+#if LV_HAVE_SSE && LV_HAVE_64
+
+static float uniform() {
+  return 2.0 * ((float) rand() / RAND_MAX - 0.5);	// uniformly (-1, 1)
+}
+
+static void
+random_floats (float *buf, unsigned n)
+{
+  for (unsigned i = 0; i < n; i++)
+    buf[i] = uniform () * 32767;
+}
+
+
+void qa_32fc_conjugate_dot_prod_aligned16::t1() {
+  const int vlen = 789743;
+  
+  volk_environment_init();
+  int ret;
+
+  std::complex<float>* input;
+  std::complex<float>* taps;
+  
+  std::complex<float>* result_generic;
+  std::complex<float>* result;
+
+  ret = posix_memalign((void**)&input, 16, vlen << 3);
+  ret = posix_memalign((void**)&taps, 16, vlen << 3);
+  ret = posix_memalign((void**)&result_generic, 16, 8);
+  ret = posix_memalign((void**)&result, 16, 8);
+  
+
+  result_generic[0] = std::complex<float>(0,0);
+  result[0] = std::complex<float>(0,0);
+
+  random_floats((float*)input, vlen * 2);
+  random_floats((float*)taps, vlen * 2);
+  
+  
+
+  volk_32fc_conjugate_dot_prod_aligned16_manual(result_generic, input, taps, vlen * 8,  "generic");
+
+  
+  volk_32fc_conjugate_dot_prod_aligned16_manual(result, input, taps, vlen * 8, "sse");
+
+  printf("32fc_conjugate_dot_prod_aligned16\n");
+  printf("generic: %f +i%f ... sse: %f +i%f\n", std::real(result_generic[0]), std::imag(result_generic[0]), std::real(result[0]), std::imag(result[0]));
+
+  assertcomplexEqual(result_generic[0], result[0], ERR_DELTA);
+
+  free(input);
+  free(taps);
+  free(result_generic);
+  free(result);
+  
+}
+
+
+#elif LV_HAVE_SSE && LV_HAVE_32
+
+static float uniform() {
+  return 2.0 * ((float) rand() / RAND_MAX - 0.5);	// uniformly (-1, 1)
+}
+
+static void
+random_floats (float *buf, unsigned n)
+{
+  for (unsigned i = 0; i < n; i++)
+    buf[i] = uniform () * 32767;
+}
+
+
+void qa_32fc_conjugate_dot_prod_aligned16::t1() {
+  const int vlen = 789743;
+  
+  volk_environment_init();
+  int ret;
+
+  std::complex<float>* input;
+  std::complex<float>* taps;
+  
+  std::complex<float>* result_generic;
+  std::complex<float>* result;
+
+  ret = posix_memalign((void**)&input, 16, vlen << 3);
+  ret = posix_memalign((void**)&taps, 16, vlen << 3);
+  ret = posix_memalign((void**)&result_generic, 16, 8);
+  ret = posix_memalign((void**)&result, 16, 8);
+  
+
+  result_generic[0] = std::complex<float>(0,0);
+  result[0] = std::complex<float>(0,0);
+
+  random_floats((float*)input, vlen * 2);
+  random_floats((float*)taps, vlen * 2);
+  
+  
+
+  volk_32fc_conjugate_dot_prod_aligned16_manual(result_generic, input, taps, vlen * 8,  "generic");
+
+  
+  volk_32fc_conjugate_dot_prod_aligned16_manual(result, input, taps, vlen * 8, "sse_32");
+
+  printf("32fc_conjugate_dot_prod_aligned16\n");
+  printf("generic: %f +i%f ... sse: %f +i%f\n", std::real(result_generic[0]), std::imag(result_generic[0]), std::real(result[0]), std::imag(result[0]));
+
+  assertcomplexEqual(result_generic[0], result[0], ERR_DELTA);
+
+  free(input);
+  free(taps);
+  free(result_generic);
+  free(result);
+  
+}
+
+
+#else
+
+void qa_32fc_conjugate_dot_prod_aligned16::t1() {
+  printf("sse not available... no test performed\n");
+}
+
+#endif /*LV_HAVE_SSE*/
diff --git a/volk/lib/qa_32fc_conjugate_dot_prod_aligned16.h b/volk/lib/qa_32fc_conjugate_dot_prod_aligned16.h
new file mode 100644
index 000000000..507b1769b
--- /dev/null
+++ b/volk/lib/qa_32fc_conjugate_dot_prod_aligned16.h
@@ -0,0 +1,18 @@
+#ifndef INCLUDED_QA_32FC_CONJUGATE_DOT_PROD_ALIGNED16_H
+#define INCLUDED_QA_32FC_CONJUGATE_DOT_PROD_ALIGNED16_H
+
+#include <cppunit/extensions/HelperMacros.h>
+#include <cppunit/TestCase.h>
+
+class qa_32fc_conjugate_dot_prod_aligned16 : public CppUnit::TestCase {
+
+  CPPUNIT_TEST_SUITE (qa_32fc_conjugate_dot_prod_aligned16);
+  CPPUNIT_TEST (t1);
+  CPPUNIT_TEST_SUITE_END ();
+
+ private:
+  void t1 ();
+};
+
+
+#endif /* INCLUDED_QA_32FC_CONJUGATE_DOT_PROD_ALIGNED16_H */
diff --git a/volk/lib/qa_32fc_deinterleave_32f_aligned16.cc b/volk/lib/qa_32fc_deinterleave_32f_aligned16.cc
new file mode 100644
index 000000000..0f5a030f5
--- /dev/null
+++ b/volk/lib/qa_32fc_deinterleave_32f_aligned16.cc
@@ -0,0 +1,63 @@
+#include <volk/volk.h>
+#include <qa_32fc_deinterleave_32f_aligned16.h>
+#include <volk/volk_32fc_deinterleave_32f_aligned16.h>
+#include <cstdlib>
+
+//test for sse
+
+#ifndef LV_HAVE_SSE
+
+void qa_32fc_deinterleave_32f_aligned16::t1() {
+  printf("sse not available... no test performed\n");
+}
+
+#else
+
+void qa_32fc_deinterleave_32f_aligned16::t1() {
+  
+  volk_environment_init();
+  clock_t start, end;
+  double total;
+  const int vlen = 3201;
+  const int ITERS = 100000;
+  std::complex<float> input0[vlen] __attribute__ ((aligned (16)));
+  
+  float output_generic[vlen] __attribute__ ((aligned (16)));
+  float output_generic1[vlen] __attribute__ ((aligned (16)));
+  float output_sse[vlen] __attribute__ ((aligned (16)));
+  float output_sse1[vlen] __attribute__ ((aligned (16)));
+
+  float* inputLoad = (float*)input0;
+  for(int i = 0; i < 2*vlen; ++i) {   
+    inputLoad[i] = (((float) (rand() - (RAND_MAX/2))) / static_cast<float>((RAND_MAX/2)));
+  }
+  printf("32fc_deinterleave_32f_aligned\n");
+
+  start = clock();
+  for(int count = 0; count < ITERS; ++count) {
+    volk_32fc_deinterleave_32f_aligned16_manual(output_generic, output_generic1, input0, vlen, "generic");
+  }
+  end = clock();
+  total = (double)(end-start)/(double)CLOCKS_PER_SEC;
+  printf("generic_time: %f\n", total);
+  start = clock();
+  for(int count = 0; count < ITERS; ++count) {
+    volk_32fc_deinterleave_32f_aligned16_manual(output_sse, output_sse1, input0, vlen, "sse");
+  }
+  end = clock();
+  total = (double)(end-start)/(double)CLOCKS_PER_SEC;
+  printf("sse_time: %f\n", total);
+
+  for(int i = 0; i < 1; ++i) {
+    //printf("inputs: %d, %d\n", input0[i*2], input0[i*2 + 1]);
+    //printf("generic... %d, ssse3... %d\n", output0[i], output1[i]);
+  }
+  
+  for(int i = 0; i < vlen; ++i) {
+    //printf("%d...%d\n", output0[i], output01[i]);
+    CPPUNIT_ASSERT_DOUBLES_EQUAL(output_generic[i], output_sse[i], fabs(output_generic[i])*1e-4);
+    CPPUNIT_ASSERT_DOUBLES_EQUAL(output_generic1[i], output_sse1[i], fabs(output_generic1[i])*1e-4);
+  }
+}
+
+#endif
diff --git a/volk/lib/qa_32fc_deinterleave_32f_aligned16.h b/volk/lib/qa_32fc_deinterleave_32f_aligned16.h
new file mode 100644
index 000000000..78660e6ad
--- /dev/null
+++ b/volk/lib/qa_32fc_deinterleave_32f_aligned16.h
@@ -0,0 +1,18 @@
+#ifndef INCLUDED_QA_32FC_DEINTERLEAVE_32F_ALIGNED16_H
+#define INCLUDED_QA_32FC_DEINTERLEAVE_32F_ALIGNED16_H
+
+#include <cppunit/extensions/HelperMacros.h>
+#include <cppunit/TestCase.h>
+
+class qa_32fc_deinterleave_32f_aligned16 : public CppUnit::TestCase {
+
+  CPPUNIT_TEST_SUITE (qa_32fc_deinterleave_32f_aligned16);
+  CPPUNIT_TEST (t1);
+  CPPUNIT_TEST_SUITE_END ();
+
+ private:
+  void t1 ();
+};
+
+
+#endif /* INCLUDED_QA_32FC_DEINTERLEAVE_32F_ALIGNED16_H */
diff --git a/volk/lib/qa_32fc_deinterleave_64f_aligned16.cc b/volk/lib/qa_32fc_deinterleave_64f_aligned16.cc
new file mode 100644
index 000000000..6e051afbc
--- /dev/null
+++ b/volk/lib/qa_32fc_deinterleave_64f_aligned16.cc
@@ -0,0 +1,63 @@
+#include <volk/volk.h>
+#include <qa_32fc_deinterleave_64f_aligned16.h>
+#include <volk/volk_32fc_deinterleave_64f_aligned16.h>
+#include <cstdlib>
+
+//test for sse2
+
+#ifndef LV_HAVE_SSE2
+
+void qa_32fc_deinterleave_64f_aligned16::t1() {
+  printf("sse2 not available... no test performed\n");
+}
+
+#else
+
+void qa_32fc_deinterleave_64f_aligned16::t1() {
+  
+  volk_environment_init();
+  clock_t start, end;
+  double total;
+  const int vlen = 3201;
+  const int ITERS = 100000;
+  std::complex<float> input0[vlen] __attribute__ ((aligned (16)));
+  
+  double output_generic[vlen] __attribute__ ((aligned (16)));
+  double output_generic1[vlen] __attribute__ ((aligned (16)));
+  double output_sse2[vlen] __attribute__ ((aligned (16)));
+  double output_sse21[vlen] __attribute__ ((aligned (16)));
+
+  float* inputLoad = (float*)input0;
+  for(int i = 0; i < 2*vlen; ++i) {   
+    inputLoad[i] = (((float) (rand() - (RAND_MAX/2))) / static_cast<float>((RAND_MAX/2)));
+  }
+  printf("32fc_deinterleave_64f_aligned\n");
+
+  start = clock();
+  for(int count = 0; count < ITERS; ++count) {
+    volk_32fc_deinterleave_64f_aligned16_manual(output_generic, output_generic1, input0, vlen, "generic");
+  }
+  end = clock();
+  total = (double)(end-start)/(double)CLOCKS_PER_SEC;
+  printf("generic_time: %f\n", total);
+  start = clock();
+  for(int count = 0; count < ITERS; ++count) {
+    volk_32fc_deinterleave_64f_aligned16_manual(output_sse2, output_sse21, input0, vlen, "sse2");
+  }
+  end = clock();
+  total = (double)(end-start)/(double)CLOCKS_PER_SEC;
+  printf("sse2_time: %f\n", total);
+
+  for(int i = 0; i < 1; ++i) {
+    //printf("inputs: %d, %d\n", input0[i*2], input0[i*2 + 1]);
+    //printf("generic... %d, ssse3... %d\n", output0[i], output1[i]);
+  }
+  
+  for(int i = 0; i < vlen; ++i) {
+    //printf("%d...%d\n", output0[i], output01[i]);
+    CPPUNIT_ASSERT_DOUBLES_EQUAL(output_generic[i], output_sse2[i], fabs(output_generic[i])*1e-4);
+    CPPUNIT_ASSERT_DOUBLES_EQUAL(output_generic1[i], output_sse21[i], fabs(output_generic1[i])*1e-4);
+  }
+}
+
+#endif
diff --git a/volk/lib/qa_32fc_deinterleave_64f_aligned16.h b/volk/lib/qa_32fc_deinterleave_64f_aligned16.h
new file mode 100644
index 000000000..f924b9752
--- /dev/null
+++ b/volk/lib/qa_32fc_deinterleave_64f_aligned16.h
@@ -0,0 +1,18 @@
+#ifndef INCLUDED_QA_32FC_DEINTERLEAVE_64F_ALIGNED16_H
+#define INCLUDED_QA_32FC_DEINTERLEAVE_64F_ALIGNED16_H
+
+#include <cppunit/extensions/HelperMacros.h>
+#include <cppunit/TestCase.h>
+
+class qa_32fc_deinterleave_64f_aligned16 : public CppUnit::TestCase {
+
+  CPPUNIT_TEST_SUITE (qa_32fc_deinterleave_64f_aligned16);
+  CPPUNIT_TEST (t1);
+  CPPUNIT_TEST_SUITE_END ();
+
+ private:
+  void t1 ();
+};
+
+
+#endif /* INCLUDED_QA_32FC_DEINTERLEAVE_64F_ALIGNED16_H */
diff --git a/volk/lib/qa_32fc_deinterleave_real_16s_aligned16.cc b/volk/lib/qa_32fc_deinterleave_real_16s_aligned16.cc
new file mode 100644
index 000000000..850518524
--- /dev/null
+++ b/volk/lib/qa_32fc_deinterleave_real_16s_aligned16.cc
@@ -0,0 +1,60 @@
+#include <volk/volk.h>
+#include <qa_32fc_deinterleave_real_16s_aligned16.h>
+#include <volk/volk_32fc_deinterleave_real_16s_aligned16.h>
+#include <cstdlib>
+
+//test for sse
+
+#ifndef LV_HAVE_SSE
+
+void qa_32fc_deinterleave_real_16s_aligned16::t1() {
+  printf("sse not available... no test performed\n");
+}
+
+#else
+
+void qa_32fc_deinterleave_real_16s_aligned16::t1() {
+  
+  volk_environment_init();
+  clock_t start, end;
+  double total;
+  const int vlen = 3201;
+  const int ITERS = 100000;
+  std::complex<float> input0[vlen] __attribute__ ((aligned (16)));
+  
+  int16_t output_generic[vlen] __attribute__ ((aligned (16)));
+  int16_t output_sse[vlen] __attribute__ ((aligned (16)));
+
+  float* inputLoad = (float*)input0;
+  for(int i = 0; i < 2*vlen; ++i) {   
+    inputLoad[i] = (((float) (rand() - (RAND_MAX/2))) / static_cast<float>((RAND_MAX/2)));
+  }
+  printf("32fc_deinterleave_real_16s_aligned\n");
+
+  start = clock();
+  for(int count = 0; count < ITERS; ++count) {
+    volk_32fc_deinterleave_real_16s_aligned16_manual(output_generic, input0, 32768.0, vlen, "generic");
+  }
+  end = clock();
+  total = (double)(end-start)/(double)CLOCKS_PER_SEC;
+  printf("generic_time: %f\n", total);
+  start = clock();
+  for(int count = 0; count < ITERS; ++count) {
+    volk_32fc_deinterleave_real_16s_aligned16_manual(output_sse, input0, 32768.0, vlen, "sse");
+  }
+  end = clock();
+  total = (double)(end-start)/(double)CLOCKS_PER_SEC;
+  printf("sse_time: %f\n", total);
+
+  for(int i = 0; i < 1; ++i) {
+    //printf("inputs: %d, %d\n", input0[i*2], input0[i*2 + 1]);
+    //printf("generic... %d, ssse3... %d\n", output0[i], output1[i]);
+  }
+  
+  for(int i = 0; i < vlen; ++i) {
+    //printf("%d...%d\n", output0[i], output01[i]);
+    CPPUNIT_ASSERT_DOUBLES_EQUAL(output_generic[i], output_sse[i], fabs(output_generic[i])*1e-4);
+  }
+}
+
+#endif
diff --git a/volk/lib/qa_32fc_deinterleave_real_16s_aligned16.h b/volk/lib/qa_32fc_deinterleave_real_16s_aligned16.h
new file mode 100644
index 000000000..68b80f27d
--- /dev/null
+++ b/volk/lib/qa_32fc_deinterleave_real_16s_aligned16.h
@@ -0,0 +1,18 @@
+#ifndef INCLUDED_QA_32FC_DEINTERLEAVE_REAL_16S_ALIGNED16_H
+#define INCLUDED_QA_32FC_DEINTERLEAVE_REAL_16S_ALIGNED16_H
+
+#include <cppunit/extensions/HelperMacros.h>
+#include <cppunit/TestCase.h>
+
+class qa_32fc_deinterleave_real_16s_aligned16 : public CppUnit::TestCase {
+
+  CPPUNIT_TEST_SUITE (qa_32fc_deinterleave_real_16s_aligned16);
+  CPPUNIT_TEST (t1);
+  CPPUNIT_TEST_SUITE_END ();
+
+ private:
+  void t1 ();
+};
+
+
+#endif /* INCLUDED_QA_32FC_DEINTERLEAVE_REAL_16S_ALIGNED16_H */
diff --git a/volk/lib/qa_32fc_deinterleave_real_32f_aligned16.cc b/volk/lib/qa_32fc_deinterleave_real_32f_aligned16.cc
new file mode 100644
index 000000000..321deb184
--- /dev/null
+++ b/volk/lib/qa_32fc_deinterleave_real_32f_aligned16.cc
@@ -0,0 +1,60 @@
+#include <volk/volk.h>
+#include <qa_32fc_deinterleave_real_32f_aligned16.h>
+#include <volk/volk_32fc_deinterleave_real_32f_aligned16.h>
+#include <cstdlib>
+
+//test for sse
+
+#ifndef LV_HAVE_SSE
+
+void qa_32fc_deinterleave_real_32f_aligned16::t1() {
+  printf("sse not available... no test performed\n");
+}
+
+#else
+
+void qa_32fc_deinterleave_real_32f_aligned16::t1() {
+  
+  volk_environment_init();
+  clock_t start, end;
+  double total;
+  const int vlen = 3201;
+  const int ITERS = 100000;
+  std::complex<float> input0[vlen] __attribute__ ((aligned (16)));
+  
+  float output_generic[vlen] __attribute__ ((aligned (16)));
+  float output_sse[vlen] __attribute__ ((aligned (16)));
+
+  float* inputLoad = (float*)input0;
+  for(int i = 0; i < 2*vlen; ++i) {   
+    inputLoad[i] = (((float) (rand() - (RAND_MAX/2))) / static_cast<float>((RAND_MAX/2)));
+  }
+  printf("32fc_deinterleave_real_32f_aligned\n");
+
+  start = clock();
+  for(int count = 0; count < ITERS; ++count) {
+    volk_32fc_deinterleave_real_32f_aligned16_manual(output_generic, input0, vlen, "generic");
+  }
+  end = clock();
+  total = (double)(end-start)/(double)CLOCKS_PER_SEC;
+  printf("generic_time: %f\n", total);
+  start = clock();
+  for(int count = 0; count < ITERS; ++count) {
+    volk_32fc_deinterleave_real_32f_aligned16_manual(output_sse, input0, vlen, "sse");
+  }
+  end = clock();
+  total = (double)(end-start)/(double)CLOCKS_PER_SEC;
+  printf("sse_time: %f\n", total);
+
+  for(int i = 0; i < 1; ++i) {
+    //printf("inputs: %d, %d\n", input0[i*2], input0[i*2 + 1]);
+    //printf("generic... %d, ssse3... %d\n", output0[i], output1[i]);
+  }
+  
+  for(int i = 0; i < vlen; ++i) {
+    //printf("%d...%d\n", output0[i], output01[i]);
+    CPPUNIT_ASSERT_DOUBLES_EQUAL(output_generic[i], output_sse[i], fabs(output_generic[i])*1e-4);
+  }
+}
+
+#endif
diff --git a/volk/lib/qa_32fc_deinterleave_real_32f_aligned16.h b/volk/lib/qa_32fc_deinterleave_real_32f_aligned16.h
new file mode 100644
index 000000000..765450bb6
--- /dev/null
+++ b/volk/lib/qa_32fc_deinterleave_real_32f_aligned16.h
@@ -0,0 +1,18 @@
+#ifndef INCLUDED_QA_32FC_DEINTERLEAVE_REAL_32F_ALIGNED16_H
+#define INCLUDED_QA_32FC_DEINTERLEAVE_REAL_32F_ALIGNED16_H
+
+#include <cppunit/extensions/HelperMacros.h>
+#include <cppunit/TestCase.h>
+
+class qa_32fc_deinterleave_real_32f_aligned16 : public CppUnit::TestCase {
+
+  CPPUNIT_TEST_SUITE (qa_32fc_deinterleave_real_32f_aligned16);
+  CPPUNIT_TEST (t1);
+  CPPUNIT_TEST_SUITE_END ();
+
+ private:
+  void t1 ();
+};
+
+
+#endif /* INCLUDED_QA_32FC_DEINTERLEAVE_REAL_32F_ALIGNED16_H */
diff --git a/volk/lib/qa_32fc_deinterleave_real_64f_aligned16.cc b/volk/lib/qa_32fc_deinterleave_real_64f_aligned16.cc
new file mode 100644
index 000000000..aedb2e387
--- /dev/null
+++ b/volk/lib/qa_32fc_deinterleave_real_64f_aligned16.cc
@@ -0,0 +1,60 @@
+#include <volk/volk.h>
+#include <qa_32fc_deinterleave_real_64f_aligned16.h>
+#include <volk/volk_32fc_deinterleave_real_64f_aligned16.h>
+#include <cstdlib>
+
+//test for sse
+
+#ifndef LV_HAVE_SSE2
+
+void qa_32fc_deinterleave_real_64f_aligned16::t1() {
+  printf("sse2 not available... no test performed\n");
+}
+
+#else
+
+void qa_32fc_deinterleave_real_64f_aligned16::t1() {
+  
+  volk_environment_init();
+  clock_t start, end;
+  double total;
+  const int vlen = 3201;
+  const int ITERS = 100000;
+  std::complex<float> input0[vlen] __attribute__ ((aligned (16)));
+  
+  double output_generic[vlen] __attribute__ ((aligned (16)));
+  double output_sse2[vlen] __attribute__ ((aligned (16)));
+
+  float* inputLoad = (float*)input0;
+  for(int i = 0; i < 2*vlen; ++i) {   
+    inputLoad[i] = (((float) (rand() - (RAND_MAX/2))) / static_cast<float>((RAND_MAX/2)));
+  }
+  printf("32fc_deinterleave_real_64f_aligned\n");
+
+  start = clock();
+  for(int count = 0; count < ITERS; ++count) {
+    volk_32fc_deinterleave_real_64f_aligned16_manual(output_generic, input0, vlen, "generic");
+  }
+  end = clock();
+  total = (double)(end-start)/(double)CLOCKS_PER_SEC;
+  printf("generic_time: %f\n", total);
+  start = clock();
+  for(int count = 0; count < ITERS; ++count) {
+    volk_32fc_deinterleave_real_64f_aligned16_manual(output_sse2, input0, vlen, "sse2");
+  }
+  end = clock();
+  total = (double)(end-start)/(double)CLOCKS_PER_SEC;
+  printf("sse_time: %f\n", total);
+
+  for(int i = 0; i < 1; ++i) {
+    //printf("inputs: %d, %d\n", input0[i*2], input0[i*2 + 1]);
+    //printf("generic... %d, ssse3... %d\n", output0[i], output1[i]);
+  }
+  
+  for(int i = 0; i < vlen; ++i) {
+    //printf("%d...%d\n", output0[i], output01[i]);
+    CPPUNIT_ASSERT_DOUBLES_EQUAL(output_generic[i], output_sse2[i], fabs(output_generic[i])*1e-4);
+  }
+}
+
+#endif
diff --git a/volk/lib/qa_32fc_deinterleave_real_64f_aligned16.h b/volk/lib/qa_32fc_deinterleave_real_64f_aligned16.h
new file mode 100644
index 000000000..3e55fb812
--- /dev/null
+++ b/volk/lib/qa_32fc_deinterleave_real_64f_aligned16.h
@@ -0,0 +1,18 @@
+#ifndef INCLUDED_QA_32FC_DEINTERLEAVE_REAL_64F_ALIGNED16_H
+#define INCLUDED_QA_32FC_DEINTERLEAVE_REAL_64F_ALIGNED16_H
+
+#include <cppunit/extensions/HelperMacros.h>
+#include <cppunit/TestCase.h>
+
+class qa_32fc_deinterleave_real_64f_aligned16 : public CppUnit::TestCase {
+
+  CPPUNIT_TEST_SUITE (qa_32fc_deinterleave_real_64f_aligned16);
+  CPPUNIT_TEST (t1);
+  CPPUNIT_TEST_SUITE_END ();
+
+ private:
+  void t1 ();
+};
+
+
+#endif /* INCLUDED_QA_32FC_DEINTERLEAVE_REAL_64F_ALIGNED16_H */
diff --git a/volk/lib/qa_32fc_dot_prod_aligned16.cc b/volk/lib/qa_32fc_dot_prod_aligned16.cc
new file mode 100644
index 000000000..bcf9ea954
--- /dev/null
+++ b/volk/lib/qa_32fc_dot_prod_aligned16.cc
@@ -0,0 +1,214 @@
+#include <volk/volk.h>
+#include <qa_32fc_dot_prod_aligned16.h>
+#include <stdlib.h>
+#include <math.h>
+#include <time.h>
+#include <stdio.h>
+
+
+
+#define assertcomplexEqual(expected, actual, delta)			\
+  CPPUNIT_ASSERT_DOUBLES_EQUAL (std::real(expected), std::real(actual), fabs(std::real(expected)) * delta); \
+  CPPUNIT_ASSERT_DOUBLES_EQUAL (std::imag(expected), std::imag(actual), fabs(std::imag(expected))* delta);	
+
+#define	ERR_DELTA	(1e-4)
+
+//test for sse
+static float uniform() {
+  return 2.0 * ((float) rand() / RAND_MAX - 0.5);	// uniformly (-1, 1)
+}
+
+static void
+random_floats (float *buf, unsigned n)
+{
+  for (unsigned i = 0; i < n; i++)
+    buf[i] = uniform ();
+}
+
+
+
+#if LV_HAVE_SSE3
+void qa_32fc_dot_prod_aligned16::t1() {
+
+  const int vlen = 2046;
+
+  volk_environment_init();
+  int ret;
+  clock_t start, end;
+  double total;
+  std::complex<float>* input;
+  std::complex<float>* taps;
+  
+  std::complex<float>* result_generic;
+  std::complex<float>* result_sse3;
+
+  ret = posix_memalign((void**)&input, 16, vlen << 3);
+  ret = posix_memalign((void**)&taps, 16, vlen << 3);
+  ret = posix_memalign((void**)&result_generic, 16, 8);
+  ret = posix_memalign((void**)&result_sse3, 16, 8);
+  
+
+  result_generic[0] = std::complex<float>(0,0);
+  result_sse3[0] = std::complex<float>(0,0);
+
+  random_floats((float*)input, vlen * 2);
+  random_floats((float*)taps, vlen * 2);
+  
+  printf("32fc_dot_prod_aligned16\n");
+  
+  start = clock();
+  volk_32fc_dot_prod_aligned16_manual(result_generic, input, taps, vlen * 8,  "generic");
+  end = clock();
+  total = (double)(end-start)/(double)CLOCKS_PER_SEC;
+  printf("generic_time: %f\n", total);
+
+  
+  start = clock();
+  volk_32fc_dot_prod_aligned16_manual(result_sse3, input, taps, vlen * 8, "sse3");
+  end = clock();
+  total = (double)(end-start)/(double)CLOCKS_PER_SEC;
+  printf("sse3_time: %f\n", total);
+
+  printf("generic: %f +i%f ... sse3: %f +i%f\n", std::real(result_generic[0]), std::imag(result_generic[0]), std::real(result_sse3[0]), std::imag(result_sse3[0]));
+
+  
+  assertcomplexEqual(result_generic[0], result_sse3[0], ERR_DELTA);
+
+  free(input);
+  free(taps);
+  free(result_generic);
+  free(result_sse3);
+  
+}
+
+#else
+void qa_32fc_dot_prod_aligned16::t1() {
+  printf("sse3 not available... no test performed\n");
+}
+
+#endif
+
+#if LV_HAVE_SSE && LV_HAVE_32
+void qa_32fc_dot_prod_aligned16::t2() {
+
+  const int vlen = 2046;
+
+  volk_environment_init();
+  int ret;
+  clock_t start, end;
+  double total;
+  std::complex<float>* input;
+  std::complex<float>* taps;
+  
+  std::complex<float>* result_generic;
+  std::complex<float>* result_sse3;
+
+  ret = posix_memalign((void**)&input, 16, vlen << 3);
+  ret = posix_memalign((void**)&taps, 16, vlen << 3);
+  ret = posix_memalign((void**)&result_generic, 16, 8);
+  ret = posix_memalign((void**)&result_sse3, 16, 8);
+  
+
+  result_generic[0] = std::complex<float>(0,0);
+  result_sse3[0] = std::complex<float>(0,0);
+
+  random_floats((float*)input, vlen * 2);
+  random_floats((float*)taps, vlen * 2);
+  
+  printf("32fc_dot_prod_aligned16\n");
+  
+  start = clock();
+  volk_32fc_dot_prod_aligned16_manual(result_generic, input, taps, vlen * 8,  "generic");
+  end = clock();
+  total = (double)(end-start)/(double)CLOCKS_PER_SEC;
+  printf("generic_time: %f\n", total);
+
+  
+  start = clock();
+  volk_32fc_dot_prod_aligned16_manual(result_sse3, input, taps, vlen * 8, "sse_32");
+  end = clock();
+  total = (double)(end-start)/(double)CLOCKS_PER_SEC;
+  printf("sse_32_time: %f\n", total);
+
+  printf("generic: %f +i%f ... sse_32: %f +i%f\n", std::real(result_generic[0]), std::imag(result_generic[0]), std::real(result_sse3[0]), std::imag(result_sse3[0]));
+
+  
+  assertcomplexEqual(result_generic[0], result_sse3[0], ERR_DELTA);
+
+  free(input);
+  free(taps);
+  free(result_generic);
+  free(result_sse3);
+  
+}
+
+#else
+void qa_32fc_dot_prod_aligned16::t2() {
+  printf("sse_32 not available... no test performed\n");
+}
+
+#endif
+
+#if LV_HAVE_SSE && LV_HAVE_64
+
+void qa_32fc_dot_prod_aligned16::t3() {
+
+  const int vlen = 2046;
+
+  volk_environment_init();
+  int ret;
+  clock_t start, end;
+  double total;
+  std::complex<float>* input;
+  std::complex<float>* taps;
+  
+  std::complex<float>* result_generic;
+  std::complex<float>* result_sse3;
+
+  ret = posix_memalign((void**)&input, 16, vlen << 3);
+  ret = posix_memalign((void**)&taps, 16, vlen << 3);
+  ret = posix_memalign((void**)&result_generic, 16, 8);
+  ret = posix_memalign((void**)&result_sse3, 16, 8);
+  
+
+  result_generic[0] = std::complex<float>(0,0);
+  result_sse3[0] = std::complex<float>(0,0);
+
+  random_floats((float*)input, vlen * 2);
+  random_floats((float*)taps, vlen * 2);
+  
+  printf("32fc_dot_prod_aligned16\n");
+  
+  start = clock();
+  volk_32fc_dot_prod_aligned16_manual(result_generic, input, taps, vlen * 8,  "generic");
+  end = clock();
+  total = (double)(end-start)/(double)CLOCKS_PER_SEC;
+  printf("generic_time: %f\n", total);
+
+  
+  start = clock();
+  volk_32fc_dot_prod_aligned16_manual(result_sse3, input, taps, vlen * 8, "sse_64");
+  end = clock();
+  total = (double)(end-start)/(double)CLOCKS_PER_SEC;
+  printf("sse_64_time: %f\n", total);
+
+  printf("generic: %f +i%f ... sse_64: %f +i%f\n", std::real(result_generic[0]), std::imag(result_generic[0]), std::real(result_sse3[0]), std::imag(result_sse3[0]));
+
+  
+  assertcomplexEqual(result_generic[0], result_sse3[0], ERR_DELTA);
+
+  free(input);
+  free(taps);
+  free(result_generic);
+  free(result_sse3);
+  
+}
+
+#else
+void qa_32fc_dot_prod_aligned16::t3() {
+  printf("sse_64 not available... no test performed\n");
+}
+
+
+
+#endif 
diff --git a/volk/lib/qa_32fc_dot_prod_aligned16.h b/volk/lib/qa_32fc_dot_prod_aligned16.h
new file mode 100644
index 000000000..4b360db27
--- /dev/null
+++ b/volk/lib/qa_32fc_dot_prod_aligned16.h
@@ -0,0 +1,20 @@
+#ifndef INCLUDED_QA_32FC_DOT_PROD_ALIGNED16_H
+#define INCLUDED_QA_32FC_DOT_PROD_ALIGNED16_H
+
+#include <cppunit/extensions/HelperMacros.h>
+#include <cppunit/TestCase.h>
+
+class qa_32fc_dot_prod_aligned16 : public CppUnit::TestCase {
+
+  CPPUNIT_TEST_SUITE (qa_32fc_dot_prod_aligned16);
+  CPPUNIT_TEST (t1);
+  CPPUNIT_TEST_SUITE_END ();
+
+ private:
+  void t1 ();
+  void t2 ();
+  void t3 ();
+};
+
+
+#endif /* INCLUDED_QA_32FC_DOT_PROD_ALIGNED16_H */
diff --git a/volk/lib/qa_32fc_index_max_aligned16.cc b/volk/lib/qa_32fc_index_max_aligned16.cc
new file mode 100644
index 000000000..4d83f1639
--- /dev/null
+++ b/volk/lib/qa_32fc_index_max_aligned16.cc
@@ -0,0 +1,89 @@
+#include <volk/volk.h>
+#include <qa_32fc_index_max_aligned16.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <time.h>
+
+#define ERR_DELTA (1e-4)
+#define NUM_ITERS 1000000
+#define VEC_LEN 3096
+static float uniform() {
+  return 2.0 * ((float) rand() / RAND_MAX - 0.5);	// uniformly (-1, 1)
+}
+
+static void
+random_floats (float *buf, unsigned n)
+{
+  unsigned int i = 0;
+  for (; i < n; i++) {
+
+    buf[i] = uniform () * 32767;
+
+  }
+}
+
+
+#ifndef LV_HAVE_SSE3
+
+void qa_32fc_index_max_aligned16::t1(){
+  printf("sse3 not available... no test performed\n");
+}
+
+#else
+
+
+void qa_32fc_index_max_aligned16::t1(){
+ 
+  const int vlen = VEC_LEN;
+  
+  volk_environment_init();
+  int ret;
+  
+  unsigned int* target;
+  unsigned int* target_generic;
+  std::complex<float>* src0 ;
+  
+  
+  unsigned int i_target;
+  target = &i_target;
+  unsigned int i_target_generic;
+  target_generic = &i_target_generic;
+  ret = posix_memalign((void**)&src0, 16, vlen << 3);
+  
+  random_floats((float*)src0, vlen * 2);
+  
+  printf("32fc_index_max_aligned16\n");
+
+  clock_t start, end;
+  double total;
+  
+  
+  start = clock();
+  for(int k = 0; k < NUM_ITERS; ++k) {
+    volk_32fc_index_max_aligned16_manual(target_generic, src0, vlen << 3, "generic");
+  }
+  end = clock();  
+  total = (double)(end-start)/(double)CLOCKS_PER_SEC;
+  printf("generic time: %f\n", total);
+
+  start = clock();
+  for(int k = 0; k < NUM_ITERS; ++k) {
+  volk_32fc_index_max_aligned16_manual(target, src0, vlen << 3, "sse3");
+  }
+  
+  end = clock();  
+  total = (double)(end-start)/(double)CLOCKS_PER_SEC;
+  printf("sse3 time: %f\n", total);
+
+  
+  
+  
+  printf("generic: %u, sse3: %u\n", target_generic[0], target[0]);
+  CPPUNIT_ASSERT_DOUBLES_EQUAL(target_generic[0], target[0], 1.1);
+  
+
+  
+  free(src0);
+}
+
+#endif /*LV_HAVE_SSE3*/
diff --git a/volk/lib/qa_32fc_index_max_aligned16.h b/volk/lib/qa_32fc_index_max_aligned16.h
new file mode 100644
index 000000000..0990bcb1f
--- /dev/null
+++ b/volk/lib/qa_32fc_index_max_aligned16.h
@@ -0,0 +1,18 @@
+#ifndef INCLUDED_QA_32FC_INDEX_MAX_ALIGNED16_H
+#define INCLUDED_QA_32FC_INDEX_MAX_ALIGNED16_H
+
+#include <cppunit/extensions/HelperMacros.h>
+#include <cppunit/TestCase.h>
+
+class qa_32fc_index_max_aligned16 : public CppUnit::TestCase {
+
+  CPPUNIT_TEST_SUITE (qa_32fc_index_max_aligned16);
+  CPPUNIT_TEST (t1);
+  CPPUNIT_TEST_SUITE_END ();
+
+ private:
+  void t1 ();
+};
+
+
+#endif /* INCLUDED_QA_32FC_INDEX_MAX_ALIGNED16_H */
diff --git a/volk/lib/qa_32fc_magnitude_16s_aligned16.cc b/volk/lib/qa_32fc_magnitude_16s_aligned16.cc
new file mode 100644
index 000000000..a4be1616b
--- /dev/null
+++ b/volk/lib/qa_32fc_magnitude_16s_aligned16.cc
@@ -0,0 +1,70 @@
+#include <volk/volk.h>
+#include <qa_32fc_magnitude_16s_aligned16.h>
+#include <volk/volk_32fc_magnitude_16s_aligned16.h>
+#include <cstdlib>
+
+//test for sse
+
+#ifndef LV_HAVE_SSE3
+
+void qa_32fc_magnitude_16s_aligned16::t1() {
+  printf("sse3 not available... no test performed\n");
+}
+
+#else
+
+void qa_32fc_magnitude_16s_aligned16::t1() {
+  
+  volk_environment_init();
+  clock_t start, end;
+  double total;
+  const int vlen = 3201;
+  const int ITERS = 100000;
+  std::complex<float> input0[vlen] __attribute__ ((aligned (16)));
+  
+  int16_t output_generic[vlen] __attribute__ ((aligned (16)));
+  int16_t output_sse[vlen] __attribute__ ((aligned (16)));
+  int16_t output_sse3[vlen] __attribute__ ((aligned (16)));
+
+  float* inputLoad = (float*)input0;
+  for(int i = 0; i < 2*vlen; ++i) {   
+    inputLoad[i] = (((float) (rand() - (RAND_MAX/2))) / static_cast<float>((RAND_MAX/2)));
+  }
+  printf("32fc_magnitude_16s_aligned\n");
+
+  start = clock();
+  for(int count = 0; count < ITERS; ++count) {
+    volk_32fc_magnitude_16s_aligned16_manual(output_generic, input0, 32768.0, vlen, "generic");
+  }
+  end = clock();
+  total = (double)(end-start)/(double)CLOCKS_PER_SEC;
+  printf("generic_time: %f\n", total);
+  start = clock();
+  for(int count = 0; count < ITERS; ++count) {
+    volk_32fc_magnitude_16s_aligned16_manual(output_sse, input0, 32768.0, vlen, "sse");
+  }
+  end = clock();
+  total = (double)(end-start)/(double)CLOCKS_PER_SEC;
+  printf("sse_time: %f\n", total);
+
+  start = clock();
+  for(int count = 0; count < ITERS; ++count) {
+    volk_32fc_magnitude_16s_aligned16_manual(output_sse3, input0, 32768.0, vlen, "sse3");
+  }
+  end = clock();
+  total = (double)(end-start)/(double)CLOCKS_PER_SEC;
+  printf("sse3_time: %f\n", total);
+
+  for(int i = 0; i < 1; ++i) {
+    //printf("inputs: %d, %d\n", input0[i*2], input0[i*2 + 1]);
+    //printf("generic... %d, ssse3... %d\n", output0[i], output1[i]);
+  }
+  
+  for(int i = 0; i < vlen; ++i) {
+    //printf("%d...%d\n", output0[i], output01[i]);
+    CPPUNIT_ASSERT_DOUBLES_EQUAL(output_generic[i], output_sse[i], 1.1);
+    CPPUNIT_ASSERT_DOUBLES_EQUAL(output_generic[i], output_sse3[i], 1.1);
+  }
+}
+
+#endif
diff --git a/volk/lib/qa_32fc_magnitude_16s_aligned16.h b/volk/lib/qa_32fc_magnitude_16s_aligned16.h
new file mode 100644
index 000000000..ffdf1dd9e
--- /dev/null
+++ b/volk/lib/qa_32fc_magnitude_16s_aligned16.h
@@ -0,0 +1,18 @@
+#ifndef INCLUDED_QA_32FC_MAGNITUDE_16S_ALIGNED16_H
+#define INCLUDED_QA_32FC_MAGNITUDE_16S_ALIGNED16_H
+
+#include <cppunit/extensions/HelperMacros.h>
+#include <cppunit/TestCase.h>
+
+class qa_32fc_magnitude_16s_aligned16 : public CppUnit::TestCase {
+
+  CPPUNIT_TEST_SUITE (qa_32fc_magnitude_16s_aligned16);
+  CPPUNIT_TEST (t1);
+  CPPUNIT_TEST_SUITE_END ();
+
+ private:
+  void t1 ();
+};
+
+
+#endif /* INCLUDED_QA_32FC_MAGNITUDE_16S_ALIGNED16_H */
diff --git a/volk/lib/qa_32fc_magnitude_32f_aligned16.cc b/volk/lib/qa_32fc_magnitude_32f_aligned16.cc
new file mode 100644
index 000000000..d69ada408
--- /dev/null
+++ b/volk/lib/qa_32fc_magnitude_32f_aligned16.cc
@@ -0,0 +1,70 @@
+#include <volk/volk.h>
+#include <qa_32fc_magnitude_32f_aligned16.h>
+#include <volk/volk_32fc_magnitude_32f_aligned16.h>
+#include <cstdlib>
+
+//test for sse
+
+#ifndef LV_HAVE_SSE3
+
+void qa_32fc_magnitude_32f_aligned16::t1() {
+  printf("sse3 not available... no test performed\n");
+}
+
+#else
+
+void qa_32fc_magnitude_32f_aligned16::t1() {
+  
+  volk_environment_init();
+  clock_t start, end;
+  double total;
+  const int vlen = 3201;
+  const int ITERS = 100000;
+  std::complex<float> input0[vlen] __attribute__ ((aligned (16)));
+  
+  float output_generic[vlen] __attribute__ ((aligned (16)));
+  float output_sse[vlen] __attribute__ ((aligned (16)));
+  float output_sse3[vlen] __attribute__ ((aligned (16)));
+
+  float* inputLoad = (float*)input0;
+  for(int i = 0; i < 2*vlen; ++i) {   
+    inputLoad[i] = (((float) (rand() - (RAND_MAX/2))) / static_cast<float>((RAND_MAX/2)));
+  }
+  printf("32fc_magnitude_32f_aligned\n");
+
+  start = clock();
+  for(int count = 0; count < ITERS; ++count) {
+    volk_32fc_magnitude_32f_aligned16_manual(output_generic, input0, vlen, "generic");
+  }
+  end = clock();
+  total = (double)(end-start)/(double)CLOCKS_PER_SEC;
+  printf("generic_time: %f\n", total);
+  start = clock();
+  for(int count = 0; count < ITERS; ++count) {
+    volk_32fc_magnitude_32f_aligned16_manual(output_sse, input0, vlen, "sse");
+  }
+  end = clock();
+  total = (double)(end-start)/(double)CLOCKS_PER_SEC;
+  printf("sse_time: %f\n", total);
+
+  start = clock();
+  for(int count = 0; count < ITERS; ++count) {
+    volk_32fc_magnitude_32f_aligned16_manual(output_sse3, input0, vlen, "sse3");
+  }
+  end = clock();
+  total = (double)(end-start)/(double)CLOCKS_PER_SEC;
+  printf("sse3_time: %f\n", total);
+
+  for(int i = 0; i < 1; ++i) {
+    //printf("inputs: %d, %d\n", input0[i*2], input0[i*2 + 1]);
+    //printf("generic... %d, ssse3... %d\n", output0[i], output1[i]);
+  }
+  
+  for(int i = 0; i < vlen; ++i) {
+    //printf("%d...%d\n", output0[i], output01[i]);
+    CPPUNIT_ASSERT_DOUBLES_EQUAL(output_generic[i], output_sse[i], fabs(output_generic[i])*1e-4);
+    CPPUNIT_ASSERT_DOUBLES_EQUAL(output_generic[i], output_sse3[i], fabs(output_generic[i])*1e-4);
+  }
+}
+
+#endif
diff --git a/volk/lib/qa_32fc_magnitude_32f_aligned16.h b/volk/lib/qa_32fc_magnitude_32f_aligned16.h
new file mode 100644
index 000000000..a2881308c
--- /dev/null
+++ b/volk/lib/qa_32fc_magnitude_32f_aligned16.h
@@ -0,0 +1,18 @@
+#ifndef INCLUDED_QA_32FC_MAGNITUDE_32F_ALIGNED16_H
+#define INCLUDED_QA_32FC_MAGNITUDE_32F_ALIGNED16_H
+
+#include <cppunit/extensions/HelperMacros.h>
+#include <cppunit/TestCase.h>
+
+class qa_32fc_magnitude_32f_aligned16 : public CppUnit::TestCase {
+
+  CPPUNIT_TEST_SUITE (qa_32fc_magnitude_32f_aligned16);
+  CPPUNIT_TEST (t1);
+  CPPUNIT_TEST_SUITE_END ();
+
+ private:
+  void t1 ();
+};
+
+
+#endif /* INCLUDED_QA_32FC_MAGNITUDE_32F_ALIGNED16_H */
diff --git a/volk/lib/qa_32fc_multiply_aligned16.cc b/volk/lib/qa_32fc_multiply_aligned16.cc
new file mode 100644
index 000000000..e1f7eab3d
--- /dev/null
+++ b/volk/lib/qa_32fc_multiply_aligned16.cc
@@ -0,0 +1,86 @@
+#include <volk/volk_runtime.h>
+#include <volk/volk.h>
+#include <qa_32fc_multiply_aligned16.h>
+#include <stdlib.h>
+#include <math.h>
+#include <time.h>
+
+
+
+#define assertcomplexEqual(expected, actual, delta)			\
+  CPPUNIT_ASSERT_DOUBLES_EQUAL (std::real(expected), std::real(actual), fabs(std::real(expected)) * delta); \
+  CPPUNIT_ASSERT_DOUBLES_EQUAL (std::imag(expected), std::imag(actual), fabs(std::imag(expected))* delta);	
+
+#define	ERR_DELTA	(1e-3)
+
+//test for sse
+static float uniform() {
+  return 2.0 * ((float) rand() / RAND_MAX - 0.5);	// uniformly (-1, 1)
+}
+
+static void
+random_floats (float *buf, unsigned n)
+{
+  for (unsigned i = 0; i < n; i++)
+    buf[i] = uniform ();
+}
+
+#ifdef LV_HAVE_SSE3
+void qa_32fc_multiply_aligned16::t1() {
+
+  const int vlen = 2046;
+  const int ITERS = 100000;
+
+  int i;
+  volk_environment_init();
+  int ret;
+  clock_t start, end;
+  double total;
+  std::complex<float>* input;
+  std::complex<float>* taps;
+  
+  std::complex<float>* result_generic;
+  std::complex<float>* result_sse3;
+
+  ret = posix_memalign((void**)&input, 16, vlen*2*sizeof(float));
+  ret = posix_memalign((void**)&taps, 16, vlen*2*sizeof(float));
+  ret = posix_memalign((void**)&result_generic, 16, vlen*2*sizeof(float));
+  ret = posix_memalign((void**)&result_sse3, 16, vlen*2*sizeof(float));
+  
+  random_floats((float*)input, vlen * 2);
+  random_floats((float*)taps, vlen * 2);
+  
+  printf("32fc_multiply_aligned16\n");
+
+  start = clock();
+  for(int count = 0; count < ITERS; ++count) {
+    volk_32fc_multiply_aligned16_manual(result_generic, input, taps, vlen,  "generic");
+  }
+  end = clock();
+  total = (double)(end-start)/(double)CLOCKS_PER_SEC;
+  printf("generic_time: %f\n", total);
+
+  start = clock();
+  for(int count = 0; count < ITERS; ++count) {
+    volk_32fc_multiply_aligned16_manual(result_sse3, input, taps, vlen, "sse3");
+  }
+  end = clock();
+  total = (double)(end-start)/(double)CLOCKS_PER_SEC;
+  printf("sse3_time: %f\n", total);
+
+  for(i = 0; i < vlen; i++){
+    assertcomplexEqual(result_generic[i], result_sse3[i], ERR_DELTA);
+  }
+
+  free(input);
+  free(taps);
+  free(result_generic);
+  free(result_sse3);
+  
+}
+#else
+void qa_32fc_multiply_aligned16::t1() {
+  printf("sse3 not available... no test performed\n");
+}
+
+#endif /* LV_HAVE_SSE3 */
diff --git a/volk/lib/qa_32fc_multiply_aligned16.h b/volk/lib/qa_32fc_multiply_aligned16.h
new file mode 100644
index 000000000..c8abaa8fe
--- /dev/null
+++ b/volk/lib/qa_32fc_multiply_aligned16.h
@@ -0,0 +1,18 @@
+#ifndef INCLUDED_QA_32FC_MULTIPLY_ALIGNED16_H
+#define INCLUDED_QA_32FC_MULTIPLY_ALIGNED16_H
+
+#include <cppunit/extensions/HelperMacros.h>
+#include <cppunit/TestCase.h>
+
+class qa_32fc_multiply_aligned16 : public CppUnit::TestCase {
+
+  CPPUNIT_TEST_SUITE (qa_32fc_multiply_aligned16);
+  CPPUNIT_TEST (t1);
+  CPPUNIT_TEST_SUITE_END ();
+
+ private:
+  void t1 ();
+};
+
+
+#endif /* INCLUDED_QA_32FC_MULTIPLY_ALIGNED16_H */
diff --git a/volk/lib/qa_32fc_power_spectral_density_32f_aligned16.cc b/volk/lib/qa_32fc_power_spectral_density_32f_aligned16.cc
new file mode 100644
index 000000000..83cdf4b15
--- /dev/null
+++ b/volk/lib/qa_32fc_power_spectral_density_32f_aligned16.cc
@@ -0,0 +1,63 @@
+#include <volk/volk.h>
+#include <qa_32fc_power_spectral_density_32f_aligned16.h>
+#include <volk/volk_32fc_power_spectral_density_32f_aligned16.h>
+#include <cstdlib>
+
+//test for sse3
+
+#ifndef LV_HAVE_SSE3
+
+void qa_32fc_power_spectral_density_32f_aligned16::t1() {
+  printf("sse3 not available... no test performed\n");
+}
+
+#else
+
+void qa_32fc_power_spectral_density_32f_aligned16::t1() {
+  
+  volk_environment_init();
+  clock_t start, end;
+  double total;
+  const int vlen = 3201;
+  const int ITERS = 10000;
+  std::complex<float> input0[vlen] __attribute__ ((aligned (16)));
+  
+  float output_generic[vlen] __attribute__ ((aligned (16)));
+  float output_sse3[vlen] __attribute__ ((aligned (16)));
+
+  const float scalar = vlen;
+  const float rbw = 1.7;
+
+  float* inputLoad = (float*)input0;
+  for(int i = 0; i < 2*vlen; ++i) {   
+    inputLoad[i] = (((float) (rand() - (RAND_MAX/2))) / static_cast<float>((RAND_MAX/2)));
+  }
+  printf("32fc_power_spectral_density_32f_aligned\n");
+
+  start = clock();
+  for(int count = 0; count < ITERS; ++count) {
+    volk_32fc_power_spectral_density_32f_aligned16_manual(output_generic, input0, scalar, rbw, vlen, "generic");
+  }
+  end = clock();
+  total = (double)(end-start)/(double)CLOCKS_PER_SEC;
+  printf("generic_time: %f\n", total);
+  start = clock();
+  for(int count = 0; count < ITERS; ++count) {
+    volk_32fc_power_spectral_density_32f_aligned16_manual(output_sse3, input0, scalar, rbw, vlen, "sse3");
+  }
+  end = clock();
+  total = (double)(end-start)/(double)CLOCKS_PER_SEC;
+  printf("sse3_time: %f\n", total);
+
+  for(int i = 0; i < 1; ++i) {
+    //printf("inputs: %d, %d\n", input0[i*2], input0[i*2 + 1]);
+    //printf("generic... %d, ssse3... %d\n", output0[i], output1[i]);
+  }
+  
+  for(int i = 0; i < vlen; ++i) {
+    //printf("%d...%d\n", output0[i], output01[i]);
+    CPPUNIT_ASSERT_DOUBLES_EQUAL(output_generic[i], output_sse3[i], fabs(output_generic[i]*1e-4));
+  }
+}
+
+#endif
diff --git a/volk/lib/qa_32fc_power_spectral_density_32f_aligned16.h b/volk/lib/qa_32fc_power_spectral_density_32f_aligned16.h
new file mode 100644
index 000000000..26f430bec
--- /dev/null
+++ b/volk/lib/qa_32fc_power_spectral_density_32f_aligned16.h
@@ -0,0 +1,18 @@
+#ifndef INCLUDED_QA_32FC_POWER_SPECTRAL_DENSITY_32F_ALIGNED16_H
+#define INCLUDED_QA_32FC_POWER_SPECTRAL_DENSITY_32F_ALIGNED16_H
+
+#include <cppunit/extensions/HelperMacros.h>
+#include <cppunit/TestCase.h>
+
+class qa_32fc_power_spectral_density_32f_aligned16 : public CppUnit::TestCase {
+
+  CPPUNIT_TEST_SUITE (qa_32fc_power_spectral_density_32f_aligned16);
+  CPPUNIT_TEST (t1);
+  CPPUNIT_TEST_SUITE_END ();
+
+ private:
+  void t1 ();
+};
+
+
+#endif /* INCLUDED_QA_32FC_POWER_SPECTRAL_DENSITY_32F_ALIGNED16_H */
diff --git a/volk/lib/qa_32fc_power_spectrum_32f_aligned16.cc b/volk/lib/qa_32fc_power_spectrum_32f_aligned16.cc
new file mode 100644
index 000000000..4d1359068
--- /dev/null
+++ b/volk/lib/qa_32fc_power_spectrum_32f_aligned16.cc
@@ -0,0 +1,63 @@
+#include <volk/volk.h>
+#include <qa_32fc_power_spectrum_32f_aligned16.h>
+#include <volk/volk_32fc_power_spectrum_32f_aligned16.h>
+#include <cstdlib>
+
+//test for sse3
+
+#ifndef LV_HAVE_SSE3
+
+void qa_32fc_power_spectrum_32f_aligned16::t1() {
+  printf("sse3 not available... no test performed\n");
+}
+
+#else
+
+void qa_32fc_power_spectrum_32f_aligned16::t1() {
+  
+  volk_environment_init();
+  clock_t start, end;
+  double total;
+  const int vlen = 3201;
+  const int ITERS = 10000;
+  std::complex<float> input0[vlen] __attribute__ ((aligned (16)));
+  
+  float output_generic[vlen] __attribute__ ((aligned (16)));
+  float output_sse3[vlen] __attribute__ ((aligned (16)));
+
+  const float scalar = vlen;
+
+  float* inputLoad = (float*)input0;
+  for(int i = 0; i < 2*vlen; ++i) {   
+    inputLoad[i] = (((float) (rand() - (RAND_MAX/2))) / static_cast<float>((RAND_MAX/2)));
+  }
+
+  printf("32fc_power_spectrum_32f_aligned\n");
+
+  start = clock();
+  for(int count = 0; count < ITERS; ++count) {
+    volk_32fc_power_spectrum_32f_aligned16_manual(output_generic, input0, scalar, vlen, "generic");
+  }
+  end = clock();
+  total = (double)(end-start)/(double)CLOCKS_PER_SEC;
+  printf("generic_time: %f\n", total);
+  start = clock();
+  for(int count = 0; count < ITERS; ++count) {
+    volk_32fc_power_spectrum_32f_aligned16_manual(output_sse3, input0, scalar, vlen, "sse3");
+  }
+  end = clock();
+  total = (double)(end-start)/(double)CLOCKS_PER_SEC;
+  printf("sse3_time: %f\n", total);
+
+  for(int i = 0; i < 1; ++i) {
+    //printf("inputs: %d, %d\n", input0[i*2], input0[i*2 + 1]);
+    //printf("generic... %d, ssse33... %d\n", output0[i], output1[i]);
+  }
+  
+  for(int i = 0; i < vlen; ++i) {
+    //printf("%d...%d\n", output0[i], output01[i]);
+    CPPUNIT_ASSERT_DOUBLES_EQUAL(output_generic[i], output_sse3[i], fabs(output_generic[i]*1e-4));
+  }
+}
+
+#endif
diff --git a/volk/lib/qa_32fc_power_spectrum_32f_aligned16.h b/volk/lib/qa_32fc_power_spectrum_32f_aligned16.h
new file mode 100644
index 000000000..d991223f3
--- /dev/null
+++ b/volk/lib/qa_32fc_power_spectrum_32f_aligned16.h
@@ -0,0 +1,18 @@
+#ifndef INCLUDED_QA_32FC_POWER_SPECTRUM_32F_ALIGNED16_H
+#define INCLUDED_QA_32FC_POWER_SPECTRUM_32F_ALIGNED16_H
+
+#include <cppunit/extensions/HelperMacros.h>
+#include <cppunit/TestCase.h>
+
+class qa_32fc_power_spectrum_32f_aligned16 : public CppUnit::TestCase {
+
+  CPPUNIT_TEST_SUITE (qa_32fc_power_spectrum_32f_aligned16);
+  CPPUNIT_TEST (t1);
+  CPPUNIT_TEST_SUITE_END ();
+
+ private:
+  void t1 ();
+};
+
+
+#endif /* INCLUDED_QA_32FC_POWER_SPECTRUM_32F_ALIGNED16_H */
diff --git a/volk/lib/qa_32fc_square_dist_aligned16.cc b/volk/lib/qa_32fc_square_dist_aligned16.cc
new file mode 100644
index 000000000..d9ead8495
--- /dev/null
+++ b/volk/lib/qa_32fc_square_dist_aligned16.cc
@@ -0,0 +1,91 @@
+#include <volk/volk.h>
+#include <qa_32fc_square_dist_aligned16.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <time.h>
+
+#define ERR_DELTA (1e-4)
+#define NUM_ITERS 10000000
+#define VEC_LEN 64
+static float uniform() {
+  return 2.0 * ((float) rand() / RAND_MAX - 0.5);	// uniformly (-1, 1)
+}
+
+static void
+random_floats (float *buf, unsigned n)
+{
+  unsigned int i = 0;
+  for (; i < n; i++) {
+
+    buf[i] = uniform () * 32767;
+
+  }
+}
+
+
+#ifndef LV_HAVE_SSE3
+
+void qa_32fc_square_dist_aligned16::t1(){
+  printf("sse3 not available... no test performed\n");
+}
+
+#else
+
+
+void qa_32fc_square_dist_aligned16::t1(){
+  int i = 0;
+  
+  const int vlen = VEC_LEN;
+  volk_environment_init();
+  int ret;
+  
+  float* target;
+  float* target_generic;
+  std::complex<float>* src0 ;
+  std::complex<float>* points;
+
+  ret = posix_memalign((void**)&points, 16, vlen << 3);
+  ret = posix_memalign((void**)&target, 16, vlen << 2);
+  ret = posix_memalign((void**)&target_generic, 16, vlen << 2);
+  ret = posix_memalign((void**)&src0, 16, 8);
+  
+  random_floats((float*)points, vlen * 2);
+  random_floats((float*)src0, 2);
+  
+  printf("32fc_square_dist_aligned16\n");
+  
+  clock_t start, end;
+  double total;
+  
+  
+  start = clock();
+  for(int k = 0; k < NUM_ITERS; ++k) {
+    volk_32fc_square_dist_aligned16_manual(target_generic, src0, points, vlen << 3, "generic");
+  }
+  end = clock();  
+  total = (double)(end-start)/(double)CLOCKS_PER_SEC;
+  printf("generic time: %f\n", total);
+
+  start = clock();
+  for(int k = 0; k < NUM_ITERS; ++k) {
+  volk_32fc_square_dist_aligned16_manual(target, src0, points, vlen << 3, "sse3");
+  }
+  
+  end = clock();  
+  total = (double)(end-start)/(double)CLOCKS_PER_SEC;
+  printf("sse3 time: %f\n", total);
+
+  
+  
+  for(; i < vlen; ++i) {
+    //printf("generic: %f, sse3: %f\n", target_generic[i], target[i]);
+    CPPUNIT_ASSERT_DOUBLES_EQUAL(target_generic[i], target[i], fabs(target_generic[i]) * ERR_DELTA);
+  }
+
+  free(target);
+  free(target_generic);
+  free(points);
+  free(src0);
+}
+
+#endif /*LV_HAVE_SSE3*/
diff --git a/volk/lib/qa_32fc_square_dist_aligned16.h b/volk/lib/qa_32fc_square_dist_aligned16.h
new file mode 100644
index 000000000..9d365d8b0
--- /dev/null
+++ b/volk/lib/qa_32fc_square_dist_aligned16.h
@@ -0,0 +1,18 @@
+#ifndef INCLUDED_QA_32FC_SQUARE_DIST_ALIGNED16_H
+#define INCLUDED_QA_32FC_SQUARE_DIST_ALIGNED16_H
+
+#include <cppunit/extensions/HelperMacros.h>
+#include <cppunit/TestCase.h>
+
+class qa_32fc_square_dist_aligned16 : public CppUnit::TestCase {
+
+  CPPUNIT_TEST_SUITE (qa_32fc_square_dist_aligned16);
+  CPPUNIT_TEST (t1);
+  CPPUNIT_TEST_SUITE_END ();
+
+ private:
+  void t1 ();
+};
+
+
+#endif /* INCLUDED_QA_32FC_SQUARE_DIST_ALIGNED16_H */
diff --git a/volk/lib/qa_32fc_square_dist_scalar_mult_aligned16.cc b/volk/lib/qa_32fc_square_dist_scalar_mult_aligned16.cc
new file mode 100644
index 000000000..f923d1d5c
--- /dev/null
+++ b/volk/lib/qa_32fc_square_dist_scalar_mult_aligned16.cc
@@ -0,0 +1,96 @@
+#include <volk/volk.h>
+#include <qa_32fc_square_dist_scalar_mult_aligned16.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <math.h>
+#include <time.h>
+
+#define ERR_DELTA .0001
+#define NUM_ITERS 10000000
+#define VEC_LEN 64
+
+static float uniform() {
+  return 2.0 * ((float) rand() / RAND_MAX - 0.5);	// uniformly (-1, 1)
+}
+
+static void
+random_floats (float *buf, unsigned n)
+{
+  unsigned int i = 0;
+  for (; i < n; i++) {
+
+    buf[i] = uniform () * 32767;
+
+  }
+}
+
+
+#ifndef LV_HAVE_SSE3
+
+void qa_32fc_square_dist_scalar_mult_aligned16::t1(){
+  printf("sse3 not available... no test performed\n");
+}
+
+#else
+
+
+void qa_32fc_square_dist_scalar_mult_aligned16::t1(){
+  int i = 0;
+  
+  const int vlen = VEC_LEN;
+  
+  volk_environment_init();
+  int ret;
+  
+  float* target;
+  float* target_generic;
+  std::complex<float>* src0 ;
+  std::complex<float>* points;
+  float scalar;
+
+  ret = posix_memalign((void**)&points, 16, vlen << 3);
+  ret = posix_memalign((void**)&target, 16, vlen << 2);
+  ret = posix_memalign((void**)&target_generic, 16, vlen << 2);
+  ret = posix_memalign((void**)&src0, 16, 8);
+  
+  random_floats((float*)points, vlen * 2);
+  random_floats((float*)src0, 2);
+  random_floats(&scalar, 1);
+  
+  printf("32fc_square_dist_scalar_mult_aligned16\n");
+  
+  clock_t start, end;
+  double total;
+  
+  
+  start = clock();
+  for(int k = 0; k < NUM_ITERS; ++k) {
+    volk_32fc_square_dist_scalar_mult_aligned16_manual(target_generic, src0, points, scalar, vlen << 3, "generic");
+  }
+  end = clock();  
+  total = (double)(end-start)/(double)CLOCKS_PER_SEC;
+  printf("generic time: %f\n", total);
+  
+  start = clock();
+  for(int k = 0; k < NUM_ITERS; ++k) {
+    volk_32fc_square_dist_scalar_mult_aligned16_manual(target, src0, points, scalar, vlen << 3, "sse3");
+  }
+  
+  end = clock();  
+  total = (double)(end-start)/(double)CLOCKS_PER_SEC;
+  printf("sse3 time: %f\n", total);
+
+  
+  
+  for(i = 0; i < vlen; ++i) {
+    printf("generic: %f, sse3: %f\n", target_generic[i], target[i]);
+    CPPUNIT_ASSERT_DOUBLES_EQUAL(target[i], target_generic[i], fabs(target_generic[1]) * ERR_DELTA);//, target_generic[1] * ERR_DELTA);
+  }
+
+  free(target);
+  free(target_generic);
+  free(points);
+  free(src0);
+}
+
+#endif /*LV_HAVE_SSE3*/
diff --git a/volk/lib/qa_32fc_square_dist_scalar_mult_aligned16.h b/volk/lib/qa_32fc_square_dist_scalar_mult_aligned16.h
new file mode 100644
index 000000000..ac4e3c45b
--- /dev/null
+++ b/volk/lib/qa_32fc_square_dist_scalar_mult_aligned16.h
@@ -0,0 +1,18 @@
+#ifndef INCLUDED_QA_32FC_SQUARE_DIST_SCALAR_MULT_ALIGNED16_H
+#define INCLUDED_QA_32FC_SQUARE_DIST_SCALAR_MULT_ALIGNED16_H
+
+#include <cppunit/extensions/HelperMacros.h>
+#include <cppunit/TestCase.h>
+
+class qa_32fc_square_dist_scalar_mult_aligned16 : public CppUnit::TestCase {
+
+  CPPUNIT_TEST_SUITE (qa_32fc_square_dist_scalar_mult_aligned16);
+  CPPUNIT_TEST (t1);
+  CPPUNIT_TEST_SUITE_END ();
+
+ private:
+  void t1 ();
+};
+
+
+#endif /* INCLUDED_QA_32FC_SQUARE_DIST_SCALAR_MULT_ALIGNED16_H */
diff --git a/volk/lib/qa_32s_and_aligned16.cc b/volk/lib/qa_32s_and_aligned16.cc
new file mode 100644
index 000000000..72d05cf6f
--- /dev/null
+++ b/volk/lib/qa_32s_and_aligned16.cc
@@ -0,0 +1,60 @@
+#include <volk/volk.h>
+#include <qa_32s_and_aligned16.h>
+#include <volk/volk_32s_and_aligned16.h>
+#include <cstdlib>
+
+//test for sse
+
+#ifndef LV_HAVE_SSE
+
+void qa_32s_and_aligned16::t1() {
+  printf("sse not available... no test performed\n");
+}
+
+#else
+
+void qa_32s_and_aligned16::t1() {
+  
+  volk_environment_init();
+  clock_t start, end;
+  double total;
+  const int vlen = 3201;
+  const int ITERS = 100000;
+  int32_t input0[vlen] __attribute__ ((aligned (16)));
+  int32_t input1[vlen] __attribute__ ((aligned (16)));
+  
+  int32_t output0[vlen] __attribute__ ((aligned (16)));
+  int32_t output01[vlen] __attribute__ ((aligned (16)));
+
+  for(int i = 0; i < vlen; ++i) {   
+    input0[i] = ((int32_t) (rand() - (RAND_MAX/2)));
+    input1[i] = ((int32_t) (rand() - (RAND_MAX/2)));
+  }
+  printf("32s_and_aligned\n");
+
+  start = clock();
+  for(int count = 0; count < ITERS; ++count) {
+    volk_32s_and_aligned16_manual(output0, input0, input1, vlen, "generic");
+  }
+  end = clock();
+  total = (double)(end-start)/(double)CLOCKS_PER_SEC;
+  printf("generic_time: %f\n", total);
+  start = clock();
+  for(int count = 0; count < ITERS; ++count) {
+    volk_32s_and_aligned16_manual(output01, input0, input1, vlen, "sse");
+  }
+  end = clock();
+  total = (double)(end-start)/(double)CLOCKS_PER_SEC;
+  printf("sse_time: %f\n", total);
+  for(int i = 0; i < 1; ++i) {
+    //printf("inputs: %d, %d\n", input0[i*2], input0[i*2 + 1]);
+    //printf("generic... %d, ssse3... %d\n", output0[i], output1[i]);
+  }
+  
+  for(int i = 0; i < vlen; ++i) {
+    //printf("%d...%d\n", output0[i], output01[i]);
+    CPPUNIT_ASSERT_EQUAL(output0[i], output01[i]);
+  }
+}
+
+#endif
diff --git a/volk/lib/qa_32s_and_aligned16.h b/volk/lib/qa_32s_and_aligned16.h
new file mode 100644
index 000000000..dfcb47c63
--- /dev/null
+++ b/volk/lib/qa_32s_and_aligned16.h
@@ -0,0 +1,18 @@
+#ifndef INCLUDED_QA_32S_AND_ALIGNED16_H
+#define INCLUDED_QA_32S_AND_ALIGNED16_H
+
+#include <cppunit/extensions/HelperMacros.h>
+#include <cppunit/TestCase.h>
+
+class qa_32s_and_aligned16 : public CppUnit::TestCase {
+
+  CPPUNIT_TEST_SUITE (qa_32s_and_aligned16);
+  CPPUNIT_TEST (t1);
+  CPPUNIT_TEST_SUITE_END ();
+
+ private:
+  void t1 ();
+};
+
+
+#endif /* INCLUDED_QA_32S_AND_ALIGNED16_H */
diff --git a/volk/lib/qa_32s_convert_32f_aligned16.cc b/volk/lib/qa_32s_convert_32f_aligned16.cc
new file mode 100644
index 000000000..eab3fe016
--- /dev/null
+++ b/volk/lib/qa_32s_convert_32f_aligned16.cc
@@ -0,0 +1,60 @@
+#include <volk/volk.h>
+#include <qa_32s_convert_32f_aligned16.h>
+#include <volk/volk_32s_convert_32f_aligned16.h>
+#include <cstdlib>
+
+//test for sse2
+
+#ifndef LV_HAVE_SSE2
+
+void qa_32s_convert_32f_aligned16::t1() {
+  printf("sse2 not available... no test performed\n");
+}
+
+#else
+
+void qa_32s_convert_32f_aligned16::t1() {
+  volk_environment_init();
+  clock_t start, end;
+  double total;
+  const int vlen = 3201;
+  const int ITERS = 100000;
+
+  int32_t input0[vlen] __attribute__ ((aligned (16)));
+  
+  float output_generic[vlen] __attribute__ ((aligned (16)));
+  float output_sse2[vlen] __attribute__ ((aligned (16)));
+
+  for(int i = 0; i < vlen; ++i) {   
+    input0[i] = ((int32_t)(((float) (rand() - (RAND_MAX/2))) / static_cast<float>((RAND_MAX/2)) * 32768.0));
+  }
+  printf("32s_convert_32f_aligned\n");
+
+  start = clock();
+  for(int count = 0; count < ITERS; ++count) {
+    volk_32s_convert_32f_aligned16_manual(output_generic, input0, 32768.0, vlen, "generic");
+  }
+  end = clock();
+  total = (double)(end-start)/(double)CLOCKS_PER_SEC;
+  printf("generic_time: %f\n", total);
+
+  start = clock();
+  for(int count = 0; count < ITERS; ++count) {
+    volk_32s_convert_32f_aligned16_manual(output_sse2, input0, 32768.0, vlen, "sse2");
+  }
+  end = clock();
+  total = (double)(end-start)/(double)CLOCKS_PER_SEC;
+  printf("sse2_time: %f\n", total);
+
+  for(int i = 0; i < 1; ++i) {
+    //printf("inputs: %d, %d\n", input0[i*2], input0[i*2 + 1]);
+    //printf("generic... %d, ssse3... %d\n", output0[i], output1[i]);
+  }
+  
+  for(int i = 0; i < vlen; ++i) {
+    //printf("%d...%d\n", output0[i], output01[i]);
+    CPPUNIT_ASSERT_EQUAL(output_generic[i], output_sse2[i]);
+  }
+}
+
+#endif
diff --git a/volk/lib/qa_32s_convert_32f_aligned16.h b/volk/lib/qa_32s_convert_32f_aligned16.h
new file mode 100644
index 000000000..efd2a2eea
--- /dev/null
+++ b/volk/lib/qa_32s_convert_32f_aligned16.h
@@ -0,0 +1,18 @@
+#ifndef INCLUDED_QA_32S_CONVERT_32F_ALIGNED16_H
+#define INCLUDED_QA_32S_CONVERT_32F_ALIGNED16_H
+
+#include <cppunit/extensions/HelperMacros.h>
+#include <cppunit/TestCase.h>
+
+class qa_32s_convert_32f_aligned16 : public CppUnit::TestCase {
+
+  CPPUNIT_TEST_SUITE (qa_32s_convert_32f_aligned16);
+  CPPUNIT_TEST (t1);
+  CPPUNIT_TEST_SUITE_END ();
+
+ private:
+  void t1 ();
+};
+
+
+#endif /* INCLUDED_QA_32S_CONVERT_32F_ALIGNED16_H */
diff --git a/volk/lib/qa_32s_convert_32f_unaligned16.cc b/volk/lib/qa_32s_convert_32f_unaligned16.cc
new file mode 100644
index 000000000..0e504cfa1
--- /dev/null
+++ b/volk/lib/qa_32s_convert_32f_unaligned16.cc
@@ -0,0 +1,60 @@
+#include <volk/volk.h>
+#include <qa_32s_convert_32f_unaligned16.h>
+#include <volk/volk_32s_convert_32f_unaligned16.h>
+#include <cstdlib>
+
+//test for sse2
+
+#ifndef LV_HAVE_SSE2
+
+void qa_32s_convert_32f_unaligned16::t1() {
+  printf("sse2 not available... no test performed\n");
+}
+
+#else
+
+void qa_32s_convert_32f_unaligned16::t1() {
+  volk_environment_init();
+  clock_t start, end;
+  double total;
+  const int vlen = 3201;
+  const int ITERS = 100000;
+
+  int32_t input0[vlen] __attribute__ ((aligned (16)));
+  
+  float output_generic[vlen] __attribute__ ((aligned (16)));
+  float output_sse2[vlen] __attribute__ ((aligned (16)));
+
+  for(int i = 0; i < vlen; ++i) {   
+    input0[i] = ((int32_t)(((float) (rand() - (RAND_MAX/2))) / static_cast<float>((RAND_MAX/2)) * 32768.0));
+  }
+  printf("32s_convert_32f_unaligned\n");
+
+  start = clock();
+  for(int count = 0; count < ITERS; ++count) {
+    volk_32s_convert_32f_unaligned16_manual(output_generic, input0, 32768.0, vlen, "generic");
+  }
+  end = clock();
+  total = (double)(end-start)/(double)CLOCKS_PER_SEC;
+  printf("generic_time: %f\n", total);
+
+  start = clock();
+  for(int count = 0; count < ITERS; ++count) {
+    volk_32s_convert_32f_unaligned16_manual(output_sse2, input0, 32768.0, vlen, "sse2");
+  }
+  end = clock();
+  total = (double)(end-start)/(double)CLOCKS_PER_SEC;
+  printf("sse2_time: %f\n", total);
+
+  for(int i = 0; i < 1; ++i) {
+    //printf("inputs: %d, %d\n", input0[i*2], input0[i*2 + 1]);
+    //printf("generic... %d, ssse3... %d\n", output0[i], output1[i]);
+  }
+  
+  for(int i = 0; i < vlen; ++i) {
+    //printf("%d...%d\n", output0[i], output01[i]);
+    CPPUNIT_ASSERT_EQUAL(output_generic[i], output_sse2[i]);
+  }
+}
+
+#endif
diff --git a/volk/lib/qa_32s_convert_32f_unaligned16.h b/volk/lib/qa_32s_convert_32f_unaligned16.h
new file mode 100644
index 000000000..5006f5fd8
--- /dev/null
+++ b/volk/lib/qa_32s_convert_32f_unaligned16.h
@@ -0,0 +1,18 @@
+#ifndef INCLUDED_QA_32S_CONVERT_32F_UNALIGNED16_H
+#define INCLUDED_QA_32S_CONVERT_32F_UNALIGNED16_H
+
+#include <cppunit/extensions/HelperMacros.h>
+#include <cppunit/TestCase.h>
+
+class qa_32s_convert_32f_unaligned16 : public CppUnit::TestCase {
+
+  CPPUNIT_TEST_SUITE (qa_32s_convert_32f_unaligned16);
+  CPPUNIT_TEST (t1);
+  CPPUNIT_TEST_SUITE_END ();
+
+ private:
+  void t1 ();
+};
+
+
+#endif /* INCLUDED_QA_32S_CONVERT_32F_UNALIGNED16_H */
diff --git a/volk/lib/qa_32s_or_aligned16.cc b/volk/lib/qa_32s_or_aligned16.cc
new file mode 100644
index 000000000..e09dfb91c
--- /dev/null
+++ b/volk/lib/qa_32s_or_aligned16.cc
@@ -0,0 +1,60 @@
+#include <volk/volk.h>
+#include <qa_32s_or_aligned16.h>
+#include <volk/volk_32s_or_aligned16.h>
+#include <cstdlib>
+
+//test for sse
+
+#ifndef LV_HAVE_SSE
+
+void qa_32s_or_aligned16::t1() {
+  printf("sse not available... no test performed\n");
+}
+
+#else
+
+void qa_32s_or_aligned16::t1() {
+  
+  volk_environment_init();
+  clock_t start, end;
+  double total;
+  const int vlen = 3201;
+  const int ITERS = 100000;
+  int32_t input0[vlen] __attribute__ ((aligned (16)));
+  int32_t input1[vlen] __attribute__ ((aligned (16)));
+  
+  int32_t output0[vlen] __attribute__ ((aligned (16)));
+  int32_t output01[vlen] __attribute__ ((aligned (16)));
+
+  for(int i = 0; i < vlen; ++i) {   
+    input0[i] = ((int32_t) (rand() - (RAND_MAX/2)));
+    input1[i] = ((int32_t) (rand() - (RAND_MAX/2)));
+  }
+  printf("32s_or_aligned\n");
+
+  start = clock();
+  for(int count = 0; count < ITERS; ++count) {
+    volk_32s_or_aligned16_manual(output0, input0, input1, vlen, "generic");
+  }
+  end = clock();
+  total = (double)(end-start)/(double)CLOCKS_PER_SEC;
+  printf("generic_time: %f\n", total);
+  start = clock();
+  for(int count = 0; count < ITERS; ++count) {
+    volk_32s_or_aligned16_manual(output01, input0, input1, vlen, "sse");
+  }
+  end = clock();
+  total = (double)(end-start)/(double)CLOCKS_PER_SEC;
+  printf("sse_time: %f\n", total);
+  for(int i = 0; i < 1; ++i) {
+    //printf("inputs: %d, %d\n", input0[i*2], input0[i*2 + 1]);
+    //printf("generic... %d, ssse3... %d\n", output0[i], output1[i]);
+  }
+  
+  for(int i = 0; i < vlen; ++i) {
+    //printf("%d...%d\n", output0[i], output01[i]);
+    CPPUNIT_ASSERT_EQUAL(output0[i], output01[i]);
+  }
+}
+
+#endif
diff --git a/volk/lib/qa_32s_or_aligned16.h b/volk/lib/qa_32s_or_aligned16.h
new file mode 100644
index 000000000..9e949eb52
--- /dev/null
+++ b/volk/lib/qa_32s_or_aligned16.h
@@ -0,0 +1,18 @@
+#ifndef INCLUDED_QA_32S_OR_ALIGNED16_H
+#define INCLUDED_QA_32S_OR_ALIGNED16_H
+
+#include <cppunit/extensions/HelperMacros.h>
+#include <cppunit/TestCase.h>
+
+class qa_32s_or_aligned16 : public CppUnit::TestCase {
+
+  CPPUNIT_TEST_SUITE (qa_32s_or_aligned16);
+  CPPUNIT_TEST (t1);
+  CPPUNIT_TEST_SUITE_END ();
+
+ private:
+  void t1 ();
+};
+
+
+#endif /* INCLUDED_QA_32S_OR_ALIGNED16_H */
diff --git a/volk/lib/qa_32u_byteswap_aligned16.cc b/volk/lib/qa_32u_byteswap_aligned16.cc
new file mode 100644
index 000000000..8b1023876
--- /dev/null
+++ b/volk/lib/qa_32u_byteswap_aligned16.cc
@@ -0,0 +1,59 @@
+#include <volk/volk.h>
+#include <qa_32u_byteswap_aligned16.h>
+#include <volk/volk_32u_byteswap_aligned16.h>
+#include <cstdlib>
+#include <cstring>
+
+//test for sse
+
+#ifndef LV_HAVE_SSE2
+
+void qa_32u_byteswap_aligned16::t1() {
+  printf("sse2 not available... no test performed\n");
+}
+
+#else
+
+void qa_32u_byteswap_aligned16::t1() {
+  
+  volk_environment_init();
+  clock_t start, end;
+  double total;
+  const int vlen = 3201;
+  const int ITERS = 100001;
+  
+  uint32_t output0[vlen] __attribute__ ((aligned (16)));
+  uint32_t output01[vlen] __attribute__ ((aligned (16)));
+
+  for(int i = 0; i < vlen; ++i) {   
+    output0[i] = (uint32_t) ((rand() - (RAND_MAX/2)) / (RAND_MAX/2));
+  }
+  memcpy(output01, output0, vlen*sizeof(uint32_t));
+  printf("32u_byteswap_aligned\n");
+
+  start = clock();
+  for(int count = 0; count < ITERS; ++count) {
+    volk_32u_byteswap_aligned16_manual(output0, vlen, "generic");
+  }
+  end = clock();
+  total = (double)(end-start)/(double)CLOCKS_PER_SEC;
+  printf("generic_time: %f\n", total);
+  start = clock();
+  for(int count = 0; count < ITERS; ++count) {
+    volk_32u_byteswap_aligned16_manual(output01, vlen, "sse2");
+  }
+  end = clock();
+  total = (double)(end-start)/(double)CLOCKS_PER_SEC;
+  printf("sse2_time: %f\n", total);
+  for(int i = 0; i < 1; ++i) {
+    //printf("inputs: %d, %d\n", input0[i*2], input0[i*2 + 1]);
+    //printf("generic... %d, ssse3... %d\n", output0[i], output1[i]);
+  }
+  
+  for(int i = 0; i < vlen; ++i) {
+    //printf("%d...%d\n", output0[i], output01[i]);
+    CPPUNIT_ASSERT_EQUAL(output0[i], output01[i]);
+  }
+}
+
+#endif
diff --git a/volk/lib/qa_32u_byteswap_aligned16.h b/volk/lib/qa_32u_byteswap_aligned16.h
new file mode 100644
index 000000000..47bad4c3d
--- /dev/null
+++ b/volk/lib/qa_32u_byteswap_aligned16.h
@@ -0,0 +1,18 @@
+#ifndef INCLUDED_QA_32U_BYTESWAP_ALIGNED16_H
+#define INCLUDED_QA_32U_BYTESWAP_ALIGNED16_H
+
+#include <cppunit/extensions/HelperMacros.h>
+#include <cppunit/TestCase.h>
+
+class qa_32u_byteswap_aligned16 : public CppUnit::TestCase {
+
+  CPPUNIT_TEST_SUITE (qa_32u_byteswap_aligned16);
+  CPPUNIT_TEST (t1);
+  CPPUNIT_TEST_SUITE_END ();
+
+ private:
+  void t1 ();
+};
+
+
+#endif /* INCLUDED_QA_32U_BYTESWAP_ALIGNED16_H */
diff --git a/volk/lib/qa_32u_popcnt_aligned16.cc b/volk/lib/qa_32u_popcnt_aligned16.cc
new file mode 100644
index 000000000..49fcddeb2
--- /dev/null
+++ b/volk/lib/qa_32u_popcnt_aligned16.cc
@@ -0,0 +1,61 @@
+#include <volk/volk_runtime.h>
+#include <volk/volk.h>
+#include <qa_32u_popcnt_aligned16.h>
+#include <volk/volk_32u_popcnt_aligned16.h>
+#include <cstdlib>
+
+//test for sse
+
+#ifndef LV_HAVE_SSE4_2
+
+void qa_32u_popcnt_aligned16::t1() {
+  printf("sse4.2 not available... no test performed\n");
+}
+
+#else
+
+void qa_32u_popcnt_aligned16::t1() {
+  
+  
+  volk_runtime_init();
+
+  volk_environment_init();
+  clock_t start, end;
+  double total;
+
+  const int ITERS = 10000000;
+  uint32_t input0 __attribute__ ((aligned (16)));
+  
+  uint32_t output0 __attribute__ ((aligned (16)));
+  uint32_t output01 __attribute__ ((aligned (16)));
+
+    input0 = ((uint32_t) (rand() - (RAND_MAX/2)));
+    output0 = 0;
+    output01 = 0;
+
+  printf("32u_popcnt_aligned\n");
+
+  start = clock();
+  uint32_t ret = 0;
+  for(int count = 0; count < ITERS; ++count) {
+    volk_32u_popcnt_aligned16_manual(&ret, input0, "generic");
+    output0 += ret;
+  }
+  end = clock();
+  total = (double)(end-start)/(double)CLOCKS_PER_SEC;
+  printf("generic_time: %f\n", total);
+  start = clock();
+  ret = 0;
+  for(int count = 0; count < ITERS; ++count) {
+    get_volk_runtime()->volk_32u_popcnt_aligned16(&ret, input0);
+    output01 += ret;
+  }
+  end = clock();
+  total = (double)(end-start)/(double)CLOCKS_PER_SEC;
+  printf("sse4.2_time: %f\n", total);
+
+  
+  CPPUNIT_ASSERT_EQUAL(output0, output01);
+}
+
+#endif
diff --git a/volk/lib/qa_32u_popcnt_aligned16.h b/volk/lib/qa_32u_popcnt_aligned16.h
new file mode 100644
index 000000000..fa1dc1041
--- /dev/null
+++ b/volk/lib/qa_32u_popcnt_aligned16.h
@@ -0,0 +1,18 @@
+#ifndef INCLUDED_QA_32U_POPCNT_ALIGNED16_H
+#define INCLUDED_QA_32U_POPCNT_ALIGNED16_H
+
+#include <cppunit/extensions/HelperMacros.h>
+#include <cppunit/TestCase.h>
+
+class qa_32u_popcnt_aligned16 : public CppUnit::TestCase {
+
+  CPPUNIT_TEST_SUITE (qa_32u_popcnt_aligned16);
+  CPPUNIT_TEST (t1);
+  CPPUNIT_TEST_SUITE_END ();
+
+ private:
+  void t1 ();
+};
+
+
+#endif /* INCLUDED_QA_32U_POPCNT_ALIGNED16_H */
diff --git a/volk/lib/qa_64f_convert_32f_aligned16.cc b/volk/lib/qa_64f_convert_32f_aligned16.cc
new file mode 100644
index 000000000..0eaebf00a
--- /dev/null
+++ b/volk/lib/qa_64f_convert_32f_aligned16.cc
@@ -0,0 +1,60 @@
+#include <volk/volk.h>
+#include <qa_64f_convert_32f_aligned16.h>
+#include <volk/volk_64f_convert_32f_aligned16.h>
+#include <cstdlib>
+
+//test for sse2
+
+#ifndef LV_HAVE_SSE2
+
+void qa_64f_convert_32f_aligned16::t1() {
+  printf("sse2 not available... no test performed\n");
+}
+
+#else
+
+void qa_64f_convert_32f_aligned16::t1() {
+  volk_environment_init();
+  clock_t start, end;
+  double total;
+  const int vlen = 3201;
+  const int ITERS = 100000;
+
+  double input0[vlen] __attribute__ ((aligned (16)));
+  
+  float output_generic[vlen] __attribute__ ((aligned (16)));
+  float output_sse2[vlen] __attribute__ ((aligned (16)));
+
+  for(int i = 0; i < vlen; ++i) {   
+    input0[i] = ((double) (rand() - (RAND_MAX/2))) / static_cast<double>((RAND_MAX/2));
+  }
+  printf("64f_convert_32f_aligned\n");
+
+  start = clock();
+  for(int count = 0; count < ITERS; ++count) {
+    volk_64f_convert_32f_aligned16_manual(output_generic, input0, vlen, "generic");
+  }
+  end = clock();
+  total = (double)(end-start)/(double)CLOCKS_PER_SEC;
+  printf("generic_time: %f\n", total);
+
+  start = clock();
+  for(int count = 0; count < ITERS; ++count) {
+    volk_64f_convert_32f_aligned16_manual(output_sse2, input0, vlen, "sse2");
+  }
+  end = clock();
+  total = (double)(end-start)/(double)CLOCKS_PER_SEC;
+  printf("sse2_time: %f\n", total);
+
+  for(int i = 0; i < 1; ++i) {
+    //printf("inputs: %d, %d\n", input0[i*2], input0[i*2 + 1]);
+    //printf("generic... %d, ssse3... %d\n", output0[i], output1[i]);
+  }
+  
+  for(int i = 0; i < vlen; ++i) {
+    //printf("%d...%d\n", output0[i], output01[i]);
+    CPPUNIT_ASSERT_EQUAL(output_generic[i], output_sse2[i]);
+  }
+}
+
+#endif
diff --git a/volk/lib/qa_64f_convert_32f_aligned16.h b/volk/lib/qa_64f_convert_32f_aligned16.h
new file mode 100644
index 000000000..95d79f73d
--- /dev/null
+++ b/volk/lib/qa_64f_convert_32f_aligned16.h
@@ -0,0 +1,18 @@
+#ifndef INCLUDED_QA_64F_CONVERT_32F_ALIGNED16_H
+#define INCLUDED_QA_64F_CONVERT_32F_ALIGNED16_H
+
+#include <cppunit/extensions/HelperMacros.h>
+#include <cppunit/TestCase.h>
+
+class qa_64f_convert_32f_aligned16 : public CppUnit::TestCase {
+
+  CPPUNIT_TEST_SUITE (qa_64f_convert_32f_aligned16);
+  CPPUNIT_TEST (t1);
+  CPPUNIT_TEST_SUITE_END ();
+
+ private:
+  void t1 ();
+};
+
+
+#endif /* INCLUDED_QA_64F_CONVERT_32F_ALIGNED16_H */
diff --git a/volk/lib/qa_64f_convert_32f_unaligned16.cc b/volk/lib/qa_64f_convert_32f_unaligned16.cc
new file mode 100644
index 000000000..dcf94bd27
--- /dev/null
+++ b/volk/lib/qa_64f_convert_32f_unaligned16.cc
@@ -0,0 +1,60 @@
+#include <volk/volk.h>
+#include <qa_64f_convert_32f_unaligned16.h>
+#include <volk/volk_64f_convert_32f_unaligned16.h>
+#include <cstdlib>
+
+//test for sse2
+
+#ifndef LV_HAVE_SSE2
+
+void qa_64f_convert_32f_unaligned16::t1() {
+  printf("sse2 not available... no test performed\n");
+}
+
+#else
+
+void qa_64f_convert_32f_unaligned16::t1() {
+  volk_environment_init();
+  clock_t start, end;
+  double total;
+  const int vlen = 3201;
+  const int ITERS = 100000;
+
+  double input0[vlen] __attribute__ ((aligned (16)));
+  
+  float output_generic[vlen] __attribute__ ((aligned (16)));
+  float output_sse2[vlen] __attribute__ ((aligned (16)));
+
+  for(int i = 0; i < vlen; ++i) {   
+    input0[i] = ((double) (rand() - (RAND_MAX/2))) / static_cast<double>((RAND_MAX/2));
+  }
+  printf("64f_convert_32f_unaligned\n");
+
+  start = clock();
+  for(int count = 0; count < ITERS; ++count) {
+    volk_64f_convert_32f_unaligned16_manual(output_generic, input0, vlen, "generic");
+  }
+  end = clock();
+  total = (double)(end-start)/(double)CLOCKS_PER_SEC;
+  printf("generic_time: %f\n", total);
+
+  start = clock();
+  for(int count = 0; count < ITERS; ++count) {
+    volk_64f_convert_32f_unaligned16_manual(output_sse2, input0, vlen, "sse2");
+  }
+  end = clock();
+  total = (double)(end-start)/(double)CLOCKS_PER_SEC;
+  printf("sse2_time: %f\n", total);
+
+  for(int i = 0; i < 1; ++i) {
+    //printf("inputs: %d, %d\n", input0[i*2], input0[i*2 + 1]);
+    //printf("generic... %d, ssse3... %d\n", output0[i], output1[i]);
+  }
+  
+  for(int i = 0; i < vlen; ++i) {
+    //printf("%d...%d\n", output0[i], output01[i]);
+    CPPUNIT_ASSERT_EQUAL(output_generic[i], output_sse2[i]);
+  }
+}
+
+#endif
diff --git a/volk/lib/qa_64f_convert_32f_unaligned16.h b/volk/lib/qa_64f_convert_32f_unaligned16.h
new file mode 100644
index 000000000..430327e81
--- /dev/null
+++ b/volk/lib/qa_64f_convert_32f_unaligned16.h
@@ -0,0 +1,18 @@
+#ifndef INCLUDED_QA_64F_CONVERT_32F_UNALIGNED16_H
+#define INCLUDED_QA_64F_CONVERT_32F_UNALIGNED16_H
+
+#include <cppunit/extensions/HelperMacros.h>
+#include <cppunit/TestCase.h>
+
+class qa_64f_convert_32f_unaligned16 : public CppUnit::TestCase {
+
+  CPPUNIT_TEST_SUITE (qa_64f_convert_32f_unaligned16);
+  CPPUNIT_TEST (t1);
+  CPPUNIT_TEST_SUITE_END ();
+
+ private:
+  void t1 ();
+};
+
+
+#endif /* INCLUDED_QA_64F_CONVERT_32F_UNALIGNED16_H */
diff --git a/volk/lib/qa_64f_max_aligned16.cc b/volk/lib/qa_64f_max_aligned16.cc
new file mode 100644
index 000000000..41ab078b0
--- /dev/null
+++ b/volk/lib/qa_64f_max_aligned16.cc
@@ -0,0 +1,60 @@
+#include <volk/volk.h>
+#include <qa_64f_max_aligned16.h>
+#include <volk/volk_64f_max_aligned16.h>
+#include <cstdlib>
+
+//test for sse
+
+#ifndef LV_HAVE_SSE2
+
+void qa_64f_max_aligned16::t1() {
+  printf("sse2 not available... no test performed\n");
+}
+
+#else
+
+void qa_64f_max_aligned16::t1() {
+  
+  volk_environment_init();
+  clock_t start, end;
+  double total;
+  const int vlen = 3201;
+  const int ITERS = 100000;
+  double input0[vlen] __attribute__ ((aligned (16)));
+  double input1[vlen] __attribute__ ((aligned (16)));
+  
+  double output0[vlen] __attribute__ ((aligned (16)));
+  double output01[vlen] __attribute__ ((aligned (16)));
+
+  for(int i = 0; i < vlen; ++i) {   
+    input0[i] = ((double) (rand() - (RAND_MAX/2))) / static_cast<double>((RAND_MAX/2));
+    input1[i] = ((double) (rand() - (RAND_MAX/2))) / static_cast<double>((RAND_MAX/2));
+  }
+  printf("64f_max_aligned\n");
+
+  start = clock();
+  for(int count = 0; count < ITERS; ++count) {
+    volk_64f_max_aligned16_manual(output0, input0, input1, vlen, "generic");
+  }
+  end = clock();
+  total = (double)(end-start)/(double)CLOCKS_PER_SEC;
+  printf("generic_time: %f\n", total);
+  start = clock();
+  for(int count = 0; count < ITERS; ++count) {
+    volk_64f_max_aligned16_manual(output01, input0, input1, vlen, "sse2");
+  }
+  end = clock();
+  total = (double)(end-start)/(double)CLOCKS_PER_SEC;
+  printf("sse_time: %f\n", total);
+  for(int i = 0; i < 1; ++i) {
+    //printf("inputs: %d, %d\n", input0[i*2], input0[i*2 + 1]);
+    //printf("generic... %d, ssse3... %d\n", output0[i], output1[i]);
+  }
+  
+  for(int i = 0; i < vlen; ++i) {
+    //printf("%d...%d\n", output0[i], output01[i]);
+    CPPUNIT_ASSERT_EQUAL(output0[i], output01[i]);
+  }
+}
+
+#endif
diff --git a/volk/lib/qa_64f_max_aligned16.h b/volk/lib/qa_64f_max_aligned16.h
new file mode 100644
index 000000000..7cbd4d4c1
--- /dev/null
+++ b/volk/lib/qa_64f_max_aligned16.h
@@ -0,0 +1,18 @@
+#ifndef INCLUDED_QA_64F_MAX_ALIGNED16_H
+#define INCLUDED_QA_64F_MAX_ALIGNED16_H
+
+#include <cppunit/extensions/HelperMacros.h>
+#include <cppunit/TestCase.h>
+
+class qa_64f_max_aligned16 : public CppUnit::TestCase {
+
+  CPPUNIT_TEST_SUITE (qa_64f_max_aligned16);
+  CPPUNIT_TEST (t1);
+  CPPUNIT_TEST_SUITE_END ();
+
+ private:
+  void t1 ();
+};
+
+
+#endif /* INCLUDED_QA_64F_MAX_ALIGNED16_H */
diff --git a/volk/lib/qa_64f_min_aligned16.cc b/volk/lib/qa_64f_min_aligned16.cc
new file mode 100644
index 000000000..b4664d065
--- /dev/null
+++ b/volk/lib/qa_64f_min_aligned16.cc
@@ -0,0 +1,60 @@
+#include <volk/volk.h>
+#include <qa_64f_min_aligned16.h>
+#include <volk/volk_64f_min_aligned16.h>
+#include <cstdlib>
+
+//test for sse
+
+#ifndef LV_HAVE_SSE2
+
+void qa_64f_min_aligned16::t1() {
+  printf("sse2 not available... no test performed\n");
+}
+
+#else
+
+void qa_64f_min_aligned16::t1() {
+  
+  volk_environment_init();
+  clock_t start, end;
+  double total;
+  const int vlen = 3201;
+  const int ITERS = 100000;
+  double input0[vlen] __attribute__ ((aligned (16)));
+  double input1[vlen] __attribute__ ((aligned (16)));
+  
+  double output0[vlen] __attribute__ ((aligned (16)));
+  double output01[vlen] __attribute__ ((aligned (16)));
+
+  for(int i = 0; i < vlen; ++i) {   
+    input0[i] = ((double) (rand() - (RAND_MAX/2))) / static_cast<double>((RAND_MAX/2));
+    input1[i] = ((double) (rand() - (RAND_MAX/2))) / static_cast<double>((RAND_MAX/2));
+  }
+  printf("64f_min_aligned\n");
+
+  start = clock();
+  for(int count = 0; count < ITERS; ++count) {
+    volk_64f_min_aligned16_manual(output0, input0, input1, vlen, "generic");
+  }
+  end = clock();
+  total = (double)(end-start)/(double)CLOCKS_PER_SEC;
+  printf("generic_time: %f\n", total);
+  start = clock();
+  for(int count = 0; count < ITERS; ++count) {
+    volk_64f_min_aligned16_manual(output01, input0, input1, vlen, "sse2");
+  }
+  end = clock();
+  total = (double)(end-start)/(double)CLOCKS_PER_SEC;
+  printf("sse_time: %f\n", total);
+  for(int i = 0; i < 1; ++i) {
+    //printf("inputs: %d, %d\n", input0[i*2], input0[i*2 + 1]);
+    //printf("generic... %d, ssse3... %d\n", output0[i], output1[i]);
+  }
+  
+  for(int i = 0; i < vlen; ++i) {
+    //printf("%d...%d\n", output0[i], output01[i]);
+    CPPUNIT_ASSERT_EQUAL(output0[i], output01[i]);
+  }
+}
+
+#endif
diff --git a/volk/lib/qa_64f_min_aligned16.h b/volk/lib/qa_64f_min_aligned16.h
new file mode 100644
index 000000000..a0e95395f
--- /dev/null
+++ b/volk/lib/qa_64f_min_aligned16.h
@@ -0,0 +1,18 @@
+#ifndef INCLUDED_QA_64F_MIN_ALIGNED16_H
+#define INCLUDED_QA_64F_MIN_ALIGNED16_H
+
+#include <cppunit/extensions/HelperMacros.h>
+#include <cppunit/TestCase.h>
+
+class qa_64f_min_aligned16 : public CppUnit::TestCase {
+
+  CPPUNIT_TEST_SUITE (qa_64f_min_aligned16);
+  CPPUNIT_TEST (t1);
+  CPPUNIT_TEST_SUITE_END ();
+
+ private:
+  void t1 ();
+};
+
+
+#endif /* INCLUDED_QA_64F_MIN_ALIGNED16_H */
diff --git a/volk/lib/qa_64u_byteswap_aligned16.cc b/volk/lib/qa_64u_byteswap_aligned16.cc
new file mode 100644
index 000000000..4f5d4d02b
--- /dev/null
+++ b/volk/lib/qa_64u_byteswap_aligned16.cc
@@ -0,0 +1,59 @@
+#include <volk/volk.h>
+#include <qa_64u_byteswap_aligned16.h>
+#include <volk/volk_64u_byteswap_aligned16.h>
+#include <cstdlib>
+#include <cstring>
+
+//test for sse
+
+#ifndef LV_HAVE_SSE2
+
+void qa_64u_byteswap_aligned16::t1() {
+  printf("sse2 not available... no test performed\n");
+}
+
+#else
+
+void qa_64u_byteswap_aligned16::t1() {
+  
+  volk_environment_init();
+  clock_t start, end;
+  double total;
+  const int vlen = 3201;
+  const int ITERS = 100001;
+  
+  uint64_t output0[vlen] __attribute__ ((aligned (16)));
+  uint64_t output01[vlen] __attribute__ ((aligned (16)));
+
+  for(int i = 0; i < vlen; ++i) {   
+    output0[i] = (uint64_t) ((rand() - (RAND_MAX/2)) / (RAND_MAX/2));
+  }
+  memcpy(output01, output0, vlen*sizeof(uint64_t));
+  printf("64u_byteswap_aligned\n");
+
+  start = clock();
+  for(int count = 0; count < ITERS; ++count) {
+    volk_64u_byteswap_aligned16_manual(output0, vlen, "generic");
+  }
+  end = clock();
+  total = (double)(end-start)/(double)CLOCKS_PER_SEC;
+  printf("generic_time: %f\n", total);
+  start = clock();
+  for(int count = 0; count < ITERS; ++count) {
+    volk_64u_byteswap_aligned16_manual(output01, vlen, "sse2");
+  }
+  end = clock();
+  total = (double)(end-start)/(double)CLOCKS_PER_SEC;
+  printf("sse2_time: %f\n", total);
+  for(int i = 0; i < 1; ++i) {
+    //printf("inputs: %d, %d\n", input0[i*2], input0[i*2 + 1]);
+    //printf("generic... %d, ssse3... %d\n", output0[i], output1[i]);
+  }
+  
+  for(int i = 0; i < vlen; ++i) {
+    //printf("%d...%d\n", output0[i], output01[i]);
+    CPPUNIT_ASSERT_EQUAL(output0[i], output01[i]);
+  }
+}
+
+#endif
diff --git a/volk/lib/qa_64u_byteswap_aligned16.h b/volk/lib/qa_64u_byteswap_aligned16.h
new file mode 100644
index 000000000..a4fa0c983
--- /dev/null
+++ b/volk/lib/qa_64u_byteswap_aligned16.h
@@ -0,0 +1,18 @@
+#ifndef INCLUDED_QA_64U_BYTESWAP_ALIGNED16_H
+#define INCLUDED_QA_64U_BYTESWAP_ALIGNED16_H
+
+#include <cppunit/extensions/HelperMacros.h>
+#include <cppunit/TestCase.h>
+
+class qa_64u_byteswap_aligned16 : public CppUnit::TestCase {
+
+  CPPUNIT_TEST_SUITE (qa_64u_byteswap_aligned16);
+  CPPUNIT_TEST (t1);
+  CPPUNIT_TEST_SUITE_END ();
+
+ private:
+  void t1 ();
+};
+
+
+#endif /* INCLUDED_QA_64U_BYTESWAP_ALIGNED16_H */
diff --git a/volk/lib/qa_64u_popcnt_aligned16.cc b/volk/lib/qa_64u_popcnt_aligned16.cc
new file mode 100644
index 000000000..bce9ff6c2
--- /dev/null
+++ b/volk/lib/qa_64u_popcnt_aligned16.cc
@@ -0,0 +1,61 @@
+#include <volk/volk_runtime.h>
+#include <volk/volk.h>
+#include <qa_64u_popcnt_aligned16.h>
+#include <volk/volk_64u_popcnt_aligned16.h>
+#include <cstdlib>
+
+//test for sse
+
+#ifndef LV_HAVE_SSE4_2
+
+void qa_64u_popcnt_aligned16::t1() {
+  printf("sse4.2 not available... no test performed\n");
+}
+
+#else
+
+void qa_64u_popcnt_aligned16::t1() {
+  
+  
+  volk_runtime_init();
+
+  volk_environment_init();
+  clock_t start, end;
+  double total;
+
+  const int ITERS = 10000000;
+  uint64_t input0 __attribute__ ((aligned (16)));
+  
+  uint64_t output0 __attribute__ ((aligned (16)));
+  uint64_t output01 __attribute__ ((aligned (16)));
+
+    input0 = ((uint64_t) (rand() - (RAND_MAX/2)));
+    output0 = 0;
+    output01 = 0;
+
+  printf("64u_popcnt_aligned\n");
+
+  start = clock();
+  uint64_t ret = 0;
+  for(int count = 0; count < ITERS; ++count) {
+    volk_64u_popcnt_aligned16_manual(&ret, input0, "generic");
+    output0 += ret;
+  }
+  end = clock();
+  total = (double)(end-start)/(double)CLOCKS_PER_SEC;
+  printf("generic_time: %f\n", total);
+  start = clock();
+  ret = 0;
+  for(int count = 0; count < ITERS; ++count) {
+    get_volk_runtime()->volk_64u_popcnt_aligned16(&ret, input0);
+    output01 += ret;
+  }
+  end = clock();
+  total = (double)(end-start)/(double)CLOCKS_PER_SEC;
+  printf("sse4.2_time: %f\n", total);
+
+  
+  CPPUNIT_ASSERT_EQUAL(output0, output01);
+}
+
+#endif
diff --git a/volk/lib/qa_64u_popcnt_aligned16.h b/volk/lib/qa_64u_popcnt_aligned16.h
new file mode 100644
index 000000000..217822d6e
--- /dev/null
+++ b/volk/lib/qa_64u_popcnt_aligned16.h
@@ -0,0 +1,18 @@
+#ifndef INCLUDED_QA_64U_POPCNT_ALIGNED16_H
+#define INCLUDED_QA_64U_POPCNT_ALIGNED16_H
+
+#include <cppunit/extensions/HelperMacros.h>
+#include <cppunit/TestCase.h>
+
+class qa_64u_popcnt_aligned16 : public CppUnit::TestCase {
+
+  CPPUNIT_TEST_SUITE (qa_64u_popcnt_aligned16);
+  CPPUNIT_TEST (t1);
+  CPPUNIT_TEST_SUITE_END ();
+
+ private:
+  void t1 ();
+};
+
+
+#endif /* INCLUDED_QA_64U_POPCNT_ALIGNED16_H */
diff --git a/volk/lib/qa_8s_convert_16s_aligned16.cc b/volk/lib/qa_8s_convert_16s_aligned16.cc
new file mode 100644
index 000000000..35f08fb81
--- /dev/null
+++ b/volk/lib/qa_8s_convert_16s_aligned16.cc
@@ -0,0 +1,63 @@
+#include <volk/volk_runtime.h>
+#include <volk/volk.h>
+#include <qa_8s_convert_16s_aligned16.h>
+#include <volk/volk_8s_convert_16s_aligned16.h>
+#include <cstdlib>
+
+//test for sse4_1
+
+#ifndef LV_HAVE_SSE4_1
+
+void qa_8s_convert_16s_aligned16::t1() {
+  printf("sse4.1 not available... no test performed\n");
+}
+
+#else
+
+void qa_8s_convert_16s_aligned16::t1() {
+  
+  volk_runtime_init();
+
+  volk_environment_init();
+  clock_t start, end;
+  double total;
+  const int vlen = 3201;
+  const int ITERS = 100000;
+  int8_t input0[vlen] __attribute__ ((aligned (16)));
+  
+  int16_t output_generic[vlen] __attribute__ ((aligned (16)));
+  int16_t output_sse4_1[vlen] __attribute__ ((aligned (16)));
+
+  for(int i = 0; i < vlen; ++i) {   
+    input0[i] = ((int8_t)(((float) (rand() - (RAND_MAX/2))) / static_cast<float>((RAND_MAX/2)) * 128.0));
+  }
+  printf("8s_convert_16s_aligned\n");
+
+  start = clock();
+  for(int count = 0; count < ITERS; ++count) {
+    volk_8s_convert_16s_aligned16_manual(output_generic, input0, vlen, "generic");
+  }
+  end = clock();
+  total = (double)(end-start)/(double)CLOCKS_PER_SEC;
+  printf("generic_time: %f\n", total);
+
+  start = clock();
+  for(int count = 0; count < ITERS; ++count) {
+    get_volk_runtime()->volk_8s_convert_16s_aligned16(output_sse4_1, input0, vlen);
+  }
+  end = clock();
+  total = (double)(end-start)/(double)CLOCKS_PER_SEC;
+  printf("sse4_1_time: %f\n", total);
+
+  for(int i = 0; i < 1; ++i) {
+    //printf("inputs: %d, %d\n", input0[i*2], input0[i*2 + 1]);
+    //printf("generic... %d, ssse3... %d\n", output0[i], output1[i]);
+  }
+  
+  for(int i = 0; i < vlen; ++i) {
+    //printf("%d...%d\n", output0[i], output01[i]);
+    CPPUNIT_ASSERT_EQUAL(output_generic[i], output_sse4_1[i]);
+  }
+}
+
+#endif
diff --git a/volk/lib/qa_8s_convert_16s_aligned16.h b/volk/lib/qa_8s_convert_16s_aligned16.h
new file mode 100644
index 000000000..38739fc96
--- /dev/null
+++ b/volk/lib/qa_8s_convert_16s_aligned16.h
@@ -0,0 +1,18 @@
+#ifndef INCLUDED_QA_8S_CONVERT_16S_ALIGNED16_H
+#define INCLUDED_QA_8S_CONVERT_16S_ALIGNED16_H
+
+#include <cppunit/extensions/HelperMacros.h>
+#include <cppunit/TestCase.h>
+
+class qa_8s_convert_16s_aligned16 : public CppUnit::TestCase {
+
+  CPPUNIT_TEST_SUITE (qa_8s_convert_16s_aligned16);
+  CPPUNIT_TEST (t1);
+  CPPUNIT_TEST_SUITE_END ();
+
+ private:
+  void t1 ();
+};
+
+
+#endif /* INCLUDED_QA_8S_CONVERT_16S_ALIGNED16_H */
diff --git a/volk/lib/qa_8s_convert_16s_unaligned16.cc b/volk/lib/qa_8s_convert_16s_unaligned16.cc
new file mode 100644
index 000000000..bb326f818
--- /dev/null
+++ b/volk/lib/qa_8s_convert_16s_unaligned16.cc
@@ -0,0 +1,63 @@
+#include <volk/volk_runtime.h>
+#include <volk/volk.h>
+#include <qa_8s_convert_16s_unaligned16.h>
+#include <volk/volk_8s_convert_16s_unaligned16.h>
+#include <cstdlib>
+
+//test for sse4_1
+
+#ifndef LV_HAVE_SSE4_1
+
+void qa_8s_convert_16s_unaligned16::t1() {
+  printf("sse4.1 not available... no test performed\n");
+}
+
+#else
+
+void qa_8s_convert_16s_unaligned16::t1() {
+  
+  volk_runtime_init();
+
+  volk_environment_init();
+  clock_t start, end;
+  double total;
+  const int vlen = 3201;
+  const int ITERS = 100000;
+  int8_t input0[vlen] __attribute__ ((aligned (16)));
+  
+  int16_t output_generic[vlen] __attribute__ ((aligned (16)));
+  int16_t output_sse4_1[vlen] __attribute__ ((aligned (16)));
+
+  for(int i = 0; i < vlen; ++i) {   
+    input0[i] = ((int8_t)(((float) (rand() - (RAND_MAX/2))) / static_cast<float>((RAND_MAX/2)) * 128.0));
+  }
+  printf("8s_convert_16s_unaligned\n");
+
+  start = clock();
+  for(int count = 0; count < ITERS; ++count) {
+    volk_8s_convert_16s_unaligned16_manual(output_generic, input0, vlen, "generic");
+  }
+  end = clock();
+  total = (double)(end-start)/(double)CLOCKS_PER_SEC;
+  printf("generic_time: %f\n", total);
+
+  start = clock();
+  for(int count = 0; count < ITERS; ++count) {
+    get_volk_runtime()->volk_8s_convert_16s_unaligned16(output_sse4_1, input0, vlen);
+  }
+  end = clock();
+  total = (double)(end-start)/(double)CLOCKS_PER_SEC;
+  printf("sse4_1_time: %f\n", total);
+
+  for(int i = 0; i < 1; ++i) {
+    //printf("inputs: %d, %d\n", input0[i*2], input0[i*2 + 1]);
+    //printf("generic... %d, ssse3... %d\n", output0[i], output1[i]);
+  }
+  
+  for(int i = 0; i < vlen; ++i) {
+    //printf("%d...%d\n", output0[i], output01[i]);
+    CPPUNIT_ASSERT_EQUAL(output_generic[i], output_sse4_1[i]);
+  }
+}
+
+#endif
diff --git a/volk/lib/qa_8s_convert_16s_unaligned16.h b/volk/lib/qa_8s_convert_16s_unaligned16.h
new file mode 100644
index 000000000..d39fffc35
--- /dev/null
+++ b/volk/lib/qa_8s_convert_16s_unaligned16.h
@@ -0,0 +1,18 @@
+#ifndef INCLUDED_QA_8S_CONVERT_16S_UNALIGNED16_H
+#define INCLUDED_QA_8S_CONVERT_16S_UNALIGNED16_H
+
+#include <cppunit/extensions/HelperMacros.h>
+#include <cppunit/TestCase.h>
+
+class qa_8s_convert_16s_unaligned16 : public CppUnit::TestCase {
+
+  CPPUNIT_TEST_SUITE (qa_8s_convert_16s_unaligned16);
+  CPPUNIT_TEST (t1);
+  CPPUNIT_TEST_SUITE_END ();
+
+ private:
+  void t1 ();
+};
+
+
+#endif /* INCLUDED_QA_8S_CONVERT_16S_UNALIGNED16_H */
diff --git a/volk/lib/qa_8s_convert_32f_aligned16.cc b/volk/lib/qa_8s_convert_32f_aligned16.cc
new file mode 100644
index 000000000..522da0b9d
--- /dev/null
+++ b/volk/lib/qa_8s_convert_32f_aligned16.cc
@@ -0,0 +1,63 @@
+#include <volk/volk_runtime.h>
+#include <volk/volk.h>
+#include <qa_8s_convert_32f_aligned16.h>
+#include <volk/volk_8s_convert_32f_aligned16.h>
+#include <cstdlib>
+
+//test for sse4.1
+
+#ifndef LV_HAVE_SSE4_1
+
+void qa_8s_convert_32f_aligned16::t1() {
+  printf("sse4_1 not available... no test performed\n");
+}
+
+#else
+
+void qa_8s_convert_32f_aligned16::t1() {
+  
+  volk_runtime_init();
+
+  volk_environment_init();
+  clock_t start, end;
+  double total;
+  const int vlen = 3201;
+  const int ITERS = 100000;
+  int8_t input0[vlen] __attribute__ ((aligned (16)));
+  
+  float output_generic[vlen] __attribute__ ((aligned (16)));
+  float output_sse4_1[vlen] __attribute__ ((aligned (16)));
+
+  for(int i = 0; i < vlen; ++i) {   
+    input0[i] = ((int8_t)(((float) (rand() - (RAND_MAX/2))) / static_cast<float>((RAND_MAX/2)) * 128.0));
+  }
+  printf("8s_convert_32f_aligned\n");
+
+  start = clock();
+  for(int count = 0; count < ITERS; ++count) {
+    volk_8s_convert_32f_aligned16_manual(output_generic, input0, 128.0, vlen, "generic");
+  }
+  end = clock();
+  total = (double)(end-start)/(double)CLOCKS_PER_SEC;
+  printf("generic_time: %f\n", total);
+
+  start = clock();
+  for(int count = 0; count < ITERS; ++count) {
+    get_volk_runtime()->volk_8s_convert_32f_aligned16(output_sse4_1, input0, 128.0, vlen);
+  }
+  end = clock();
+  total = (double)(end-start)/(double)CLOCKS_PER_SEC;
+  printf("sse4_1_time: %f\n", total);
+
+  for(int i = 0; i < 1; ++i) {
+    //printf("inputs: %d, %d\n", input0[i*2], input0[i*2 + 1]);
+    //printf("generic... %d, ssse3... %d\n", output0[i], output1[i]);
+  }
+  
+  for(int i = 0; i < vlen; ++i) {
+    //printf("%d...%d\n", output0[i], output01[i]);
+    CPPUNIT_ASSERT_EQUAL(output_generic[i], output_sse4_1[i]);
+  }
+}
+
+#endif
diff --git a/volk/lib/qa_8s_convert_32f_aligned16.h b/volk/lib/qa_8s_convert_32f_aligned16.h
new file mode 100644
index 000000000..7f8401d42
--- /dev/null
+++ b/volk/lib/qa_8s_convert_32f_aligned16.h
@@ -0,0 +1,18 @@
+#ifndef INCLUDED_QA_8S_CONVERT_32F_ALIGNED16_H
+#define INCLUDED_QA_8S_CONVERT_32F_ALIGNED16_H
+
+#include <cppunit/extensions/HelperMacros.h>
+#include <cppunit/TestCase.h>
+
+class qa_8s_convert_32f_aligned16 : public CppUnit::TestCase {
+
+  CPPUNIT_TEST_SUITE (qa_8s_convert_32f_aligned16);
+  CPPUNIT_TEST (t1);
+  CPPUNIT_TEST_SUITE_END ();
+
+ private:
+  void t1 ();
+};
+
+
+#endif /* INCLUDED_QA_8S_CONVERT_32F_ALIGNED16_H */
diff --git a/volk/lib/qa_8s_convert_32f_unaligned16.cc b/volk/lib/qa_8s_convert_32f_unaligned16.cc
new file mode 100644
index 000000000..ea1fb7c74
--- /dev/null
+++ b/volk/lib/qa_8s_convert_32f_unaligned16.cc
@@ -0,0 +1,63 @@
+#include <volk/volk_runtime.h>
+#include <volk/volk.h>
+#include <qa_8s_convert_32f_unaligned16.h>
+#include <volk/volk_8s_convert_32f_unaligned16.h>
+#include <cstdlib>
+
+//test for sse4.1
+
+#ifndef LV_HAVE_SSE4_1
+
+void qa_8s_convert_32f_unaligned16::t1() {
+  printf("sse4_1 not available... no test performed\n");
+}
+
+#else
+
+void qa_8s_convert_32f_unaligned16::t1() {
+  
+  volk_runtime_init();
+
+  volk_environment_init();
+  clock_t start, end;
+  double total;
+  const int vlen = 3201;
+  const int ITERS = 100000;
+  int8_t input0[vlen+1] __attribute__ ((aligned (16)));
+  
+  float output_generic[vlen+1] __attribute__ ((aligned (16)));
+  float output_sse4_1[vlen+1] __attribute__ ((aligned (16)));
+
+  for(int i = 0; i < vlen; ++i) {   
+    input0[i] = ((int8_t)(((float) (rand() - (RAND_MAX/2))) / static_cast<float>((RAND_MAX/2)) * 128.0));
+  }
+  printf("8s_convert_32f_unaligned\n");
+
+  start = clock();
+  for(int count = 0; count < ITERS; ++count) {
+    volk_8s_convert_32f_unaligned16_manual(output_generic, &input0[1], 128.0, vlen, "generic");
+  }
+  end = clock();
+  total = (double)(end-start)/(double)CLOCKS_PER_SEC;
+  printf("generic_time: %f\n", total);
+
+  start = clock();
+  for(int count = 0; count < ITERS; ++count) {
+    get_volk_runtime()->volk_8s_convert_32f_unaligned16(output_sse4_1, &input0[1], 128.0, vlen);
+  }
+  end = clock();
+  total = (double)(end-start)/(double)CLOCKS_PER_SEC;
+  printf("sse4_1_time: %f\n", total);
+
+  for(int i = 0; i < 1; ++i) {
+    //printf("inputs: %d, %d\n", input0[i*2], input0[i*2 + 1]);
+    //printf("generic... %d, ssse3... %d\n", output0[i], output1[i]);
+  }
+  
+  for(int i = 0; i < vlen; ++i) {
+    //printf("%e...%e\n", output_generic[i], output_sse4_1[i]);
+    CPPUNIT_ASSERT_EQUAL(output_generic[i], output_sse4_1[i]);
+  }
+}
+
+#endif
diff --git a/volk/lib/qa_8s_convert_32f_unaligned16.h b/volk/lib/qa_8s_convert_32f_unaligned16.h
new file mode 100644
index 000000000..aad2f8c22
--- /dev/null
+++ b/volk/lib/qa_8s_convert_32f_unaligned16.h
@@ -0,0 +1,18 @@
+#ifndef INCLUDED_QA_8S_CONVERT_32F_UNALIGNED16_H
+#define INCLUDED_QA_8S_CONVERT_32F_UNALIGNED16_H
+
+#include <cppunit/extensions/HelperMacros.h>
+#include <cppunit/TestCase.h>
+
+class qa_8s_convert_32f_unaligned16 : public CppUnit::TestCase {
+
+  CPPUNIT_TEST_SUITE (qa_8s_convert_32f_unaligned16);
+  CPPUNIT_TEST (t1);
+  CPPUNIT_TEST_SUITE_END ();
+
+ private:
+  void t1 ();
+};
+
+
+#endif /* INCLUDED_QA_8S_CONVERT_32F_UNALIGNED16_H */
diff --git a/volk/lib/qa_8sc_deinterleave_16s_aligned16.cc b/volk/lib/qa_8sc_deinterleave_16s_aligned16.cc
new file mode 100644
index 000000000..823e7fe2e
--- /dev/null
+++ b/volk/lib/qa_8sc_deinterleave_16s_aligned16.cc
@@ -0,0 +1,67 @@
+#include <volk/volk_runtime.h>
+#include <volk/volk.h>
+#include <qa_8sc_deinterleave_16s_aligned16.h>
+#include <volk/volk_8sc_deinterleave_16s_aligned16.h>
+#include <cstdlib>
+
+//test for sse
+
+#ifndef LV_HAVE_SSE4_1
+
+void qa_8sc_deinterleave_16s_aligned16::t1() {
+  printf("sse4_1 not available... no test performed\n");
+}
+
+#else
+
+void qa_8sc_deinterleave_16s_aligned16::t1() {
+
+  
+  volk_runtime_init();  
+
+  volk_environment_init();
+  clock_t start, end;
+  double total;
+  const int vlen = 3201;
+  const int ITERS = 100000;
+  std::complex<int8_t> input0[vlen] __attribute__ ((aligned (16)));
+  
+  int16_t output_generic[vlen] __attribute__ ((aligned (16)));
+  int16_t output_generic1[vlen] __attribute__ ((aligned (16)));
+  int16_t output_sse4_1[vlen] __attribute__ ((aligned (16)));
+  int16_t output_sse4_11[vlen] __attribute__ ((aligned (16)));
+
+  int8_t* loadInput = (int8_t*)input0;
+  for(int i = 0; i < vlen*2; ++i) {   
+    loadInput[i] =((char)((((float) (rand() - (RAND_MAX/2))) / static_cast<float>((RAND_MAX/2))) * 128.0));
+  }
+  printf("8sc_deinterleave_16s_aligned\n");
+
+  start = clock();
+  for(int count = 0; count < ITERS; ++count) {
+    volk_8sc_deinterleave_16s_aligned16_manual(output_generic, output_generic1, input0, vlen, "generic");
+  }
+  end = clock();
+  total = (double)(end-start)/(double)CLOCKS_PER_SEC;
+  printf("generic_time: %f\n", total);
+  start = clock();
+  for(int count = 0; count < ITERS; ++count) {
+    get_volk_runtime()->volk_8sc_deinterleave_16s_aligned16(output_sse4_1, output_sse4_11, input0, vlen);
+  }
+  end = clock();
+  total = (double)(end-start)/(double)CLOCKS_PER_SEC;
+  printf("sse4.1_time: %f\n", total);
+
+  for(int i = 0; i < 1; ++i) {
+    //printf("inputs: %d, %d\n", input0[i*2], input0[i*2 + 1]);
+    //printf("generic... %d, ssse3... %d\n", output0[i], output1[i]);
+  }
+  
+  for(int i = 0; i < vlen; ++i) {
+    //printf("%d...%d\n", output0[i], output01[i]);
+    CPPUNIT_ASSERT_EQUAL(output_generic[i],  output_sse4_1[i]);
+    CPPUNIT_ASSERT_EQUAL(output_generic1[i],  output_sse4_11[i]);
+  }
+}
+
+#endif
diff --git a/volk/lib/qa_8sc_deinterleave_16s_aligned16.h b/volk/lib/qa_8sc_deinterleave_16s_aligned16.h
new file mode 100644
index 000000000..9c99fed70
--- /dev/null
+++ b/volk/lib/qa_8sc_deinterleave_16s_aligned16.h
@@ -0,0 +1,18 @@
+#ifndef INCLUDED_QA_8SC_DEINTERLEAVE_16S_ALIGNED16_H
+#define INCLUDED_QA_8SC_DEINTERLEAVE_16S_ALIGNED16_H
+
+#include <cppunit/extensions/HelperMacros.h>
+#include <cppunit/TestCase.h>
+
+class qa_8sc_deinterleave_16s_aligned16 : public CppUnit::TestCase {
+
+  CPPUNIT_TEST_SUITE (qa_8sc_deinterleave_16s_aligned16);
+  CPPUNIT_TEST (t1);
+  CPPUNIT_TEST_SUITE_END ();
+
+ private:
+  void t1 ();
+};
+
+
+#endif /* INCLUDED_QA_8SC_DEINTERLEAVE_16S_ALIGNED16_H */
diff --git a/volk/lib/qa_8sc_deinterleave_32f_aligned16.cc b/volk/lib/qa_8sc_deinterleave_32f_aligned16.cc
new file mode 100644
index 000000000..fb580516c
--- /dev/null
+++ b/volk/lib/qa_8sc_deinterleave_32f_aligned16.cc
@@ -0,0 +1,134 @@
+#include <volk/volk_runtime.h>
+#include <volk/volk.h>
+#include <qa_8sc_deinterleave_32f_aligned16.h>
+#include <volk/volk_8sc_deinterleave_32f_aligned16.h>
+#include <cstdlib>
+
+//test for sse
+
+#ifndef LV_HAVE_SSE4_1
+
+#ifndef LV_HAVE_SSE
+
+void qa_8sc_deinterleave_32f_aligned16::t1() {
+  printf("sse not available... no test performed\n");
+}
+
+#else
+
+void qa_8sc_deinterleave_32f_aligned16::t1() {
+  
+  volk_environment_init();
+  clock_t start, end;
+  double total;
+  const int vlen = 3201;
+  const int ITERS = 100000;
+  std::complex<int8_t> input0[vlen] __attribute__ ((aligned (16)));
+  
+  float output_generic[vlen] __attribute__ ((aligned (16)));
+  float output_generic1[vlen] __attribute__ ((aligned (16)));
+  float output_sse[vlen] __attribute__ ((aligned (16)));
+  float output_sse1[vlen] __attribute__ ((aligned (16)));
+
+  int8_t* loadInput = (int8_t*)input0;
+  for(int i = 0; i < vlen*2; ++i) {   
+    loadInput[i] =((char)((((float) (rand() - (RAND_MAX/2))) / static_cast<float>((RAND_MAX/2))) * 128.0));
+  }
+  printf("8sc_deinterleave_32f_aligned\n");
+
+  start = clock();
+  for(int count = 0; count < ITERS; ++count) {
+    volk_8sc_deinterleave_32f_aligned16_manual(output_generic, output_generic1, input0, 128.0, vlen, "generic");
+  }
+  end = clock();
+  total = (double)(end-start)/(double)CLOCKS_PER_SEC;
+  printf("generic_time: %f\n", total);
+  start = clock();
+  for(int count = 0; count < ITERS; ++count) {
+    volk_8sc_deinterleave_32f_aligned16_manual(output_sse, output_sse1, input0, 128.0, vlen, "sse");
+  }
+  end = clock();
+  total = (double)(end-start)/(double)CLOCKS_PER_SEC;
+  printf("sse_time: %f\n", total);
+
+  for(int i = 0; i < 1; ++i) {
+    //printf("inputs: %d, %d\n", input0[i*2], input0[i*2 + 1]);
+    //printf("generic... %d, ssse3... %d\n", output0[i], output1[i]);
+  }
+  
+  for(int i = 0; i < vlen; ++i) {
+    //printf("%d...%d\n", output0[i], output01[i]);
+    CPPUNIT_ASSERT_DOUBLES_EQUAL(output_generic[i], output_sse[i], fabs(output_generic[i])*1e-4);
+    CPPUNIT_ASSERT_DOUBLES_EQUAL(output_generic1[i], output_sse1[i], fabs(output_generic[i])*1e-4);
+  }
+}
+
+#endif /* LV_HAVE_SSE */
+
+#else
+
+void qa_8sc_deinterleave_32f_aligned16::t1() {
+  
+  
+  volk_runtime_init();
+
+  volk_environment_init();
+  clock_t start, end;
+  double total;
+  const int vlen = 3201;
+  const int ITERS = 100000;
+  std::complex<int8_t> input0[vlen] __attribute__ ((aligned (16)));
+  
+  float output_generic[vlen] __attribute__ ((aligned (16)));
+  float output_generic1[vlen] __attribute__ ((aligned (16)));
+  float output_sse[vlen] __attribute__ ((aligned (16)));
+  float output_sse1[vlen] __attribute__ ((aligned (16)));
+  float output_sse4_1[vlen] __attribute__ ((aligned (16)));
+  float output_sse14_1[vlen] __attribute__ ((aligned (16)));
+
+  int8_t* loadInput = (int8_t*)input0;
+  for(int i = 0; i < vlen*2; ++i) {   
+    loadInput[i] =((char)((((float) (rand() - (RAND_MAX/2))) / static_cast<float>((RAND_MAX/2))) * 128.0));
+  }
+  printf("8sc_deinterleave_32f_aligned\n");
+
+  start = clock();
+  for(int count = 0; count < ITERS; ++count) {
+    volk_8sc_deinterleave_32f_aligned16_manual(output_generic, output_generic1, input0, 128.0, vlen, "generic");
+  }
+  end = clock();
+  total = (double)(end-start)/(double)CLOCKS_PER_SEC;
+  printf("generic_time: %f\n", total);
+  start = clock();
+  for(int count = 0; count < ITERS; ++count) {
+    volk_8sc_deinterleave_32f_aligned16_manual(output_sse, output_sse1, input0, 128.0, vlen, "sse");
+  }
+  end = clock();
+  total = (double)(end-start)/(double)CLOCKS_PER_SEC;
+  printf("sse_time: %f\n", total);
+
+  start = clock();
+  for(int count = 0; count < ITERS; ++count) {
+    get_volk_runtime()->volk_8sc_deinterleave_32f_aligned16(output_sse4_1, output_sse14_1, input0, 128.0, vlen);
+  }
+  end = clock();
+  total = (double)(end-start)/(double)CLOCKS_PER_SEC;
+  printf("sse4.1_time: %f\n", total);
+
+  for(int i = 0; i < vlen; ++i) {
+    //printf("inputs: %d, %d\n", input0[i*2], input0[i*2 + 1]);
+    //printf("%d generic... %e %e, sse... %e %e sse4.1... %e %e\n", i, output_generic[i], output_generic1[i], output_sse[i], output_sse1[i], output_sse4_1[i], output_sse14_1[i]);
+  }
+  
+  for(int i = 0; i < vlen; ++i) {
+    //printf("%d...%d\n", output0[i], output01[i]);
+    CPPUNIT_ASSERT_DOUBLES_EQUAL(output_generic[i], output_sse[i],std::max<double>((output_generic[i])*1e-4, 1e-4));
+    CPPUNIT_ASSERT_DOUBLES_EQUAL(output_generic1[i], output_sse1[i], std::max<double>((output_generic[i])*1e-4, 1e-4));
+
+    CPPUNIT_ASSERT_DOUBLES_EQUAL(output_generic[i], output_sse4_1[i], std::max<double>((output_generic[i])*1e-4, 1e-4));
+    CPPUNIT_ASSERT_DOUBLES_EQUAL(output_generic1[i], output_sse14_1[i], std::max<double>((output_generic[i])*1e-4, 1e-4));
+  }
+}
+
+
+#endif /* LV_HAVE_SSE4_1 */
diff --git a/volk/lib/qa_8sc_deinterleave_32f_aligned16.h b/volk/lib/qa_8sc_deinterleave_32f_aligned16.h
new file mode 100644
index 000000000..63b5fdadb
--- /dev/null
+++ b/volk/lib/qa_8sc_deinterleave_32f_aligned16.h
@@ -0,0 +1,18 @@
+#ifndef INCLUDED_QA_8SC_DEINTERLEAVE_32F_ALIGNED16_H
+#define INCLUDED_QA_8SC_DEINTERLEAVE_32F_ALIGNED16_H
+
+#include <cppunit/extensions/HelperMacros.h>
+#include <cppunit/TestCase.h>
+
+class qa_8sc_deinterleave_32f_aligned16 : public CppUnit::TestCase {
+
+  CPPUNIT_TEST_SUITE (qa_8sc_deinterleave_32f_aligned16);
+  CPPUNIT_TEST (t1);
+  CPPUNIT_TEST_SUITE_END ();
+
+ private:
+  void t1 ();
+};
+
+
+#endif /* INCLUDED_QA_8SC_DEINTERLEAVE_32F_ALIGNED16_H */
diff --git a/volk/lib/qa_8sc_deinterleave_real_16s_aligned16.cc b/volk/lib/qa_8sc_deinterleave_real_16s_aligned16.cc
new file mode 100644
index 000000000..1cc844b52
--- /dev/null
+++ b/volk/lib/qa_8sc_deinterleave_real_16s_aligned16.cc
@@ -0,0 +1,64 @@
+#include <volk/volk_runtime.h>
+#include <volk/volk.h>
+#include <qa_8sc_deinterleave_real_16s_aligned16.h>
+#include <volk/volk_8sc_deinterleave_real_16s_aligned16.h>
+#include <cstdlib>
+
+//test for sse
+
+#ifndef LV_HAVE_SSE4_1
+
+void qa_8sc_deinterleave_real_16s_aligned16::t1() {
+  printf("sse4_1 not available... no test performed\n");
+}
+
+#else
+
+void qa_8sc_deinterleave_real_16s_aligned16::t1() {
+
+  
+  volk_runtime_init();  
+
+  volk_environment_init();
+  clock_t start, end;
+  double total;
+  const int vlen = 3201;
+  const int ITERS = 100000;
+  std::complex<int8_t> input0[vlen] __attribute__ ((aligned (16)));
+  
+  int16_t output_generic[vlen] __attribute__ ((aligned (16)));
+  int16_t output_sse4_1[vlen] __attribute__ ((aligned (16)));
+
+  int8_t* loadInput = (int8_t*)input0;
+  for(int i = 0; i < vlen*2; ++i) {   
+    loadInput[i] =((char)((((float) (rand() - (RAND_MAX/2))) / static_cast<float>((RAND_MAX/2))) * 128.0));
+  }
+  printf("8sc_deinterleave_real_16s_aligned\n");
+
+  start = clock();
+  for(int count = 0; count < ITERS; ++count) {
+    volk_8sc_deinterleave_real_16s_aligned16_manual(output_generic, input0, vlen, "generic");
+  }
+  end = clock();
+  total = (double)(end-start)/(double)CLOCKS_PER_SEC;
+  printf("generic_time: %f\n", total);
+  start = clock();
+  for(int count = 0; count < ITERS; ++count) {
+    get_volk_runtime()->volk_8sc_deinterleave_real_16s_aligned16(output_sse4_1, input0, vlen);
+  }
+  end = clock();
+  total = (double)(end-start)/(double)CLOCKS_PER_SEC;
+  printf("sse4.1_time: %f\n", total);
+
+  for(int i = 0; i < 1; ++i) {
+    //printf("inputs: %d, %d\n", input0[i*2], input0[i*2 + 1]);
+    //printf("generic... %d, ssse3... %d\n", output0[i], output1[i]);
+  }
+  
+  for(int i = 0; i < vlen; ++i) {
+    //printf("%d...%d\n", output0[i], output01[i]);
+    CPPUNIT_ASSERT_EQUAL(output_generic[i],  output_sse4_1[i]);
+  }
+}
+
+#endif
diff --git a/volk/lib/qa_8sc_deinterleave_real_16s_aligned16.h b/volk/lib/qa_8sc_deinterleave_real_16s_aligned16.h
new file mode 100644
index 000000000..02050926f
--- /dev/null
+++ b/volk/lib/qa_8sc_deinterleave_real_16s_aligned16.h
@@ -0,0 +1,18 @@
+#ifndef INCLUDED_QA_8SC_DEINTERLEAVE_REAL_16S_ALIGNED16_H
+#define INCLUDED_QA_8SC_DEINTERLEAVE_REAL_16S_ALIGNED16_H
+
+#include <cppunit/extensions/HelperMacros.h>
+#include <cppunit/TestCase.h>
+
+class qa_8sc_deinterleave_real_16s_aligned16 : public CppUnit::TestCase {
+
+  CPPUNIT_TEST_SUITE (qa_8sc_deinterleave_real_16s_aligned16);
+  CPPUNIT_TEST (t1);
+  CPPUNIT_TEST_SUITE_END ();
+
+ private:
+  void t1 ();
+};
+
+
+#endif /* INCLUDED_QA_8SC_DEINTERLEAVE_REAL_16S_ALIGNED16_H */
diff --git a/volk/lib/qa_8sc_deinterleave_real_32f_aligned16.cc b/volk/lib/qa_8sc_deinterleave_real_32f_aligned16.cc
new file mode 100644
index 000000000..10e537cde
--- /dev/null
+++ b/volk/lib/qa_8sc_deinterleave_real_32f_aligned16.cc
@@ -0,0 +1,138 @@
+#include <volk/volk_runtime.h>
+#include <volk/volk.h>
+#include <qa_8sc_deinterleave_real_32f_aligned16.h>
+#include <volk/volk_8sc_deinterleave_real_32f_aligned16.h>
+#include <cstdlib>
+
+//test for sse
+
+#ifndef LV_HAVE_SSE4_1
+
+#ifndef LV_HAVE_SSE
+
+void qa_8sc_deinterleave_real_32f_aligned16::t1() {
+  printf("sse not available... no test performed\n");
+}
+
+#else
+
+void qa_8sc_deinterleave_real_32f_aligned16::t1() {
+  
+  volk_environment_init();
+  int ret;
+  clock_t start, end;
+  double total;
+  const int vlen = 3201;
+  const int ITERS = 100000;
+  std::complex<int8_t> input0[vlen] __attribute__ ((aligned (16)));
+  
+  float output_generic[vlen] __attribute__ ((aligned (16)));
+  float output_sse[vlen] __attribute__ ((aligned (16)));
+
+  int8_t* loadInput = (int8_t*)input0;
+  for(int i = 0; i < vlen*2; ++i) {   
+    loadInput[i] =((char)((((float) (rand() - (RAND_MAX/2))) / static_cast<float>((RAND_MAX/2))) * 128.0));
+  }
+  printf("8sc_deinterleave_real_32f_aligned\n");
+
+  start = clock();
+  for(int count = 0; count < ITERS; ++count) {
+    volk_8sc_deinterleave_real_32f_aligned16_manual(output_generic, input0, 32768.0, vlen, "generic");
+  }
+  end = clock();
+  total = (double)(end-start)/(double)CLOCKS_PER_SEC;
+  printf("generic_time: %f\n", total);
+  start = clock();
+  for(int count = 0; count < ITERS; ++count) {
+    volk_8sc_deinterleave_real_32f_aligned16_manual(output_sse, input0, 32768.0, vlen, "sse");
+  }
+  end = clock();
+  total = (double)(end-start)/(double)CLOCKS_PER_SEC;
+  printf("sse_time: %f\n", total);
+
+  for(int i = 0; i < 1; ++i) {
+    //printf("inputs: %d, %d\n", input0[i*2], input0[i*2 + 1]);
+    //printf("generic... %d, ssse3... %d\n", output0[i], output1[i]);
+  }
+  
+  for(int i = 0; i < vlen; ++i) {
+    //printf("%d...%d\n", output0[i], output01[i]);
+    CPPUNIT_ASSERT_DOUBLES_EQUAL(output_generic[i], output_sse[i], fabs(output_generic[i])*1e-4);
+  }
+}
+
+#endif /* LV_HAVE_SSE */
+
+#else
+
+void qa_8sc_deinterleave_real_32f_aligned16::t1() {
+  
+  
+  volk_runtime_init();
+
+  volk_environment_init();
+  int ret;
+  clock_t start, end;
+  double total;
+  const int vlen = 3201;
+  const int ITERS = 100000;
+  std::complex<int8_t> *input0;
+  
+  float* output_generic;
+  float* output_sse;
+  float* output_sse4_1;
+
+  ret = posix_memalign((void**)&input0, 16, 2*vlen * sizeof(int8_t));
+  ret = posix_memalign((void**)&output_generic, 16, vlen * sizeof(float));
+  ret = posix_memalign((void**)&output_sse, 16, vlen * sizeof(float));
+  ret = posix_memalign((void**)&output_sse4_1, 16, vlen * sizeof(float));
+
+  int8_t* loadInput = (int8_t*)input0;
+  for(int i = 0; i < vlen*2; ++i) {   
+    loadInput[i] =((char)(((float) (rand() - (RAND_MAX/2))) / static_cast<float>((RAND_MAX/2))) * 128.0);
+  }
+
+  printf("8sc_deinterleave_real_32f_aligned\n");
+
+  start = clock();
+  for(int count = 0; count < ITERS; ++count) {
+    volk_8sc_deinterleave_real_32f_aligned16_manual(output_generic, input0, 128.0, vlen, "generic");
+  }
+  end = clock();
+  total = (double)(end-start)/(double)CLOCKS_PER_SEC;
+  printf("generic_time: %f\n", total);
+  start = clock();
+  for(int count = 0; count < ITERS; ++count) {
+    volk_8sc_deinterleave_real_32f_aligned16_manual(output_sse, input0, 1288.0, vlen, "sse");
+  }
+  end = clock();
+  total = (double)(end-start)/(double)CLOCKS_PER_SEC;
+  printf("sse_time: %f\n", total);
+
+  start = clock();
+  for(int count = 0; count < ITERS; ++count) {
+    get_volk_runtime()->volk_8sc_deinterleave_real_32f_aligned16(output_sse4_1, input0, 128.0, vlen);
+  }
+  end = clock();
+  total = (double)(end-start)/(double)CLOCKS_PER_SEC;
+  printf("sse4_1_time: %f\n", total);
+
+
+  for(int i = 0; i < 1; ++i) {
+    //printf("inputs: %d, %d\n", input0[i*2], input0[i*2 + 1]);
+    //printf("generic... %d, ssse3... %d\n", output0[i], output1[i]);
+  }
+  
+  for(int i = 0; i < vlen; ++i) {
+    //printf("%d...%d\n", output0[i], output01[i]);
+    CPPUNIT_ASSERT_DOUBLES_EQUAL(output_generic[i], output_sse[i], fabs(output_generic[i])*1e-4);
+    CPPUNIT_ASSERT_DOUBLES_EQUAL(output_generic[i], output_sse4_1[i], fabs(output_generic[i])*1e-4);
+  }
+
+  free(input0);
+  free(output_generic);
+  free(output_sse);
+  free(output_sse4_1);
+}
+
+#endif /* LV_HAVE_SSE4_1 */
diff --git a/volk/lib/qa_8sc_deinterleave_real_32f_aligned16.h b/volk/lib/qa_8sc_deinterleave_real_32f_aligned16.h
new file mode 100644
index 000000000..93338e488
--- /dev/null
+++ b/volk/lib/qa_8sc_deinterleave_real_32f_aligned16.h
@@ -0,0 +1,18 @@
+#ifndef INCLUDED_QA_8SC_DEINTERLEAVE_REAL_32F_ALIGNED16_H
+#define INCLUDED_QA_8SC_DEINTERLEAVE_REAL_32F_ALIGNED16_H
+
+#include <cppunit/extensions/HelperMacros.h>
+#include <cppunit/TestCase.h>
+
+class qa_8sc_deinterleave_real_32f_aligned16 : public CppUnit::TestCase {
+
+  CPPUNIT_TEST_SUITE (qa_8sc_deinterleave_real_32f_aligned16);
+  CPPUNIT_TEST (t1);
+  CPPUNIT_TEST_SUITE_END ();
+
+ private:
+  void t1 ();
+};
+
+
+#endif /* INCLUDED_QA_8SC_DEINTERLEAVE_REAL_32F_ALIGNED16_H */
diff --git a/volk/lib/qa_8sc_deinterleave_real_8s_aligned16.cc b/volk/lib/qa_8sc_deinterleave_real_8s_aligned16.cc
new file mode 100644
index 000000000..d84df8119
--- /dev/null
+++ b/volk/lib/qa_8sc_deinterleave_real_8s_aligned16.cc
@@ -0,0 +1,60 @@
+#include <volk/volk.h>
+#include <qa_8sc_deinterleave_real_8s_aligned16.h>
+#include <volk/volk_8sc_deinterleave_real_8s_aligned16.h>
+#include <cstdlib>
+
+//test for sse
+
+#ifndef LV_HAVE_SSSE3
+
+void qa_8sc_deinterleave_real_8s_aligned16::t1() {
+  printf("ssse3 not available... no test performed\n");
+}
+
+#else
+
+void qa_8sc_deinterleave_real_8s_aligned16::t1() {
+  
+  volk_environment_init();
+  clock_t start, end;
+  double total;
+  const int vlen = 3201;
+  const int ITERS = 100000;
+  std::complex<int8_t> input0[vlen] __attribute__ ((aligned (16)));
+  
+  int8_t output_generic[vlen] __attribute__ ((aligned (16)));
+  int8_t output_ssse3[vlen] __attribute__ ((aligned (16)));
+
+  int8_t* loadInput = (int8_t*)input0;
+  for(int i = 0; i < vlen*2; ++i) {   
+    loadInput[i] =((char)((((float) (rand() - (RAND_MAX/2))) / static_cast<float>((RAND_MAX/2))) * 128.0));
+  }
+  printf("8sc_deinterleave_real_8s_aligned\n");
+
+  start = clock();
+  for(int count = 0; count < ITERS; ++count) {
+    volk_8sc_deinterleave_real_8s_aligned16_manual(output_generic, input0, vlen, "generic");
+  }
+  end = clock();
+  total = (double)(end-start)/(double)CLOCKS_PER_SEC;
+  printf("generic_time: %f\n", total);
+  start = clock();
+  for(int count = 0; count < ITERS; ++count) {
+    volk_8sc_deinterleave_real_8s_aligned16_manual(output_ssse3, input0, vlen, "ssse3");
+  }
+  end = clock();
+  total = (double)(end-start)/(double)CLOCKS_PER_SEC;
+  printf("ssse3_time: %f\n", total);
+
+  for(int i = 0; i < 1; ++i) {
+    //printf("inputs: %d, %d\n", input0[i*2], input0[i*2 + 1]);
+    //printf("generic... %d, ssse3... %d\n", output0[i], output1[i]);
+  }
+  
+  for(int i = 0; i < vlen; ++i) {
+    //printf("%d...%d\n", output0[i], output01[i]);
+    CPPUNIT_ASSERT_EQUAL(output_generic[i], output_ssse3[i]);
+  }
+}
+
+#endif
diff --git a/volk/lib/qa_8sc_deinterleave_real_8s_aligned16.h b/volk/lib/qa_8sc_deinterleave_real_8s_aligned16.h
new file mode 100644
index 000000000..92fc0dd4a
--- /dev/null
+++ b/volk/lib/qa_8sc_deinterleave_real_8s_aligned16.h
@@ -0,0 +1,18 @@
+#ifndef INCLUDED_QA_8SC_DEINTERLEAVE_REAL_8S_ALIGNED16_H
+#define INCLUDED_QA_8SC_DEINTERLEAVE_REAL_8S_ALIGNED16_H
+
+#include <cppunit/extensions/HelperMacros.h>
+#include <cppunit/TestCase.h>
+
+class qa_8sc_deinterleave_real_8s_aligned16 : public CppUnit::TestCase {
+
+  CPPUNIT_TEST_SUITE (qa_8sc_deinterleave_real_8s_aligned16);
+  CPPUNIT_TEST (t1);
+  CPPUNIT_TEST_SUITE_END ();
+
+ private:
+  void t1 ();
+};
+
+
+#endif /* INCLUDED_QA_8SC_DEINTERLEAVE_REAL_8S_ALIGNED16_H */
diff --git a/volk/lib/qa_8sc_multiply_conjugate_16sc_aligned16.cc b/volk/lib/qa_8sc_multiply_conjugate_16sc_aligned16.cc
new file mode 100644
index 000000000..d64eac8ce
--- /dev/null
+++ b/volk/lib/qa_8sc_multiply_conjugate_16sc_aligned16.cc
@@ -0,0 +1,87 @@
+#include <volk/volk_runtime.h>
+#include <volk/volk.h>
+#include <qa_8sc_multiply_conjugate_16sc_aligned16.h>
+#include <stdlib.h>
+#include <math.h>
+#include <time.h>
+
+#define assertcomplexEqual(expected, actual, delta)			\
+  CPPUNIT_ASSERT_DOUBLES_EQUAL (std::real(expected), std::real(actual), fabs(std::real(expected)) * delta); \
+  CPPUNIT_ASSERT_DOUBLES_EQUAL (std::imag(expected), std::imag(actual), fabs(std::imag(expected))* delta);	
+
+#define	ERR_DELTA	(1e-4)
+
+#ifndef LV_HAVE_SSE4_1
+
+void qa_8sc_multiply_conjugate_16sc_aligned16::t1() {
+  printf("sse4.1 not available... no test performed\n");
+}
+
+#else
+
+void qa_8sc_multiply_conjugate_16sc_aligned16::t1() {
+  
+  
+  volk_runtime_init();
+
+  const int vlen = 2046;
+  const int ITERS = 100000;
+
+  volk_environment_init();
+  int ret;
+  clock_t start, end;
+  double total;
+  std::complex<int8_t>* input;
+  std::complex<int8_t>* taps;
+  
+  std::complex<int16_t>* result_generic;
+  std::complex<int16_t>* result_sse4_1;
+  int i;
+  int8_t* inputInt8_T;
+  int8_t* tapsInt8_T;
+
+  ret = posix_memalign((void**)&input, 16, vlen*2*sizeof(int8_t));
+  ret = posix_memalign((void**)&taps, 16, vlen*2*sizeof(int8_t));
+  ret = posix_memalign((void**)&result_generic, 16, vlen*2*sizeof(int16_t));
+  ret = posix_memalign((void**)&result_sse4_1, 16, vlen*2*sizeof(int16_t));
+  
+  inputInt8_T = (int8_t*)input;
+  tapsInt8_T = (int8_t*)taps;
+  for(int i = 0; i < vlen*2; ++i) {   
+    inputInt8_T[i] =((int8_t)((((float) (rand() - (RAND_MAX/2))) / static_cast<float>((RAND_MAX/2))) * 128.0));
+    tapsInt8_T[i] =((int8_t)((((float) (rand() - (RAND_MAX/2))) / static_cast<float>((RAND_MAX/2))) * 128.0));
+  }
+  
+  printf("8sc_multiply_conjugate_16sc_aligned16\n");
+
+  start = clock();
+  for(int count = 0; count < ITERS; ++count) {
+    volk_8sc_multiply_conjugate_16sc_aligned16_manual((std::complex<int16_t>*)result_generic, (std::complex<int8_t>*)input, (std::complex<int8_t>*)taps, vlen,  "generic");
+  }
+  end = clock();
+  total = (double)(end-start)/(double)CLOCKS_PER_SEC;
+  printf("generic_time: %f\n", total);
+
+
+  start = clock();
+  for(int count = 0; count < ITERS; ++count) {
+    get_volk_runtime()->volk_8sc_multiply_conjugate_16sc_aligned16((std::complex<int16_t>*)result_sse4_1, (std::complex<int8_t>*)input, (std::complex<int8_t>*)taps, vlen);
+  }
+  end = clock();
+  total = (double)(end-start)/(double)CLOCKS_PER_SEC;
+  printf("sse4_1_time: %f\n", total);
+
+  for(i = 0; i < vlen; i++){
+    //printf("%d %d+%di %d+%di -> %d+%di %d+%di\n", i, std::real(input[i]), std::imag(input[i]), std::real(taps[i]), std::imag(taps[i]), std::real(result_generic[i]), std::imag(result_generic[i]), std::real(result_sse4_1[i]), std::imag(result_sse4_1[i]));
+
+    assertcomplexEqual(result_generic[i], result_sse4_1[i], ERR_DELTA);
+  }
+
+  free(input);
+  free(taps);
+  free(result_generic);
+  free(result_sse4_1);
+  
+}
+
+#endif /*LV_HAVE_SSE4_1*/
diff --git a/volk/lib/qa_8sc_multiply_conjugate_16sc_aligned16.h b/volk/lib/qa_8sc_multiply_conjugate_16sc_aligned16.h
new file mode 100644
index 000000000..0e78a5eca
--- /dev/null
+++ b/volk/lib/qa_8sc_multiply_conjugate_16sc_aligned16.h
@@ -0,0 +1,18 @@
+#ifndef INCLUDED_QA_8SC_MULTIPLY_CONJUGATE_16SC_ALIGNED16_H
+#define INCLUDED_QA_8SC_MULTIPLY_CONJUGATE_16SC_ALIGNED16_H
+
+#include <cppunit/extensions/HelperMacros.h>
+#include <cppunit/TestCase.h>
+
+class qa_8sc_multiply_conjugate_16sc_aligned16 : public CppUnit::TestCase {
+
+  CPPUNIT_TEST_SUITE (qa_8sc_multiply_conjugate_16sc_aligned16);
+  CPPUNIT_TEST (t1);
+  CPPUNIT_TEST_SUITE_END ();
+
+ private:
+  void t1 ();
+};
+
+
+#endif /* INCLUDED_QA_8SC_MULTIPLY_CONJUGATE_16SC_ALIGNED16_H */
diff --git a/volk/lib/qa_8sc_multiply_conjugate_32fc_aligned16.cc b/volk/lib/qa_8sc_multiply_conjugate_32fc_aligned16.cc
new file mode 100644
index 000000000..c27f0e0ca
--- /dev/null
+++ b/volk/lib/qa_8sc_multiply_conjugate_32fc_aligned16.cc
@@ -0,0 +1,87 @@
+#include <volk/volk_runtime.h>
+#include <volk/volk.h>
+#include <qa_8sc_multiply_conjugate_32fc_aligned16.h>
+#include <stdlib.h>
+#include <math.h>
+#include <time.h>
+
+#define assertcomplexEqual(expected, actual, delta)			\
+  CPPUNIT_ASSERT_DOUBLES_EQUAL (std::real(expected), std::real(actual), fabs(std::real(expected)) * delta); \
+  CPPUNIT_ASSERT_DOUBLES_EQUAL (std::imag(expected), std::imag(actual), fabs(std::imag(expected))* delta);	
+
+#define	ERR_DELTA	(1e-4)
+
+#ifndef LV_HAVE_SSE4_1
+
+void qa_8sc_multiply_conjugate_32fc_aligned16::t1() {
+  printf("sse4.1 not available... no test performed\n");
+}
+
+#else
+
+void qa_8sc_multiply_conjugate_32fc_aligned16::t1() {
+  
+  
+  volk_runtime_init();
+
+  const int vlen = 2046;
+  const int ITERS = 100000;
+
+  volk_environment_init();
+  int ret;
+  clock_t start, end;
+  double total;
+  std::complex<int8_t>* input;
+  std::complex<int8_t>* taps;
+  
+  std::complex<float>* result_generic;
+  std::complex<float>* result_sse4_1;
+  int i;
+  int8_t* inputInt8_T;
+  int8_t* tapsInt8_T;
+
+  ret = posix_memalign((void**)&input, 16, vlen*2*sizeof(int8_t));
+  ret = posix_memalign((void**)&taps, 16, vlen*2*sizeof(int8_t));
+  ret = posix_memalign((void**)&result_generic, 16, vlen*2*sizeof(float));
+  ret = posix_memalign((void**)&result_sse4_1, 16, vlen*2*sizeof(float));
+  
+
+  inputInt8_T = (int8_t*)input;
+  tapsInt8_T = (int8_t*)taps;
+  for(int i = 0; i < vlen*2; ++i) {   
+    inputInt8_T[i] =((int8_t)((((float) (rand() - (RAND_MAX/2))) / static_cast<float>((RAND_MAX/2))) * 128.0));
+    tapsInt8_T[i] =((int8_t)((((float) (rand() - (RAND_MAX/2))) / static_cast<float>((RAND_MAX/2))) * 128.0));
+  }
+  
+  printf("8sc_multiply_conjugate_32fc_aligned16\n");
+
+  start = clock();
+  for(int count = 0; count < ITERS; ++count) {
+    volk_8sc_multiply_conjugate_32fc_aligned16_manual(result_generic, (const std::complex<int8_t>*)input, (const std::complex<int8_t>*)taps, 32768.0, vlen,  "generic");
+  }
+  end = clock();
+  total = (double)(end-start)/(double)CLOCKS_PER_SEC;
+  printf("generic_time: %f\n", total);
+
+
+  start = clock();
+  for(int count = 0; count < ITERS; ++count) {
+    get_volk_runtime()->volk_8sc_multiply_conjugate_32fc_aligned16(result_sse4_1, (const std::complex<int8_t>*)input, (const std::complex<int8_t>*)taps, 32768.0, vlen);
+  }
+  end = clock();
+  total = (double)(end-start)/(double)CLOCKS_PER_SEC;
+  printf("sse4_1_time: %f\n", total);
+
+  for(i = 0; i < vlen; i++){
+    //printf("%d %d+%di %d+%di -> %e+%ei %e+%ei\n", i, std::real(input[i]), std::imag(input[i]), std::real(taps[i]), std::imag(taps[i]), std::real(result_generic[i]), std::imag(result_generic[i]), std::real(result_sse4_1[i]), std::imag(result_sse4_1[i]));
+    assertcomplexEqual(result_generic[i], result_sse4_1[i], ERR_DELTA);
+  }
+
+  free(input);
+  free(taps);
+  free(result_generic);
+  free(result_sse4_1);
+  
+}
+
+#endif /*LV_HAVE_SSE4_1*/
diff --git a/volk/lib/qa_8sc_multiply_conjugate_32fc_aligned16.h b/volk/lib/qa_8sc_multiply_conjugate_32fc_aligned16.h
new file mode 100644
index 000000000..eb9ae309c
--- /dev/null
+++ b/volk/lib/qa_8sc_multiply_conjugate_32fc_aligned16.h
@@ -0,0 +1,18 @@
+#ifndef INCLUDED_QA_8SC_MULTIPLY_CONJUGATE_32FC_ALIGNED16_H
+#define INCLUDED_QA_8SC_MULTIPLY_CONJUGATE_32FC_ALIGNED16_H
+
+#include <cppunit/extensions/HelperMacros.h>
+#include <cppunit/TestCase.h>
+
+class qa_8sc_multiply_conjugate_32fc_aligned16 : public CppUnit::TestCase {
+
+  CPPUNIT_TEST_SUITE (qa_8sc_multiply_conjugate_32fc_aligned16);
+  CPPUNIT_TEST (t1);
+  CPPUNIT_TEST_SUITE_END ();
+
+ private:
+  void t1 ();
+};
+
+
+#endif /* INCLUDED_QA_8SC_MULTIPLY_CONJUGATE_32FC_ALIGNED16_H */
diff --git a/volk/lib/qa_volk.cc b/volk/lib/qa_volk.cc
new file mode 100644
index 000000000..c3c27b69b
--- /dev/null
+++ b/volk/lib/qa_volk.cc
@@ -0,0 +1,211 @@
+/*
+ * Copyright 2008 Free Software Foundation, Inc.
+ * 
+ * This file is part of GNU Radio
+ * 
+ * GNU Radio is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 3, or (at your option)
+ * any later version.
+ * 
+ * GNU Radio is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ * 
+ * You should have received a copy of the GNU General Public License
+ * along with GNU Radio; see the file COPYING.  If not, write to
+ * the Free Software Foundation, Inc., 51 Franklin Street,
+ * Boston, MA 02110-1301, USA.
+ */
+
+/*
+ * This class gathers together all the test cases for the example
+ * directory into a single test suite.  As you create new test cases,
+ * add them here.
+ */
+
+#include <qa_volk.h>
+#include <qa_16s_quad_max_star_aligned16.h>
+#include <qa_32fc_dot_prod_aligned16.h>
+#include <qa_32fc_square_dist_aligned16.h>
+#include <qa_32fc_square_dist_scalar_mult_aligned16.h>
+#include <qa_32f_sum_of_poly_aligned16.h>
+#include <qa_32fc_index_max_aligned16.h>
+#include <qa_32f_index_max_aligned16.h>
+#include <qa_32fc_conjugate_dot_prod_aligned16.h>
+#include <qa_16s_permute_and_scalar_add_aligned16.h>
+#include <qa_16s_branch_4_state_8_aligned16.h>
+#include <qa_16s_max_star_horizontal_aligned16.h>
+#include <qa_16s_max_star_aligned16.h>
+#include <qa_16s_add_quad_aligned16.h>
+#include <qa_32f_add_aligned16.h>
+#include <qa_32f_subtract_aligned16.h>
+#include <qa_32f_max_aligned16.h>
+#include <qa_32f_min_aligned16.h>
+#include <qa_64f_max_aligned16.h>
+#include <qa_64f_min_aligned16.h>
+#include <qa_32s_and_aligned16.h>
+#include <qa_32s_or_aligned16.h>
+#include <qa_32f_dot_prod_aligned16.h>
+#include <qa_32f_dot_prod_unaligned16.h>
+#include <qa_32f_fm_detect_aligned16.h>
+#include <qa_32fc_32f_multiply_aligned16.h>
+#include <qa_32fc_multiply_aligned16.h>
+#include <qa_32f_divide_aligned16.h>
+#include <qa_32f_multiply_aligned16.h>
+#include <qa_32f_sqrt_aligned16.h>
+#include <qa_8sc_multiply_conjugate_16sc_aligned16.h>
+#include <qa_8sc_multiply_conjugate_32fc_aligned16.h>
+#include <qa_32u_popcnt_aligned16.h>
+#include <qa_64u_popcnt_aligned16.h>
+#include <qa_16u_byteswap_aligned16.h>
+#include <qa_32u_byteswap_aligned16.h>
+#include <qa_64u_byteswap_aligned16.h>
+#include <qa_32f_normalize_aligned16.h>
+#include <qa_16sc_deinterleave_16s_aligned16.h>
+#include <qa_16sc_deinterleave_32f_aligned16.h>
+#include <qa_16sc_deinterleave_real_16s_aligned16.h>
+#include <qa_16sc_deinterleave_real_32f_aligned16.h>
+#include <qa_16sc_deinterleave_real_8s_aligned16.h>
+#include <qa_16sc_magnitude_16s_aligned16.h>
+#include <qa_16sc_magnitude_32f_aligned16.h>
+#include <qa_32fc_deinterleave_32f_aligned16.h>
+#include <qa_32fc_deinterleave_64f_aligned16.h>
+#include <qa_32fc_deinterleave_real_16s_aligned16.h>
+#include <qa_32fc_deinterleave_real_32f_aligned16.h>
+#include <qa_32fc_deinterleave_real_64f_aligned16.h>
+#include <qa_32fc_magnitude_16s_aligned16.h>
+#include <qa_32fc_magnitude_32f_aligned16.h>
+#include <qa_32f_interleave_16sc_aligned16.h>
+#include <qa_32f_interleave_32fc_aligned16.h>
+#include <qa_8sc_deinterleave_16s_aligned16.h>
+#include <qa_8sc_deinterleave_32f_aligned16.h>
+#include <qa_8sc_deinterleave_real_16s_aligned16.h>
+#include <qa_8sc_deinterleave_real_32f_aligned16.h>
+#include <qa_8sc_deinterleave_real_8s_aligned16.h>
+#include <qa_16s_convert_32f_aligned16.h>
+#include <qa_16s_convert_32f_unaligned16.h>
+#include <qa_16s_convert_8s_aligned16.h>
+#include <qa_16s_convert_8s_unaligned16.h>
+#include <qa_32f_convert_16s_aligned16.h>
+#include <qa_32f_convert_16s_unaligned16.h>
+#include <qa_32f_convert_32s_aligned16.h>
+#include <qa_32f_convert_32s_unaligned16.h>
+#include <qa_32f_convert_64f_aligned16.h>
+#include <qa_32f_convert_64f_unaligned16.h>
+#include <qa_32f_convert_8s_aligned16.h>
+#include <qa_32f_convert_8s_unaligned16.h>
+#include <qa_32s_convert_32f_aligned16.h>
+#include <qa_32s_convert_32f_unaligned16.h>
+#include <qa_64f_convert_32f_aligned16.h>
+#include <qa_64f_convert_32f_unaligned16.h>
+#include <qa_8s_convert_16s_aligned16.h>
+#include <qa_8s_convert_16s_unaligned16.h>
+#include <qa_8s_convert_32f_aligned16.h>
+#include <qa_8s_convert_32f_unaligned16.h>
+#include <qa_32fc_32f_power_32fc_aligned16.h>
+#include <qa_32f_power_aligned16.h>
+#include <qa_32fc_atan2_32f_aligned16.h>
+#include <qa_32fc_power_spectral_density_32f_aligned16.h> 
+#include <qa_32fc_power_spectrum_32f_aligned16.h>
+#include <qa_32f_calc_spectral_noise_floor_aligned16.h>
+#include <qa_32f_accumulator_aligned16.h>
+#include <qa_32f_stddev_aligned16.h>
+#include <qa_32f_stddev_and_mean_aligned16.h>
+
+CppUnit::TestSuite *
+qa_volk::suite()
+{
+  CppUnit::TestSuite *s = new CppUnit::TestSuite("volk");
+
+  s->addTest(qa_16s_quad_max_star_aligned16::suite());
+  s->addTest(qa_32fc_dot_prod_aligned16::suite());
+  s->addTest(qa_32fc_square_dist_scalar_mult_aligned16::suite());
+  s->addTest(qa_32fc_square_dist_aligned16::suite());
+  s->addTest(qa_32f_sum_of_poly_aligned16::suite());
+  s->addTest(qa_32fc_index_max_aligned16::suite());
+  s->addTest(qa_32f_index_max_aligned16::suite());
+  s->addTest(qa_32fc_conjugate_dot_prod_aligned16::suite());
+  s->addTest(qa_16s_permute_and_scalar_add_aligned16::suite());
+  s->addTest(qa_16s_branch_4_state_8_aligned16::suite());
+  s->addTest(qa_16s_max_star_horizontal_aligned16::suite());
+  s->addTest(qa_16s_max_star_aligned16::suite());
+  s->addTest(qa_16s_add_quad_aligned16::suite());
+  s->addTest(qa_32f_add_aligned16::suite());
+  s->addTest(qa_32f_subtract_aligned16::suite());
+  s->addTest(qa_32f_max_aligned16::suite());
+  s->addTest(qa_32f_min_aligned16::suite());
+  s->addTest(qa_64f_max_aligned16::suite());
+  s->addTest(qa_64f_min_aligned16::suite());
+  s->addTest(qa_32s_and_aligned16::suite());
+  s->addTest(qa_32s_or_aligned16::suite());
+  s->addTest(qa_32f_dot_prod_aligned16::suite());
+  s->addTest(qa_32f_dot_prod_unaligned16::suite());
+  s->addTest(qa_32f_fm_detect_aligned16::suite());
+  s->addTest(qa_32fc_32f_multiply_aligned16::suite());
+  s->addTest(qa_32fc_multiply_aligned16::suite());
+  s->addTest(qa_32f_divide_aligned16::suite());
+  s->addTest(qa_32f_multiply_aligned16::suite());
+  s->addTest(qa_32f_sqrt_aligned16::suite());
+  s->addTest(qa_8sc_multiply_conjugate_16sc_aligned16::suite());
+  s->addTest(qa_8sc_multiply_conjugate_32fc_aligned16::suite());
+  s->addTest(qa_32u_popcnt_aligned16::suite());
+  s->addTest(qa_64u_popcnt_aligned16::suite());
+  s->addTest(qa_16u_byteswap_aligned16::suite());
+  s->addTest(qa_32u_byteswap_aligned16::suite());
+  s->addTest(qa_64u_byteswap_aligned16::suite());
+  s->addTest(qa_32f_normalize_aligned16::suite());
+  s->addTest(qa_16sc_deinterleave_16s_aligned16::suite());
+  s->addTest(qa_16sc_deinterleave_32f_aligned16::suite());
+  s->addTest(qa_16sc_deinterleave_real_16s_aligned16::suite());
+  s->addTest(qa_16sc_deinterleave_real_32f_aligned16::suite());
+  s->addTest(qa_16sc_deinterleave_real_8s_aligned16::suite());
+  s->addTest(qa_16sc_magnitude_16s_aligned16::suite());
+  s->addTest(qa_16sc_magnitude_32f_aligned16::suite());
+  s->addTest(qa_32fc_deinterleave_32f_aligned16::suite());
+  s->addTest(qa_32fc_deinterleave_64f_aligned16::suite());
+  s->addTest(qa_32fc_deinterleave_real_16s_aligned16::suite());
+  s->addTest(qa_32fc_deinterleave_real_32f_aligned16::suite());
+  s->addTest(qa_32fc_deinterleave_real_64f_aligned16::suite());
+  s->addTest(qa_32fc_magnitude_16s_aligned16::suite());
+  s->addTest(qa_32fc_magnitude_32f_aligned16::suite());
+  s->addTest(qa_32f_interleave_16sc_aligned16::suite());
+  s->addTest(qa_32f_interleave_32fc_aligned16::suite());
+  s->addTest(qa_8sc_deinterleave_16s_aligned16::suite());
+  s->addTest(qa_8sc_deinterleave_32f_aligned16::suite());
+  s->addTest(qa_8sc_deinterleave_real_16s_aligned16::suite());
+  s->addTest(qa_8sc_deinterleave_real_32f_aligned16::suite());
+  s->addTest(qa_8sc_deinterleave_real_8s_aligned16::suite());
+  s->addTest(qa_16s_convert_32f_aligned16::suite());
+  s->addTest(qa_16s_convert_32f_unaligned16::suite());
+  s->addTest(qa_16s_convert_8s_aligned16::suite());
+  s->addTest(qa_16s_convert_8s_unaligned16::suite());
+  s->addTest(qa_32f_convert_16s_aligned16::suite());
+  s->addTest(qa_32f_convert_16s_unaligned16::suite());
+  s->addTest(qa_32f_convert_32s_aligned16::suite());
+  s->addTest(qa_32f_convert_32s_unaligned16::suite());
+  s->addTest(qa_32f_convert_64f_aligned16::suite());
+  s->addTest(qa_32f_convert_64f_unaligned16::suite());
+  s->addTest(qa_32f_convert_8s_aligned16::suite());
+  s->addTest(qa_32f_convert_8s_unaligned16::suite());
+  s->addTest(qa_32s_convert_32f_aligned16::suite());
+  s->addTest(qa_32s_convert_32f_unaligned16::suite());
+  s->addTest(qa_64f_convert_32f_aligned16::suite());
+  s->addTest(qa_64f_convert_32f_unaligned16::suite());
+  s->addTest(qa_8s_convert_16s_aligned16::suite());
+  s->addTest(qa_8s_convert_16s_unaligned16::suite());
+  s->addTest(qa_8s_convert_32f_aligned16::suite());
+  s->addTest(qa_8s_convert_32f_unaligned16::suite());
+  s->addTest(qa_32fc_32f_power_32fc_aligned16::suite());
+  s->addTest(qa_32f_power_aligned16::suite());
+  s->addTest(qa_32fc_atan2_32f_aligned16::suite());
+  s->addTest(qa_32fc_power_spectral_density_32f_aligned16::suite());
+  s->addTest(qa_32fc_power_spectrum_32f_aligned16::suite());
+  s->addTest(qa_32f_calc_spectral_noise_floor_aligned16::suite());
+  s->addTest(qa_32f_accumulator_aligned16::suite());
+  s->addTest(qa_32f_stddev_aligned16::suite());
+  s->addTest(qa_32f_stddev_and_mean_aligned16::suite());
+
+  return s;
+}
diff --git a/volk/lib/qa_volk.h b/volk/lib/qa_volk.h
new file mode 100644
index 000000000..43fa7faba
--- /dev/null
+++ b/volk/lib/qa_volk.h
@@ -0,0 +1,36 @@
+/* -*- c++ -*- */
+/*
+ * Copyright 2008 Free Software Foundation, Inc.
+ * 
+ * This file is part of GNU Radio
+ * 
+ * GNU Radio is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Example Public License as published by
+ * the Free Software Foundation; either version 3, or (at your option)
+ * any later version.
+ * 
+ * GNU Radio is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU Example Public License for more details.
+ * 
+ * You should have received a copy of the GNU Example Public License
+ * along with GNU Radio; see the file COPYING.  If not, write to
+ * the Free Software Foundation, Inc., 51 Franklin Street,
+ * Boston, MA 02110-1301, USA.
+ */
+
+#ifndef INCLUDED_QA_VOLK_H
+#define INCLUDED_QA_VOLK_H
+
+#include <cppunit/TestSuite.h>
+
+//! collect all the tests for the example directory
+
+class qa_volk {
+ public:
+  //! return suite of tests for all of example directory
+  static CppUnit::TestSuite *suite ();
+};
+
+#endif /* INCLUDED_QA_VOLK_H */
diff --git a/volk/lib/test_all.cc b/volk/lib/test_all.cc
new file mode 100644
index 000000000..50ac08eab
--- /dev/null
+++ b/volk/lib/test_all.cc
@@ -0,0 +1,82 @@
+/* -*- c++ -*- */
+/*
+ * Copyright 2002,2008 Free Software Foundation, Inc.
+ * 
+ * This file is part of GNU Radio
+ * 
+ * GNU Radio is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 3, or (at your option)
+ * any later version.
+ * 
+ * GNU Radio is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ * 
+ * You should have received a copy of the GNU General Public License
+ * along with GNU Radio; see the file COPYING.  If not, write to
+ * the Free Software Foundation, Inc., 51 Franklin Street,
+ * Boston, MA 02110-1301, USA.
+ */
+
+#include <cppunit/ui/text/TestRunner.h>
+#include <cppunit/TextTestRunner.h>
+
+#include <qa_volk.h>
+
+#include <cppunit/XmlOutputter.h>
+#include <iostream>
+#include <getopt.h>
+#include <stdlib.h>
+#include <stdio.h>
+#include <string>
+#include <fstream>
+
+int 
+main (int argc, char **argv)
+{
+  
+  int opt = 0;
+  std::string xmlOutputFile("");
+
+  while( (opt = getopt(argc, argv, "o:")) != -1){
+    switch(opt){
+    case 'o':
+      if(optarg){
+	xmlOutputFile.assign(optarg);
+      }
+      else{
+	std::cerr << "No xml file output specified for -o" << std::endl;
+	exit(EXIT_FAILURE);
+      }
+      break;
+
+    default: /* '?' */
+      fprintf(stderr, "Usage: %s [-o] \"xml output file\"\n",
+	      argv[0]);
+      exit(EXIT_FAILURE);
+    }
+
+  }
+
+  CppUnit::TextUi::TestRunner runner;
+
+  runner.addTest (qa_volk::suite ());
+
+  bool was_successful = false;
+  if(!xmlOutputFile.empty()){
+    std::ofstream xmlOutput(xmlOutputFile.c_str());
+    if(xmlOutput.is_open()){
+      runner.setOutputter(new CppUnit::XmlOutputter(&runner.result(), xmlOutput));
+
+      was_successful = runner.run("", false, true, false);
+    }
+    xmlOutput.close();
+  }
+  else{
+    was_successful = runner.run ("", false);
+  }
+
+  return was_successful ? 0 : 1;
+}
diff --git a/volk/lib/volk_rank_archs.c b/volk/lib/volk_rank_archs.c
new file mode 100644
index 000000000..b1a93db26
--- /dev/null
+++ b/volk/lib/volk_rank_archs.c
@@ -0,0 +1,13 @@
+#include<volk_rank_archs.h>
+#include<stdio.h>
+
+unsigned int volk_rank_archs(const int* arch_defs, unsigned int arch) {
+  int i = 2;
+  unsigned int best_val = 0;
+  for(; i < arch_defs[0] + 1; ++i) {
+    if((arch_defs[i]&(!arch)) == 0) {
+      best_val = (arch_defs[i] > arch_defs[best_val + 1]) ? i-1 : best_val;
+    }
+  }
+  return best_val;
+}
diff --git a/volk/lib/volk_rank_archs.h b/volk/lib/volk_rank_archs.h
new file mode 100644
index 000000000..26b9f7503
--- /dev/null
+++ b/volk/lib/volk_rank_archs.h
@@ -0,0 +1,14 @@
+#ifndef INCLUDED_VOLK_RANK_ARCHS_H
+#define INCLUDED_VOLK_RANK_ARCHS_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+unsigned int volk_rank_archs(const int* arch_defs, unsigned int arch);
+ 
+
+#ifdef __cplusplus
+}
+#endif
+#endif /*INCLUDED_VOLK_RANK_ARCHS_H*/
diff --git a/volk/libvector_replace.sh b/volk/libvector_replace.sh
new file mode 100755
index 000000000..4c7e33e1b
--- /dev/null
+++ b/volk/libvector_replace.sh
@@ -0,0 +1,18 @@
+#!/bin/bash
+
+cd $1
+files=`ls`
+for file in $files 
+do 
+    sed 's/libvector/volk/g' < $file > tempfile
+    sed 's/LIBVECTOR/VOLK/g' < tempfile > $file
+done
+for file in $files
+do
+    echo $file > tempfile
+    newfile=`sed 's/libvector/volk/g' < tempfile`
+    if (test "$file" != "$newfile"); then
+	mv $file $newfile
+	echo "changed $file to $newfile"
+    fi
+done
diff --git a/volk/python/Makefile.am b/volk/python/Makefile.am
new file mode 100644
index 000000000..8485c4793
--- /dev/null
+++ b/volk/python/Makefile.am
@@ -0,0 +1,95 @@
+#
+# Copyright 2008 Free Software Foundation, Inc.
+# 
+# This file is part of GNU Radio
+# 
+# GNU Radio is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 3, or (at your option)
+# any later version.
+# 
+# GNU Radio is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+# 
+# You should have received a copy of the GNU General Public License along
+# with this program; if not, write to the Free Software Foundation, Inc.,
+# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+#
+
+include $(top_srcdir)/Makefile.common
+
+AM_CPPFLAGS = $(STD_DEFINES_AND_INCLUDES) $(PYTHON_CPPFLAGS)
+SWIGPYTHONARGS = $(SWIGPYTHONFLAGS) $(SWIGGRFLAGS)
+
+BUILT_SOURCES = $(swig_built_sources)
+
+MOSTLYCLEANFILES = $(BUILT_SOURCES) *.pyc
+
+# Don't distribute the output of swig
+dist-hook:
+	@for file in $(swig_built_sources); do echo $(RM) $(distdir)/$$file; done
+	@for file in $(swig_built_sources); do $(RM) $(distdir)/$$file; done
+
+
+# if we're not using python, ignore the rest...
+
+TESTS = 
+
+
+ALL_IFILES = \
+	$(LOCAL_IFILES)	\
+	$(NON_LOCAL_IFILES)
+
+NON_LOCAL_IFILES = \
+	$(GNURADIO_CORE_INCLUDEDIR)/swig/gnuradio.i
+
+LOCAL_IFILES = \
+	libvector.i \
+	libvector_square_ff.i
+
+
+# These files are built by SWIG.  The first is the C++ glue.
+# The second is the python wrapper that loads the _libvector shared library
+# and knows how to call our extensions.
+
+swig_built_sources = \
+	libvector_swig.cc \
+	libvector_swig.py
+
+# This gets libvector.py installed in the right place.
+# Any hand-written python should be listed here too.
+ourpython_PYTHON = \
+	__init__.py \
+	libvector_swig.py
+
+# Python QA code gets listed here.  It's not installed.
+noinst_PYTHON = 
+
+# This library contains the swig generated glue
+ourpyexec_LTLIBRARIES = _libvector_swig.la
+
+# These are the source files that go into the shared library
+_libvector_swig_la_SOURCES = \
+	libvector_swig.cc
+
+# Dummy C++ source to cause C++ linking
+nodist_EXTRA__libvector_swig_la_SOURCES = dummy.cc
+
+# magic flags
+_libvector_swig_la_LDFLAGS = $(NO_UNDEFINED) -module -avoid-version
+
+# Link the swig generated code against our library and some python magic.
+_libvector_swig_la_LIBADD = \
+	../lib/liblibvector.la \
+	$(PYTHON_LDFLAGS)
+
+
+libvector_swig.cc libvector_swig.py: $(LOCAL_IFILES) $(ALL_IFILES)
+	$(SWIG) $(SWIGPYTHONARGS) -module libvector_swig -o libvector_swig.cc $(srcdir)/libvector.i
+
+
+# These swig headers get installed in ${prefix}/include/<component>/swig
+ourswiginclude_HEADERS = \
+	$(LOCAL_IFILES)
diff --git a/volk/python/__init__.py b/volk/python/__init__.py
new file mode 100644
index 000000000..6513a31dd
--- /dev/null
+++ b/volk/python/__init__.py
@@ -0,0 +1,53 @@
+#
+# Copyright 2008 Free Software Foundation, Inc.
+# 
+# This file is part of GNU Radio
+# 
+# GNU Radio is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 3, or (at your option)
+# any later version.
+# 
+# GNU Radio is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+# 
+# You should have received a copy of the GNU General Public License along
+# with this program; if not, write to the Free Software Foundation, Inc.,
+# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+#
+
+# The presence of this file turns this directory into a Python package
+
+# ----------------------------------------------------------------
+# Temporary workaround for ticket:181 (swig+python problem)
+import sys
+_RTLD_GLOBAL = 0
+try:
+    from dl import RTLD_GLOBAL as _RTLD_GLOBAL
+except ImportError:
+    try:
+	from DLFCN import RTLD_GLOBAL as _RTLD_GLOBAL
+    except ImportError:
+	pass
+    
+if _RTLD_GLOBAL != 0:
+    _dlopenflags = sys.getdlopenflags()
+    sys.setdlopenflags(_dlopenflags|_RTLD_GLOBAL)
+# ----------------------------------------------------------------
+
+
+# import swig generated symbols into the libvector namespace
+from libvector_swig import *
+
+# import any pure python here
+# from libvector_foo import bar
+# from libvector_baz import *
+
+
+# ----------------------------------------------------------------
+# Tail of workaround
+if _RTLD_GLOBAL != 0:
+    sys.setdlopenflags(_dlopenflags)      # Restore original flags
+# ----------------------------------------------------------------
diff --git a/volk/python/libvector.i b/volk/python/libvector.i
new file mode 100644
index 000000000..598c20a2a
--- /dev/null
+++ b/volk/python/libvector.i
@@ -0,0 +1,47 @@
+/* -*- c++ -*- */
+/*
+ * Copyright 2008 Free Software Foundation, Inc.
+ * 
+ * This file is part of GNU Radio
+ * 
+ * GNU Radio is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 3, or (at your option)
+ * any later version.
+ * 
+ * GNU Radio is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ * 
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write to the Free Software Foundation, Inc.,
+ * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ */
+
+%feature("autodoc", "1");		// generate python docstrings
+
+%include "exception.i"
+%import "gnuradio.i"			// the common stuff
+
+%{
+#include "gnuradio_swig_bug_workaround.h"	// mandatory bug fix
+#include <stdexcept>
+%}
+
+// ----------------------------------------------------------------
+
+/*
+ * Gather all .i files in this directory together.
+ */
+
+%{
+
+// The .h files
+#include <libvector/libvector_square_ff.h>
+
+%}
+
+// The .i files
+%include <libvector_square_ff.i>
+
diff --git a/volk/python/libvector_square_ff.i b/volk/python/libvector_square_ff.i
new file mode 100644
index 000000000..a1c547cc7
--- /dev/null
+++ b/volk/python/libvector_square_ff.i
@@ -0,0 +1,37 @@
+/* -*- c++ -*- */
+/*
+ * Copyright 2008 Free Software Foundation, Inc.
+ * 
+ * This file is part of GNU Radio
+ * 
+ * GNU Radio is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 3, or (at your option)
+ * any later version.
+ * 
+ * GNU Radio is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ * 
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write to the Free Software Foundation, Inc.,
+ * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ */
+
+/*
+ * First arg is the package prefix.
+ * Second arg is the name of the class minus the prefix.
+ *
+ * This does some behind-the-scenes magic so we can
+ * access gr_example_square_ff from python as howto.square_ff
+ */
+GR_SWIG_BLOCK_MAGIC(libvector,square_ff);
+
+libvector_square_ff_sptr libvector_make_square_ff ();
+
+class libvector_square_ff : public gr_sync_block
+{
+private:
+  libvector_square_ff();
+};
diff --git a/volk/python/qa_square.py b/volk/python/qa_square.py
new file mode 100755
index 000000000..1ee56411f
--- /dev/null
+++ b/volk/python/qa_square.py
@@ -0,0 +1,47 @@
+#!/usr/bin/env python
+#
+# Copyright 2004,2007,2008 Free Software Foundation, Inc.
+# 
+# This file is part of GNU Radio
+# 
+# GNU Radio is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 3, or (at your option)
+# any later version.
+# 
+# GNU Radio is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+# 
+# You should have received a copy of the GNU General Public License
+# along with GNU Radio; see the file COPYING.  If not, write to
+# the Free Software Foundation, Inc., 51 Franklin Street,
+# Boston, MA 02110-1301, USA.
+# 
+
+from gnuradio import gr, gr_unittest
+import libvector_swig as libvector
+
+class qa_libvector(gr_unittest.TestCase):
+
+    def setUp(self):
+        self.tb = gr.top_block()
+
+    def tearDown(self):
+        self.tb = None
+
+    def test_001_square_ff(self):
+        src_data = (-3, 4, -5.5, 2, 3)
+        expected_result = (9, 16, 30.25, 4, 9)
+        src = gr.vector_source_f(src_data)
+        sqr = libvector.square_ff()
+        dst = gr.vector_sink_f()
+        self.tb.connect(src, sqr)
+        self.tb.connect(sqr, dst)
+        self.tb.run()
+        result_data = dst.data()
+        self.assertFloatTuplesAlmostEqual(expected_result, result_data, 6)
+        
+if __name__ == '__main__':
+    gr_unittest.main()
diff --git a/volk/python/run_tests.in b/volk/python/run_tests.in
new file mode 100644
index 000000000..895a849bd
--- /dev/null
+++ b/volk/python/run_tests.in
@@ -0,0 +1,50 @@
+#!/bin/sh
+
+# All this strange PYTHONPATH manipulation is required to run our
+# tests using our just built shared library and swig-generated python
+# code prior to installation.
+
+# build tree == src tree unless you're doing a VPATH build.  
+# If you don't know what a VPATH build is, you're not doing one.  Relax...
+
+prefix=@prefix@
+exec_prefix=@exec_prefix@
+
+# Where to look in the build tree for our shared library
+libbld=@abs_top_builddir@/python
+# Where to look in the src tree for swig generated python code
+libsrc=@abs_top_builddir@/python
+# Where to look in the src tree for hand written python code
+py=@abs_top_srcdir@/python
+
+# Where to look for installed GNU Radio python modules
+# FIXME this is wrong on a distcheck.  We really need to ask gnuradio-core
+# where it put its python files.
+installed_pythondir=@pythondir@
+installed_pyexecdir=@pyexecdir@
+
+PYTHONPATH="$libbld:$libbld/.libs:$libsrc:$py:$installed_pythondir:$installed_pyexecdir:$PYTHONPATH"
+#PYTHONPATH="$libbld:$libbld/.libs:$libsrc:$py:$installed_pythondir:$installed_pyexecdir"
+
+export PYTHONPATH
+
+#
+# This is the simple part...
+# Run everything that matches qa_*.py and return the final result.
+#
+
+ok=yes
+for file in @srcdir@/qa_*.py
+do
+  if ! $file
+  then
+    ok=no
+  fi  
+done
+
+if [ $ok = yes ]
+then
+  exit 0
+else
+  exit 1
+fi
diff --git a/volk/spu_lib/gc_spu_macs.h b/volk/spu_lib/gc_spu_macs.h
new file mode 100644
index 000000000..8e3e3f2a6
--- /dev/null
+++ b/volk/spu_lib/gc_spu_macs.h
@@ -0,0 +1,380 @@
+/* -*- asm -*- */
+/*
+ * Copyright 2008 Free Software Foundation, Inc.
+ * 
+ * This file is part of GNU Radio
+ * 
+ * GNU Radio is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 3, or (at your option)
+ * any later version.
+ * 
+ * GNU Radio is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ * 
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write to the Free Software Foundation, Inc.,
+ * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ */
+
+#ifndef INCLUDED_GC_SPU_MACS_H
+#define INCLUDED_GC_SPU_MACS_H
+
+/*
+ * This file contains a set of macros that are generally useful when
+ * coding in SPU assembler
+ *
+ * Note that the multi-instruction macros in here may overwrite
+ * registers 77, 78, and 79 without warning.
+ */
+
+/*
+ * defines for all registers
+ */
+#define r0	$0
+#define r1	$1
+#define r2	$2
+#define r3	$3
+#define r4	$4
+#define r5	$5
+#define r6	$6
+#define r7	$7
+#define r8	$8
+#define r9	$9
+#define r10	$10
+#define r11	$11
+#define r12	$12
+#define r13	$13
+#define r14	$14
+#define r15	$15
+#define r16	$16
+#define r17	$17
+#define r18	$18
+#define r19	$19
+#define r20	$20
+#define r21	$21
+#define r22	$22
+#define r23	$23
+#define r24	$24
+#define r25	$25
+#define r26	$26
+#define r27	$27
+#define r28	$28
+#define r29	$29
+#define r30	$30
+#define r31	$31
+#define r32	$32
+#define r33	$33
+#define r34	$34
+#define r35	$35
+#define r36	$36
+#define r37	$37
+#define r38	$38
+#define r39	$39
+#define r40	$40
+#define r41	$41
+#define r42	$42
+#define r43	$43
+#define r44	$44
+#define r45	$45
+#define r46	$46
+#define r47	$47
+#define r48	$48
+#define r49	$49
+#define r50	$50
+#define r51	$51
+#define r52	$52
+#define r53	$53
+#define r54	$54
+#define r55	$55
+#define r56	$56
+#define r57	$57
+#define r58	$58
+#define r59	$59
+#define r60	$60
+#define r61	$61
+#define r62	$62
+#define r63	$63
+#define r64	$64
+#define r65	$65
+#define r66	$66
+#define r67	$67
+#define r68	$68
+#define r69	$69
+#define r70	$70
+#define r71	$71
+#define r72	$72
+#define r73	$73
+#define r74	$74
+#define r75	$75
+#define r76	$76
+#define r77	$77
+#define r78	$78
+#define r79	$79
+#define r80	$80
+#define r81	$81
+#define r82	$82
+#define r83	$83
+#define r84	$84
+#define r85	$85
+#define r86	$86
+#define r87	$87
+#define r88	$88
+#define r89	$89
+#define r90	$90
+#define r91	$91
+#define r92	$92
+#define r93	$93
+#define r94	$94
+#define r95	$95
+#define r96	$96
+#define r97	$97
+#define r98	$98
+#define r99	$99
+#define r100	$100
+#define r101	$101
+#define r102	$102
+#define r103	$103
+#define r104	$104
+#define r105	$105
+#define r106	$106
+#define r107	$107
+#define r108	$108
+#define r109	$109
+#define r110	$110
+#define r111	$111
+#define r112	$112
+#define r113	$113
+#define r114	$114
+#define r115	$115
+#define r116	$116
+#define r117	$117
+#define r118	$118
+#define r119	$119
+#define r120	$120
+#define r121	$121
+#define r122	$122
+#define r123	$123
+#define r124	$124
+#define r125	$125
+#define r126	$126
+#define r127	$127
+
+
+#define	lr	r0	// link register
+#define	sp	r1	// stack pointer
+                        // r2 is environment pointer for langs that need it (ALGOL)
+
+#define	retval	r3	// return values are passed in regs starting at r3
+
+#define	arg1	r3	// args are passed in regs starting at r3
+#define	arg2	r4
+#define	arg3	r5
+#define	arg4	r6
+#define	arg5	r7
+#define	arg6	r8
+#define	arg7	r9
+#define	arg8	r10
+#define	arg9	r11
+#define	arg10	r12
+
+//  r3 -  r74 are volatile (caller saves)
+// r74 -  r79 are volatile (scratch regs possibly destroyed by fct prolog/epilog)
+// r80 - r127 are non-volatile (caller-saves)
+
+// scratch registers reserved for use by the macros in this file.
+
+#define _gc_t0	r79
+#define	_gc_t1	r78
+#define	_gc_t2	r77
+
+/*
+ * ----------------------------------------------------------------
+ * 		    	    pseudo ops
+ * ----------------------------------------------------------------
+ */
+#define PROC_ENTRY(name)		\
+        .text;				\
+	.p2align 4;			\
+	.global	name;			\
+	.type	name, @function;	\
+name:
+
+/*
+ * ----------------------------------------------------------------
+ * 		    aliases for common operations
+ * ----------------------------------------------------------------
+ */
+
+// Move register (even pipe, 2 cycles)
+#define MR(rt, ra) 			or	rt, ra, ra;
+
+// Move register (odd pipe, 4 cycles)
+#define	LMR(rt, ra) 			rotqbyi	rt, ra, 0;
+
+// return
+#define	RETURN() 			bi	lr;
+
+// hint for a return
+#define	HINT_RETURN(ret_label)		hbr	ret_label, lr;
+
+// return if zero
+#define BRZ_RETURN(rt)			biz	rt, lr;
+
+// return if not zero
+#define BRNZ_RETURN(rt)			binz	rt, lr;
+
+// return if halfword zero
+#define	BRHZ_RETURN(rt)			bihz	rt, lr;
+
+// return if halfword not zero
+#define BRHNZ_RETURN(rt)		bihnz	rt, lr;
+
+
+/*
+ * ----------------------------------------------------------------
+ * modulo like things for constant moduli that are powers of 2
+ * ----------------------------------------------------------------
+ */
+
+// rt = ra & (pow2 - 1)
+#define MODULO(rt, ra, pow2) \
+	andi	rt, ra, (pow2)-1;
+
+// rt = pow2 - (ra & (pow2 - 1))
+#define MODULO_NEG(rt, ra, pow2) \
+	andi	rt, ra, (pow2)-1;	      	\
+	sfi	rt, rt, (pow2);
+
+// rt = ra & -(pow2)
+#define	ROUND_DOWN(rt, ra, pow2) \
+	andi	rt, ra, -(pow2);
+
+// rt = (ra + (pow2 - 1)) & -(pow2)
+#define ROUND_UP(rt, ra, pow2) \
+	ai	rt, ra, (pow2)-1;	      	\
+	andi	rt, rt, -(pow2);
+
+/*
+ * ----------------------------------------------------------------
+ * Splat - replicate a particular slot into all slots
+ * Altivec analogs...
+ * ----------------------------------------------------------------
+ */
+
+// replicate byte from slot s [0,15]
+#define VSPLTB(rt, ra, s) \
+	ilh	_gc_t0, (s)*0x0101;	        \
+	shufb	rt, ra, ra, _gc_t0;
+
+// replicate halfword from slot s [0,7]
+#define	VSPLTH(rt, ra, s) \
+	ilh	_gc_t0, 2*(s)*0x0101 + 0x0001; 	\
+	shufb	rt, ra, ra, _gc_t0;
+
+// replicate word from slot s [0,3]
+#define VSPLTW(rt, ra, s) \
+	iluh	_gc_t0, 4*(s)*0x0101 + 0x0001;	\
+	iohl	_gc_t0, 4*(s)*0x0101 + 0x0203;	\
+	shufb	rt, ra, ra, _gc_t0;
+	
+// replicate double from slot s [0,1]
+#define	VSPLTD(rt, ra, s) \
+	/* sp is always 16-byte aligned */ \
+	cdd	_gc_t0, 8(sp);		/* 0x10111213 14151617 00010203 04050607 */ \
+	rotqbyi	rt, ra, ra, (s) << 3;	/* rotate double into preferred slot 	 */ \
+	shufb	rt, rt, rt, _gc_t0;
+
+/*
+ * ----------------------------------------------------------------
+ * lots of min/max variations...
+ *
+ * On a slot by slot basis, compute the min or max
+ *
+ * U - unsigned, else signed
+ * B,H,{} - byte, halfword, word
+ * F float
+ * ----------------------------------------------------------------
+ */
+
+#define MIN_SELB(rt, ra, rb, rc)	selb	rt, ra, rb, rc;
+#define MAX_SELB(rt, ra, rb, rc)	selb	rt, rb, ra, rc;
+	
+	// words
+
+#define MIN(rt, ra, rb) \
+	cgt	_gc_t0, ra, rb; \
+	MIN_SELB(rt, ra, rb, _gc_t0)
+	
+#define	MAX(rt, ra, rb) \
+	cgt	_gc_t0, ra, rb; \
+	MAX_SELB(rt, ra, rb, _gc_t0)
+
+#define UMIN(rt, ra, rb) \
+	clgt	_gc_t0, ra, rb; \
+	MIN_SELB(rt, ra, rb, _gc_t0)
+	
+#define	UMAX(rt, ra, rb) \
+	clgt	_gc_t0, ra, rb; \
+	MAX_SELB(rt, ra, rb, _gc_t0)
+
+	// bytes
+	
+#define MINB(rt, ra, rb) \
+	cgtb	_gc_t0, ra, rb; \
+	MIN_SELB(rt, ra, rb, _gc_t0)
+	
+#define	MAXB(rt, ra, rb) \
+	cgtb	_gc_t0, ra, rb; \
+	MAX_SELB(rt, ra, rb, _gc_t0)
+
+#define UMINB(rt, ra, rb) \
+	clgtb	_gc_t0, ra, rb; \
+	MIN_SELB(rt, ra, rb, _gc_t0)
+	
+#define	UMAXB(rt, ra, rb) \
+	clgtb	_gc_t0, ra, rb; \
+	MAX_SELB(rt, ra, rb, _gc_t0)
+
+	// halfwords
+	
+#define MINH(rt, ra, rb) \
+	cgth	_gc_t0, ra, rb; \
+	MIN_SELB(rt, ra, rb, _gc_t0)
+	
+#define	MAXH(rt, ra, rb) \
+	cgth	_gc_t0, ra, rb; \
+	MAX_SELB(rt, ra, rb, _gc_t0)
+
+#define UMINH(rt, ra, rb) \
+	clgth	_gc_t0, ra, rb; \
+	MIN_SELB(rt, ra, rb, _gc_t0)
+	
+#define	UMAXH(rt, ra, rb) \
+	clgth	_gc_t0, ra, rb; \
+	MAX_SELB(rt, ra, rb, _gc_t0)
+
+	// floats
+	
+#define FMIN(rt, ra, rb) \
+	fcgt	_gc_t0, ra, rb; \
+	MIN_SELB(rt, ra, rb, _gc_t0)
+	
+#define	FMAX(rt, ra, rb) \
+	fcgt	_gc_t0, ra, rb; \
+	MAX_SELB(rt, ra, rb, _gc_t0)
+
+// Ignoring the sign, select the values with the minimum magnitude
+#define FMINMAG(rt, ra, rb) \
+	fcmgt	_gc_t0, ra, rb; \
+	MIN_SELB(rt, ra, rb, _gc_t0)
+	
+// Ignoring the sign, select the values with the maximum magnitude
+#define	FMAXMAG(rt, ra, rb) \
+	fcmgt	_gc_t0, ra, rb; \
+	MAX_SELB(rt, ra, rb, _gc_t0)
+
+
+#endif /* INCLUDED_GC_SPU_MACS_H */
diff --git a/volk/spu_lib/spu_16s_cmpgt_unaligned.c b/volk/spu_lib/spu_16s_cmpgt_unaligned.c
new file mode 100644
index 000000000..765cacd9a
--- /dev/null
+++ b/volk/spu_lib/spu_16s_cmpgt_unaligned.c
@@ -0,0 +1,160 @@
+#include<spu_intrinsics.h>
+
+void* libvector_16s_cmpgt_unaligned(void* target, void* src, signed short val, unsigned int num_bytes){
+	//loop iterator i
+	int i = 0;
+	void* retval = target;
+	
+
+	//put the target and source addresses into qwords
+	vector unsigned int address_counter_tgt = {(unsigned int)target, 0, 0, 0};
+	vector unsigned int address_counter_src = {(unsigned int)src, 0, 0 ,0};
+	
+	//create shuffle masks
+	
+	//shuffle mask building blocks:
+	//all from the first vector
+	vector unsigned char oneup = {0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
+								  0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f};
+	//all from the second vector
+	vector unsigned char second_oneup = {0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17,
+										 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f};
+	
+
+	
+	//gamma: second half of the second, first half of the first, break at (unsigned int)src%16
+	vector unsigned char src_cmp = spu_splats((unsigned char)((unsigned int)src%16));
+	vector unsigned char gt_res = spu_cmpgt(oneup, src_cmp);
+	vector unsigned char eq_res = spu_cmpeq(oneup, src_cmp);
+	vector unsigned char cmp_res = spu_or(gt_res, eq_res);
+	vector unsigned char sixteen_uchar = spu_splats((unsigned char)16);
+	vector unsigned char phase_change = spu_and(sixteen_uchar, cmp_res);
+	vector unsigned int shuffle_mask_gamma = spu_add((vector unsigned int)phase_change, 
+												 (vector unsigned int)oneup);
+	shuffle_mask_gamma = spu_rlqwbyte(shuffle_mask_gamma, (unsigned int)src%16);
+
+	
+	
+
+	vector unsigned char tgt_second = spu_rlqwbyte(second_oneup, -((unsigned int)target%16));
+	vector unsigned char tgt_first = spu_rlqwbyte(oneup, -((unsigned int)target%16));
+	
+	//alpha: first half of first, second half of second, break at (unsigned int)target%16
+	src_cmp = spu_splats((unsigned char)((unsigned int)target%16));
+	gt_res = spu_cmpgt(oneup, src_cmp);
+	eq_res = spu_cmpeq(oneup, src_cmp);
+	cmp_res = spu_or(gt_res, eq_res);
+	phase_change = spu_and(sixteen_uchar, cmp_res);
+	vector unsigned int shuffle_mask_alpha = spu_add((vector unsigned int)phase_change,
+													 (vector unsigned int)oneup);
+	
+	//delta: first half of first, first half of second, break at (unsigned int)target%16
+	vector unsigned char shuffle_mask_delta = spu_shuffle(oneup, tgt_second, (vector unsigned char)shuffle_mask_alpha);
+	//epsilon: second half of second, second half of first, break at (unsigned int)target%16
+	vector unsigned char shuffle_mask_epsilon = spu_shuffle(tgt_second, oneup, (vector unsigned char)shuffle_mask_alpha);
+	//zeta: second half of second, first half of first, break at 16 - (unsigned int)target%16
+	vector unsigned int shuffle_mask_zeta = spu_rlqwbyte(shuffle_mask_alpha, (unsigned int)target%16); 
+
+	//beta: first half of first, second half of second, break at num_bytes%16
+	src_cmp = spu_splats((unsigned char)(num_bytes%16));
+	gt_res = spu_cmpgt(oneup, src_cmp);
+	eq_res = spu_cmpeq(oneup, src_cmp);
+	cmp_res = spu_or(gt_res, eq_res);
+	phase_change = spu_and(sixteen_uchar, cmp_res);
+	vector unsigned int shuffle_mask_beta = spu_add((vector unsigned int)phase_change,
+													 (vector unsigned int)oneup);
+	
+	
+	
+
+
+	
+	qword src_past;
+	qword src_present;
+	qword tgt_past;
+	qword tgt_present;
+	
+	qword in_temp;
+	qword out_temp0;
+	qword out_temp1;
+
+	src_past = si_lqd((qword)address_counter_src, 0);
+	tgt_past = si_lqd((qword)address_counter_tgt, 0);
+
+	vector signed short vec_val = spu_splats(val);
+	vector unsigned short compare;
+	vector unsigned short ones = {1, 1, 1, 1, 1, 1, 1, 1};
+	vector unsigned short after_and;
+	
+	for(i = 0; i < num_bytes/16; ++i) {
+		
+		src_present = si_lqd((qword)address_counter_src, 16);
+		tgt_present = si_lqd((qword)address_counter_tgt, 16);
+		
+		in_temp = spu_shuffle(src_present, src_past, (vector unsigned char)shuffle_mask_gamma);
+
+		compare = spu_cmpgt((vector signed short) in_temp, vec_val);
+		after_and = spu_and(compare, ones);
+		
+		
+		out_temp0 = spu_shuffle(tgt_past, (qword)after_and, shuffle_mask_delta);
+		out_temp1 = spu_shuffle(tgt_present, (qword)after_and, shuffle_mask_epsilon);
+
+		si_stqd(out_temp0, (qword)address_counter_tgt, 0);
+		si_stqd(out_temp1, (qword)address_counter_tgt, 16);
+		
+		tgt_past = out_temp1;
+		src_past = src_present;
+		address_counter_src = spu_add(address_counter_src, 16);
+		address_counter_tgt = spu_add(address_counter_tgt, 16);
+	
+
+	}
+	
+	src_present = si_lqd((qword)address_counter_src, 16);
+	tgt_present = si_lqd((qword)address_counter_tgt, 16);
+		
+	
+	in_temp = spu_shuffle(src_present, src_past,(vector unsigned char) shuffle_mask_gamma);
+	
+	compare = spu_cmpgt((vector signed short) in_temp, vec_val);
+	after_and = spu_and(compare, ones);
+		
+
+	qword target_temp = spu_shuffle(tgt_present, tgt_past, (vector unsigned char) shuffle_mask_zeta);
+	qword meld = spu_shuffle((qword)after_and, target_temp, (vector unsigned char)shuffle_mask_beta);
+
+	
+	
+	out_temp0 = spu_shuffle(tgt_past, meld, shuffle_mask_delta);
+	out_temp1 = spu_shuffle(tgt_present, meld, shuffle_mask_epsilon);
+	
+	si_stqd(out_temp0, (qword)address_counter_tgt, 0);
+	si_stqd(out_temp1, (qword)address_counter_tgt, 16);
+	
+	return retval;
+}
+
+
+
+/*
+int main(){
+
+	signed short pooh[48];
+	signed short bear[48];
+
+	int i = 0;
+	for(i = 0; i < 48; i += 2){
+		bear[i] = i;
+		bear[i + 1] = -i;
+	}
+
+	vector_gt_16bit(&pooh[0],&bear[0], 0, 48 * sizeof(signed short));
+
+	for(i = 0; i < 48; ++i) {
+		printf("%d, ", pooh[i]);
+	}
+	printf("\n");
+}
+*/	
+
diff --git a/volk/spu_lib/spu_16s_vector_subtract_unaligned.c b/volk/spu_lib/spu_16s_vector_subtract_unaligned.c
new file mode 100644
index 000000000..a3ce6c2fe
--- /dev/null
+++ b/volk/spu_lib/spu_16s_vector_subtract_unaligned.c
@@ -0,0 +1,178 @@
+#include<spu_intrinsics.h>
+
+void* libvector_16s_vector_subtract_unaligned(void* target,  void* src0, void* src1, unsigned int num_bytes){
+	//loop iterator i
+	int i = 0;
+	void* retval = target;
+	
+
+	//put the target and source addresses into qwords
+	vector unsigned int address_counter_tgt = {(unsigned int)target, 0, 0, 0};
+	vector unsigned int address_counter_src0 = {(unsigned int)src0, 0, 0 ,0};
+	vector unsigned int address_counter_src1 = {(unsigned int)src1, 0, 0, 0};
+	
+	//create shuffle masks
+	
+	//shuffle mask building blocks:
+	//all from the first vector
+	vector unsigned char oneup = {0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
+								  0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f};
+	//all from the second vector
+	vector unsigned char second_oneup = {0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17,
+										 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f};
+	
+
+	
+	//gamma: second half of the second, first half of the first, break at (unsigned int)src0%16
+	vector unsigned char src_cmp = spu_splats((unsigned char)((unsigned int)src0%16));
+	vector unsigned char gt_res = spu_cmpgt(oneup, src_cmp);
+	vector unsigned char eq_res = spu_cmpeq(oneup, src_cmp);
+	vector unsigned char cmp_res = spu_or(gt_res, eq_res);
+	vector unsigned char sixteen_uchar = spu_splats((unsigned char)16);
+	vector unsigned char phase_change = spu_and(sixteen_uchar, cmp_res);
+	vector unsigned int shuffle_mask_gamma = spu_add((vector unsigned int)phase_change, 
+												 (vector unsigned int)oneup);
+	shuffle_mask_gamma = spu_rlqwbyte(shuffle_mask_gamma, (unsigned int)src0%16);
+
+	//eta: second half of the second, first half of the first, break at (unsigned int)src1%16
+	src_cmp = spu_splats((unsigned char)((unsigned int)src1%16));
+	gt_res = spu_cmpgt(oneup, src_cmp);
+	eq_res = spu_cmpeq(oneup, src_cmp);
+	cmp_res = spu_or(gt_res, eq_res);
+	sixteen_uchar = spu_splats((unsigned char)16);
+	phase_change = spu_and(sixteen_uchar, cmp_res);
+	vector unsigned int shuffle_mask_eta = spu_add((vector unsigned int)phase_change, 
+												 (vector unsigned int)oneup);
+	shuffle_mask_eta = spu_rlqwbyte(shuffle_mask_eta, (unsigned int)src1%16);
+	
+
+	
+	
+
+	vector unsigned char tgt_second = spu_rlqwbyte(second_oneup, -((unsigned int)target%16));
+	vector unsigned char tgt_first = spu_rlqwbyte(oneup, -((unsigned int)target%16));
+	
+	//alpha: first half of first, second half of second, break at (unsigned int)target%16
+	src_cmp = spu_splats((unsigned char)((unsigned int)target%16));
+	gt_res = spu_cmpgt(oneup, src_cmp);
+	eq_res = spu_cmpeq(oneup, src_cmp);
+	cmp_res = spu_or(gt_res, eq_res);
+	phase_change = spu_and(sixteen_uchar, cmp_res);
+	vector unsigned int shuffle_mask_alpha = spu_add((vector unsigned int)phase_change,
+													 (vector unsigned int)oneup);
+	
+	//delta: first half of first, first half of second, break at (unsigned int)target%16
+	vector unsigned char shuffle_mask_delta = spu_shuffle(oneup, tgt_second, (vector unsigned char)shuffle_mask_alpha);
+	//epsilon: second half of second, second half of first, break at (unsigned int)target%16
+	vector unsigned char shuffle_mask_epsilon = spu_shuffle(tgt_second, oneup, (vector unsigned char)shuffle_mask_alpha);
+	//zeta: second half of second, first half of first, break at 16 - (unsigned int)target%16
+	vector unsigned int shuffle_mask_zeta = spu_rlqwbyte(shuffle_mask_alpha, (unsigned int)target%16); 
+
+	//beta: first half of first, second half of second, break at num_bytes%16
+	src_cmp = spu_splats((unsigned char)(num_bytes%16));
+	gt_res = spu_cmpgt(oneup, src_cmp);
+	eq_res = spu_cmpeq(oneup, src_cmp);
+	cmp_res = spu_or(gt_res, eq_res);
+	phase_change = spu_and(sixteen_uchar, cmp_res);
+	vector unsigned int shuffle_mask_beta = spu_add((vector unsigned int)phase_change,
+													 (vector unsigned int)oneup);
+	
+	
+	
+
+
+	
+	qword src0_past;
+	qword src0_present;
+	qword src1_past;
+	qword src1_present;
+	qword tgt_past;
+	qword tgt_present;
+	
+	qword in_temp0;
+	qword in_temp1;
+	qword out_temp0;
+	qword out_temp1;
+
+    vector signed short sum;
+
+	src0_past = si_lqd((qword)address_counter_src0, 0);
+	src1_past = si_lqd((qword)address_counter_src1, 0);
+	tgt_past = si_lqd((qword)address_counter_tgt, 0);
+	
+	for(i = 0; i < num_bytes/16; ++i) {
+		
+		src0_present = si_lqd((qword)address_counter_src0, 16);
+		src1_present = si_lqd((qword)address_counter_src1, 16);
+		tgt_present = si_lqd((qword)address_counter_tgt, 16);
+		
+		in_temp0 = spu_shuffle(src0_present, src0_past, (vector unsigned char)shuffle_mask_gamma);
+		in_temp1 = spu_shuffle(src1_present, src1_past, (vector unsigned char)shuffle_mask_eta);
+		
+		sum = spu_sub((vector signed short)in_temp0, (vector signed short)in_temp1);
+	   
+
+		out_temp0 = spu_shuffle(tgt_past, (qword)sum, shuffle_mask_delta);
+		out_temp1 = spu_shuffle(tgt_present, (qword)sum, shuffle_mask_epsilon);
+		
+		si_stqd(out_temp0, (qword)address_counter_tgt, 0);
+		si_stqd(out_temp1, (qword)address_counter_tgt, 16);
+		
+		tgt_past = out_temp1;
+		src0_past = src0_present;
+		src1_past = src1_present;
+		address_counter_src0 = spu_add(address_counter_src0, 16);
+		address_counter_src1 = spu_add(address_counter_src1, 16);
+		address_counter_tgt = spu_add(address_counter_tgt, 16);
+		
+		
+	}
+	
+	src0_present = si_lqd((qword)address_counter_src0, 16);
+	src1_present = si_lqd((qword)address_counter_src1, 16);
+	tgt_present = si_lqd((qword)address_counter_tgt, 16);
+	
+	
+	in_temp0 = spu_shuffle(src0_present, src0_past, (vector unsigned char) shuffle_mask_gamma);
+	in_temp1 = spu_shuffle(src1_present, src1_past, (vector unsigned char) shuffle_mask_eta);
+	sum = spu_sub((vector signed short)in_temp0, (vector signed short)in_temp1);
+	qword target_temp = spu_shuffle(tgt_present, tgt_past, (vector unsigned char) shuffle_mask_zeta);
+	qword meld = spu_shuffle((qword)sum, target_temp, (vector unsigned char)shuffle_mask_beta);
+
+	
+	
+	out_temp0 = spu_shuffle(tgt_past, meld, shuffle_mask_delta);
+	out_temp1 = spu_shuffle(tgt_present, meld, shuffle_mask_epsilon);
+	
+	si_stqd(out_temp0, (qword)address_counter_tgt, 0);
+	si_stqd(out_temp1, (qword)address_counter_tgt, 16);
+	
+	return retval;
+}
+
+
+
+/*
+int main(){
+
+	signed short pooh[48];
+	signed short bear[48];
+	signed short res[48];
+
+	int i = 0;
+	for(i = 0; i < 48; ++i){
+		pooh[i] = i;
+	}
+	for(i = 48; i < 96; ++i){
+		bear[i - 48] = i;
+	}
+
+	vector_subtract_16bit(res, &pooh[0], &bear[0], 48 * sizeof(signed short));
+
+	for(i = 0; i < 48; ++i) {
+		printf("%d, ", res[i]);
+	}
+	printf("\n");
+}
+*/
+
diff --git a/volk/spu_lib/spu_16s_vector_sum_unaligned.c b/volk/spu_lib/spu_16s_vector_sum_unaligned.c
new file mode 100644
index 000000000..5a1cb9aaf
--- /dev/null
+++ b/volk/spu_lib/spu_16s_vector_sum_unaligned.c
@@ -0,0 +1,178 @@
+#include<spu_intrinsics.h>
+
+void* libvector_16s_vector_sum_unaligned(void* target,  void* src0, void* src1, unsigned int num_bytes){
+	//loop iterator i
+	int i = 0;
+	void* retval = target;
+	
+
+	//put the target and source addresses into qwords
+	vector unsigned int address_counter_tgt = {(unsigned int)target, 0, 0, 0};
+	vector unsigned int address_counter_src0 = {(unsigned int)src0, 0, 0 ,0};
+	vector unsigned int address_counter_src1 = {(unsigned int)src1, 0, 0, 0};
+	
+	//create shuffle masks
+	
+	//shuffle mask building blocks:
+	//all from the first vector
+	vector unsigned char oneup = {0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
+								  0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f};
+	//all from the second vector
+	vector unsigned char second_oneup = {0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17,
+										 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f};
+	
+
+	
+	//gamma: second half of the second, first half of the first, break at (unsigned int)src0%16
+	vector unsigned char src_cmp = spu_splats((unsigned char)((unsigned int)src0%16));
+	vector unsigned char gt_res = spu_cmpgt(oneup, src_cmp);
+	vector unsigned char eq_res = spu_cmpeq(oneup, src_cmp);
+	vector unsigned char cmp_res = spu_or(gt_res, eq_res);
+	vector unsigned char sixteen_uchar = spu_splats((unsigned char)16);
+	vector unsigned char phase_change = spu_and(sixteen_uchar, cmp_res);
+	vector unsigned int shuffle_mask_gamma = spu_add((vector unsigned int)phase_change, 
+												 (vector unsigned int)oneup);
+	shuffle_mask_gamma = spu_rlqwbyte(shuffle_mask_gamma, (unsigned int)src0%16);
+
+	//eta: second half of the second, first half of the first, break at (unsigned int)src1%16
+	src_cmp = spu_splats((unsigned char)((unsigned int)src1%16));
+	gt_res = spu_cmpgt(oneup, src_cmp);
+	eq_res = spu_cmpeq(oneup, src_cmp);
+	cmp_res = spu_or(gt_res, eq_res);
+	sixteen_uchar = spu_splats((unsigned char)16);
+	phase_change = spu_and(sixteen_uchar, cmp_res);
+	vector unsigned int shuffle_mask_eta = spu_add((vector unsigned int)phase_change, 
+												 (vector unsigned int)oneup);
+	shuffle_mask_eta = spu_rlqwbyte(shuffle_mask_eta, (unsigned int)src1%16);
+	
+
+	
+	
+
+	vector unsigned char tgt_second = spu_rlqwbyte(second_oneup, -((unsigned int)target%16));
+	vector unsigned char tgt_first = spu_rlqwbyte(oneup, -((unsigned int)target%16));
+	
+	//alpha: first half of first, second half of second, break at (unsigned int)target%16
+	src_cmp = spu_splats((unsigned char)((unsigned int)target%16));
+	gt_res = spu_cmpgt(oneup, src_cmp);
+	eq_res = spu_cmpeq(oneup, src_cmp);
+	cmp_res = spu_or(gt_res, eq_res);
+	phase_change = spu_and(sixteen_uchar, cmp_res);
+	vector unsigned int shuffle_mask_alpha = spu_add((vector unsigned int)phase_change,
+													 (vector unsigned int)oneup);
+	
+	//delta: first half of first, first half of second, break at (unsigned int)target%16
+	vector unsigned char shuffle_mask_delta = spu_shuffle(oneup, tgt_second, (vector unsigned char)shuffle_mask_alpha);
+	//epsilon: second half of second, second half of first, break at (unsigned int)target%16
+	vector unsigned char shuffle_mask_epsilon = spu_shuffle(tgt_second, oneup, (vector unsigned char)shuffle_mask_alpha);
+	//zeta: second half of second, first half of first, break at 16 - (unsigned int)target%16
+	vector unsigned int shuffle_mask_zeta = spu_rlqwbyte(shuffle_mask_alpha, (unsigned int)target%16); 
+
+	//beta: first half of first, second half of second, break at num_bytes%16
+	src_cmp = spu_splats((unsigned char)(num_bytes%16));
+	gt_res = spu_cmpgt(oneup, src_cmp);
+	eq_res = spu_cmpeq(oneup, src_cmp);
+	cmp_res = spu_or(gt_res, eq_res);
+	phase_change = spu_and(sixteen_uchar, cmp_res);
+	vector unsigned int shuffle_mask_beta = spu_add((vector unsigned int)phase_change,
+													 (vector unsigned int)oneup);
+	
+	
+	
+
+
+	
+	qword src0_past;
+	qword src0_present;
+	qword src1_past;
+	qword src1_present;
+	qword tgt_past;
+	qword tgt_present;
+	
+	qword in_temp0;
+	qword in_temp1;
+	qword out_temp0;
+	qword out_temp1;
+
+    vector signed int sum;
+
+	src0_past = si_lqd((qword)address_counter_src0, 0);
+	src1_past = si_lqd((qword)address_counter_src1, 0);
+	tgt_past = si_lqd((qword)address_counter_tgt, 0);
+	
+	for(i = 0; i < num_bytes/16; ++i) {
+		
+		src0_present = si_lqd((qword)address_counter_src0, 16);
+		src1_present = si_lqd((qword)address_counter_src1, 16);
+		tgt_present = si_lqd((qword)address_counter_tgt, 16);
+		
+		in_temp0 = spu_shuffle(src0_present, src0_past, (vector unsigned char)shuffle_mask_gamma);
+		in_temp1 = spu_shuffle(src1_present, src1_past, (vector unsigned char)shuffle_mask_eta);
+		
+		sum = spu_add((vector signed int)in_temp0, (vector signed int)in_temp1);
+	   
+
+		out_temp0 = spu_shuffle(tgt_past, (qword)sum, shuffle_mask_delta);
+		out_temp1 = spu_shuffle(tgt_present, (qword)sum, shuffle_mask_epsilon);
+		
+		si_stqd(out_temp0, (qword)address_counter_tgt, 0);
+		si_stqd(out_temp1, (qword)address_counter_tgt, 16);
+		
+		tgt_past = out_temp1;
+		src0_past = src0_present;
+		src1_past = src1_present;
+		address_counter_src0 = spu_add(address_counter_src0, 16);
+		address_counter_src1 = spu_add(address_counter_src1, 16);
+		address_counter_tgt = spu_add(address_counter_tgt, 16);
+		
+		
+	}
+	
+	src0_present = si_lqd((qword)address_counter_src0, 16);
+	src1_present = si_lqd((qword)address_counter_src1, 16);
+	tgt_present = si_lqd((qword)address_counter_tgt, 16);
+	
+	
+	in_temp0 = spu_shuffle(src0_present, src0_past, (vector unsigned char) shuffle_mask_gamma);
+	in_temp1 = spu_shuffle(src1_present, src1_past, (vector unsigned char) shuffle_mask_eta);
+	sum = spu_add((vector signed int)in_temp0, (vector signed int)in_temp1);
+	qword target_temp = spu_shuffle(tgt_present, tgt_past, (vector unsigned char) shuffle_mask_zeta);
+	qword meld = spu_shuffle((qword)sum, target_temp, (vector unsigned char)shuffle_mask_beta);
+
+	
+	
+	out_temp0 = spu_shuffle(tgt_past, meld, shuffle_mask_delta);
+	out_temp1 = spu_shuffle(tgt_present, meld, shuffle_mask_epsilon);
+	
+	si_stqd(out_temp0, (qword)address_counter_tgt, 0);
+	si_stqd(out_temp1, (qword)address_counter_tgt, 16);
+	
+	return retval;
+}
+
+
+
+/*
+int main(){
+
+	signed short pooh[48];
+	signed short bear[48];
+	signed short res[48];
+
+	int i = 0;
+	for(i = 0; i < 48; ++i){
+		pooh[i] = i;
+	}
+	for(i = 48; i < 96; ++i){
+		bear[i - 48] = i;
+	}
+
+	vector_sum(&pooh[9], &pooh[9], &bear[3], 30);
+
+	for(i = 0; i < 48; ++i) {
+		printf("%d, ", pooh[i]);
+	}
+	printf("\n");
+}
+*/
+
diff --git a/volk/spu_lib/spu_32fc_pointwise_multiply_unaligned.c b/volk/spu_lib/spu_32fc_pointwise_multiply_unaligned.c
new file mode 100644
index 000000000..58fd4aa0c
--- /dev/null
+++ b/volk/spu_lib/spu_32fc_pointwise_multiply_unaligned.c
@@ -0,0 +1,222 @@
+#include<spu_intrinsics.h>
+
+
+
+
+void* libvector_pointwise_multiply_32fc_unaligned(void* target,  void* src0, void* src1, unsigned int num_bytes){
+	//loop iterator i
+	int i = 0;
+	void* retval = target;
+	
+
+	//put the target and source addresses into qwords
+	vector unsigned int address_counter_tgt = {(unsigned int)target, 0, 0, 0};
+	vector unsigned int address_counter_src0 = {(unsigned int)src0, 0, 0 ,0};
+	vector unsigned int address_counter_src1 = {(unsigned int)src1, 0, 0, 0};
+	
+	//create shuffle masks
+	
+	//shuffle mask building blocks:
+	//all from the first vector
+	vector unsigned char oneup = {0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
+								  0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f};
+	//all from the second vector
+	vector unsigned char second_oneup = {0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17,
+										 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f};
+	
+
+	
+	//gamma: second half of the second, first half of the first, break at (unsigned int)src0%16
+	vector unsigned char src_cmp = spu_splats((unsigned char)((unsigned int)src0%16));
+	vector unsigned char gt_res = spu_cmpgt(oneup, src_cmp);
+	vector unsigned char eq_res = spu_cmpeq(oneup, src_cmp);
+	vector unsigned char cmp_res = spu_or(gt_res, eq_res);
+	vector unsigned char sixteen_uchar = spu_splats((unsigned char)16);
+	vector unsigned char phase_change = spu_and(sixteen_uchar, cmp_res);
+	vector unsigned int shuffle_mask_gamma = spu_add((vector unsigned int)phase_change, 
+												 (vector unsigned int)oneup);
+	shuffle_mask_gamma = spu_rlqwbyte(shuffle_mask_gamma, (unsigned int)src0%16);
+
+	//eta: second half of the second, first half of the first, break at (unsigned int)src1%16
+	src_cmp = spu_splats((unsigned char)((unsigned int)src1%16));
+	gt_res = spu_cmpgt(oneup, src_cmp);
+	eq_res = spu_cmpeq(oneup, src_cmp);
+	cmp_res = spu_or(gt_res, eq_res);
+	sixteen_uchar = spu_splats((unsigned char)16);
+	phase_change = spu_and(sixteen_uchar, cmp_res);
+	vector unsigned int shuffle_mask_eta = spu_add((vector unsigned int)phase_change, 
+												 (vector unsigned int)oneup);
+	shuffle_mask_eta = spu_rlqwbyte(shuffle_mask_eta, (unsigned int)src1%16);
+	
+
+	
+	
+
+	vector unsigned char tgt_second = spu_rlqwbyte(second_oneup, -((unsigned int)target%16));
+	vector unsigned char tgt_first = spu_rlqwbyte(oneup, -((unsigned int)target%16));
+	
+	//alpha: first half of first, second half of second, break at (unsigned int)target%16
+	src_cmp = spu_splats((unsigned char)((unsigned int)target%16));
+	gt_res = spu_cmpgt(oneup, src_cmp);
+	eq_res = spu_cmpeq(oneup, src_cmp);
+	cmp_res = spu_or(gt_res, eq_res);
+	phase_change = spu_and(sixteen_uchar, cmp_res);
+	vector unsigned int shuffle_mask_alpha = spu_add((vector unsigned int)phase_change,
+													 (vector unsigned int)oneup);
+	
+	//delta: first half of first, first half of second, break at (unsigned int)target%16
+	vector unsigned char shuffle_mask_delta = spu_shuffle(oneup, tgt_second, (vector unsigned char)shuffle_mask_alpha);
+	//epsilon: second half of second, second half of first, break at (unsigned int)target%16
+	vector unsigned char shuffle_mask_epsilon = spu_shuffle(tgt_second, oneup, (vector unsigned char)shuffle_mask_alpha);
+	//zeta: second half of second, first half of first, break at 16 - (unsigned int)target%16
+	vector unsigned int shuffle_mask_zeta = spu_rlqwbyte(shuffle_mask_alpha, (unsigned int)target%16); 
+
+	//beta: first half of first, second half of second, break at num_bytes%16
+	src_cmp = spu_splats((unsigned char)(num_bytes%16));
+	gt_res = spu_cmpgt(oneup, src_cmp);
+	eq_res = spu_cmpeq(oneup, src_cmp);
+	cmp_res = spu_or(gt_res, eq_res);
+	phase_change = spu_and(sixteen_uchar, cmp_res);
+	vector unsigned int shuffle_mask_beta = spu_add((vector unsigned int)phase_change,
+													 (vector unsigned int)oneup);
+	
+	
+	
+
+
+	
+	qword src0_past;
+	qword src0_present;
+	qword src1_past;
+	qword src1_present;
+	qword tgt_past;
+	qword tgt_present;
+	
+	qword in_temp0;
+	qword in_temp1;
+	qword out_temp0;
+	qword out_temp1;
+
+
+	src0_past = si_lqd((qword)address_counter_src0, 0);
+	src1_past = si_lqd((qword)address_counter_src1, 0);
+	tgt_past = si_lqd((qword)address_counter_tgt, 0);
+	
+	vector unsigned char shuffle_mask_complexprod0 = {0x04, 0x05, 0x06, 0x07, 0x00, 0x01, 0x02, 0x03,
+													  0x0c, 0x0d, 0x0e, 0x0f, 0x08, 0x09, 0x0a, 0x0b};
+	vector unsigned char shuffle_mask_complexprod1 = {0x00, 0x01, 0x02, 0x03, 0x10, 0x11, 0x12, 0x13,
+													  0x08, 0x09, 0x0a, 0x0b, 0x18, 0x19, 0x1a, 0x1b};
+	vector unsigned char shuffle_mask_complexprod2 = {0x04, 0x05, 0x06, 0x07, 0x14, 0x15, 0x16, 0x17,
+													  0x0c, 0x0d, 0x0e, 0x0f, 0x1c, 0x1d, 0x1e, 0x1f};
+	vector unsigned char sign_changer = {0x00, 0x00, 0x00, 0x00, 0x80, 0x00, 0x00, 0x00,
+										 0x00, 0x00, 0x00, 0x00, 0x80, 0x00, 0x00, 0x00};
+	
+	vector float prod0;
+	qword shuf0;
+	vector float prod1;
+	vector float sign_change;
+	qword summand0;
+	qword summand1;
+	vector float sum;
+	
+
+	for(i = 0; i < num_bytes/16; ++i) {
+		
+		src0_present = si_lqd((qword)address_counter_src0, 16);
+		src1_present = si_lqd((qword)address_counter_src1, 16);
+		tgt_present = si_lqd((qword)address_counter_tgt, 16);
+		
+		in_temp0 = spu_shuffle(src0_present, src0_past, (vector unsigned char)shuffle_mask_gamma);
+		in_temp1 = spu_shuffle(src1_present, src1_past, (vector unsigned char)shuffle_mask_eta);
+		
+		prod0 = spu_mul((vector float)in_temp0, (vector float)in_temp1);
+		shuf0 = spu_shuffle((qword)in_temp1, (qword)in_temp1, shuffle_mask_complexprod0);
+		prod1 = spu_mul((vector float)in_temp0, (vector float)shuf0);
+		sign_change = spu_xor(prod0, (vector float)sign_changer);
+		
+		summand0 = spu_shuffle((qword)sign_change, (qword)prod1, shuffle_mask_complexprod1);
+			   
+		summand1 = spu_shuffle((qword)sign_change, (qword)prod1, shuffle_mask_complexprod2);
+		
+		sum = spu_add((vector float)summand0, (vector float)summand1);
+	   
+
+		out_temp0 = spu_shuffle(tgt_past, (qword)sum, shuffle_mask_delta);
+		out_temp1 = spu_shuffle(tgt_present, (qword)sum, shuffle_mask_epsilon);
+		
+		si_stqd(out_temp0, (qword)address_counter_tgt, 0);
+		si_stqd(out_temp1, (qword)address_counter_tgt, 16);
+		
+		tgt_past = out_temp1;
+		src0_past = src0_present;
+		src1_past = src1_present;
+		address_counter_src0 = spu_add(address_counter_src0, 16);
+		address_counter_src1 = spu_add(address_counter_src1, 16);
+		address_counter_tgt = spu_add(address_counter_tgt, 16);
+		
+		
+	}
+	
+	src0_present = si_lqd((qword)address_counter_src0, 16);
+	src1_present = si_lqd((qword)address_counter_src1, 16);
+	tgt_present = si_lqd((qword)address_counter_tgt, 16);
+	
+	
+	in_temp0 = spu_shuffle(src0_present, src0_past, (vector unsigned char) shuffle_mask_gamma);
+	in_temp1 = spu_shuffle(src1_present, src1_past, (vector unsigned char) shuffle_mask_eta);
+	
+	
+	prod0 = spu_mul((vector float)in_temp0, (vector float)in_temp1);
+	shuf0 = spu_shuffle((qword)in_temp1, (qword)in_temp1, shuffle_mask_complexprod0);
+	prod1 = spu_mul(prod0, (vector float)shuf0);
+	sign_change = spu_xor(prod0, (vector float)sign_changer);
+	summand0 = spu_shuffle((qword)sign_change, (qword)prod1, shuffle_mask_complexprod1);
+	summand1 = spu_shuffle((qword)sign_change, (qword)prod1, shuffle_mask_complexprod2);
+	sum = spu_add((vector float)summand0, (vector float)summand1);
+	
+	
+	
+	qword target_temp = spu_shuffle(tgt_present, tgt_past, (vector unsigned char) shuffle_mask_zeta);
+	qword meld = spu_shuffle((qword)sum, target_temp, (vector unsigned char)shuffle_mask_beta);
+	
+	
+	
+	out_temp0 = spu_shuffle(tgt_past, meld, shuffle_mask_delta);
+	out_temp1 = spu_shuffle(tgt_present, meld, shuffle_mask_epsilon);
+	
+	si_stqd(out_temp0, (qword)address_counter_tgt, 0);
+	si_stqd(out_temp1, (qword)address_counter_tgt, 16);
+	
+	return retval;
+}
+
+
+
+/*
+int main(){
+
+	float pooh[48];
+	float bear[48];
+	float res[48];
+
+	int i = 0;
+	for(i = 0; i < 48; ++i){
+		pooh[i] = (float) i;
+	}
+	for(i = 48; i < 96; ++i){
+		bear[i - 48] = (float) i;
+	}
+
+	vector_product_complex(res, pooh, bear, 48*sizeof(float));
+
+	
+
+	for(i = 0; i < 48; ++i) {
+		printf("%f, ", res[i]);
+	}
+	printf("\n");
+
+	
+}
+*/
+
diff --git a/volk/spu_lib/spu_memcpy_unaligned.c b/volk/spu_lib/spu_memcpy_unaligned.c
new file mode 100644
index 000000000..2a0dabcd7
--- /dev/null
+++ b/volk/spu_lib/spu_memcpy_unaligned.c
@@ -0,0 +1,290 @@
+#include<libvector/libvector_memcpy_unaligned.h
+#include<spu_intrinsics.h>
+
+void* libvector_memcpy_unaligned(void* target, void* src, unsigned int num_bytes){
+	//loop iterator i
+	int i = 0;
+	void* retval = target;
+	
+
+	//put the target and source addresses into qwords
+	vector unsigned int address_counter_tgt = {(unsigned int)target, 0, 0, 0};
+	vector unsigned int address_counter_src = {(unsigned int)src, 0, 0 ,0};
+	
+	//create shuffle masks
+	
+	//shuffle mask building blocks:
+	//all from the first vector
+	vector unsigned char oneup = {0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
+								  0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f};
+	//all from the second vector
+	vector unsigned char second_oneup = {0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17,
+										 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f};
+	
+
+	
+	//gamma: second half of the second, first half of the first, break at (unsigned int)src%16
+	vector unsigned char src_cmp = spu_splats((unsigned char)((unsigned int)src%16));
+	vector unsigned char gt_res = spu_cmpgt(oneup, src_cmp);
+	vector unsigned char eq_res = spu_cmpeq(oneup, src_cmp);
+	vector unsigned char cmp_res = spu_or(gt_res, eq_res);
+	vector unsigned char sixteen_uchar = spu_splats((unsigned char)16);
+	vector unsigned char phase_change = spu_and(sixteen_uchar, cmp_res);
+	vector unsigned int shuffle_mask_gamma = spu_add((vector unsigned int)phase_change, 
+												 (vector unsigned int)oneup);
+	shuffle_mask_gamma = spu_rlqwbyte(shuffle_mask_gamma, (unsigned int)src%16);
+
+	
+	
+
+	vector unsigned char tgt_second = spu_rlqwbyte(second_oneup, -((unsigned int)target%16));
+	vector unsigned char tgt_first = spu_rlqwbyte(oneup, -((unsigned int)target%16));
+	
+	//alpha: first half of first, second half of second, break at (unsigned int)target%16
+	src_cmp = spu_splats((unsigned char)((unsigned int)target%16));
+	gt_res = spu_cmpgt(oneup, src_cmp);
+	eq_res = spu_cmpeq(oneup, src_cmp);
+	cmp_res = spu_or(gt_res, eq_res);
+	phase_change = spu_and(sixteen_uchar, cmp_res);
+	vector unsigned int shuffle_mask_alpha = spu_add((vector unsigned int)phase_change,
+													 (vector unsigned int)oneup);
+	
+	//delta: first half of first, first half of second, break at (unsigned int)target%16
+	vector unsigned char shuffle_mask_delta = spu_shuffle(oneup, tgt_second, (vector unsigned char)shuffle_mask_alpha);
+	//epsilon: second half of second, second half of first, break at (unsigned int)target%16
+	vector unsigned char shuffle_mask_epsilon = spu_shuffle(tgt_second, oneup, (vector unsigned char)shuffle_mask_alpha);
+	//zeta: second half of second, first half of first, break at 16 - (unsigned int)target%16
+	vector unsigned int shuffle_mask_zeta = spu_rlqwbyte(shuffle_mask_alpha, (unsigned int)target%16); 
+
+	//beta: first half of first, second half of second, break at num_bytes%16
+	src_cmp = spu_splats((unsigned char)(num_bytes%16));
+	gt_res = spu_cmpgt(oneup, src_cmp);
+	eq_res = spu_cmpeq(oneup, src_cmp);
+	cmp_res = spu_or(gt_res, eq_res);
+	phase_change = spu_and(sixteen_uchar, cmp_res);
+	vector unsigned int shuffle_mask_beta = spu_add((vector unsigned int)phase_change,
+													 (vector unsigned int)oneup);
+	
+	
+	
+
+
+	
+	qword src_past;
+	qword src_present;
+	qword tgt_past;
+	qword tgt_present;
+	
+	qword in_temp;
+	qword out_temp0;
+	qword out_temp1;
+
+	src_past = si_lqd((qword)address_counter_src, 0);
+	tgt_past = si_lqd((qword)address_counter_tgt, 0);
+	
+	for(i = 0; i < num_bytes/16; ++i) {
+		
+		src_present = si_lqd((qword)address_counter_src, 16);
+		tgt_present = si_lqd((qword)address_counter_tgt, 16);
+		
+		in_temp = spu_shuffle(src_present, src_past, (vector unsigned char)shuffle_mask_gamma);
+		
+		out_temp0 = spu_shuffle(tgt_past, in_temp, shuffle_mask_delta);
+		out_temp1 = spu_shuffle(tgt_present, in_temp, shuffle_mask_epsilon);
+
+		si_stqd(out_temp0, (qword)address_counter_tgt, 0);
+		si_stqd(out_temp1, (qword)address_counter_tgt, 16);
+		
+		tgt_past = out_temp1;
+		src_past = src_present;
+		address_counter_src = spu_add(address_counter_src, 16);
+		address_counter_tgt = spu_add(address_counter_tgt, 16);
+	
+
+	}
+	
+	src_present = si_lqd((qword)address_counter_src, 16);
+	tgt_present = si_lqd((qword)address_counter_tgt, 16);
+		
+	
+	in_temp = spu_shuffle(src_present, src_past,(vector unsigned char) shuffle_mask_gamma);
+	qword target_temp = spu_shuffle(tgt_present, tgt_past, (vector unsigned char) shuffle_mask_zeta);
+	qword meld = spu_shuffle(in_temp, target_temp, (vector unsigned char)shuffle_mask_beta);
+
+	
+	
+	out_temp0 = spu_shuffle(tgt_past, meld, shuffle_mask_delta);
+	out_temp1 = spu_shuffle(tgt_present, meld, shuffle_mask_epsilon);
+	
+	si_stqd(out_temp0, (qword)address_counter_tgt, 0);
+	si_stqd(out_temp1, (qword)address_counter_tgt, 16);
+	
+	return retval;
+}
+
+
+
+/*
+void* mcpy(void* target, void* src, size_t num_bytes){
+	//loop iterator i
+	int i = 0;
+	void* retval = src;
+
+	//put the target and source addresses into qwords
+	vector unsigned int address_counter_tgt = {(unsigned int)target, 0, 0, 0};
+	vector unsigned int address_counter_src = {(unsigned int)src, 0, 0 ,0};
+	
+	//create shuffle masks
+	
+	//shuffle mask building blocks:
+	//all from the first vector
+	vector unsigned char oneup = {0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
+								  0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f};
+	//all from the second vector
+	vector unsigned char second_oneup = {0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17,
+										 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f};
+	
+
+	
+	//gamma: second half of the second, first half of the first, break at src%16
+	vector unsigned char src_cmp = spu_splats((unsigned char)(src%16));
+	vector unsigned char gt_res = spu_cmpgt(oneup, src_cmp);
+	vector unsigned char eq_res = spu_cmpeq(oneup, src_cmp);
+	vector unsigned char cmp_res = spu_or(gt_res, eq_res);
+	vector unsigned char sixteen_uchar = spu_splats((unsigned char)16);
+	vector unsigned char phase_change = spu_and(sixteen_uchar, cmp_res);
+	vector unsigned int shuffle_mask_gamma = spu_add((vector unsigned int)phase_change, 
+												 (vector unsigned int)oneup);
+	shuffle_mask_gamma = spu_rlqwbyte(shuffle_mask_gamma, src%16);
+
+	
+	
+
+	vector unsigned char tgt_second = spu_rlqwbyte(second_oneup, -(target%16));
+	vector unsigned char tgt_first = spu_rlqwbyte(oneup, -(target%16));
+	
+	//alpha: first half of first, second half of second, break at target%16
+	src_cmp = spu_splats((unsigned char)(target%16));
+	gt_res = spu_cmpgt(oneup, src_cmp);
+	eq_res = spu_cmpeq(oneup, src_cmp);
+	cmp_res = spu_or(gt_res, eq_res);
+	phase_change = spu_and(sixteen_uchar, cmp_res);
+	vector unsigned int shuffle_mask_alpha = spu_add((vector unsigned int)phase_change,
+													 (vector unsigned int)oneup);
+	
+	//delta: first half of first, first half of second, break at target%16
+	vector unsigned char shuffle_mask_delta = spu_shuffle(oneup, tgt_second, (vector unsigned char)shuffle_mask_alpha);
+	//epsilon: second half of second, second half of first, break at target%16
+	vector unsigned char shuffle_mask_epsilon = spu_shuffle(tgt_second, oneup, (vector unsigned char)shuffle_mask_alpha);
+	//zeta: second half of second, first half of first, break at 16 - target%16
+	vector unsigned int shuffle_mask_zeta = spu_rlqwbyte(shuffle_mask_alpha, target%16); 
+
+	//beta: first half of first, second half of second, break at num_bytes%16
+	src_cmp = spu_splats((unsigned char)(num_bytes%16));
+	gt_res = spu_cmpgt(oneup, src_cmp);
+	eq_res = spu_cmpeq(oneup, src_cmp);
+	cmp_res = spu_or(gt_res, eq_res);
+	phase_change = spu_and(sixteen_uchar, cmp_res);
+	vector unsigned int shuffle_mask_beta = spu_add((vector unsigned int)phase_change,
+													 (vector unsigned int)oneup);
+	
+	
+	  printf("num_bytesmod16 %d\n", num_bytes%16);
+	printf("beta %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d\n", 
+		   spu_extract((vector unsigned char) shuffle_mask_beta, 0),
+		   spu_extract((vector unsigned char) shuffle_mask_beta, 1),
+		   spu_extract((vector unsigned char) shuffle_mask_beta, 2),
+		   spu_extract((vector unsigned char) shuffle_mask_beta, 3),
+		   spu_extract((vector unsigned char) shuffle_mask_beta, 4),
+		   spu_extract((vector unsigned char) shuffle_mask_beta, 5),
+		   spu_extract((vector unsigned char) shuffle_mask_beta, 6),
+		   spu_extract((vector unsigned char) shuffle_mask_beta, 7),
+		   spu_extract((vector unsigned char) shuffle_mask_beta, 8),
+		   spu_extract((vector unsigned char) shuffle_mask_beta, 9),
+		   spu_extract((vector unsigned char) shuffle_mask_beta, 10),
+		   spu_extract((vector unsigned char) shuffle_mask_beta, 11),
+		   spu_extract((vector unsigned char) shuffle_mask_beta, 12),
+		   spu_extract((vector unsigned char) shuffle_mask_beta, 13),
+		   spu_extract((vector unsigned char) shuffle_mask_beta, 14),
+		   spu_extract((vector unsigned char) shuffle_mask_beta, 15));
+	
+
+	
+	
+
+
+	
+	qword src_past;
+	qword src_present;
+	qword tgt_past;
+	qword tgt_present;
+	
+	qword in_temp;
+	qword out_temp0;
+	qword out_temp1;
+
+	src_past = si_lqd((qword)address_counter_src, 0);
+	tgt_past = si_lqd((qword)address_counter_tgt, 0);
+	
+	for(i = 0; i < num_bytes/16; ++i) {
+		
+		src_present = si_lqd((qword)address_counter_src, 16);
+		tgt_present = si_lqd((qword)address_counter_tgt, 16);
+		
+		in_temp = spu_shuffle(src_present, src_past, (vector unsigned char)shuffle_mask_gamma);
+		
+		out_temp0 = spu_shuffle(tgt_past, in_temp, shuffle_mask_delta);
+		out_temp1 = spu_shuffle(tgt_present, in_temp, shuffle_mask_epsilon);
+
+		si_stqd(out_temp0, (qword)address_counter_tgt, 0);
+		si_stqd(out_temp1, (qword)address_counter_tgt, 16);
+		
+		tgt_past = out_temp1;
+		src_past = src_present;
+		address_counter_src = spu_add(address_counter_src, 16);
+		address_counter_tgt = spu_add(address_counter_tgt, 16);
+	
+
+	}
+	
+	src_present = si_lqd((qword)address_counter_src, 16);
+	tgt_present = si_lqd((qword)address_counter_tgt, 16);
+		
+	
+	in_temp = spu_shuffle(src_present, src_past,(vector unsigned char) shuffle_mask_gamma);
+	qword target_temp = spu_shuffle(tgt_present, tgt_past, (vector unsigned char) shuffle_mask_zeta);
+	qword meld = spu_shuffle(in_temp, target_temp, (vector unsigned char)shuffle_mask_beta);
+
+	
+	
+	out_temp0 = spu_shuffle(tgt_past, meld, shuffle_mask_delta);
+	out_temp1 = spu_shuffle(tgt_present, meld, shuffle_mask_epsilon);
+	
+	si_stqd(out_temp0, (qword)address_counter_tgt, 0);
+	si_stqd(out_temp1, (qword)address_counter_tgt, 16);
+
+	return retval;
+	   
+}
+*/
+/*
+int main(){
+
+	unsigned char pooh[48];
+	unsigned char bear[48];
+
+	int i = 0;
+	for(i = 0; i < 48; ++i){
+		pooh[i] = i;
+		bear[i] = i;
+	}
+
+	spu_mcpy(&pooh[9],&bear[3], 15);
+
+	for(i = 0; i < 48; ++i) {
+		printf("%d, ", pooh[i]);
+	}
+	printf("\n");
+}
+	
+*/
diff --git a/volk/spu_lib/spu_memset_unaligned.S b/volk/spu_lib/spu_memset_unaligned.S
new file mode 100644
index 000000000..a655c4c52
--- /dev/null
+++ b/volk/spu_lib/spu_memset_unaligned.S
@@ -0,0 +1,185 @@
+/* -*- asm -*- */
+/*
+ * Copyright 2008 Free Software Foundation, Inc.
+ * 
+ * This file is part of GNU Radio
+ * 
+ * GNU Radio is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 3, or (at your option)
+ * any later version.
+ * 
+ * GNU Radio is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ * 
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write to the Free Software Foundation, Inc.,
+ * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ */
+
+#include "gc_spu_macs.h"
+
+	.file "spu_memset_unaligned.S"
+
+	/*
+	 * Computes this, only a lot faster...
+	 *
+	 *	void *
+	 *	libvector_memset_unaligned(void *pv, int c, size_t n)
+	 *	{
+	 *	  unsigned char *p = (unsigned char *) pv;
+	 *	  size_t i;
+	 *	  for (i = 0; i < n; i++)
+	 *	    p[i] = c;
+	 *	
+	 *	  return pv;
+	 *	}
+	 */
+	
+#define	p_arg	arg1	// we're going to clobber arg1 w/ the return value
+#define	c	arg2	// the constant we're writing
+#define	n	arg3	// how many bytes to write
+
+#define	p	r13	// where we're writing
+#define	t0	r14
+#define t1	r15
+#define	mask	r16
+#define	old	r17
+#define an	r18	// aligned n (n rounded down to mod 16 boundary)
+#define	next_p	r19
+#define	cond1	r20
+#define	cond2	r21				
+#define m	r22
+#define r	r23
+	
+	PROC_ENTRY(libvector_memset_unaligned)
+	
+	// Hint the return from do_head, in case we go that way.
+	// There's pretty much nothing to can do to hint the branch to it.
+	hbrr	do_head_br, head_complete
+	
+	MR(p, p_arg)	// leaves p, the return value, in the correct reg (r3)
+	BRZ_RETURN(n)
+
+	MODULO(t0, p, 16)	// is p%16 == 0?
+	VSPLTB(c, c, 3)		// splat byte in preferred slot of c into all slots
+	brnz	t0, do_head	// no, handle it
+head_complete:
+
+	/*
+	 * preconditions:	
+	 *   p%16 == 0, n > 0
+	 */
+	hbrr	middle_loop_br, middle_loop
+	
+	ROUND_DOWN(an, n, 16)	// an is "aligned n"
+	MODULO(n, n, 16)	// what's left over in the last quad
+	brz	an, do_tail	// no whole quad words; skip to tail
+	clgti	t0, an, 127	// an >= 128?
+	brz	t0, middle2	// nope, go handle the cases between 0 and 112
+
+	/*
+	 * 128 bytes / iteration
+	 */
+	.p2align 4
+middle_loop:
+	ai	an, an, -128
+	  stqd	c,  0*16(p)
+	ai	next_p, p, 128
+	  stqd	c,  1*16(p)
+	cgti	cond1, an, 127
+	  stqd	c,  2*16(p)
+
+	  stqd	c,  3*16(p)
+	  stqd	c,  4*16(p)
+	  stqd	c,  5*16(p)
+	  stqd	c,  6*16(p)
+	
+	MR(p, next_p)
+	  stqd	c,  7*16-128(next_p)
+	or	cond2, n, an
+middle_loop_br:
+	  brnz	cond1, middle_loop
+	
+	/*
+	 * if an and n are both zero, return now 
+	 */
+	BRZ_RETURN(cond2)
+
+	/*
+	 * otherwise handle last of full quad words 
+	 *
+	 *   0 <= an < 128, p%16 == 0
+	 */
+middle2:
+	/*
+	 * if an == 0, go handle the final non-full quadword
+	 */
+	brz	an, do_tail
+	hbrr	middle2_loop_br, middle2_loop
+	
+	.p2align 3
+middle2_loop:	
+	ai	next_p, p, 16
+	  stqd	c, 0(p)
+	ai	an, an, -16
+	  LMR(p, next_p)
+middle2_loop_br:
+	  brnz	an, middle2_loop
+	
+	/* We're done with the full quadwords. */
+	
+	/*
+	 * Handle the final partial quadword.
+	 * We'll be modifying only the left hand portion of the quad.
+	 *
+	 * preconditions:
+	 *   an == 0, 0 <= n < 16, p%16 == 0
+	 */
+do_tail:
+	HINT_RETURN(do_tail_ret)
+	il	mask, -1
+	sfi	t1, n, 16		// t1 = 16 - n
+	lqd	old, 0(p)
+	shlqby  mask, mask, t1
+	selb	t0, old, c, mask
+	stqd	t0, 0(p)
+do_tail_ret:	
+	RETURN()
+
+	/*
+	 * ----------------------------------------------------------------
+	 * Handle the first partial quadword
+	 *
+	 * preconditions:
+	 *   p%16 != 0
+	 *
+         * postconditions:
+         *   p%16 == 0 or n == 0
+         *
+         *        |-- m --|
+         *     +----------------+----------------+
+         *     |  ////////      |                |
+         *     +----------------+----------------+
+         *        |----- r -----|
+         *        p
+         * ----------------------------------------------------------------
+	 */
+do_head:
+	lqd	old, 0(p)
+	MODULO_NEG(r, p, 16)
+	il	mask, -1
+	UMIN(m, r, n)
+	shlqby	mask, mask, m	// 1's in the top, m*8 0's in the bottom
+	MR(t1, p)
+	sf	t0, m, r	// t0 = r - m
+	a	p, p, m		// p += m
+	rotqby	mask, mask, t0	// rotate 0's to the right place	
+	sf	n, m, n		// n -= m
+	selb	t0, c, old, mask // merge
+	stqd	t0, 0(t1)
+	BRZ_RETURN(n)
+do_head_br:
+	br	head_complete
diff --git a/volk/system_cleanup.sh b/volk/system_cleanup.sh
new file mode 100755
index 000000000..7ebd2caf8
--- /dev/null
+++ b/volk/system_cleanup.sh
@@ -0,0 +1,29 @@
+#!/bin/bash
+
+cp volk_config.h include/volk/
+cd lib
+case $1 in
+    (x86)
+    
+    case $2 in
+	(x86_64)
+	gcc -o volk_mktables -I../include -I. volk_cpu_x86.c cpuid_x86_64.S volk_rank_archs.c volk_mktables.c
+	./volk_mktables
+	;;
+	(*)
+	gcc -o volk_mktables -I../include -I. volk_cpu_x86.c cpuid_x86.S volk_rank_archs.c volk_mktables.c
+	./volk_mktables
+	;;
+    esac
+    ;;
+    (powerpc)
+    gcc -o volk_mktables -I../include -I. volk_cpu_powerpc.c volk_rank_archs.c volk_mktables.c
+    ./volk_mktables
+    ;;
+    (*)
+    gcc -o volk_mktables -I../include -I. volk_cpu_generic.c volk_rank_archs.c volk_mktables.c
+    ./volk_mktables
+    ;;
+esac
+mv volk_tables.h ../include/volk/
+
diff --git a/volk/volk.pc.in b/volk/volk.pc.in
new file mode 100644
index 000000000..a24298856
--- /dev/null
+++ b/volk/volk.pc.in
@@ -0,0 +1,15 @@
+prefix=@prefix@
+exec_prefix=@exec_prefix@
+libdir=@libdir@
+includedir=@includedir@
+LV_CXXFLAGS=@LV_CXXFLAGS@
+
+
+
+Name: volk
+Description: VOLK.. Vector Optimized Library of Kernels
+Requires:
+Version: @VERSION@
+Libs: -lvolk -lvolk_runtime
+Cflags: -I${includedir} ${LV_CXXFLAGS}
+