From 239144659b29c0a5ecd83a34e0e57387a1060ed7 Mon Sep 17 00:00:00 2001
From: Tom Rondeau
Date: Tue, 7 Dec 2010 18:50:28 -0500
Subject: Initial checkin for VOLK - Vector-Optimized Library of Kernels.

This is a new SIMD library. It currently stands by itself under the GNU Radio
tree and can be used separately. We will integrate the build process into
GNU Radio and start building off of its functionality over time.
---
 volk/spu_lib/gc_spu_macs.h                       | 380 +++++++++++++++++++++
 volk/spu_lib/spu_16s_cmpgt_unaligned.c           | 160 +++++++++
 volk/spu_lib/spu_16s_vector_subtract_unaligned.c | 178 ++++++++++
 volk/spu_lib/spu_16s_vector_sum_unaligned.c      | 178 ++++++++++
 .../spu_32fc_pointwise_multiply_unaligned.c      | 222 ++++++++++++
 volk/spu_lib/spu_memcpy_unaligned.c              | 290 ++++++++++++++++
 volk/spu_lib/spu_memset_unaligned.S              | 185 ++++++++++
 7 files changed, 1593 insertions(+)
 create mode 100644 volk/spu_lib/gc_spu_macs.h
 create mode 100644 volk/spu_lib/spu_16s_cmpgt_unaligned.c
 create mode 100644 volk/spu_lib/spu_16s_vector_subtract_unaligned.c
 create mode 100644 volk/spu_lib/spu_16s_vector_sum_unaligned.c
 create mode 100644 volk/spu_lib/spu_32fc_pointwise_multiply_unaligned.c
 create mode 100644 volk/spu_lib/spu_memcpy_unaligned.c
 create mode 100644 volk/spu_lib/spu_memset_unaligned.S

diff --git a/volk/spu_lib/gc_spu_macs.h b/volk/spu_lib/gc_spu_macs.h
new file mode 100644
index 000000000..8e3e3f2a6
--- /dev/null
+++ b/volk/spu_lib/gc_spu_macs.h
@@ -0,0 +1,380 @@
+/* -*- asm -*- */
+/*
+ * Copyright 2008 Free Software Foundation, Inc.
+ *
+ * This file is part of GNU Radio
+ *
+ * GNU Radio is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 3, or (at your option)
+ * any later version.
+ *
+ * GNU Radio is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write to the Free Software Foundation, Inc.,
+ * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ */
+
+#ifndef INCLUDED_GC_SPU_MACS_H
+#define INCLUDED_GC_SPU_MACS_H
+
+/*
+ * This file contains a set of macros that are generally useful when
+ * coding in SPU assembler.
+ *
+ * Note that the multi-instruction macros in here may overwrite
+ * registers 77, 78, and 79 without warning.
+ */
+
+/*
+ * defines for all registers
+ */
+#define r0 $0
+#define r1 $1
+#define r2 $2
+#define r3 $3
+#define r4 $4
+#define r5 $5
+#define r6 $6
+#define r7 $7
+#define r8 $8
+#define r9 $9
+#define r10 $10
+#define r11 $11
+#define r12 $12
+#define r13 $13
+#define r14 $14
+#define r15 $15
+#define r16 $16
+#define r17 $17
+#define r18 $18
+#define r19 $19
+#define r20 $20
+#define r21 $21
+#define r22 $22
+#define r23 $23
+#define r24 $24
+#define r25 $25
+#define r26 $26
+#define r27 $27
+#define r28 $28
+#define r29 $29
+#define r30 $30
+#define r31 $31
+#define r32 $32
+#define r33 $33
+#define r34 $34
+#define r35 $35
+#define r36 $36
+#define r37 $37
+#define r38 $38
+#define r39 $39
+#define r40 $40
+#define r41 $41
+#define r42 $42
+#define r43 $43
+#define r44 $44
+#define r45 $45
+#define r46 $46
+#define r47 $47
+#define r48 $48
+#define r49 $49
+#define r50 $50
+#define r51 $51
+#define r52 $52
+#define r53 $53
+#define r54 $54
+#define r55 $55
+#define r56 $56
+#define r57 $57
+#define r58 $58
+#define r59 $59
+#define r60 $60
+#define r61 $61
+#define r62 $62
+#define r63 $63
+#define r64 $64
+#define r65 $65
+#define r66 $66
+#define r67 $67
+#define r68 $68
+#define r69 $69
+#define r70 $70
+#define r71 $71
+#define r72 $72
+#define r73 $73
+#define r74 $74
+#define r75 $75
+#define r76 $76
+#define r77 $77
+#define r78 $78
+#define r79 $79
+#define r80 $80
+#define r81 $81
+#define r82 $82
+#define r83 $83
+#define r84 $84
+#define r85 $85
+#define r86 $86
+#define r87 $87
+#define r88 $88
+#define r89 $89
+#define r90 $90
+#define r91 $91
+#define r92 $92
+#define r93 $93
+#define r94 $94
+#define r95 $95
+#define r96 $96
+#define r97 $97
+#define r98 $98
+#define r99 $99
+#define r100 $100
+#define r101 $101
+#define r102 $102
+#define r103 $103
+#define r104 $104
+#define r105 $105
+#define r106 $106
+#define r107 $107
+#define r108 $108
+#define r109 $109
+#define r110 $110
+#define r111 $111
+#define r112 $112
+#define r113 $113
+#define r114 $114
+#define r115 $115
+#define r116 $116
+#define r117 $117
+#define r118 $118
+#define r119 $119
+#define r120 $120
+#define r121 $121
+#define r122 $122
+#define r123 $123
+#define r124 $124
+#define r125 $125
+#define r126 $126
+#define r127 $127
+
+
+#define lr     r0   // link register
+#define sp     r1   // stack pointer
+                    // r2 is environment pointer for langs that need it (ALGOL)
+
+#define retval r3   // return values are passed in regs starting at r3
+
+#define arg1   r3   // args are passed in regs starting at r3
+#define arg2   r4
+#define arg3   r5
+#define arg4   r6
+#define arg5   r7
+#define arg6   r8
+#define arg7   r9
+#define arg8   r10
+#define arg9   r11
+#define arg10  r12
+
+// r3  - r74  are volatile (caller saves)
+// r74 - r79  are volatile (scratch regs possibly destroyed by fct prolog/epilog)
+// r80 - r127 are non-volatile (callee-saves)
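+
+/*
+ * Illustration only (hypothetical routine, not part of this header): a
+ * minimal leaf function built from the conventions and macros defined
+ * here.
+ *
+ *     PROC_ENTRY(my_max_words)     // first arg and return value share r3
+ *         MAX(retval, arg1, arg2)  // slot-wise signed max of the two args
+ *         RETURN()                 // branch back through the link register
+ */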
+
+// scratch registers reserved for use by the macros in this file.
+
+#define _gc_t0 r79
+#define _gc_t1 r78
+#define _gc_t2 r77
+
+/*
+ * ----------------------------------------------------------------
+ * pseudo ops
+ * ----------------------------------------------------------------
+ */
+#define PROC_ENTRY(name) \
+    .text; \
+    .p2align 4; \
+    .global name; \
+    .type name, @function; \
+name:
+
+/*
+ * ----------------------------------------------------------------
+ * aliases for common operations
+ * ----------------------------------------------------------------
+ */
+
+// Move register (even pipe, 2 cycles)
+#define MR(rt, ra)      or rt, ra, ra;
+
+// Move register (odd pipe, 4 cycles)
+#define LMR(rt, ra)     rotqbyi rt, ra, 0;
+
+// return
+#define RETURN()        bi lr;
+
+// hint for a return
+#define HINT_RETURN(ret_label)  hbr ret_label, lr;
+
+// return if zero
+#define BRZ_RETURN(rt)  biz rt, lr;
+
+// return if not zero
+#define BRNZ_RETURN(rt) binz rt, lr;
+
+// return if halfword zero
+#define BRHZ_RETURN(rt) bihz rt, lr;
+
+// return if halfword not zero
+#define BRHNZ_RETURN(rt) bihnz rt, lr;
+
+
+/*
+ * ----------------------------------------------------------------
+ * modulo like things for constant moduli that are powers of 2
+ * ----------------------------------------------------------------
+ */
+
+// rt = ra & (pow2 - 1)
+#define MODULO(rt, ra, pow2) \
+    andi rt, ra, (pow2)-1;
+
+// rt = pow2 - (ra & (pow2 - 1))
+#define MODULO_NEG(rt, ra, pow2) \
+    andi rt, ra, (pow2)-1; \
+    sfi  rt, rt, (pow2);
+
+// rt = ra & -(pow2)
+#define ROUND_DOWN(rt, ra, pow2) \
+    andi rt, ra, -(pow2);
+
+// rt = (ra + (pow2 - 1)) & -(pow2)
+#define ROUND_UP(rt, ra, pow2) \
+    ai   rt, ra, (pow2)-1; \
+    andi rt, rt, -(pow2);
+
+/*
+ * ----------------------------------------------------------------
+ * Splat - replicate a particular slot into all slots
+ * Altivec analogs...
+ * ----------------------------------------------------------------
+ */
+
+// replicate byte from slot s [0,15]
+#define VSPLTB(rt, ra, s) \
+    ilh   _gc_t0, (s)*0x0101; \
+    shufb rt, ra, ra, _gc_t0;
+
+// replicate halfword from slot s [0,7]
+#define VSPLTH(rt, ra, s) \
+    ilh   _gc_t0, 2*(s)*0x0101 + 0x0001; \
+    shufb rt, ra, ra, _gc_t0;
+
+// replicate word from slot s [0,3]
+#define VSPLTW(rt, ra, s) \
+    ilhu  _gc_t0, 4*(s)*0x0101 + 0x0001; \
+    iohl  _gc_t0, 4*(s)*0x0101 + 0x0203; \
+    shufb rt, ra, ra, _gc_t0;
+
+// replicate double from slot s [0,1]
+#define VSPLTD(rt, ra, s) \
+    /* sp is always 16-byte aligned */ \
+    cdd     _gc_t0, 8(sp);    /* 0x10111213 14151617 00010203 04050607 */ \
+    rotqbyi rt, ra, (s) << 3; /* rotate double into preferred slot */ \
+    shufb   rt, rt, rt, _gc_t0;
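+
+/*
+ * For reference, scalar C equivalents of the power-of-2 helpers above
+ * (a sketch assuming pow2 is a power of two; these functions are
+ * illustrative and not part of this header):
+ *
+ *     unsigned modulo(unsigned a, unsigned pow2)     { return a & (pow2 - 1); }
+ *     unsigned modulo_neg(unsigned a, unsigned pow2) { return pow2 - (a & (pow2 - 1)); }
+ *     unsigned round_down(unsigned a, unsigned pow2) { return a & -pow2; }
+ *     unsigned round_up(unsigned a, unsigned pow2)   { return (a + pow2 - 1) & -pow2; }
+ */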
+
+/*
+ * ----------------------------------------------------------------
+ * lots of min/max variations...
+ *
+ * On a slot-by-slot basis, compute the min or max
+ *
+ *   U      - unsigned, else signed
+ *   B,H,{} - byte, halfword, word
+ *   F      - float
+ * ----------------------------------------------------------------
+ */
+
+#define MIN_SELB(rt, ra, rb, rc)  selb rt, ra, rb, rc;
+#define MAX_SELB(rt, ra, rb, rc)  selb rt, rb, ra, rc;
+
+    // words
+
+#define MIN(rt, ra, rb) \
+    cgt  _gc_t0, ra, rb; \
+    MIN_SELB(rt, ra, rb, _gc_t0)
+
+#define MAX(rt, ra, rb) \
+    cgt  _gc_t0, ra, rb; \
+    MAX_SELB(rt, ra, rb, _gc_t0)
+
+#define UMIN(rt, ra, rb) \
+    clgt _gc_t0, ra, rb; \
+    MIN_SELB(rt, ra, rb, _gc_t0)
+
+#define UMAX(rt, ra, rb) \
+    clgt _gc_t0, ra, rb; \
+    MAX_SELB(rt, ra, rb, _gc_t0)
+
+    // bytes
+
+#define MINB(rt, ra, rb) \
+    cgtb _gc_t0, ra, rb; \
+    MIN_SELB(rt, ra, rb, _gc_t0)
+
+#define MAXB(rt, ra, rb) \
+    cgtb _gc_t0, ra, rb; \
+    MAX_SELB(rt, ra, rb, _gc_t0)
+
+#define UMINB(rt, ra, rb) \
+    clgtb _gc_t0, ra, rb; \
+    MIN_SELB(rt, ra, rb, _gc_t0)
+
+#define UMAXB(rt, ra, rb) \
+    clgtb _gc_t0, ra, rb; \
+    MAX_SELB(rt, ra, rb, _gc_t0)
+
+    // halfwords
+
+#define MINH(rt, ra, rb) \
+    cgth _gc_t0, ra, rb; \
+    MIN_SELB(rt, ra, rb, _gc_t0)
+
+#define MAXH(rt, ra, rb) \
+    cgth _gc_t0, ra, rb; \
+    MAX_SELB(rt, ra, rb, _gc_t0)
+
+#define UMINH(rt, ra, rb) \
+    clgth _gc_t0, ra, rb; \
+    MIN_SELB(rt, ra, rb, _gc_t0)
+
+#define UMAXH(rt, ra, rb) \
+    clgth _gc_t0, ra, rb; \
+    MAX_SELB(rt, ra, rb, _gc_t0)
+
+    // floats
+
+#define FMIN(rt, ra, rb) \
+    fcgt _gc_t0, ra, rb; \
+    MIN_SELB(rt, ra, rb, _gc_t0)
+
+#define FMAX(rt, ra, rb) \
+    fcgt _gc_t0, ra, rb; \
+    MAX_SELB(rt, ra, rb, _gc_t0)
+
+// Ignoring the sign, select the values with the minimum magnitude
+#define FMINMAG(rt, ra, rb) \
+    fcmgt _gc_t0, ra, rb; \
+    MIN_SELB(rt, ra, rb, _gc_t0)
+
+// Ignoring the sign, select the values with the maximum magnitude
+#define FMAXMAG(rt, ra, rb) \
+    fcmgt _gc_t0, ra, rb; \
+    MAX_SELB(rt, ra, rb, _gc_t0)
+
+
+#endif /* INCLUDED_GC_SPU_MACS_H */

diff --git a/volk/spu_lib/spu_16s_cmpgt_unaligned.c b/volk/spu_lib/spu_16s_cmpgt_unaligned.c
new file mode 100644
index 000000000..765cacd9a
--- /dev/null
+++ b/volk/spu_lib/spu_16s_cmpgt_unaligned.c
@@ -0,0 +1,160 @@
+#include <spu_intrinsics.h>
+
+void* libvector_16s_cmpgt_unaligned(void* target, void* src, signed short val, unsigned int num_bytes){
+  //loop iterator i
+  int i = 0;
+  void* retval = target;
+
+  //put the target and source addresses into qwords
+  vector unsigned int address_counter_tgt = {(unsigned int)target, 0, 0, 0};
+  vector unsigned int address_counter_src = {(unsigned int)src, 0, 0, 0};
+
+  //create shuffle masks
+
+  //shuffle mask building blocks:
+  //all from the first vector
+  vector unsigned char oneup = {0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
+                                0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f};
+  //all from the second vector
+  vector unsigned char second_oneup = {0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17,
+                                       0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f};
+
+  //gamma: second half of the second, first half of the first, break at (unsigned int)src%16
+  vector unsigned char src_cmp = spu_splats((unsigned char)((unsigned int)src%16));
+  vector unsigned char gt_res = spu_cmpgt(oneup, src_cmp);
+  vector unsigned char eq_res = spu_cmpeq(oneup, src_cmp);
+  vector unsigned char cmp_res = spu_or(gt_res, eq_res);
+  vector unsigned char sixteen_uchar = spu_splats((unsigned char)16);
+  vector unsigned char phase_change = spu_and(sixteen_uchar, cmp_res);
+  vector unsigned int shuffle_mask_gamma = spu_add((vector unsigned int)phase_change,
+                                                   (vector unsigned int)oneup);
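+  /* What this builds: oneup holds the byte indices 0..15.  Slots whose
+     index is >= src%16 get 16 added (phase_change), so the vector now
+     reads {0, .., k-1, k+16, .., 31} with k = src%16.  The rotate below
+     turns it into {k+16, .., 31, 0, .., k-1}: a shuffle control word
+     that glues the tail of the previous quadword to the head of the
+     next one, i.e. a single unaligned 16-byte load. */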
+  shuffle_mask_gamma = spu_rlqwbyte(shuffle_mask_gamma, (unsigned int)src%16);
+
+  vector unsigned char tgt_second = spu_rlqwbyte(second_oneup, -((unsigned int)target%16));
+  vector unsigned char tgt_first = spu_rlqwbyte(oneup, -((unsigned int)target%16));
+
+  //alpha: first half of first, second half of second, break at (unsigned int)target%16
+  src_cmp = spu_splats((unsigned char)((unsigned int)target%16));
+  gt_res = spu_cmpgt(oneup, src_cmp);
+  eq_res = spu_cmpeq(oneup, src_cmp);
+  cmp_res = spu_or(gt_res, eq_res);
+  phase_change = spu_and(sixteen_uchar, cmp_res);
+  vector unsigned int shuffle_mask_alpha = spu_add((vector unsigned int)phase_change,
+                                                   (vector unsigned int)oneup);
+
+  //delta: first half of first, first half of second, break at (unsigned int)target%16
+  vector unsigned char shuffle_mask_delta = spu_shuffle(oneup, tgt_second, (vector unsigned char)shuffle_mask_alpha);
+  //epsilon: second half of second, second half of first, break at (unsigned int)target%16
+  vector unsigned char shuffle_mask_epsilon = spu_shuffle(tgt_second, oneup, (vector unsigned char)shuffle_mask_alpha);
+  //zeta: second half of second, first half of first, break at 16 - (unsigned int)target%16
+  vector unsigned int shuffle_mask_zeta = spu_rlqwbyte(shuffle_mask_alpha, (unsigned int)target%16);
+
+  //beta: first half of first, second half of second, break at num_bytes%16
+  src_cmp = spu_splats((unsigned char)(num_bytes%16));
+  gt_res = spu_cmpgt(oneup, src_cmp);
+  eq_res = spu_cmpeq(oneup, src_cmp);
+  cmp_res = spu_or(gt_res, eq_res);
+  phase_change = spu_and(sixteen_uchar, cmp_res);
+  vector unsigned int shuffle_mask_beta = spu_add((vector unsigned int)phase_change,
+                                                  (vector unsigned int)oneup);
+
+  qword src_past;
+  qword src_present;
+  qword tgt_past;
+  qword tgt_present;
+
+  qword in_temp;
+  qword out_temp0;
+  qword out_temp1;
+
+  src_past = si_lqd((qword)address_counter_src, 0);
+  tgt_past = si_lqd((qword)address_counter_tgt, 0);
+
+  vector signed short vec_val = spu_splats(val);
+  vector unsigned short compare;
+  vector unsigned short ones = {1, 1, 1, 1, 1, 1, 1, 1};
+  vector unsigned short after_and;
+
+  for(i = 0; i < num_bytes/16; ++i) {
+
+    src_present = si_lqd((qword)address_counter_src, 16);
+    tgt_present = si_lqd((qword)address_counter_tgt, 16);
+
+    in_temp = spu_shuffle(src_present, src_past, (vector unsigned char)shuffle_mask_gamma);
+
+    compare = spu_cmpgt((vector signed short) in_temp, vec_val);
+    after_and = spu_and(compare, ones);
+
+    out_temp0 = spu_shuffle(tgt_past, (qword)after_and, shuffle_mask_delta);
+    out_temp1 = spu_shuffle(tgt_present, (qword)after_and, shuffle_mask_epsilon);
+
+    si_stqd(out_temp0, (qword)address_counter_tgt, 0);
+    si_stqd(out_temp1, (qword)address_counter_tgt, 16);
+
+    tgt_past = out_temp1;
+    src_past = src_present;
+    address_counter_src = spu_add(address_counter_src, 16);
+    address_counter_tgt = spu_add(address_counter_tgt, 16);
+  }
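+  /* Tail handling: the loop above only stores whole quadwords.  The
+     remaining num_bytes%16 result bytes are merged (via the beta and
+     zeta masks) with the bytes already present in the target's last
+     quadword, so memory past num_bytes is left untouched. */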
+
+  src_present = si_lqd((qword)address_counter_src, 16);
+  tgt_present = si_lqd((qword)address_counter_tgt, 16);
+
+  in_temp = spu_shuffle(src_present, src_past, (vector unsigned char) shuffle_mask_gamma);
+
+  compare = spu_cmpgt((vector signed short) in_temp, vec_val);
+  after_and = spu_and(compare, ones);
+
+  qword target_temp = spu_shuffle(tgt_present, tgt_past, (vector unsigned char) shuffle_mask_zeta);
+  qword meld = spu_shuffle((qword)after_and, target_temp, (vector unsigned char)shuffle_mask_beta);
+
+  out_temp0 = spu_shuffle(tgt_past, meld, shuffle_mask_delta);
+  out_temp1 = spu_shuffle(tgt_present, meld, shuffle_mask_epsilon);
+
+  si_stqd(out_temp0, (qword)address_counter_tgt, 0);
+  si_stqd(out_temp1, (qword)address_counter_tgt, 16);
+
+  return retval;
+}
+
+/*
+int main(){
+
+  signed short pooh[48];
+  signed short bear[48];
+
+  int i = 0;
+  for(i = 0; i < 48; i += 2){
+    bear[i] = i;
+    bear[i + 1] = -i;
+  }
+
+  vector_gt_16bit(&pooh[0], &bear[0], 0, 48 * sizeof(signed short));
+
+  for(i = 0; i < 48; ++i) {
+    printf("%d, ", pooh[i]);
+  }
+  printf("\n");
+}
+*/
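For reference, the kernel above computes the same thing as this scalar C
sketch (illustrative; the name ref_16s_cmpgt is made up here):

    #include <stddef.h>

    /* tgt[i] = (src[i] > val) ? 1 : 0, over num_bytes of 16-bit data */
    void ref_16s_cmpgt(signed short *tgt, const signed short *src,
                       signed short val, size_t num_bytes)
    {
        size_t i;
        for (i = 0; i < num_bytes / sizeof(signed short); i++)
            tgt[i] = (src[i] > val) ? 1 : 0;
    }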

diff --git a/volk/spu_lib/spu_16s_vector_subtract_unaligned.c b/volk/spu_lib/spu_16s_vector_subtract_unaligned.c
new file mode 100644
index 000000000..a3ce6c2fe
--- /dev/null
+++ b/volk/spu_lib/spu_16s_vector_subtract_unaligned.c
@@ -0,0 +1,178 @@
+#include <spu_intrinsics.h>
+
+void* libvector_16s_vector_subtract_unaligned(void* target, void* src0, void* src1, unsigned int num_bytes){
+  //loop iterator i
+  int i = 0;
+  void* retval = target;
+
+  //put the target and source addresses into qwords
+  vector unsigned int address_counter_tgt = {(unsigned int)target, 0, 0, 0};
+  vector unsigned int address_counter_src0 = {(unsigned int)src0, 0, 0, 0};
+  vector unsigned int address_counter_src1 = {(unsigned int)src1, 0, 0, 0};
+
+  //create shuffle masks
+
+  //shuffle mask building blocks:
+  //all from the first vector
+  vector unsigned char oneup = {0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
+                                0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f};
+  //all from the second vector
+  vector unsigned char second_oneup = {0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17,
+                                       0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f};
+
+  //gamma: second half of the second, first half of the first, break at (unsigned int)src0%16
+  vector unsigned char src_cmp = spu_splats((unsigned char)((unsigned int)src0%16));
+  vector unsigned char gt_res = spu_cmpgt(oneup, src_cmp);
+  vector unsigned char eq_res = spu_cmpeq(oneup, src_cmp);
+  vector unsigned char cmp_res = spu_or(gt_res, eq_res);
+  vector unsigned char sixteen_uchar = spu_splats((unsigned char)16);
+  vector unsigned char phase_change = spu_and(sixteen_uchar, cmp_res);
+  vector unsigned int shuffle_mask_gamma = spu_add((vector unsigned int)phase_change,
+                                                   (vector unsigned int)oneup);
+  shuffle_mask_gamma = spu_rlqwbyte(shuffle_mask_gamma, (unsigned int)src0%16);
+
+  //eta: second half of the second, first half of the first, break at (unsigned int)src1%16
+  src_cmp = spu_splats((unsigned char)((unsigned int)src1%16));
+  gt_res = spu_cmpgt(oneup, src_cmp);
+  eq_res = spu_cmpeq(oneup, src_cmp);
+  cmp_res = spu_or(gt_res, eq_res);
+  sixteen_uchar = spu_splats((unsigned char)16);
+  phase_change = spu_and(sixteen_uchar, cmp_res);
+  vector unsigned int shuffle_mask_eta = spu_add((vector unsigned int)phase_change,
+                                                 (vector unsigned int)oneup);
+  shuffle_mask_eta = spu_rlqwbyte(shuffle_mask_eta, (unsigned int)src1%16);
+
+  vector unsigned char tgt_second = spu_rlqwbyte(second_oneup, -((unsigned int)target%16));
+  vector unsigned char tgt_first = spu_rlqwbyte(oneup, -((unsigned int)target%16));
+
+  //alpha: first half of first, second half of second, break at (unsigned int)target%16
+  src_cmp = spu_splats((unsigned char)((unsigned int)target%16));
+  gt_res = spu_cmpgt(oneup, src_cmp);
+  eq_res = spu_cmpeq(oneup, src_cmp);
+  cmp_res = spu_or(gt_res, eq_res);
+  phase_change = spu_and(sixteen_uchar, cmp_res);
+  vector unsigned int shuffle_mask_alpha = spu_add((vector unsigned int)phase_change,
+                                                   (vector unsigned int)oneup);
+
+  //delta: first half of first, first half of second, break at (unsigned int)target%16
+  vector unsigned char shuffle_mask_delta = spu_shuffle(oneup, tgt_second, (vector unsigned char)shuffle_mask_alpha);
+  //epsilon: second half of second, second half of first, break at (unsigned int)target%16
+  vector unsigned char shuffle_mask_epsilon = spu_shuffle(tgt_second, oneup, (vector unsigned char)shuffle_mask_alpha);
+  //zeta: second half of second, first half of first, break at 16 - (unsigned int)target%16
+  vector unsigned int shuffle_mask_zeta = spu_rlqwbyte(shuffle_mask_alpha, (unsigned int)target%16);
+
+  //beta: first half of first, second half of second, break at num_bytes%16
+  src_cmp = spu_splats((unsigned char)(num_bytes%16));
+  gt_res = spu_cmpgt(oneup, src_cmp);
+  eq_res = spu_cmpeq(oneup, src_cmp);
+  cmp_res = spu_or(gt_res, eq_res);
+  phase_change = spu_and(sixteen_uchar, cmp_res);
+  vector unsigned int shuffle_mask_beta = spu_add((vector unsigned int)phase_change,
+                                                  (vector unsigned int)oneup);
+
+  qword src0_past;
+  qword src0_present;
+  qword src1_past;
+  qword src1_present;
+  qword tgt_past;
+  qword tgt_present;
+
+  qword in_temp0;
+  qword in_temp1;
+  qword out_temp0;
+  qword out_temp1;
+
+  vector signed short sum;
+
+  src0_past = si_lqd((qword)address_counter_src0, 0);
+  src1_past = si_lqd((qword)address_counter_src1, 0);
+  tgt_past = si_lqd((qword)address_counter_tgt, 0);
+
+  for(i = 0; i < num_bytes/16; ++i) {
+
+    src0_present = si_lqd((qword)address_counter_src0, 16);
+    src1_present = si_lqd((qword)address_counter_src1, 16);
+    tgt_present = si_lqd((qword)address_counter_tgt, 16);
+
+    in_temp0 = spu_shuffle(src0_present, src0_past, (vector unsigned char)shuffle_mask_gamma);
+    in_temp1 = spu_shuffle(src1_present, src1_past, (vector unsigned char)shuffle_mask_eta);
+
+    sum = spu_sub((vector signed short)in_temp0, (vector signed short)in_temp1);
+
+    out_temp0 = spu_shuffle(tgt_past, (qword)sum, shuffle_mask_delta);
+    out_temp1 = spu_shuffle(tgt_present, (qword)sum, shuffle_mask_epsilon);
+
+    si_stqd(out_temp0, (qword)address_counter_tgt, 0);
+    si_stqd(out_temp1, (qword)address_counter_tgt, 16);
+
+    tgt_past = out_temp1;
+    src0_past = src0_present;
+    src1_past = src1_present;
+    address_counter_src0 = spu_add(address_counter_src0, 16);
+    address_counter_src1 = spu_add(address_counter_src1, 16);
+    address_counter_tgt = spu_add(address_counter_tgt, 16);
+  }
+
+  src0_present = si_lqd((qword)address_counter_src0, 16);
+  src1_present = si_lqd((qword)address_counter_src1, 16);
+  tgt_present = si_lqd((qword)address_counter_tgt, 16);
+
+  in_temp0 = spu_shuffle(src0_present, src0_past, (vector unsigned char) shuffle_mask_gamma);
+  in_temp1 = spu_shuffle(src1_present, src1_past, (vector unsigned char) shuffle_mask_eta);
+  sum = spu_sub((vector signed short)in_temp0, (vector signed short)in_temp1);
+  qword target_temp = spu_shuffle(tgt_present, tgt_past, (vector unsigned char) shuffle_mask_zeta);
+  qword meld = spu_shuffle((qword)sum, target_temp, (vector unsigned char)shuffle_mask_beta);
+
+  out_temp0 = spu_shuffle(tgt_past, meld, shuffle_mask_delta);
+  out_temp1 = spu_shuffle(tgt_present, meld, shuffle_mask_epsilon);
+
+  si_stqd(out_temp0, (qword)address_counter_tgt, 0);
+  si_stqd(out_temp1, (qword)address_counter_tgt, 16);
+
+  return retval;
+}
+
+/*
+int main(){
+
+  signed short pooh[48];
+  signed short bear[48];
+  signed short res[48];
+
+  int i = 0;
+  for(i = 0; i < 48; ++i){
+    pooh[i] = i;
+  }
+  for(i = 48; i < 96; ++i){
+    bear[i - 48] = i;
+  }
+
+  vector_subtract_16bit(res, &pooh[0], &bear[0], 48 * sizeof(signed short));
+
+  for(i = 0; i < 48; ++i) {
+    printf("%d, ", res[i]);
+  }
+  printf("\n");
+}
+*/
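Scalar equivalent of the kernel above (again a sketch with a made-up name):

    #include <stddef.h>

    /* tgt[i] = src0[i] - src1[i], over num_bytes of 16-bit data */
    void ref_16s_subtract(signed short *tgt, const signed short *src0,
                          const signed short *src1, size_t num_bytes)
    {
        size_t i;
        for (i = 0; i < num_bytes / sizeof(signed short); i++)
            tgt[i] = src0[i] - src1[i];
    }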

diff --git a/volk/spu_lib/spu_16s_vector_sum_unaligned.c b/volk/spu_lib/spu_16s_vector_sum_unaligned.c
new file mode 100644
index 000000000..5a1cb9aaf
--- /dev/null
+++ b/volk/spu_lib/spu_16s_vector_sum_unaligned.c
@@ -0,0 +1,178 @@
+#include <spu_intrinsics.h>
+
+void* libvector_16s_vector_sum_unaligned(void* target, void* src0, void* src1, unsigned int num_bytes){
+  //loop iterator i
+  int i = 0;
+  void* retval = target;
+
+  //put the target and source addresses into qwords
+  vector unsigned int address_counter_tgt = {(unsigned int)target, 0, 0, 0};
+  vector unsigned int address_counter_src0 = {(unsigned int)src0, 0, 0, 0};
+  vector unsigned int address_counter_src1 = {(unsigned int)src1, 0, 0, 0};
+
+  //create shuffle masks
+
+  //shuffle mask building blocks:
+  //all from the first vector
+  vector unsigned char oneup = {0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
+                                0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f};
+  //all from the second vector
+  vector unsigned char second_oneup = {0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17,
+                                       0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f};
+
+  //gamma: second half of the second, first half of the first, break at (unsigned int)src0%16
+  vector unsigned char src_cmp = spu_splats((unsigned char)((unsigned int)src0%16));
+  vector unsigned char gt_res = spu_cmpgt(oneup, src_cmp);
+  vector unsigned char eq_res = spu_cmpeq(oneup, src_cmp);
+  vector unsigned char cmp_res = spu_or(gt_res, eq_res);
+  vector unsigned char sixteen_uchar = spu_splats((unsigned char)16);
+  vector unsigned char phase_change = spu_and(sixteen_uchar, cmp_res);
+  vector unsigned int shuffle_mask_gamma = spu_add((vector unsigned int)phase_change,
+                                                   (vector unsigned int)oneup);
+  shuffle_mask_gamma = spu_rlqwbyte(shuffle_mask_gamma, (unsigned int)src0%16);
+
+  //eta: second half of the second, first half of the first, break at (unsigned int)src1%16
+  src_cmp = spu_splats((unsigned char)((unsigned int)src1%16));
+  gt_res = spu_cmpgt(oneup, src_cmp);
+  eq_res = spu_cmpeq(oneup, src_cmp);
+  cmp_res = spu_or(gt_res, eq_res);
+  sixteen_uchar = spu_splats((unsigned char)16);
+  phase_change = spu_and(sixteen_uchar, cmp_res);
+  vector unsigned int shuffle_mask_eta = spu_add((vector unsigned int)phase_change,
+                                                 (vector unsigned int)oneup);
+  shuffle_mask_eta = spu_rlqwbyte(shuffle_mask_eta, (unsigned int)src1%16);
+
+  vector unsigned char tgt_second = spu_rlqwbyte(second_oneup, -((unsigned int)target%16));
+  vector unsigned char tgt_first = spu_rlqwbyte(oneup, -((unsigned int)target%16));
+
+  //alpha: first half of first, second half of second, break at (unsigned int)target%16
+  src_cmp = spu_splats((unsigned char)((unsigned int)target%16));
+  gt_res = spu_cmpgt(oneup, src_cmp);
+  eq_res = spu_cmpeq(oneup, src_cmp);
+  cmp_res = spu_or(gt_res, eq_res);
+  phase_change = spu_and(sixteen_uchar, cmp_res);
+  vector unsigned int shuffle_mask_alpha = spu_add((vector unsigned int)phase_change,
+                                                   (vector unsigned int)oneup);
+
+  //delta: first half of first, first half of second, break at (unsigned int)target%16
+  vector unsigned char shuffle_mask_delta = spu_shuffle(oneup, tgt_second, (vector unsigned char)shuffle_mask_alpha);
+  //epsilon: second half of second, second half of first, break at (unsigned int)target%16
+  vector unsigned char shuffle_mask_epsilon = spu_shuffle(tgt_second, oneup, (vector unsigned char)shuffle_mask_alpha);
+  //zeta: second half of second, first half of first, break at 16 - (unsigned int)target%16
+  vector unsigned int shuffle_mask_zeta = spu_rlqwbyte(shuffle_mask_alpha, (unsigned int)target%16);
+
+  //beta: first half of first, second half of second, break at num_bytes%16
+  src_cmp = spu_splats((unsigned char)(num_bytes%16));
+  gt_res = spu_cmpgt(oneup, src_cmp);
+  eq_res = spu_cmpeq(oneup, src_cmp);
+  cmp_res = spu_or(gt_res, eq_res);
+  phase_change = spu_and(sixteen_uchar, cmp_res);
+  vector unsigned int shuffle_mask_beta = spu_add((vector unsigned int)phase_change,
+                                                  (vector unsigned int)oneup);
+
+  qword src0_past;
+  qword src0_present;
+  qword src1_past;
+  qword src1_present;
+  qword tgt_past;
+  qword tgt_present;
+
+  qword in_temp0;
+  qword in_temp1;
+  qword out_temp0;
+  qword out_temp1;
+
+  vector signed short sum;
+
+  src0_past = si_lqd((qword)address_counter_src0, 0);
+  src1_past = si_lqd((qword)address_counter_src1, 0);
+  tgt_past = si_lqd((qword)address_counter_tgt, 0);
+
+  for(i = 0; i < num_bytes/16; ++i) {
+
+    src0_present = si_lqd((qword)address_counter_src0, 16);
+    src1_present = si_lqd((qword)address_counter_src1, 16);
+    tgt_present = si_lqd((qword)address_counter_tgt, 16);
+
+    in_temp0 = spu_shuffle(src0_present, src0_past, (vector unsigned char)shuffle_mask_gamma);
+    in_temp1 = spu_shuffle(src1_present, src1_past, (vector unsigned char)shuffle_mask_eta);
+
+    sum = spu_add((vector signed short)in_temp0, (vector signed short)in_temp1);
+
+    out_temp0 = spu_shuffle(tgt_past, (qword)sum, shuffle_mask_delta);
+    out_temp1 = spu_shuffle(tgt_present, (qword)sum, shuffle_mask_epsilon);
+
+    si_stqd(out_temp0, (qword)address_counter_tgt, 0);
+    si_stqd(out_temp1, (qword)address_counter_tgt, 16);
+
+    tgt_past = out_temp1;
+    src0_past = src0_present;
+    src1_past = src1_present;
+    address_counter_src0 = spu_add(address_counter_src0, 16);
+    address_counter_src1 = spu_add(address_counter_src1, 16);
+    address_counter_tgt = spu_add(address_counter_tgt, 16);
+  }
+
+  src0_present = si_lqd((qword)address_counter_src0, 16);
+  src1_present = si_lqd((qword)address_counter_src1, 16);
+  tgt_present = si_lqd((qword)address_counter_tgt, 16);
+
+  in_temp0 = spu_shuffle(src0_present, src0_past, (vector unsigned char) shuffle_mask_gamma);
+  in_temp1 = spu_shuffle(src1_present, src1_past, (vector unsigned char) shuffle_mask_eta);
+  sum = spu_add((vector signed short)in_temp0, (vector signed short)in_temp1);
+  qword target_temp = spu_shuffle(tgt_present, tgt_past, (vector unsigned char) shuffle_mask_zeta);
+  qword meld = spu_shuffle((qword)sum, target_temp, (vector unsigned char)shuffle_mask_beta);
+
+  out_temp0 = spu_shuffle(tgt_past, meld, shuffle_mask_delta);
+  out_temp1 = spu_shuffle(tgt_present, meld, shuffle_mask_epsilon);
+
+  si_stqd(out_temp0, (qword)address_counter_tgt, 0);
+  si_stqd(out_temp1, (qword)address_counter_tgt, 16);
+
+  return retval;
+}
+
+/*
+int main(){
+
+  signed short pooh[48];
+  signed short bear[48];
+  signed short res[48];
+
+  int i = 0;
+  for(i = 0; i < 48; ++i){
+    pooh[i] = i;
+  }
+  for(i = 48; i < 96; ++i){
+    bear[i - 48] = i;
+  }
+
+  vector_sum(&pooh[9], &pooh[9], &bear[3], 30);
+
+  for(i = 0; i < 48; ++i) {
+    printf("%d, ", pooh[i]);
+  }
+  printf("\n");
+}
+*/
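The matching scalar form for the sum kernel (sketch, illustrative name):

    #include <stddef.h>

    /* tgt[i] = src0[i] + src1[i], over num_bytes of 16-bit data */
    void ref_16s_sum(signed short *tgt, const signed short *src0,
                     const signed short *src1, size_t num_bytes)
    {
        size_t i;
        for (i = 0; i < num_bytes / sizeof(signed short); i++)
            tgt[i] = src0[i] + src1[i];
    }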

diff --git a/volk/spu_lib/spu_32fc_pointwise_multiply_unaligned.c b/volk/spu_lib/spu_32fc_pointwise_multiply_unaligned.c
new file mode 100644
index 000000000..58fd4aa0c
--- /dev/null
+++ b/volk/spu_lib/spu_32fc_pointwise_multiply_unaligned.c
@@ -0,0 +1,222 @@
+#include <spu_intrinsics.h>
+
+void* libvector_pointwise_multiply_32fc_unaligned(void* target, void* src0, void* src1, unsigned int num_bytes){
+  //loop iterator i
+  int i = 0;
+  void* retval = target;
+
+  //put the target and source addresses into qwords
+  vector unsigned int address_counter_tgt = {(unsigned int)target, 0, 0, 0};
+  vector unsigned int address_counter_src0 = {(unsigned int)src0, 0, 0, 0};
+  vector unsigned int address_counter_src1 = {(unsigned int)src1, 0, 0, 0};
+
+  //create shuffle masks
+
+  //shuffle mask building blocks:
+  //all from the first vector
+  vector unsigned char oneup = {0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
+                                0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f};
+  //all from the second vector
+  vector unsigned char second_oneup = {0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17,
+                                       0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f};
+
+  //gamma: second half of the second, first half of the first, break at (unsigned int)src0%16
+  vector unsigned char src_cmp = spu_splats((unsigned char)((unsigned int)src0%16));
+  vector unsigned char gt_res = spu_cmpgt(oneup, src_cmp);
+  vector unsigned char eq_res = spu_cmpeq(oneup, src_cmp);
+  vector unsigned char cmp_res = spu_or(gt_res, eq_res);
+  vector unsigned char sixteen_uchar = spu_splats((unsigned char)16);
+  vector unsigned char phase_change = spu_and(sixteen_uchar, cmp_res);
+  vector unsigned int shuffle_mask_gamma = spu_add((vector unsigned int)phase_change,
+                                                   (vector unsigned int)oneup);
+  shuffle_mask_gamma = spu_rlqwbyte(shuffle_mask_gamma, (unsigned int)src0%16);
+
+  //eta: second half of the second, first half of the first, break at (unsigned int)src1%16
+  src_cmp = spu_splats((unsigned char)((unsigned int)src1%16));
+  gt_res = spu_cmpgt(oneup, src_cmp);
+  eq_res = spu_cmpeq(oneup, src_cmp);
+  cmp_res = spu_or(gt_res, eq_res);
+  sixteen_uchar = spu_splats((unsigned char)16);
+  phase_change = spu_and(sixteen_uchar, cmp_res);
+  vector unsigned int shuffle_mask_eta = spu_add((vector unsigned int)phase_change,
+                                                 (vector unsigned int)oneup);
+  shuffle_mask_eta = spu_rlqwbyte(shuffle_mask_eta, (unsigned int)src1%16);
+
+  vector unsigned char tgt_second = spu_rlqwbyte(second_oneup, -((unsigned int)target%16));
+  vector unsigned char tgt_first = spu_rlqwbyte(oneup, -((unsigned int)target%16));
+
+  //alpha: first half of first, second half of second, break at (unsigned int)target%16
+  src_cmp = spu_splats((unsigned char)((unsigned int)target%16));
+  gt_res = spu_cmpgt(oneup, src_cmp);
+  eq_res = spu_cmpeq(oneup, src_cmp);
+  cmp_res = spu_or(gt_res, eq_res);
+  phase_change = spu_and(sixteen_uchar, cmp_res);
+  vector unsigned int shuffle_mask_alpha = spu_add((vector unsigned int)phase_change,
+                                                   (vector unsigned int)oneup);
+
+  //delta: first half of first, first half of second, break at (unsigned int)target%16
+  vector unsigned char shuffle_mask_delta = spu_shuffle(oneup, tgt_second, (vector unsigned char)shuffle_mask_alpha);
+  //epsilon: second half of second, second half of first, break at (unsigned int)target%16
+  vector unsigned char shuffle_mask_epsilon = spu_shuffle(tgt_second, oneup, (vector unsigned char)shuffle_mask_alpha);
+  //zeta: second half of second, first half of first, break at 16 - (unsigned int)target%16
+  vector unsigned int shuffle_mask_zeta = spu_rlqwbyte(shuffle_mask_alpha, (unsigned int)target%16);
+
+  //beta: first half of first, second half of second, break at num_bytes%16
+  src_cmp = spu_splats((unsigned char)(num_bytes%16));
+  gt_res = spu_cmpgt(oneup, src_cmp);
+  eq_res = spu_cmpeq(oneup, src_cmp);
+  cmp_res = spu_or(gt_res, eq_res);
+  phase_change = spu_and(sixteen_uchar, cmp_res);
+  vector unsigned int shuffle_mask_beta = spu_add((vector unsigned int)phase_change,
+                                                  (vector unsigned int)oneup);
+
+  qword src0_past;
+  qword src0_present;
+  qword src1_past;
+  qword src1_present;
+  qword tgt_past;
+  qword tgt_present;
+
+  qword in_temp0;
+  qword in_temp1;
+  qword out_temp0;
+  qword out_temp1;
+
+  src0_past = si_lqd((qword)address_counter_src0, 0);
+  src1_past = si_lqd((qword)address_counter_src1, 0);
+  tgt_past = si_lqd((qword)address_counter_tgt, 0);
+
+  vector unsigned char shuffle_mask_complexprod0 = {0x04, 0x05, 0x06, 0x07, 0x00, 0x01, 0x02, 0x03,
+                                                    0x0c, 0x0d, 0x0e, 0x0f, 0x08, 0x09, 0x0a, 0x0b};
+  vector unsigned char shuffle_mask_complexprod1 = {0x00, 0x01, 0x02, 0x03, 0x10, 0x11, 0x12, 0x13,
+                                                    0x08, 0x09, 0x0a, 0x0b, 0x18, 0x19, 0x1a, 0x1b};
+  vector unsigned char shuffle_mask_complexprod2 = {0x04, 0x05, 0x06, 0x07, 0x14, 0x15, 0x16, 0x17,
+                                                    0x0c, 0x0d, 0x0e, 0x0f, 0x1c, 0x1d, 0x1e, 0x1f};
+  vector unsigned char sign_changer = {0x00, 0x00, 0x00, 0x00, 0x80, 0x00, 0x00, 0x00,
+                                       0x00, 0x00, 0x00, 0x00, 0x80, 0x00, 0x00, 0x00};
+
+  vector float prod0;
+  qword shuf0;
+  vector float prod1;
+  vector float sign_change;
+  qword summand0;
+  qword summand1;
+  vector float sum;
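+  /* Complex multiply, SIMD-style: with a = (ar, ai) and b = (br, bi)
+     packed as interleaved floats,
+       prod0 = (ar*br, ai*bi, ...)   element-wise a*b
+       shuf0 = b with re/im swapped within each complex value
+       prod1 = (ar*bi, ai*br, ...)   element-wise a*swap(b)
+     sign_changer flips the sign of every second float of prod0, and the
+     two summand shuffles interleave the pieces so that
+       sum = (ar*br - ai*bi, ar*bi + ai*br, ...),
+     i.e. the complex products. */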
+
+  for(i = 0; i < num_bytes/16; ++i) {
+
+    src0_present = si_lqd((qword)address_counter_src0, 16);
+    src1_present = si_lqd((qword)address_counter_src1, 16);
+    tgt_present = si_lqd((qword)address_counter_tgt, 16);
+
+    in_temp0 = spu_shuffle(src0_present, src0_past, (vector unsigned char)shuffle_mask_gamma);
+    in_temp1 = spu_shuffle(src1_present, src1_past, (vector unsigned char)shuffle_mask_eta);
+
+    prod0 = spu_mul((vector float)in_temp0, (vector float)in_temp1);
+    shuf0 = spu_shuffle((qword)in_temp1, (qword)in_temp1, shuffle_mask_complexprod0);
+    prod1 = spu_mul((vector float)in_temp0, (vector float)shuf0);
+    sign_change = spu_xor(prod0, (vector float)sign_changer);
+
+    summand0 = spu_shuffle((qword)sign_change, (qword)prod1, shuffle_mask_complexprod1);
+    summand1 = spu_shuffle((qword)sign_change, (qword)prod1, shuffle_mask_complexprod2);
+
+    sum = spu_add((vector float)summand0, (vector float)summand1);
+
+    out_temp0 = spu_shuffle(tgt_past, (qword)sum, shuffle_mask_delta);
+    out_temp1 = spu_shuffle(tgt_present, (qword)sum, shuffle_mask_epsilon);
+
+    si_stqd(out_temp0, (qword)address_counter_tgt, 0);
+    si_stqd(out_temp1, (qword)address_counter_tgt, 16);
+
+    tgt_past = out_temp1;
+    src0_past = src0_present;
+    src1_past = src1_present;
+    address_counter_src0 = spu_add(address_counter_src0, 16);
+    address_counter_src1 = spu_add(address_counter_src1, 16);
+    address_counter_tgt = spu_add(address_counter_tgt, 16);
+  }
+
+  src0_present = si_lqd((qword)address_counter_src0, 16);
+  src1_present = si_lqd((qword)address_counter_src1, 16);
+  tgt_present = si_lqd((qword)address_counter_tgt, 16);
+
+  in_temp0 = spu_shuffle(src0_present, src0_past, (vector unsigned char) shuffle_mask_gamma);
+  in_temp1 = spu_shuffle(src1_present, src1_past, (vector unsigned char) shuffle_mask_eta);
+
+  prod0 = spu_mul((vector float)in_temp0, (vector float)in_temp1);
+  shuf0 = spu_shuffle((qword)in_temp1, (qword)in_temp1, shuffle_mask_complexprod0);
+  prod1 = spu_mul((vector float)in_temp0, (vector float)shuf0);
+  sign_change = spu_xor(prod0, (vector float)sign_changer);
+  summand0 = spu_shuffle((qword)sign_change, (qword)prod1, shuffle_mask_complexprod1);
+  summand1 = spu_shuffle((qword)sign_change, (qword)prod1, shuffle_mask_complexprod2);
+  sum = spu_add((vector float)summand0, (vector float)summand1);
+
+  qword target_temp = spu_shuffle(tgt_present, tgt_past, (vector unsigned char) shuffle_mask_zeta);
+  qword meld = spu_shuffle((qword)sum, target_temp, (vector unsigned char)shuffle_mask_beta);
+
+  out_temp0 = spu_shuffle(tgt_past, meld, shuffle_mask_delta);
+  out_temp1 = spu_shuffle(tgt_present, meld, shuffle_mask_epsilon);
+
+  si_stqd(out_temp0, (qword)address_counter_tgt, 0);
+  si_stqd(out_temp1, (qword)address_counter_tgt, 16);
+
+  return retval;
+}
+
+/*
+int main(){
+
+  float pooh[48];
+  float bear[48];
+  float res[48];
+
+  int i = 0;
+  for(i = 0; i < 48; ++i){
+    pooh[i] = (float) i;
+  }
+  for(i = 48; i < 96; ++i){
+    bear[i - 48] = (float) i;
+  }
+
+  vector_product_complex(res, pooh, bear, 48*sizeof(float));
+
+  for(i = 0; i < 48; ++i) {
+    printf("%f, ", res[i]);
+  }
+  printf("\n");
+}
+*/
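A scalar reference for the complex pointwise product (sketch; data is
interleaved re/im floats, and the name is made up here):

    #include <stddef.h>

    void ref_32fc_multiply(float *tgt, const float *a, const float *b,
                           size_t num_bytes)
    {
        size_t i, n = num_bytes / (2 * sizeof(float)); /* complex count */
        for (i = 0; i < n; i++) {
            float ar = a[2*i], ai = a[2*i+1];
            float br = b[2*i], bi = b[2*i+1];
            tgt[2*i]   = ar*br - ai*bi;
            tgt[2*i+1] = ar*bi + ai*br;
        }
    }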

diff --git a/volk/spu_lib/spu_memcpy_unaligned.c b/volk/spu_lib/spu_memcpy_unaligned.c
new file mode 100644
index 000000000..2a0dabcd7
--- /dev/null
+++ b/volk/spu_lib/spu_memcpy_unaligned.c
@@ -0,0 +1,290 @@
+#include <spu_intrinsics.h>
+
+void* libvector_memcpy_unaligned(void* target, void* src, unsigned int num_bytes){
+  //loop iterator i
+  int i = 0;
+  void* retval = target;
+
+  //put the target and source addresses into qwords
+  vector unsigned int address_counter_tgt = {(unsigned int)target, 0, 0, 0};
+  vector unsigned int address_counter_src = {(unsigned int)src, 0, 0, 0};
+
+  //create shuffle masks
+
+  //shuffle mask building blocks:
+  //all from the first vector
+  vector unsigned char oneup = {0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
+                                0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f};
+  //all from the second vector
+  vector unsigned char second_oneup = {0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17,
+                                       0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f};
+
+  //gamma: second half of the second, first half of the first, break at (unsigned int)src%16
+  vector unsigned char src_cmp = spu_splats((unsigned char)((unsigned int)src%16));
+  vector unsigned char gt_res = spu_cmpgt(oneup, src_cmp);
+  vector unsigned char eq_res = spu_cmpeq(oneup, src_cmp);
+  vector unsigned char cmp_res = spu_or(gt_res, eq_res);
+  vector unsigned char sixteen_uchar = spu_splats((unsigned char)16);
+  vector unsigned char phase_change = spu_and(sixteen_uchar, cmp_res);
+  vector unsigned int shuffle_mask_gamma = spu_add((vector unsigned int)phase_change,
+                                                   (vector unsigned int)oneup);
+  shuffle_mask_gamma = spu_rlqwbyte(shuffle_mask_gamma, (unsigned int)src%16);
+
+  vector unsigned char tgt_second = spu_rlqwbyte(second_oneup, -((unsigned int)target%16));
+  vector unsigned char tgt_first = spu_rlqwbyte(oneup, -((unsigned int)target%16));
+
+  //alpha: first half of first, second half of second, break at (unsigned int)target%16
+  src_cmp = spu_splats((unsigned char)((unsigned int)target%16));
+  gt_res = spu_cmpgt(oneup, src_cmp);
+  eq_res = spu_cmpeq(oneup, src_cmp);
+  cmp_res = spu_or(gt_res, eq_res);
+  phase_change = spu_and(sixteen_uchar, cmp_res);
+  vector unsigned int shuffle_mask_alpha = spu_add((vector unsigned int)phase_change,
+                                                   (vector unsigned int)oneup);
+
+  //delta: first half of first, first half of second, break at (unsigned int)target%16
+  vector unsigned char shuffle_mask_delta = spu_shuffle(oneup, tgt_second, (vector unsigned char)shuffle_mask_alpha);
+  //epsilon: second half of second, second half of first, break at (unsigned int)target%16
+  vector unsigned char shuffle_mask_epsilon = spu_shuffle(tgt_second, oneup, (vector unsigned char)shuffle_mask_alpha);
+  //zeta: second half of second, first half of first, break at 16 - (unsigned int)target%16
+  vector unsigned int shuffle_mask_zeta = spu_rlqwbyte(shuffle_mask_alpha, (unsigned int)target%16);
+
+  //beta: first half of first, second half of second, break at num_bytes%16
+  src_cmp = spu_splats((unsigned char)(num_bytes%16));
+  gt_res = spu_cmpgt(oneup, src_cmp);
+  eq_res = spu_cmpeq(oneup, src_cmp);
+  cmp_res = spu_or(gt_res, eq_res);
+  phase_change = spu_and(sixteen_uchar, cmp_res);
+  vector unsigned int shuffle_mask_beta = spu_add((vector unsigned int)phase_change,
+                                                  (vector unsigned int)oneup);
+
+  qword src_past;
+  qword src_present;
+  qword tgt_past;
+  qword tgt_present;
+
+  qword in_temp;
+  qword out_temp0;
+  qword out_temp1;
+
+  src_past = si_lqd((qword)address_counter_src, 0);
+  tgt_past = si_lqd((qword)address_counter_tgt, 0);
+
+  for(i = 0; i < num_bytes/16; ++i) {
+
+    src_present = si_lqd((qword)address_counter_src, 16);
+    tgt_present = si_lqd((qword)address_counter_tgt, 16);
+
+    in_temp = spu_shuffle(src_present, src_past, (vector unsigned char)shuffle_mask_gamma);
+
+    out_temp0 = spu_shuffle(tgt_past, in_temp, shuffle_mask_delta);
+    out_temp1 = spu_shuffle(tgt_present, in_temp, shuffle_mask_epsilon);
+
+    si_stqd(out_temp0, (qword)address_counter_tgt, 0);
+    si_stqd(out_temp1, (qword)address_counter_tgt, 16);
+
+    tgt_past = out_temp1;
+    src_past = src_present;
+    address_counter_src = spu_add(address_counter_src, 16);
+    address_counter_tgt = spu_add(address_counter_tgt, 16);
+  }
+
+  src_present = si_lqd((qword)address_counter_src, 16);
+  tgt_present = si_lqd((qword)address_counter_tgt, 16);
+
+  in_temp = spu_shuffle(src_present, src_past, (vector unsigned char) shuffle_mask_gamma);
+  qword target_temp = spu_shuffle(tgt_present, tgt_past, (vector unsigned char) shuffle_mask_zeta);
+  qword meld = spu_shuffle(in_temp, target_temp, (vector unsigned char)shuffle_mask_beta);
+
+  out_temp0 = spu_shuffle(tgt_past, meld, shuffle_mask_delta);
+  out_temp1 = spu_shuffle(tgt_present, meld, shuffle_mask_epsilon);
+
+  si_stqd(out_temp0, (qword)address_counter_tgt, 0);
+  si_stqd(out_temp1, (qword)address_counter_tgt, 16);
+
+  return retval;
+}
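+
+/*
+ * Sanity-check sketch (illustrative): for non-overlapping buffers the
+ * kernel should match the libc routine, e.g.
+ *
+ *   unsigned char src[64], ref[64], dst[64];  // fill src, zero ref/dst
+ *   memcpy(&ref[5], &src[3], 21);
+ *   libvector_memcpy_unaligned(&dst[5], &src[3], 21);
+ *   // now memcmp(ref, dst, 64) == 0, since bytes outside the copied
+ *   // range are preserved by the read-modify-write of the end quads
+ */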
+
+/*
+void* mcpy(void* target, void* src, size_t num_bytes){
+  //loop iterator i
+  int i = 0;
+  void* retval = src;
+
+  //put the target and source addresses into qwords
+  vector unsigned int address_counter_tgt = {(unsigned int)target, 0, 0, 0};
+  vector unsigned int address_counter_src = {(unsigned int)src, 0, 0, 0};
+
+  //create shuffle masks
+
+  //shuffle mask building blocks:
+  //all from the first vector
+  vector unsigned char oneup = {0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
+                                0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f};
+  //all from the second vector
+  vector unsigned char second_oneup = {0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17,
+                                       0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f};
+
+  //gamma: second half of the second, first half of the first, break at src%16
+  vector unsigned char src_cmp = spu_splats((unsigned char)(src%16));
+  vector unsigned char gt_res = spu_cmpgt(oneup, src_cmp);
+  vector unsigned char eq_res = spu_cmpeq(oneup, src_cmp);
+  vector unsigned char cmp_res = spu_or(gt_res, eq_res);
+  vector unsigned char sixteen_uchar = spu_splats((unsigned char)16);
+  vector unsigned char phase_change = spu_and(sixteen_uchar, cmp_res);
+  vector unsigned int shuffle_mask_gamma = spu_add((vector unsigned int)phase_change,
+                                                   (vector unsigned int)oneup);
+  shuffle_mask_gamma = spu_rlqwbyte(shuffle_mask_gamma, src%16);
+
+  vector unsigned char tgt_second = spu_rlqwbyte(second_oneup, -(target%16));
+  vector unsigned char tgt_first = spu_rlqwbyte(oneup, -(target%16));
+
+  //alpha: first half of first, second half of second, break at target%16
+  src_cmp = spu_splats((unsigned char)(target%16));
+  gt_res = spu_cmpgt(oneup, src_cmp);
+  eq_res = spu_cmpeq(oneup, src_cmp);
+  cmp_res = spu_or(gt_res, eq_res);
+  phase_change = spu_and(sixteen_uchar, cmp_res);
+  vector unsigned int shuffle_mask_alpha = spu_add((vector unsigned int)phase_change,
+                                                   (vector unsigned int)oneup);
+
+  //delta: first half of first, first half of second, break at target%16
+  vector unsigned char shuffle_mask_delta = spu_shuffle(oneup, tgt_second, (vector unsigned char)shuffle_mask_alpha);
+  //epsilon: second half of second, second half of first, break at target%16
+  vector unsigned char shuffle_mask_epsilon = spu_shuffle(tgt_second, oneup, (vector unsigned char)shuffle_mask_alpha);
+  //zeta: second half of second, first half of first, break at 16 - target%16
+  vector unsigned int shuffle_mask_zeta = spu_rlqwbyte(shuffle_mask_alpha, target%16);
+
+  //beta: first half of first, second half of second, break at num_bytes%16
+  src_cmp = spu_splats((unsigned char)(num_bytes%16));
+  gt_res = spu_cmpgt(oneup, src_cmp);
+  eq_res = spu_cmpeq(oneup, src_cmp);
+  cmp_res = spu_or(gt_res, eq_res);
+  phase_change = spu_and(sixteen_uchar, cmp_res);
+  vector unsigned int shuffle_mask_beta = spu_add((vector unsigned int)phase_change,
+                                                  (vector unsigned int)oneup);
+
+  printf("num_bytesmod16 %d\n", num_bytes%16);
+  printf("beta %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d\n",
+         spu_extract((vector unsigned char) shuffle_mask_beta, 0),
+         spu_extract((vector unsigned char) shuffle_mask_beta, 1),
+         spu_extract((vector unsigned char) shuffle_mask_beta, 2),
+         spu_extract((vector unsigned char) shuffle_mask_beta, 3),
+         spu_extract((vector unsigned char) shuffle_mask_beta, 4),
+         spu_extract((vector unsigned char) shuffle_mask_beta, 5),
+         spu_extract((vector unsigned char) shuffle_mask_beta, 6),
+         spu_extract((vector unsigned char) shuffle_mask_beta, 7),
+         spu_extract((vector unsigned char) shuffle_mask_beta, 8),
+         spu_extract((vector unsigned char) shuffle_mask_beta, 9),
+         spu_extract((vector unsigned char) shuffle_mask_beta, 10),
+         spu_extract((vector unsigned char) shuffle_mask_beta, 11),
+         spu_extract((vector unsigned char) shuffle_mask_beta, 12),
+         spu_extract((vector unsigned char) shuffle_mask_beta, 13),
+         spu_extract((vector unsigned char) shuffle_mask_beta, 14),
+         spu_extract((vector unsigned char) shuffle_mask_beta, 15));
+
+  qword src_past;
+  qword src_present;
+  qword tgt_past;
+  qword tgt_present;
+
+  qword in_temp;
+  qword out_temp0;
+  qword out_temp1;
+
+  src_past = si_lqd((qword)address_counter_src, 0);
+  tgt_past = si_lqd((qword)address_counter_tgt, 0);
+
+  for(i = 0; i < num_bytes/16; ++i) {
+
+    src_present = si_lqd((qword)address_counter_src, 16);
+    tgt_present = si_lqd((qword)address_counter_tgt, 16);
+
+    in_temp = spu_shuffle(src_present, src_past, (vector unsigned char)shuffle_mask_gamma);
+
+    out_temp0 = spu_shuffle(tgt_past, in_temp, shuffle_mask_delta);
+    out_temp1 = spu_shuffle(tgt_present, in_temp, shuffle_mask_epsilon);
+
+    si_stqd(out_temp0, (qword)address_counter_tgt, 0);
+    si_stqd(out_temp1, (qword)address_counter_tgt, 16);
+
+    tgt_past = out_temp1;
+    src_past = src_present;
+    address_counter_src = spu_add(address_counter_src, 16);
+    address_counter_tgt = spu_add(address_counter_tgt, 16);
+  }
+
+  src_present = si_lqd((qword)address_counter_src, 16);
+  tgt_present = si_lqd((qword)address_counter_tgt, 16);
+
+  in_temp = spu_shuffle(src_present, src_past, (vector unsigned char) shuffle_mask_gamma);
+  qword target_temp = spu_shuffle(tgt_present, tgt_past, (vector unsigned char) shuffle_mask_zeta);
+  qword meld = spu_shuffle(in_temp, target_temp, (vector unsigned char)shuffle_mask_beta);
+
+  out_temp0 = spu_shuffle(tgt_past, meld, shuffle_mask_delta);
+  out_temp1 = spu_shuffle(tgt_present, meld, shuffle_mask_epsilon);
+
+  si_stqd(out_temp0, (qword)address_counter_tgt, 0);
+  si_stqd(out_temp1, (qword)address_counter_tgt, 16);
+
+  return retval;
+}
+*/
+
+/*
+int main(){
+
+  unsigned char pooh[48];
+  unsigned char bear[48];
+
+  int i = 0;
+  for(i = 0; i < 48; ++i){
+    pooh[i] = i;
+    bear[i] = i;
+  }
+
+  spu_mcpy(&pooh[9], &bear[3], 15);
+
+  for(i = 0; i < 48; ++i) {
+    printf("%d, ", pooh[i]);
+  }
+  printf("\n");
+}
+*/

diff --git a/volk/spu_lib/spu_memset_unaligned.S b/volk/spu_lib/spu_memset_unaligned.S
new file mode 100644
index 000000000..a655c4c52
--- /dev/null
+++ b/volk/spu_lib/spu_memset_unaligned.S
@@ -0,0 +1,185 @@
+/* -*- asm -*- */
+/*
+ * Copyright 2008 Free Software Foundation, Inc.
+ *
+ * This file is part of GNU Radio
+ *
+ * GNU Radio is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 3, or (at your option)
+ * any later version.
+ *
+ * GNU Radio is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write to the Free Software Foundation, Inc.,
+ * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ */
+
+#include "gc_spu_macs.h"
+
+	.file "spu_memset_unaligned.S"
+
+	/*
+	 * Computes this, only a lot faster...
+	 *
+	 * void *
+	 * libvector_memset_unaligned(void *pv, int c, size_t n)
+	 * {
+	 *   unsigned char *p = (unsigned char *) pv;
+	 *   size_t i;
+	 *   for (i = 0; i < n; i++)
+	 *     p[i] = c;
+	 *
+	 *   return pv;
+	 * }
+	 */
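+
+	/*
+	 * Shape of the code below, as an outline (annotation):
+	 *
+	 *   do_head:  store up to 15 bytes to reach a 16-byte boundary,
+	 *             merging with the bytes already in that quadword
+	 *   middle:   128 bytes per iteration, eight stqd's
+	 *   middle2:  remaining whole quadwords, one stqd each
+	 *   do_tail:  read-modify-write the final partial quadword
+	 */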
+
+#define p_arg	arg1	// we're going to clobber arg1 w/ the return value
+#define c	arg2	// the constant we're writing
+#define n	arg3	// how many bytes to write
+
+#define p	r13	// where we're writing
+#define t0	r14
+#define t1	r15
+#define mask	r16
+#define old	r17
+#define an	r18	// aligned n (n rounded down to mod 16 boundary)
+#define next_p	r19
+#define cond1	r20
+#define cond2	r21
+#define m	r22
+#define r	r23
+
+	PROC_ENTRY(libvector_memset_unaligned)
+
+	// Hint the return from do_head, in case we go that way.
+	// There's pretty much nothing we can do to hint the branch to it.
+	hbrr	do_head_br, head_complete
+
+	MR(p, p_arg)		// leaves p, the return value, in the correct reg (r3)
+	BRZ_RETURN(n)
+
+	MODULO(t0, p, 16)	// is p%16 == 0?
+	VSPLTB(c, c, 3)		// splat byte in preferred slot of c into all slots
+	brnz	t0, do_head	// no, handle it
+head_complete:
+
+	/*
+	 * preconditions:
+	 *   p%16 == 0, n > 0
+	 */
+	hbrr	middle_loop_br, middle_loop
+
+	ROUND_DOWN(an, n, 16)	// an is "aligned n"
+	MODULO(n, n, 16)	// what's left over in the last quad
+	brz	an, do_tail	// no whole quad words; skip to tail
+	clgti	t0, an, 127	// an >= 128?
+	brz	t0, middle2	// nope, go handle the cases between 0 and 112
+
+	/*
+	 * 128 bytes / iteration
+	 */
+	.p2align 4
+middle_loop:
+	ai	an, an, -128
+	stqd	c, 0*16(p)
+	ai	next_p, p, 128
+	stqd	c, 1*16(p)
+	cgti	cond1, an, 127
+	stqd	c, 2*16(p)
+
+	stqd	c, 3*16(p)
+	stqd	c, 4*16(p)
+	stqd	c, 5*16(p)
+	stqd	c, 6*16(p)
+
+	MR(p, next_p)
+	stqd	c, 7*16-128(next_p)
+	or	cond2, n, an
+middle_loop_br:
+	brnz	cond1, middle_loop
+
+	/*
+	 * if an and n are both zero, return now
+	 */
+	BRZ_RETURN(cond2)
+
+	/*
+	 * otherwise handle last of full quad words
+	 *
+	 * 0 <= an < 128, p%16 == 0
+	 */
+middle2:
+	/*
+	 * if an == 0, go handle the final non-full quadword
+	 */
+	brz	an, do_tail
+	hbrr	middle2_loop_br, middle2_loop
+
+	.p2align 3
+middle2_loop:
+	ai	next_p, p, 16
+	stqd	c, 0(p)
+	ai	an, an, -16
+	LMR(p, next_p)
+middle2_loop_br:
+	brnz	an, middle2_loop
+
+	/* We're done with the full quadwords. */
+
+	/*
+	 * Handle the final partial quadword.
+	 * We'll be modifying only the left hand portion of the quad.
+	 *
+	 * preconditions:
+	 *   an == 0, 0 <= n < 16, p%16 == 0
+	 */
+do_tail:
+	HINT_RETURN(do_tail_ret)
+	il	mask, -1
+	sfi	t1, n, 16	// t1 = 16 - n
+	lqd	old, 0(p)
+	shlqby	mask, mask, t1
+	selb	t0, old, c, mask
+	stqd	t0, 0(p)
+do_tail_ret:
+	RETURN()

+	/*
+	 * ----------------------------------------------------------------
+	 * Handle the first partial quadword
+	 *
+	 * preconditions:
+	 *   p%16 != 0
+	 *
+	 * postconditions:
+	 *   p%16 == 0 or n == 0
+	 *
+	 *          |-- m --|
+	 *   +----------------+----------------+
+	 *   |      //////////|                |
+	 *   +----------------+----------------+
+	 *          |------ r ------|
+	 *          p
+	 * ----------------------------------------------------------------
+	 */
+do_head:
+	lqd	old, 0(p)
+	MODULO_NEG(r, p, 16)
+	il	mask, -1
+	UMIN(m, r, n)
+	shlqby	mask, mask, m	// 1's in the top, m*8 0's in the bottom
+	MR(t1, p)
+	sf	t0, m, r	// t0 = r - m
+	a	p, p, m		// p += m
+	rotqby	mask, mask, t0	// rotate 0's to the right place
+	sf	n, m, n		// n -= m
+	selb	t0, c, old, mask	// merge
+	stqd	t0, 0(t1)
+	BRZ_RETURN(n)
+do_head_br:
+	br	head_complete