path: root/volk/spu_lib
Diffstat (limited to 'volk/spu_lib')
-rw-r--r--  volk/spu_lib/gc_spu_macs.h                            380
-rw-r--r--  volk/spu_lib/spu_16s_cmpgt_unaligned.c                160
-rw-r--r--  volk/spu_lib/spu_16s_vector_subtract_unaligned.c      178
-rw-r--r--  volk/spu_lib/spu_16s_vector_sum_unaligned.c           178
-rw-r--r--  volk/spu_lib/spu_32fc_pointwise_multiply_unaligned.c  222
-rw-r--r--  volk/spu_lib/spu_memcpy_unaligned.c                   290
-rw-r--r--  volk/spu_lib/spu_memset_unaligned.S                   185
7 files changed, 1593 insertions, 0 deletions
diff --git a/volk/spu_lib/gc_spu_macs.h b/volk/spu_lib/gc_spu_macs.h
new file mode 100644
index 000000000..8e3e3f2a6
--- /dev/null
+++ b/volk/spu_lib/gc_spu_macs.h
@@ -0,0 +1,380 @@
+/* -*- asm -*- */
+/*
+ * Copyright 2008 Free Software Foundation, Inc.
+ *
+ * This file is part of GNU Radio
+ *
+ * GNU Radio is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 3, or (at your option)
+ * any later version.
+ *
+ * GNU Radio is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write to the Free Software Foundation, Inc.,
+ * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ */
+
+#ifndef INCLUDED_GC_SPU_MACS_H
+#define INCLUDED_GC_SPU_MACS_H
+
+/*
+ * This file contains a set of macros that are generally useful when
+ * coding in SPU assembler
+ *
+ * Note that the multi-instruction macros in here may overwrite
+ * registers 77, 78, and 79 without warning.
+ */
+
+/*
+ * defines for all registers
+ */
+#define r0 $0
+#define r1 $1
+#define r2 $2
+#define r3 $3
+#define r4 $4
+#define r5 $5
+#define r6 $6
+#define r7 $7
+#define r8 $8
+#define r9 $9
+#define r10 $10
+#define r11 $11
+#define r12 $12
+#define r13 $13
+#define r14 $14
+#define r15 $15
+#define r16 $16
+#define r17 $17
+#define r18 $18
+#define r19 $19
+#define r20 $20
+#define r21 $21
+#define r22 $22
+#define r23 $23
+#define r24 $24
+#define r25 $25
+#define r26 $26
+#define r27 $27
+#define r28 $28
+#define r29 $29
+#define r30 $30
+#define r31 $31
+#define r32 $32
+#define r33 $33
+#define r34 $34
+#define r35 $35
+#define r36 $36
+#define r37 $37
+#define r38 $38
+#define r39 $39
+#define r40 $40
+#define r41 $41
+#define r42 $42
+#define r43 $43
+#define r44 $44
+#define r45 $45
+#define r46 $46
+#define r47 $47
+#define r48 $48
+#define r49 $49
+#define r50 $50
+#define r51 $51
+#define r52 $52
+#define r53 $53
+#define r54 $54
+#define r55 $55
+#define r56 $56
+#define r57 $57
+#define r58 $58
+#define r59 $59
+#define r60 $60
+#define r61 $61
+#define r62 $62
+#define r63 $63
+#define r64 $64
+#define r65 $65
+#define r66 $66
+#define r67 $67
+#define r68 $68
+#define r69 $69
+#define r70 $70
+#define r71 $71
+#define r72 $72
+#define r73 $73
+#define r74 $74
+#define r75 $75
+#define r76 $76
+#define r77 $77
+#define r78 $78
+#define r79 $79
+#define r80 $80
+#define r81 $81
+#define r82 $82
+#define r83 $83
+#define r84 $84
+#define r85 $85
+#define r86 $86
+#define r87 $87
+#define r88 $88
+#define r89 $89
+#define r90 $90
+#define r91 $91
+#define r92 $92
+#define r93 $93
+#define r94 $94
+#define r95 $95
+#define r96 $96
+#define r97 $97
+#define r98 $98
+#define r99 $99
+#define r100 $100
+#define r101 $101
+#define r102 $102
+#define r103 $103
+#define r104 $104
+#define r105 $105
+#define r106 $106
+#define r107 $107
+#define r108 $108
+#define r109 $109
+#define r110 $110
+#define r111 $111
+#define r112 $112
+#define r113 $113
+#define r114 $114
+#define r115 $115
+#define r116 $116
+#define r117 $117
+#define r118 $118
+#define r119 $119
+#define r120 $120
+#define r121 $121
+#define r122 $122
+#define r123 $123
+#define r124 $124
+#define r125 $125
+#define r126 $126
+#define r127 $127
+
+
+#define lr r0 // link register
+#define sp r1 // stack pointer
+ // r2 is environment pointer for langs that need it (ALGOL)
+
+#define retval r3 // return values are passed in regs starting at r3
+
+#define arg1 r3 // args are passed in regs starting at r3
+#define arg2 r4
+#define arg3 r5
+#define arg4 r6
+#define arg5 r7
+#define arg6 r8
+#define arg7 r9
+#define arg8 r10
+#define arg9 r11
+#define arg10 r12
+
+// r3 - r74 are volatile (caller saves)
+// r75 - r79 are volatile (scratch regs possibly destroyed by fct prolog/epilog)
+// r80 - r127 are non-volatile (callee saves)
+
+// scratch registers reserved for use by the macros in this file.
+
+#define _gc_t0 r79
+#define _gc_t1 r78
+#define _gc_t2 r77
+
+/*
+ * ----------------------------------------------------------------
+ * pseudo ops
+ * ----------------------------------------------------------------
+ */
+#define PROC_ENTRY(name) \
+ .text; \
+ .p2align 4; \
+ .global name; \
+ .type name, @function; \
+name:
+
+/*
+ * ----------------------------------------------------------------
+ * aliases for common operations
+ * ----------------------------------------------------------------
+ */
+
+// Move register (even pipe, 2 cycles)
+#define MR(rt, ra) or rt, ra, ra;
+
+// Move register (odd pipe, 4 cycles)
+#define LMR(rt, ra) rotqbyi rt, ra, 0;
+
+// return
+#define RETURN() bi lr;
+
+// hint for a return
+#define HINT_RETURN(ret_label) hbr ret_label, lr;
+
+// return if zero
+#define BRZ_RETURN(rt) biz rt, lr;
+
+// return if not zero
+#define BRNZ_RETURN(rt) binz rt, lr;
+
+// return if halfword zero
+#define BRHZ_RETURN(rt) bihz rt, lr;
+
+// return if halfword not zero
+#define BRHNZ_RETURN(rt) bihnz rt, lr;
+
+
+/*
+ * ----------------------------------------------------------------
+ * modulo like things for constant moduli that are powers of 2
+ * ----------------------------------------------------------------
+ */
+
+// rt = ra & (pow2 - 1)
+#define MODULO(rt, ra, pow2) \
+ andi rt, ra, (pow2)-1;
+
+// rt = pow2 - (ra & (pow2 - 1))
+#define MODULO_NEG(rt, ra, pow2) \
+ andi rt, ra, (pow2)-1; \
+ sfi rt, rt, (pow2);
+
+// rt = ra & -(pow2)
+#define ROUND_DOWN(rt, ra, pow2) \
+ andi rt, ra, -(pow2);
+
+// rt = (ra + (pow2 - 1)) & -(pow2)
+#define ROUND_UP(rt, ra, pow2) \
+ ai rt, ra, (pow2)-1; \
+ andi rt, rt, -(pow2);
+
+/*
+ * ----------------------------------------------------------------
+ * Splat - replicate a particular slot into all slots
+ * Altivec analogs...
+ * ----------------------------------------------------------------
+ */
+
+// replicate byte from slot s [0,15]
+#define VSPLTB(rt, ra, s) \
+ ilh _gc_t0, (s)*0x0101; \
+ shufb rt, ra, ra, _gc_t0;
+
+// replicate halfword from slot s [0,7]
+#define VSPLTH(rt, ra, s) \
+ ilh _gc_t0, 2*(s)*0x0101 + 0x0001; \
+ shufb rt, ra, ra, _gc_t0;
+
+// replicate word from slot s [0,3]
+#define VSPLTW(rt, ra, s) \
+	ilhu	_gc_t0, 4*(s)*0x0101 + 0x0001; \
+ iohl _gc_t0, 4*(s)*0x0101 + 0x0203; \
+ shufb rt, ra, ra, _gc_t0;
+
+// replicate double from slot s [0,1]
+#define VSPLTD(rt, ra, s) \
+ /* sp is always 16-byte aligned */ \
+ cdd _gc_t0, 8(sp); /* 0x10111213 14151617 00010203 04050607 */ \
+	rotqbyi	rt, ra, (s) << 3;	/* rotate double into preferred slot */ \
+ shufb rt, rt, rt, _gc_t0;
+
+/*
+ * ----------------------------------------------------------------
+ * lots of min/max variations...
+ *
+ * On a slot by slot basis, compute the min or max
+ *
+ * U - unsigned, else signed
+ * B,H,{} - byte, halfword, word
+ * F float
+ * ----------------------------------------------------------------
+ */
+
+#define MIN_SELB(rt, ra, rb, rc) selb rt, ra, rb, rc;
+#define MAX_SELB(rt, ra, rb, rc) selb rt, rb, ra, rc;
+
+ // words
+
+#define MIN(rt, ra, rb) \
+ cgt _gc_t0, ra, rb; \
+ MIN_SELB(rt, ra, rb, _gc_t0)
+
+#define MAX(rt, ra, rb) \
+ cgt _gc_t0, ra, rb; \
+ MAX_SELB(rt, ra, rb, _gc_t0)
+
+#define UMIN(rt, ra, rb) \
+ clgt _gc_t0, ra, rb; \
+ MIN_SELB(rt, ra, rb, _gc_t0)
+
+#define UMAX(rt, ra, rb) \
+ clgt _gc_t0, ra, rb; \
+ MAX_SELB(rt, ra, rb, _gc_t0)
+
+ // bytes
+
+#define MINB(rt, ra, rb) \
+ cgtb _gc_t0, ra, rb; \
+ MIN_SELB(rt, ra, rb, _gc_t0)
+
+#define MAXB(rt, ra, rb) \
+ cgtb _gc_t0, ra, rb; \
+ MAX_SELB(rt, ra, rb, _gc_t0)
+
+#define UMINB(rt, ra, rb) \
+ clgtb _gc_t0, ra, rb; \
+ MIN_SELB(rt, ra, rb, _gc_t0)
+
+#define UMAXB(rt, ra, rb) \
+ clgtb _gc_t0, ra, rb; \
+ MAX_SELB(rt, ra, rb, _gc_t0)
+
+ // halfwords
+
+#define MINH(rt, ra, rb) \
+ cgth _gc_t0, ra, rb; \
+ MIN_SELB(rt, ra, rb, _gc_t0)
+
+#define MAXH(rt, ra, rb) \
+ cgth _gc_t0, ra, rb; \
+ MAX_SELB(rt, ra, rb, _gc_t0)
+
+#define UMINH(rt, ra, rb) \
+ clgth _gc_t0, ra, rb; \
+ MIN_SELB(rt, ra, rb, _gc_t0)
+
+#define UMAXH(rt, ra, rb) \
+ clgth _gc_t0, ra, rb; \
+ MAX_SELB(rt, ra, rb, _gc_t0)
+
+ // floats
+
+#define FMIN(rt, ra, rb) \
+ fcgt _gc_t0, ra, rb; \
+ MIN_SELB(rt, ra, rb, _gc_t0)
+
+#define FMAX(rt, ra, rb) \
+ fcgt _gc_t0, ra, rb; \
+ MAX_SELB(rt, ra, rb, _gc_t0)
+
+// Ignoring the sign, select the values with the minimum magnitude
+#define FMINMAG(rt, ra, rb) \
+ fcmgt _gc_t0, ra, rb; \
+ MIN_SELB(rt, ra, rb, _gc_t0)
+
+// Ignoring the sign, select the values with the maximum magnitude
+#define FMAXMAG(rt, ra, rb) \
+ fcmgt _gc_t0, ra, rb; \
+ MAX_SELB(rt, ra, rb, _gc_t0)
+
+
+#endif /* INCLUDED_GC_SPU_MACS_H */
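
A quick scalar model of the arithmetic these macros lean on (a sketch for
reference, not part of the commit): MODULO/ROUND_DOWN/ROUND_UP are the usual
power-of-two mask identities, and the MIN/MAX family is a compare feeding
selb, which picks its second source wherever the mask bit is set.

    #include <stdio.h>

    /* Scalar equivalents, assuming pow2 is a power of two. */
    static unsigned modulo(unsigned a, unsigned pow2)     { return a & (pow2 - 1); }
    static unsigned round_down(unsigned a, unsigned pow2) { return a & -pow2; }
    static unsigned round_up(unsigned a, unsigned pow2)   { return (a + pow2 - 1) & -pow2; }

    /* selb semantics: rb where the mask bit is 1, ra where it is 0.
     * MIN feeds it the all-ones cgt(a, b) mask, so b wins iff a > b. */
    static int min_selb(int a, int b)
    {
        unsigned mask = (a > b) ? 0xffffffffu : 0u;
        return (int)(((unsigned)a & ~mask) | ((unsigned)b & mask));
    }

    int main(void)
    {
        printf("%u %u %u %d\n", modulo(37, 16), round_down(37, 16),
               round_up(37, 16), min_selb(-3, 7)); /* 5 32 48 -3 */
        return 0;
    }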
diff --git a/volk/spu_lib/spu_16s_cmpgt_unaligned.c b/volk/spu_lib/spu_16s_cmpgt_unaligned.c
new file mode 100644
index 000000000..765cacd9a
--- /dev/null
+++ b/volk/spu_lib/spu_16s_cmpgt_unaligned.c
@@ -0,0 +1,160 @@
+#include <spu_intrinsics.h>
+
+void* libvector_16s_cmpgt_unaligned(void* target, void* src, signed short val, unsigned int num_bytes){
+ //loop iterator i
+ int i = 0;
+ void* retval = target;
+
+
+ //put the target and source addresses into qwords
+ vector unsigned int address_counter_tgt = {(unsigned int)target, 0, 0, 0};
+ vector unsigned int address_counter_src = {(unsigned int)src, 0, 0 ,0};
+
+ //create shuffle masks
+
+ //shuffle mask building blocks:
+ //all from the first vector
+ vector unsigned char oneup = {0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
+ 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f};
+ //all from the second vector
+ vector unsigned char second_oneup = {0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17,
+ 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f};
+
+
+
+ //gamma: second half of the second, first half of the first, break at (unsigned int)src%16
+ vector unsigned char src_cmp = spu_splats((unsigned char)((unsigned int)src%16));
+ vector unsigned char gt_res = spu_cmpgt(oneup, src_cmp);
+ vector unsigned char eq_res = spu_cmpeq(oneup, src_cmp);
+ vector unsigned char cmp_res = spu_or(gt_res, eq_res);
+ vector unsigned char sixteen_uchar = spu_splats((unsigned char)16);
+ vector unsigned char phase_change = spu_and(sixteen_uchar, cmp_res);
+ vector unsigned int shuffle_mask_gamma = spu_add((vector unsigned int)phase_change,
+ (vector unsigned int)oneup);
+ shuffle_mask_gamma = spu_rlqwbyte(shuffle_mask_gamma, (unsigned int)src%16);
+
+
+
+
+ vector unsigned char tgt_second = spu_rlqwbyte(second_oneup, -((unsigned int)target%16));
+ vector unsigned char tgt_first = spu_rlqwbyte(oneup, -((unsigned int)target%16));
+
+ //alpha: first half of first, second half of second, break at (unsigned int)target%16
+ src_cmp = spu_splats((unsigned char)((unsigned int)target%16));
+ gt_res = spu_cmpgt(oneup, src_cmp);
+ eq_res = spu_cmpeq(oneup, src_cmp);
+ cmp_res = spu_or(gt_res, eq_res);
+ phase_change = spu_and(sixteen_uchar, cmp_res);
+ vector unsigned int shuffle_mask_alpha = spu_add((vector unsigned int)phase_change,
+ (vector unsigned int)oneup);
+
+ //delta: first half of first, first half of second, break at (unsigned int)target%16
+ vector unsigned char shuffle_mask_delta = spu_shuffle(oneup, tgt_second, (vector unsigned char)shuffle_mask_alpha);
+ //epsilon: second half of second, second half of first, break at (unsigned int)target%16
+ vector unsigned char shuffle_mask_epsilon = spu_shuffle(tgt_second, oneup, (vector unsigned char)shuffle_mask_alpha);
+ //zeta: second half of second, first half of first, break at 16 - (unsigned int)target%16
+ vector unsigned int shuffle_mask_zeta = spu_rlqwbyte(shuffle_mask_alpha, (unsigned int)target%16);
+
+ //beta: first half of first, second half of second, break at num_bytes%16
+ src_cmp = spu_splats((unsigned char)(num_bytes%16));
+ gt_res = spu_cmpgt(oneup, src_cmp);
+ eq_res = spu_cmpeq(oneup, src_cmp);
+ cmp_res = spu_or(gt_res, eq_res);
+ phase_change = spu_and(sixteen_uchar, cmp_res);
+ vector unsigned int shuffle_mask_beta = spu_add((vector unsigned int)phase_change,
+ (vector unsigned int)oneup);
+
+
+
+
+
+
+ qword src_past;
+ qword src_present;
+ qword tgt_past;
+ qword tgt_present;
+
+ qword in_temp;
+ qword out_temp0;
+ qword out_temp1;
+
+ src_past = si_lqd((qword)address_counter_src, 0);
+ tgt_past = si_lqd((qword)address_counter_tgt, 0);
+
+ vector signed short vec_val = spu_splats(val);
+ vector unsigned short compare;
+ vector unsigned short ones = {1, 1, 1, 1, 1, 1, 1, 1};
+ vector unsigned short after_and;
+
+ for(i = 0; i < num_bytes/16; ++i) {
+
+ src_present = si_lqd((qword)address_counter_src, 16);
+ tgt_present = si_lqd((qword)address_counter_tgt, 16);
+
+ in_temp = spu_shuffle(src_present, src_past, (vector unsigned char)shuffle_mask_gamma);
+
+ compare = spu_cmpgt((vector signed short) in_temp, vec_val);
+ after_and = spu_and(compare, ones);
+
+
+ out_temp0 = spu_shuffle(tgt_past, (qword)after_and, shuffle_mask_delta);
+ out_temp1 = spu_shuffle(tgt_present, (qword)after_and, shuffle_mask_epsilon);
+
+ si_stqd(out_temp0, (qword)address_counter_tgt, 0);
+ si_stqd(out_temp1, (qword)address_counter_tgt, 16);
+
+ tgt_past = out_temp1;
+ src_past = src_present;
+ address_counter_src = spu_add(address_counter_src, 16);
+ address_counter_tgt = spu_add(address_counter_tgt, 16);
+
+
+ }
+
+ src_present = si_lqd((qword)address_counter_src, 16);
+ tgt_present = si_lqd((qword)address_counter_tgt, 16);
+
+
+ in_temp = spu_shuffle(src_present, src_past,(vector unsigned char) shuffle_mask_gamma);
+
+ compare = spu_cmpgt((vector signed short) in_temp, vec_val);
+ after_and = spu_and(compare, ones);
+
+
+ qword target_temp = spu_shuffle(tgt_present, tgt_past, (vector unsigned char) shuffle_mask_zeta);
+ qword meld = spu_shuffle((qword)after_and, target_temp, (vector unsigned char)shuffle_mask_beta);
+
+
+
+ out_temp0 = spu_shuffle(tgt_past, meld, shuffle_mask_delta);
+ out_temp1 = spu_shuffle(tgt_present, meld, shuffle_mask_epsilon);
+
+ si_stqd(out_temp0, (qword)address_counter_tgt, 0);
+ si_stqd(out_temp1, (qword)address_counter_tgt, 16);
+
+ return retval;
+}
+
+
+
+/*
+int main(){
+
+ signed short pooh[48];
+ signed short bear[48];
+
+ int i = 0;
+ for(i = 0; i < 48; i += 2){
+ bear[i] = i;
+ bear[i + 1] = -i;
+ }
+
+  libvector_16s_cmpgt_unaligned(&pooh[0], &bear[0], 0, 48 * sizeof(signed short));
+
+ for(i = 0; i < 48; ++i) {
+ printf("%d, ", pooh[i]);
+ }
+ printf("\n");
+}
+*/
+
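
For reference, a plain-C model of what the kernel above computes (my reading
of the code, not part of the commit): a 0/1 greater-than flag per signed
16-bit element, with all the shuffle-mask work existing only to handle the
unaligned loads and the read-modify-write of the target's edge quadwords.

    #include <stddef.h>

    /* Scalar model: target[i] = (src[i] > val) ? 1 : 0 over
     * num_bytes/2 signed 16-bit elements. */
    static void cmpgt_16s_ref(short *target, const short *src,
                              short val, size_t num_bytes)
    {
        for (size_t i = 0; i < num_bytes / sizeof(short); ++i)
            target[i] = (src[i] > val) ? 1 : 0;
    }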
diff --git a/volk/spu_lib/spu_16s_vector_subtract_unaligned.c b/volk/spu_lib/spu_16s_vector_subtract_unaligned.c
new file mode 100644
index 000000000..a3ce6c2fe
--- /dev/null
+++ b/volk/spu_lib/spu_16s_vector_subtract_unaligned.c
@@ -0,0 +1,178 @@
+#include <spu_intrinsics.h>
+
+void* libvector_16s_vector_subtract_unaligned(void* target, void* src0, void* src1, unsigned int num_bytes){
+ //loop iterator i
+ int i = 0;
+ void* retval = target;
+
+
+ //put the target and source addresses into qwords
+ vector unsigned int address_counter_tgt = {(unsigned int)target, 0, 0, 0};
+ vector unsigned int address_counter_src0 = {(unsigned int)src0, 0, 0 ,0};
+ vector unsigned int address_counter_src1 = {(unsigned int)src1, 0, 0, 0};
+
+ //create shuffle masks
+
+ //shuffle mask building blocks:
+ //all from the first vector
+ vector unsigned char oneup = {0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
+ 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f};
+ //all from the second vector
+ vector unsigned char second_oneup = {0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17,
+ 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f};
+
+
+
+ //gamma: second half of the second, first half of the first, break at (unsigned int)src0%16
+ vector unsigned char src_cmp = spu_splats((unsigned char)((unsigned int)src0%16));
+ vector unsigned char gt_res = spu_cmpgt(oneup, src_cmp);
+ vector unsigned char eq_res = spu_cmpeq(oneup, src_cmp);
+ vector unsigned char cmp_res = spu_or(gt_res, eq_res);
+ vector unsigned char sixteen_uchar = spu_splats((unsigned char)16);
+ vector unsigned char phase_change = spu_and(sixteen_uchar, cmp_res);
+ vector unsigned int shuffle_mask_gamma = spu_add((vector unsigned int)phase_change,
+ (vector unsigned int)oneup);
+ shuffle_mask_gamma = spu_rlqwbyte(shuffle_mask_gamma, (unsigned int)src0%16);
+
+ //eta: second half of the second, first half of the first, break at (unsigned int)src1%16
+ src_cmp = spu_splats((unsigned char)((unsigned int)src1%16));
+ gt_res = spu_cmpgt(oneup, src_cmp);
+ eq_res = spu_cmpeq(oneup, src_cmp);
+ cmp_res = spu_or(gt_res, eq_res);
+ sixteen_uchar = spu_splats((unsigned char)16);
+ phase_change = spu_and(sixteen_uchar, cmp_res);
+ vector unsigned int shuffle_mask_eta = spu_add((vector unsigned int)phase_change,
+ (vector unsigned int)oneup);
+ shuffle_mask_eta = spu_rlqwbyte(shuffle_mask_eta, (unsigned int)src1%16);
+
+
+
+
+
+ vector unsigned char tgt_second = spu_rlqwbyte(second_oneup, -((unsigned int)target%16));
+ vector unsigned char tgt_first = spu_rlqwbyte(oneup, -((unsigned int)target%16));
+
+ //alpha: first half of first, second half of second, break at (unsigned int)target%16
+ src_cmp = spu_splats((unsigned char)((unsigned int)target%16));
+ gt_res = spu_cmpgt(oneup, src_cmp);
+ eq_res = spu_cmpeq(oneup, src_cmp);
+ cmp_res = spu_or(gt_res, eq_res);
+ phase_change = spu_and(sixteen_uchar, cmp_res);
+ vector unsigned int shuffle_mask_alpha = spu_add((vector unsigned int)phase_change,
+ (vector unsigned int)oneup);
+
+ //delta: first half of first, first half of second, break at (unsigned int)target%16
+ vector unsigned char shuffle_mask_delta = spu_shuffle(oneup, tgt_second, (vector unsigned char)shuffle_mask_alpha);
+ //epsilon: second half of second, second half of first, break at (unsigned int)target%16
+ vector unsigned char shuffle_mask_epsilon = spu_shuffle(tgt_second, oneup, (vector unsigned char)shuffle_mask_alpha);
+ //zeta: second half of second, first half of first, break at 16 - (unsigned int)target%16
+ vector unsigned int shuffle_mask_zeta = spu_rlqwbyte(shuffle_mask_alpha, (unsigned int)target%16);
+
+ //beta: first half of first, second half of second, break at num_bytes%16
+ src_cmp = spu_splats((unsigned char)(num_bytes%16));
+ gt_res = spu_cmpgt(oneup, src_cmp);
+ eq_res = spu_cmpeq(oneup, src_cmp);
+ cmp_res = spu_or(gt_res, eq_res);
+ phase_change = spu_and(sixteen_uchar, cmp_res);
+ vector unsigned int shuffle_mask_beta = spu_add((vector unsigned int)phase_change,
+ (vector unsigned int)oneup);
+
+
+
+
+
+
+ qword src0_past;
+ qword src0_present;
+ qword src1_past;
+ qword src1_present;
+ qword tgt_past;
+ qword tgt_present;
+
+ qword in_temp0;
+ qword in_temp1;
+ qword out_temp0;
+ qword out_temp1;
+
+ vector signed short sum;
+
+ src0_past = si_lqd((qword)address_counter_src0, 0);
+ src1_past = si_lqd((qword)address_counter_src1, 0);
+ tgt_past = si_lqd((qword)address_counter_tgt, 0);
+
+ for(i = 0; i < num_bytes/16; ++i) {
+
+ src0_present = si_lqd((qword)address_counter_src0, 16);
+ src1_present = si_lqd((qword)address_counter_src1, 16);
+ tgt_present = si_lqd((qword)address_counter_tgt, 16);
+
+ in_temp0 = spu_shuffle(src0_present, src0_past, (vector unsigned char)shuffle_mask_gamma);
+ in_temp1 = spu_shuffle(src1_present, src1_past, (vector unsigned char)shuffle_mask_eta);
+
+ sum = spu_sub((vector signed short)in_temp0, (vector signed short)in_temp1);
+
+
+ out_temp0 = spu_shuffle(tgt_past, (qword)sum, shuffle_mask_delta);
+ out_temp1 = spu_shuffle(tgt_present, (qword)sum, shuffle_mask_epsilon);
+
+ si_stqd(out_temp0, (qword)address_counter_tgt, 0);
+ si_stqd(out_temp1, (qword)address_counter_tgt, 16);
+
+ tgt_past = out_temp1;
+ src0_past = src0_present;
+ src1_past = src1_present;
+ address_counter_src0 = spu_add(address_counter_src0, 16);
+ address_counter_src1 = spu_add(address_counter_src1, 16);
+ address_counter_tgt = spu_add(address_counter_tgt, 16);
+
+
+ }
+
+ src0_present = si_lqd((qword)address_counter_src0, 16);
+ src1_present = si_lqd((qword)address_counter_src1, 16);
+ tgt_present = si_lqd((qword)address_counter_tgt, 16);
+
+
+ in_temp0 = spu_shuffle(src0_present, src0_past, (vector unsigned char) shuffle_mask_gamma);
+ in_temp1 = spu_shuffle(src1_present, src1_past, (vector unsigned char) shuffle_mask_eta);
+ sum = spu_sub((vector signed short)in_temp0, (vector signed short)in_temp1);
+ qword target_temp = spu_shuffle(tgt_present, tgt_past, (vector unsigned char) shuffle_mask_zeta);
+ qword meld = spu_shuffle((qword)sum, target_temp, (vector unsigned char)shuffle_mask_beta);
+
+
+
+ out_temp0 = spu_shuffle(tgt_past, meld, shuffle_mask_delta);
+ out_temp1 = spu_shuffle(tgt_present, meld, shuffle_mask_epsilon);
+
+ si_stqd(out_temp0, (qword)address_counter_tgt, 0);
+ si_stqd(out_temp1, (qword)address_counter_tgt, 16);
+
+ return retval;
+}
+
+
+
+/*
+int main(){
+
+ signed short pooh[48];
+ signed short bear[48];
+ signed short res[48];
+
+ int i = 0;
+ for(i = 0; i < 48; ++i){
+ pooh[i] = i;
+ }
+ for(i = 48; i < 96; ++i){
+ bear[i - 48] = i;
+ }
+
+  libvector_16s_vector_subtract_unaligned(res, &pooh[0], &bear[0], 48 * sizeof(signed short));
+
+ for(i = 0; i < 48; ++i) {
+ printf("%d, ", res[i]);
+ }
+ printf("\n");
+}
+*/
+
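
The gamma and eta masks above encode the same trick: build byte indices
0..15, add 16 to every slot at or past the misalignment s, then rotate by s
bytes, so spu_shuffle(present, past, mask) extracts the 16-byte window that
starts s bytes into the earlier quadword. A standalone sketch of the mask
construction (hypothetical example value for s, not part of the commit):

    #include <stdio.h>

    int main(void)
    {
        unsigned s = 5; /* example misalignment, src0 % 16 */
        unsigned char mask[16], gamma[16];

        /* oneup + phase_change: i for i < s, i + 16 for i >= s */
        for (unsigned i = 0; i < 16; ++i)
            mask[i] = (unsigned char)(i + (i >= s ? 16 : 0));
        /* rotate left by s bytes (spu_rlqwbyte) */
        for (unsigned k = 0; k < 16; ++k)
            gamma[k] = mask[(k + s) % 16];

        /* Indices 0x10..0x1f select the second shuffle operand (the
         * past quadword), 0x00..0x0f the first (the present one). */
        for (unsigned k = 0; k < 16; ++k)
            printf("%02x ", gamma[k]);
        printf("\n"); /* 15 16 17 18 19 1a 1b 1c 1d 1e 1f 00 01 02 03 04 */
        return 0;
    }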
diff --git a/volk/spu_lib/spu_16s_vector_sum_unaligned.c b/volk/spu_lib/spu_16s_vector_sum_unaligned.c
new file mode 100644
index 000000000..5a1cb9aaf
--- /dev/null
+++ b/volk/spu_lib/spu_16s_vector_sum_unaligned.c
@@ -0,0 +1,178 @@
+#include <spu_intrinsics.h>
+
+void* libvector_16s_vector_sum_unaligned(void* target, void* src0, void* src1, unsigned int num_bytes){
+ //loop iterator i
+ int i = 0;
+ void* retval = target;
+
+
+ //put the target and source addresses into qwords
+ vector unsigned int address_counter_tgt = {(unsigned int)target, 0, 0, 0};
+ vector unsigned int address_counter_src0 = {(unsigned int)src0, 0, 0 ,0};
+ vector unsigned int address_counter_src1 = {(unsigned int)src1, 0, 0, 0};
+
+ //create shuffle masks
+
+ //shuffle mask building blocks:
+ //all from the first vector
+ vector unsigned char oneup = {0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
+ 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f};
+ //all from the second vector
+ vector unsigned char second_oneup = {0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17,
+ 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f};
+
+
+
+ //gamma: second half of the second, first half of the first, break at (unsigned int)src0%16
+ vector unsigned char src_cmp = spu_splats((unsigned char)((unsigned int)src0%16));
+ vector unsigned char gt_res = spu_cmpgt(oneup, src_cmp);
+ vector unsigned char eq_res = spu_cmpeq(oneup, src_cmp);
+ vector unsigned char cmp_res = spu_or(gt_res, eq_res);
+ vector unsigned char sixteen_uchar = spu_splats((unsigned char)16);
+ vector unsigned char phase_change = spu_and(sixteen_uchar, cmp_res);
+ vector unsigned int shuffle_mask_gamma = spu_add((vector unsigned int)phase_change,
+ (vector unsigned int)oneup);
+ shuffle_mask_gamma = spu_rlqwbyte(shuffle_mask_gamma, (unsigned int)src0%16);
+
+ //eta: second half of the second, first half of the first, break at (unsigned int)src1%16
+ src_cmp = spu_splats((unsigned char)((unsigned int)src1%16));
+ gt_res = spu_cmpgt(oneup, src_cmp);
+ eq_res = spu_cmpeq(oneup, src_cmp);
+ cmp_res = spu_or(gt_res, eq_res);
+ sixteen_uchar = spu_splats((unsigned char)16);
+ phase_change = spu_and(sixteen_uchar, cmp_res);
+ vector unsigned int shuffle_mask_eta = spu_add((vector unsigned int)phase_change,
+ (vector unsigned int)oneup);
+ shuffle_mask_eta = spu_rlqwbyte(shuffle_mask_eta, (unsigned int)src1%16);
+
+
+
+
+
+ vector unsigned char tgt_second = spu_rlqwbyte(second_oneup, -((unsigned int)target%16));
+ vector unsigned char tgt_first = spu_rlqwbyte(oneup, -((unsigned int)target%16));
+
+ //alpha: first half of first, second half of second, break at (unsigned int)target%16
+ src_cmp = spu_splats((unsigned char)((unsigned int)target%16));
+ gt_res = spu_cmpgt(oneup, src_cmp);
+ eq_res = spu_cmpeq(oneup, src_cmp);
+ cmp_res = spu_or(gt_res, eq_res);
+ phase_change = spu_and(sixteen_uchar, cmp_res);
+ vector unsigned int shuffle_mask_alpha = spu_add((vector unsigned int)phase_change,
+ (vector unsigned int)oneup);
+
+ //delta: first half of first, first half of second, break at (unsigned int)target%16
+ vector unsigned char shuffle_mask_delta = spu_shuffle(oneup, tgt_second, (vector unsigned char)shuffle_mask_alpha);
+ //epsilon: second half of second, second half of first, break at (unsigned int)target%16
+ vector unsigned char shuffle_mask_epsilon = spu_shuffle(tgt_second, oneup, (vector unsigned char)shuffle_mask_alpha);
+ //zeta: second half of second, first half of first, break at 16 - (unsigned int)target%16
+ vector unsigned int shuffle_mask_zeta = spu_rlqwbyte(shuffle_mask_alpha, (unsigned int)target%16);
+
+ //beta: first half of first, second half of second, break at num_bytes%16
+ src_cmp = spu_splats((unsigned char)(num_bytes%16));
+ gt_res = spu_cmpgt(oneup, src_cmp);
+ eq_res = spu_cmpeq(oneup, src_cmp);
+ cmp_res = spu_or(gt_res, eq_res);
+ phase_change = spu_and(sixteen_uchar, cmp_res);
+ vector unsigned int shuffle_mask_beta = spu_add((vector unsigned int)phase_change,
+ (vector unsigned int)oneup);
+
+
+
+
+
+
+ qword src0_past;
+ qword src0_present;
+ qword src1_past;
+ qword src1_present;
+ qword tgt_past;
+ qword tgt_present;
+
+ qword in_temp0;
+ qword in_temp1;
+ qword out_temp0;
+ qword out_temp1;
+
+  vector signed short sum;
+
+ src0_past = si_lqd((qword)address_counter_src0, 0);
+ src1_past = si_lqd((qword)address_counter_src1, 0);
+ tgt_past = si_lqd((qword)address_counter_tgt, 0);
+
+ for(i = 0; i < num_bytes/16; ++i) {
+
+ src0_present = si_lqd((qword)address_counter_src0, 16);
+ src1_present = si_lqd((qword)address_counter_src1, 16);
+ tgt_present = si_lqd((qword)address_counter_tgt, 16);
+
+ in_temp0 = spu_shuffle(src0_present, src0_past, (vector unsigned char)shuffle_mask_gamma);
+ in_temp1 = spu_shuffle(src1_present, src1_past, (vector unsigned char)shuffle_mask_eta);
+
+    sum = spu_add((vector signed short)in_temp0, (vector signed short)in_temp1);
+
+
+ out_temp0 = spu_shuffle(tgt_past, (qword)sum, shuffle_mask_delta);
+ out_temp1 = spu_shuffle(tgt_present, (qword)sum, shuffle_mask_epsilon);
+
+ si_stqd(out_temp0, (qword)address_counter_tgt, 0);
+ si_stqd(out_temp1, (qword)address_counter_tgt, 16);
+
+ tgt_past = out_temp1;
+ src0_past = src0_present;
+ src1_past = src1_present;
+ address_counter_src0 = spu_add(address_counter_src0, 16);
+ address_counter_src1 = spu_add(address_counter_src1, 16);
+ address_counter_tgt = spu_add(address_counter_tgt, 16);
+
+
+ }
+
+ src0_present = si_lqd((qword)address_counter_src0, 16);
+ src1_present = si_lqd((qword)address_counter_src1, 16);
+ tgt_present = si_lqd((qword)address_counter_tgt, 16);
+
+
+ in_temp0 = spu_shuffle(src0_present, src0_past, (vector unsigned char) shuffle_mask_gamma);
+ in_temp1 = spu_shuffle(src1_present, src1_past, (vector unsigned char) shuffle_mask_eta);
+  sum = spu_add((vector signed short)in_temp0, (vector signed short)in_temp1);
+ qword target_temp = spu_shuffle(tgt_present, tgt_past, (vector unsigned char) shuffle_mask_zeta);
+ qword meld = spu_shuffle((qword)sum, target_temp, (vector unsigned char)shuffle_mask_beta);
+
+
+
+ out_temp0 = spu_shuffle(tgt_past, meld, shuffle_mask_delta);
+ out_temp1 = spu_shuffle(tgt_present, meld, shuffle_mask_epsilon);
+
+ si_stqd(out_temp0, (qword)address_counter_tgt, 0);
+ si_stqd(out_temp1, (qword)address_counter_tgt, 16);
+
+ return retval;
+}
+
+
+
+/*
+int main(){
+
+ signed short pooh[48];
+ signed short bear[48];
+ signed short res[48];
+
+ int i = 0;
+ for(i = 0; i < 48; ++i){
+ pooh[i] = i;
+ }
+ for(i = 48; i < 96; ++i){
+ bear[i - 48] = i;
+ }
+
+  libvector_16s_vector_sum_unaligned(&pooh[9], &pooh[9], &bear[3], 30);
+
+ for(i = 0; i < 48; ++i) {
+ printf("%d, ", pooh[i]);
+ }
+ printf("\n");
+}
+*/
+
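
A scalar model of the sum kernel (a sketch, not part of the commit); the SPU
version performs the same element-wise halfword adds 8 at a time:

    #include <stddef.h>

    /* target[i] = src0[i] + src1[i] over num_bytes/2 signed shorts. */
    static void sum_16s_ref(short *target, const short *src0,
                            const short *src1, size_t num_bytes)
    {
        for (size_t i = 0; i < num_bytes / sizeof(short); ++i)
            target[i] = (short)(src0[i] + src1[i]);
    }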
diff --git a/volk/spu_lib/spu_32fc_pointwise_multiply_unaligned.c b/volk/spu_lib/spu_32fc_pointwise_multiply_unaligned.c
new file mode 100644
index 000000000..58fd4aa0c
--- /dev/null
+++ b/volk/spu_lib/spu_32fc_pointwise_multiply_unaligned.c
@@ -0,0 +1,222 @@
+#include <spu_intrinsics.h>
+
+
+
+
+void* libvector_pointwise_multiply_32fc_unaligned(void* target, void* src0, void* src1, unsigned int num_bytes){
+ //loop iterator i
+ int i = 0;
+ void* retval = target;
+
+
+ //put the target and source addresses into qwords
+ vector unsigned int address_counter_tgt = {(unsigned int)target, 0, 0, 0};
+ vector unsigned int address_counter_src0 = {(unsigned int)src0, 0, 0 ,0};
+ vector unsigned int address_counter_src1 = {(unsigned int)src1, 0, 0, 0};
+
+ //create shuffle masks
+
+ //shuffle mask building blocks:
+ //all from the first vector
+ vector unsigned char oneup = {0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
+ 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f};
+ //all from the second vector
+ vector unsigned char second_oneup = {0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17,
+ 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f};
+
+
+
+ //gamma: second half of the second, first half of the first, break at (unsigned int)src0%16
+ vector unsigned char src_cmp = spu_splats((unsigned char)((unsigned int)src0%16));
+ vector unsigned char gt_res = spu_cmpgt(oneup, src_cmp);
+ vector unsigned char eq_res = spu_cmpeq(oneup, src_cmp);
+ vector unsigned char cmp_res = spu_or(gt_res, eq_res);
+ vector unsigned char sixteen_uchar = spu_splats((unsigned char)16);
+ vector unsigned char phase_change = spu_and(sixteen_uchar, cmp_res);
+ vector unsigned int shuffle_mask_gamma = spu_add((vector unsigned int)phase_change,
+ (vector unsigned int)oneup);
+ shuffle_mask_gamma = spu_rlqwbyte(shuffle_mask_gamma, (unsigned int)src0%16);
+
+ //eta: second half of the second, first half of the first, break at (unsigned int)src1%16
+ src_cmp = spu_splats((unsigned char)((unsigned int)src1%16));
+ gt_res = spu_cmpgt(oneup, src_cmp);
+ eq_res = spu_cmpeq(oneup, src_cmp);
+ cmp_res = spu_or(gt_res, eq_res);
+ sixteen_uchar = spu_splats((unsigned char)16);
+ phase_change = spu_and(sixteen_uchar, cmp_res);
+ vector unsigned int shuffle_mask_eta = spu_add((vector unsigned int)phase_change,
+ (vector unsigned int)oneup);
+ shuffle_mask_eta = spu_rlqwbyte(shuffle_mask_eta, (unsigned int)src1%16);
+
+
+
+
+
+ vector unsigned char tgt_second = spu_rlqwbyte(second_oneup, -((unsigned int)target%16));
+ vector unsigned char tgt_first = spu_rlqwbyte(oneup, -((unsigned int)target%16));
+
+ //alpha: first half of first, second half of second, break at (unsigned int)target%16
+ src_cmp = spu_splats((unsigned char)((unsigned int)target%16));
+ gt_res = spu_cmpgt(oneup, src_cmp);
+ eq_res = spu_cmpeq(oneup, src_cmp);
+ cmp_res = spu_or(gt_res, eq_res);
+ phase_change = spu_and(sixteen_uchar, cmp_res);
+ vector unsigned int shuffle_mask_alpha = spu_add((vector unsigned int)phase_change,
+ (vector unsigned int)oneup);
+
+ //delta: first half of first, first half of second, break at (unsigned int)target%16
+ vector unsigned char shuffle_mask_delta = spu_shuffle(oneup, tgt_second, (vector unsigned char)shuffle_mask_alpha);
+ //epsilon: second half of second, second half of first, break at (unsigned int)target%16
+ vector unsigned char shuffle_mask_epsilon = spu_shuffle(tgt_second, oneup, (vector unsigned char)shuffle_mask_alpha);
+ //zeta: second half of second, first half of first, break at 16 - (unsigned int)target%16
+ vector unsigned int shuffle_mask_zeta = spu_rlqwbyte(shuffle_mask_alpha, (unsigned int)target%16);
+
+ //beta: first half of first, second half of second, break at num_bytes%16
+ src_cmp = spu_splats((unsigned char)(num_bytes%16));
+ gt_res = spu_cmpgt(oneup, src_cmp);
+ eq_res = spu_cmpeq(oneup, src_cmp);
+ cmp_res = spu_or(gt_res, eq_res);
+ phase_change = spu_and(sixteen_uchar, cmp_res);
+ vector unsigned int shuffle_mask_beta = spu_add((vector unsigned int)phase_change,
+ (vector unsigned int)oneup);
+
+
+
+
+
+
+ qword src0_past;
+ qword src0_present;
+ qword src1_past;
+ qword src1_present;
+ qword tgt_past;
+ qword tgt_present;
+
+ qword in_temp0;
+ qword in_temp1;
+ qword out_temp0;
+ qword out_temp1;
+
+
+ src0_past = si_lqd((qword)address_counter_src0, 0);
+ src1_past = si_lqd((qword)address_counter_src1, 0);
+ tgt_past = si_lqd((qword)address_counter_tgt, 0);
+
+ vector unsigned char shuffle_mask_complexprod0 = {0x04, 0x05, 0x06, 0x07, 0x00, 0x01, 0x02, 0x03,
+ 0x0c, 0x0d, 0x0e, 0x0f, 0x08, 0x09, 0x0a, 0x0b};
+ vector unsigned char shuffle_mask_complexprod1 = {0x00, 0x01, 0x02, 0x03, 0x10, 0x11, 0x12, 0x13,
+ 0x08, 0x09, 0x0a, 0x0b, 0x18, 0x19, 0x1a, 0x1b};
+ vector unsigned char shuffle_mask_complexprod2 = {0x04, 0x05, 0x06, 0x07, 0x14, 0x15, 0x16, 0x17,
+ 0x0c, 0x0d, 0x0e, 0x0f, 0x1c, 0x1d, 0x1e, 0x1f};
+ vector unsigned char sign_changer = {0x00, 0x00, 0x00, 0x00, 0x80, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x80, 0x00, 0x00, 0x00};
+
+ vector float prod0;
+ qword shuf0;
+ vector float prod1;
+ vector float sign_change;
+ qword summand0;
+ qword summand1;
+ vector float sum;
+
+
+ for(i = 0; i < num_bytes/16; ++i) {
+
+ src0_present = si_lqd((qword)address_counter_src0, 16);
+ src1_present = si_lqd((qword)address_counter_src1, 16);
+ tgt_present = si_lqd((qword)address_counter_tgt, 16);
+
+ in_temp0 = spu_shuffle(src0_present, src0_past, (vector unsigned char)shuffle_mask_gamma);
+ in_temp1 = spu_shuffle(src1_present, src1_past, (vector unsigned char)shuffle_mask_eta);
+
+ prod0 = spu_mul((vector float)in_temp0, (vector float)in_temp1);
+ shuf0 = spu_shuffle((qword)in_temp1, (qword)in_temp1, shuffle_mask_complexprod0);
+ prod1 = spu_mul((vector float)in_temp0, (vector float)shuf0);
+ sign_change = spu_xor(prod0, (vector float)sign_changer);
+
+ summand0 = spu_shuffle((qword)sign_change, (qword)prod1, shuffle_mask_complexprod1);
+
+ summand1 = spu_shuffle((qword)sign_change, (qword)prod1, shuffle_mask_complexprod2);
+
+ sum = spu_add((vector float)summand0, (vector float)summand1);
+
+
+ out_temp0 = spu_shuffle(tgt_past, (qword)sum, shuffle_mask_delta);
+ out_temp1 = spu_shuffle(tgt_present, (qword)sum, shuffle_mask_epsilon);
+
+ si_stqd(out_temp0, (qword)address_counter_tgt, 0);
+ si_stqd(out_temp1, (qword)address_counter_tgt, 16);
+
+ tgt_past = out_temp1;
+ src0_past = src0_present;
+ src1_past = src1_present;
+ address_counter_src0 = spu_add(address_counter_src0, 16);
+ address_counter_src1 = spu_add(address_counter_src1, 16);
+ address_counter_tgt = spu_add(address_counter_tgt, 16);
+
+
+ }
+
+ src0_present = si_lqd((qword)address_counter_src0, 16);
+ src1_present = si_lqd((qword)address_counter_src1, 16);
+ tgt_present = si_lqd((qword)address_counter_tgt, 16);
+
+
+ in_temp0 = spu_shuffle(src0_present, src0_past, (vector unsigned char) shuffle_mask_gamma);
+ in_temp1 = spu_shuffle(src1_present, src1_past, (vector unsigned char) shuffle_mask_eta);
+
+
+ prod0 = spu_mul((vector float)in_temp0, (vector float)in_temp1);
+ shuf0 = spu_shuffle((qword)in_temp1, (qword)in_temp1, shuffle_mask_complexprod0);
+  prod1 = spu_mul((vector float)in_temp0, (vector float)shuf0);
+ sign_change = spu_xor(prod0, (vector float)sign_changer);
+ summand0 = spu_shuffle((qword)sign_change, (qword)prod1, shuffle_mask_complexprod1);
+ summand1 = spu_shuffle((qword)sign_change, (qword)prod1, shuffle_mask_complexprod2);
+ sum = spu_add((vector float)summand0, (vector float)summand1);
+
+
+
+ qword target_temp = spu_shuffle(tgt_present, tgt_past, (vector unsigned char) shuffle_mask_zeta);
+ qword meld = spu_shuffle((qword)sum, target_temp, (vector unsigned char)shuffle_mask_beta);
+
+
+
+ out_temp0 = spu_shuffle(tgt_past, meld, shuffle_mask_delta);
+ out_temp1 = spu_shuffle(tgt_present, meld, shuffle_mask_epsilon);
+
+ si_stqd(out_temp0, (qword)address_counter_tgt, 0);
+ si_stqd(out_temp1, (qword)address_counter_tgt, 16);
+
+ return retval;
+}
+
+
+
+/*
+int main(){
+
+ float pooh[48];
+ float bear[48];
+ float res[48];
+
+ int i = 0;
+ for(i = 0; i < 48; ++i){
+ pooh[i] = (float) i;
+ }
+ for(i = 48; i < 96; ++i){
+ bear[i - 48] = (float) i;
+ }
+
+  libvector_pointwise_multiply_32fc_unaligned(res, pooh, bear, 48 * sizeof(float));
+
+
+
+ for(i = 0; i < 48; ++i) {
+ printf("%f, ", res[i]);
+ }
+ printf("\n");
+
+
+}
+*/
+
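
The shuffle/sign-flip sequence above is a standard SIMD complex multiply:
prod0 holds (ar*br, ai*bi), the swapped product prod1 holds (ar*bi, ai*br),
the xor negates the ai*bi slots, and the two final shuffles gather the real
and imaginary summands. A scalar model of the result (a sketch, not part of
the commit):

    #include <stddef.h>

    /* Interleaved complex floats: element pairs are (re, im). */
    static void cmul_32fc_ref(float *target, const float *src0,
                              const float *src1, size_t num_bytes)
    {
        for (size_t i = 0; i < num_bytes / sizeof(float); i += 2) {
            float ar = src0[i], ai = src0[i + 1];
            float br = src1[i], bi = src1[i + 1];
            target[i]     = ar * br - ai * bi; /* real */
            target[i + 1] = ar * bi + ai * br; /* imaginary */
        }
    }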
diff --git a/volk/spu_lib/spu_memcpy_unaligned.c b/volk/spu_lib/spu_memcpy_unaligned.c
new file mode 100644
index 000000000..2a0dabcd7
--- /dev/null
+++ b/volk/spu_lib/spu_memcpy_unaligned.c
@@ -0,0 +1,290 @@
+#include <libvector/libvector_memcpy_unaligned.h>
+#include <spu_intrinsics.h>
+
+void* libvector_memcpy_unaligned(void* target, void* src, unsigned int num_bytes){
+ //loop iterator i
+ int i = 0;
+ void* retval = target;
+
+
+ //put the target and source addresses into qwords
+ vector unsigned int address_counter_tgt = {(unsigned int)target, 0, 0, 0};
+ vector unsigned int address_counter_src = {(unsigned int)src, 0, 0 ,0};
+
+ //create shuffle masks
+
+ //shuffle mask building blocks:
+ //all from the first vector
+ vector unsigned char oneup = {0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
+ 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f};
+ //all from the second vector
+ vector unsigned char second_oneup = {0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17,
+ 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f};
+
+
+
+ //gamma: second half of the second, first half of the first, break at (unsigned int)src%16
+ vector unsigned char src_cmp = spu_splats((unsigned char)((unsigned int)src%16));
+ vector unsigned char gt_res = spu_cmpgt(oneup, src_cmp);
+ vector unsigned char eq_res = spu_cmpeq(oneup, src_cmp);
+ vector unsigned char cmp_res = spu_or(gt_res, eq_res);
+ vector unsigned char sixteen_uchar = spu_splats((unsigned char)16);
+ vector unsigned char phase_change = spu_and(sixteen_uchar, cmp_res);
+ vector unsigned int shuffle_mask_gamma = spu_add((vector unsigned int)phase_change,
+ (vector unsigned int)oneup);
+ shuffle_mask_gamma = spu_rlqwbyte(shuffle_mask_gamma, (unsigned int)src%16);
+
+
+
+
+ vector unsigned char tgt_second = spu_rlqwbyte(second_oneup, -((unsigned int)target%16));
+ vector unsigned char tgt_first = spu_rlqwbyte(oneup, -((unsigned int)target%16));
+
+ //alpha: first half of first, second half of second, break at (unsigned int)target%16
+ src_cmp = spu_splats((unsigned char)((unsigned int)target%16));
+ gt_res = spu_cmpgt(oneup, src_cmp);
+ eq_res = spu_cmpeq(oneup, src_cmp);
+ cmp_res = spu_or(gt_res, eq_res);
+ phase_change = spu_and(sixteen_uchar, cmp_res);
+ vector unsigned int shuffle_mask_alpha = spu_add((vector unsigned int)phase_change,
+ (vector unsigned int)oneup);
+
+ //delta: first half of first, first half of second, break at (unsigned int)target%16
+ vector unsigned char shuffle_mask_delta = spu_shuffle(oneup, tgt_second, (vector unsigned char)shuffle_mask_alpha);
+ //epsilon: second half of second, second half of first, break at (unsigned int)target%16
+ vector unsigned char shuffle_mask_epsilon = spu_shuffle(tgt_second, oneup, (vector unsigned char)shuffle_mask_alpha);
+ //zeta: second half of second, first half of first, break at 16 - (unsigned int)target%16
+ vector unsigned int shuffle_mask_zeta = spu_rlqwbyte(shuffle_mask_alpha, (unsigned int)target%16);
+
+ //beta: first half of first, second half of second, break at num_bytes%16
+ src_cmp = spu_splats((unsigned char)(num_bytes%16));
+ gt_res = spu_cmpgt(oneup, src_cmp);
+ eq_res = spu_cmpeq(oneup, src_cmp);
+ cmp_res = spu_or(gt_res, eq_res);
+ phase_change = spu_and(sixteen_uchar, cmp_res);
+ vector unsigned int shuffle_mask_beta = spu_add((vector unsigned int)phase_change,
+ (vector unsigned int)oneup);
+
+
+
+
+
+
+ qword src_past;
+ qword src_present;
+ qword tgt_past;
+ qword tgt_present;
+
+ qword in_temp;
+ qword out_temp0;
+ qword out_temp1;
+
+ src_past = si_lqd((qword)address_counter_src, 0);
+ tgt_past = si_lqd((qword)address_counter_tgt, 0);
+
+ for(i = 0; i < num_bytes/16; ++i) {
+
+ src_present = si_lqd((qword)address_counter_src, 16);
+ tgt_present = si_lqd((qword)address_counter_tgt, 16);
+
+ in_temp = spu_shuffle(src_present, src_past, (vector unsigned char)shuffle_mask_gamma);
+
+ out_temp0 = spu_shuffle(tgt_past, in_temp, shuffle_mask_delta);
+ out_temp1 = spu_shuffle(tgt_present, in_temp, shuffle_mask_epsilon);
+
+ si_stqd(out_temp0, (qword)address_counter_tgt, 0);
+ si_stqd(out_temp1, (qword)address_counter_tgt, 16);
+
+ tgt_past = out_temp1;
+ src_past = src_present;
+ address_counter_src = spu_add(address_counter_src, 16);
+ address_counter_tgt = spu_add(address_counter_tgt, 16);
+
+
+ }
+
+ src_present = si_lqd((qword)address_counter_src, 16);
+ tgt_present = si_lqd((qword)address_counter_tgt, 16);
+
+
+ in_temp = spu_shuffle(src_present, src_past,(vector unsigned char) shuffle_mask_gamma);
+ qword target_temp = spu_shuffle(tgt_present, tgt_past, (vector unsigned char) shuffle_mask_zeta);
+ qword meld = spu_shuffle(in_temp, target_temp, (vector unsigned char)shuffle_mask_beta);
+
+
+
+ out_temp0 = spu_shuffle(tgt_past, meld, shuffle_mask_delta);
+ out_temp1 = spu_shuffle(tgt_present, meld, shuffle_mask_epsilon);
+
+ si_stqd(out_temp0, (qword)address_counter_tgt, 0);
+ si_stqd(out_temp1, (qword)address_counter_tgt, 16);
+
+ return retval;
+}
+
+
+
+/*
+void* mcpy(void* target, void* src, size_t num_bytes){
+ //loop iterator i
+ int i = 0;
+ void* retval = src;
+
+ //put the target and source addresses into qwords
+ vector unsigned int address_counter_tgt = {(unsigned int)target, 0, 0, 0};
+ vector unsigned int address_counter_src = {(unsigned int)src, 0, 0 ,0};
+
+ //create shuffle masks
+
+ //shuffle mask building blocks:
+ //all from the first vector
+ vector unsigned char oneup = {0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
+ 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f};
+ //all from the second vector
+ vector unsigned char second_oneup = {0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17,
+ 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f};
+
+
+
+ //gamma: second half of the second, first half of the first, break at src%16
+  vector unsigned char src_cmp = spu_splats((unsigned char)((unsigned int)src%16));
+ vector unsigned char gt_res = spu_cmpgt(oneup, src_cmp);
+ vector unsigned char eq_res = spu_cmpeq(oneup, src_cmp);
+ vector unsigned char cmp_res = spu_or(gt_res, eq_res);
+ vector unsigned char sixteen_uchar = spu_splats((unsigned char)16);
+ vector unsigned char phase_change = spu_and(sixteen_uchar, cmp_res);
+ vector unsigned int shuffle_mask_gamma = spu_add((vector unsigned int)phase_change,
+ (vector unsigned int)oneup);
+  shuffle_mask_gamma = spu_rlqwbyte(shuffle_mask_gamma, (unsigned int)src%16);
+
+
+
+
+  vector unsigned char tgt_second = spu_rlqwbyte(second_oneup, -((unsigned int)target%16));
+  vector unsigned char tgt_first = spu_rlqwbyte(oneup, -((unsigned int)target%16));
+
+ //alpha: first half of first, second half of second, break at target%16
+  src_cmp = spu_splats((unsigned char)((unsigned int)target%16));
+ gt_res = spu_cmpgt(oneup, src_cmp);
+ eq_res = spu_cmpeq(oneup, src_cmp);
+ cmp_res = spu_or(gt_res, eq_res);
+ phase_change = spu_and(sixteen_uchar, cmp_res);
+ vector unsigned int shuffle_mask_alpha = spu_add((vector unsigned int)phase_change,
+ (vector unsigned int)oneup);
+
+ //delta: first half of first, first half of second, break at target%16
+ vector unsigned char shuffle_mask_delta = spu_shuffle(oneup, tgt_second, (vector unsigned char)shuffle_mask_alpha);
+ //epsilon: second half of second, second half of first, break at target%16
+ vector unsigned char shuffle_mask_epsilon = spu_shuffle(tgt_second, oneup, (vector unsigned char)shuffle_mask_alpha);
+ //zeta: second half of second, first half of first, break at 16 - target%16
+  vector unsigned int shuffle_mask_zeta = spu_rlqwbyte(shuffle_mask_alpha, (unsigned int)target%16);
+
+ //beta: first half of first, second half of second, break at num_bytes%16
+ src_cmp = spu_splats((unsigned char)(num_bytes%16));
+ gt_res = spu_cmpgt(oneup, src_cmp);
+ eq_res = spu_cmpeq(oneup, src_cmp);
+ cmp_res = spu_or(gt_res, eq_res);
+ phase_change = spu_and(sixteen_uchar, cmp_res);
+ vector unsigned int shuffle_mask_beta = spu_add((vector unsigned int)phase_change,
+ (vector unsigned int)oneup);
+
+
+ printf("num_bytesmod16 %d\n", num_bytes%16);
+ printf("beta %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d\n",
+ spu_extract((vector unsigned char) shuffle_mask_beta, 0),
+ spu_extract((vector unsigned char) shuffle_mask_beta, 1),
+ spu_extract((vector unsigned char) shuffle_mask_beta, 2),
+ spu_extract((vector unsigned char) shuffle_mask_beta, 3),
+ spu_extract((vector unsigned char) shuffle_mask_beta, 4),
+ spu_extract((vector unsigned char) shuffle_mask_beta, 5),
+ spu_extract((vector unsigned char) shuffle_mask_beta, 6),
+ spu_extract((vector unsigned char) shuffle_mask_beta, 7),
+ spu_extract((vector unsigned char) shuffle_mask_beta, 8),
+ spu_extract((vector unsigned char) shuffle_mask_beta, 9),
+ spu_extract((vector unsigned char) shuffle_mask_beta, 10),
+ spu_extract((vector unsigned char) shuffle_mask_beta, 11),
+ spu_extract((vector unsigned char) shuffle_mask_beta, 12),
+ spu_extract((vector unsigned char) shuffle_mask_beta, 13),
+ spu_extract((vector unsigned char) shuffle_mask_beta, 14),
+ spu_extract((vector unsigned char) shuffle_mask_beta, 15));
+
+
+
+
+
+
+
+ qword src_past;
+ qword src_present;
+ qword tgt_past;
+ qword tgt_present;
+
+ qword in_temp;
+ qword out_temp0;
+ qword out_temp1;
+
+ src_past = si_lqd((qword)address_counter_src, 0);
+ tgt_past = si_lqd((qword)address_counter_tgt, 0);
+
+ for(i = 0; i < num_bytes/16; ++i) {
+
+ src_present = si_lqd((qword)address_counter_src, 16);
+ tgt_present = si_lqd((qword)address_counter_tgt, 16);
+
+ in_temp = spu_shuffle(src_present, src_past, (vector unsigned char)shuffle_mask_gamma);
+
+ out_temp0 = spu_shuffle(tgt_past, in_temp, shuffle_mask_delta);
+ out_temp1 = spu_shuffle(tgt_present, in_temp, shuffle_mask_epsilon);
+
+ si_stqd(out_temp0, (qword)address_counter_tgt, 0);
+ si_stqd(out_temp1, (qword)address_counter_tgt, 16);
+
+ tgt_past = out_temp1;
+ src_past = src_present;
+ address_counter_src = spu_add(address_counter_src, 16);
+ address_counter_tgt = spu_add(address_counter_tgt, 16);
+
+
+ }
+
+ src_present = si_lqd((qword)address_counter_src, 16);
+ tgt_present = si_lqd((qword)address_counter_tgt, 16);
+
+
+ in_temp = spu_shuffle(src_present, src_past,(vector unsigned char) shuffle_mask_gamma);
+ qword target_temp = spu_shuffle(tgt_present, tgt_past, (vector unsigned char) shuffle_mask_zeta);
+ qword meld = spu_shuffle(in_temp, target_temp, (vector unsigned char)shuffle_mask_beta);
+
+
+
+ out_temp0 = spu_shuffle(tgt_past, meld, shuffle_mask_delta);
+ out_temp1 = spu_shuffle(tgt_present, meld, shuffle_mask_epsilon);
+
+ si_stqd(out_temp0, (qword)address_counter_tgt, 0);
+ si_stqd(out_temp1, (qword)address_counter_tgt, 16);
+
+ return retval;
+
+}
+*/
+/*
+int main(){
+
+ unsigned char pooh[48];
+ unsigned char bear[48];
+
+ int i = 0;
+ for(i = 0; i < 48; ++i){
+ pooh[i] = i;
+ bear[i] = i;
+ }
+
+  mcpy(&pooh[9], &bear[3], 15);
+
+ for(i = 0; i < 48; ++i) {
+ printf("%d, ", pooh[i]);
+ }
+ printf("\n");
+}
+
+*/
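
Semantically the live function above is just memcpy for arbitrary alignment
(a scalar model, not part of the commit); the quadword machinery exists
because SPU local store is only addressable in aligned 16-byte units, so the
partial quadwords at the target's edges must be re-merged rather than
overwritten:

    #include <stddef.h>

    static void *memcpy_ref(void *target, const void *src, size_t num_bytes)
    {
        unsigned char *t = (unsigned char *)target;
        const unsigned char *s = (const unsigned char *)src;
        for (size_t i = 0; i < num_bytes; ++i)
            t[i] = s[i];
        return target;
    }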
diff --git a/volk/spu_lib/spu_memset_unaligned.S b/volk/spu_lib/spu_memset_unaligned.S
new file mode 100644
index 000000000..a655c4c52
--- /dev/null
+++ b/volk/spu_lib/spu_memset_unaligned.S
@@ -0,0 +1,185 @@
+/* -*- asm -*- */
+/*
+ * Copyright 2008 Free Software Foundation, Inc.
+ *
+ * This file is part of GNU Radio
+ *
+ * GNU Radio is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 3, or (at your option)
+ * any later version.
+ *
+ * GNU Radio is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write to the Free Software Foundation, Inc.,
+ * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ */
+
+#include "gc_spu_macs.h"
+
+ .file "spu_memset_unaligned.S"
+
+ /*
+ * Computes this, only a lot faster...
+ *
+ * void *
+ * libvector_memset_unaligned(void *pv, int c, size_t n)
+ * {
+ * unsigned char *p = (unsigned char *) pv;
+ * size_t i;
+ * for (i = 0; i < n; i++)
+ * p[i] = c;
+ *
+ * return pv;
+ * }
+ */
+
+#define p_arg arg1 // we're going to clobber arg1 w/ the return value
+#define c arg2 // the constant we're writing
+#define n arg3 // how many bytes to write
+
+#define p r13 // where we're writing
+#define t0 r14
+#define t1 r15
+#define mask r16
+#define old r17
+#define an r18 // aligned n (n rounded down to mod 16 boundary)
+#define next_p r19
+#define cond1 r20
+#define cond2 r21
+#define m r22
+#define r r23
+
+ PROC_ENTRY(libvector_memset_unaligned)
+
+ // Hint the return from do_head, in case we go that way.
+	// There's pretty much nothing we can do to hint the branch to it.
+ hbrr do_head_br, head_complete
+
+ MR(p, p_arg) // leaves p, the return value, in the correct reg (r3)
+ BRZ_RETURN(n)
+
+ MODULO(t0, p, 16) // is p%16 == 0?
+ VSPLTB(c, c, 3) // splat byte in preferred slot of c into all slots
+ brnz t0, do_head // no, handle it
+head_complete:
+
+ /*
+ * preconditions:
+ * p%16 == 0, n > 0
+ */
+ hbrr middle_loop_br, middle_loop
+
+ ROUND_DOWN(an, n, 16) // an is "aligned n"
+ MODULO(n, n, 16) // what's left over in the last quad
+ brz an, do_tail // no whole quad words; skip to tail
+ clgti t0, an, 127 // an >= 128?
+ brz t0, middle2 // nope, go handle the cases between 0 and 112
+
+ /*
+ * 128 bytes / iteration
+ */
+ .p2align 4
+middle_loop:
+ ai an, an, -128
+ stqd c, 0*16(p)
+ ai next_p, p, 128
+ stqd c, 1*16(p)
+ cgti cond1, an, 127
+ stqd c, 2*16(p)
+
+ stqd c, 3*16(p)
+ stqd c, 4*16(p)
+ stqd c, 5*16(p)
+ stqd c, 6*16(p)
+
+ MR(p, next_p)
+ stqd c, 7*16-128(next_p)
+ or cond2, n, an
+middle_loop_br:
+ brnz cond1, middle_loop
+
+ /*
+ * if an and n are both zero, return now
+ */
+ BRZ_RETURN(cond2)
+
+ /*
+ * otherwise handle last of full quad words
+ *
+ * 0 <= an < 128, p%16 == 0
+ */
+middle2:
+ /*
+ * if an == 0, go handle the final non-full quadword
+ */
+ brz an, do_tail
+ hbrr middle2_loop_br, middle2_loop
+
+ .p2align 3
+middle2_loop:
+ ai next_p, p, 16
+ stqd c, 0(p)
+ ai an, an, -16
+ LMR(p, next_p)
+middle2_loop_br:
+ brnz an, middle2_loop
+
+ /* We're done with the full quadwords. */
+
+ /*
+ * Handle the final partial quadword.
+ * We'll be modifying only the left hand portion of the quad.
+ *
+ * preconditions:
+ * an == 0, 0 <= n < 16, p%16 == 0
+ */
+do_tail:
+ HINT_RETURN(do_tail_ret)
+ il mask, -1
+ sfi t1, n, 16 // t1 = 16 - n
+ lqd old, 0(p)
+ shlqby mask, mask, t1
+ selb t0, old, c, mask
+ stqd t0, 0(p)
+do_tail_ret:
+ RETURN()
+
+ /*
+ * ----------------------------------------------------------------
+ * Handle the first partial quadword
+ *
+ * preconditions:
+ * p%16 != 0
+ *
+ * postconditions:
+ * p%16 == 0 or n == 0
+ *
+ * |-- m --|
+ * +----------------+----------------+
+ * | //////// | |
+ * +----------------+----------------+
+ * |----- r -----|
+ * p
+ * ----------------------------------------------------------------
+ */
+do_head:
+ lqd old, 0(p)
+ MODULO_NEG(r, p, 16)
+ il mask, -1
+ UMIN(m, r, n)
+ shlqby mask, mask, m // 1's in the top, m*8 0's in the bottom
+ MR(t1, p)
+ sf t0, m, r // t0 = r - m
+ a p, p, m // p += m
+ rotqby mask, mask, t0 // rotate 0's to the right place
+ sf n, m, n // n -= m
+ selb t0, c, old, mask // merge
+ stqd t0, 0(t1)
+ BRZ_RETURN(n)
+do_head_br:
+ br head_complete
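
The do_tail merge above is the same select-bytes idiom as the C kernels: one
shlqby builds a mask with ones in the first n bytes, and selb writes the fill
byte only there, preserving the rest of the quadword. A scalar sketch
(hypothetical example values, not part of the commit):

    #include <stdio.h>

    int main(void)
    {
        unsigned char old[16], mask[16], out[16];
        unsigned n = 5, c = 0xab; /* leftover byte count and fill byte */

        for (unsigned i = 0; i < 16; ++i) old[i] = (unsigned char)i;
        /* mask = -1 shifted left (toward byte 0) by 16 - n bytes:
         * ones in bytes 0..n-1, zeros in bytes n..15 */
        for (unsigned i = 0; i < 16; ++i)
            mask[i] = (i < n) ? 0xff : 0x00;
        /* selb: take c where the mask is set, old elsewhere */
        for (unsigned i = 0; i < 16; ++i)
            out[i] = (unsigned char)((mask[i] & c) | (~mask[i] & old[i]));

        for (unsigned i = 0; i < 16; ++i) printf("%02x ", out[i]);
        printf("\n"); /* ab ab ab ab ab 05 06 07 ... 0f */
        return 0;
    }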