summaryrefslogtreecommitdiff
path: root/volk/spu_lib
diff options
context:
space:
mode:
authorManoj Gudi2013-10-07 20:19:55 +0530
committerManoj Gudi2013-10-07 20:20:35 +0530
commit1826d0763c8595997f5f4af1fdb0354e9c0998ad (patch)
treeacbd852cd5a1bf17241b1038b5e37a0e72e64612 /volk/spu_lib
parent452defdb4a78e9e826740ddf4b9673e926c568a4 (diff)
parent24b640997ba7fee0c725e65f401f5cbebdab8d08 (diff)
downloadgnuradio-1826d0763c8595997f5f4af1fdb0354e9c0998ad.tar.gz
gnuradio-1826d0763c8595997f5f4af1fdb0354e9c0998ad.tar.bz2
gnuradio-1826d0763c8595997f5f4af1fdb0354e9c0998ad.zip
README change
Diffstat (limited to 'volk/spu_lib')
-rw-r--r--volk/spu_lib/gc_spu_macs.h380
-rw-r--r--volk/spu_lib/spu_16s_cmpgt_unaligned.c160
-rw-r--r--volk/spu_lib/spu_16s_vector_subtract_unaligned.c178
-rw-r--r--volk/spu_lib/spu_16s_vector_sum_unaligned.c178
-rw-r--r--volk/spu_lib/spu_32fc_pointwise_multiply_unaligned.c222
-rw-r--r--volk/spu_lib/spu_memcpy_unaligned.c290
-rw-r--r--volk/spu_lib/spu_memset_unaligned.S185
7 files changed, 1593 insertions, 0 deletions
diff --git a/volk/spu_lib/gc_spu_macs.h b/volk/spu_lib/gc_spu_macs.h
new file mode 100644
index 000000000..e86dce3f5
--- /dev/null
+++ b/volk/spu_lib/gc_spu_macs.h
@@ -0,0 +1,380 @@
+/* -*- asm -*- */
+/*
+ * Copyright 2008 Free Software Foundation, Inc.
+ *
+ * This file is part of GNU Radio
+ *
+ * GNU Radio is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 3, or (at your option)
+ * any later version.
+ *
+ * GNU Radio is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write to the Free Software Foundation, Inc.,
+ * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ */
+
+#ifndef INCLUDED_GC_SPU_MACS_H
+#define INCLUDED_GC_SPU_MACS_H
+
+/*
+ * This file contains a set of macros that are generally useful when
+ * coding in SPU assembler
+ *
+ * Note that the multi-instruction macros in here may overwrite
+ * registers 77, 78, and 79 without warning.
+ */
+
+/*
+ * defines for all registers
+ */
+#define r0 $0
+#define r1 $1
+#define r2 $2
+#define r3 $3
+#define r4 $4
+#define r5 $5
+#define r6 $6
+#define r7 $7
+#define r8 $8
+#define r9 $9
+#define r10 $10
+#define r11 $11
+#define r12 $12
+#define r13 $13
+#define r14 $14
+#define r15 $15
+#define r16 $16
+#define r17 $17
+#define r18 $18
+#define r19 $19
+#define r20 $20
+#define r21 $21
+#define r22 $22
+#define r23 $23
+#define r24 $24
+#define r25 $25
+#define r26 $26
+#define r27 $27
+#define r28 $28
+#define r29 $29
+#define r30 $30
+#define r31 $31
+#define r32 $32
+#define r33 $33
+#define r34 $34
+#define r35 $35
+#define r36 $36
+#define r37 $37
+#define r38 $38
+#define r39 $39
+#define r40 $40
+#define r41 $41
+#define r42 $42
+#define r43 $43
+#define r44 $44
+#define r45 $45
+#define r46 $46
+#define r47 $47
+#define r48 $48
+#define r49 $49
+#define r50 $50
+#define r51 $51
+#define r52 $52
+#define r53 $53
+#define r54 $54
+#define r55 $55
+#define r56 $56
+#define r57 $57
+#define r58 $58
+#define r59 $59
+#define r60 $60
+#define r61 $61
+#define r62 $62
+#define r63 $63
+#define r64 $64
+#define r65 $65
+#define r66 $66
+#define r67 $67
+#define r68 $68
+#define r69 $69
+#define r70 $70
+#define r71 $71
+#define r72 $72
+#define r73 $73
+#define r74 $74
+#define r75 $75
+#define r76 $76
+#define r77 $77
+#define r78 $78
+#define r79 $79
+#define r80 $80
+#define r81 $81
+#define r82 $82
+#define r83 $83
+#define r84 $84
+#define r85 $85
+#define r86 $86
+#define r87 $87
+#define r88 $88
+#define r89 $89
+#define r90 $90
+#define r91 $91
+#define r92 $92
+#define r93 $93
+#define r94 $94
+#define r95 $95
+#define r96 $96
+#define r97 $97
+#define r98 $98
+#define r99 $99
+#define r100 $100
+#define r101 $101
+#define r102 $102
+#define r103 $103
+#define r104 $104
+#define r105 $105
+#define r106 $106
+#define r107 $107
+#define r108 $108
+#define r109 $109
+#define r110 $110
+#define r111 $111
+#define r112 $112
+#define r113 $113
+#define r114 $114
+#define r115 $115
+#define r116 $116
+#define r117 $117
+#define r118 $118
+#define r119 $119
+#define r120 $120
+#define r121 $121
+#define r122 $122
+#define r123 $123
+#define r124 $124
+#define r125 $125
+#define r126 $126
+#define r127 $127
+
+
+#define lr r0 // link register
+#define sp r1 // stack pointer
+ // r2 is environment pointer for langs that need it (ALGOL)
+
+#define retval r3 // return values are passed in regs starting at r3
+
+#define arg1 r3 // args are passed in regs starting at r3
+#define arg2 r4
+#define arg3 r5
+#define arg4 r6
+#define arg5 r7
+#define arg6 r8
+#define arg7 r9
+#define arg8 r10
+#define arg9 r11
+#define arg10 r12
+
+// r3 - r74 are volatile (caller-saves)
+// r75 - r79 are volatile (scratch regs possibly destroyed by fct prolog/epilog)
+// r80 - r127 are non-volatile (callee-saves)
+
+// scratch registers reserved for use by the macros in this file.
+
+#define _gc_t0 r79
+#define _gc_t1 r78
+#define _gc_t2 r77
+
+/*
+ * ----------------------------------------------------------------
+ * pseudo ops
+ * ----------------------------------------------------------------
+ */
+// Begin a global function called `name`: switch to the .text section,
+// align to 2^4 = 16 bytes, export the symbol, tag it as a function, and
+// emit its label. The function body follows the macro invocation.
+#define PROC_ENTRY(name) \
+ .text; \
+ .p2align 4; \
+ .global name; \
+ .type name, @function; \
+name:
+
+/*
+ * ----------------------------------------------------------------
+ * aliases for common operations
+ * ----------------------------------------------------------------
+ */
+
+// Move register (even pipe, 2 cycles)
+#define MR(rt, ra) or rt, ra, ra;
+
+// Move register (odd pipe, 4 cycles)
+#define LMR(rt, ra) rotqbyi rt, ra, 0;
+
+// return
+#define RETURN() bi lr;
+
+// hint for a return
+#define HINT_RETURN(ret_label) hbr ret_label, lr;
+
+// return if zero
+#define BRZ_RETURN(rt) biz rt, lr;
+
+// return if not zero
+#define BRNZ_RETURN(rt) binz rt, lr;
+
+// return if halfword zero
+#define BRHZ_RETURN(rt) bihz rt, lr;
+
+// return if halfword not zero
+#define BRHNZ_RETURN(rt) bihnz rt, lr;
+
+
+/*
+ * ----------------------------------------------------------------
+ * modulo like things for constant moduli that are powers of 2
+ * ----------------------------------------------------------------
+ */
+
+// NOTE(review): pow2 is used as an immediate operand of andi/ai/sfi, so it
+// must be an assemble-time constant small enough for those instructions'
+// immediate fields -- confirm the exact range against the SPU ISA.
+
+// rt = ra & (pow2 - 1)
+#define MODULO(rt, ra, pow2) \
+ andi rt, ra, (pow2)-1;
+
+// rt = pow2 - (ra & (pow2 - 1))
+// Note: when ra is already a multiple of pow2 the result is pow2, not 0.
+#define MODULO_NEG(rt, ra, pow2) \
+ andi rt, ra, (pow2)-1; \
+ sfi rt, rt, (pow2);
+
+// rt = ra & -(pow2)
+#define ROUND_DOWN(rt, ra, pow2) \
+ andi rt, ra, -(pow2);
+
+// rt = (ra + (pow2 - 1)) & -(pow2)
+#define ROUND_UP(rt, ra, pow2) \
+ ai rt, ra, (pow2)-1; \
+ andi rt, rt, -(pow2);
+
+/*
+ * ----------------------------------------------------------------
+ * Splat - replicate a particular slot into all slots
+ * Altivec analogs...
+ * ----------------------------------------------------------------
+ */
+
+// replicate byte from slot s [0,15]
+// (s)*0x0101 is the halfword whose two bytes both equal s; ilh places it in
+// every halfword of _gc_t0, giving a shufb pattern that picks byte s of ra
+// for all 16 result bytes. Clobbers _gc_t0.
+#define VSPLTB(rt, ra, s) \
+ ilh _gc_t0, (s)*0x0101; \
+ shufb rt, ra, ra, _gc_t0;
+
+// replicate halfword from slot s [0,7]
+// The shufb pattern halfword is the byte pair (2s, 2s+1), i.e. the two bytes
+// of halfword s of ra, repeated in every halfword of _gc_t0. Clobbers _gc_t0.
+#define VSPLTH(rt, ra, s) \
+ ilh _gc_t0, 2*(s)*0x0101 + 0x0001; \
+ shufb rt, ra, ra, _gc_t0;
+
+// replicate word from slot s [0,3]
+// Builds the shufb pattern word (4s, 4s+1, 4s+2, 4s+3) in every word of
+// _gc_t0: ilhu (immediate load halfword upper) sets the upper halfword of
+// each word and clears the lower one, then iohl ORs in the lower halfword.
+// Clobbers _gc_t0.
+#define VSPLTW(rt, ra, s) \
+ ilhu _gc_t0, 4*(s)*0x0101 + 0x0001; \
+ iohl _gc_t0, 4*(s)*0x0101 + 0x0203; \
+ shufb rt, ra, ra, _gc_t0;
+
+// replicate double from slot s [0,1]
+// Step 1: cdd with an address whose low four bits are 8 yields the
+//         "insert doubleword at bytes 8..15" shuffle pattern shown below.
+// Step 2: rotate ra left by 8*s bytes so the wanted double sits in
+//         bytes 0..7 of rt (rotqbyi takes rt, ra and a byte count only).
+// Step 3: shufb with both data operands = rt copies bytes 0..7 into both
+//         halves of the result.
+// Clobbers _gc_t0.
+#define VSPLTD(rt, ra, s) \
+ /* sp is always 16-byte aligned */ \
+ cdd _gc_t0, 8(sp); /* 0x10111213 14151617 00010203 04050607 */ \
+ rotqbyi rt, ra, (s) << 3; /* rotate double into preferred slot */ \
+ shufb rt, rt, rt, _gc_t0;
+
+/*
+ * ----------------------------------------------------------------
+ * lots of min/max variations...
+ *
+ * On a slot by slot basis, compute the min or max
+ *
+ * U - unsigned, else signed
+ * B,H,{} - byte, halfword, word
+ * F float
+ * ----------------------------------------------------------------
+ */
+
+// selb rt, ra, rb, rc takes each result bit from rb where the matching bit
+// of rc is 1 and from ra where it is 0. The macros below pass rc = (ra > rb)
+// per slot, so MIN_SELB yields rb where ra > rb and ra otherwise; MAX_SELB
+// swaps the two data operands and therefore yields ra where ra > rb.
+#define MIN_SELB(rt, ra, rb, rc) selb rt, ra, rb, rc;
+#define MAX_SELB(rt, ra, rb, rc) selb rt, rb, ra, rc;
+
+ // words
+
+#define MIN(rt, ra, rb) \
+ cgt _gc_t0, ra, rb; \
+ MIN_SELB(rt, ra, rb, _gc_t0)
+
+#define MAX(rt, ra, rb) \
+ cgt _gc_t0, ra, rb; \
+ MAX_SELB(rt, ra, rb, _gc_t0)
+
+#define UMIN(rt, ra, rb) \
+ clgt _gc_t0, ra, rb; \
+ MIN_SELB(rt, ra, rb, _gc_t0)
+
+#define UMAX(rt, ra, rb) \
+ clgt _gc_t0, ra, rb; \
+ MAX_SELB(rt, ra, rb, _gc_t0)
+
+ // bytes
+
+#define MINB(rt, ra, rb) \
+ cgtb _gc_t0, ra, rb; \
+ MIN_SELB(rt, ra, rb, _gc_t0)
+
+#define MAXB(rt, ra, rb) \
+ cgtb _gc_t0, ra, rb; \
+ MAX_SELB(rt, ra, rb, _gc_t0)
+
+#define UMINB(rt, ra, rb) \
+ clgtb _gc_t0, ra, rb; \
+ MIN_SELB(rt, ra, rb, _gc_t0)
+
+#define UMAXB(rt, ra, rb) \
+ clgtb _gc_t0, ra, rb; \
+ MAX_SELB(rt, ra, rb, _gc_t0)
+
+ // halfwords
+
+#define MINH(rt, ra, rb) \
+ cgth _gc_t0, ra, rb; \
+ MIN_SELB(rt, ra, rb, _gc_t0)
+
+#define MAXH(rt, ra, rb) \
+ cgth _gc_t0, ra, rb; \
+ MAX_SELB(rt, ra, rb, _gc_t0)
+
+#define UMINH(rt, ra, rb) \
+ clgth _gc_t0, ra, rb; \
+ MIN_SELB(rt, ra, rb, _gc_t0)
+
+#define UMAXH(rt, ra, rb) \
+ clgth _gc_t0, ra, rb; \
+ MAX_SELB(rt, ra, rb, _gc_t0)
+
+ // floats
+
+#define FMIN(rt, ra, rb) \
+ fcgt _gc_t0, ra, rb; \
+ MIN_SELB(rt, ra, rb, _gc_t0)
+
+#define FMAX(rt, ra, rb) \
+ fcgt _gc_t0, ra, rb; \
+ MAX_SELB(rt, ra, rb, _gc_t0)
+
+// Ignoring the sign, select the values with the minimum magnitude
+#define FMINMAG(rt, ra, rb) \
+ fcmgt _gc_t0, ra, rb; \
+ MIN_SELB(rt, ra, rb, _gc_t0)
+
+// Ignoring the sign, select the values with the maximum magnitude
+#define FMAXMAG(rt, ra, rb) \
+ fcmgt _gc_t0, ra, rb; \
+ MAX_SELB(rt, ra, rb, _gc_t0)
+
+
+#endif /* INCLUDED_GC_SPU_MACS_H */
diff --git a/volk/spu_lib/spu_16s_cmpgt_unaligned.c b/volk/spu_lib/spu_16s_cmpgt_unaligned.c
new file mode 100644
index 000000000..8811e6801
--- /dev/null
+++ b/volk/spu_lib/spu_16s_cmpgt_unaligned.c
@@ -0,0 +1,160 @@
+#include<spu_intrinsics.h>
+
+/*
+ * Compare every signed 16-bit element of src against val and store 1 (element
+ * greater than val) or 0 (otherwise) as a 16-bit value at the same position
+ * in target.
+ *
+ * Neither pointer has to be 16-byte aligned: memory is only touched through
+ * quadword loads/stores (which access the aligned quadword containing the
+ * address), and shuffle masks derived from the pointers' low four bits splice
+ * the data in and out.
+ *
+ * target    - destination buffer
+ * src       - source buffer of signed shorts
+ * val       - threshold each element is compared against
+ * num_bytes - number of bytes to process
+ *
+ * Returns target.
+ *
+ * Each step loads and stores the quadword at the current address and the one
+ * 16 bytes after it, so quadwords adjacent to the buffers are read, and on the
+ * target side written back with the bytes that were loaded from them.
+ */
+void* libvector_16s_cmpgt_unaligned(void* target, void* src, signed short val, unsigned int num_bytes){
+ //loop iterator i
+ int i = 0;
+ void* retval = target;
+
+
+ //put the target and source addresses into qwords
+ vector unsigned int address_counter_tgt = {(unsigned int)target, 0, 0, 0};
+ vector unsigned int address_counter_src = {(unsigned int)src, 0, 0 ,0};
+
+ //create shuffle masks
+
+ //shuffle mask building blocks:
+ //all from the first vector
+ vector unsigned char oneup = {0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
+ 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f};
+ //all from the second vector
+ vector unsigned char second_oneup = {0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17,
+ 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f};
+
+
+
+ //gamma: second half of the second, first half of the first, break at (unsigned int)src%16
+ //cmp_res is all-ones in every byte whose index is >= the break point; adding
+ //16 to those indices makes the shuffle take them from its second operand
+ vector unsigned char src_cmp = spu_splats((unsigned char)((unsigned int)src%16));
+ vector unsigned char gt_res = spu_cmpgt(oneup, src_cmp);
+ vector unsigned char eq_res = spu_cmpeq(oneup, src_cmp);
+ vector unsigned char cmp_res = spu_or(gt_res, eq_res);
+ vector unsigned char sixteen_uchar = spu_splats((unsigned char)16);
+ vector unsigned char phase_change = spu_and(sixteen_uchar, cmp_res);
+ vector unsigned int shuffle_mask_gamma = spu_add((vector unsigned int)phase_change,
+ (vector unsigned int)oneup);
+ shuffle_mask_gamma = spu_rlqwbyte(shuffle_mask_gamma, (unsigned int)src%16);
+
+
+
+
+ vector unsigned char tgt_second = spu_rlqwbyte(second_oneup, -((unsigned int)target%16));
+ //NOTE(review): tgt_first is computed but not referenced again in this function
+ vector unsigned char tgt_first = spu_rlqwbyte(oneup, -((unsigned int)target%16));
+
+ //alpha: first half of first, second half of second, break at (unsigned int)target%16
+ src_cmp = spu_splats((unsigned char)((unsigned int)target%16));
+ gt_res = spu_cmpgt(oneup, src_cmp);
+ eq_res = spu_cmpeq(oneup, src_cmp);
+ cmp_res = spu_or(gt_res, eq_res);
+ phase_change = spu_and(sixteen_uchar, cmp_res);
+ vector unsigned int shuffle_mask_alpha = spu_add((vector unsigned int)phase_change,
+ (vector unsigned int)oneup);
+
+ //delta: first half of first, first half of second, break at (unsigned int)target%16
+ vector unsigned char shuffle_mask_delta = spu_shuffle(oneup, tgt_second, (vector unsigned char)shuffle_mask_alpha);
+ //epsilon: second half of second, second half of first, break at (unsigned int)target%16
+ vector unsigned char shuffle_mask_epsilon = spu_shuffle(tgt_second, oneup, (vector unsigned char)shuffle_mask_alpha);
+ //zeta: second half of second, first half of first, break at 16 - (unsigned int)target%16
+ vector unsigned int shuffle_mask_zeta = spu_rlqwbyte(shuffle_mask_alpha, (unsigned int)target%16);
+
+ //beta: first half of first, second half of second, break at num_bytes%16
+ src_cmp = spu_splats((unsigned char)(num_bytes%16));
+ gt_res = spu_cmpgt(oneup, src_cmp);
+ eq_res = spu_cmpeq(oneup, src_cmp);
+ cmp_res = spu_or(gt_res, eq_res);
+ phase_change = spu_and(sixteen_uchar, cmp_res);
+ vector unsigned int shuffle_mask_beta = spu_add((vector unsigned int)phase_change,
+ (vector unsigned int)oneup);
+
+
+
+
+
+
+ qword src_past;
+ qword src_present;
+ qword tgt_past;
+ qword tgt_present;
+
+ qword in_temp;
+ qword out_temp0;
+ qword out_temp1;
+
+ //prime the pipeline with the first quadword of each buffer
+ src_past = si_lqd((qword)address_counter_src, 0);
+ tgt_past = si_lqd((qword)address_counter_tgt, 0);
+
+ vector signed short vec_val = spu_splats(val);
+ vector unsigned short compare;
+ vector unsigned short ones = {1, 1, 1, 1, 1, 1, 1, 1};
+ vector unsigned short after_and;
+
+ //main loop: one full 16-byte group of results per iteration
+ for(i = 0; i < num_bytes/16; ++i) {
+
+ src_present = si_lqd((qword)address_counter_src, 16);
+ tgt_present = si_lqd((qword)address_counter_tgt, 16);
+
+ //gather the 16 source bytes that start at the unaligned src position
+ in_temp = spu_shuffle(src_present, src_past, (vector unsigned char)shuffle_mask_gamma);
+
+ //the compare gives an all-ones halfword where in > val; the AND reduces it to 1
+ compare = spu_cmpgt((vector signed short) in_temp, vec_val);
+ after_and = spu_and(compare, ones);
+
+
+ //split the result across the two target quadwords, keeping the target
+ //bytes that lie outside the 16-byte window
+ out_temp0 = spu_shuffle(tgt_past, (qword)after_and, shuffle_mask_delta);
+ out_temp1 = spu_shuffle(tgt_present, (qword)after_and, shuffle_mask_epsilon);
+
+ si_stqd(out_temp0, (qword)address_counter_tgt, 0);
+ si_stqd(out_temp1, (qword)address_counter_tgt, 16);
+
+ tgt_past = out_temp1;
+ src_past = src_present;
+ address_counter_src = spu_add(address_counter_src, 16);
+ address_counter_tgt = spu_add(address_counter_tgt, 16);
+
+
+ }
+
+ //tail: handles the remaining num_bytes%16 bytes (runs even when that is 0)
+ src_present = si_lqd((qword)address_counter_src, 16);
+ tgt_present = si_lqd((qword)address_counter_tgt, 16);
+
+
+ in_temp = spu_shuffle(src_present, src_past,(vector unsigned char) shuffle_mask_gamma);
+
+ compare = spu_cmpgt((vector signed short) in_temp, vec_val);
+ after_and = spu_and(compare, ones);
+
+
+ //target_temp is the current content of the 16-byte target window; meld takes
+ //the first num_bytes%16 bytes from the new result and the rest from it
+ qword target_temp = spu_shuffle(tgt_present, tgt_past, (vector unsigned char) shuffle_mask_zeta);
+ qword meld = spu_shuffle((qword)after_and, target_temp, (vector unsigned char)shuffle_mask_beta);
+
+
+
+ out_temp0 = spu_shuffle(tgt_past, meld, shuffle_mask_delta);
+ out_temp1 = spu_shuffle(tgt_present, meld, shuffle_mask_epsilon);
+
+ si_stqd(out_temp0, (qword)address_counter_tgt, 0);
+ si_stqd(out_temp1, (qword)address_counter_tgt, 16);
+
+ return retval;
+}
+
+
+
+/*
+int main(){
+
+ signed short pooh[48];
+ signed short bear[48];
+
+ int i = 0;
+ for(i = 0; i < 48; i += 2){
+ bear[i] = i;
+ bear[i + 1] = -i;
+ }
+
+ libvector_16s_cmpgt_unaligned(&pooh[0],&bear[0], 0, 48 * sizeof(signed short));
+
+ for(i = 0; i < 48; ++i) {
+ printf("%d, ", pooh[i]);
+ }
+ printf("\n");
+}
+*/
+
diff --git a/volk/spu_lib/spu_16s_vector_subtract_unaligned.c b/volk/spu_lib/spu_16s_vector_subtract_unaligned.c
new file mode 100644
index 000000000..ea110c8d2
--- /dev/null
+++ b/volk/spu_lib/spu_16s_vector_subtract_unaligned.c
@@ -0,0 +1,178 @@
+#include<spu_intrinsics.h>
+
+/*
+ * Element-wise 16-bit subtraction: target[k] = src0[k] - src1[k].
+ *
+ * None of the pointers has to be 16-byte aligned: memory is only touched
+ * through quadword loads/stores (which access the aligned quadword containing
+ * the address), and shuffle masks derived from the pointers' low four bits
+ * splice the data in and out.
+ *
+ * target    - destination buffer
+ * src0      - minuend buffer of signed shorts
+ * src1      - subtrahend buffer of signed shorts
+ * num_bytes - number of bytes to process
+ *
+ * Returns target.
+ *
+ * Each step loads and stores the quadword at the current address and the one
+ * 16 bytes after it, so quadwords adjacent to the buffers are read, and on the
+ * target side written back with the bytes that were loaded from them.
+ */
+void* libvector_16s_vector_subtract_unaligned(void* target, void* src0, void* src1, unsigned int num_bytes){
+ //loop iterator i
+ int i = 0;
+ void* retval = target;
+
+
+ //put the target and source addresses into qwords
+ vector unsigned int address_counter_tgt = {(unsigned int)target, 0, 0, 0};
+ vector unsigned int address_counter_src0 = {(unsigned int)src0, 0, 0 ,0};
+ vector unsigned int address_counter_src1 = {(unsigned int)src1, 0, 0, 0};
+
+ //create shuffle masks
+
+ //shuffle mask building blocks:
+ //all from the first vector
+ vector unsigned char oneup = {0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
+ 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f};
+ //all from the second vector
+ vector unsigned char second_oneup = {0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17,
+ 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f};
+
+
+
+ //gamma: second half of the second, first half of the first, break at (unsigned int)src0%16
+ //cmp_res is all-ones in every byte whose index is >= the break point; adding
+ //16 to those indices makes the shuffle take them from its second operand
+ vector unsigned char src_cmp = spu_splats((unsigned char)((unsigned int)src0%16));
+ vector unsigned char gt_res = spu_cmpgt(oneup, src_cmp);
+ vector unsigned char eq_res = spu_cmpeq(oneup, src_cmp);
+ vector unsigned char cmp_res = spu_or(gt_res, eq_res);
+ vector unsigned char sixteen_uchar = spu_splats((unsigned char)16);
+ vector unsigned char phase_change = spu_and(sixteen_uchar, cmp_res);
+ vector unsigned int shuffle_mask_gamma = spu_add((vector unsigned int)phase_change,
+ (vector unsigned int)oneup);
+ shuffle_mask_gamma = spu_rlqwbyte(shuffle_mask_gamma, (unsigned int)src0%16);
+
+ //eta: second half of the second, first half of the first, break at (unsigned int)src1%16
+ src_cmp = spu_splats((unsigned char)((unsigned int)src1%16));
+ gt_res = spu_cmpgt(oneup, src_cmp);
+ eq_res = spu_cmpeq(oneup, src_cmp);
+ cmp_res = spu_or(gt_res, eq_res);
+ sixteen_uchar = spu_splats((unsigned char)16);
+ phase_change = spu_and(sixteen_uchar, cmp_res);
+ vector unsigned int shuffle_mask_eta = spu_add((vector unsigned int)phase_change,
+ (vector unsigned int)oneup);
+ shuffle_mask_eta = spu_rlqwbyte(shuffle_mask_eta, (unsigned int)src1%16);
+
+
+
+
+
+ vector unsigned char tgt_second = spu_rlqwbyte(second_oneup, -((unsigned int)target%16));
+ //NOTE(review): tgt_first is computed but not referenced again in this function
+ vector unsigned char tgt_first = spu_rlqwbyte(oneup, -((unsigned int)target%16));
+
+ //alpha: first half of first, second half of second, break at (unsigned int)target%16
+ src_cmp = spu_splats((unsigned char)((unsigned int)target%16));
+ gt_res = spu_cmpgt(oneup, src_cmp);
+ eq_res = spu_cmpeq(oneup, src_cmp);
+ cmp_res = spu_or(gt_res, eq_res);
+ phase_change = spu_and(sixteen_uchar, cmp_res);
+ vector unsigned int shuffle_mask_alpha = spu_add((vector unsigned int)phase_change,
+ (vector unsigned int)oneup);
+
+ //delta: first half of first, first half of second, break at (unsigned int)target%16
+ vector unsigned char shuffle_mask_delta = spu_shuffle(oneup, tgt_second, (vector unsigned char)shuffle_mask_alpha);
+ //epsilon: second half of second, second half of first, break at (unsigned int)target%16
+ vector unsigned char shuffle_mask_epsilon = spu_shuffle(tgt_second, oneup, (vector unsigned char)shuffle_mask_alpha);
+ //zeta: second half of second, first half of first, break at 16 - (unsigned int)target%16
+ vector unsigned int shuffle_mask_zeta = spu_rlqwbyte(shuffle_mask_alpha, (unsigned int)target%16);
+
+ //beta: first half of first, second half of second, break at num_bytes%16
+ src_cmp = spu_splats((unsigned char)(num_bytes%16));
+ gt_res = spu_cmpgt(oneup, src_cmp);
+ eq_res = spu_cmpeq(oneup, src_cmp);
+ cmp_res = spu_or(gt_res, eq_res);
+ phase_change = spu_and(sixteen_uchar, cmp_res);
+ vector unsigned int shuffle_mask_beta = spu_add((vector unsigned int)phase_change,
+ (vector unsigned int)oneup);
+
+
+
+
+
+
+ qword src0_past;
+ qword src0_present;
+ qword src1_past;
+ qword src1_present;
+ qword tgt_past;
+ qword tgt_present;
+
+ qword in_temp0;
+ qword in_temp1;
+ qword out_temp0;
+ qword out_temp1;
+
+ //holds the per-element difference src0 - src1 despite its name
+ vector signed short sum;
+
+ //prime the pipeline with the first quadword of each buffer
+ src0_past = si_lqd((qword)address_counter_src0, 0);
+ src1_past = si_lqd((qword)address_counter_src1, 0);
+ tgt_past = si_lqd((qword)address_counter_tgt, 0);
+
+ //main loop: one full 16-byte group of results per iteration
+ for(i = 0; i < num_bytes/16; ++i) {
+
+ src0_present = si_lqd((qword)address_counter_src0, 16);
+ src1_present = si_lqd((qword)address_counter_src1, 16);
+ tgt_present = si_lqd((qword)address_counter_tgt, 16);
+
+ //gather the 16 bytes that start at each unaligned source position
+ in_temp0 = spu_shuffle(src0_present, src0_past, (vector unsigned char)shuffle_mask_gamma);
+ in_temp1 = spu_shuffle(src1_present, src1_past, (vector unsigned char)shuffle_mask_eta);
+
+ sum = spu_sub((vector signed short)in_temp0, (vector signed short)in_temp1);
+
+
+ //split the result across the two target quadwords, keeping the target
+ //bytes that lie outside the 16-byte window
+ out_temp0 = spu_shuffle(tgt_past, (qword)sum, shuffle_mask_delta);
+ out_temp1 = spu_shuffle(tgt_present, (qword)sum, shuffle_mask_epsilon);
+
+ si_stqd(out_temp0, (qword)address_counter_tgt, 0);
+ si_stqd(out_temp1, (qword)address_counter_tgt, 16);
+
+ tgt_past = out_temp1;
+ src0_past = src0_present;
+ src1_past = src1_present;
+ address_counter_src0 = spu_add(address_counter_src0, 16);
+ address_counter_src1 = spu_add(address_counter_src1, 16);
+ address_counter_tgt = spu_add(address_counter_tgt, 16);
+
+
+ }
+
+ //tail: handles the remaining num_bytes%16 bytes (runs even when that is 0)
+ src0_present = si_lqd((qword)address_counter_src0, 16);
+ src1_present = si_lqd((qword)address_counter_src1, 16);
+ tgt_present = si_lqd((qword)address_counter_tgt, 16);
+
+
+ in_temp0 = spu_shuffle(src0_present, src0_past, (vector unsigned char) shuffle_mask_gamma);
+ in_temp1 = spu_shuffle(src1_present, src1_past, (vector unsigned char) shuffle_mask_eta);
+ sum = spu_sub((vector signed short)in_temp0, (vector signed short)in_temp1);
+ //target_temp is the current content of the 16-byte target window; meld takes
+ //the first num_bytes%16 bytes from the new result and the rest from it
+ qword target_temp = spu_shuffle(tgt_present, tgt_past, (vector unsigned char) shuffle_mask_zeta);
+ qword meld = spu_shuffle((qword)sum, target_temp, (vector unsigned char)shuffle_mask_beta);
+
+
+
+ out_temp0 = spu_shuffle(tgt_past, meld, shuffle_mask_delta);
+ out_temp1 = spu_shuffle(tgt_present, meld, shuffle_mask_epsilon);
+
+ si_stqd(out_temp0, (qword)address_counter_tgt, 0);
+ si_stqd(out_temp1, (qword)address_counter_tgt, 16);
+
+ return retval;
+}
+
+
+
+/*
+int main(){
+
+ signed short pooh[48];
+ signed short bear[48];
+ signed short res[48];
+
+ int i = 0;
+ for(i = 0; i < 48; ++i){
+ pooh[i] = i;
+ }
+ for(i = 48; i < 96; ++i){
+ bear[i - 48] = i;
+ }
+
+ libvector_16s_vector_subtract_unaligned(res, &pooh[0], &bear[0], 48 * sizeof(signed short));
+
+ for(i = 0; i < 48; ++i) {
+ printf("%d, ", res[i]);
+ }
+ printf("\n");
+}
+*/
+
diff --git a/volk/spu_lib/spu_16s_vector_sum_unaligned.c b/volk/spu_lib/spu_16s_vector_sum_unaligned.c
new file mode 100644
index 000000000..0097b4f56
--- /dev/null
+++ b/volk/spu_lib/spu_16s_vector_sum_unaligned.c
@@ -0,0 +1,178 @@
+#include<spu_intrinsics.h>
+
+/*
+ * Element-wise 16-bit addition: target[k] = src0[k] + src1[k].
+ *
+ * None of the pointers has to be 16-byte aligned: memory is only touched
+ * through quadword loads/stores (which access the aligned quadword containing
+ * the address), and shuffle masks derived from the pointers' low four bits
+ * splice the data in and out.
+ *
+ * target    - destination buffer
+ * src0      - first addend buffer of signed shorts
+ * src1      - second addend buffer of signed shorts
+ * num_bytes - number of bytes to process
+ *
+ * Returns target.
+ *
+ * Each step loads and stores the quadword at the current address and the one
+ * 16 bytes after it, so quadwords adjacent to the buffers are read, and on the
+ * target side written back with the bytes that were loaded from them.
+ */
+void* libvector_16s_vector_sum_unaligned(void* target, void* src0, void* src1, unsigned int num_bytes){
+ //loop iterator i (unsigned to match num_bytes)
+ unsigned int i = 0;
+ void* retval = target;
+
+
+ //put the target and source addresses into qwords
+ vector unsigned int address_counter_tgt = {(unsigned int)target, 0, 0, 0};
+ vector unsigned int address_counter_src0 = {(unsigned int)src0, 0, 0 ,0};
+ vector unsigned int address_counter_src1 = {(unsigned int)src1, 0, 0, 0};
+
+ //create shuffle masks
+
+ //shuffle mask building blocks:
+ //all from the first vector
+ vector unsigned char oneup = {0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
+ 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f};
+ //all from the second vector
+ vector unsigned char second_oneup = {0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17,
+ 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f};
+
+
+
+ //gamma: second half of the second, first half of the first, break at (unsigned int)src0%16
+ //cmp_res is all-ones in every byte whose index is >= the break point; adding
+ //16 to those indices makes the shuffle take them from its second operand
+ vector unsigned char src_cmp = spu_splats((unsigned char)((unsigned int)src0%16));
+ vector unsigned char gt_res = spu_cmpgt(oneup, src_cmp);
+ vector unsigned char eq_res = spu_cmpeq(oneup, src_cmp);
+ vector unsigned char cmp_res = spu_or(gt_res, eq_res);
+ vector unsigned char sixteen_uchar = spu_splats((unsigned char)16);
+ vector unsigned char phase_change = spu_and(sixteen_uchar, cmp_res);
+ vector unsigned int shuffle_mask_gamma = spu_add((vector unsigned int)phase_change,
+ (vector unsigned int)oneup);
+ shuffle_mask_gamma = spu_rlqwbyte(shuffle_mask_gamma, (unsigned int)src0%16);
+
+ //eta: second half of the second, first half of the first, break at (unsigned int)src1%16
+ src_cmp = spu_splats((unsigned char)((unsigned int)src1%16));
+ gt_res = spu_cmpgt(oneup, src_cmp);
+ eq_res = spu_cmpeq(oneup, src_cmp);
+ cmp_res = spu_or(gt_res, eq_res);
+ phase_change = spu_and(sixteen_uchar, cmp_res);
+ vector unsigned int shuffle_mask_eta = spu_add((vector unsigned int)phase_change,
+ (vector unsigned int)oneup);
+ shuffle_mask_eta = spu_rlqwbyte(shuffle_mask_eta, (unsigned int)src1%16);
+
+
+
+
+
+ vector unsigned char tgt_second = spu_rlqwbyte(second_oneup, -((unsigned int)target%16));
+
+ //alpha: first half of first, second half of second, break at (unsigned int)target%16
+ src_cmp = spu_splats((unsigned char)((unsigned int)target%16));
+ gt_res = spu_cmpgt(oneup, src_cmp);
+ eq_res = spu_cmpeq(oneup, src_cmp);
+ cmp_res = spu_or(gt_res, eq_res);
+ phase_change = spu_and(sixteen_uchar, cmp_res);
+ vector unsigned int shuffle_mask_alpha = spu_add((vector unsigned int)phase_change,
+ (vector unsigned int)oneup);
+
+ //delta: first half of first, first half of second, break at (unsigned int)target%16
+ vector unsigned char shuffle_mask_delta = spu_shuffle(oneup, tgt_second, (vector unsigned char)shuffle_mask_alpha);
+ //epsilon: second half of second, second half of first, break at (unsigned int)target%16
+ vector unsigned char shuffle_mask_epsilon = spu_shuffle(tgt_second, oneup, (vector unsigned char)shuffle_mask_alpha);
+ //zeta: second half of second, first half of first, break at 16 - (unsigned int)target%16
+ vector unsigned int shuffle_mask_zeta = spu_rlqwbyte(shuffle_mask_alpha, (unsigned int)target%16);
+
+ //beta: first half of first, second half of second, break at num_bytes%16
+ src_cmp = spu_splats((unsigned char)(num_bytes%16));
+ gt_res = spu_cmpgt(oneup, src_cmp);
+ eq_res = spu_cmpeq(oneup, src_cmp);
+ cmp_res = spu_or(gt_res, eq_res);
+ phase_change = spu_and(sixteen_uchar, cmp_res);
+ vector unsigned int shuffle_mask_beta = spu_add((vector unsigned int)phase_change,
+ (vector unsigned int)oneup);
+
+
+
+
+
+
+ qword src0_past;
+ qword src0_present;
+ qword src1_past;
+ qword src1_present;
+ qword tgt_past;
+ qword tgt_present;
+
+ qword in_temp0;
+ qword in_temp1;
+ qword out_temp0;
+ qword out_temp1;
+
+ //16-bit lanes: with 32-bit lanes a carry out of a low halfword would
+ //spill into the neighbouring element
+ vector signed short sum;
+
+ //prime the pipeline with the first quadword of each buffer
+ src0_past = si_lqd((qword)address_counter_src0, 0);
+ src1_past = si_lqd((qword)address_counter_src1, 0);
+ tgt_past = si_lqd((qword)address_counter_tgt, 0);
+
+ //main loop: one full 16-byte group of results per iteration
+ for(i = 0; i < num_bytes/16; ++i) {
+
+ src0_present = si_lqd((qword)address_counter_src0, 16);
+ src1_present = si_lqd((qword)address_counter_src1, 16);
+ tgt_present = si_lqd((qword)address_counter_tgt, 16);
+
+ //gather the 16 bytes that start at each unaligned source position
+ in_temp0 = spu_shuffle(src0_present, src0_past, (vector unsigned char)shuffle_mask_gamma);
+ in_temp1 = spu_shuffle(src1_present, src1_past, (vector unsigned char)shuffle_mask_eta);
+
+ sum = spu_add((vector signed short)in_temp0, (vector signed short)in_temp1);
+
+
+ //split the result across the two target quadwords, keeping the target
+ //bytes that lie outside the 16-byte window
+ out_temp0 = spu_shuffle(tgt_past, (qword)sum, shuffle_mask_delta);
+ out_temp1 = spu_shuffle(tgt_present, (qword)sum, shuffle_mask_epsilon);
+
+ si_stqd(out_temp0, (qword)address_counter_tgt, 0);
+ si_stqd(out_temp1, (qword)address_counter_tgt, 16);
+
+ tgt_past = out_temp1;
+ src0_past = src0_present;
+ src1_past = src1_present;
+ address_counter_src0 = spu_add(address_counter_src0, 16);
+ address_counter_src1 = spu_add(address_counter_src1, 16);
+ address_counter_tgt = spu_add(address_counter_tgt, 16);
+
+
+ }
+
+ //tail: handles the remaining num_bytes%16 bytes (runs even when that is 0)
+ src0_present = si_lqd((qword)address_counter_src0, 16);
+ src1_present = si_lqd((qword)address_counter_src1, 16);
+ tgt_present = si_lqd((qword)address_counter_tgt, 16);
+
+
+ in_temp0 = spu_shuffle(src0_present, src0_past, (vector unsigned char) shuffle_mask_gamma);
+ in_temp1 = spu_shuffle(src1_present, src1_past, (vector unsigned char) shuffle_mask_eta);
+ sum = spu_add((vector signed short)in_temp0, (vector signed short)in_temp1);
+ //target_temp is the current content of the 16-byte target window; meld takes
+ //the first num_bytes%16 bytes from the new result and the rest from it
+ qword target_temp = spu_shuffle(tgt_present, tgt_past, (vector unsigned char) shuffle_mask_zeta);
+ qword meld = spu_shuffle((qword)sum, target_temp, (vector unsigned char)shuffle_mask_beta);
+
+
+
+ out_temp0 = spu_shuffle(tgt_past, meld, shuffle_mask_delta);
+ out_temp1 = spu_shuffle(tgt_present, meld, shuffle_mask_epsilon);
+
+ si_stqd(out_temp0, (qword)address_counter_tgt, 0);
+ si_stqd(out_temp1, (qword)address_counter_tgt, 16);
+
+ return retval;
+}
+
+
+
+/*
+int main(){
+
+ signed short pooh[48];
+ signed short bear[48];
+ signed short res[48];
+
+ int i = 0;
+ for(i = 0; i < 48; ++i){
+ pooh[i] = i;
+ }
+ for(i = 48; i < 96; ++i){
+ bear[i - 48] = i;
+ }
+
+ libvector_16s_vector_sum_unaligned(&pooh[9], &pooh[9], &bear[3], 30);
+
+ for(i = 0; i < 48; ++i) {
+ printf("%d, ", pooh[i]);
+ }
+ printf("\n");
+}
+*/
+
diff --git a/volk/spu_lib/spu_32fc_pointwise_multiply_unaligned.c b/volk/spu_lib/spu_32fc_pointwise_multiply_unaligned.c
new file mode 100644
index 000000000..d1c960488
--- /dev/null
+++ b/volk/spu_lib/spu_32fc_pointwise_multiply_unaligned.c
@@ -0,0 +1,222 @@
+#include<spu_intrinsics.h>
+
+
+
+
+/*
+ * Build the 16-byte shuffle pattern "first operand below k, second operand
+ * from k on": byte j of the result is j for j < k and j + 16 for j >= k.
+ * oneup must be the identity pattern {0x00 .. 0x0f}; callers pass k in 0..15.
+ */
+static inline vector unsigned char shuffle_break_at(vector unsigned char oneup, unsigned int k){
+  vector unsigned char split = spu_splats((unsigned char)k);
+  vector unsigned char at_or_past = spu_or(spu_cmpgt(oneup, split), spu_cmpeq(oneup, split));
+  vector unsigned char bump = spu_and(spu_splats((unsigned char)16), at_or_past);
+  //the sum is done on 32-bit words (as the code always did); no byte exceeds 31, so no carry crosses a byte
+  return (vector unsigned char)spu_add((vector unsigned int)bump, (vector unsigned int)oneup);
+}
+
+/*
+ * Multiply the two complex floats held in a by the two complex floats held
+ * in b.  Float lanes (0,1) and (2,3) are treated as (real, imaginary) pairs:
+ *   result.re = a.re*b.re - a.im*b.im
+ *   result.im = a.re*b.im + a.im*b.re
+ */
+static inline vector float complex_mul_2x32fc(qword a, qword b){
+  //swap real and imaginary lanes inside each complex value
+  vector unsigned char shuffle_mask_complexprod0 = {0x04, 0x05, 0x06, 0x07, 0x00, 0x01, 0x02, 0x03,
+                                                    0x0c, 0x0d, 0x0e, 0x0f, 0x08, 0x09, 0x0a, 0x0b};
+  //lanes 0 and 2 of the first operand interleaved with lanes 0 and 2 of the second
+  vector unsigned char shuffle_mask_complexprod1 = {0x00, 0x01, 0x02, 0x03, 0x10, 0x11, 0x12, 0x13,
+                                                    0x08, 0x09, 0x0a, 0x0b, 0x18, 0x19, 0x1a, 0x1b};
+  //lanes 1 and 3 of the first operand interleaved with lanes 1 and 3 of the second
+  vector unsigned char shuffle_mask_complexprod2 = {0x04, 0x05, 0x06, 0x07, 0x14, 0x15, 0x16, 0x17,
+                                                    0x0c, 0x0d, 0x0e, 0x0f, 0x1c, 0x1d, 0x1e, 0x1f};
+  //flips the sign bit of float lanes 1 and 3
+  vector unsigned char sign_changer = {0x00, 0x00, 0x00, 0x00, 0x80, 0x00, 0x00, 0x00,
+                                       0x00, 0x00, 0x00, 0x00, 0x80, 0x00, 0x00, 0x00};
+
+  //prod0 = {a.re*b.re, a.im*b.im, ...}
+  vector float prod0 = spu_mul((vector float)a, (vector float)b);
+  //shuf0 = {b.im, b.re, ...}
+  qword shuf0 = spu_shuffle(b, b, shuffle_mask_complexprod0);
+  //prod1 = {a.re*b.im, a.im*b.re, ...}; this must multiply a itself, not prod0
+  vector float prod1 = spu_mul((vector float)a, (vector float)shuf0);
+  //sign_change = {a.re*b.re, -(a.im*b.im), ...}
+  vector float sign_change = spu_xor(prod0, (vector float)sign_changer);
+
+  //summand0 = {a.re*b.re, a.re*b.im, ...}, summand1 = {-(a.im*b.im), a.im*b.re, ...}
+  qword summand0 = spu_shuffle((qword)sign_change, (qword)prod1, shuffle_mask_complexprod1);
+  qword summand1 = spu_shuffle((qword)sign_change, (qword)prod1, shuffle_mask_complexprod2);
+
+  return spu_add((vector float)summand0, (vector float)summand1);
+}
+
+/*
+ * Pointwise complex multiply of two arrays of interleaved (real, imaginary)
+ * 32-bit floats: target[k] = src0[k] * src1[k].  None of the three pointers
+ * needs to be 16-byte aligned.
+ *
+ * target    - destination buffer
+ * src0/src1 - input buffers
+ * num_bytes - number of bytes to produce (16 bytes == two complex values)
+ *
+ * Returns target.
+ *
+ * Memory is only accessed as whole quadwords.  Each step loads the quadword
+ * 16 bytes past the current position of every buffer, and the target
+ * quadwords around the written range are read, merged and stored back, so
+ * bytes outside [target, target + num_bytes) are rewritten with the values
+ * that were loaded from them.
+ */
+void* libvector_pointwise_multiply_32fc_unaligned(void* target, void* src0, void* src1, unsigned int num_bytes){
+  //loop iterator i (unsigned, like the bound it is compared against)
+  unsigned int i;
+  void* retval = target;
+
+  //byte offsets of the three buffers inside their quadwords
+  unsigned int src0_off = (unsigned int)src0%16;
+  unsigned int src1_off = (unsigned int)src1%16;
+  unsigned int tgt_off = (unsigned int)target%16;
+
+  //put the target and source addresses into qwords
+  vector unsigned int address_counter_tgt = {(unsigned int)target, 0, 0, 0};
+  vector unsigned int address_counter_src0 = {(unsigned int)src0, 0, 0 ,0};
+  vector unsigned int address_counter_src1 = {(unsigned int)src1, 0, 0, 0};
+
+  //create shuffle masks
+
+  //shuffle mask building blocks:
+  //all from the first vector
+  vector unsigned char oneup = {0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
+                                0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f};
+  //all from the second vector
+  vector unsigned char second_oneup = {0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17,
+                                       0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f};
+
+  //gamma: second half of the second, first half of the first, break at (unsigned int)src0%16
+  //applied to (present, past) it yields the 16 source bytes starting at the unaligned src0 position
+  vector unsigned char shuffle_mask_gamma = spu_rlqwbyte(shuffle_break_at(oneup, src0_off), src0_off);
+
+  //eta: the same as gamma, for src1
+  vector unsigned char shuffle_mask_eta = spu_rlqwbyte(shuffle_break_at(oneup, src1_off), src1_off);
+
+  //second_oneup rotated right by the target offset
+  vector unsigned char tgt_second = spu_rlqwbyte(second_oneup, -tgt_off);
+
+  //alpha: first half of first, second half of second, break at (unsigned int)target%16
+  vector unsigned char shuffle_mask_alpha = shuffle_break_at(oneup, tgt_off);
+
+  //delta: first half of first, first half of second, break at (unsigned int)target%16
+  vector unsigned char shuffle_mask_delta = spu_shuffle(oneup, tgt_second, shuffle_mask_alpha);
+  //epsilon: second half of second, second half of first, break at (unsigned int)target%16
+  vector unsigned char shuffle_mask_epsilon = spu_shuffle(tgt_second, oneup, shuffle_mask_alpha);
+  //zeta: second half of second, first half of first, break at 16 - (unsigned int)target%16
+  vector unsigned char shuffle_mask_zeta = spu_rlqwbyte(shuffle_mask_alpha, tgt_off);
+
+  //beta: first half of first, second half of second, break at num_bytes%16
+  vector unsigned char shuffle_mask_beta = shuffle_break_at(oneup, num_bytes%16);
+
+  qword src0_past;
+  qword src0_present;
+  qword src1_past;
+  qword src1_present;
+  qword tgt_past;
+  qword tgt_present;
+
+  qword in_temp0;
+  qword in_temp1;
+  qword out_temp0;
+  qword out_temp1;
+
+  vector float sum;
+
+  src0_past = si_lqd((qword)address_counter_src0, 0);
+  src1_past = si_lqd((qword)address_counter_src1, 0);
+  tgt_past = si_lqd((qword)address_counter_tgt, 0);
+
+  //one full quadword of output (two complex values) per iteration
+  for(i = 0; i < num_bytes/16; ++i) {
+
+    src0_present = si_lqd((qword)address_counter_src0, 16);
+    src1_present = si_lqd((qword)address_counter_src1, 16);
+    tgt_present = si_lqd((qword)address_counter_tgt, 16);
+
+    //realign the inputs
+    in_temp0 = spu_shuffle(src0_present, src0_past, shuffle_mask_gamma);
+    in_temp1 = spu_shuffle(src1_present, src1_past, shuffle_mask_eta);
+
+    sum = complex_mul_2x32fc(in_temp0, in_temp1);
+
+    //spread the result over the two target quadwords it straddles
+    out_temp0 = spu_shuffle(tgt_past, (qword)sum, shuffle_mask_delta);
+    out_temp1 = spu_shuffle(tgt_present, (qword)sum, shuffle_mask_epsilon);
+
+    si_stqd(out_temp0, (qword)address_counter_tgt, 0);
+    si_stqd(out_temp1, (qword)address_counter_tgt, 16);
+
+    //out_temp1 already carries the bytes written into the next quadword
+    tgt_past = out_temp1;
+    src0_past = src0_present;
+    src1_past = src1_present;
+    address_counter_src0 = spu_add(address_counter_src0, 16);
+    address_counter_src1 = spu_add(address_counter_src1, 16);
+    address_counter_tgt = spu_add(address_counter_tgt, 16);
+  }
+
+  //tail: the remaining num_bytes%16 bytes (runs with an empty range when that is 0)
+  src0_present = si_lqd((qword)address_counter_src0, 16);
+  src1_present = si_lqd((qword)address_counter_src1, 16);
+  tgt_present = si_lqd((qword)address_counter_tgt, 16);
+
+  in_temp0 = spu_shuffle(src0_present, src0_past, shuffle_mask_gamma);
+  in_temp1 = spu_shuffle(src1_present, src1_past, shuffle_mask_eta);
+
+  //same product as in the loop
+  sum = complex_mul_2x32fc(in_temp0, in_temp1);
+
+  //target_temp: the 16 bytes currently stored at the unaligned target position
+  qword target_temp = spu_shuffle(tgt_present, tgt_past, shuffle_mask_zeta);
+  //meld: the first num_bytes%16 bytes from the result, the rest from the existing target data
+  qword meld = spu_shuffle((qword)sum, target_temp, shuffle_mask_beta);
+
+  out_temp0 = spu_shuffle(tgt_past, meld, shuffle_mask_delta);
+  out_temp1 = spu_shuffle(tgt_present, meld, shuffle_mask_epsilon);
+
+  si_stqd(out_temp0, (qword)address_counter_tgt, 0);
+  si_stqd(out_temp1, (qword)address_counter_tgt, 16);
+
+  return retval;
+}
+
+
+
+/*
+int main(){
+
+ float pooh[48];
+ float bear[48];
+ float res[48];
+
+ int i = 0;
+ for(i = 0; i < 48; ++i){
+ pooh[i] = (float) i;
+ }
+ for(i = 48; i < 96; ++i){
+ bear[i - 48] = (float) i;
+ }
+
+ vector_product_complex(res, pooh, bear, 48*sizeof(float));
+
+
+
+ for(i = 0; i < 48; ++i) {
+ printf("%f, ", res[i]);
+ }
+ printf("\n");
+
+
+}
+*/
+
diff --git a/volk/spu_lib/spu_memcpy_unaligned.c b/volk/spu_lib/spu_memcpy_unaligned.c
new file mode 100644
index 000000000..0f15b5d80
--- /dev/null
+++ b/volk/spu_lib/spu_memcpy_unaligned.c
@@ -0,0 +1,290 @@
+#include<libvector/libvector_memcpy_unaligned.h>
+#include<spu_intrinsics.h>
+
+/*
+ * Build the 16-byte shuffle pattern "first operand below k, second operand
+ * from k on": byte j of the result is j for j < k and j + 16 for j >= k.
+ * oneup must be the identity pattern {0x00 .. 0x0f}; callers pass k in 0..15.
+ */
+static inline vector unsigned char shuffle_break_at(vector unsigned char oneup, unsigned int k){
+  vector unsigned char split = spu_splats((unsigned char)k);
+  vector unsigned char at_or_past = spu_or(spu_cmpgt(oneup, split), spu_cmpeq(oneup, split));
+  vector unsigned char bump = spu_and(spu_splats((unsigned char)16), at_or_past);
+  //the sum is done on 32-bit words (as the code always did); no byte exceeds 31, so no carry crosses a byte
+  return (vector unsigned char)spu_add((vector unsigned int)bump, (vector unsigned int)oneup);
+}
+
+/*
+ * Copy num_bytes bytes from src to target.  Neither pointer needs to be
+ * 16-byte aligned.
+ *
+ * Returns target.
+ *
+ * Memory is only accessed as whole quadwords.  Each step loads the quadword
+ * 16 bytes past the current position of both buffers, and the target
+ * quadwords around the written range are read, merged and stored back, so
+ * bytes outside [target, target + num_bytes) are rewritten with the values
+ * that were loaded from them.
+ */
+void* libvector_memcpy_unaligned(void* target, void* src, unsigned int num_bytes){
+  //loop iterator i (unsigned, like the bound it is compared against)
+  unsigned int i;
+  void* retval = target;
+
+  //byte offsets of the two buffers inside their quadwords
+  unsigned int src_off = (unsigned int)src%16;
+  unsigned int tgt_off = (unsigned int)target%16;
+
+  //put the target and source addresses into qwords
+  vector unsigned int address_counter_tgt = {(unsigned int)target, 0, 0, 0};
+  vector unsigned int address_counter_src = {(unsigned int)src, 0, 0 ,0};
+
+  //create shuffle masks
+
+  //shuffle mask building blocks:
+  //all from the first vector
+  vector unsigned char oneup = {0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
+                                0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f};
+  //all from the second vector
+  vector unsigned char second_oneup = {0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17,
+                                       0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f};
+
+  //gamma: second half of the second, first half of the first, break at (unsigned int)src%16
+  //applied to (present, past) it yields the 16 source bytes starting at the unaligned src position
+  vector unsigned char shuffle_mask_gamma = spu_rlqwbyte(shuffle_break_at(oneup, src_off), src_off);
+
+  //second_oneup rotated right by the target offset
+  vector unsigned char tgt_second = spu_rlqwbyte(second_oneup, -tgt_off);
+
+  //alpha: first half of first, second half of second, break at (unsigned int)target%16
+  vector unsigned char shuffle_mask_alpha = shuffle_break_at(oneup, tgt_off);
+
+  //delta: first half of first, first half of second, break at (unsigned int)target%16
+  vector unsigned char shuffle_mask_delta = spu_shuffle(oneup, tgt_second, shuffle_mask_alpha);
+  //epsilon: second half of second, second half of first, break at (unsigned int)target%16
+  vector unsigned char shuffle_mask_epsilon = spu_shuffle(tgt_second, oneup, shuffle_mask_alpha);
+  //zeta: second half of second, first half of first, break at 16 - (unsigned int)target%16
+  vector unsigned char shuffle_mask_zeta = spu_rlqwbyte(shuffle_mask_alpha, tgt_off);
+
+  //beta: first half of first, second half of second, break at num_bytes%16
+  vector unsigned char shuffle_mask_beta = shuffle_break_at(oneup, num_bytes%16);
+
+  qword src_past;
+  qword src_present;
+  qword tgt_past;
+  qword tgt_present;
+
+  qword in_temp;
+  qword out_temp0;
+  qword out_temp1;
+
+  src_past = si_lqd((qword)address_counter_src, 0);
+  tgt_past = si_lqd((qword)address_counter_tgt, 0);
+
+  //one full quadword of payload per iteration
+  for(i = 0; i < num_bytes/16; ++i) {
+
+    src_present = si_lqd((qword)address_counter_src, 16);
+    tgt_present = si_lqd((qword)address_counter_tgt, 16);
+
+    //realign the source bytes
+    in_temp = spu_shuffle(src_present, src_past, shuffle_mask_gamma);
+
+    //spread them over the two target quadwords they straddle
+    out_temp0 = spu_shuffle(tgt_past, in_temp, shuffle_mask_delta);
+    out_temp1 = spu_shuffle(tgt_present, in_temp, shuffle_mask_epsilon);
+
+    si_stqd(out_temp0, (qword)address_counter_tgt, 0);
+    si_stqd(out_temp1, (qword)address_counter_tgt, 16);
+
+    //out_temp1 already carries the bytes written into the next quadword
+    tgt_past = out_temp1;
+    src_past = src_present;
+    address_counter_src = spu_add(address_counter_src, 16);
+    address_counter_tgt = spu_add(address_counter_tgt, 16);
+  }
+
+  //tail: the remaining num_bytes%16 bytes (runs with an empty range when that is 0)
+  src_present = si_lqd((qword)address_counter_src, 16);
+  tgt_present = si_lqd((qword)address_counter_tgt, 16);
+
+  in_temp = spu_shuffle(src_present, src_past, shuffle_mask_gamma);
+  //target_temp: the 16 bytes currently stored at the unaligned target position
+  qword target_temp = spu_shuffle(tgt_present, tgt_past, shuffle_mask_zeta);
+  //meld: the first num_bytes%16 bytes from the source, the rest from the existing target data
+  qword meld = spu_shuffle(in_temp, target_temp, shuffle_mask_beta);
+
+  out_temp0 = spu_shuffle(tgt_past, meld, shuffle_mask_delta);
+  out_temp1 = spu_shuffle(tgt_present, meld, shuffle_mask_epsilon);
+
+  si_stqd(out_temp0, (qword)address_counter_tgt, 0);
+  si_stqd(out_temp1, (qword)address_counter_tgt, 16);
+
+  return retval;
+}
+
+
+
+/*
+void* mcpy(void* target, void* src, size_t num_bytes){
+ //loop iterator i
+ int i = 0;
+ void* retval = src;
+
+ //put the target and source addresses into qwords
+ vector unsigned int address_counter_tgt = {(unsigned int)target, 0, 0, 0};
+ vector unsigned int address_counter_src = {(unsigned int)src, 0, 0 ,0};
+
+ //create shuffle masks
+
+ //shuffle mask building blocks:
+ //all from the first vector
+ vector unsigned char oneup = {0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
+ 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f};
+ //all from the second vector
+ vector unsigned char second_oneup = {0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17,
+ 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f};
+
+
+
+ //gamma: second half of the second, first half of the first, break at src%16
+ vector unsigned char src_cmp = spu_splats((unsigned char)(src%16));
+ vector unsigned char gt_res = spu_cmpgt(oneup, src_cmp);
+ vector unsigned char eq_res = spu_cmpeq(oneup, src_cmp);
+ vector unsigned char cmp_res = spu_or(gt_res, eq_res);
+ vector unsigned char sixteen_uchar = spu_splats((unsigned char)16);
+ vector unsigned char phase_change = spu_and(sixteen_uchar, cmp_res);
+ vector unsigned int shuffle_mask_gamma = spu_add((vector unsigned int)phase_change,
+ (vector unsigned int)oneup);
+ shuffle_mask_gamma = spu_rlqwbyte(shuffle_mask_gamma, src%16);
+
+
+
+
+ vector unsigned char tgt_second = spu_rlqwbyte(second_oneup, -(target%16));
+ vector unsigned char tgt_first = spu_rlqwbyte(oneup, -(target%16));
+
+ //alpha: first half of first, second half of second, break at target%16
+ src_cmp = spu_splats((unsigned char)(target%16));
+ gt_res = spu_cmpgt(oneup, src_cmp);
+ eq_res = spu_cmpeq(oneup, src_cmp);
+ cmp_res = spu_or(gt_res, eq_res);
+ phase_change = spu_and(sixteen_uchar, cmp_res);
+ vector unsigned int shuffle_mask_alpha = spu_add((vector unsigned int)phase_change,
+ (vector unsigned int)oneup);
+
+ //delta: first half of first, first half of second, break at target%16
+ vector unsigned char shuffle_mask_delta = spu_shuffle(oneup, tgt_second, (vector unsigned char)shuffle_mask_alpha);
+ //epsilon: second half of second, second half of first, break at target%16
+ vector unsigned char shuffle_mask_epsilon = spu_shuffle(tgt_second, oneup, (vector unsigned char)shuffle_mask_alpha);
+ //zeta: second half of second, first half of first, break at 16 - target%16
+ vector unsigned int shuffle_mask_zeta = spu_rlqwbyte(shuffle_mask_alpha, target%16);
+
+ //beta: first half of first, second half of second, break at num_bytes%16
+ src_cmp = spu_splats((unsigned char)(num_bytes%16));
+ gt_res = spu_cmpgt(oneup, src_cmp);
+ eq_res = spu_cmpeq(oneup, src_cmp);
+ cmp_res = spu_or(gt_res, eq_res);
+ phase_change = spu_and(sixteen_uchar, cmp_res);
+ vector unsigned int shuffle_mask_beta = spu_add((vector unsigned int)phase_change,
+ (vector unsigned int)oneup);
+
+
+ printf("num_bytesmod16 %d\n", num_bytes%16);
+ printf("beta %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d\n",
+ spu_extract((vector unsigned char) shuffle_mask_beta, 0),
+ spu_extract((vector unsigned char) shuffle_mask_beta, 1),
+ spu_extract((vector unsigned char) shuffle_mask_beta, 2),
+ spu_extract((vector unsigned char) shuffle_mask_beta, 3),
+ spu_extract((vector unsigned char) shuffle_mask_beta, 4),
+ spu_extract((vector unsigned char) shuffle_mask_beta, 5),
+ spu_extract((vector unsigned char) shuffle_mask_beta, 6),
+ spu_extract((vector unsigned char) shuffle_mask_beta, 7),
+ spu_extract((vector unsigned char) shuffle_mask_beta, 8),
+ spu_extract((vector unsigned char) shuffle_mask_beta, 9),
+ spu_extract((vector unsigned char) shuffle_mask_beta, 10),
+ spu_extract((vector unsigned char) shuffle_mask_beta, 11),
+ spu_extract((vector unsigned char) shuffle_mask_beta, 12),
+ spu_extract((vector unsigned char) shuffle_mask_beta, 13),
+ spu_extract((vector unsigned char) shuffle_mask_beta, 14),
+ spu_extract((vector unsigned char) shuffle_mask_beta, 15));
+
+
+
+
+
+
+
+ qword src_past;
+ qword src_present;
+ qword tgt_past;
+ qword tgt_present;
+
+ qword in_temp;
+ qword out_temp0;
+ qword out_temp1;
+
+ src_past = si_lqd((qword)address_counter_src, 0);
+ tgt_past = si_lqd((qword)address_counter_tgt, 0);
+
+ for(i = 0; i < num_bytes/16; ++i) {
+
+ src_present = si_lqd((qword)address_counter_src, 16);
+ tgt_present = si_lqd((qword)address_counter_tgt, 16);
+
+ in_temp = spu_shuffle(src_present, src_past, (vector unsigned char)shuffle_mask_gamma);
+
+ out_temp0 = spu_shuffle(tgt_past, in_temp, shuffle_mask_delta);
+ out_temp1 = spu_shuffle(tgt_present, in_temp, shuffle_mask_epsilon);
+
+ si_stqd(out_temp0, (qword)address_counter_tgt, 0);
+ si_stqd(out_temp1, (qword)address_counter_tgt, 16);
+
+ tgt_past = out_temp1;
+ src_past = src_present;
+ address_counter_src = spu_add(address_counter_src, 16);
+ address_counter_tgt = spu_add(address_counter_tgt, 16);
+
+
+ }
+
+ src_present = si_lqd((qword)address_counter_src, 16);
+ tgt_present = si_lqd((qword)address_counter_tgt, 16);
+
+
+ in_temp = spu_shuffle(src_present, src_past,(vector unsigned char) shuffle_mask_gamma);
+ qword target_temp = spu_shuffle(tgt_present, tgt_past, (vector unsigned char) shuffle_mask_zeta);
+ qword meld = spu_shuffle(in_temp, target_temp, (vector unsigned char)shuffle_mask_beta);
+
+
+
+ out_temp0 = spu_shuffle(tgt_past, meld, shuffle_mask_delta);
+ out_temp1 = spu_shuffle(tgt_present, meld, shuffle_mask_epsilon);
+
+ si_stqd(out_temp0, (qword)address_counter_tgt, 0);
+ si_stqd(out_temp1, (qword)address_counter_tgt, 16);
+
+ return retval;
+
+}
+*/
+/*
+int main(){
+
+ unsigned char pooh[48];
+ unsigned char bear[48];
+
+ int i = 0;
+ for(i = 0; i < 48; ++i){
+ pooh[i] = i;
+ bear[i] = i;
+ }
+
+ spu_mcpy(&pooh[9],&bear[3], 15);
+
+ for(i = 0; i < 48; ++i) {
+ printf("%d, ", pooh[i]);
+ }
+ printf("\n");
+}
+
+*/
diff --git a/volk/spu_lib/spu_memset_unaligned.S b/volk/spu_lib/spu_memset_unaligned.S
new file mode 100644
index 000000000..c260a125c
--- /dev/null
+++ b/volk/spu_lib/spu_memset_unaligned.S
@@ -0,0 +1,185 @@
+/* -*- asm -*- */
+/*
+ * Copyright 2008 Free Software Foundation, Inc.
+ *
+ * This file is part of GNU Radio
+ *
+ * GNU Radio is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 3, or (at your option)
+ * any later version.
+ *
+ * GNU Radio is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write to the Free Software Foundation, Inc.,
+ * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ */
+
+#include "gc_spu_macs.h"
+
+ .file "spu_memset_unaligned.S"
+
+ /*
+ * Computes this, only a lot faster...
+ *
+ * void *
+ * libvector_memset_unaligned(void *pv, int c, size_t n)
+ * {
+ * unsigned char *p = (unsigned char *) pv;
+ * size_t i;
+ * for (i = 0; i < n; i++)
+ * p[i] = c;
+ *
+ * return pv;
+ * }
+ */
+
+#define p_arg arg1 // we're going to clobber arg1 w/ the return value
+#define c arg2 // the constant we're writing
+#define n arg3 // how many bytes to write
+
+#define p r13 // where we're writing
+#define t0 r14 // scratch
+#define t1 r15 // scratch
+#define mask r16 // byte-select mask for the partial-quadword merges (selb)
+#define old r17 // previous contents of a partially written quadword
+#define an r18 // aligned n (n rounded down to mod 16 boundary)
+#define next_p r19 // p advanced past the quadword(s) being stored
+#define cond1 r20 // middle_loop: nonzero while another 128-byte pass remains
+#define cond2 r21 // n | an, zero when nothing is left after middle_loop
+#define m r22 // do_head: number of bytes written in the first quadword
+#define r r23 // do_head: bytes from p to the end of its quadword (see diagram)
+
+ PROC_ENTRY(libvector_memset_unaligned)
+
+ // Hint the return from do_head, in case we go that way.
+ // There's pretty much nothing we can do to hint the branch to it.
+ hbrr do_head_br, head_complete
+
+ MR(p, p_arg) // leaves p, the return value, in the correct reg (r3)
+ BRZ_RETURN(n)
+
+ MODULO(t0, p, 16) // is p%16 == 0?
+ VSPLTB(c, c, 3) // splat byte in preferred slot of c into all slots
+ brnz t0, do_head // no, handle it
+head_complete:
+
+ /*
+ * preconditions:
+ * p%16 == 0, n > 0
+ */
+ hbrr middle_loop_br, middle_loop
+
+ ROUND_DOWN(an, n, 16) // an is "aligned n"
+ MODULO(n, n, 16) // what's left over in the last quad
+ brz an, do_tail // no whole quad words; skip to tail
+ clgti t0, an, 127 // an >= 128?
+ brz t0, middle2 // nope, go handle the cases between 0 and 112
+
+ /*
+ * 128 bytes / iteration
+ */
+ .p2align 4
+middle_loop:
+ ai an, an, -128 // an -= 128 (bytes stored by this pass)
+ stqd c, 0*16(p)
+ ai next_p, p, 128 // next_p = p + 128
+ stqd c, 1*16(p)
+ cgti cond1, an, 127 // cond1 = (an > 127), signed compare: another full pass remains
+ stqd c, 2*16(p)
+
+ stqd c, 3*16(p)
+ stqd c, 4*16(p)
+ stqd c, 5*16(p)
+ stqd c, 6*16(p)
+
+ MR(p, next_p) // p = next_p
+ stqd c, 7*16-128(next_p) // eighth quad of this pass, addressed relative to next_p (old p + 7*16)
+ or cond2, n, an // cond2 == 0 only when both n and an are zero
+middle_loop_br:
+ brnz cond1, middle_loop
+
+ /*
+ * if an and n are both zero, return now
+ */
+ BRZ_RETURN(cond2)
+
+ /*
+ * otherwise handle last of full quad words
+ *
+ * 0 <= an < 128, p%16 == 0
+ */
+middle2:
+ /*
+ * if an == 0, go handle the final non-full quadword
+ */
+ brz an, do_tail
+ hbrr middle2_loop_br, middle2_loop
+
+ .p2align 3
+middle2_loop:
+ ai next_p, p, 16 // next_p = p + 16
+ stqd c, 0(p) // store one full quadword of c
+ ai an, an, -16 // an -= 16
+ LMR(p, next_p) // p = next_p (presumably a move-register variant of MR; defined in gc_spu_macs.h)
+middle2_loop_br:
+ brnz an, middle2_loop
+
+ /* We're done with the full quadwords. */
+
+ /*
+ * Handle the final partial quadword.
+ * We'll be modifying only the left hand portion of the quad.
+ *
+ * preconditions:
+ * an == 0, 0 <= n < 16, p%16 == 0
+ */
+do_tail:
+ HINT_RETURN(do_tail_ret)
+ il mask, -1 // mask = all 1's
+ sfi t1, n, 16 // t1 = 16 - n
+ lqd old, 0(p) // old = current contents of the last quadword
+ shlqby mask, mask, t1 // shift left t1 bytes: 1's stay in the leftmost n bytes only
+ selb t0, old, c, mask // c where mask is 1 (leftmost n bytes), old elsewhere
+ stqd t0, 0(p) // write the merged quadword back
+do_tail_ret:
+ RETURN()
+
+ /*
+ * ----------------------------------------------------------------
+ * Handle the first partial quadword
+ *
+ * preconditions:
+ * p%16 != 0
+ *
+ * postconditions:
+ * p%16 == 0 or n == 0
+ *
+ * |-- m --|
+ * +----------------+----------------+
+ * | //////// | |
+ * +----------------+----------------+
+ * |----- r -----|
+ * p
+ * ----------------------------------------------------------------
+ */
+do_head:
+ lqd old, 0(p) // old = quadword containing p (lqd ignores the low 4 address bits)
+ MODULO_NEG(r, p, 16) // presumably r = (-p) % 16, the bytes from p to the quad's end per the diagram -- confirm in gc_spu_macs.h
+ il mask, -1 // mask = all 1's
+ UMIN(m, r, n) // presumably m = unsigned min(r, n): bytes to set in this quad -- confirm in gc_spu_macs.h
+ shlqby mask, mask, m // 1's in the top, m*8 0's in the bottom
+ MR(t1, p) // t1 = original p, used as the store address below
+ sf t0, m, r // t0 = r - m
+ a p, p, m // p += m
+ rotqby mask, mask, t0 // rotate 0's to the right place
+ sf n, m, n // n -= m
+ selb t0, c, old, mask // merge
+ stqd t0, 0(t1) // write the merged first quadword back
+ BRZ_RETURN(n)
+do_head_br:
+ br head_complete