From 239144659b29c0a5ecd83a34e0e57387a1060ed7 Mon Sep 17 00:00:00 2001
From: Tom Rondeau
Date: Tue, 7 Dec 2010 18:50:28 -0500
Subject: Initial checkin for VOLK - Vector-Optimized Library of Kernels.

This is a new SIMD library. It currently stands by itself under the GNU Radio
tree and can be used separately. We will integrate the build process into
GNU Radio and start building off of its functionality over time.
---
 volk/spu_lib/gc_spu_macs.h                       | 380 +++++++++++++++++++++
 volk/spu_lib/spu_16s_cmpgt_unaligned.c           | 160 +++++++++
 volk/spu_lib/spu_16s_vector_subtract_unaligned.c | 178 ++++++++++
 volk/spu_lib/spu_16s_vector_sum_unaligned.c      | 178 ++++++++++
 .../spu_32fc_pointwise_multiply_unaligned.c      | 222 ++++++++++++
 volk/spu_lib/spu_memcpy_unaligned.c              | 290 ++++++++++++++++
 volk/spu_lib/spu_memset_unaligned.S              | 185 ++++++++++
 7 files changed, 1593 insertions(+)
 create mode 100644 volk/spu_lib/gc_spu_macs.h
 create mode 100644 volk/spu_lib/spu_16s_cmpgt_unaligned.c
 create mode 100644 volk/spu_lib/spu_16s_vector_subtract_unaligned.c
 create mode 100644 volk/spu_lib/spu_16s_vector_sum_unaligned.c
 create mode 100644 volk/spu_lib/spu_32fc_pointwise_multiply_unaligned.c
 create mode 100644 volk/spu_lib/spu_memcpy_unaligned.c
 create mode 100644 volk/spu_lib/spu_memset_unaligned.S

diff --git a/volk/spu_lib/gc_spu_macs.h b/volk/spu_lib/gc_spu_macs.h
new file mode 100644
index 000000000..8e3e3f2a6
--- /dev/null
+++ b/volk/spu_lib/gc_spu_macs.h
@@ -0,0 +1,380 @@
+/* -*- asm -*- */
+/*
+ * Copyright 2008 Free Software Foundation, Inc.
+ *
+ * This file is part of GNU Radio
+ *
+ * GNU Radio is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 3, or (at your option)
+ * any later version.
+ *
+ * GNU Radio is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write to the Free Software Foundation, Inc.,
+ * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ */
+
+#ifndef INCLUDED_GC_SPU_MACS_H
+#define INCLUDED_GC_SPU_MACS_H
+
+/*
+ * This file contains a set of macros that are generally useful when
+ * coding in SPU assembler.
+ *
+ * Note that the multi-instruction macros in here may overwrite
+ * registers 77, 78, and 79 without warning.
+ */
+
+/*
+ * defines for all registers
+ */
+#define r0 $0
+#define r1 $1
+#define r2 $2
+#define r3 $3
+#define r4 $4
+#define r5 $5
+#define r6 $6
+#define r7 $7
+#define r8 $8
+#define r9 $9
+#define r10 $10
+#define r11 $11
+#define r12 $12
+#define r13 $13
+#define r14 $14
+#define r15 $15
+#define r16 $16
+#define r17 $17
+#define r18 $18
+#define r19 $19
+#define r20 $20
+#define r21 $21
+#define r22 $22
+#define r23 $23
+#define r24 $24
+#define r25 $25
+#define r26 $26
+#define r27 $27
+#define r28 $28
+#define r29 $29
+#define r30 $30
+#define r31 $31
+#define r32 $32
+#define r33 $33
+#define r34 $34
+#define r35 $35
+#define r36 $36
+#define r37 $37
+#define r38 $38
+#define r39 $39
+#define r40 $40
+#define r41 $41
+#define r42 $42
+#define r43 $43
+#define r44 $44
+#define r45 $45
+#define r46 $46
+#define r47 $47
+#define r48 $48
+#define r49 $49
+#define r50 $50
+#define r51 $51
+#define r52 $52
+#define r53 $53
+#define r54 $54
+#define r55 $55
+#define r56 $56
+#define r57 $57
+#define r58 $58
+#define r59 $59
+#define r60 $60
+#define r61 $61
+#define r62 $62
+#define r63 $63
+#define r64 $64
+#define r65 $65
+#define r66 $66
+#define r67 $67
+#define r68 $68
+#define r69 $69
+#define r70 $70
+#define r71 $71
+#define r72 $72
+#define r73 $73
+#define r74 $74
+#define r75 $75
+#define r76 $76
+#define r77 $77
+#define r78 $78
+#define r79 $79
+#define r80 $80
+#define r81 $81
+#define r82 $82
+#define r83 $83
+#define r84 $84
+#define r85 $85
+#define r86 $86
+#define r87 $87
+#define r88 $88
+#define r89 $89
+#define r90 $90
+#define r91 $91
+#define r92 $92
+#define r93 $93
+#define r94 $94
+#define r95 $95
+#define r96 $96
+#define r97 $97
+#define r98 $98
+#define r99 $99
+#define r100 $100
+#define r101 $101
+#define r102 $102
+#define r103 $103
+#define r104 $104
+#define r105 $105
+#define r106 $106
+#define r107 $107
+#define r108 $108
+#define r109 $109
+#define r110 $110
+#define r111 $111
+#define r112 $112
+#define r113 $113
+#define r114 $114
+#define r115 $115
+#define r116 $116
+#define r117 $117
+#define r118 $118
+#define r119 $119
+#define r120 $120
+#define r121 $121
+#define r122 $122
+#define r123 $123
+#define r124 $124
+#define r125 $125
+#define r126 $126
+#define r127 $127
+
+
+#define lr     r0   // link register
+#define sp     r1   // stack pointer
+                    // r2 is environment pointer for langs that need it (ALGOL)
+
+#define retval r3   // return values are passed in regs starting at r3
+
+#define arg1   r3   // args are passed in regs starting at r3
+#define arg2   r4
+#define arg3   r5
+#define arg4   r6
+#define arg5   r7
+#define arg6   r8
+#define arg7   r9
+#define arg8   r10
+#define arg9   r11
+#define arg10  r12
+
+// r3  - r74  are volatile (caller saves)
+// r74 - r79  are volatile (scratch regs possibly destroyed by fct prolog/epilog)
+// r80 - r127 are non-volatile (callee-saves)
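+
+/*
+ * Illustration only (hypothetical routine, not part of this header): a
+ * minimal leaf function built from the conventions and macros defined
+ * here.
+ *
+ *     PROC_ENTRY(my_max_words)     // first arg and return value share r3
+ *         MAX(retval, arg1, arg2)  // slot-wise signed max of the two args
+ *         RETURN()                 // branch back through the link register
+ */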
+
+// scratch registers reserved for use by the macros in this file.
+
+#define _gc_t0 r79
+#define _gc_t1 r78
+#define _gc_t2 r77
+
+/*
+ * ----------------------------------------------------------------
+ * pseudo ops
+ * ----------------------------------------------------------------
+ */
+#define PROC_ENTRY(name) \
+    .text; \
+    .p2align 4; \
+    .global name; \
+    .type name, @function; \
+name:
+
+/*
+ * ----------------------------------------------------------------
+ * aliases for common operations
+ * ----------------------------------------------------------------
+ */
+
+// Move register (even pipe, 2 cycles)
+#define MR(rt, ra)      or rt, ra, ra;
+
+// Move register (odd pipe, 4 cycles)
+#define LMR(rt, ra)     rotqbyi rt, ra, 0;
+
+// return
+#define RETURN()        bi lr;
+
+// hint for a return
+#define HINT_RETURN(ret_label)  hbr ret_label, lr;
+
+// return if zero
+#define BRZ_RETURN(rt)  biz rt, lr;
+
+// return if not zero
+#define BRNZ_RETURN(rt) binz rt, lr;
+
+// return if halfword zero
+#define BRHZ_RETURN(rt) bihz rt, lr;
+
+// return if halfword not zero
+#define BRHNZ_RETURN(rt) bihnz rt, lr;
+
+
+/*
+ * ----------------------------------------------------------------
+ * modulo like things for constant moduli that are powers of 2
+ * ----------------------------------------------------------------
+ */
+
+// rt = ra & (pow2 - 1)
+#define MODULO(rt, ra, pow2) \
+    andi rt, ra, (pow2)-1;
+
+// rt = pow2 - (ra & (pow2 - 1))
+#define MODULO_NEG(rt, ra, pow2) \
+    andi rt, ra, (pow2)-1; \
+    sfi  rt, rt, (pow2);
+
+// rt = ra & -(pow2)
+#define ROUND_DOWN(rt, ra, pow2) \
+    andi rt, ra, -(pow2);
+
+// rt = (ra + (pow2 - 1)) & -(pow2)
+#define ROUND_UP(rt, ra, pow2) \
+    ai   rt, ra, (pow2)-1; \
+    andi rt, rt, -(pow2);
+
+/*
+ * ----------------------------------------------------------------
+ * Splat - replicate a particular slot into all slots
+ * Altivec analogs...
+ * ----------------------------------------------------------------
+ */
+
+// replicate byte from slot s [0,15]
+#define VSPLTB(rt, ra, s) \
+    ilh   _gc_t0, (s)*0x0101; \
+    shufb rt, ra, ra, _gc_t0;
+
+// replicate halfword from slot s [0,7]
+#define VSPLTH(rt, ra, s) \
+    ilh   _gc_t0, 2*(s)*0x0101 + 0x0001; \
+    shufb rt, ra, ra, _gc_t0;
+
+// replicate word from slot s [0,3]
+#define VSPLTW(rt, ra, s) \
+    ilhu  _gc_t0, 4*(s)*0x0101 + 0x0001; \
+    iohl  _gc_t0, 4*(s)*0x0101 + 0x0203; \
+    shufb rt, ra, ra, _gc_t0;
+
+// replicate double from slot s [0,1]
+#define VSPLTD(rt, ra, s) \
+    /* sp is always 16-byte aligned */ \
+    cdd     _gc_t0, 8(sp);    /* 0x10111213 14151617 00010203 04050607 */ \
+    rotqbyi rt, ra, (s) << 3; /* rotate double into preferred slot */ \
+    shufb   rt, rt, rt, _gc_t0;
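+
+/*
+ * For reference, scalar C equivalents of the power-of-2 helpers above
+ * (a sketch assuming pow2 is a power of two; these functions are
+ * illustrative and not part of this header):
+ *
+ *     unsigned modulo(unsigned a, unsigned pow2)     { return a & (pow2 - 1); }
+ *     unsigned modulo_neg(unsigned a, unsigned pow2) { return pow2 - (a & (pow2 - 1)); }
+ *     unsigned round_down(unsigned a, unsigned pow2) { return a & -pow2; }
+ *     unsigned round_up(unsigned a, unsigned pow2)   { return (a + pow2 - 1) & -pow2; }
+ */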
+
+/*
+ * ----------------------------------------------------------------
+ * lots of min/max variations...
+ *
+ * On a slot-by-slot basis, compute the min or max
+ *
+ *   U      - unsigned, else signed
+ *   B,H,{} - byte, halfword, word
+ *   F      - float
+ * ----------------------------------------------------------------
+ */
+
+#define MIN_SELB(rt, ra, rb, rc)  selb rt, ra, rb, rc;
+#define MAX_SELB(rt, ra, rb, rc)  selb rt, rb, ra, rc;
+
+    // words
+
+#define MIN(rt, ra, rb) \
+    cgt  _gc_t0, ra, rb; \
+    MIN_SELB(rt, ra, rb, _gc_t0)
+
+#define MAX(rt, ra, rb) \
+    cgt  _gc_t0, ra, rb; \
+    MAX_SELB(rt, ra, rb, _gc_t0)
+
+#define UMIN(rt, ra, rb) \
+    clgt _gc_t0, ra, rb; \
+    MIN_SELB(rt, ra, rb, _gc_t0)
+
+#define UMAX(rt, ra, rb) \
+    clgt _gc_t0, ra, rb; \
+    MAX_SELB(rt, ra, rb, _gc_t0)
+
+    // bytes
+
+#define MINB(rt, ra, rb) \
+    cgtb _gc_t0, ra, rb; \
+    MIN_SELB(rt, ra, rb, _gc_t0)
+
+#define MAXB(rt, ra, rb) \
+    cgtb _gc_t0, ra, rb; \
+    MAX_SELB(rt, ra, rb, _gc_t0)
+
+#define UMINB(rt, ra, rb) \
+    clgtb _gc_t0, ra, rb; \
+    MIN_SELB(rt, ra, rb, _gc_t0)
+
+#define UMAXB(rt, ra, rb) \
+    clgtb _gc_t0, ra, rb; \
+    MAX_SELB(rt, ra, rb, _gc_t0)
+
+    // halfwords
+
+#define MINH(rt, ra, rb) \
+    cgth _gc_t0, ra, rb; \
+    MIN_SELB(rt, ra, rb, _gc_t0)
+
+#define MAXH(rt, ra, rb) \
+    cgth _gc_t0, ra, rb; \
+    MAX_SELB(rt, ra, rb, _gc_t0)
+
+#define UMINH(rt, ra, rb) \
+    clgth _gc_t0, ra, rb; \
+    MIN_SELB(rt, ra, rb, _gc_t0)
+
+#define UMAXH(rt, ra, rb) \
+    clgth _gc_t0, ra, rb; \
+    MAX_SELB(rt, ra, rb, _gc_t0)
+
+    // floats
+
+#define FMIN(rt, ra, rb) \
+    fcgt _gc_t0, ra, rb; \
+    MIN_SELB(rt, ra, rb, _gc_t0)
+
+#define FMAX(rt, ra, rb) \
+    fcgt _gc_t0, ra, rb; \
+    MAX_SELB(rt, ra, rb, _gc_t0)
+
+// Ignoring the sign, select the values with the minimum magnitude
+#define FMINMAG(rt, ra, rb) \
+    fcmgt _gc_t0, ra, rb; \
+    MIN_SELB(rt, ra, rb, _gc_t0)
+
+// Ignoring the sign, select the values with the maximum magnitude
+#define FMAXMAG(rt, ra, rb) \
+    fcmgt _gc_t0, ra, rb; \
+    MAX_SELB(rt, ra, rb, _gc_t0)
+
+
+#endif /* INCLUDED_GC_SPU_MACS_H */

diff --git a/volk/spu_lib/spu_16s_cmpgt_unaligned.c b/volk/spu_lib/spu_16s_cmpgt_unaligned.c
new file mode 100644
index 000000000..765cacd9a
--- /dev/null
+++ b/volk/spu_lib/spu_16s_cmpgt_unaligned.c
@@ -0,0 +1,160 @@
+#include <spu_intrinsics.h>
+
+void* libvector_16s_cmpgt_unaligned(void* target, void* src, signed short val, unsigned int num_bytes){
+  //loop iterator i
+  int i = 0;
+  void* retval = target;
+
+  //put the target and source addresses into qwords
+  vector unsigned int address_counter_tgt = {(unsigned int)target, 0, 0, 0};
+  vector unsigned int address_counter_src = {(unsigned int)src, 0, 0, 0};
+
+  //create shuffle masks
+
+  //shuffle mask building blocks:
+  //all from the first vector
+  vector unsigned char oneup = {0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
+                                0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f};
+  //all from the second vector
+  vector unsigned char second_oneup = {0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17,
+                                       0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f};
+
+  //gamma: second half of the second, first half of the first, break at (unsigned int)src%16
+  vector unsigned char src_cmp = spu_splats((unsigned char)((unsigned int)src%16));
+  vector unsigned char gt_res = spu_cmpgt(oneup, src_cmp);
+  vector unsigned char eq_res = spu_cmpeq(oneup, src_cmp);
+  vector unsigned char cmp_res = spu_or(gt_res, eq_res);
+  vector unsigned char sixteen_uchar = spu_splats((unsigned char)16);
+  vector unsigned char phase_change = spu_and(sixteen_uchar, cmp_res);
+  vector unsigned int shuffle_mask_gamma = spu_add((vector unsigned int)phase_change,
+                                                   (vector unsigned int)oneup);
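+  /* What this builds: oneup holds the byte indices 0..15.  Slots whose
+     index is >= src%16 get 16 added (phase_change), so the vector now
+     reads {0, .., k-1, k+16, .., 31} with k = src%16.  The rotate below
+     turns it into {k+16, .., 31, 0, .., k-1}: a shuffle control word
+     that glues the tail of the previous quadword to the head of the
+     next one, i.e. a single unaligned 16-byte load. */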
+  shuffle_mask_gamma = spu_rlqwbyte(shuffle_mask_gamma, (unsigned int)src%16);
+
+  vector unsigned char tgt_second = spu_rlqwbyte(second_oneup, -((unsigned int)target%16));
+  vector unsigned char tgt_first = spu_rlqwbyte(oneup, -((unsigned int)target%16));
+
+  //alpha: first half of first, second half of second, break at (unsigned int)target%16
+  src_cmp = spu_splats((unsigned char)((unsigned int)target%16));
+  gt_res = spu_cmpgt(oneup, src_cmp);
+  eq_res = spu_cmpeq(oneup, src_cmp);
+  cmp_res = spu_or(gt_res, eq_res);
+  phase_change = spu_and(sixteen_uchar, cmp_res);
+  vector unsigned int shuffle_mask_alpha = spu_add((vector unsigned int)phase_change,
+                                                   (vector unsigned int)oneup);
+
+  //delta: first half of first, first half of second, break at (unsigned int)target%16
+  vector unsigned char shuffle_mask_delta = spu_shuffle(oneup, tgt_second, (vector unsigned char)shuffle_mask_alpha);
+  //epsilon: second half of second, second half of first, break at (unsigned int)target%16
+  vector unsigned char shuffle_mask_epsilon = spu_shuffle(tgt_second, oneup, (vector unsigned char)shuffle_mask_alpha);
+  //zeta: second half of second, first half of first, break at 16 - (unsigned int)target%16
+  vector unsigned int shuffle_mask_zeta = spu_rlqwbyte(shuffle_mask_alpha, (unsigned int)target%16);
+
+  //beta: first half of first, second half of second, break at num_bytes%16
+  src_cmp = spu_splats((unsigned char)(num_bytes%16));
+  gt_res = spu_cmpgt(oneup, src_cmp);
+  eq_res = spu_cmpeq(oneup, src_cmp);
+  cmp_res = spu_or(gt_res, eq_res);
+  phase_change = spu_and(sixteen_uchar, cmp_res);
+  vector unsigned int shuffle_mask_beta = spu_add((vector unsigned int)phase_change,
+                                                  (vector unsigned int)oneup);
+
+  qword src_past;
+  qword src_present;
+  qword tgt_past;
+  qword tgt_present;
+
+  qword in_temp;
+  qword out_temp0;
+  qword out_temp1;
+
+  src_past = si_lqd((qword)address_counter_src, 0);
+  tgt_past = si_lqd((qword)address_counter_tgt, 0);
+
+  vector signed short vec_val = spu_splats(val);
+  vector unsigned short compare;
+  vector unsigned short ones = {1, 1, 1, 1, 1, 1, 1, 1};
+  vector unsigned short after_and;
+
+  for(i = 0; i < num_bytes/16; ++i) {
+
+    src_present = si_lqd((qword)address_counter_src, 16);
+    tgt_present = si_lqd((qword)address_counter_tgt, 16);
+
+    in_temp = spu_shuffle(src_present, src_past, (vector unsigned char)shuffle_mask_gamma);
+
+    compare = spu_cmpgt((vector signed short) in_temp, vec_val);
+    after_and = spu_and(compare, ones);
+
+    out_temp0 = spu_shuffle(tgt_past, (qword)after_and, shuffle_mask_delta);
+    out_temp1 = spu_shuffle(tgt_present, (qword)after_and, shuffle_mask_epsilon);
+
+    si_stqd(out_temp0, (qword)address_counter_tgt, 0);
+    si_stqd(out_temp1, (qword)address_counter_tgt, 16);
+
+    tgt_past = out_temp1;
+    src_past = src_present;
+    address_counter_src = spu_add(address_counter_src, 16);
+    address_counter_tgt = spu_add(address_counter_tgt, 16);
+  }
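+  /* Tail handling: the loop above only stores whole quadwords.  The
+     remaining num_bytes%16 result bytes are merged (via the beta and
+     zeta masks) with the bytes already present in the target's last
+     quadword, so memory past num_bytes is left untouched. */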
+
+  src_present = si_lqd((qword)address_counter_src, 16);
+  tgt_present = si_lqd((qword)address_counter_tgt, 16);
+
+  in_temp = spu_shuffle(src_present, src_past, (vector unsigned char) shuffle_mask_gamma);
+
+  compare = spu_cmpgt((vector signed short) in_temp, vec_val);
+  after_and = spu_and(compare, ones);
+
+  qword target_temp = spu_shuffle(tgt_present, tgt_past, (vector unsigned char) shuffle_mask_zeta);
+  qword meld = spu_shuffle((qword)after_and, target_temp, (vector unsigned char)shuffle_mask_beta);
+
+  out_temp0 = spu_shuffle(tgt_past, meld, shuffle_mask_delta);
+  out_temp1 = spu_shuffle(tgt_present, meld, shuffle_mask_epsilon);
+
+  si_stqd(out_temp0, (qword)address_counter_tgt, 0);
+  si_stqd(out_temp1, (qword)address_counter_tgt, 16);
+
+  return retval;
+}
+
+/*
+int main(){
+
+  signed short pooh[48];
+  signed short bear[48];
+
+  int i = 0;
+  for(i = 0; i < 48; i += 2){
+    bear[i] = i;
+    bear[i + 1] = -i;
+  }
+
+  vector_gt_16bit(&pooh[0], &bear[0], 0, 48 * sizeof(signed short));
+
+  for(i = 0; i < 48; ++i) {
+    printf("%d, ", pooh[i]);
+  }
+  printf("\n");
+}
+*/
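For reference, the kernel above computes the same thing as this scalar C
sketch (illustrative; the name ref_16s_cmpgt is made up here):

    #include <stddef.h>

    /* tgt[i] = (src[i] > val) ? 1 : 0, over num_bytes of 16-bit data */
    void ref_16s_cmpgt(signed short *tgt, const signed short *src,
                       signed short val, size_t num_bytes)
    {
        size_t i;
        for (i = 0; i < num_bytes / sizeof(signed short); i++)
            tgt[i] = (src[i] > val) ? 1 : 0;
    }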

diff --git a/volk/spu_lib/spu_16s_vector_subtract_unaligned.c b/volk/spu_lib/spu_16s_vector_subtract_unaligned.c
new file mode 100644
index 000000000..a3ce6c2fe
--- /dev/null
+++ b/volk/spu_lib/spu_16s_vector_subtract_unaligned.c
@@ -0,0 +1,178 @@
+#include <spu_intrinsics.h>
+
+void* libvector_16s_vector_subtract_unaligned(void* target, void* src0, void* src1, unsigned int num_bytes){
+  //loop iterator i
+  int i = 0;
+  void* retval = target;
+
+  //put the target and source addresses into qwords
+  vector unsigned int address_counter_tgt = {(unsigned int)target, 0, 0, 0};
+  vector unsigned int address_counter_src0 = {(unsigned int)src0, 0, 0, 0};
+  vector unsigned int address_counter_src1 = {(unsigned int)src1, 0, 0, 0};
+
+  //create shuffle masks
+
+  //shuffle mask building blocks:
+  //all from the first vector
+  vector unsigned char oneup = {0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
+                                0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f};
+  //all from the second vector
+  vector unsigned char second_oneup = {0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17,
+                                       0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f};
+
+  //gamma: second half of the second, first half of the first, break at (unsigned int)src0%16
+  vector unsigned char src_cmp = spu_splats((unsigned char)((unsigned int)src0%16));
+  vector unsigned char gt_res = spu_cmpgt(oneup, src_cmp);
+  vector unsigned char eq_res = spu_cmpeq(oneup, src_cmp);
+  vector unsigned char cmp_res = spu_or(gt_res, eq_res);
+  vector unsigned char sixteen_uchar = spu_splats((unsigned char)16);
+  vector unsigned char phase_change = spu_and(sixteen_uchar, cmp_res);
+  vector unsigned int shuffle_mask_gamma = spu_add((vector unsigned int)phase_change,
+                                                   (vector unsigned int)oneup);
+  shuffle_mask_gamma = spu_rlqwbyte(shuffle_mask_gamma, (unsigned int)src0%16);
+
+  //eta: second half of the second, first half of the first, break at (unsigned int)src1%16
+  src_cmp = spu_splats((unsigned char)((unsigned int)src1%16));
+  gt_res = spu_cmpgt(oneup, src_cmp);
+  eq_res = spu_cmpeq(oneup, src_cmp);
+  cmp_res = spu_or(gt_res, eq_res);
+  sixteen_uchar = spu_splats((unsigned char)16);
+  phase_change = spu_and(sixteen_uchar, cmp_res);
+  vector unsigned int shuffle_mask_eta = spu_add((vector unsigned int)phase_change,
+                                                 (vector unsigned int)oneup);
+  shuffle_mask_eta = spu_rlqwbyte(shuffle_mask_eta, (unsigned int)src1%16);
+
+  vector unsigned char tgt_second = spu_rlqwbyte(second_oneup, -((unsigned int)target%16));
+  vector unsigned char tgt_first = spu_rlqwbyte(oneup, -((unsigned int)target%16));
+
+  //alpha: first half of first, second half of second, break at (unsigned int)target%16
+  src_cmp = spu_splats((unsigned char)((unsigned int)target%16));
+  gt_res = spu_cmpgt(oneup, src_cmp);
+  eq_res = spu_cmpeq(oneup, src_cmp);
+  cmp_res = spu_or(gt_res, eq_res);
+  phase_change = spu_and(sixteen_uchar, cmp_res);
+  vector unsigned int shuffle_mask_alpha = spu_add((vector unsigned int)phase_change,
+                                                   (vector unsigned int)oneup);
+
+  //delta: first half of first, first half of second, break at (unsigned int)target%16
+  vector unsigned char shuffle_mask_delta = spu_shuffle(oneup, tgt_second, (vector unsigned char)shuffle_mask_alpha);
+  //epsilon: second half of second, second half of first, break at (unsigned int)target%16
+  vector unsigned char shuffle_mask_epsilon = spu_shuffle(tgt_second, oneup, (vector unsigned char)shuffle_mask_alpha);
+  //zeta: second half of second, first half of first, break at 16 - (unsigned int)target%16
+  vector unsigned int shuffle_mask_zeta = spu_rlqwbyte(shuffle_mask_alpha, (unsigned int)target%16);
+
+  //beta: first half of first, second half of second, break at num_bytes%16
+  src_cmp = spu_splats((unsigned char)(num_bytes%16));
+  gt_res = spu_cmpgt(oneup, src_cmp);
+  eq_res = spu_cmpeq(oneup, src_cmp);
+  cmp_res = spu_or(gt_res, eq_res);
+  phase_change = spu_and(sixteen_uchar, cmp_res);
+  vector unsigned int shuffle_mask_beta = spu_add((vector unsigned int)phase_change,
+                                                  (vector unsigned int)oneup);
+
+  qword src0_past;
+  qword src0_present;
+  qword src1_past;
+  qword src1_present;
+  qword tgt_past;
+  qword tgt_present;
+
+  qword in_temp0;
+  qword in_temp1;
+  qword out_temp0;
+  qword out_temp1;
+
+  vector signed short sum;
+
+  src0_past = si_lqd((qword)address_counter_src0, 0);
+  src1_past = si_lqd((qword)address_counter_src1, 0);
+  tgt_past = si_lqd((qword)address_counter_tgt, 0);
+
+  for(i = 0; i < num_bytes/16; ++i) {
+
+    src0_present = si_lqd((qword)address_counter_src0, 16);
+    src1_present = si_lqd((qword)address_counter_src1, 16);
+    tgt_present = si_lqd((qword)address_counter_tgt, 16);
+
+    in_temp0 = spu_shuffle(src0_present, src0_past, (vector unsigned char)shuffle_mask_gamma);
+    in_temp1 = spu_shuffle(src1_present, src1_past, (vector unsigned char)shuffle_mask_eta);
+
+    sum = spu_sub((vector signed short)in_temp0, (vector signed short)in_temp1);
+
+    out_temp0 = spu_shuffle(tgt_past, (qword)sum, shuffle_mask_delta);
+    out_temp1 = spu_shuffle(tgt_present, (qword)sum, shuffle_mask_epsilon);
+
+    si_stqd(out_temp0, (qword)address_counter_tgt, 0);
+    si_stqd(out_temp1, (qword)address_counter_tgt, 16);
+
+    tgt_past = out_temp1;
+    src0_past = src0_present;
+    src1_past = src1_present;
+    address_counter_src0 = spu_add(address_counter_src0, 16);
+    address_counter_src1 = spu_add(address_counter_src1, 16);
+    address_counter_tgt = spu_add(address_counter_tgt, 16);
+  }
+
+  src0_present = si_lqd((qword)address_counter_src0, 16);
+  src1_present = si_lqd((qword)address_counter_src1, 16);
+  tgt_present = si_lqd((qword)address_counter_tgt, 16);
+
+  in_temp0 = spu_shuffle(src0_present, src0_past, (vector unsigned char) shuffle_mask_gamma);
+  in_temp1 = spu_shuffle(src1_present, src1_past, (vector unsigned char) shuffle_mask_eta);
+  sum = spu_sub((vector signed short)in_temp0, (vector signed short)in_temp1);
+  qword target_temp = spu_shuffle(tgt_present, tgt_past, (vector unsigned char) shuffle_mask_zeta);
+  qword meld = spu_shuffle((qword)sum, target_temp, (vector unsigned char)shuffle_mask_beta);
+
+  out_temp0 = spu_shuffle(tgt_past, meld, shuffle_mask_delta);
+  out_temp1 = spu_shuffle(tgt_present, meld, shuffle_mask_epsilon);
+
+  si_stqd(out_temp0, (qword)address_counter_tgt, 0);
+  si_stqd(out_temp1, (qword)address_counter_tgt, 16);
+
+  return retval;
+}
+
+/*
+int main(){
+
+  signed short pooh[48];
+  signed short bear[48];
+  signed short res[48];
+
+  int i = 0;
+  for(i = 0; i < 48; ++i){
+    pooh[i] = i;
+  }
+  for(i = 48; i < 96; ++i){
+    bear[i - 48] = i;
+  }
+
+  vector_subtract_16bit(res, &pooh[0], &bear[0], 48 * sizeof(signed short));
+
+  for(i = 0; i < 48; ++i) {
+    printf("%d, ", res[i]);
+  }
+  printf("\n");
+}
+*/
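Scalar equivalent of the kernel above (again a sketch with a made-up name):

    #include <stddef.h>

    /* tgt[i] = src0[i] - src1[i], over num_bytes of 16-bit data */
    void ref_16s_subtract(signed short *tgt, const signed short *src0,
                          const signed short *src1, size_t num_bytes)
    {
        size_t i;
        for (i = 0; i < num_bytes / sizeof(signed short); i++)
            tgt[i] = src0[i] - src1[i];
    }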

diff --git a/volk/spu_lib/spu_16s_vector_sum_unaligned.c b/volk/spu_lib/spu_16s_vector_sum_unaligned.c
new file mode 100644
index 000000000..5a1cb9aaf
--- /dev/null
+++ b/volk/spu_lib/spu_16s_vector_sum_unaligned.c
@@ -0,0 +1,178 @@
+#include <spu_intrinsics.h>
+
+void* libvector_16s_vector_sum_unaligned(void* target, void* src0, void* src1, unsigned int num_bytes){
+  //loop iterator i
+  int i = 0;
+  void* retval = target;
+
+  //put the target and source addresses into qwords
+  vector unsigned int address_counter_tgt = {(unsigned int)target, 0, 0, 0};
+  vector unsigned int address_counter_src0 = {(unsigned int)src0, 0, 0, 0};
+  vector unsigned int address_counter_src1 = {(unsigned int)src1, 0, 0, 0};
+
+  //create shuffle masks
+
+  //shuffle mask building blocks:
+  //all from the first vector
+  vector unsigned char oneup = {0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
+                                0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f};
+  //all from the second vector
+  vector unsigned char second_oneup = {0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17,
+                                       0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f};
+
+  //gamma: second half of the second, first half of the first, break at (unsigned int)src0%16
+  vector unsigned char src_cmp = spu_splats((unsigned char)((unsigned int)src0%16));
+  vector unsigned char gt_res = spu_cmpgt(oneup, src_cmp);
+  vector unsigned char eq_res = spu_cmpeq(oneup, src_cmp);
+  vector unsigned char cmp_res = spu_or(gt_res, eq_res);
+  vector unsigned char sixteen_uchar = spu_splats((unsigned char)16);
+  vector unsigned char phase_change = spu_and(sixteen_uchar, cmp_res);
+  vector unsigned int shuffle_mask_gamma = spu_add((vector unsigned int)phase_change,
+                                                   (vector unsigned int)oneup);
+  shuffle_mask_gamma = spu_rlqwbyte(shuffle_mask_gamma, (unsigned int)src0%16);
+
+  //eta: second half of the second, first half of the first, break at (unsigned int)src1%16
+  src_cmp = spu_splats((unsigned char)((unsigned int)src1%16));
+  gt_res = spu_cmpgt(oneup, src_cmp);
+  eq_res = spu_cmpeq(oneup, src_cmp);
+  cmp_res = spu_or(gt_res, eq_res);
+  sixteen_uchar = spu_splats((unsigned char)16);
+  phase_change = spu_and(sixteen_uchar, cmp_res);
+  vector unsigned int shuffle_mask_eta = spu_add((vector unsigned int)phase_change,
+                                                 (vector unsigned int)oneup);
+  shuffle_mask_eta = spu_rlqwbyte(shuffle_mask_eta, (unsigned int)src1%16);
+
+  vector unsigned char tgt_second = spu_rlqwbyte(second_oneup, -((unsigned int)target%16));
+  vector unsigned char tgt_first = spu_rlqwbyte(oneup, -((unsigned int)target%16));
+
+  //alpha: first half of first, second half of second, break at (unsigned int)target%16
+  src_cmp = spu_splats((unsigned char)((unsigned int)target%16));
+  gt_res = spu_cmpgt(oneup, src_cmp);
+  eq_res = spu_cmpeq(oneup, src_cmp);
+  cmp_res = spu_or(gt_res, eq_res);
+  phase_change = spu_and(sixteen_uchar, cmp_res);
+  vector unsigned int shuffle_mask_alpha = spu_add((vector unsigned int)phase_change,
+                                                   (vector unsigned int)oneup);
+
+  //delta: first half of first, first half of second, break at (unsigned int)target%16
+  vector unsigned char shuffle_mask_delta = spu_shuffle(oneup, tgt_second, (vector unsigned char)shuffle_mask_alpha);
+  //epsilon: second half of second, second half of first, break at (unsigned int)target%16
+  vector unsigned char shuffle_mask_epsilon = spu_shuffle(tgt_second, oneup, (vector unsigned char)shuffle_mask_alpha);
+  //zeta: second half of second, first half of first, break at 16 - (unsigned int)target%16
+  vector unsigned int shuffle_mask_zeta = spu_rlqwbyte(shuffle_mask_alpha, (unsigned int)target%16);
+
+  //beta: first half of first, second half of second, break at num_bytes%16
+  src_cmp = spu_splats((unsigned char)(num_bytes%16));
+  gt_res = spu_cmpgt(oneup, src_cmp);
+  eq_res = spu_cmpeq(oneup, src_cmp);
+  cmp_res = spu_or(gt_res, eq_res);
+  phase_change = spu_and(sixteen_uchar, cmp_res);
+  vector unsigned int shuffle_mask_beta = spu_add((vector unsigned int)phase_change,
+                                                  (vector unsigned int)oneup);
+
+  qword src0_past;
+  qword src0_present;
+  qword src1_past;
+  qword src1_present;
+  qword tgt_past;
+  qword tgt_present;
+
+  qword in_temp0;
+  qword in_temp1;
+  qword out_temp0;
+  qword out_temp1;
+
+  vector signed short sum;
+
+  src0_past = si_lqd((qword)address_counter_src0, 0);
+  src1_past = si_lqd((qword)address_counter_src1, 0);
+  tgt_past = si_lqd((qword)address_counter_tgt, 0);
+
+  for(i = 0; i < num_bytes/16; ++i) {
+
+    src0_present = si_lqd((qword)address_counter_src0, 16);
+    src1_present = si_lqd((qword)address_counter_src1, 16);
+    tgt_present = si_lqd((qword)address_counter_tgt, 16);
+
+    in_temp0 = spu_shuffle(src0_present, src0_past, (vector unsigned char)shuffle_mask_gamma);
+    in_temp1 = spu_shuffle(src1_present, src1_past, (vector unsigned char)shuffle_mask_eta);
+
+    sum = spu_add((vector signed short)in_temp0, (vector signed short)in_temp1);
+
+    out_temp0 = spu_shuffle(tgt_past, (qword)sum, shuffle_mask_delta);
+    out_temp1 = spu_shuffle(tgt_present, (qword)sum, shuffle_mask_epsilon);
+
+    si_stqd(out_temp0, (qword)address_counter_tgt, 0);
+    si_stqd(out_temp1, (qword)address_counter_tgt, 16);
+
+    tgt_past = out_temp1;
+    src0_past = src0_present;
+    src1_past = src1_present;
+    address_counter_src0 = spu_add(address_counter_src0, 16);
+    address_counter_src1 = spu_add(address_counter_src1, 16);
+    address_counter_tgt = spu_add(address_counter_tgt, 16);
+  }
+
+  src0_present = si_lqd((qword)address_counter_src0, 16);
+  src1_present = si_lqd((qword)address_counter_src1, 16);
+  tgt_present = si_lqd((qword)address_counter_tgt, 16);
+
+  in_temp0 = spu_shuffle(src0_present, src0_past, (vector unsigned char) shuffle_mask_gamma);
+  in_temp1 = spu_shuffle(src1_present, src1_past, (vector unsigned char) shuffle_mask_eta);
+  sum = spu_add((vector signed short)in_temp0, (vector signed short)in_temp1);
+  qword target_temp = spu_shuffle(tgt_present, tgt_past, (vector unsigned char) shuffle_mask_zeta);
+  qword meld = spu_shuffle((qword)sum, target_temp, (vector unsigned char)shuffle_mask_beta);
+
+  out_temp0 = spu_shuffle(tgt_past, meld, shuffle_mask_delta);
+  out_temp1 = spu_shuffle(tgt_present, meld, shuffle_mask_epsilon);
+
+  si_stqd(out_temp0, (qword)address_counter_tgt, 0);
+  si_stqd(out_temp1, (qword)address_counter_tgt, 16);
+
+  return retval;
+}
+
+/*
+int main(){
+
+  signed short pooh[48];
+  signed short bear[48];
+  signed short res[48];
+
+  int i = 0;
+  for(i = 0; i < 48; ++i){
+    pooh[i] = i;
+  }
+  for(i = 48; i < 96; ++i){
+    bear[i - 48] = i;
+  }
+
+  vector_sum(&pooh[9], &pooh[9], &bear[3], 30);
+
+  for(i = 0; i < 48; ++i) {
+    printf("%d, ", pooh[i]);
+  }
+  printf("\n");
+}
+*/
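The matching scalar form for the sum kernel (sketch, illustrative name):

    #include <stddef.h>

    /* tgt[i] = src0[i] + src1[i], over num_bytes of 16-bit data */
    void ref_16s_sum(signed short *tgt, const signed short *src0,
                     const signed short *src1, size_t num_bytes)
    {
        size_t i;
        for (i = 0; i < num_bytes / sizeof(signed short); i++)
            tgt[i] = src0[i] + src1[i];
    }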

diff --git a/volk/spu_lib/spu_32fc_pointwise_multiply_unaligned.c b/volk/spu_lib/spu_32fc_pointwise_multiply_unaligned.c
new file mode 100644
index 000000000..58fd4aa0c
--- /dev/null
+++ b/volk/spu_lib/spu_32fc_pointwise_multiply_unaligned.c
@@ -0,0 +1,222 @@
+#include <spu_intrinsics.h>
+
+void* libvector_pointwise_multiply_32fc_unaligned(void* target, void* src0, void* src1, unsigned int num_bytes){
+  //loop iterator i
+  int i = 0;
+  void* retval = target;
+
+  //put the target and source addresses into qwords
+  vector unsigned int address_counter_tgt = {(unsigned int)target, 0, 0, 0};
+  vector unsigned int address_counter_src0 = {(unsigned int)src0, 0, 0, 0};
+  vector unsigned int address_counter_src1 = {(unsigned int)src1, 0, 0, 0};
+
+  //create shuffle masks
+
+  //shuffle mask building blocks:
+  //all from the first vector
+  vector unsigned char oneup = {0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
+                                0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f};
+  //all from the second vector
+  vector unsigned char second_oneup = {0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17,
+                                       0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f};
+
+  //gamma: second half of the second, first half of the first, break at (unsigned int)src0%16
+  vector unsigned char src_cmp = spu_splats((unsigned char)((unsigned int)src0%16));
+  vector unsigned char gt_res = spu_cmpgt(oneup, src_cmp);
+  vector unsigned char eq_res = spu_cmpeq(oneup, src_cmp);
+  vector unsigned char cmp_res = spu_or(gt_res, eq_res);
+  vector unsigned char sixteen_uchar = spu_splats((unsigned char)16);
+  vector unsigned char phase_change = spu_and(sixteen_uchar, cmp_res);
+  vector unsigned int shuffle_mask_gamma = spu_add((vector unsigned int)phase_change,
+                                                   (vector unsigned int)oneup);
+  shuffle_mask_gamma = spu_rlqwbyte(shuffle_mask_gamma, (unsigned int)src0%16);
+
+  //eta: second half of the second, first half of the first, break at (unsigned int)src1%16
+  src_cmp = spu_splats((unsigned char)((unsigned int)src1%16));
+  gt_res = spu_cmpgt(oneup, src_cmp);
+  eq_res = spu_cmpeq(oneup, src_cmp);
+  cmp_res = spu_or(gt_res, eq_res);
+  sixteen_uchar = spu_splats((unsigned char)16);
+  phase_change = spu_and(sixteen_uchar, cmp_res);
+  vector unsigned int shuffle_mask_eta = spu_add((vector unsigned int)phase_change,
+                                                 (vector unsigned int)oneup);
+  shuffle_mask_eta = spu_rlqwbyte(shuffle_mask_eta, (unsigned int)src1%16);
+
+  vector unsigned char tgt_second = spu_rlqwbyte(second_oneup, -((unsigned int)target%16));
+  vector unsigned char tgt_first = spu_rlqwbyte(oneup, -((unsigned int)target%16));
+
+  //alpha: first half of first, second half of second, break at (unsigned int)target%16
+  src_cmp = spu_splats((unsigned char)((unsigned int)target%16));
+  gt_res = spu_cmpgt(oneup, src_cmp);
+  eq_res = spu_cmpeq(oneup, src_cmp);
+  cmp_res = spu_or(gt_res, eq_res);
+  phase_change = spu_and(sixteen_uchar, cmp_res);
+  vector unsigned int shuffle_mask_alpha = spu_add((vector unsigned int)phase_change,
+                                                   (vector unsigned int)oneup);
+
+  //delta: first half of first, first half of second, break at (unsigned int)target%16
+  vector unsigned char shuffle_mask_delta = spu_shuffle(oneup, tgt_second, (vector unsigned char)shuffle_mask_alpha);
+  //epsilon: second half of second, second half of first, break at (unsigned int)target%16
+  vector unsigned char shuffle_mask_epsilon = spu_shuffle(tgt_second, oneup, (vector unsigned char)shuffle_mask_alpha);
+  //zeta: second half of second, first half of first, break at 16 - (unsigned int)target%16
+  vector unsigned int shuffle_mask_zeta = spu_rlqwbyte(shuffle_mask_alpha, (unsigned int)target%16);
+
+  //beta: first half of first, second half of second, break at num_bytes%16
+  src_cmp = spu_splats((unsigned char)(num_bytes%16));
+  gt_res = spu_cmpgt(oneup, src_cmp);
+  eq_res = spu_cmpeq(oneup, src_cmp);
+  cmp_res = spu_or(gt_res, eq_res);
+  phase_change = spu_and(sixteen_uchar, cmp_res);
+  vector unsigned int shuffle_mask_beta = spu_add((vector unsigned int)phase_change,
+                                                  (vector unsigned int)oneup);
+
+  qword src0_past;
+  qword src0_present;
+  qword src1_past;
+  qword src1_present;
+  qword tgt_past;
+  qword tgt_present;
+
+  qword in_temp0;
+  qword in_temp1;
+  qword out_temp0;
+  qword out_temp1;
+
+  src0_past = si_lqd((qword)address_counter_src0, 0);
+  src1_past = si_lqd((qword)address_counter_src1, 0);
+  tgt_past = si_lqd((qword)address_counter_tgt, 0);
+
+  vector unsigned char shuffle_mask_complexprod0 = {0x04, 0x05, 0x06, 0x07, 0x00, 0x01, 0x02, 0x03,
+                                                    0x0c, 0x0d, 0x0e, 0x0f, 0x08, 0x09, 0x0a, 0x0b};
+  vector unsigned char shuffle_mask_complexprod1 = {0x00, 0x01, 0x02, 0x03, 0x10, 0x11, 0x12, 0x13,
+                                                    0x08, 0x09, 0x0a, 0x0b, 0x18, 0x19, 0x1a, 0x1b};
+  vector unsigned char shuffle_mask_complexprod2 = {0x04, 0x05, 0x06, 0x07, 0x14, 0x15, 0x16, 0x17,
+                                                    0x0c, 0x0d, 0x0e, 0x0f, 0x1c, 0x1d, 0x1e, 0x1f};
+  vector unsigned char sign_changer = {0x00, 0x00, 0x00, 0x00, 0x80, 0x00, 0x00, 0x00,
+                                       0x00, 0x00, 0x00, 0x00, 0x80, 0x00, 0x00, 0x00};
+
+  vector float prod0;
+  qword shuf0;
+  vector float prod1;
+  vector float sign_change;
+  qword summand0;
+  qword summand1;
+  vector float sum;
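+  /* Complex multiply, SIMD-style: with a = (ar, ai) and b = (br, bi)
+     packed as interleaved floats,
+       prod0 = (ar*br, ai*bi, ...)   element-wise a*b
+       shuf0 = b with re/im swapped within each complex value
+       prod1 = (ar*bi, ai*br, ...)   element-wise a*swap(b)
+     sign_changer flips the sign of every second float of prod0, and the
+     two summand shuffles interleave the pieces so that
+       sum = (ar*br - ai*bi, ar*bi + ai*br, ...),
+     i.e. the complex products. */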
+
+  for(i = 0; i < num_bytes/16; ++i) {
+
+    src0_present = si_lqd((qword)address_counter_src0, 16);
+    src1_present = si_lqd((qword)address_counter_src1, 16);
+    tgt_present = si_lqd((qword)address_counter_tgt, 16);
+
+    in_temp0 = spu_shuffle(src0_present, src0_past, (vector unsigned char)shuffle_mask_gamma);
+    in_temp1 = spu_shuffle(src1_present, src1_past, (vector unsigned char)shuffle_mask_eta);
+
+    prod0 = spu_mul((vector float)in_temp0, (vector float)in_temp1);
+    shuf0 = spu_shuffle((qword)in_temp1, (qword)in_temp1, shuffle_mask_complexprod0);
+    prod1 = spu_mul((vector float)in_temp0, (vector float)shuf0);
+    sign_change = spu_xor(prod0, (vector float)sign_changer);
+
+    summand0 = spu_shuffle((qword)sign_change, (qword)prod1, shuffle_mask_complexprod1);
+    summand1 = spu_shuffle((qword)sign_change, (qword)prod1, shuffle_mask_complexprod2);
+
+    sum = spu_add((vector float)summand0, (vector float)summand1);
+
+    out_temp0 = spu_shuffle(tgt_past, (qword)sum, shuffle_mask_delta);
+    out_temp1 = spu_shuffle(tgt_present, (qword)sum, shuffle_mask_epsilon);
+
+    si_stqd(out_temp0, (qword)address_counter_tgt, 0);
+    si_stqd(out_temp1, (qword)address_counter_tgt, 16);
+
+    tgt_past = out_temp1;
+    src0_past = src0_present;
+    src1_past = src1_present;
+    address_counter_src0 = spu_add(address_counter_src0, 16);
+    address_counter_src1 = spu_add(address_counter_src1, 16);
+    address_counter_tgt = spu_add(address_counter_tgt, 16);
+  }
+
+  src0_present = si_lqd((qword)address_counter_src0, 16);
+  src1_present = si_lqd((qword)address_counter_src1, 16);
+  tgt_present = si_lqd((qword)address_counter_tgt, 16);
+
+  in_temp0 = spu_shuffle(src0_present, src0_past, (vector unsigned char) shuffle_mask_gamma);
+  in_temp1 = spu_shuffle(src1_present, src1_past, (vector unsigned char) shuffle_mask_eta);
+
+  prod0 = spu_mul((vector float)in_temp0, (vector float)in_temp1);
+  shuf0 = spu_shuffle((qword)in_temp1, (qword)in_temp1, shuffle_mask_complexprod0);
+  prod1 = spu_mul((vector float)in_temp0, (vector float)shuf0);
+  sign_change = spu_xor(prod0, (vector float)sign_changer);
+  summand0 = spu_shuffle((qword)sign_change, (qword)prod1, shuffle_mask_complexprod1);
+  summand1 = spu_shuffle((qword)sign_change, (qword)prod1, shuffle_mask_complexprod2);
+  sum = spu_add((vector float)summand0, (vector float)summand1);
+
+  qword target_temp = spu_shuffle(tgt_present, tgt_past, (vector unsigned char) shuffle_mask_zeta);
+  qword meld = spu_shuffle((qword)sum, target_temp, (vector unsigned char)shuffle_mask_beta);
+
+  out_temp0 = spu_shuffle(tgt_past, meld, shuffle_mask_delta);
+  out_temp1 = spu_shuffle(tgt_present, meld, shuffle_mask_epsilon);
+
+  si_stqd(out_temp0, (qword)address_counter_tgt, 0);
+  si_stqd(out_temp1, (qword)address_counter_tgt, 16);
+
+  return retval;
+}
+
+/*
+int main(){
+
+  float pooh[48];
+  float bear[48];
+  float res[48];
+
+  int i = 0;
+  for(i = 0; i < 48; ++i){
+    pooh[i] = (float) i;
+  }
+  for(i = 48; i < 96; ++i){
+    bear[i - 48] = (float) i;
+  }
+
+  vector_product_complex(res, pooh, bear, 48*sizeof(float));
+
+  for(i = 0; i < 48; ++i) {
+    printf("%f, ", res[i]);
+  }
+  printf("\n");
+}
+*/
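A scalar reference for the complex pointwise product (sketch; data is
interleaved re/im floats, and the name is made up here):

    #include <stddef.h>

    void ref_32fc_multiply(float *tgt, const float *a, const float *b,
                           size_t num_bytes)
    {
        size_t i, n = num_bytes / (2 * sizeof(float)); /* complex count */
        for (i = 0; i < n; i++) {
            float ar = a[2*i], ai = a[2*i+1];
            float br = b[2*i], bi = b[2*i+1];
            tgt[2*i]   = ar*br - ai*bi;
            tgt[2*i+1] = ar*bi + ai*br;
        }
    }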

diff --git a/volk/spu_lib/spu_memcpy_unaligned.c b/volk/spu_lib/spu_memcpy_unaligned.c
new file mode 100644
index 000000000..2a0dabcd7
--- /dev/null
+++ b/volk/spu_lib/spu_memcpy_unaligned.c
@@ -0,0 +1,290 @@
+#include <spu_intrinsics.h>
+
+void* libvector_memcpy_unaligned(void* target, void* src, unsigned int num_bytes){
+  //loop iterator i
+  int i = 0;
+  void* retval = target;
+
+  //put the target and source addresses into qwords
+  vector unsigned int address_counter_tgt = {(unsigned int)target, 0, 0, 0};
+  vector unsigned int address_counter_src = {(unsigned int)src, 0, 0, 0};
+
+  //create shuffle masks
+
+  //shuffle mask building blocks:
+  //all from the first vector
+  vector unsigned char oneup = {0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
+                                0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f};
+  //all from the second vector
+  vector unsigned char second_oneup = {0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17,
+                                       0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f};
+
+  //gamma: second half of the second, first half of the first, break at (unsigned int)src%16
+  vector unsigned char src_cmp = spu_splats((unsigned char)((unsigned int)src%16));
+  vector unsigned char gt_res = spu_cmpgt(oneup, src_cmp);
+  vector unsigned char eq_res = spu_cmpeq(oneup, src_cmp);
+  vector unsigned char cmp_res = spu_or(gt_res, eq_res);
+  vector unsigned char sixteen_uchar = spu_splats((unsigned char)16);
+  vector unsigned char phase_change = spu_and(sixteen_uchar, cmp_res);
+  vector unsigned int shuffle_mask_gamma = spu_add((vector unsigned int)phase_change,
+                                                   (vector unsigned int)oneup);
+  shuffle_mask_gamma = spu_rlqwbyte(shuffle_mask_gamma, (unsigned int)src%16);
+
+  vector unsigned char tgt_second = spu_rlqwbyte(second_oneup, -((unsigned int)target%16));
+  vector unsigned char tgt_first = spu_rlqwbyte(oneup, -((unsigned int)target%16));
+
+  //alpha: first half of first, second half of second, break at (unsigned int)target%16
+  src_cmp = spu_splats((unsigned char)((unsigned int)target%16));
+  gt_res = spu_cmpgt(oneup, src_cmp);
+  eq_res = spu_cmpeq(oneup, src_cmp);
+  cmp_res = spu_or(gt_res, eq_res);
+  phase_change = spu_and(sixteen_uchar, cmp_res);
+  vector unsigned int shuffle_mask_alpha = spu_add((vector unsigned int)phase_change,
+                                                   (vector unsigned int)oneup);
+
+  //delta: first half of first, first half of second, break at (unsigned int)target%16
+  vector unsigned char shuffle_mask_delta = spu_shuffle(oneup, tgt_second, (vector unsigned char)shuffle_mask_alpha);
+  //epsilon: second half of second, second half of first, break at (unsigned int)target%16
+  vector unsigned char shuffle_mask_epsilon = spu_shuffle(tgt_second, oneup, (vector unsigned char)shuffle_mask_alpha);
+  //zeta: second half of second, first half of first, break at 16 - (unsigned int)target%16
+  vector unsigned int shuffle_mask_zeta = spu_rlqwbyte(shuffle_mask_alpha, (unsigned int)target%16);
+
+  //beta: first half of first, second half of second, break at num_bytes%16
+  src_cmp = spu_splats((unsigned char)(num_bytes%16));
+  gt_res = spu_cmpgt(oneup, src_cmp);
+  eq_res = spu_cmpeq(oneup, src_cmp);
+  cmp_res = spu_or(gt_res, eq_res);
+  phase_change = spu_and(sixteen_uchar, cmp_res);
+  vector unsigned int shuffle_mask_beta = spu_add((vector unsigned int)phase_change,
+                                                  (vector unsigned int)oneup);
+
+  qword src_past;
+  qword src_present;
+  qword tgt_past;
+  qword tgt_present;
+
+  qword in_temp;
+  qword out_temp0;
+  qword out_temp1;
+
+  src_past = si_lqd((qword)address_counter_src, 0);
+  tgt_past = si_lqd((qword)address_counter_tgt, 0);
+
+  for(i = 0; i < num_bytes/16; ++i) {
+
+    src_present = si_lqd((qword)address_counter_src, 16);
+    tgt_present = si_lqd((qword)address_counter_tgt, 16);
+
+    in_temp = spu_shuffle(src_present, src_past, (vector unsigned char)shuffle_mask_gamma);
+
+    out_temp0 = spu_shuffle(tgt_past, in_temp, shuffle_mask_delta);
+    out_temp1 = spu_shuffle(tgt_present, in_temp, shuffle_mask_epsilon);
+
+    si_stqd(out_temp0, (qword)address_counter_tgt, 0);
+    si_stqd(out_temp1, (qword)address_counter_tgt, 16);
+
+    tgt_past = out_temp1;
+    src_past = src_present;
+    address_counter_src = spu_add(address_counter_src, 16);
+    address_counter_tgt = spu_add(address_counter_tgt, 16);
+  }
+
+  src_present = si_lqd((qword)address_counter_src, 16);
+  tgt_present = si_lqd((qword)address_counter_tgt, 16);
+
+  in_temp = spu_shuffle(src_present, src_past, (vector unsigned char) shuffle_mask_gamma);
+  qword target_temp = spu_shuffle(tgt_present, tgt_past, (vector unsigned char) shuffle_mask_zeta);
+  qword meld = spu_shuffle(in_temp, target_temp, (vector unsigned char)shuffle_mask_beta);
+
+  out_temp0 = spu_shuffle(tgt_past, meld, shuffle_mask_delta);
+  out_temp1 = spu_shuffle(tgt_present, meld, shuffle_mask_epsilon);
+
+  si_stqd(out_temp0, (qword)address_counter_tgt, 0);
+  si_stqd(out_temp1, (qword)address_counter_tgt, 16);
+
+  return retval;
+}
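+
+/*
+ * Sanity-check sketch (illustrative): for non-overlapping buffers the
+ * kernel should match the libc routine, e.g.
+ *
+ *   unsigned char src[64], ref[64], dst[64];  // fill src, zero ref/dst
+ *   memcpy(&ref[5], &src[3], 21);
+ *   libvector_memcpy_unaligned(&dst[5], &src[3], 21);
+ *   // now memcmp(ref, dst, 64) == 0, since bytes outside the copied
+ *   // range are preserved by the read-modify-write of the end quads
+ */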
+
+/*
+void* mcpy(void* target, void* src, size_t num_bytes){
+  //loop iterator i
+  int i = 0;
+  void* retval = src;
+
+  //put the target and source addresses into qwords
+  vector unsigned int address_counter_tgt = {(unsigned int)target, 0, 0, 0};
+  vector unsigned int address_counter_src = {(unsigned int)src, 0, 0, 0};
+
+  //create shuffle masks
+
+  //shuffle mask building blocks:
+  //all from the first vector
+  vector unsigned char oneup = {0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
+                                0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f};
+  //all from the second vector
+  vector unsigned char second_oneup = {0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17,
+                                       0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f};
+
+  //gamma: second half of the second, first half of the first, break at src%16
+  vector unsigned char src_cmp = spu_splats((unsigned char)(src%16));
+  vector unsigned char gt_res = spu_cmpgt(oneup, src_cmp);
+  vector unsigned char eq_res = spu_cmpeq(oneup, src_cmp);
+  vector unsigned char cmp_res = spu_or(gt_res, eq_res);
+  vector unsigned char sixteen_uchar = spu_splats((unsigned char)16);
+  vector unsigned char phase_change = spu_and(sixteen_uchar, cmp_res);
+  vector unsigned int shuffle_mask_gamma = spu_add((vector unsigned int)phase_change,
+                                                   (vector unsigned int)oneup);
+  shuffle_mask_gamma = spu_rlqwbyte(shuffle_mask_gamma, src%16);
+
+  vector unsigned char tgt_second = spu_rlqwbyte(second_oneup, -(target%16));
+  vector unsigned char tgt_first = spu_rlqwbyte(oneup, -(target%16));
+
+  //alpha: first half of first, second half of second, break at target%16
+  src_cmp = spu_splats((unsigned char)(target%16));
+  gt_res = spu_cmpgt(oneup, src_cmp);
+  eq_res = spu_cmpeq(oneup, src_cmp);
+  cmp_res = spu_or(gt_res, eq_res);
+  phase_change = spu_and(sixteen_uchar, cmp_res);
+  vector unsigned int shuffle_mask_alpha = spu_add((vector unsigned int)phase_change,
+                                                   (vector unsigned int)oneup);
+
+  //delta: first half of first, first half of second, break at target%16
+  vector unsigned char shuffle_mask_delta = spu_shuffle(oneup, tgt_second, (vector unsigned char)shuffle_mask_alpha);
+  //epsilon: second half of second, second half of first, break at target%16
+  vector unsigned char shuffle_mask_epsilon = spu_shuffle(tgt_second, oneup, (vector unsigned char)shuffle_mask_alpha);
+  //zeta: second half of second, first half of first, break at 16 - target%16
+  vector unsigned int shuffle_mask_zeta = spu_rlqwbyte(shuffle_mask_alpha, target%16);
+
+  //beta: first half of first, second half of second, break at num_bytes%16
+  src_cmp = spu_splats((unsigned char)(num_bytes%16));
+  gt_res = spu_cmpgt(oneup, src_cmp);
+  eq_res = spu_cmpeq(oneup, src_cmp);
+  cmp_res = spu_or(gt_res, eq_res);
+  phase_change = spu_and(sixteen_uchar, cmp_res);
+  vector unsigned int shuffle_mask_beta = spu_add((vector unsigned int)phase_change,
+                                                  (vector unsigned int)oneup);
+
+  printf("num_bytesmod16 %d\n", num_bytes%16);
+  printf("beta %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d\n",
+         spu_extract((vector unsigned char) shuffle_mask_beta, 0),
+         spu_extract((vector unsigned char) shuffle_mask_beta, 1),
+         spu_extract((vector unsigned char) shuffle_mask_beta, 2),
+         spu_extract((vector unsigned char) shuffle_mask_beta, 3),
+         spu_extract((vector unsigned char) shuffle_mask_beta, 4),
+         spu_extract((vector unsigned char) shuffle_mask_beta, 5),
+         spu_extract((vector unsigned char) shuffle_mask_beta, 6),
+         spu_extract((vector unsigned char) shuffle_mask_beta, 7),
+         spu_extract((vector unsigned char) shuffle_mask_beta, 8),
+         spu_extract((vector unsigned char) shuffle_mask_beta, 9),
+         spu_extract((vector unsigned char) shuffle_mask_beta, 10),
+         spu_extract((vector unsigned char) shuffle_mask_beta, 11),
+         spu_extract((vector unsigned char) shuffle_mask_beta, 12),
+         spu_extract((vector unsigned char) shuffle_mask_beta, 13),
+         spu_extract((vector unsigned char) shuffle_mask_beta, 14),
+         spu_extract((vector unsigned char) shuffle_mask_beta, 15));
+
+  qword src_past;
+  qword src_present;
+  qword tgt_past;
+  qword tgt_present;
+
+  qword in_temp;
+  qword out_temp0;
+  qword out_temp1;
+
+  src_past = si_lqd((qword)address_counter_src, 0);
+  tgt_past = si_lqd((qword)address_counter_tgt, 0);
+
+  for(i = 0; i < num_bytes/16; ++i) {
+
+    src_present = si_lqd((qword)address_counter_src, 16);
+    tgt_present = si_lqd((qword)address_counter_tgt, 16);
+
+    in_temp = spu_shuffle(src_present, src_past, (vector unsigned char)shuffle_mask_gamma);
+
+    out_temp0 = spu_shuffle(tgt_past, in_temp, shuffle_mask_delta);
+    out_temp1 = spu_shuffle(tgt_present, in_temp, shuffle_mask_epsilon);
+
+    si_stqd(out_temp0, (qword)address_counter_tgt, 0);
+    si_stqd(out_temp1, (qword)address_counter_tgt, 16);
+
+    tgt_past = out_temp1;
+    src_past = src_present;
+    address_counter_src = spu_add(address_counter_src, 16);
+    address_counter_tgt = spu_add(address_counter_tgt, 16);
+  }
+
+  src_present = si_lqd((qword)address_counter_src, 16);
+  tgt_present = si_lqd((qword)address_counter_tgt, 16);
+
+  in_temp = spu_shuffle(src_present, src_past, (vector unsigned char) shuffle_mask_gamma);
+  qword target_temp = spu_shuffle(tgt_present, tgt_past, (vector unsigned char) shuffle_mask_zeta);
+  qword meld = spu_shuffle(in_temp, target_temp, (vector unsigned char)shuffle_mask_beta);
+
+  out_temp0 = spu_shuffle(tgt_past, meld, shuffle_mask_delta);
+  out_temp1 = spu_shuffle(tgt_present, meld, shuffle_mask_epsilon);
+
+  si_stqd(out_temp0, (qword)address_counter_tgt, 0);
+  si_stqd(out_temp1, (qword)address_counter_tgt, 16);
+
+  return retval;
+}
+*/
+
+/*
+int main(){
+
+  unsigned char pooh[48];
+  unsigned char bear[48];
+
+  int i = 0;
+  for(i = 0; i < 48; ++i){
+    pooh[i] = i;
+    bear[i] = i;
+  }
+
+  spu_mcpy(&pooh[9], &bear[3], 15);
+
+  for(i = 0; i < 48; ++i) {
+    printf("%d, ", pooh[i]);
+  }
+  printf("\n");
+}
+*/

diff --git a/volk/spu_lib/spu_memset_unaligned.S b/volk/spu_lib/spu_memset_unaligned.S
new file mode 100644
index 000000000..a655c4c52
--- /dev/null
+++ b/volk/spu_lib/spu_memset_unaligned.S
@@ -0,0 +1,185 @@
+/* -*- asm -*- */
+/*
+ * Copyright 2008 Free Software Foundation, Inc.
+ *
+ * This file is part of GNU Radio
+ *
+ * GNU Radio is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 3, or (at your option)
+ * any later version.
+ *
+ * GNU Radio is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write to the Free Software Foundation, Inc.,
+ * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ */
+
+#include "gc_spu_macs.h"
+
+	.file "spu_memset_unaligned.S"
+
+	/*
+	 * Computes this, only a lot faster...
+	 *
+	 * void *
+	 * libvector_memset_unaligned(void *pv, int c, size_t n)
+	 * {
+	 *   unsigned char *p = (unsigned char *) pv;
+	 *   size_t i;
+	 *   for (i = 0; i < n; i++)
+	 *     p[i] = c;
+	 *
+	 *   return pv;
+	 * }
+	 */
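+
+	/*
+	 * Shape of the code below, as an outline (annotation):
+	 *
+	 *   do_head:  store up to 15 bytes to reach a 16-byte boundary,
+	 *             merging with the bytes already in that quadword
+	 *   middle:   128 bytes per iteration, eight stqd's
+	 *   middle2:  remaining whole quadwords, one stqd each
+	 *   do_tail:  read-modify-write the final partial quadword
+	 */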
+
+#define p_arg	arg1	// we're going to clobber arg1 w/ the return value
+#define c	arg2	// the constant we're writing
+#define n	arg3	// how many bytes to write
+
+#define p	r13	// where we're writing
+#define t0	r14
+#define t1	r15
+#define mask	r16
+#define old	r17
+#define an	r18	// aligned n (n rounded down to mod 16 boundary)
+#define next_p	r19
+#define cond1	r20
+#define cond2	r21
+#define m	r22
+#define r	r23
+
+	PROC_ENTRY(libvector_memset_unaligned)
+
+	// Hint the return from do_head, in case we go that way.
+	// There's pretty much nothing we can do to hint the branch to it.
+	hbrr	do_head_br, head_complete
+
+	MR(p, p_arg)		// leaves p, the return value, in the correct reg (r3)
+	BRZ_RETURN(n)
+
+	MODULO(t0, p, 16)	// is p%16 == 0?
+	VSPLTB(c, c, 3)		// splat byte in preferred slot of c into all slots
+	brnz	t0, do_head	// no, handle it
+head_complete:
+
+	/*
+	 * preconditions:
+	 *   p%16 == 0, n > 0
+	 */
+	hbrr	middle_loop_br, middle_loop
+
+	ROUND_DOWN(an, n, 16)	// an is "aligned n"
+	MODULO(n, n, 16)	// what's left over in the last quad
+	brz	an, do_tail	// no whole quad words; skip to tail
+	clgti	t0, an, 127	// an >= 128?
+	brz	t0, middle2	// nope, go handle the cases between 0 and 112
+
+	/*
+	 * 128 bytes / iteration
+	 */
+	.p2align 4
+middle_loop:
+	ai	an, an, -128
+	stqd	c, 0*16(p)
+	ai	next_p, p, 128
+	stqd	c, 1*16(p)
+	cgti	cond1, an, 127
+	stqd	c, 2*16(p)
+
+	stqd	c, 3*16(p)
+	stqd	c, 4*16(p)
+	stqd	c, 5*16(p)
+	stqd	c, 6*16(p)
+
+	MR(p, next_p)
+	stqd	c, 7*16-128(next_p)
+	or	cond2, n, an
+middle_loop_br:
+	brnz	cond1, middle_loop
+
+	/*
+	 * if an and n are both zero, return now
+	 */
+	BRZ_RETURN(cond2)
+
+	/*
+	 * otherwise handle last of full quad words
+	 *
+	 * 0 <= an < 128, p%16 == 0
+	 */
+middle2:
+	/*
+	 * if an == 0, go handle the final non-full quadword
+	 */
+	brz	an, do_tail
+	hbrr	middle2_loop_br, middle2_loop
+
+	.p2align 3
+middle2_loop:
+	ai	next_p, p, 16
+	stqd	c, 0(p)
+	ai	an, an, -16
+	LMR(p, next_p)
+middle2_loop_br:
+	brnz	an, middle2_loop
+
+	/* We're done with the full quadwords. */
+
+	/*
+	 * Handle the final partial quadword.
+	 * We'll be modifying only the left hand portion of the quad.
+	 *
+	 * preconditions:
+	 *   an == 0, 0 <= n < 16, p%16 == 0
+	 */
+do_tail:
+	HINT_RETURN(do_tail_ret)
+	il	mask, -1
+	sfi	t1, n, 16	// t1 = 16 - n
+	lqd	old, 0(p)
+	shlqby	mask, mask, t1
+	selb	t0, old, c, mask
+	stqd	t0, 0(p)
+do_tail_ret:
+	RETURN()

+	/*
+	 * ----------------------------------------------------------------
+	 * Handle the first partial quadword
+	 *
+	 * preconditions:
+	 *   p%16 != 0
+	 *
+	 * postconditions:
+	 *   p%16 == 0 or n == 0
+	 *
+	 *          |-- m --|
+	 *   +----------------+----------------+
+	 *   |      //////////|                |
+	 *   +----------------+----------------+
+	 *          |------ r ------|
+	 *          p
+	 * ----------------------------------------------------------------
+	 */
+do_head:
+	lqd	old, 0(p)
+	MODULO_NEG(r, p, 16)
+	il	mask, -1
+	UMIN(m, r, n)
+	shlqby	mask, mask, m	// 1's in the top, m*8 0's in the bottom
+	MR(t1, p)
+	sf	t0, m, r	// t0 = r - m
+	a	p, p, m		// p += m
+	rotqby	mask, mask, t0	// rotate 0's to the right place
+	sf	n, m, n		// n -= m
+	selb	t0, c, old, mask	// merge
+	stqd	t0, 0(t1)
+	BRZ_RETURN(n)
+do_head_br:
+	br	head_complete