summary | refs | log | tree | commit | diff
path: root/volk/spu_lib
diff options
context:
space:
mode:
Diffstat (limited to 'volk/spu_lib')
-rw-r--r--  volk/spu_lib/gc_spu_macs.h  34
-rw-r--r--  volk/spu_lib/spu_16s_cmpgt_unaligned.c  66
-rw-r--r--  volk/spu_lib/spu_16s_vector_subtract_unaligned.c  68
-rw-r--r--  volk/spu_lib/spu_16s_vector_sum_unaligned.c  68
-rw-r--r--  volk/spu_lib/spu_32fc_pointwise_multiply_unaligned.c  94
-rw-r--r--  volk/spu_lib/spu_memcpy_unaligned.c  122
-rw-r--r--  volk/spu_lib/spu_memset_unaligned.S  44
7 files changed, 248 insertions, 248 deletions
diff --git a/volk/spu_lib/gc_spu_macs.h b/volk/spu_lib/gc_spu_macs.h
index 8e3e3f2a6..e86dce3f5 100644
--- a/volk/spu_lib/gc_spu_macs.h
+++ b/volk/spu_lib/gc_spu_macs.h
@@ -1,19 +1,19 @@
/* -*- asm -*- */
/*
* Copyright 2008 Free Software Foundation, Inc.
- *
+ *
* This file is part of GNU Radio
- *
+ *
* GNU Radio is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 3, or (at your option)
* any later version.
- *
+ *
* GNU Radio is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
- *
+ *
* You should have received a copy of the GNU General Public License along
* with this program; if not, write to the Free Software Foundation, Inc.,
* 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
@@ -279,7 +279,7 @@ name:
iluh _gc_t0, 4*(s)*0x0101 + 0x0001; \
iohl _gc_t0, 4*(s)*0x0101 + 0x0203; \
shufb rt, ra, ra, _gc_t0;
-
+
// replicate double from slot s [0,1]
#define VSPLTD(rt, ra, s) \
/* sp is always 16-byte aligned */ \
@@ -301,13 +301,13 @@ name:
#define MIN_SELB(rt, ra, rb, rc) selb rt, ra, rb, rc;
#define MAX_SELB(rt, ra, rb, rc) selb rt, rb, ra, rc;
-
+
// words
#define MIN(rt, ra, rb) \
cgt _gc_t0, ra, rb; \
MIN_SELB(rt, ra, rb, _gc_t0)
-
+
#define MAX(rt, ra, rb) \
cgt _gc_t0, ra, rb; \
MAX_SELB(rt, ra, rb, _gc_t0)
@@ -315,17 +315,17 @@ name:
#define UMIN(rt, ra, rb) \
clgt _gc_t0, ra, rb; \
MIN_SELB(rt, ra, rb, _gc_t0)
-
+
#define UMAX(rt, ra, rb) \
clgt _gc_t0, ra, rb; \
MAX_SELB(rt, ra, rb, _gc_t0)
// bytes
-
+
#define MINB(rt, ra, rb) \
cgtb _gc_t0, ra, rb; \
MIN_SELB(rt, ra, rb, _gc_t0)
-
+
#define MAXB(rt, ra, rb) \
cgtb _gc_t0, ra, rb; \
MAX_SELB(rt, ra, rb, _gc_t0)
@@ -333,17 +333,17 @@ name:
#define UMINB(rt, ra, rb) \
clgtb _gc_t0, ra, rb; \
MIN_SELB(rt, ra, rb, _gc_t0)
-
+
#define UMAXB(rt, ra, rb) \
clgtb _gc_t0, ra, rb; \
MAX_SELB(rt, ra, rb, _gc_t0)
// halfwords
-
+
#define MINH(rt, ra, rb) \
cgth _gc_t0, ra, rb; \
MIN_SELB(rt, ra, rb, _gc_t0)
-
+
#define MAXH(rt, ra, rb) \
cgth _gc_t0, ra, rb; \
MAX_SELB(rt, ra, rb, _gc_t0)
@@ -351,17 +351,17 @@ name:
#define UMINH(rt, ra, rb) \
clgth _gc_t0, ra, rb; \
MIN_SELB(rt, ra, rb, _gc_t0)
-
+
#define UMAXH(rt, ra, rb) \
clgth _gc_t0, ra, rb; \
MAX_SELB(rt, ra, rb, _gc_t0)
// floats
-
+
#define FMIN(rt, ra, rb) \
fcgt _gc_t0, ra, rb; \
MIN_SELB(rt, ra, rb, _gc_t0)
-
+
#define FMAX(rt, ra, rb) \
fcgt _gc_t0, ra, rb; \
MAX_SELB(rt, ra, rb, _gc_t0)
@@ -370,7 +370,7 @@ name:
#define FMINMAG(rt, ra, rb) \
fcmgt _gc_t0, ra, rb; \
MIN_SELB(rt, ra, rb, _gc_t0)
-
+
// Ignoring the sign, select the values with the maximum magnitude
#define FMAXMAG(rt, ra, rb) \
fcmgt _gc_t0, ra, rb; \
diff --git a/volk/spu_lib/spu_16s_cmpgt_unaligned.c b/volk/spu_lib/spu_16s_cmpgt_unaligned.c
index 765cacd9a..8811e6801 100644
--- a/volk/spu_lib/spu_16s_cmpgt_unaligned.c
+++ b/volk/spu_lib/spu_16s_cmpgt_unaligned.c
@@ -4,14 +4,14 @@ void* libvector_16s_cmpgt_unaligned(void* target, void* src, signed short val, u
//loop iterator i
int i = 0;
void* retval = target;
-
+
//put the target and source addresses into qwords
vector unsigned int address_counter_tgt = {(unsigned int)target, 0, 0, 0};
vector unsigned int address_counter_src = {(unsigned int)src, 0, 0 ,0};
-
+
//create shuffle masks
-
+
//shuffle mask building blocks:
//all from the first vector
vector unsigned char oneup = {0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
@@ -19,9 +19,9 @@ void* libvector_16s_cmpgt_unaligned(void* target, void* src, signed short val, u
//all from the second vector
vector unsigned char second_oneup = {0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17,
0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f};
-
-
+
+
//gamma: second half of the second, first half of the first, break at (unsigned int)src%16
vector unsigned char src_cmp = spu_splats((unsigned char)((unsigned int)src%16));
vector unsigned char gt_res = spu_cmpgt(oneup, src_cmp);
@@ -29,16 +29,16 @@ void* libvector_16s_cmpgt_unaligned(void* target, void* src, signed short val, u
vector unsigned char cmp_res = spu_or(gt_res, eq_res);
vector unsigned char sixteen_uchar = spu_splats((unsigned char)16);
vector unsigned char phase_change = spu_and(sixteen_uchar, cmp_res);
- vector unsigned int shuffle_mask_gamma = spu_add((vector unsigned int)phase_change,
+ vector unsigned int shuffle_mask_gamma = spu_add((vector unsigned int)phase_change,
(vector unsigned int)oneup);
shuffle_mask_gamma = spu_rlqwbyte(shuffle_mask_gamma, (unsigned int)src%16);
-
-
+
+
vector unsigned char tgt_second = spu_rlqwbyte(second_oneup, -((unsigned int)target%16));
vector unsigned char tgt_first = spu_rlqwbyte(oneup, -((unsigned int)target%16));
-
+
//alpha: first half of first, second half of second, break at (unsigned int)target%16
src_cmp = spu_splats((unsigned char)((unsigned int)target%16));
gt_res = spu_cmpgt(oneup, src_cmp);
@@ -47,13 +47,13 @@ void* libvector_16s_cmpgt_unaligned(void* target, void* src, signed short val, u
phase_change = spu_and(sixteen_uchar, cmp_res);
vector unsigned int shuffle_mask_alpha = spu_add((vector unsigned int)phase_change,
(vector unsigned int)oneup);
-
+
//delta: first half of first, first half of second, break at (unsigned int)target%16
vector unsigned char shuffle_mask_delta = spu_shuffle(oneup, tgt_second, (vector unsigned char)shuffle_mask_alpha);
//epsilon: second half of second, second half of first, break at (unsigned int)target%16
vector unsigned char shuffle_mask_epsilon = spu_shuffle(tgt_second, oneup, (vector unsigned char)shuffle_mask_alpha);
//zeta: second half of second, first half of first, break at 16 - (unsigned int)target%16
- vector unsigned int shuffle_mask_zeta = spu_rlqwbyte(shuffle_mask_alpha, (unsigned int)target%16);
+ vector unsigned int shuffle_mask_zeta = spu_rlqwbyte(shuffle_mask_alpha, (unsigned int)target%16);
//beta: first half of first, second half of second, break at num_bytes%16
src_cmp = spu_splats((unsigned char)(num_bytes%16));
@@ -63,17 +63,17 @@ void* libvector_16s_cmpgt_unaligned(void* target, void* src, signed short val, u
phase_change = spu_and(sixteen_uchar, cmp_res);
vector unsigned int shuffle_mask_beta = spu_add((vector unsigned int)phase_change,
(vector unsigned int)oneup);
-
-
-
-
+
+
+
+
qword src_past;
qword src_present;
qword tgt_past;
qword tgt_present;
-
+
qword in_temp;
qword out_temp0;
qword out_temp1;
@@ -85,53 +85,53 @@ void* libvector_16s_cmpgt_unaligned(void* target, void* src, signed short val, u
vector unsigned short compare;
vector unsigned short ones = {1, 1, 1, 1, 1, 1, 1, 1};
vector unsigned short after_and;
-
+
for(i = 0; i < num_bytes/16; ++i) {
-
+
src_present = si_lqd((qword)address_counter_src, 16);
tgt_present = si_lqd((qword)address_counter_tgt, 16);
-
+
in_temp = spu_shuffle(src_present, src_past, (vector unsigned char)shuffle_mask_gamma);
compare = spu_cmpgt((vector signed short) in_temp, vec_val);
after_and = spu_and(compare, ones);
-
-
+
+
out_temp0 = spu_shuffle(tgt_past, (qword)after_and, shuffle_mask_delta);
out_temp1 = spu_shuffle(tgt_present, (qword)after_and, shuffle_mask_epsilon);
si_stqd(out_temp0, (qword)address_counter_tgt, 0);
si_stqd(out_temp1, (qword)address_counter_tgt, 16);
-
+
tgt_past = out_temp1;
src_past = src_present;
address_counter_src = spu_add(address_counter_src, 16);
address_counter_tgt = spu_add(address_counter_tgt, 16);
-
+
}
-
+
src_present = si_lqd((qword)address_counter_src, 16);
tgt_present = si_lqd((qword)address_counter_tgt, 16);
-
-
+
+
in_temp = spu_shuffle(src_present, src_past,(vector unsigned char) shuffle_mask_gamma);
-
+
compare = spu_cmpgt((vector signed short) in_temp, vec_val);
after_and = spu_and(compare, ones);
-
+
qword target_temp = spu_shuffle(tgt_present, tgt_past, (vector unsigned char) shuffle_mask_zeta);
qword meld = spu_shuffle((qword)after_and, target_temp, (vector unsigned char)shuffle_mask_beta);
-
-
+
+
out_temp0 = spu_shuffle(tgt_past, meld, shuffle_mask_delta);
out_temp1 = spu_shuffle(tgt_present, meld, shuffle_mask_epsilon);
-
+
si_stqd(out_temp0, (qword)address_counter_tgt, 0);
si_stqd(out_temp1, (qword)address_counter_tgt, 16);
-
+
return retval;
}
@@ -156,5 +156,5 @@ int main(){
}
printf("\n");
}
-*/
+*/
diff --git a/volk/spu_lib/spu_16s_vector_subtract_unaligned.c b/volk/spu_lib/spu_16s_vector_subtract_unaligned.c
index a3ce6c2fe..ea110c8d2 100644
--- a/volk/spu_lib/spu_16s_vector_subtract_unaligned.c
+++ b/volk/spu_lib/spu_16s_vector_subtract_unaligned.c
@@ -4,15 +4,15 @@ void* libvector_16s_vector_subtract_unaligned(void* target, void* src0, void* s
//loop iterator i
int i = 0;
void* retval = target;
-
+
//put the target and source addresses into qwords
vector unsigned int address_counter_tgt = {(unsigned int)target, 0, 0, 0};
vector unsigned int address_counter_src0 = {(unsigned int)src0, 0, 0 ,0};
vector unsigned int address_counter_src1 = {(unsigned int)src1, 0, 0, 0};
-
+
//create shuffle masks
-
+
//shuffle mask building blocks:
//all from the first vector
vector unsigned char oneup = {0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
@@ -20,9 +20,9 @@ void* libvector_16s_vector_subtract_unaligned(void* target, void* src0, void* s
//all from the second vector
vector unsigned char second_oneup = {0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17,
0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f};
-
-
+
+
//gamma: second half of the second, first half of the first, break at (unsigned int)src0%16
vector unsigned char src_cmp = spu_splats((unsigned char)((unsigned int)src0%16));
vector unsigned char gt_res = spu_cmpgt(oneup, src_cmp);
@@ -30,7 +30,7 @@ void* libvector_16s_vector_subtract_unaligned(void* target, void* src0, void* s
vector unsigned char cmp_res = spu_or(gt_res, eq_res);
vector unsigned char sixteen_uchar = spu_splats((unsigned char)16);
vector unsigned char phase_change = spu_and(sixteen_uchar, cmp_res);
- vector unsigned int shuffle_mask_gamma = spu_add((vector unsigned int)phase_change,
+ vector unsigned int shuffle_mask_gamma = spu_add((vector unsigned int)phase_change,
(vector unsigned int)oneup);
shuffle_mask_gamma = spu_rlqwbyte(shuffle_mask_gamma, (unsigned int)src0%16);
@@ -41,17 +41,17 @@ void* libvector_16s_vector_subtract_unaligned(void* target, void* src0, void* s
cmp_res = spu_or(gt_res, eq_res);
sixteen_uchar = spu_splats((unsigned char)16);
phase_change = spu_and(sixteen_uchar, cmp_res);
- vector unsigned int shuffle_mask_eta = spu_add((vector unsigned int)phase_change,
+ vector unsigned int shuffle_mask_eta = spu_add((vector unsigned int)phase_change,
(vector unsigned int)oneup);
shuffle_mask_eta = spu_rlqwbyte(shuffle_mask_eta, (unsigned int)src1%16);
-
-
-
+
+
+
vector unsigned char tgt_second = spu_rlqwbyte(second_oneup, -((unsigned int)target%16));
vector unsigned char tgt_first = spu_rlqwbyte(oneup, -((unsigned int)target%16));
-
+
//alpha: first half of first, second half of second, break at (unsigned int)target%16
src_cmp = spu_splats((unsigned char)((unsigned int)target%16));
gt_res = spu_cmpgt(oneup, src_cmp);
@@ -60,13 +60,13 @@ void* libvector_16s_vector_subtract_unaligned(void* target, void* src0, void* s
phase_change = spu_and(sixteen_uchar, cmp_res);
vector unsigned int shuffle_mask_alpha = spu_add((vector unsigned int)phase_change,
(vector unsigned int)oneup);
-
+
//delta: first half of first, first half of second, break at (unsigned int)target%16
vector unsigned char shuffle_mask_delta = spu_shuffle(oneup, tgt_second, (vector unsigned char)shuffle_mask_alpha);
//epsilon: second half of second, second half of first, break at (unsigned int)target%16
vector unsigned char shuffle_mask_epsilon = spu_shuffle(tgt_second, oneup, (vector unsigned char)shuffle_mask_alpha);
//zeta: second half of second, first half of first, break at 16 - (unsigned int)target%16
- vector unsigned int shuffle_mask_zeta = spu_rlqwbyte(shuffle_mask_alpha, (unsigned int)target%16);
+ vector unsigned int shuffle_mask_zeta = spu_rlqwbyte(shuffle_mask_alpha, (unsigned int)target%16);
//beta: first half of first, second half of second, break at num_bytes%16
src_cmp = spu_splats((unsigned char)(num_bytes%16));
@@ -76,19 +76,19 @@ void* libvector_16s_vector_subtract_unaligned(void* target, void* src0, void* s
phase_change = spu_and(sixteen_uchar, cmp_res);
vector unsigned int shuffle_mask_beta = spu_add((vector unsigned int)phase_change,
(vector unsigned int)oneup);
-
-
-
-
+
+
+
+
qword src0_past;
qword src0_present;
qword src1_past;
qword src1_present;
qword tgt_past;
qword tgt_present;
-
+
qword in_temp0;
qword in_temp1;
qword out_temp0;
@@ -99,54 +99,54 @@ void* libvector_16s_vector_subtract_unaligned(void* target, void* src0, void* s
src0_past = si_lqd((qword)address_counter_src0, 0);
src1_past = si_lqd((qword)address_counter_src1, 0);
tgt_past = si_lqd((qword)address_counter_tgt, 0);
-
+
for(i = 0; i < num_bytes/16; ++i) {
-
+
src0_present = si_lqd((qword)address_counter_src0, 16);
src1_present = si_lqd((qword)address_counter_src1, 16);
tgt_present = si_lqd((qword)address_counter_tgt, 16);
-
+
in_temp0 = spu_shuffle(src0_present, src0_past, (vector unsigned char)shuffle_mask_gamma);
in_temp1 = spu_shuffle(src1_present, src1_past, (vector unsigned char)shuffle_mask_eta);
-
+
sum = spu_sub((vector signed short)in_temp0, (vector signed short)in_temp1);
-
+
out_temp0 = spu_shuffle(tgt_past, (qword)sum, shuffle_mask_delta);
out_temp1 = spu_shuffle(tgt_present, (qword)sum, shuffle_mask_epsilon);
-
+
si_stqd(out_temp0, (qword)address_counter_tgt, 0);
si_stqd(out_temp1, (qword)address_counter_tgt, 16);
-
+
tgt_past = out_temp1;
src0_past = src0_present;
src1_past = src1_present;
address_counter_src0 = spu_add(address_counter_src0, 16);
address_counter_src1 = spu_add(address_counter_src1, 16);
address_counter_tgt = spu_add(address_counter_tgt, 16);
-
-
+
+
}
-
+
src0_present = si_lqd((qword)address_counter_src0, 16);
src1_present = si_lqd((qword)address_counter_src1, 16);
tgt_present = si_lqd((qword)address_counter_tgt, 16);
-
-
+
+
in_temp0 = spu_shuffle(src0_present, src0_past, (vector unsigned char) shuffle_mask_gamma);
in_temp1 = spu_shuffle(src1_present, src1_past, (vector unsigned char) shuffle_mask_eta);
sum = spu_sub((vector signed short)in_temp0, (vector signed short)in_temp1);
qword target_temp = spu_shuffle(tgt_present, tgt_past, (vector unsigned char) shuffle_mask_zeta);
qword meld = spu_shuffle((qword)sum, target_temp, (vector unsigned char)shuffle_mask_beta);
-
-
+
+
out_temp0 = spu_shuffle(tgt_past, meld, shuffle_mask_delta);
out_temp1 = spu_shuffle(tgt_present, meld, shuffle_mask_epsilon);
-
+
si_stqd(out_temp0, (qword)address_counter_tgt, 0);
si_stqd(out_temp1, (qword)address_counter_tgt, 16);
-
+
return retval;
}
diff --git a/volk/spu_lib/spu_16s_vector_sum_unaligned.c b/volk/spu_lib/spu_16s_vector_sum_unaligned.c
index 5a1cb9aaf..0097b4f56 100644
--- a/volk/spu_lib/spu_16s_vector_sum_unaligned.c
+++ b/volk/spu_lib/spu_16s_vector_sum_unaligned.c
@@ -4,15 +4,15 @@ void* libvector_16s_vector_sum_unaligned(void* target, void* src0, void* src1,
//loop iterator i
int i = 0;
void* retval = target;
-
+
//put the target and source addresses into qwords
vector unsigned int address_counter_tgt = {(unsigned int)target, 0, 0, 0};
vector unsigned int address_counter_src0 = {(unsigned int)src0, 0, 0 ,0};
vector unsigned int address_counter_src1 = {(unsigned int)src1, 0, 0, 0};
-
+
//create shuffle masks
-
+
//shuffle mask building blocks:
//all from the first vector
vector unsigned char oneup = {0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
@@ -20,9 +20,9 @@ void* libvector_16s_vector_sum_unaligned(void* target, void* src0, void* src1,
//all from the second vector
vector unsigned char second_oneup = {0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17,
0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f};
-
-
+
+
//gamma: second half of the second, first half of the first, break at (unsigned int)src0%16
vector unsigned char src_cmp = spu_splats((unsigned char)((unsigned int)src0%16));
vector unsigned char gt_res = spu_cmpgt(oneup, src_cmp);
@@ -30,7 +30,7 @@ void* libvector_16s_vector_sum_unaligned(void* target, void* src0, void* src1,
vector unsigned char cmp_res = spu_or(gt_res, eq_res);
vector unsigned char sixteen_uchar = spu_splats((unsigned char)16);
vector unsigned char phase_change = spu_and(sixteen_uchar, cmp_res);
- vector unsigned int shuffle_mask_gamma = spu_add((vector unsigned int)phase_change,
+ vector unsigned int shuffle_mask_gamma = spu_add((vector unsigned int)phase_change,
(vector unsigned int)oneup);
shuffle_mask_gamma = spu_rlqwbyte(shuffle_mask_gamma, (unsigned int)src0%16);
@@ -41,17 +41,17 @@ void* libvector_16s_vector_sum_unaligned(void* target, void* src0, void* src1,
cmp_res = spu_or(gt_res, eq_res);
sixteen_uchar = spu_splats((unsigned char)16);
phase_change = spu_and(sixteen_uchar, cmp_res);
- vector unsigned int shuffle_mask_eta = spu_add((vector unsigned int)phase_change,
+ vector unsigned int shuffle_mask_eta = spu_add((vector unsigned int)phase_change,
(vector unsigned int)oneup);
shuffle_mask_eta = spu_rlqwbyte(shuffle_mask_eta, (unsigned int)src1%16);
-
-
-
+
+
+
vector unsigned char tgt_second = spu_rlqwbyte(second_oneup, -((unsigned int)target%16));
vector unsigned char tgt_first = spu_rlqwbyte(oneup, -((unsigned int)target%16));
-
+
//alpha: first half of first, second half of second, break at (unsigned int)target%16
src_cmp = spu_splats((unsigned char)((unsigned int)target%16));
gt_res = spu_cmpgt(oneup, src_cmp);
@@ -60,13 +60,13 @@ void* libvector_16s_vector_sum_unaligned(void* target, void* src0, void* src1,
phase_change = spu_and(sixteen_uchar, cmp_res);
vector unsigned int shuffle_mask_alpha = spu_add((vector unsigned int)phase_change,
(vector unsigned int)oneup);
-
+
//delta: first half of first, first half of second, break at (unsigned int)target%16
vector unsigned char shuffle_mask_delta = spu_shuffle(oneup, tgt_second, (vector unsigned char)shuffle_mask_alpha);
//epsilon: second half of second, second half of first, break at (unsigned int)target%16
vector unsigned char shuffle_mask_epsilon = spu_shuffle(tgt_second, oneup, (vector unsigned char)shuffle_mask_alpha);
//zeta: second half of second, first half of first, break at 16 - (unsigned int)target%16
- vector unsigned int shuffle_mask_zeta = spu_rlqwbyte(shuffle_mask_alpha, (unsigned int)target%16);
+ vector unsigned int shuffle_mask_zeta = spu_rlqwbyte(shuffle_mask_alpha, (unsigned int)target%16);
//beta: first half of first, second half of second, break at num_bytes%16
src_cmp = spu_splats((unsigned char)(num_bytes%16));
@@ -76,19 +76,19 @@ void* libvector_16s_vector_sum_unaligned(void* target, void* src0, void* src1,
phase_change = spu_and(sixteen_uchar, cmp_res);
vector unsigned int shuffle_mask_beta = spu_add((vector unsigned int)phase_change,
(vector unsigned int)oneup);
-
-
-
-
+
+
+
+
qword src0_past;
qword src0_present;
qword src1_past;
qword src1_present;
qword tgt_past;
qword tgt_present;
-
+
qword in_temp0;
qword in_temp1;
qword out_temp0;
@@ -99,54 +99,54 @@ void* libvector_16s_vector_sum_unaligned(void* target, void* src0, void* src1,
src0_past = si_lqd((qword)address_counter_src0, 0);
src1_past = si_lqd((qword)address_counter_src1, 0);
tgt_past = si_lqd((qword)address_counter_tgt, 0);
-
+
for(i = 0; i < num_bytes/16; ++i) {
-
+
src0_present = si_lqd((qword)address_counter_src0, 16);
src1_present = si_lqd((qword)address_counter_src1, 16);
tgt_present = si_lqd((qword)address_counter_tgt, 16);
-
+
in_temp0 = spu_shuffle(src0_present, src0_past, (vector unsigned char)shuffle_mask_gamma);
in_temp1 = spu_shuffle(src1_present, src1_past, (vector unsigned char)shuffle_mask_eta);
-
+
sum = spu_add((vector signed int)in_temp0, (vector signed int)in_temp1);
-
+
out_temp0 = spu_shuffle(tgt_past, (qword)sum, shuffle_mask_delta);
out_temp1 = spu_shuffle(tgt_present, (qword)sum, shuffle_mask_epsilon);
-
+
si_stqd(out_temp0, (qword)address_counter_tgt, 0);
si_stqd(out_temp1, (qword)address_counter_tgt, 16);
-
+
tgt_past = out_temp1;
src0_past = src0_present;
src1_past = src1_present;
address_counter_src0 = spu_add(address_counter_src0, 16);
address_counter_src1 = spu_add(address_counter_src1, 16);
address_counter_tgt = spu_add(address_counter_tgt, 16);
-
-
+
+
}
-
+
src0_present = si_lqd((qword)address_counter_src0, 16);
src1_present = si_lqd((qword)address_counter_src1, 16);
tgt_present = si_lqd((qword)address_counter_tgt, 16);
-
-
+
+
in_temp0 = spu_shuffle(src0_present, src0_past, (vector unsigned char) shuffle_mask_gamma);
in_temp1 = spu_shuffle(src1_present, src1_past, (vector unsigned char) shuffle_mask_eta);
sum = spu_add((vector signed int)in_temp0, (vector signed int)in_temp1);
qword target_temp = spu_shuffle(tgt_present, tgt_past, (vector unsigned char) shuffle_mask_zeta);
qword meld = spu_shuffle((qword)sum, target_temp, (vector unsigned char)shuffle_mask_beta);
-
-
+
+
out_temp0 = spu_shuffle(tgt_past, meld, shuffle_mask_delta);
out_temp1 = spu_shuffle(tgt_present, meld, shuffle_mask_epsilon);
-
+
si_stqd(out_temp0, (qword)address_counter_tgt, 0);
si_stqd(out_temp1, (qword)address_counter_tgt, 16);
-
+
return retval;
}
diff --git a/volk/spu_lib/spu_32fc_pointwise_multiply_unaligned.c b/volk/spu_lib/spu_32fc_pointwise_multiply_unaligned.c
index 58fd4aa0c..d1c960488 100644
--- a/volk/spu_lib/spu_32fc_pointwise_multiply_unaligned.c
+++ b/volk/spu_lib/spu_32fc_pointwise_multiply_unaligned.c
@@ -7,15 +7,15 @@ void* libvector_pointwise_multiply_32fc_unaligned(void* target, void* src0, voi
//loop iterator i
int i = 0;
void* retval = target;
-
+
//put the target and source addresses into qwords
vector unsigned int address_counter_tgt = {(unsigned int)target, 0, 0, 0};
vector unsigned int address_counter_src0 = {(unsigned int)src0, 0, 0 ,0};
vector unsigned int address_counter_src1 = {(unsigned int)src1, 0, 0, 0};
-
+
//create shuffle masks
-
+
//shuffle mask building blocks:
//all from the first vector
vector unsigned char oneup = {0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
@@ -23,9 +23,9 @@ void* libvector_pointwise_multiply_32fc_unaligned(void* target, void* src0, voi
//all from the second vector
vector unsigned char second_oneup = {0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17,
0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f};
-
-
+
+
//gamma: second half of the second, first half of the first, break at (unsigned int)src0%16
vector unsigned char src_cmp = spu_splats((unsigned char)((unsigned int)src0%16));
vector unsigned char gt_res = spu_cmpgt(oneup, src_cmp);
@@ -33,7 +33,7 @@ void* libvector_pointwise_multiply_32fc_unaligned(void* target, void* src0, voi
vector unsigned char cmp_res = spu_or(gt_res, eq_res);
vector unsigned char sixteen_uchar = spu_splats((unsigned char)16);
vector unsigned char phase_change = spu_and(sixteen_uchar, cmp_res);
- vector unsigned int shuffle_mask_gamma = spu_add((vector unsigned int)phase_change,
+ vector unsigned int shuffle_mask_gamma = spu_add((vector unsigned int)phase_change,
(vector unsigned int)oneup);
shuffle_mask_gamma = spu_rlqwbyte(shuffle_mask_gamma, (unsigned int)src0%16);
@@ -44,17 +44,17 @@ void* libvector_pointwise_multiply_32fc_unaligned(void* target, void* src0, voi
cmp_res = spu_or(gt_res, eq_res);
sixteen_uchar = spu_splats((unsigned char)16);
phase_change = spu_and(sixteen_uchar, cmp_res);
- vector unsigned int shuffle_mask_eta = spu_add((vector unsigned int)phase_change,
+ vector unsigned int shuffle_mask_eta = spu_add((vector unsigned int)phase_change,
(vector unsigned int)oneup);
shuffle_mask_eta = spu_rlqwbyte(shuffle_mask_eta, (unsigned int)src1%16);
-
-
-
+
+
+
vector unsigned char tgt_second = spu_rlqwbyte(second_oneup, -((unsigned int)target%16));
vector unsigned char tgt_first = spu_rlqwbyte(oneup, -((unsigned int)target%16));
-
+
//alpha: first half of first, second half of second, break at (unsigned int)target%16
src_cmp = spu_splats((unsigned char)((unsigned int)target%16));
gt_res = spu_cmpgt(oneup, src_cmp);
@@ -63,13 +63,13 @@ void* libvector_pointwise_multiply_32fc_unaligned(void* target, void* src0, voi
phase_change = spu_and(sixteen_uchar, cmp_res);
vector unsigned int shuffle_mask_alpha = spu_add((vector unsigned int)phase_change,
(vector unsigned int)oneup);
-
+
//delta: first half of first, first half of second, break at (unsigned int)target%16
vector unsigned char shuffle_mask_delta = spu_shuffle(oneup, tgt_second, (vector unsigned char)shuffle_mask_alpha);
//epsilon: second half of second, second half of first, break at (unsigned int)target%16
vector unsigned char shuffle_mask_epsilon = spu_shuffle(tgt_second, oneup, (vector unsigned char)shuffle_mask_alpha);
//zeta: second half of second, first half of first, break at 16 - (unsigned int)target%16
- vector unsigned int shuffle_mask_zeta = spu_rlqwbyte(shuffle_mask_alpha, (unsigned int)target%16);
+ vector unsigned int shuffle_mask_zeta = spu_rlqwbyte(shuffle_mask_alpha, (unsigned int)target%16);
//beta: first half of first, second half of second, break at num_bytes%16
src_cmp = spu_splats((unsigned char)(num_bytes%16));
@@ -79,19 +79,19 @@ void* libvector_pointwise_multiply_32fc_unaligned(void* target, void* src0, voi
phase_change = spu_and(sixteen_uchar, cmp_res);
vector unsigned int shuffle_mask_beta = spu_add((vector unsigned int)phase_change,
(vector unsigned int)oneup);
-
-
-
-
+
+
+
+
qword src0_past;
qword src0_present;
qword src1_past;
qword src1_present;
qword tgt_past;
qword tgt_present;
-
+
qword in_temp0;
qword in_temp1;
qword out_temp0;
@@ -101,7 +101,7 @@ void* libvector_pointwise_multiply_32fc_unaligned(void* target, void* src0, voi
src0_past = si_lqd((qword)address_counter_src0, 0);
src1_past = si_lqd((qword)address_counter_src1, 0);
tgt_past = si_lqd((qword)address_counter_tgt, 0);
-
+
vector unsigned char shuffle_mask_complexprod0 = {0x04, 0x05, 0x06, 0x07, 0x00, 0x01, 0x02, 0x03,
0x0c, 0x0d, 0x0e, 0x0f, 0x08, 0x09, 0x0a, 0x0b};
vector unsigned char shuffle_mask_complexprod1 = {0x00, 0x01, 0x02, 0x03, 0x10, 0x11, 0x12, 0x13,
@@ -110,7 +110,7 @@ void* libvector_pointwise_multiply_32fc_unaligned(void* target, void* src0, voi
0x0c, 0x0d, 0x0e, 0x0f, 0x1c, 0x1d, 0x1e, 0x1f};
vector unsigned char sign_changer = {0x00, 0x00, 0x00, 0x00, 0x80, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x80, 0x00, 0x00, 0x00};
-
+
vector float prod0;
qword shuf0;
vector float prod1;
@@ -118,54 +118,54 @@ void* libvector_pointwise_multiply_32fc_unaligned(void* target, void* src0, voi
qword summand0;
qword summand1;
vector float sum;
-
+
for(i = 0; i < num_bytes/16; ++i) {
-
+
src0_present = si_lqd((qword)address_counter_src0, 16);
src1_present = si_lqd((qword)address_counter_src1, 16);
tgt_present = si_lqd((qword)address_counter_tgt, 16);
-
+
in_temp0 = spu_shuffle(src0_present, src0_past, (vector unsigned char)shuffle_mask_gamma);
in_temp1 = spu_shuffle(src1_present, src1_past, (vector unsigned char)shuffle_mask_eta);
-
+
prod0 = spu_mul((vector float)in_temp0, (vector float)in_temp1);
shuf0 = spu_shuffle((qword)in_temp1, (qword)in_temp1, shuffle_mask_complexprod0);
prod1 = spu_mul((vector float)in_temp0, (vector float)shuf0);
sign_change = spu_xor(prod0, (vector float)sign_changer);
-
+
summand0 = spu_shuffle((qword)sign_change, (qword)prod1, shuffle_mask_complexprod1);
-
+
summand1 = spu_shuffle((qword)sign_change, (qword)prod1, shuffle_mask_complexprod2);
-
+
sum = spu_add((vector float)summand0, (vector float)summand1);
-
+
out_temp0 = spu_shuffle(tgt_past, (qword)sum, shuffle_mask_delta);
out_temp1 = spu_shuffle(tgt_present, (qword)sum, shuffle_mask_epsilon);
-
+
si_stqd(out_temp0, (qword)address_counter_tgt, 0);
si_stqd(out_temp1, (qword)address_counter_tgt, 16);
-
+
tgt_past = out_temp1;
src0_past = src0_present;
src1_past = src1_present;
address_counter_src0 = spu_add(address_counter_src0, 16);
address_counter_src1 = spu_add(address_counter_src1, 16);
address_counter_tgt = spu_add(address_counter_tgt, 16);
-
-
+
+
}
-
+
src0_present = si_lqd((qword)address_counter_src0, 16);
src1_present = si_lqd((qword)address_counter_src1, 16);
tgt_present = si_lqd((qword)address_counter_tgt, 16);
-
-
+
+
in_temp0 = spu_shuffle(src0_present, src0_past, (vector unsigned char) shuffle_mask_gamma);
in_temp1 = spu_shuffle(src1_present, src1_past, (vector unsigned char) shuffle_mask_eta);
-
-
+
+
prod0 = spu_mul((vector float)in_temp0, (vector float)in_temp1);
shuf0 = spu_shuffle((qword)in_temp1, (qword)in_temp1, shuffle_mask_complexprod0);
prod1 = spu_mul(prod0, (vector float)shuf0);
@@ -173,20 +173,20 @@ void* libvector_pointwise_multiply_32fc_unaligned(void* target, void* src0, voi
summand0 = spu_shuffle((qword)sign_change, (qword)prod1, shuffle_mask_complexprod1);
summand1 = spu_shuffle((qword)sign_change, (qword)prod1, shuffle_mask_complexprod2);
sum = spu_add((vector float)summand0, (vector float)summand1);
-
-
-
+
+
+
qword target_temp = spu_shuffle(tgt_present, tgt_past, (vector unsigned char) shuffle_mask_zeta);
qword meld = spu_shuffle((qword)sum, target_temp, (vector unsigned char)shuffle_mask_beta);
-
-
-
+
+
+
out_temp0 = spu_shuffle(tgt_past, meld, shuffle_mask_delta);
out_temp1 = spu_shuffle(tgt_present, meld, shuffle_mask_epsilon);
-
+
si_stqd(out_temp0, (qword)address_counter_tgt, 0);
si_stqd(out_temp1, (qword)address_counter_tgt, 16);
-
+
return retval;
}
@@ -209,14 +209,14 @@ int main(){
vector_product_complex(res, pooh, bear, 48*sizeof(float));
-
+
for(i = 0; i < 48; ++i) {
printf("%f, ", res[i]);
}
printf("\n");
-
+
}
*/
diff --git a/volk/spu_lib/spu_memcpy_unaligned.c b/volk/spu_lib/spu_memcpy_unaligned.c
index 2a0dabcd7..0f15b5d80 100644
--- a/volk/spu_lib/spu_memcpy_unaligned.c
+++ b/volk/spu_lib/spu_memcpy_unaligned.c
@@ -5,14 +5,14 @@ void* libvector_memcpy_unaligned(void* target, void* src, unsigned int num_bytes
//loop iterator i
int i = 0;
void* retval = target;
-
+
//put the target and source addresses into qwords
vector unsigned int address_counter_tgt = {(unsigned int)target, 0, 0, 0};
vector unsigned int address_counter_src = {(unsigned int)src, 0, 0 ,0};
-
+
//create shuffle masks
-
+
//shuffle mask building blocks:
//all from the first vector
vector unsigned char oneup = {0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
@@ -20,9 +20,9 @@ void* libvector_memcpy_unaligned(void* target, void* src, unsigned int num_bytes
//all from the second vector
vector unsigned char second_oneup = {0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17,
0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f};
-
-
+
+
//gamma: second half of the second, first half of the first, break at (unsigned int)src%16
vector unsigned char src_cmp = spu_splats((unsigned char)((unsigned int)src%16));
vector unsigned char gt_res = spu_cmpgt(oneup, src_cmp);
@@ -30,16 +30,16 @@ void* libvector_memcpy_unaligned(void* target, void* src, unsigned int num_bytes
vector unsigned char cmp_res = spu_or(gt_res, eq_res);
vector unsigned char sixteen_uchar = spu_splats((unsigned char)16);
vector unsigned char phase_change = spu_and(sixteen_uchar, cmp_res);
- vector unsigned int shuffle_mask_gamma = spu_add((vector unsigned int)phase_change,
+ vector unsigned int shuffle_mask_gamma = spu_add((vector unsigned int)phase_change,
(vector unsigned int)oneup);
shuffle_mask_gamma = spu_rlqwbyte(shuffle_mask_gamma, (unsigned int)src%16);
-
-
+
+
vector unsigned char tgt_second = spu_rlqwbyte(second_oneup, -((unsigned int)target%16));
vector unsigned char tgt_first = spu_rlqwbyte(oneup, -((unsigned int)target%16));
-
+
//alpha: first half of first, second half of second, break at (unsigned int)target%16
src_cmp = spu_splats((unsigned char)((unsigned int)target%16));
gt_res = spu_cmpgt(oneup, src_cmp);
@@ -48,13 +48,13 @@ void* libvector_memcpy_unaligned(void* target, void* src, unsigned int num_bytes
phase_change = spu_and(sixteen_uchar, cmp_res);
vector unsigned int shuffle_mask_alpha = spu_add((vector unsigned int)phase_change,
(vector unsigned int)oneup);
-
+
//delta: first half of first, first half of second, break at (unsigned int)target%16
vector unsigned char shuffle_mask_delta = spu_shuffle(oneup, tgt_second, (vector unsigned char)shuffle_mask_alpha);
//epsilon: second half of second, second half of first, break at (unsigned int)target%16
vector unsigned char shuffle_mask_epsilon = spu_shuffle(tgt_second, oneup, (vector unsigned char)shuffle_mask_alpha);
//zeta: second half of second, first half of first, break at 16 - (unsigned int)target%16
- vector unsigned int shuffle_mask_zeta = spu_rlqwbyte(shuffle_mask_alpha, (unsigned int)target%16);
+ vector unsigned int shuffle_mask_zeta = spu_rlqwbyte(shuffle_mask_alpha, (unsigned int)target%16);
//beta: first half of first, second half of second, break at num_bytes%16
src_cmp = spu_splats((unsigned char)(num_bytes%16));
@@ -64,61 +64,61 @@ void* libvector_memcpy_unaligned(void* target, void* src, unsigned int num_bytes
phase_change = spu_and(sixteen_uchar, cmp_res);
vector unsigned int shuffle_mask_beta = spu_add((vector unsigned int)phase_change,
(vector unsigned int)oneup);
-
-
-
-
+
+
+
+
qword src_past;
qword src_present;
qword tgt_past;
qword tgt_present;
-
+
qword in_temp;
qword out_temp0;
qword out_temp1;
src_past = si_lqd((qword)address_counter_src, 0);
tgt_past = si_lqd((qword)address_counter_tgt, 0);
-
+
for(i = 0; i < num_bytes/16; ++i) {
-
+
src_present = si_lqd((qword)address_counter_src, 16);
tgt_present = si_lqd((qword)address_counter_tgt, 16);
-
+
in_temp = spu_shuffle(src_present, src_past, (vector unsigned char)shuffle_mask_gamma);
-
+
out_temp0 = spu_shuffle(tgt_past, in_temp, shuffle_mask_delta);
out_temp1 = spu_shuffle(tgt_present, in_temp, shuffle_mask_epsilon);
si_stqd(out_temp0, (qword)address_counter_tgt, 0);
si_stqd(out_temp1, (qword)address_counter_tgt, 16);
-
+
tgt_past = out_temp1;
src_past = src_present;
address_counter_src = spu_add(address_counter_src, 16);
address_counter_tgt = spu_add(address_counter_tgt, 16);
-
+
}
-
+
src_present = si_lqd((qword)address_counter_src, 16);
tgt_present = si_lqd((qword)address_counter_tgt, 16);
-
-
+
+
in_temp = spu_shuffle(src_present, src_past,(vector unsigned char) shuffle_mask_gamma);
qword target_temp = spu_shuffle(tgt_present, tgt_past, (vector unsigned char) shuffle_mask_zeta);
qword meld = spu_shuffle(in_temp, target_temp, (vector unsigned char)shuffle_mask_beta);
-
-
+
+
out_temp0 = spu_shuffle(tgt_past, meld, shuffle_mask_delta);
out_temp1 = spu_shuffle(tgt_present, meld, shuffle_mask_epsilon);
-
+
si_stqd(out_temp0, (qword)address_counter_tgt, 0);
si_stqd(out_temp1, (qword)address_counter_tgt, 16);
-
+
return retval;
}
@@ -133,9 +133,9 @@ void* mcpy(void* target, void* src, size_t num_bytes){
//put the target and source addresses into qwords
vector unsigned int address_counter_tgt = {(unsigned int)target, 0, 0, 0};
vector unsigned int address_counter_src = {(unsigned int)src, 0, 0 ,0};
-
+
//create shuffle masks
-
+
//shuffle mask building blocks:
//all from the first vector
vector unsigned char oneup = {0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
@@ -143,9 +143,9 @@ void* mcpy(void* target, void* src, size_t num_bytes){
//all from the second vector
vector unsigned char second_oneup = {0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17,
0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f};
-
-
+
+
//gamma: second half of the second, first half of the first, break at src%16
vector unsigned char src_cmp = spu_splats((unsigned char)(src%16));
vector unsigned char gt_res = spu_cmpgt(oneup, src_cmp);
@@ -153,16 +153,16 @@ void* mcpy(void* target, void* src, size_t num_bytes){
vector unsigned char cmp_res = spu_or(gt_res, eq_res);
vector unsigned char sixteen_uchar = spu_splats((unsigned char)16);
vector unsigned char phase_change = spu_and(sixteen_uchar, cmp_res);
- vector unsigned int shuffle_mask_gamma = spu_add((vector unsigned int)phase_change,
+ vector unsigned int shuffle_mask_gamma = spu_add((vector unsigned int)phase_change,
(vector unsigned int)oneup);
shuffle_mask_gamma = spu_rlqwbyte(shuffle_mask_gamma, src%16);
-
-
+
+
vector unsigned char tgt_second = spu_rlqwbyte(second_oneup, -(target%16));
vector unsigned char tgt_first = spu_rlqwbyte(oneup, -(target%16));
-
+
//alpha: first half of first, second half of second, break at target%16
src_cmp = spu_splats((unsigned char)(target%16));
gt_res = spu_cmpgt(oneup, src_cmp);
@@ -171,13 +171,13 @@ void* mcpy(void* target, void* src, size_t num_bytes){
phase_change = spu_and(sixteen_uchar, cmp_res);
vector unsigned int shuffle_mask_alpha = spu_add((vector unsigned int)phase_change,
(vector unsigned int)oneup);
-
+
//delta: first half of first, first half of second, break at target%16
vector unsigned char shuffle_mask_delta = spu_shuffle(oneup, tgt_second, (vector unsigned char)shuffle_mask_alpha);
//epsilon: second half of second, second half of first, break at target%16
vector unsigned char shuffle_mask_epsilon = spu_shuffle(tgt_second, oneup, (vector unsigned char)shuffle_mask_alpha);
//zeta: second half of second, first half of first, break at 16 - target%16
- vector unsigned int shuffle_mask_zeta = spu_rlqwbyte(shuffle_mask_alpha, target%16);
+ vector unsigned int shuffle_mask_zeta = spu_rlqwbyte(shuffle_mask_alpha, target%16);
//beta: first half of first, second half of second, break at num_bytes%16
src_cmp = spu_splats((unsigned char)(num_bytes%16));
@@ -187,10 +187,10 @@ void* mcpy(void* target, void* src, size_t num_bytes){
phase_change = spu_and(sixteen_uchar, cmp_res);
vector unsigned int shuffle_mask_beta = spu_add((vector unsigned int)phase_change,
(vector unsigned int)oneup);
-
-
+
+
printf("num_bytesmod16 %d\n", num_bytes%16);
- printf("beta %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d\n",
+ printf("beta %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d\n",
spu_extract((vector unsigned char) shuffle_mask_beta, 0),
spu_extract((vector unsigned char) shuffle_mask_beta, 1),
spu_extract((vector unsigned char) shuffle_mask_beta, 2),
@@ -207,64 +207,64 @@ void* mcpy(void* target, void* src, size_t num_bytes){
spu_extract((vector unsigned char) shuffle_mask_beta, 13),
spu_extract((vector unsigned char) shuffle_mask_beta, 14),
spu_extract((vector unsigned char) shuffle_mask_beta, 15));
-
-
-
-
+
+
+
+
qword src_past;
qword src_present;
qword tgt_past;
qword tgt_present;
-
+
qword in_temp;
qword out_temp0;
qword out_temp1;
src_past = si_lqd((qword)address_counter_src, 0);
tgt_past = si_lqd((qword)address_counter_tgt, 0);
-
+
for(i = 0; i < num_bytes/16; ++i) {
-
+
src_present = si_lqd((qword)address_counter_src, 16);
tgt_present = si_lqd((qword)address_counter_tgt, 16);
-
+
in_temp = spu_shuffle(src_present, src_past, (vector unsigned char)shuffle_mask_gamma);
-
+
out_temp0 = spu_shuffle(tgt_past, in_temp, shuffle_mask_delta);
out_temp1 = spu_shuffle(tgt_present, in_temp, shuffle_mask_epsilon);
si_stqd(out_temp0, (qword)address_counter_tgt, 0);
si_stqd(out_temp1, (qword)address_counter_tgt, 16);
-
+
tgt_past = out_temp1;
src_past = src_present;
address_counter_src = spu_add(address_counter_src, 16);
address_counter_tgt = spu_add(address_counter_tgt, 16);
-
+
}
-
+
src_present = si_lqd((qword)address_counter_src, 16);
tgt_present = si_lqd((qword)address_counter_tgt, 16);
-
-
+
+
in_temp = spu_shuffle(src_present, src_past,(vector unsigned char) shuffle_mask_gamma);
qword target_temp = spu_shuffle(tgt_present, tgt_past, (vector unsigned char) shuffle_mask_zeta);
qword meld = spu_shuffle(in_temp, target_temp, (vector unsigned char)shuffle_mask_beta);
-
-
+
+
out_temp0 = spu_shuffle(tgt_past, meld, shuffle_mask_delta);
out_temp1 = spu_shuffle(tgt_present, meld, shuffle_mask_epsilon);
-
+
si_stqd(out_temp0, (qword)address_counter_tgt, 0);
si_stqd(out_temp1, (qword)address_counter_tgt, 16);
return retval;
-
+
}
*/
/*
@@ -286,5 +286,5 @@ int main(){
}
printf("\n");
}
-
+
*/
diff --git a/volk/spu_lib/spu_memset_unaligned.S b/volk/spu_lib/spu_memset_unaligned.S
index a655c4c52..c260a125c 100644
--- a/volk/spu_lib/spu_memset_unaligned.S
+++ b/volk/spu_lib/spu_memset_unaligned.S
@@ -1,19 +1,19 @@
/* -*- asm -*- */
/*
* Copyright 2008 Free Software Foundation, Inc.
- *
+ *
* This file is part of GNU Radio
- *
+ *
* GNU Radio is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 3, or (at your option)
* any later version.
- *
+ *
* GNU Radio is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
- *
+ *
* You should have received a copy of the GNU General Public License along
* with this program; if not, write to the Free Software Foundation, Inc.,
* 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
@@ -33,11 +33,11 @@
* size_t i;
* for (i = 0; i < n; i++)
* p[i] = c;
- *
+ *
* return pv;
* }
*/
-
+
#define p_arg arg1 // we're going to clobber arg1 w/ the return value
#define c arg2 // the constant we're writing
#define n arg3 // how many bytes to write
@@ -50,16 +50,16 @@
#define an r18 // aligned n (n rounded down to mod 16 boundary)
#define next_p r19
#define cond1 r20
-#define cond2 r21
+#define cond2 r21
#define m r22
#define r r23
-
+
PROC_ENTRY(libvector_memset_unaligned)
-
+
// Hint the return from do_head, in case we go that way.
// There's pretty much nothing we can do to hint the branch to it.
hbrr do_head_br, head_complete
-
+
MR(p, p_arg) // leaves p, the return value, in the correct reg (r3)
BRZ_RETURN(n)
@@ -69,11 +69,11 @@
head_complete:
/*
- * preconditions:
+ * preconditions:
* p%16 == 0, n > 0
*/
hbrr middle_loop_br, middle_loop
-
+
ROUND_DOWN(an, n, 16) // an is "aligned n"
MODULO(n, n, 16) // what's left over in the last quad
brz an, do_tail // no whole quad words; skip to tail
@@ -96,20 +96,20 @@ middle_loop:
stqd c, 4*16(p)
stqd c, 5*16(p)
stqd c, 6*16(p)
-
+
MR(p, next_p)
stqd c, 7*16-128(next_p)
or cond2, n, an
middle_loop_br:
brnz cond1, middle_loop
-
+
/*
- * if an and n are both zero, return now
+ * if an and n are both zero, return now
*/
BRZ_RETURN(cond2)
/*
- * otherwise handle last of full quad words
+ * otherwise handle last of full quad words
*
* 0 <= an < 128, p%16 == 0
*/
@@ -119,18 +119,18 @@ middle2:
*/
brz an, do_tail
hbrr middle2_loop_br, middle2_loop
-
+
.p2align 3
-middle2_loop:
+middle2_loop:
ai next_p, p, 16
stqd c, 0(p)
ai an, an, -16
LMR(p, next_p)
middle2_loop_br:
brnz an, middle2_loop
-
+
/* We're done with the full quadwords. */
-
+
/*
* Handle the final partial quadword.
* We'll be modifying only the left hand portion of the quad.
@@ -146,7 +146,7 @@ do_tail:
shlqby mask, mask, t1
selb t0, old, c, mask
stqd t0, 0(p)
-do_tail_ret:
+do_tail_ret:
RETURN()
/*
@@ -176,7 +176,7 @@ do_head:
MR(t1, p)
sf t0, m, r // t0 = r - m
a p, p, m // p += m
- rotqby mask, mask, t0 // rotate 0's to the right place
+ rotqby mask, mask, t0 // rotate 0's to the right place
sf n, m, n // n -= m
selb t0, c, old, mask // merge
stqd t0, 0(t1)