path: root/volk/spu_lib/spu_memset_unaligned.S
author    Tom Rondeau    2010-12-07 18:50:28 -0500
committer Tom Rondeau    2010-12-07 18:50:28 -0500
commit 239144659b29c0a5ecd83a34e0e57387a1060ed7 (patch)
tree   3476e1c123da4696c64cc1756ddec5d971bcf9f2 /volk/spu_lib/spu_memset_unaligned.S
parent e13783aeb84a2c3656c3344a8d52fa2c9ee38a00 (diff)
download gnuradio-239144659b29c0a5ecd83a34e0e57387a1060ed7.tar.gz
         gnuradio-239144659b29c0a5ecd83a34e0e57387a1060ed7.tar.bz2
         gnuradio-239144659b29c0a5ecd83a34e0e57387a1060ed7.zip
Initial checkin for VOLK - Vector-Optimized Library of Kernels. This is a new SIMD library.
It currently stands by itself under the GNU Radio tree and can be used separately. We will integrate the build process into GNU Radio and start building on its functionality over time.
Diffstat (limited to 'volk/spu_lib/spu_memset_unaligned.S')
-rw-r--r--   volk/spu_lib/spu_memset_unaligned.S   185
1 file changed, 185 insertions, 0 deletions
diff --git a/volk/spu_lib/spu_memset_unaligned.S b/volk/spu_lib/spu_memset_unaligned.S
new file mode 100644
index 000000000..a655c4c52
--- /dev/null
+++ b/volk/spu_lib/spu_memset_unaligned.S
@@ -0,0 +1,185 @@
+/* -*- asm -*- */
+/*
+ * Copyright 2008 Free Software Foundation, Inc.
+ *
+ * This file is part of GNU Radio
+ *
+ * GNU Radio is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 3, or (at your option)
+ * any later version.
+ *
+ * GNU Radio is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write to the Free Software Foundation, Inc.,
+ * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ */
+
+#include "gc_spu_macs.h"
+
+ .file "spu_memset_unaligned.S"
+
+ /*
+ * Computes this, only a lot faster...
+ *
+ * void *
+ * libvector_memset_unaligned(void *pv, int c, size_t n)
+ * {
+ * unsigned char *p = (unsigned char *) pv;
+ * size_t i;
+ * for (i = 0; i < n; i++)
+ * p[i] = c;
+ *
+ * return pv;
+ * }
+ */
+
+#define p_arg arg1 // we're going to clobber arg1 w/ the return value
+#define c arg2 // the constant we're writing
+#define n arg3 // how many bytes to write
+
+#define p r13 // where we're writing
+#define t0 r14
+#define t1 r15
+#define mask r16
+#define old r17
+#define an r18 // aligned n (n rounded down to mod 16 boundary)
+#define next_p r19
+#define cond1 r20
+#define cond2 r21
+#define m r22
+#define r r23
+
+ PROC_ENTRY(libvector_memset_unaligned)
+
+ // Hint the return from do_head, in case we go that way.
+ // There's pretty much nothing we can do to hint the branch to it.
+ hbrr do_head_br, head_complete
+
+ MR(p, p_arg) // leaves p, the return value, in the correct reg (r3)
+ BRZ_RETURN(n)
+
+ MODULO(t0, p, 16) // is p%16 == 0?
+ VSPLTB(c, c, 3) // splat byte in preferred slot of c into all slots
+ brnz t0, do_head // no, handle it
+head_complete:
+
+ /*
+ * preconditions:
+ * p%16 == 0, n > 0
+ */
+ hbrr middle_loop_br, middle_loop
+
+ ROUND_DOWN(an, n, 16) // an is "aligned n"
+ MODULO(n, n, 16) // what's left over in the last quad
+ brz an, do_tail // no whole quad words; skip to tail
+ clgti t0, an, 127 // an >= 128?
+ brz t0, middle2 // nope, go handle the cases between 0 and 112
+
+ /*
+ * 128 bytes / iteration
+ */
+ .p2align 4
+middle_loop:
+ ai an, an, -128
+ stqd c, 0*16(p)
+ ai next_p, p, 128
+ stqd c, 1*16(p)
+ cgti cond1, an, 127
+ stqd c, 2*16(p)
+
+ stqd c, 3*16(p)
+ stqd c, 4*16(p)
+ stqd c, 5*16(p)
+ stqd c, 6*16(p)
+
+ MR(p, next_p)
+ stqd c, 7*16-128(next_p)
+ or cond2, n, an
+middle_loop_br:
+ brnz cond1, middle_loop
+
+ /*
+ * if an and n are both zero, return now
+ */
+ BRZ_RETURN(cond2)
+
+ /*
+ * otherwise handle last of full quad words
+ *
+ * 0 <= an < 128, p%16 == 0
+ */
+middle2:
+ /*
+ * if an == 0, go handle the final non-full quadword
+ */
+ brz an, do_tail
+ hbrr middle2_loop_br, middle2_loop
+
+ .p2align 3
+middle2_loop:
+ ai next_p, p, 16
+ stqd c, 0(p)
+ ai an, an, -16
+ LMR(p, next_p)
+middle2_loop_br:
+ brnz an, middle2_loop
+
+ /* We're done with the full quadwords. */
+
+ /*
+ * Handle the final partial quadword.
+ * We'll be modifying only the left hand portion of the quad.
+ *
+ * preconditions:
+ * an == 0, 0 <= n < 16, p%16 == 0
+ */
+do_tail:
+ HINT_RETURN(do_tail_ret)
+ il mask, -1
+ sfi t1, n, 16 // t1 = 16 - n
+ lqd old, 0(p)
+ shlqby mask, mask, t1
+ selb t0, old, c, mask
+ stqd t0, 0(p)
+do_tail_ret:
+ RETURN()
+
+ /*
+ * ----------------------------------------------------------------
+ * Handle the first partial quadword
+ *
+ * preconditions:
+ * p%16 != 0
+ *
+ * postconditions:
+ * p%16 == 0 or n == 0
+ *
+ *       |-- m --|
+ * +----------------+----------------+
+ * |     |///////|  |                |
+ * +----------------+----------------+
+ *       |--- r ----|
+ *       p
+ * ----------------------------------------------------------------
+ */
+do_head:
+ lqd old, 0(p)
+ MODULO_NEG(r, p, 16)
+ il mask, -1
+ UMIN(m, r, n)
+ shlqby mask, mask, m // 1's in the top, m*8 0's in the bottom
+ MR(t1, p)
+ sf t0, m, r // t0 = r - m
+ a p, p, m // p += m
+ rotqby mask, mask, t0 // rotate 0's to the right place
+ sf n, m, n // n -= m
+ selb t0, c, old, mask // merge
+ stqd t0, 0(t1)
+ BRZ_RETURN(n)
+do_head_br:
+ br head_complete
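
For reference, the following is a plain-C model of the strategy the assembly above implements. It is an illustrative sketch only: the names memset_unaligned_model, store_partial, and qword_t are invented for this example and are not part of the commit. The SPU can only load and store whole 16-byte quadwords, so the routine splits the buffer into a partial head quadword (do_head), a run of whole quadwords (middle_loop, unrolled to 128 bytes per iteration, with middle2_loop finishing the remaining whole quadwords), and a partial tail quadword (do_tail).

/* Plain-C model of libvector_memset_unaligned (illustrative only). */
#include <stddef.h>
#include <stdint.h>

typedef struct { uint8_t b[16]; } qword_t;    /* one 16-byte SPU quadword */

/* Fill bytes [lo, hi) of the quadword at q with c, leaving the other
 * bytes untouched -- the scalar equivalent of the lqd/selb/stqd
 * read-modify-write used in do_head and do_tail. */
static void store_partial(qword_t *q, int c, size_t lo, size_t hi)
{
    for (size_t i = lo; i < hi; i++)
        q->b[i] = (uint8_t) c;
}

void *memset_unaligned_model(void *pv, int c, size_t n)
{
    uint8_t *p = (uint8_t *) pv;
    size_t off = (uintptr_t) p % 16;           /* MODULO(t0, p, 16)          */

    if (off != 0 && n > 0) {                   /* do_head                    */
        size_t r = 16 - off;                   /* MODULO_NEG(r, p, 16)       */
        size_t m = r < n ? r : n;              /* UMIN(m, r, n)              */
        store_partial((qword_t *) (p - off), c, off, off + m);
        p += m;
        n -= m;
    }

    size_t an = n & ~(size_t) 15;              /* ROUND_DOWN(an, n, 16)      */
    n &= 15;                                   /* MODULO(n, n, 16)           */

    for (size_t i = 0; i < an; i += 16)        /* middle_loop / middle2_loop */
        store_partial((qword_t *) (p + i), c, 0, 16);
    p += an;

    if (n > 0)                                 /* do_tail                    */
        store_partial((qword_t *) p, c, 0, n);

    return pv;
}

In this model the partial-quadword merges are written as byte loops; on the SPU they are done branch-free by splatting c into all 16 byte lanes (VSPLTB), building a byte mask, and selecting between the old and new quadwords with selb, as the assembly above shows.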