/* volk/spu_lib/spu_memset_unaligned.S */

/* -*- asm -*- */
/*
 * Copyright 2008 Free Software Foundation, Inc.
 *
 * This file is part of GNU Radio
 *
 * GNU Radio is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 3, or (at your option)
 * any later version.
 *
 * GNU Radio is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License along
 * with this program; if not, write to the Free Software Foundation, Inc.,
 * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 */

#include "gc_spu_macs.h"

	.file "spu_memset_unaligned.S"

	/*
	 * Computes this, only a lot faster...
	 *
	 *	void *
	 *	libvector_memset_unaligned(void *pv, int c, size_t n)
	 *	{
	 *	  unsigned char *p = (unsigned char *) pv;
	 *	  size_t i;
	 *	  for (i = 0; i < n; i++)
	 *	    p[i] = c;
	 *
	 *	  return pv;
	 *	}
	 */

/*
 * Register aliases used by libvector_memset_unaligned.
 *
 * NOTE(review): arg1..arg3 are not defined in this file; presumably they
 * come from gc_spu_macs.h -- confirm there.
 */
#define	p_arg	arg1	// pointer as passed in; only ever read, so it is still in place at return
#define	c	arg2	// the constant we're writing
#define	n	arg3	// how many bytes to write

#define	p	r13	// where we're writing
#define	t0	r14	// scratch
#define t1	r15	// scratch
#define	mask	r16	// byte-select mask for merging a partial quadword
#define	old	r17	// previous contents of a partially written quadword
#define an	r18	// aligned n (n rounded down to mod 16 boundary)
#define	next_p	r19	// p advanced by one loop iteration
#define	cond1	r20	// middle_loop: nonzero while an > 127
#define	cond2	r21	// middle_loop: n | an, zero when nothing is left to write
#define m	r22	// do_head: bytes written into the first quad, UMIN(r, n)
#define r	r23	// do_head: bytes from p to the next 16-byte boundary (see diagram there)

	PROC_ENTRY(libvector_memset_unaligned)

	/*
	 * Fill n bytes starting at pv with the byte c and return pv.
	 * pv needs no particular alignment.
	 *
	 * Memory is only touched a whole quadword (16 bytes) at a time
	 * (lqd/stqd), so the work is split into four phases:
	 *
	 *   do_head       leading partial quad, when p%16 != 0
	 *                 (load / merge / store)
	 *   middle_loop   128 bytes per iteration while at least 128 remain
	 *   middle2_loop  16 bytes per iteration for the remaining whole quads
	 *   do_tail       trailing partial quad (load / merge / store)
	 *
	 * p_arg is never written, so the pointer that was passed in is still
	 * in place as the return value on every exit path.
	 *
	 * No quadword that lies completely outside [pv, pv+n) is ever
	 * loaded or stored.
	 *
	 * NOTE(review): MR, LMR, MODULO, MODULO_NEG, ROUND_DOWN, UMIN, VSPLTB,
	 * BRZ_RETURN, HINT_RETURN and RETURN are not defined in this file
	 * (presumably gc_spu_macs.h).  The comments below describe them as
	 * they are used here -- confirm against the header.
	 */

	// Hint the return from do_head, in case we go that way.
	// There's pretty much nothing we can do to hint the branch to it.
	hbrr	do_head_br, head_complete

	MR(p, p_arg)	// p = working pointer; p_arg itself is left untouched
	BRZ_RETURN(n)	// n == 0: nothing to write

	MODULO(t0, p, 16)	// is p%16 == 0?
	VSPLTB(c, c, 3)		// splat byte in preferred slot of c into all slots
	brnz	t0, do_head	// no, handle it
head_complete:

	/*
	 * preconditions:
	 *   p%16 == 0, n > 0
	 */
	hbrr	middle_loop_br, middle_loop

	ROUND_DOWN(an, n, 16)	// an is "aligned n"
	MODULO(n, n, 16)	// what's left over in the last quad
	brz	an, do_tail	// no whole quad words; skip to tail (n > 0 here)
	clgti	t0, an, 127	// an >= 128?
	brz	t0, middle2	// nope, go handle the cases between 0 and 112

	/*
	 * 128 bytes / iteration
	 *
	 * Entered only with an >= 128, so an stays >= 0 after the decrement.
	 */
	.p2align 4
middle_loop:
	ai	an, an, -128
	  stqd	c,  0*16(p)
	ai	next_p, p, 128
	  stqd	c,  1*16(p)
	cgti	cond1, an, 127		// another full 128-byte block left?
	  stqd	c,  2*16(p)

	  stqd	c,  3*16(p)
	  stqd	c,  4*16(p)
	  stqd	c,  5*16(p)
	  stqd	c,  6*16(p)

	MR(p, next_p)
	  stqd	c,  7*16-128(next_p)	// next_p == old p + 128: last quad of this block
	or	cond2, n, an		// cond2 == 0 iff nothing at all is left
middle_loop_br:
	  brnz	cond1, middle_loop

	/*
	 * if an and n are both zero, return now
	 */
	BRZ_RETURN(cond2)

	/*
	 * otherwise handle last of full quad words
	 *
	 *   0 <= an < 128, p%16 == 0
	 */
middle2:
	/*
	 * if an == 0, go handle the final non-full quadword
	 * (n != 0 in that case: cond2 was nonzero above)
	 */
	brz	an, do_tail
	hbrr	middle2_loop_br, middle2_loop

	.p2align 3
middle2_loop:
	ai	next_p, p, 16
	  stqd	c, 0(p)
	ai	an, an, -16
	  LMR(p, next_p)
middle2_loop_br:
	  brnz	an, middle2_loop

	/*
	 * We're done with the full quadwords.
	 *
	 * If there is no partial quad left over, return right here.
	 * Falling into do_tail with n == 0 would load and re-store the
	 * quadword at p, which lies entirely past the end of the buffer:
	 * an access outside [pv, pv+n) that can lose a concurrent update
	 * (e.g. an in-flight DMA) to that neighbouring quad.  This mirrors
	 * the early return after middle_loop.
	 */
	BRZ_RETURN(n)

	/*
	 * Handle the final partial quadword.
	 * We'll be modifying only the left hand portion of the quad.
	 *
	 * preconditions:
	 *   an == 0, 0 < n < 16, p%16 == 0
	 */
do_tail:
	HINT_RETURN(do_tail_ret)
	il	mask, -1
	sfi	t1, n, 16		// t1 = 16 - n
	lqd	old, 0(p)
	shlqby  mask, mask, t1		// 1's in the leftmost n bytes, 0's in the rest
	selb	t0, old, c, mask	// c where mask is 1, old contents elsewhere
	stqd	t0, 0(p)
do_tail_ret:
	RETURN()

	/*
	 * ----------------------------------------------------------------
	 * Handle the first partial quadword
	 *
	 * preconditions:
	 *   p%16 != 0
	 *
	 * postconditions:
	 *   p%16 == 0 or n == 0
	 *
	 *        |-- m --|
	 *     +----------------+----------------+
	 *     |  ////////      |                |
	 *     +----------------+----------------+
	 *        |----- r -----|
	 *        p
	 *
	 * r = bytes from p to the end of its quadword, m = UMIN(r, n) is
	 * how many of those we actually write.
	 * ----------------------------------------------------------------
	 */
do_head:
	lqd	old, 0(p)	// lqd ignores the low 4 address bits: quad containing p
	MODULO_NEG(r, p, 16)
	il	mask, -1
	UMIN(m, r, n)
	shlqby	mask, mask, m	// 1's in the top, m*8 0's in the bottom
	MR(t1, p)		// keep the address of this quad; p is about to move
	sf	t0, m, r	// t0 = r - m
	a	p, p, m		// p += m
	rotqby	mask, mask, t0	// rotate 0's to the right place
	sf	n, m, n		// n -= m
	selb	t0, c, old, mask // merge: c where mask is 0, old contents elsewhere
	stqd	t0, 0(t1)	// stqd ignores the low 4 address bits as well
	BRZ_RETURN(n)		// everything fit in the first quad
do_head_br:
	br	head_complete