/* -*- asm -*- */
/*
 * Copyright 2008 Free Software Foundation, Inc.
 * 
 * This file is part of GNU Radio
 * 
 * GNU Radio is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 3, or (at your option)
 * any later version.
 * 
 * GNU Radio is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 * 
 * You should have received a copy of the GNU General Public License along
 * with this program; if not, write to the Free Software Foundation, Inc.,
 * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 */

#include "gc_spu_macs.h"

	.file "spu_memset_unaligned.S"

	/*
	 * Computes this, only a lot faster...
	 *
	 *	void *
	 *	libvector_memset_unaligned(void *pv, int c, size_t n)
	 *	{
	 *	  unsigned char *p = (unsigned char *) pv;
	 *	  size_t i;
	 *	  for (i = 0; i < n; i++)
	 *	    p[i] = c;
	 *	
	 *	  return pv;
	 *	}
	 */
	
/*
 * Symbolic register names used by libvector_memset_unaligned.
 * arg1..arg3 are expected to come from gc_spu_macs.h (not shown here);
 * r13..r23 are used as scratch and are never saved or restored by this file.
 */
#define	p_arg	arg1	// pv as passed in; only read (copied into p), never written by this routine
#define	c	arg2	// the constant we're writing (splatted across all 16 bytes early on)
#define	n	arg3	// how many bytes are still to be written

#define	p	r13	// where we're writing
#define	t0	r14	// scratch
#define t1	r15	// scratch
#define	mask	r16	// byte-select mask handed to selb for partial-quadword merges
#define	old	r17	// previous contents of a quadword that is only partly overwritten
#define an	r18	// aligned n (n rounded down to mod 16 boundary)
#define	next_p	r19	// p advanced past the stores of the current loop iteration
#define	cond1	r20	// 128-byte loop: nonzero while at least 128 aligned bytes remain
#define	cond2	r21				// n | an: zero when nothing is left after the 128-byte loop
#define m	r22	// do_head: number of bytes written into the first quadword, min(r, n)
#define r	r23	// do_head: bytes from p up to the end of p's quadword
	
	/*
	 * void *libvector_memset_unaligned(void *pv, int c, size_t n)
	 *
	 * In:    p_arg = pv, c = fill value, n = byte count (see #defines above)
	 * Out:   p_arg is never written, so the incoming pointer is what the
	 *        caller gets back (per the original author, arg1 is the
	 *        return register, r3 -- confirm in gc_spu_macs.h)
	 * Uses:  r13..r23 as scratch, plus whatever the gc_spu_macs.h macros
	 *        use internally (not visible from this file)
	 *
	 * Structure:
	 *   do_head       partial first quadword (only if p%16 != 0)
	 *   middle_loop   whole quadwords, 128 bytes per iteration
	 *   middle2_loop  remaining whole quadwords, 16 bytes per iteration
	 *   do_tail       partial last quadword (only if n%16 != 0)
	 *
	 * Only bytes inside [pv, pv+n) are modified, and no quadword that
	 * lies entirely outside that range is loaded or stored.
	 */
	PROC_ENTRY(libvector_memset_unaligned)
	
	// Hint the branch back from do_head, in case we go that way.
	// There's pretty much nothing we can do to hint the branch to it.
	hbrr	do_head_br, head_complete
	
	MR(p, p_arg)	// work on a copy; p_arg keeps pv for the return value
	BRZ_RETURN(n)	// n == 0: nothing to write

	MODULO(t0, p, 16)	// is p%16 == 0?
	VSPLTB(c, c, 3)		// splat byte in preferred slot of c into all slots
	brnz	t0, do_head	// no, handle it
head_complete:

	/*
	 * preconditions:	
	 *   p%16 == 0, n > 0
	 */
	hbrr	middle_loop_br, middle_loop
	
	ROUND_DOWN(an, n, 16)	// an is "aligned n"
	MODULO(n, n, 16)	// what's left over in the last quad
	brz	an, do_tail	// no whole quad words; skip to tail (n > 0 here)
	clgti	t0, an, 127	// an >= 128?
	brz	t0, middle2	// nope, go handle the cases between 16 and 112

	/*
	 * 128 bytes / iteration
	 *
	 * loop invariant on entry: an >= 128, p%16 == 0
	 */
	.p2align 4
middle_loop:
	ai	an, an, -128
	  stqd	c,  0*16(p)
	ai	next_p, p, 128
	  stqd	c,  1*16(p)
	clgti	cond1, an, 127	// another full 128 bytes left? (unsigned, like the entry test)
	  stqd	c,  2*16(p)

	  stqd	c,  3*16(p)
	  stqd	c,  4*16(p)
	  stqd	c,  5*16(p)
	  stqd	c,  6*16(p)
	
	MR(p, next_p)
	  stqd	c,  7*16-128(next_p)	// == 7*16(old p); next_p is already p + 128
	or	cond2, n, an	// zero iff no whole quads and no tail bytes remain
middle_loop_br:
	  brnz	cond1, middle_loop
	
	/*
	 * if an and n are both zero, return now 
	 */
	BRZ_RETURN(cond2)

	/*
	 * otherwise handle last of full quad words 
	 *
	 *   0 <= an < 128, p%16 == 0, (an | n) != 0
	 */
middle2:
	/*
	 * if an == 0, go handle the final non-full quadword
	 * (n != 0 is guaranteed in that case)
	 */
	brz	an, do_tail
	hbrr	middle2_loop_br, middle2_loop
	
	/*
	 * 16 bytes / iteration; an is a nonzero multiple of 16 on entry
	 */
	.p2align 3
middle2_loop:	
	ai	next_p, p, 16
	  stqd	c, 0(p)
	ai	an, an, -16
	  LMR(p, next_p)
middle2_loop_br:
	  brnz	an, middle2_loop
	
	/*
	 * We're done with the full quadwords.
	 *
	 * If there are no leftover bytes, return here.  Falling into do_tail
	 * with n == 0 would build an all-zero mask and then load and store
	 * back, unchanged, the quadword just past the end of the buffer:
	 * an out-of-bounds read-modify-write that could lose a concurrent
	 * update (e.g. DMA) to that quadword.
	 */
	BRZ_RETURN(n)
	
	/*
	 * Handle the final partial quadword.
	 * We'll be modifying only the left hand portion of the quad.
	 *
	 * preconditions:
	 *   an == 0, 0 < n < 16, p%16 == 0
	 */
do_tail:
	HINT_RETURN(do_tail_ret)
	il	mask, -1		// all ones
	sfi	t1, n, 16		// t1 = 16 - n
	lqd	old, 0(p)		// current contents of the last quadword
	shlqby  mask, mask, t1		// ones in the left n bytes, zeros in the other 16 - n
	selb	t0, old, c, mask	// c where mask is 1, old elsewhere
	stqd	t0, 0(p)
do_tail_ret:	
	RETURN()

	/*
	 * ----------------------------------------------------------------
	 * Handle the first partial quadword
	 *
	 * preconditions:
	 *   p%16 != 0, n > 0
	 *
         * postconditions:
         *   p%16 == 0 or n == 0
         *
         *        |-- m --|
         *     +----------------+----------------+
         *     |  ////////      |                |
         *     +----------------+----------------+
         *        |----- r -----|
         *        p
         *
         *   r = bytes from p to the end of its quadword
         *   m = min(r, n) = bytes actually written here
         * ----------------------------------------------------------------
	 */
do_head:
	lqd	old, 0(p)	// lqd ignores the low 4 address bits: loads the quad containing p
	MODULO_NEG(r, p, 16)	// r = (-p) % 16 (per the diagram above)
	il	mask, -1
	UMIN(m, r, n)		// m = min(r, n), unsigned
	shlqby	mask, mask, m	// 1's in the top, m*8 0's in the bottom
	MR(t1, p)		// remember the original p for the store below
	sf	t0, m, r	// t0 = r - m
	a	p, p, m		// p += m
	rotqby	mask, mask, t0	// rotate 0's to the right place	
	sf	n, m, n		// n -= m
	selb	t0, c, old, mask // merge: c where mask is 0, old where mask is 1
	stqd	t0, 0(t1)	// stqd also ignores the low 4 address bits
	BRZ_RETURN(n)		// everything fit inside the first quadword
do_head_br:
	br	head_complete