From d5e5244a9ce8c5c3847c133c1107fba2af0e90c3 Mon Sep 17 00:00:00 2001 From: eb Date: Sun, 4 May 2008 06:45:19 +0000 Subject: Merged eb/gcell-wip -r8302:8307 into trunk. This changeset includes an example of SPE assembler (memset.S), a new file of macros for use with SPE assembler (gc_spu_macs.h), and an extended QA framework. The easy to use QA framework adds support for SPE utility code that's usually not called from the PPE, such as memset. See qa_gcell_general.{h,cc} and qa_memset.c for example usage. memset achieves 44GB/s on the SPE. That's within 3% of ideal. git-svn-id: http://gnuradio.org/svn/gnuradio/trunk@8308 221aa14e-8319-0410-a670-987f0aec2ac5 --- gcell/src/lib/general/spu/gc_spu_macs.h | 380 ++++++++++++++++++++++++++++++++ gcell/src/lib/general/spu/memset.S | 185 ++++++++++++++++ gcell/src/lib/general/spu/qa_memset.c | 201 +++++++++++++++++ 3 files changed, 766 insertions(+) create mode 100644 gcell/src/lib/general/spu/gc_spu_macs.h create mode 100644 gcell/src/lib/general/spu/memset.S create mode 100644 gcell/src/lib/general/spu/qa_memset.c (limited to 'gcell/src/lib/general/spu') diff --git a/gcell/src/lib/general/spu/gc_spu_macs.h b/gcell/src/lib/general/spu/gc_spu_macs.h new file mode 100644 index 000000000..8e3e3f2a6 --- /dev/null +++ b/gcell/src/lib/general/spu/gc_spu_macs.h @@ -0,0 +1,380 @@ +/* -*- asm -*- */ +/* + * Copyright 2008 Free Software Foundation, Inc. + * + * This file is part of GNU Radio + * + * GNU Radio is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 3, or (at your option) + * any later version. + * + * GNU Radio is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write to the Free Software Foundation, Inc., + * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + */ + +#ifndef INCLUDED_GC_SPU_MACS_H +#define INCLUDED_GC_SPU_MACS_H + +/* + * This file contains a set of macros that are generally useful when + * coding in SPU assembler + * + * Note that the multi-instruction macros in here may overwrite + * registers 77, 78, and 79 without warning. + */ + +/* + * defines for all registers + */ +#define r0 $0 +#define r1 $1 +#define r2 $2 +#define r3 $3 +#define r4 $4 +#define r5 $5 +#define r6 $6 +#define r7 $7 +#define r8 $8 +#define r9 $9 +#define r10 $10 +#define r11 $11 +#define r12 $12 +#define r13 $13 +#define r14 $14 +#define r15 $15 +#define r16 $16 +#define r17 $17 +#define r18 $18 +#define r19 $19 +#define r20 $20 +#define r21 $21 +#define r22 $22 +#define r23 $23 +#define r24 $24 +#define r25 $25 +#define r26 $26 +#define r27 $27 +#define r28 $28 +#define r29 $29 +#define r30 $30 +#define r31 $31 +#define r32 $32 +#define r33 $33 +#define r34 $34 +#define r35 $35 +#define r36 $36 +#define r37 $37 +#define r38 $38 +#define r39 $39 +#define r40 $40 +#define r41 $41 +#define r42 $42 +#define r43 $43 +#define r44 $44 +#define r45 $45 +#define r46 $46 +#define r47 $47 +#define r48 $48 +#define r49 $49 +#define r50 $50 +#define r51 $51 +#define r52 $52 +#define r53 $53 +#define r54 $54 +#define r55 $55 +#define r56 $56 +#define r57 $57 +#define r58 $58 +#define r59 $59 +#define r60 $60 +#define r61 $61 +#define r62 $62 +#define r63 $63 +#define r64 $64 +#define r65 $65 +#define r66 $66 +#define r67 $67 +#define r68 $68 +#define r69 $69 +#define r70 $70 +#define r71 $71 +#define r72 $72 +#define r73 $73 +#define r74 $74 +#define r75 $75 +#define r76 $76 +#define r77 $77 +#define r78 $78 +#define r79 $79 +#define r80 $80 +#define r81 $81 +#define r82 $82 +#define r83 $83 +#define r84 $84 +#define r85 $85 
+#define r86 $86 +#define r87 $87 +#define r88 $88 +#define r89 $89 +#define r90 $90 +#define r91 $91 +#define r92 $92 +#define r93 $93 +#define r94 $94 +#define r95 $95 +#define r96 $96 +#define r97 $97 +#define r98 $98 +#define r99 $99 +#define r100 $100 +#define r101 $101 +#define r102 $102 +#define r103 $103 +#define r104 $104 +#define r105 $105 +#define r106 $106 +#define r107 $107 +#define r108 $108 +#define r109 $109 +#define r110 $110 +#define r111 $111 +#define r112 $112 +#define r113 $113 +#define r114 $114 +#define r115 $115 +#define r116 $116 +#define r117 $117 +#define r118 $118 +#define r119 $119 +#define r120 $120 +#define r121 $121 +#define r122 $122 +#define r123 $123 +#define r124 $124 +#define r125 $125 +#define r126 $126 +#define r127 $127 + + +#define lr r0 // link register +#define sp r1 // stack pointer + // r2 is environment pointer for langs that need it (ALGOL) + +#define retval r3 // return values are passed in regs starting at r3 + +#define arg1 r3 // args are passed in regs starting at r3 +#define arg2 r4 +#define arg3 r5 +#define arg4 r6 +#define arg5 r7 +#define arg6 r8 +#define arg7 r9 +#define arg8 r10 +#define arg9 r11 +#define arg10 r12 + +// r3 - r74 are volatile (caller saves) +// r75 - r79 are volatile (scratch regs possibly destroyed by fct prolog/epilog) +// r80 - r127 are non-volatile (callee saves) + +// scratch registers reserved for use by the macros in this file.
+ +#define _gc_t0 r79 +#define _gc_t1 r78 +#define _gc_t2 r77 + +/* + * ---------------------------------------------------------------- + * pseudo ops + * ---------------------------------------------------------------- + */ +#define PROC_ENTRY(name) \ + .text; \ + .p2align 4; \ + .global name; \ + .type name, @function; \ +name: + +/* + * ---------------------------------------------------------------- + * aliases for common operations + * ---------------------------------------------------------------- + */ + +// Move register (even pipe, 2 cycles) +#define MR(rt, ra) or rt, ra, ra; + +// Move register (odd pipe, 4 cycles) +#define LMR(rt, ra) rotqbyi rt, ra, 0; + +// return +#define RETURN() bi lr; + +// hint for a return +#define HINT_RETURN(ret_label) hbr ret_label, lr; + +// return if zero +#define BRZ_RETURN(rt) biz rt, lr; + +// return if not zero +#define BRNZ_RETURN(rt) binz rt, lr; + +// return if halfword zero +#define BRHZ_RETURN(rt) bihz rt, lr; + +// return if halfword not zero +#define BRHNZ_RETURN(rt) bihnz rt, lr; + + +/* + * ---------------------------------------------------------------- + * modulo like things for constant moduli that are powers of 2 + * ---------------------------------------------------------------- + */ + +// rt = ra & (pow2 - 1) +#define MODULO(rt, ra, pow2) \ + andi rt, ra, (pow2)-1; + +// rt = pow2 - (ra & (pow2 - 1)) +#define MODULO_NEG(rt, ra, pow2) \ + andi rt, ra, (pow2)-1; \ + sfi rt, rt, (pow2); + +// rt = ra & -(pow2) +#define ROUND_DOWN(rt, ra, pow2) \ + andi rt, ra, -(pow2); + +// rt = (ra + (pow2 - 1)) & -(pow2) +#define ROUND_UP(rt, ra, pow2) \ + ai rt, ra, (pow2)-1; \ + andi rt, rt, -(pow2); + +/* + * ---------------------------------------------------------------- + * Splat - replicate a particular slot into all slots + * Altivec analogs... 
+ * ---------------------------------------------------------------- + */ + +// replicate byte from slot s [0,15] +// (ilh fills every byte of the shufb pattern with the index s) +#define VSPLTB(rt, ra, s) \ + ilh _gc_t0, (s)*0x0101; \ + shufb rt, ra, ra, _gc_t0; + +// replicate halfword from slot s [0,7] +// (each pattern halfword is the byte pair 2s, 2s+1) +#define VSPLTH(rt, ra, s) \ + ilh _gc_t0, 2*(s)*0x0101 + 0x0001; \ + shufb rt, ra, ra, _gc_t0; + +// replicate word from slot s [0,3] +// (ilhu/iohl build the per-word pattern 4s, 4s+1, 4s+2, 4s+3) +#define VSPLTW(rt, ra, s) \ + ilhu _gc_t0, 4*(s)*0x0101 + 0x0001; \ + iohl _gc_t0, 4*(s)*0x0101 + 0x0203; \ + shufb rt, ra, ra, _gc_t0; + +// replicate double from slot s [0,1] +#define VSPLTD(rt, ra, s) \ + /* sp is always 16-byte aligned */ \ + cdd _gc_t0, 8(sp); /* 0x10111213 14151617 00010203 04050607 */ \ + rotqbyi rt, ra, (s) << 3; /* rotate double into preferred slot */ \ + shufb rt, rt, rt, _gc_t0; + +/* + * ---------------------------------------------------------------- + * lots of min/max variations... + * + * On a slot by slot basis, compute the min or max + * + * U - unsigned, else signed + * B,H,{} - byte, halfword, word + * F float + * + * Each macro writes the comparison mask (ra > rb) to _gc_t0, then + * selb picks rb (MIN) or ra (MAX) wherever the mask is set. + * ---------------------------------------------------------------- + */ + +#define MIN_SELB(rt, ra, rb, rc) selb rt, ra, rb, rc; +#define MAX_SELB(rt, ra, rb, rc) selb rt, rb, ra, rc; + + // words + +#define MIN(rt, ra, rb) \ + cgt _gc_t0, ra, rb; \ + MIN_SELB(rt, ra, rb, _gc_t0) + +#define MAX(rt, ra, rb) \ + cgt _gc_t0, ra, rb; \ + MAX_SELB(rt, ra, rb, _gc_t0) + +#define UMIN(rt, ra, rb) \ + clgt _gc_t0, ra, rb; \ + MIN_SELB(rt, ra, rb, _gc_t0) + +#define UMAX(rt, ra, rb) \ + clgt _gc_t0, ra, rb; \ + MAX_SELB(rt, ra, rb, _gc_t0) + + // bytes + +#define MINB(rt, ra, rb) \ + cgtb _gc_t0, ra, rb; \ + MIN_SELB(rt, ra, rb, _gc_t0) + +#define MAXB(rt, ra, rb) \ + cgtb _gc_t0, ra, rb; \ + MAX_SELB(rt, ra, rb, _gc_t0) + +#define UMINB(rt, ra, rb) \ + clgtb _gc_t0, ra, rb; \ + MIN_SELB(rt, ra, rb, _gc_t0) + +#define UMAXB(rt, ra, rb) \ + clgtb _gc_t0, ra, rb; \ + MAX_SELB(rt, ra, rb, _gc_t0) + + // halfwords + +#define MINH(rt, ra, rb) \ + cgth _gc_t0,
ra, rb; \ + MIN_SELB(rt, ra, rb, _gc_t0) + +#define MAXH(rt, ra, rb) \ + cgth _gc_t0, ra, rb; \ + MAX_SELB(rt, ra, rb, _gc_t0) + +#define UMINH(rt, ra, rb) \ + clgth _gc_t0, ra, rb; \ + MIN_SELB(rt, ra, rb, _gc_t0) + +#define UMAXH(rt, ra, rb) \ + clgth _gc_t0, ra, rb; \ + MAX_SELB(rt, ra, rb, _gc_t0) + + // floats + +#define FMIN(rt, ra, rb) \ + fcgt _gc_t0, ra, rb; \ + MIN_SELB(rt, ra, rb, _gc_t0) + +#define FMAX(rt, ra, rb) \ + fcgt _gc_t0, ra, rb; \ + MAX_SELB(rt, ra, rb, _gc_t0) + +// Ignoring the sign, select the values with the minimum magnitude +#define FMINMAG(rt, ra, rb) \ + fcmgt _gc_t0, ra, rb; \ + MIN_SELB(rt, ra, rb, _gc_t0) + +// Ignoring the sign, select the values with the maximum magnitude +#define FMAXMAG(rt, ra, rb) \ + fcmgt _gc_t0, ra, rb; \ + MAX_SELB(rt, ra, rb, _gc_t0) + + +#endif /* INCLUDED_GC_SPU_MACS_H */ diff --git a/gcell/src/lib/general/spu/memset.S b/gcell/src/lib/general/spu/memset.S new file mode 100644 index 000000000..88e2dbea1 --- /dev/null +++ b/gcell/src/lib/general/spu/memset.S @@ -0,0 +1,185 @@ +/* -*- asm -*- */ +/* + * Copyright 2008 Free Software Foundation, Inc. + * + * This file is part of GNU Radio + * + * GNU Radio is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 3, or (at your option) + * any later version. + * + * GNU Radio is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write to the Free Software Foundation, Inc., + * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + */ + +#include <gc_spu_macs.h> + + .file "memset.S" + + /* + * Computes this, only a lot faster...
+ * + * void * + * memset(void *pv, int c, size_t n) + * { + * unsigned char *p = (unsigned char *) pv; + * size_t i; + * for (i = 0; i < n; i++) + * p[i] = c; + * + * return pv; + * } + */ + +#define p_arg arg1 // incoming pointer; arg1 is also retval (r3), so it is never written +#define c arg2 // the constant we're writing +#define n arg3 // how many bytes to write + +#define p r13 // where we're writing +#define t0 r14 +#define t1 r15 +#define mask r16 +#define old r17 +#define an r18 // aligned n (n rounded down to mod 16 boundary) +#define next_p r19 +#define cond1 r20 +#define cond2 r21 +#define m r22 +#define r r23 + + PROC_ENTRY(memset) + + // Hint the return from do_head, in case we go that way. + // There's pretty much nothing we can do to hint the branch to it. + hbrr do_head_br, head_complete + + MR(p, p_arg) // work on a copy in p; the return value stays in p_arg (r3) + BRZ_RETURN(n) + + MODULO(t0, p, 16) // is p%16 == 0? + VSPLTB(c, c, 3) // splat byte in preferred slot of c into all slots + brnz t0, do_head // no, handle it +head_complete: + + /* + * preconditions: + * p%16 == 0, n > 0 + */ + hbrr middle_loop_br, middle_loop + + ROUND_DOWN(an, n, 16) // an is "aligned n" + MODULO(n, n, 16) // what's left over in the last quad + brz an, do_tail // no whole quad words; skip to tail + clgti t0, an, 127 // an >= 128?
+ brz t0, middle2 // nope, go handle the cases between 0 and 112 + + /* + * 128 bytes / iteration + */ + .p2align 4 +middle_loop: + ai an, an, -128 + stqd c, 0*16(p) + ai next_p, p, 128 + stqd c, 1*16(p) + cgti cond1, an, 127 + stqd c, 2*16(p) + + stqd c, 3*16(p) + stqd c, 4*16(p) + stqd c, 5*16(p) + stqd c, 6*16(p) + + MR(p, next_p) + stqd c, 7*16-128(next_p) + or cond2, n, an +middle_loop_br: + brnz cond1, middle_loop + + /* + * if an and n are both zero, return now + */ + BRZ_RETURN(cond2) + + /* + * otherwise handle last of full quad words + * + * 0 <= an < 128, p%16 == 0 + */ +middle2: + /* + * if an == 0, go handle the final non-full quadword + */ + brz an, do_tail + hbrr middle2_loop_br, middle2_loop + + .p2align 3 +middle2_loop: + ai next_p, p, 16 + stqd c, 0(p) + ai an, an, -16 + LMR(p, next_p) +middle2_loop_br: + brnz an, middle2_loop + + /* We're done with the full quadwords. */ + + /* + * Handle the final partial quadword. + * We'll be modifying only the left hand portion of the quad. 
+ * + * preconditions: + * an == 0, 0 <= n < 16, p%16 == 0 + */ +do_tail: + HINT_RETURN(do_tail_ret) + il mask, -1 + sfi t1, n, 16 // t1 = 16 - n + lqd old, 0(p) + shlqby mask, mask, t1 + selb t0, old, c, mask + stqd t0, 0(p) +do_tail_ret: + RETURN() + + /* + * ---------------------------------------------------------------- + * Handle the first partial quadword + * + * preconditions: + * p%16 != 0 + * + * postconditions: + * p%16 == 0 or n == 0 + * + * |-- m --| + * +----------------+----------------+ + * | //////// | | + * +----------------+----------------+ + * |----- r -----| + * p + * ---------------------------------------------------------------- + */ +do_head: + lqd old, 0(p) + MODULO_NEG(r, p, 16) + il mask, -1 + UMIN(m, r, n) + shlqby mask, mask, m // 1's in the top, m*8 0's in the bottom + MR(t1, p) + sf t0, m, r // t0 = r - m + a p, p, m // p += m + rotqby mask, mask, t0 // rotate 0's to the right place + sf n, m, n // n -= m + selb t0, c, old, mask // merge + stqd t0, 0(t1) + BRZ_RETURN(n) +do_head_br: + br head_complete diff --git a/gcell/src/lib/general/spu/qa_memset.c b/gcell/src/lib/general/spu/qa_memset.c new file mode 100644 index 000000000..0d35a423f --- /dev/null +++ b/gcell/src/lib/general/spu/qa_memset.c @@ -0,0 +1,201 @@ +/* -*- c++ -*- */ +/* + * Copyright 2008 Free Software Foundation, Inc. + * + * This file is part of GNU Radio + * + * GNU Radio is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 3, or (at your option) + * any later version. + * + * GNU Radio is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write to the Free Software Foundation, Inc., + * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + */ + +#include +#include +#include +#include +#include + +/* NOTE(review): the five include operands above were lost in extraction; restore them from the repository. */ + +#define MAX_QA_BYTES 1024 +#define MAX_OFFSET 32 +#define ALIGNMENT 16 +#define K 0xA5 + +// FIXME should be passed at gcell init time +//static const int TIMEBASE = 79800000; // ps3 +static const int TIMEBASE = 26666666; // qs21 + +typedef void* (*memset_fptr)(void *s, int val, size_t n); + +/* Byte-at-a-time reference implementation; returns sv. */ +void * +memset_ref(void *sv, int c, size_t n) +{ + unsigned char *s = (unsigned char *) sv; + size_t i; + for (i = 0; i < n; i++) + s[i] = c; + + return sv; +} + +/* + * Check that the 16 bytes just before the working region p are undamaged, + * i.e. each still equals the low 8 bits of its offset from buf. + * Prints one line per mismatch; returns true when all 16 match. + */ +static bool +check_before(unsigned char *buf, size_t len, size_t offset) +{ + unsigned char *p = buf + sizeof(vector unsigned char) + offset; + bool ok = true; + int i; + + for (i = -16; i < 0; i++){ + unsigned char expected = (&p[i] - buf) & 0xff; + if (p[i] != expected){ + /* args match the conversions one-to-one: p, K, len, index, expected, got */ + printf("b:memset(%p, 0x%x, %zu) [%3d] expected %02x, got %02x\n", + (void *) p, K, len, i, expected, p[i]); + ok = false; + } + } + return ok; +} + +/* + * Check that all len bytes of the working region p equal K. + * Prints one line per mismatch; returns true when every byte matches. + */ +static bool +check_middle(unsigned char *buf, size_t len, size_t offset) +{ + unsigned char *p = buf + sizeof(vector unsigned char) + offset; + bool ok = true; + size_t i; + + for (i = 0; i < len; i++){ + unsigned char expected = K; + if (p[i] != expected){ + /* i is a size_t, hence %zu */ + printf("m:memset(%p, 0x%x, %zu) [%3zu] expected %02x, got %02x\n", + (void *) p, K, len, i, expected, p[i]); + ok = false; + } + } + return ok; +} + +/* + * Check that the 16 bytes just past the working region (p[len] .. p[len+15]) + * are undamaged, i.e. each still equals the low 8 bits of its offset from buf. + * Prints one line per mismatch; returns true when all 16 match. + */ +static bool +check_after(unsigned char *buf, size_t len, size_t offset) +{ + unsigned char *p = buf + sizeof(vector unsigned char) + offset; + bool ok = true; + size_t i; + + for (i = len; i < len + 16; i++){ + unsigned char expected = (&p[i] - buf) & 0xff; + if (p[i] != expected){ + printf("a:memset(%p, 0x%x, %zu) [%3zu] expected %02x, got %02x\n", + (void *) p, K, len, i, expected, p[i]); + ok = false; + } + } + return ok; +} + +
+static bool +test_memset_aux(memset_fptr f, + unsigned char *buf, size_t buflen, size_t len, size_t offset) +{ + size_t i; + + // init buffer to non-zero known state + for (i = 0; i < buflen; i++) + buf[i] = i; + + // Our working buffer. Starts 16 bytes + offset into buf. + // We offset by 16 so that we can see if data before is getting damaged. + unsigned char *p = buf + sizeof(vector unsigned char) + offset; + + (*f)(p, K, len); + + bool ok = true; + ok &= check_before(buf, len, offset); + ok &= check_middle(buf, len, offset); + ok &= check_after(buf, len, offset); + + return ok; +} + +bool +test_memset(memset_fptr f) +{ + size_t BUFLEN = MAX_QA_BYTES + 2*sizeof(vector unsigned char) + MAX_OFFSET; + unsigned char unaligned_buf[BUFLEN + ALIGNMENT -1]; + unsigned char *aligned_buf = + (unsigned char *)((((intptr_t) unaligned_buf) + ALIGNMENT - 1) & -ALIGNMENT); + + // printf("unaligned = %p\n", unaligned_buf); + // printf("aligned = %p\n", aligned_buf); + + size_t len; + size_t offset; + bool ok = true; + + for (len = 0; len < MAX_QA_BYTES; len++){ + for (offset = 0; offset <= MAX_OFFSET; offset++){ + ok &= test_memset_aux(f, aligned_buf, BUFLEN, len, offset); + } + } + + return ok; +} + +// returns bytes/s +float +benchmark_memset(memset_fptr f, bool aligned) +{ + static const int SIZE = 32768; + unsigned char buf[SIZE]; + uint32_t t0, t1; + int nbytes; + + spu_write_decrementer(0xffffffff); + + if (aligned){ + nbytes = SIZE; + t0 = spu_read_decrementer(); + (*f)(buf, 0x55, nbytes); + (*f)(buf, 0x55, nbytes); + (*f)(buf, 0x55, nbytes); + (*f)(buf, 0x55, nbytes); + t1 = spu_read_decrementer(); + } + else { + nbytes = SIZE - 2; + t0 = spu_read_decrementer(); + (*f)(buf + 1, 0x55, nbytes); + (*f)(buf + 1, 0x55, nbytes); + (*f)(buf + 1, 0x55, nbytes); + (*f)(buf + 1, 0x55, nbytes); + t1 = spu_read_decrementer(); + } + + //printf("delta ticks: %d\n", t0 - t1); + return (float) nbytes * 4 / ((t0 - t1) * 1.0/TIMEBASE); +} + +/* + * Implement the standard QA stub. 
+ * No input arguments, 1 bool output. + */ +static void +gcs_qa_memset(const gc_job_direct_args_t *input _UNUSED, + gc_job_direct_args_t *output, + const gc_job_ea_args_t *eaa _UNUSED) +{ + /* Run the len/offset sweep in test_memset against memset + * (presumably the memset.S routine at link time -- confirm). */ + bool ok = test_memset(memset); + /* The single output value: the bool result stored as u32. */ + output->arg[0].u32 = ok; +} + +GC_DECLARE_PROC(gcs_qa_memset, "qa_memset"); -- cgit