diff options
Diffstat (limited to 'arch/blackfin/lib')
-rw-r--r-- | arch/blackfin/lib/Makefile | 11 | ||||
-rw-r--r-- | arch/blackfin/lib/ashldi3.c | 35 | ||||
-rw-r--r-- | arch/blackfin/lib/ashrdi3.c | 36 | ||||
-rw-r--r-- | arch/blackfin/lib/divsi3.S | 199 | ||||
-rw-r--r-- | arch/blackfin/lib/gcclib.h | 24 | ||||
-rw-r--r-- | arch/blackfin/lib/ins.S | 118 | ||||
-rw-r--r-- | arch/blackfin/lib/lshrdi3.c | 35 | ||||
-rw-r--r-- | arch/blackfin/lib/memchr.S | 47 | ||||
-rw-r--r-- | arch/blackfin/lib/memcmp.S | 92 | ||||
-rw-r--r-- | arch/blackfin/lib/memcpy.S | 124 | ||||
-rw-r--r-- | arch/blackfin/lib/memmove.S | 93 | ||||
-rw-r--r-- | arch/blackfin/lib/memset.S | 87 | ||||
-rw-r--r-- | arch/blackfin/lib/modsi3.S | 57 | ||||
-rw-r--r-- | arch/blackfin/lib/muldi3.S | 74 | ||||
-rw-r--r-- | arch/blackfin/lib/outs.S | 68 | ||||
-rw-r--r-- | arch/blackfin/lib/smulsi3_highpart.S | 38 | ||||
-rw-r--r-- | arch/blackfin/lib/strcmp.S | 43 | ||||
-rw-r--r-- | arch/blackfin/lib/strcpy.S | 35 | ||||
-rw-r--r-- | arch/blackfin/lib/strncmp.S | 52 | ||||
-rw-r--r-- | arch/blackfin/lib/strncpy.S | 85 | ||||
-rw-r--r-- | arch/blackfin/lib/udivsi3.S | 277 | ||||
-rw-r--r-- | arch/blackfin/lib/umodsi3.S | 49 | ||||
-rw-r--r-- | arch/blackfin/lib/umulsi3_highpart.S | 31 |
23 files changed, 1710 insertions, 0 deletions
diff --git a/arch/blackfin/lib/Makefile b/arch/blackfin/lib/Makefile new file mode 100644 index 00000000..42c47dc9 --- /dev/null +++ b/arch/blackfin/lib/Makefile @@ -0,0 +1,11 @@ +# +# arch/blackfin/lib/Makefile +# + +lib-y := \ + ashldi3.o ashrdi3.o lshrdi3.o \ + muldi3.o divsi3.o udivsi3.o modsi3.o umodsi3.o \ + memcpy.o memset.o memcmp.o memchr.o memmove.o \ + strcmp.o strcpy.o strncmp.o strncpy.o \ + umulsi3_highpart.o smulsi3_highpart.o \ + ins.o outs.o diff --git a/arch/blackfin/lib/ashldi3.c b/arch/blackfin/lib/ashldi3.c new file mode 100644 index 00000000..ab69d876 --- /dev/null +++ b/arch/blackfin/lib/ashldi3.c @@ -0,0 +1,35 @@ +/* + * Copyright 2004-2009 Analog Devices Inc. + * + * Licensed under the GPL-2 or later. + */ + +#include "gcclib.h" + +#ifdef CONFIG_ARITHMETIC_OPS_L1 +DItype __ashldi3(DItype u, word_type b)__attribute__((l1_text)); +#endif + +DItype __ashldi3(DItype u, word_type b) +{ + DIunion w; + word_type bm; + DIunion uu; + + if (b == 0) + return u; + + uu.ll = u; + + bm = (sizeof(SItype) * BITS_PER_UNIT) - b; + if (bm <= 0) { + w.s.low = 0; + w.s.high = (USItype) uu.s.low << -bm; + } else { + USItype carries = (USItype) uu.s.low >> bm; + w.s.low = (USItype) uu.s.low << b; + w.s.high = ((USItype) uu.s.high << b) | carries; + } + + return w.ll; +} diff --git a/arch/blackfin/lib/ashrdi3.c b/arch/blackfin/lib/ashrdi3.c new file mode 100644 index 00000000..b5b351e8 --- /dev/null +++ b/arch/blackfin/lib/ashrdi3.c @@ -0,0 +1,36 @@ +/* + * Copyright 2004-2009 Analog Devices Inc. + * + * Licensed under the GPL-2 or later. + */ + +#include "gcclib.h" + +#ifdef CONFIG_ARITHMETIC_OPS_L1 +DItype __ashrdi3(DItype u, word_type b)__attribute__((l1_text)); +#endif + +DItype __ashrdi3(DItype u, word_type b) +{ + DIunion w; + word_type bm; + DIunion uu; + + if (b == 0) + return u; + + uu.ll = u; + + bm = (sizeof(SItype) * BITS_PER_UNIT) - b; + if (bm <= 0) { + /* w.s.high = 1..1 or 0..0 */ + w.s.high = uu.s.high >> (sizeof(SItype) * BITS_PER_UNIT - 1); + w.s.low = uu.s.high >> -bm; + } else { + USItype carries = (USItype) uu.s.high << bm; + w.s.high = uu.s.high >> b; + w.s.low = ((USItype) uu.s.low >> b) | carries; + } + + return w.ll; +} diff --git a/arch/blackfin/lib/divsi3.S b/arch/blackfin/lib/divsi3.S new file mode 100644 index 00000000..f89c5a49 --- /dev/null +++ b/arch/blackfin/lib/divsi3.S @@ -0,0 +1,199 @@ +/* + * Copyright 2004-2009 Analog Devices Inc. + * + * Licensed under the ADI BSD license or the GPL-2 (or later) + * + * 16 / 32 bit signed division. + * Special cases : + * 1) If(numerator == 0) + * return 0 + * 2) If(denominator ==0) + * return positive max = 0x7fffffff + * 3) If(numerator == denominator) + * return 1 + * 4) If(denominator ==1) + * return numerator + * 5) If(denominator == -1) + * return -numerator + * + * Operand : R0 - Numerator (i) + * R1 - Denominator (i) + * R0 - Quotient (o) + * Registers Used : R2-R7,P0-P2 + * + */ + +.global ___divsi3; +.type ___divsi3, STT_FUNC; + +#ifdef CONFIG_ARITHMETIC_OPS_L1 +.section .l1.text +#else +.text +#endif + +.align 2; +___divsi3 : + + + R3 = R0 ^ R1; + R0 = ABS R0; + + CC = V; + + r3 = rot r3 by -1; + r1 = abs r1; /* now both positive, r3.30 means "negate result", + ** r3.31 means overflow, add one to result + */ + cc = r0 < r1; + if cc jump .Lret_zero; + r2 = r1 >> 15; + cc = r2; + if cc jump .Lidents; + r2 = r1 << 16; + cc = r2 <= r0; + if cc jump .Lidents; + + DIVS(R0, R1); + DIVQ(R0, R1); + DIVQ(R0, R1); + DIVQ(R0, R1); + DIVQ(R0, R1); + DIVQ(R0, R1); + DIVQ(R0, R1); + DIVQ(R0, R1); + DIVQ(R0, R1); + DIVQ(R0, R1); + DIVQ(R0, R1); + DIVQ(R0, R1); + DIVQ(R0, R1); + DIVQ(R0, R1); + DIVQ(R0, R1); + DIVQ(R0, R1); + DIVQ(R0, R1); + + R0 = R0.L (Z); + r1 = r3 >> 31; /* add overflow issue back in */ + r0 = r0 + r1; + r1 = -r0; + cc = bittst(r3, 30); + if cc r0 = r1; + RTS; + +/* Can't use the primitives. Test common identities. +** If the identity is true, return the value in R2. +*/ + +.Lidents: + CC = R1 == 0; /* check for divide by zero */ + IF CC JUMP .Lident_return; + + CC = R0 == 0; /* check for division of zero */ + IF CC JUMP .Lzero_return; + + CC = R0 == R1; /* check for identical operands */ + IF CC JUMP .Lident_return; + + CC = R1 == 1; /* check for divide by 1 */ + IF CC JUMP .Lident_return; + + R2.L = ONES R1; + R2 = R2.L (Z); + CC = R2 == 1; + IF CC JUMP .Lpower_of_two; + + /* Identities haven't helped either. + ** Perform the full division process. + */ + + P1 = 31; /* Set loop counter */ + + [--SP] = (R7:5); /* Push registers R5-R7 */ + R2 = -R1; + [--SP] = R2; + R2 = R0 << 1; /* R2 lsw of dividend */ + R6 = R0 ^ R1; /* Get sign */ + R5 = R6 >> 31; /* Shift sign to LSB */ + + R0 = 0 ; /* Clear msw partial remainder */ + R2 = R2 | R5; /* Shift quotient bit */ + R6 = R0 ^ R1; /* Get new quotient bit */ + + LSETUP(.Llst,.Llend) LC0 = P1; /* Setup loop */ +.Llst: R7 = R2 >> 31; /* record copy of carry from R2 */ + R2 = R2 << 1; /* Shift 64 bit dividend up by 1 bit */ + R0 = R0 << 1 || R5 = [SP]; + R0 = R0 | R7; /* and add carry */ + CC = R6 < 0; /* Check quotient(AQ) */ + /* we might be subtracting divisor (AQ==0) */ + IF CC R5 = R1; /* or we might be adding divisor (AQ==1)*/ + R0 = R0 + R5; /* do add or subtract, as indicated by AQ */ + R6 = R0 ^ R1; /* Generate next quotient bit */ + R5 = R6 >> 31; + /* Assume AQ==1, shift in zero */ + BITTGL(R5,0); /* tweak AQ to be what we want to shift in */ +.Llend: R2 = R2 + R5; /* and then set shifted-in value to + ** tweaked AQ. + */ + r1 = r3 >> 31; + r2 = r2 + r1; + cc = bittst(r3,30); + r0 = -r2; + if !cc r0 = r2; + SP += 4; + (R7:5)= [SP++]; /* Pop registers R6-R7 */ + RTS; + +.Lident_return: + CC = R1 == 0; /* check for divide by zero => 0x7fffffff */ + R2 = -1 (X); + R2 >>= 1; + IF CC JUMP .Ltrue_ident_return; + + CC = R0 == R1; /* check for identical operands => 1 */ + R2 = 1 (Z); + IF CC JUMP .Ltrue_ident_return; + + R2 = R0; /* assume divide by 1 => numerator */ + /*FALLTHRU*/ + +.Ltrue_ident_return: + R0 = R2; /* Return an identity value */ + R2 = -R2; + CC = bittst(R3,30); + IF CC R0 = R2; +.Lzero_return: + RTS; /* ...including zero */ + +.Lpower_of_two: + /* Y has a single bit set, which means it's a power of two. + ** That means we can perform the division just by shifting + ** X to the right the appropriate number of bits + */ + + /* signbits returns the number of sign bits, minus one. + ** 1=>30, 2=>29, ..., 0x40000000=>0. Which means we need + ** to shift right n-signbits spaces. It also means 0x80000000 + ** is a special case, because that *also* gives a signbits of 0 + */ + + R2 = R0 >> 31; + CC = R1 < 0; + IF CC JUMP .Ltrue_ident_return; + + R1.l = SIGNBITS R1; + R1 = R1.L (Z); + R1 += -30; + R0 = LSHIFT R0 by R1.L; + r1 = r3 >> 31; + r0 = r0 + r1; + R2 = -R0; // negate result if necessary + CC = bittst(R3,30); + IF CC R0 = R2; + RTS; + +.Lret_zero: + R0 = 0; + RTS; + +.size ___divsi3, .-___divsi3 diff --git a/arch/blackfin/lib/gcclib.h b/arch/blackfin/lib/gcclib.h new file mode 100644 index 00000000..724f07f1 --- /dev/null +++ b/arch/blackfin/lib/gcclib.h @@ -0,0 +1,24 @@ +/* + * Copyright 2004-2009 Analog Devices Inc. + * + * Licensed under the GPL-2 or later. + */ + +#define BITS_PER_UNIT 8 +#define SI_TYPE_SIZE (sizeof (SItype) * BITS_PER_UNIT) + +typedef unsigned int UQItype __attribute__ ((mode(QI))); +typedef int SItype __attribute__ ((mode(SI))); +typedef unsigned int USItype __attribute__ ((mode(SI))); +typedef int DItype __attribute__ ((mode(DI))); +typedef int word_type __attribute__ ((mode(__word__))); +typedef unsigned int UDItype __attribute__ ((mode(DI))); + +struct DIstruct { + SItype low, high; +}; + +typedef union { + struct DIstruct s; + DItype ll; +} DIunion; diff --git a/arch/blackfin/lib/ins.S b/arch/blackfin/lib/ins.S new file mode 100644 index 00000000..d59608de --- /dev/null +++ b/arch/blackfin/lib/ins.S @@ -0,0 +1,118 @@ +/* + * arch/blackfin/lib/ins.S - ins{bwl} using hardware loops + * + * Copyright 2004-2008 Analog Devices Inc. + * Copyright (C) 2005 Bas Vermeulen, BuyWays BV <bas@buyways.nl> + * Licensed under the GPL-2 or later. + */ + +#include <linux/linkage.h> +#include <asm/blackfin.h> + +.align 2 + +#ifdef CONFIG_IPIPE +# define DO_CLI \ + [--sp] = rets; \ + [--sp] = (P5:0); \ + sp += -12; \ + call ___ipipe_disable_root_irqs_hw; \ + sp += 12; \ + (P5:0) = [sp++]; +# define CLI_INNER_NOP +#else +# define DO_CLI cli R3; +# define CLI_INNER_NOP nop; nop; nop; +#endif + +#ifdef CONFIG_IPIPE +# define DO_STI \ + sp += -12; \ + call ___ipipe_enable_root_irqs_hw; \ + sp += 12; \ +2: rets = [sp++]; +#else +# define DO_STI 2: sti R3; +#endif + +#ifdef CONFIG_BFIN_INS_LOWOVERHEAD +# define CLI_OUTER DO_CLI; +# define STI_OUTER DO_STI; +# define CLI_INNER 1: +# if ANOMALY_05000416 +# define STI_INNER nop; 2: nop; +# else +# define STI_INNER 2: +# endif +#else +# define CLI_OUTER +# define STI_OUTER +# define CLI_INNER 1: DO_CLI; CLI_INNER_NOP; +# define STI_INNER DO_STI; +#endif + +/* + * Reads on the Blackfin are speculative. In Blackfin terms, this means they + * can be interrupted at any time (even after they have been issued on to the + * external bus), and re-issued after the interrupt occurs. + * + * If a FIFO is sitting on the end of the read, it will see two reads, + * when the core only sees one. The FIFO receives the read which is cancelled, + * and not delivered to the core. + * + * To solve this, interrupts are turned off before reads occur to I/O space. + * There are 3 versions of all these functions + * - turns interrupts off every read (higher overhead, but lower latency) + * - turns interrupts off every loop (low overhead, but longer latency) + * - DMA version, which do not suffer from this issue. DMA versions have + * different name (prefixed by dma_ ), and are located in + * ../kernel/bfin_dma.c + * Using the dma related functions are recommended for transferring large + * buffers in/out of FIFOs. + */ + +#define COMMON_INS(func, ops) \ +ENTRY(_ins##func) \ + P0 = R0; /* P0 = port */ \ + CLI_OUTER; /* 3 instructions before first read access */ \ + P1 = R1; /* P1 = address */ \ + P2 = R2; /* P2 = count */ \ + SSYNC; \ + \ + LSETUP(1f, 2f) LC0 = P2; \ + CLI_INNER; \ + ops; \ + STI_INNER; \ + \ + STI_OUTER; \ + RTS; \ +ENDPROC(_ins##func) + +COMMON_INS(l, \ + R0 = [P0]; \ + [P1++] = R0; \ +) + +COMMON_INS(w, \ + R0 = W[P0]; \ + W[P1++] = R0; \ +) + +COMMON_INS(w_8, \ + R0 = W[P0]; \ + B[P1++] = R0; \ + R0 = R0 >> 8; \ + B[P1++] = R0; \ +) + +COMMON_INS(b, \ + R0 = B[P0]; \ + B[P1++] = R0; \ +) + +COMMON_INS(l_16, \ + R0 = [P0]; \ + W[P1++] = R0; \ + R0 = R0 >> 16; \ + W[P1++] = R0; \ +) diff --git a/arch/blackfin/lib/lshrdi3.c b/arch/blackfin/lib/lshrdi3.c new file mode 100644 index 00000000..53f17410 --- /dev/null +++ b/arch/blackfin/lib/lshrdi3.c @@ -0,0 +1,35 @@ +/* + * Copyright 2004-2009 Analog Devices Inc. + * + * Licensed under the GPL-2 or later. + */ + +#include "gcclib.h" + +#ifdef CONFIG_ARITHMETIC_OPS_L1 +DItype __lshrdi3(DItype u, word_type b)__attribute__((l1_text)); +#endif + +DItype __lshrdi3(DItype u, word_type b) +{ + DIunion w; + word_type bm; + DIunion uu; + + if (b == 0) + return u; + + uu.ll = u; + + bm = (sizeof(SItype) * BITS_PER_UNIT) - b; + if (bm <= 0) { + w.s.high = 0; + w.s.low = (USItype) uu.s.high >> -bm; + } else { + USItype carries = (USItype) uu.s.high << bm; + w.s.high = (USItype) uu.s.high >> b; + w.s.low = ((USItype) uu.s.low >> b) | carries; + } + + return w.ll; +} diff --git a/arch/blackfin/lib/memchr.S b/arch/blackfin/lib/memchr.S new file mode 100644 index 00000000..542e40f8 --- /dev/null +++ b/arch/blackfin/lib/memchr.S @@ -0,0 +1,47 @@ +/* + * Copyright 2005-2009 Analog Devices Inc. + * + * Licensed under the ADI BSD license or the GPL-2 (or later) + */ + +#include <linux/linkage.h> + +/* void *memchr(const void *s, int c, size_t n); + * R0 = address (s) + * R1 = sought byte (c) + * R2 = count (n) + * + * Returns pointer to located character. + */ + +.text + +.align 2 + +ENTRY(_memchr) + P0 = R0; /* P0 = address */ + P2 = R2; /* P2 = count */ + R1 = R1.B(Z); + CC = R2 == 0; + IF CC JUMP .Lfailed; + +.Lbytes: + LSETUP (.Lbyte_loop_s, .Lbyte_loop_e) LC0=P2; + +.Lbyte_loop_s: + R3 = B[P0++](Z); + CC = R3 == R1; + IF CC JUMP .Lfound; +.Lbyte_loop_e: + NOP; + +.Lfailed: + R0=0; + RTS; + +.Lfound: + R0 = P0; + R0 += -1; + RTS; + +ENDPROC(_memchr) diff --git a/arch/blackfin/lib/memcmp.S b/arch/blackfin/lib/memcmp.S new file mode 100644 index 00000000..ce5b9f1a --- /dev/null +++ b/arch/blackfin/lib/memcmp.S @@ -0,0 +1,92 @@ +/* + * Copyright 2004-2009 Analog Devices Inc. + * + * Licensed under the ADI BSD license or the GPL-2 (or later) + */ + +#include <linux/linkage.h> + +/* int memcmp(const void *s1, const void *s2, size_t n); + * R0 = First Address (s1) + * R1 = Second Address (s2) + * R2 = count (n) + * + * Favours word aligned data. + */ + +.text + +.align 2 + +ENTRY(_memcmp) + I1 = P3; + P0 = R0; /* P0 = s1 address */ + P3 = R1; /* P3 = s2 Address */ + P2 = R2 ; /* P2 = count */ + CC = R2 <= 7(IU); + IF CC JUMP .Ltoo_small; + I0 = R1; /* s2 */ + R1 = R1 | R0; /* OR addresses together */ + R1 <<= 30; /* check bottom two bits */ + CC = AZ; /* AZ set if zero. */ + IF !CC JUMP .Lbytes ; /* Jump if addrs not aligned. */ + + P1 = P2 >> 2; /* count = n/4 */ + R3 = 3; + R2 = R2 & R3; /* remainder */ + P2 = R2; /* set remainder */ + + LSETUP (.Lquad_loop_s, .Lquad_loop_e) LC0=P1; +.Lquad_loop_s: +#if ANOMALY_05000202 + R0 = [P0++]; + R1 = [I0++]; +#else + MNOP || R0 = [P0++] || R1 = [I0++]; +#endif + CC = R0 == R1; + IF !CC JUMP .Lquad_different; +.Lquad_loop_e: + NOP; + + P3 = I0; /* s2 */ +.Ltoo_small: + CC = P2 == 0; /* Check zero count*/ + IF CC JUMP .Lfinished; /* very unlikely*/ + +.Lbytes: + LSETUP (.Lbyte_loop_s, .Lbyte_loop_e) LC0=P2; +.Lbyte_loop_s: + R1 = B[P3++](Z); /* *s2 */ + R0 = B[P0++](Z); /* *s1 */ + CC = R0 == R1; + IF !CC JUMP .Ldifferent; +.Lbyte_loop_e: + NOP; + +.Ldifferent: + R0 = R0 - R1; + P3 = I1; + RTS; + +.Lquad_different: + /* We've read two quads which don't match. + * Can't just compare them, because we're + * a little-endian machine, so the MSBs of + * the regs occur at later addresses in the + * string. + * Arrange to re-read those two quads again, + * byte-by-byte. + */ + P0 += -4; /* back up to the start of the */ + P3 = I0; /* quads, and increase the*/ + P2 += 4; /* remainder count*/ + P3 += -4; + JUMP .Lbytes; + +.Lfinished: + R0 = 0; + P3 = I1; + RTS; + +ENDPROC(_memcmp) diff --git a/arch/blackfin/lib/memcpy.S b/arch/blackfin/lib/memcpy.S new file mode 100644 index 00000000..c31bf22a --- /dev/null +++ b/arch/blackfin/lib/memcpy.S @@ -0,0 +1,124 @@ +/* + * internal version of memcpy(), issued by the compiler to copy blocks of + * data around. This is really memmove() - it has to be able to deal with + * possible overlaps, because that ambiguity is when the compiler gives up + * and calls a function. We have our own, internal version so that we get + * something we trust, even if the user has redefined the normal symbol. + * + * Copyright 2004-2009 Analog Devices Inc. + * + * Licensed under the ADI BSD license or the GPL-2 (or later) + */ + +#include <linux/linkage.h> + +/* void *memcpy(void *dest, const void *src, size_t n); + * R0 = To Address (dest) (leave unchanged to form result) + * R1 = From Address (src) + * R2 = count + * + * Note: Favours word alignment + */ + +#ifdef CONFIG_MEMCPY_L1 +.section .l1.text +#else +.text +#endif + +.align 2 + +ENTRY(_memcpy) + CC = R2 <= 0; /* length not positive? */ + IF CC JUMP .L_P1L2147483647; /* Nothing to do */ + + P0 = R0 ; /* dst*/ + P1 = R1 ; /* src*/ + P2 = R2 ; /* length */ + + /* check for overlapping data */ + CC = R1 < R0; /* src < dst */ + IF !CC JUMP .Lno_overlap; + R3 = R1 + R2; + CC = R0 < R3; /* and dst < src+len */ + IF CC JUMP .Lhas_overlap; + +.Lno_overlap: + /* Check for aligned data.*/ + + R3 = R1 | R0; + R1 = 0x3; + R3 = R3 & R1; + CC = R3; /* low bits set on either address? */ + IF CC JUMP .Lnot_aligned; + + /* Both addresses are word-aligned, so we can copy + at least part of the data using word copies.*/ + P2 = P2 >> 2; + CC = P2 <= 2; + IF !CC JUMP .Lmore_than_seven; + /* less than eight bytes... */ + P2 = R2; + LSETUP(.Lthree_start, .Lthree_end) LC0=P2; +.Lthree_start: + R3 = B[P1++] (X); +.Lthree_end: + B[P0++] = R3; + + RTS; + +.Lmore_than_seven: + /* There's at least eight bytes to copy. */ + P2 += -1; /* because we unroll one iteration */ + LSETUP(.Lword_loops, .Lword_loope) LC0=P2; + I1 = P1; + R3 = [I1++]; +#if ANOMALY_05000202 +.Lword_loops: + [P0++] = R3; +.Lword_loope: + R3 = [I1++]; +#else +.Lword_loops: +.Lword_loope: + MNOP || [P0++] = R3 || R3 = [I1++]; +#endif + [P0++] = R3; + /* Any remaining bytes to copy? */ + R3 = 0x3; + R3 = R2 & R3; + CC = R3 == 0; + P1 = I1; /* in case there's something left, */ + IF !CC JUMP .Lbytes_left; + RTS; +.Lbytes_left: P2 = R3; +.Lnot_aligned: + /* From here, we're copying byte-by-byte. */ + LSETUP (.Lbyte_start, .Lbyte_end) LC0=P2; +.Lbyte_start: + R1 = B[P1++] (X); +.Lbyte_end: + B[P0++] = R1; + +.L_P1L2147483647: + RTS; + +.Lhas_overlap: + /* Need to reverse the copying, because the + * dst would clobber the src. + * Don't bother to work out alignment for + * the reverse case. + */ + P0 = P0 + P2; + P0 += -1; + P1 = P1 + P2; + P1 += -1; + LSETUP(.Lover_start, .Lover_end) LC0=P2; +.Lover_start: + R1 = B[P1--] (X); +.Lover_end: + B[P0--] = R1; + + RTS; + +ENDPROC(_memcpy) diff --git a/arch/blackfin/lib/memmove.S b/arch/blackfin/lib/memmove.S new file mode 100644 index 00000000..4eca5662 --- /dev/null +++ b/arch/blackfin/lib/memmove.S @@ -0,0 +1,93 @@ +/* + * Copyright 2005-2009 Analog Devices Inc. + * + * Licensed under the ADI BSD license or the GPL-2 (or later) + */ + +#include <linux/linkage.h> + +.align 2 + +/* + * C Library function MEMMOVE + * R0 = To Address (leave unchanged to form result) + * R1 = From Address + * R2 = count + * Data may overlap + */ + +ENTRY(_memmove) + I1 = P3; + P0 = R0; /* P0 = To address */ + P3 = R1; /* P3 = From Address */ + P2 = R2; /* P2 = count */ + CC = P2 == 0; /* Check zero count*/ + IF CC JUMP .Lfinished; /* very unlikely */ + + CC = R1 < R0 (IU); /* From < To */ + IF !CC JUMP .Lno_overlap; + R3 = R1 + R2; + CC = R0 <= R3 (IU); /* (From+len) >= To */ + IF CC JUMP .Loverlap; +.Lno_overlap: + R3 = 11; + CC = R2 <= R3; + IF CC JUMP .Lbytes; + R3 = R1 | R0; /* OR addresses together */ + R3 <<= 30; /* check bottom two bits */ + CC = AZ; /* AZ set if zero.*/ + IF !CC JUMP .Lbytes; /* Jump if addrs not aligned.*/ + + I0 = P3; + P1 = P2 >> 2; /* count = n/4 */ + P1 += -1; + R3 = 3; + R2 = R2 & R3; /* remainder */ + P2 = R2; /* set remainder */ + R1 = [I0++]; + + LSETUP (.Lquad_loops, .Lquad_loope) LC0=P1; +#if ANOMALY_05000202 +.Lquad_loops: + [P0++] = R1; +.Lquad_loope: + R1 = [I0++]; +#else +.Lquad_loops: +.Lquad_loope: + MNOP || [P0++] = R1 || R1 = [I0++]; +#endif + [P0++] = R1; + + CC = P2 == 0; /* any remaining bytes? */ + P3 = I0; /* Amend P3 to updated ptr. */ + IF !CC JUMP .Lbytes; + P3 = I1; + RTS; + +.Lbytes: LSETUP (.Lbyte2_s, .Lbyte2_e) LC0=P2; +.Lbyte2_s: R1 = B[P3++](Z); +.Lbyte2_e: B[P0++] = R1; + +.Lfinished: P3 = I1; + RTS; + +.Loverlap: + P2 += -1; + P0 = P0 + P2; + P3 = P3 + P2; + R1 = B[P3--] (Z); + CC = P2 == 0; + IF CC JUMP .Lno_loop; +#if ANOMALY_05000245 + NOP; + NOP; +#endif + LSETUP (.Lol_s, .Lol_e) LC0 = P2; +.Lol_s: B[P0--] = R1; +.Lol_e: R1 = B[P3--] (Z); +.Lno_loop: B[P0] = R1; + P3 = I1; + RTS; + +ENDPROC(_memmove) diff --git a/arch/blackfin/lib/memset.S b/arch/blackfin/lib/memset.S new file mode 100644 index 00000000..eab1bef3 --- /dev/null +++ b/arch/blackfin/lib/memset.S @@ -0,0 +1,87 @@ +/* + * Copyright 2004-2009 Analog Devices Inc. + * + * Licensed under the ADI BSD license or the GPL-2 (or later) + */ + +#include <linux/linkage.h> + +.align 2 + +#ifdef CONFIG_MEMSET_L1 +.section .l1.text +#else +.text +#endif + +/* + * C Library function MEMSET + * R0 = address (leave unchanged to form result) + * R1 = filler byte + * R2 = count + * Favours word aligned data. + * The strncpy assumes that I0 and I1 are not used in this function + */ + +ENTRY(_memset) + P0 = R0 ; /* P0 = address */ + P2 = R2 ; /* P2 = count */ + R3 = R0 + R2; /* end */ + CC = R2 <= 7(IU); + IF CC JUMP .Ltoo_small; + R1 = R1.B (Z); /* R1 = fill char */ + R2 = 3; + R2 = R0 & R2; /* addr bottom two bits */ + CC = R2 == 0; /* AZ set if zero. */ + IF !CC JUMP .Lforce_align ; /* Jump if addr not aligned. */ + +.Laligned: + P1 = P2 >> 2; /* count = n/4 */ + R2 = R1 << 8; /* create quad filler */ + R2.L = R2.L + R1.L(NS); + R2.H = R2.L + R1.H(NS); + P2 = R3; + + LSETUP (.Lquad_loop , .Lquad_loop) LC0=P1; +.Lquad_loop: + [P0++] = R2; + + CC = P0 == P2; + IF !CC JUMP .Lbytes_left; + RTS; + +.Lbytes_left: + R2 = R3; /* end point */ + R3 = P0; /* current position */ + R2 = R2 - R3; /* bytes left */ + P2 = R2; + +.Ltoo_small: + CC = P2 == 0; /* Check zero count */ + IF CC JUMP .Lfinished; /* Unusual */ + +.Lbytes: + LSETUP (.Lbyte_loop , .Lbyte_loop) LC0=P2; +.Lbyte_loop: + B[P0++] = R1; + +.Lfinished: + RTS; + +.Lforce_align: + CC = BITTST (R0, 0); /* odd byte */ + R0 = 4; + R0 = R0 - R2; + P1 = R0; + R0 = P0; /* Recover return address */ + IF !CC JUMP .Lskip1; + B[P0++] = R1; +.Lskip1: + CC = R2 <= 2; /* 2 bytes */ + P2 -= P1; /* reduce count */ + IF !CC JUMP .Laligned; + B[P0++] = R1; + B[P0++] = R1; + JUMP .Laligned; + +ENDPROC(_memset) diff --git a/arch/blackfin/lib/modsi3.S b/arch/blackfin/lib/modsi3.S new file mode 100644 index 00000000..8b0c7d40 --- /dev/null +++ b/arch/blackfin/lib/modsi3.S @@ -0,0 +1,57 @@ +/* + * This program computes 32 bit signed remainder. It calls div32 function + * for quotient estimation. + * Registers in: R0, R1 = Numerator/ Denominator + * Registers out: R0 = Remainder + * + * Copyright 2004-2009 Analog Devices Inc. + * + * Licensed under the ADI BSD license or the GPL-2 (or later) + */ + +.global ___modsi3; +.type ___modsi3, STT_FUNC; +.extern ___divsi3; +.type ___divsi3, STT_FUNC; + +#ifdef CONFIG_ARITHMETIC_OPS_L1 +.section .l1.text +#else +.text +#endif + +___modsi3: + + CC=R0==0; + IF CC JUMP .LRETURN_R0; /* Return 0, if numerator == 0 */ + CC=R1==0; + IF CC JUMP .LRETURN_ZERO; /* Return 0, if denominator == 0 */ + CC=R0==R1; + IF CC JUMP .LRETURN_ZERO; /* Return 0, if numerator == denominator */ + CC = R1 == 1; + IF CC JUMP .LRETURN_ZERO; /* Return 0, if denominator == 1 */ + CC = R1 == -1; + IF CC JUMP .LRETURN_ZERO; /* Return 0, if denominator == -1 */ + + /* Valid input. Use __divsi3() to compute the quotient, and then + * derive the remainder from that. */ + + [--SP] = (R7:6); /* Push R7 and R6 */ + [--SP] = RETS; /* and return address */ + R7 = R0; /* Copy of R0 */ + R6 = R1; /* Save for later */ + SP += -12; /* Should always provide this space */ + CALL ___divsi3; /* Compute signed quotient using ___divsi3()*/ + SP += 12; + R0 *= R6; /* Quotient * divisor */ + R0 = R7 - R0; /* Dividend - (quotient * divisor) */ + RETS = [SP++]; /* Get back return address */ + (R7:6) = [SP++]; /* Pop registers R7 and R4 */ + RTS; /* Store remainder */ + +.LRETURN_ZERO: + R0 = 0; +.LRETURN_R0: + RTS; + +.size ___modsi3, .-___modsi3 diff --git a/arch/blackfin/lib/muldi3.S b/arch/blackfin/lib/muldi3.S new file mode 100644 index 00000000..953a38a1 --- /dev/null +++ b/arch/blackfin/lib/muldi3.S @@ -0,0 +1,74 @@ +/* + * Copyright 2008 Analog Devices Inc. + * + * Licensed under the ADI BSD license or the GPL-2 (or later) + */ + +.align 2 +.global ___muldi3; +.type ___muldi3, STT_FUNC; + +#ifdef CONFIG_ARITHMETIC_OPS_L1 +.section .l1.text +#else +.text +#endif + +/* + R1:R0 * R3:R2 + = R1.h:R1.l:R0.h:R0.l * R3.h:R3.l:R2.h:R2.l +[X] = (R1.h * R3.h) * 2^96 +[X] + (R1.h * R3.l + R1.l * R3.h) * 2^80 +[X] + (R1.h * R2.h + R1.l * R3.l + R3.h * R0.h) * 2^64 +[T1] + (R1.h * R2.l + R3.h * R0.l + R1.l * R2.h + R3.l * R0.h) * 2^48 +[T2] + (R1.l * R2.l + R3.l * R0.l + R0.h * R2.h) * 2^32 +[T3] + (R0.l * R2.h + R2.l * R0.h) * 2^16 +[T4] + (R0.l * R2.l) + + We can discard the first three lines marked "X" since we produce + only a 64 bit result. So, we need ten 16-bit multiplies. + + Individual mul-acc results: +[E1] = R1.h * R2.l + R3.h * R0.l + R1.l * R2.h + R3.l * R0.h +[E2] = R1.l * R2.l + R3.l * R0.l + R0.h * R2.h +[E3] = R0.l * R2.h + R2.l * R0.h +[E4] = R0.l * R2.l + + We also need to add high parts from lower-level results to higher ones: + E[n]c = E[n] + (E[n+1]c >> 16), where E4c := E4 + + One interesting property is that all parts of the result that depend + on the sign of the multiplication are discarded. Those would be the + multiplications involving R1.h and R3.h, but only the top 16 bit of + the 32 bit result depend on the sign, and since R1.h and R3.h only + occur in E1, the top half of these results is cut off. + So, we can just use FU mode for all of the 16-bit multiplies, and + ignore questions of when to use mixed mode. */ + +___muldi3: + /* [SP] technically is part of the caller's frame, but we can + use it as scratch space. */ + A0 = R2.H * R1.L, A1 = R2.L * R1.H (FU) || R3 = [SP + 12]; /* E1 */ + A0 += R3.H * R0.L, A1 += R3.L * R0.H (FU) || [SP] = R4; /* E1 */ + A0 += A1; /* E1 */ + R4 = A0.w; + A0 = R0.l * R3.l (FU); /* E2 */ + A0 += R2.l * R1.l (FU); /* E2 */ + + A1 = R2.L * R0.L (FU); /* E4 */ + R3 = A1.w; + A1 = A1 >> 16; /* E3c */ + A0 += R2.H * R0.H, A1 += R2.L * R0.H (FU); /* E2, E3c */ + A1 += R0.L * R2.H (FU); /* E3c */ + R0 = A1.w; + A1 = A1 >> 16; /* E2c */ + A0 += A1; /* E2c */ + R1 = A0.w; + + /* low(result) = low(E3c):low(E4) */ + R0 = PACK (R0.l, R3.l); + /* high(result) = E2c + (E1 << 16) */ + R1.h = R1.h + R4.l (NS) || R4 = [SP]; + RTS; + +.size ___muldi3, .-___muldi3 diff --git a/arch/blackfin/lib/outs.S b/arch/blackfin/lib/outs.S new file mode 100644 index 00000000..06a5e674 --- /dev/null +++ b/arch/blackfin/lib/outs.S @@ -0,0 +1,68 @@ +/* + * Implementation of outs{bwl} for BlackFin processors using zero overhead loops. + * + * Copyright 2005-2009 Analog Devices Inc. + * 2005 BuyWays BV + * Bas Vermeulen <bas@buyways.nl> + * + * Licensed under the GPL-2. + */ + +#include <linux/linkage.h> + +.align 2 + +ENTRY(_outsl) + CC = R2 == 0; + IF CC JUMP 1f; + P0 = R0; /* P0 = port */ + P1 = R1; /* P1 = address */ + P2 = R2; /* P2 = count */ + + LSETUP( .Llong_loop_s, .Llong_loop_e) LC0 = P2; +.Llong_loop_s: R0 = [P1++]; +.Llong_loop_e: [P0] = R0; +1: RTS; +ENDPROC(_outsl) + +ENTRY(_outsw) + CC = R2 == 0; + IF CC JUMP 1f; + P0 = R0; /* P0 = port */ + P1 = R1; /* P1 = address */ + P2 = R2; /* P2 = count */ + + LSETUP( .Lword_loop_s, .Lword_loop_e) LC0 = P2; +.Lword_loop_s: R0 = W[P1++]; +.Lword_loop_e: W[P0] = R0; +1: RTS; +ENDPROC(_outsw) + +ENTRY(_outsb) + CC = R2 == 0; + IF CC JUMP 1f; + P0 = R0; /* P0 = port */ + P1 = R1; /* P1 = address */ + P2 = R2; /* P2 = count */ + + LSETUP( .Lbyte_loop_s, .Lbyte_loop_e) LC0 = P2; +.Lbyte_loop_s: R0 = B[P1++]; +.Lbyte_loop_e: B[P0] = R0; +1: RTS; +ENDPROC(_outsb) + +ENTRY(_outsw_8) + CC = R2 == 0; + IF CC JUMP 1f; + P0 = R0; /* P0 = port */ + P1 = R1; /* P1 = address */ + P2 = R2; /* P2 = count */ + + LSETUP( .Lword8_loop_s, .Lword8_loop_e) LC0 = P2; +.Lword8_loop_s: R1 = B[P1++]; + R0 = B[P1++]; + R0 = R0 << 8; + R0 = R0 + R1; +.Lword8_loop_e: W[P0] = R0; +1: RTS; +ENDPROC(_outsw_8) diff --git a/arch/blackfin/lib/smulsi3_highpart.S b/arch/blackfin/lib/smulsi3_highpart.S new file mode 100644 index 00000000..99ee8c5d --- /dev/null +++ b/arch/blackfin/lib/smulsi3_highpart.S @@ -0,0 +1,38 @@ +/* + * Copyright 2007 Analog Devices Inc. + * + * Licensed under the ADI BSD license or the GPL-2 (or later) + */ + +.align 2 +.global ___smulsi3_highpart; +.type ___smulsi3_highpart, STT_FUNC; + +#ifdef CONFIG_ARITHMETIC_OPS_L1 +.section .l1.text +#else +.text +#endif + +___smulsi3_highpart: + R2 = R1.L * R0.L (FU); + R3 = R1.H * R0.L (IS,M); + R0 = R0.H * R1.H, R1 = R0.H * R1.L (IS,M); + + R1.L = R2.H + R1.L; + cc = ac0; + R2 = cc; + + R1.L = R1.L + R3.L; + cc = ac0; + R1 >>>= 16; + R3 >>>= 16; + R1 = R1 + R3; + R1 = R1 + R2; + R2 = cc; + R1 = R1 + R2; + + R0 = R0 + R1; + RTS; + +.size ___smulsi3_highpart, .-___smulsi3_highpart diff --git a/arch/blackfin/lib/strcmp.S b/arch/blackfin/lib/strcmp.S new file mode 100644 index 00000000..d7c1d158 --- /dev/null +++ b/arch/blackfin/lib/strcmp.S @@ -0,0 +1,43 @@ +/* + * Copyright 2005-2010 Analog Devices Inc. + * + * Licensed under the ADI BSD license or the GPL-2 (or later) + */ + +#include <linux/linkage.h> + +/* void *strcmp(char *s1, const char *s2); + * R0 = address (s1) + * R1 = address (s2) + * + * Returns an integer less than, equal to, or greater than zero if s1 + * (or the first n bytes thereof) is found, respectively, to be less + * than, to match, or be greater than s2. + */ + +#ifdef CONFIG_STRCMP_L1 +.section .l1.text +#else +.text +#endif + +.align 2 + +ENTRY(_strcmp) + P0 = R0 ; /* s1 */ + P1 = R1 ; /* s2 */ + +1: + R0 = B[P0++] (Z); /* get *s1 */ + R1 = B[P1++] (Z); /* get *s2 */ + CC = R0 == R1; /* compare a byte */ + if ! cc jump 2f; /* not equal, break out */ + CC = R0; /* at end of s1? */ + if cc jump 1b (bp); /* no, keep going */ + jump.s 3f; /* strings are equal */ +2: + R0 = R0 - R1; /* *s1 - *s2 */ +3: + RTS; + +ENDPROC(_strcmp) diff --git a/arch/blackfin/lib/strcpy.S b/arch/blackfin/lib/strcpy.S new file mode 100644 index 00000000..a6a0c636 --- /dev/null +++ b/arch/blackfin/lib/strcpy.S @@ -0,0 +1,35 @@ +/* + * Copyright 2005-2010 Analog Devices Inc. + * + * Licensed under the ADI BSD license or the GPL-2 (or later) + */ + +#include <linux/linkage.h> + +/* void *strcpy(char *dest, const char *src); + * R0 = address (dest) + * R1 = address (src) + * + * Returns a pointer to the destination string dest + */ + +#ifdef CONFIG_STRCPY_L1 +.section .l1.text +#else +.text +#endif + +.align 2 + +ENTRY(_strcpy) + P0 = R0 ; /* dst*/ + P1 = R1 ; /* src*/ + +1: + R1 = B [P1++] (Z); + B [P0++] = R1; + CC = R1; + if cc jump 1b (bp); + RTS; + +ENDPROC(_strcpy) diff --git a/arch/blackfin/lib/strncmp.S b/arch/blackfin/lib/strncmp.S new file mode 100644 index 00000000..6da37c34 --- /dev/null +++ b/arch/blackfin/lib/strncmp.S @@ -0,0 +1,52 @@ +/* + * Copyright 2005-2010 Analog Devices Inc. + * + * Licensed under the ADI BSD license or the GPL-2 (or later) + */ + +#include <linux/linkage.h> + +/* void *strncpy(char *s1, const char *s2, size_t n); + * R0 = address (dest) + * R1 = address (src) + * R2 = size (n) + * Returns a pointer to the destination string dest + */ + +#ifdef CONFIG_STRNCMP_L1 +.section .l1.text +#else +.text +#endif + +.align 2 + +ENTRY(_strncmp) + CC = R2 == 0; + if CC JUMP 5f; + + P0 = R0 ; /* s1 */ + P1 = R1 ; /* s2 */ +1: + R0 = B[P0++] (Z); /* get *s1 */ + R1 = B[P1++] (Z); /* get *s2 */ + CC = R0 == R1; /* compare a byte */ + if ! cc jump 3f; /* not equal, break out */ + CC = R0; /* at end of s1? */ + if ! cc jump 4f; /* yes, all done */ + R2 += -1; /* no, adjust count */ + CC = R2 == 0; + if ! cc jump 1b (bp); /* more to do, keep going */ +2: + R0 = 0; /* strings are equal */ + jump.s 4f; +3: + R0 = R0 - R1; /* *s1 - *s2 */ +4: + RTS; + +5: + R0 = 0; + RTS; + +ENDPROC(_strncmp) diff --git a/arch/blackfin/lib/strncpy.S b/arch/blackfin/lib/strncpy.S new file mode 100644 index 00000000..2c07ddda --- /dev/null +++ b/arch/blackfin/lib/strncpy.S @@ -0,0 +1,85 @@ +/* + * Copyright 2005-2010 Analog Devices Inc. + * + * Licensed under the ADI BSD license or the GPL-2 (or later) + */ + +#include <linux/linkage.h> +#include <asm/context.S> + +/* void *strncpy(char *dest, const char *src, size_t n); + * R0 = address (dest) + * R1 = address (src) + * R2 = size + * Returns a pointer (R0) to the destination string dest + * we do this by not changing R0 + */ + +#ifdef CONFIG_STRNCPY_L1 +.section .l1.text +#else +.text +#endif + +.align 2 + +ENTRY(_strncpy) + CC = R2 == 0; + if CC JUMP 6f; + + P2 = R2 ; /* size */ + P0 = R0 ; /* dst*/ + P1 = R1 ; /* src*/ + + LSETUP (1f, 2f) LC0 = P2; +1: + R1 = B [P1++] (Z); + B [P0++] = R1; + CC = R1 == 0; +2: + if CC jump 3f; + + RTS; + + /* if src is shorter than n, we need to null pad bytes in dest + * but, we can get here when the last byte is zero, and we don't + * want to copy an extra byte at the end, so we need to check + */ +3: + R2 = LC0; + CC = R2 + if ! CC jump 6f; + + /* if the required null padded portion is small, do it here, rather than + * handling the overhead of memset (which is OK when things are big). + */ + R3 = 0x20; + CC = R2 < R3; + IF CC jump 4f; + + R2 += -1; + + /* Set things up for memset + * R0 = address + * R1 = filler byte (this case it's zero, set above) + * R2 = count (set above) + */ + + I1 = R0; + R0 = RETS; + I0 = R0; + R0 = P0; + pseudo_long_call _memset, p0; + R0 = I0; + RETS = R0; + R0 = I1; + RTS; + +4: + LSETUP(5f, 5f) LC0; +5: + B [P0++] = R1; +6: + RTS; + +ENDPROC(_strncpy) diff --git a/arch/blackfin/lib/udivsi3.S b/arch/blackfin/lib/udivsi3.S new file mode 100644 index 00000000..97e90431 --- /dev/null +++ b/arch/blackfin/lib/udivsi3.S @@ -0,0 +1,277 @@ +/* + * Copyright 2004-2009 Analog Devices Inc. + * + * Licensed under the ADI BSD license or the GPL-2 (or later) + */ + +#include <linux/linkage.h> + +#define CARRY AC0 + +#ifdef CONFIG_ARITHMETIC_OPS_L1 +.section .l1.text +#else +.text +#endif + + +ENTRY(___udivsi3) + + CC = R0 < R1 (IU); /* If X < Y, always return 0 */ + IF CC JUMP .Lreturn_ident; + + R2 = R1 << 16; + CC = R2 <= R0 (IU); + IF CC JUMP .Lidents; + + R2 = R0 >> 31; /* if X is a 31-bit number */ + R3 = R1 >> 15; /* and Y is a 15-bit number */ + R2 = R2 | R3; /* then it's okay to use the DIVQ builtins (fallthrough to fast)*/ + CC = R2; + IF CC JUMP .Ly_16bit; + +/* METHOD 1: FAST DIVQ + We know we have a 31-bit dividend, and 15-bit divisor so we can use the + simple divq approach (first setting AQ to 0 - implying unsigned division, + then 16 DIVQ's). +*/ + + AQ = CC; /* Clear AQ (CC==0) */ + +/* ISR States: When dividing two integers (32.0/16.0) using divide primitives, + we need to shift the dividend one bit to the left. + We have already checked that we have a 31-bit number so we are safe to do + that. +*/ + R0 <<= 1; + DIVQ(R0, R1); // 1 + DIVQ(R0, R1); // 2 + DIVQ(R0, R1); // 3 + DIVQ(R0, R1); // 4 + DIVQ(R0, R1); // 5 + DIVQ(R0, R1); // 6 + DIVQ(R0, R1); // 7 + DIVQ(R0, R1); // 8 + DIVQ(R0, R1); // 9 + DIVQ(R0, R1); // 10 + DIVQ(R0, R1); // 11 + DIVQ(R0, R1); // 12 + DIVQ(R0, R1); // 13 + DIVQ(R0, R1); // 14 + DIVQ(R0, R1); // 15 + DIVQ(R0, R1); // 16 + R0 = R0.L (Z); + RTS; + +.Ly_16bit: + /* We know that the upper 17 bits of Y might have bits set, + ** or that the sign bit of X might have a bit. If Y is a + ** 16-bit number, but not bigger, then we can use the builtins + ** with a post-divide correction. + ** R3 currently holds Y>>15, which means R3's LSB is the + ** bit we're interested in. + */ + + /* According to the ISR, to use the Divide primitives for + ** unsigned integer divide, the useable range is 31 bits + */ + CC = ! BITTST(R0, 31); + + /* IF condition is true we can scale our inputs and use the divide primitives, + ** with some post-adjustment + */ + R3 += -1; /* if so, Y is 0x00008nnn */ + CC &= AZ; + + /* If condition is true we can scale our inputs and use the divide primitives, + ** with some post-adjustment + */ + R3 = R1 >> 1; /* Pre-scaled divisor for primitive case */ + R2 = R0 >> 16; + + R2 = R3 - R2; /* shifted divisor < upper 16 bits of dividend */ + CC &= CARRY; + IF CC JUMP .Lshift_and_correct; + + /* Fall through to the identities */ + +/* METHOD 2: identities and manual calculation + We are not able to use the divide primites, but may still catch some special + cases. +*/ +.Lidents: + /* Test for common identities. Value to be returned is placed in R2. */ + CC = R0 == 0; /* 0/Y => 0 */ + IF CC JUMP .Lreturn_r0; + CC = R0 == R1; /* X==Y => 1 */ + IF CC JUMP .Lreturn_ident; + CC = R1 == 1; /* X/1 => X */ + IF CC JUMP .Lreturn_ident; + + R2.L = ONES R1; + R2 = R2.L (Z); + CC = R2 == 1; + IF CC JUMP .Lpower_of_two; + + [--SP] = (R7:5); /* Push registers R5-R7 */ + + /* Idents don't match. Go for the full operation. */ + + + R6 = 2; /* assume we'll shift two */ + R3 = 1; + + P2 = R1; + /* If either R0 or R1 have sign set, */ + /* divide them by two, and note it's */ + /* been done. */ + CC = R1 < 0; + R2 = R1 >> 1; + IF CC R1 = R2; /* Possibly-shifted R1 */ + IF !CC R6 = R3; /* R1 doesn't, so at most 1 shifted */ + + P0 = 0; + R3 = -R1; + [--SP] = R3; + R2 = R0 >> 1; + R2 = R0 >> 1; + CC = R0 < 0; + IF CC P0 = R6; /* Number of values divided */ + IF !CC R2 = R0; /* Shifted R0 */ + + /* P0 is 0, 1 (NR/=2) or 2 (NR/=2, DR/=2) */ + + /* r2 holds Copy dividend */ + R3 = 0; /* Clear partial remainder */ + R7 = 0; /* Initialise quotient bit */ + + P1 = 32; /* Set loop counter */ + LSETUP(.Lulst, .Lulend) LC0 = P1; /* Set loop counter */ +.Lulst: R6 = R2 >> 31; /* R6 = sign bit of R2, for carry */ + R2 = R2 << 1; /* Shift 64 bit dividend up by 1 bit */ + R3 = R3 << 1 || R5 = [SP]; + R3 = R3 | R6; /* Include any carry */ + CC = R7 < 0; /* Check quotient(AQ) */ + /* If AQ==0, we'll sub divisor */ + IF CC R5 = R1; /* and if AQ==1, we'll add it. */ + R3 = R3 + R5; /* Add/sub divsor to partial remainder */ + R7 = R3 ^ R1; /* Generate next quotient bit */ + + R5 = R7 >> 31; /* Get AQ */ + BITTGL(R5, 0); /* Invert it, to get what we'll shift */ +.Lulend: R2 = R2 + R5; /* and "shift" it in. */ + + CC = P0 == 0; /* Check how many inputs we shifted */ + IF CC JUMP .Lno_mult; /* if none... */ + R6 = R2 << 1; + CC = P0 == 1; + IF CC R2 = R6; /* if 1, Q = Q*2 */ + IF !CC R1 = P2; /* if 2, restore stored divisor */ + + R3 = R2; /* Copy of R2 */ + R3 *= R1; /* Q * divisor */ + R5 = R0 - R3; /* Z = (dividend - Q * divisor) */ + CC = R1 <= R5 (IU); /* Check if divisor <= Z? */ + R6 = CC; /* if yes, R6 = 1 */ + R2 = R2 + R6; /* if yes, add one to quotient(Q) */ +.Lno_mult: + SP += 4; + (R7:5) = [SP++]; /* Pop registers R5-R7 */ + R0 = R2; /* Store quotient */ + RTS; + +.Lreturn_ident: + CC = R0 < R1 (IU); /* If X < Y, always return 0 */ + R2 = 0; + IF CC JUMP .Ltrue_return_ident; + R2 = -1 (X); /* X/0 => 0xFFFFFFFF */ + CC = R1 == 0; + IF CC JUMP .Ltrue_return_ident; + R2 = -R2; /* R2 now 1 */ + CC = R0 == R1; /* X==Y => 1 */ + IF CC JUMP .Ltrue_return_ident; + R2 = R0; /* X/1 => X */ + /*FALLTHRU*/ + +.Ltrue_return_ident: + R0 = R2; +.Lreturn_r0: + RTS; + +.Lpower_of_two: + /* Y has a single bit set, which means it's a power of two. + ** That means we can perform the division just by shifting + ** X to the right the appropriate number of bits + */ + + /* signbits returns the number of sign bits, minus one. + ** 1=>30, 2=>29, ..., 0x40000000=>0. Which means we need + ** to shift right n-signbits spaces. It also means 0x80000000 + ** is a special case, because that *also* gives a signbits of 0 + */ + + R2 = R0 >> 31; + CC = R1 < 0; + IF CC JUMP .Ltrue_return_ident; + + R1.l = SIGNBITS R1; + R1 = R1.L (Z); + R1 += -30; + R0 = LSHIFT R0 by R1.L; + RTS; + +/* METHOD 3: PRESCALE AND USE THE DIVIDE PRIMITIVES WITH SOME POST-CORRECTION + Two scaling operations are required to use the divide primitives with a + divisor > 0x7FFFF. + Firstly (as in method 1) we need to shift the dividend 1 to the left for + integer division. + Secondly we need to shift both the divisor and dividend 1 to the right so + both are in range for the primitives. + The left/right shift of the dividend does nothing so we can skip it. +*/ +.Lshift_and_correct: + R2 = R0; + // R3 is already R1 >> 1 + CC=!CC; + AQ = CC; /* Clear AQ, got here with CC = 0 */ + DIVQ(R2, R3); // 1 + DIVQ(R2, R3); // 2 + DIVQ(R2, R3); // 3 + DIVQ(R2, R3); // 4 + DIVQ(R2, R3); // 5 + DIVQ(R2, R3); // 6 + DIVQ(R2, R3); // 7 + DIVQ(R2, R3); // 8 + DIVQ(R2, R3); // 9 + DIVQ(R2, R3); // 10 + DIVQ(R2, R3); // 11 + DIVQ(R2, R3); // 12 + DIVQ(R2, R3); // 13 + DIVQ(R2, R3); // 14 + DIVQ(R2, R3); // 15 + DIVQ(R2, R3); // 16 + + /* According to the Instruction Set Reference: + To divide by a divisor > 0x7FFF, + 1. prescale and perform divide to obtain quotient (Q) (done above), + 2. multiply quotient by unscaled divisor (result M) + 3. subtract the product from the divident to get an error (E = X - M) + 4. if E < divisor (Y) subtract 1, if E > divisor (Y) add 1, else return quotient (Q) + */ + R3 = R2.L (Z); /* Q = X' / Y' */ + R2 = R3; /* Preserve Q */ + R2 *= R1; /* M = Q * Y */ + R2 = R0 - R2; /* E = X - M */ + R0 = R3; /* Copy Q into result reg */ + +/* Correction: If result of the multiply is negative, we overflowed + and need to correct the result by subtracting 1 from the result.*/ + R3 = 0xFFFF (Z); + R2 = R2 >> 16; /* E >> 16 */ + CC = R2 == R3; + R3 = 1 ; + R1 = R0 - R3; + IF CC R0 = R1; + RTS; + +ENDPROC(___udivsi3) diff --git a/arch/blackfin/lib/umodsi3.S b/arch/blackfin/lib/umodsi3.S new file mode 100644 index 00000000..168eba7c --- /dev/null +++ b/arch/blackfin/lib/umodsi3.S @@ -0,0 +1,49 @@ +/* + * libgcc1 routines for Blackfin 5xx + * + * Copyright 2004-2009 Analog Devices Inc. + * + * Licensed under the ADI BSD license or the GPL-2 (or later) + */ + +#ifdef CONFIG_ARITHMETIC_OPS_L1 +.section .l1.text +#else +.text +#endif + +.extern ___udivsi3; +.type ___udivsi3, STT_FUNC; +.globl ___umodsi3 +.type ___umodsi3, STT_FUNC; +___umodsi3: + + CC=R0==0; + IF CC JUMP .LRETURN_R0; /* Return 0, if NR == 0 */ + CC= R1==0; + IF CC JUMP .LRETURN_ZERO_VAL; /* Return 0, if DR == 0 */ + CC=R0==R1; + IF CC JUMP .LRETURN_ZERO_VAL; /* Return 0, if NR == DR */ + CC = R1 == 1; + IF CC JUMP .LRETURN_ZERO_VAL; /* Return 0, if DR == 1 */ + CC = R0<R1 (IU); + IF CC JUMP .LRETURN_R0; /* Return dividend (R0),IF NR<DR */ + + [--SP] = (R7:6); /* Push registers and */ + [--SP] = RETS; /* Return address */ + R7 = R0; /* Copy of R0 */ + R6 = R1; + SP += -12; /* Should always provide this space */ + CALL ___udivsi3; /* Compute unsigned quotient using ___udiv32()*/ + SP += 12; + R0 *= R6; /* Quotient * divisor */ + R0 = R7 - R0; /* Dividend - (quotient * divisor) */ + RETS = [SP++]; /* Pop return address */ + ( R7:6) = [SP++]; /* And registers */ + RTS; /* Return remainder */ +.LRETURN_ZERO_VAL: + R0 = 0; +.LRETURN_R0: + RTS; + +.size ___umodsi3, .-___umodsi3 diff --git a/arch/blackfin/lib/umulsi3_highpart.S b/arch/blackfin/lib/umulsi3_highpart.S new file mode 100644 index 00000000..051824a6 --- /dev/null +++ b/arch/blackfin/lib/umulsi3_highpart.S @@ -0,0 +1,31 @@ +/* + * Copyright 2007 Analog Devices Inc. + * + * Licensed under the ADI BSD license or the GPL-2 (or later) + */ + +.align 2 +.global ___umulsi3_highpart; +.type ___umulsi3_highpart, STT_FUNC; + +#ifdef CONFIG_ARITHMETIC_OPS_L1 +.section .l1.text +#else +.text +#endif + +___umulsi3_highpart: + R2 = R1.H * R0.H, R3 = R1.L * R0.H (FU); + R0 = R1.L * R0.L, R1 = R1.H * R0.L (FU); + R0 >>= 16; + /* Unsigned multiplication has the nice property that we can + ignore carry on this first addition. */ + R0 = R0 + R3; + R0 = R0 + R1; + cc = ac0; + R1 = cc; + R1 = PACK(R1.l,R0.h); + R0 = R1 + R2; + RTS; + +.size ___umulsi3_highpart, .-___umulsi3_highpart |