Diffstat (limited to 'ANDROID_3.4.5/arch/powerpc/lib/copyuser_power7.S')
-rw-r--r-- | ANDROID_3.4.5/arch/powerpc/lib/copyuser_power7.S | 683
1 file changed, 0 insertions(+), 683 deletions(-)
diff --git a/ANDROID_3.4.5/arch/powerpc/lib/copyuser_power7.S b/ANDROID_3.4.5/arch/powerpc/lib/copyuser_power7.S deleted file mode 100644 index 497db7b2..00000000 --- a/ANDROID_3.4.5/arch/powerpc/lib/copyuser_power7.S +++ /dev/null @@ -1,683 +0,0 @@ -/* - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. - * - * Copyright (C) IBM Corporation, 2011 - * - * Author: Anton Blanchard <anton@au.ibm.com> - */ -#include <asm/ppc_asm.h> - -#define STACKFRAMESIZE 256 -#define STK_REG(i) (112 + ((i)-14)*8) - - .macro err1 -100: - .section __ex_table,"a" - .align 3 - .llong 100b,.Ldo_err1 - .previous - .endm - - .macro err2 -200: - .section __ex_table,"a" - .align 3 - .llong 200b,.Ldo_err2 - .previous - .endm - -#ifdef CONFIG_ALTIVEC - .macro err3 -300: - .section __ex_table,"a" - .align 3 - .llong 300b,.Ldo_err3 - .previous - .endm - - .macro err4 -400: - .section __ex_table,"a" - .align 3 - .llong 400b,.Ldo_err4 - .previous - .endm - - -.Ldo_err4: - ld r16,STK_REG(r16)(r1) - ld r15,STK_REG(r15)(r1) - ld r14,STK_REG(r14)(r1) -.Ldo_err3: - bl .exit_vmx_copy - ld r0,STACKFRAMESIZE+16(r1) - mtlr r0 - b .Lexit -#endif /* CONFIG_ALTIVEC */ - -.Ldo_err2: - ld r22,STK_REG(r22)(r1) - ld r21,STK_REG(r21)(r1) - ld r20,STK_REG(r20)(r1) - ld r19,STK_REG(r19)(r1) - ld r18,STK_REG(r18)(r1) - ld r17,STK_REG(r17)(r1) - ld r16,STK_REG(r16)(r1) - ld r15,STK_REG(r15)(r1) - ld r14,STK_REG(r14)(r1) -.Lexit: - addi r1,r1,STACKFRAMESIZE -.Ldo_err1: - ld r3,48(r1) - ld r4,56(r1) - ld r5,64(r1) - b __copy_tofrom_user_base - - -_GLOBAL(__copy_tofrom_user_power7) -#ifdef CONFIG_ALTIVEC - cmpldi r5,16 - cmpldi cr1,r5,4096 - - std r3,48(r1) - std r4,56(r1) - std r5,64(r1) - - blt .Lshort_copy - bgt cr1,.Lvmx_copy -#else - cmpldi r5,16 - - std r3,48(r1) - std r4,56(r1) - std r5,64(r1) - - blt .Lshort_copy -#endif - -.Lnonvmx_copy: - /* Get the source 8B aligned */ - neg r6,r4 - mtocrf 0x01,r6 - clrldi r6,r6,(64-3) - - bf cr7*4+3,1f -err1; lbz r0,0(r4) - addi r4,r4,1 -err1; stb r0,0(r3) - addi r3,r3,1 - -1: bf cr7*4+2,2f -err1; lhz r0,0(r4) - addi r4,r4,2 -err1; sth r0,0(r3) - addi r3,r3,2 - -2: bf cr7*4+1,3f -err1; lwz r0,0(r4) - addi r4,r4,4 -err1; stw r0,0(r3) - addi r3,r3,4 - -3: sub r5,r5,r6 - cmpldi r5,128 - blt 5f - - mflr r0 - stdu r1,-STACKFRAMESIZE(r1) - std r14,STK_REG(r14)(r1) - std r15,STK_REG(r15)(r1) - std r16,STK_REG(r16)(r1) - std r17,STK_REG(r17)(r1) - std r18,STK_REG(r18)(r1) - std r19,STK_REG(r19)(r1) - std r20,STK_REG(r20)(r1) - std r21,STK_REG(r21)(r1) - std r22,STK_REG(r22)(r1) - std r0,STACKFRAMESIZE+16(r1) - - srdi r6,r5,7 - mtctr r6 - - /* Now do cacheline (128B) sized loads and stores. 
*/ - .align 5 -4: -err2; ld r0,0(r4) -err2; ld r6,8(r4) -err2; ld r7,16(r4) -err2; ld r8,24(r4) -err2; ld r9,32(r4) -err2; ld r10,40(r4) -err2; ld r11,48(r4) -err2; ld r12,56(r4) -err2; ld r14,64(r4) -err2; ld r15,72(r4) -err2; ld r16,80(r4) -err2; ld r17,88(r4) -err2; ld r18,96(r4) -err2; ld r19,104(r4) -err2; ld r20,112(r4) -err2; ld r21,120(r4) - addi r4,r4,128 -err2; std r0,0(r3) -err2; std r6,8(r3) -err2; std r7,16(r3) -err2; std r8,24(r3) -err2; std r9,32(r3) -err2; std r10,40(r3) -err2; std r11,48(r3) -err2; std r12,56(r3) -err2; std r14,64(r3) -err2; std r15,72(r3) -err2; std r16,80(r3) -err2; std r17,88(r3) -err2; std r18,96(r3) -err2; std r19,104(r3) -err2; std r20,112(r3) -err2; std r21,120(r3) - addi r3,r3,128 - bdnz 4b - - clrldi r5,r5,(64-7) - - ld r14,STK_REG(r14)(r1) - ld r15,STK_REG(r15)(r1) - ld r16,STK_REG(r16)(r1) - ld r17,STK_REG(r17)(r1) - ld r18,STK_REG(r18)(r1) - ld r19,STK_REG(r19)(r1) - ld r20,STK_REG(r20)(r1) - ld r21,STK_REG(r21)(r1) - ld r22,STK_REG(r22)(r1) - addi r1,r1,STACKFRAMESIZE - - /* Up to 127B to go */ -5: srdi r6,r5,4 - mtocrf 0x01,r6 - -6: bf cr7*4+1,7f -err1; ld r0,0(r4) -err1; ld r6,8(r4) -err1; ld r7,16(r4) -err1; ld r8,24(r4) -err1; ld r9,32(r4) -err1; ld r10,40(r4) -err1; ld r11,48(r4) -err1; ld r12,56(r4) - addi r4,r4,64 -err1; std r0,0(r3) -err1; std r6,8(r3) -err1; std r7,16(r3) -err1; std r8,24(r3) -err1; std r9,32(r3) -err1; std r10,40(r3) -err1; std r11,48(r3) -err1; std r12,56(r3) - addi r3,r3,64 - - /* Up to 63B to go */ -7: bf cr7*4+2,8f -err1; ld r0,0(r4) -err1; ld r6,8(r4) -err1; ld r7,16(r4) -err1; ld r8,24(r4) - addi r4,r4,32 -err1; std r0,0(r3) -err1; std r6,8(r3) -err1; std r7,16(r3) -err1; std r8,24(r3) - addi r3,r3,32 - - /* Up to 31B to go */ -8: bf cr7*4+3,9f -err1; ld r0,0(r4) -err1; ld r6,8(r4) - addi r4,r4,16 -err1; std r0,0(r3) -err1; std r6,8(r3) - addi r3,r3,16 - -9: clrldi r5,r5,(64-4) - - /* Up to 15B to go */ -.Lshort_copy: - mtocrf 0x01,r5 - bf cr7*4+0,12f -err1; lwz r0,0(r4) /* Less chance of a reject with word ops */ -err1; lwz r6,4(r4) - addi r4,r4,8 -err1; stw r0,0(r3) -err1; stw r6,4(r3) - addi r3,r3,8 - -12: bf cr7*4+1,13f -err1; lwz r0,0(r4) - addi r4,r4,4 -err1; stw r0,0(r3) - addi r3,r3,4 - -13: bf cr7*4+2,14f -err1; lhz r0,0(r4) - addi r4,r4,2 -err1; sth r0,0(r3) - addi r3,r3,2 - -14: bf cr7*4+3,15f -err1; lbz r0,0(r4) -err1; stb r0,0(r3) - -15: li r3,0 - blr - -.Lunwind_stack_nonvmx_copy: - addi r1,r1,STACKFRAMESIZE - b .Lnonvmx_copy - -#ifdef CONFIG_ALTIVEC -.Lvmx_copy: - mflr r0 - std r0,16(r1) - stdu r1,-STACKFRAMESIZE(r1) - bl .enter_vmx_copy - cmpwi r3,0 - ld r0,STACKFRAMESIZE+16(r1) - ld r3,STACKFRAMESIZE+48(r1) - ld r4,STACKFRAMESIZE+56(r1) - ld r5,STACKFRAMESIZE+64(r1) - mtlr r0 - - beq .Lunwind_stack_nonvmx_copy - - /* - * If source and destination are not relatively aligned we use a - * slower permute loop. - */ - xor r6,r4,r3 - rldicl. 
r6,r6,0,(64-4) - bne .Lvmx_unaligned_copy - - /* Get the destination 16B aligned */ - neg r6,r3 - mtocrf 0x01,r6 - clrldi r6,r6,(64-4) - - bf cr7*4+3,1f -err3; lbz r0,0(r4) - addi r4,r4,1 -err3; stb r0,0(r3) - addi r3,r3,1 - -1: bf cr7*4+2,2f -err3; lhz r0,0(r4) - addi r4,r4,2 -err3; sth r0,0(r3) - addi r3,r3,2 - -2: bf cr7*4+1,3f -err3; lwz r0,0(r4) - addi r4,r4,4 -err3; stw r0,0(r3) - addi r3,r3,4 - -3: bf cr7*4+0,4f -err3; ld r0,0(r4) - addi r4,r4,8 -err3; std r0,0(r3) - addi r3,r3,8 - -4: sub r5,r5,r6 - - /* Get the desination 128B aligned */ - neg r6,r3 - srdi r7,r6,4 - mtocrf 0x01,r7 - clrldi r6,r6,(64-7) - - li r9,16 - li r10,32 - li r11,48 - - bf cr7*4+3,5f -err3; lvx vr1,r0,r4 - addi r4,r4,16 -err3; stvx vr1,r0,r3 - addi r3,r3,16 - -5: bf cr7*4+2,6f -err3; lvx vr1,r0,r4 -err3; lvx vr0,r4,r9 - addi r4,r4,32 -err3; stvx vr1,r0,r3 -err3; stvx vr0,r3,r9 - addi r3,r3,32 - -6: bf cr7*4+1,7f -err3; lvx vr3,r0,r4 -err3; lvx vr2,r4,r9 -err3; lvx vr1,r4,r10 -err3; lvx vr0,r4,r11 - addi r4,r4,64 -err3; stvx vr3,r0,r3 -err3; stvx vr2,r3,r9 -err3; stvx vr1,r3,r10 -err3; stvx vr0,r3,r11 - addi r3,r3,64 - -7: sub r5,r5,r6 - srdi r6,r5,7 - - std r14,STK_REG(r14)(r1) - std r15,STK_REG(r15)(r1) - std r16,STK_REG(r16)(r1) - - li r12,64 - li r14,80 - li r15,96 - li r16,112 - - mtctr r6 - - /* - * Now do cacheline sized loads and stores. By this stage the - * cacheline stores are also cacheline aligned. - */ - .align 5 -8: -err4; lvx vr7,r0,r4 -err4; lvx vr6,r4,r9 -err4; lvx vr5,r4,r10 -err4; lvx vr4,r4,r11 -err4; lvx vr3,r4,r12 -err4; lvx vr2,r4,r14 -err4; lvx vr1,r4,r15 -err4; lvx vr0,r4,r16 - addi r4,r4,128 -err4; stvx vr7,r0,r3 -err4; stvx vr6,r3,r9 -err4; stvx vr5,r3,r10 -err4; stvx vr4,r3,r11 -err4; stvx vr3,r3,r12 -err4; stvx vr2,r3,r14 -err4; stvx vr1,r3,r15 -err4; stvx vr0,r3,r16 - addi r3,r3,128 - bdnz 8b - - ld r14,STK_REG(r14)(r1) - ld r15,STK_REG(r15)(r1) - ld r16,STK_REG(r16)(r1) - - /* Up to 127B to go */ - clrldi r5,r5,(64-7) - srdi r6,r5,4 - mtocrf 0x01,r6 - - bf cr7*4+1,9f -err3; lvx vr3,r0,r4 -err3; lvx vr2,r4,r9 -err3; lvx vr1,r4,r10 -err3; lvx vr0,r4,r11 - addi r4,r4,64 -err3; stvx vr3,r0,r3 -err3; stvx vr2,r3,r9 -err3; stvx vr1,r3,r10 -err3; stvx vr0,r3,r11 - addi r3,r3,64 - -9: bf cr7*4+2,10f -err3; lvx vr1,r0,r4 -err3; lvx vr0,r4,r9 - addi r4,r4,32 -err3; stvx vr1,r0,r3 -err3; stvx vr0,r3,r9 - addi r3,r3,32 - -10: bf cr7*4+3,11f -err3; lvx vr1,r0,r4 - addi r4,r4,16 -err3; stvx vr1,r0,r3 - addi r3,r3,16 - - /* Up to 15B to go */ -11: clrldi r5,r5,(64-4) - mtocrf 0x01,r5 - bf cr7*4+0,12f -err3; ld r0,0(r4) - addi r4,r4,8 -err3; std r0,0(r3) - addi r3,r3,8 - -12: bf cr7*4+1,13f -err3; lwz r0,0(r4) - addi r4,r4,4 -err3; stw r0,0(r3) - addi r3,r3,4 - -13: bf cr7*4+2,14f -err3; lhz r0,0(r4) - addi r4,r4,2 -err3; sth r0,0(r3) - addi r3,r3,2 - -14: bf cr7*4+3,15f -err3; lbz r0,0(r4) -err3; stb r0,0(r3) - -15: addi r1,r1,STACKFRAMESIZE - b .exit_vmx_copy /* tail call optimise */ - -.Lvmx_unaligned_copy: - /* Get the destination 16B aligned */ - neg r6,r3 - mtocrf 0x01,r6 - clrldi r6,r6,(64-4) - - bf cr7*4+3,1f -err3; lbz r0,0(r4) - addi r4,r4,1 -err3; stb r0,0(r3) - addi r3,r3,1 - -1: bf cr7*4+2,2f -err3; lhz r0,0(r4) - addi r4,r4,2 -err3; sth r0,0(r3) - addi r3,r3,2 - -2: bf cr7*4+1,3f -err3; lwz r0,0(r4) - addi r4,r4,4 -err3; stw r0,0(r3) - addi r3,r3,4 - -3: bf cr7*4+0,4f -err3; lwz r0,0(r4) /* Less chance of a reject with word ops */ -err3; lwz r7,4(r4) - addi r4,r4,8 -err3; stw r0,0(r3) -err3; stw r7,4(r3) - addi r3,r3,8 - -4: sub r5,r5,r6 - - /* Get the desination 128B aligned */ 
- neg r6,r3 - srdi r7,r6,4 - mtocrf 0x01,r7 - clrldi r6,r6,(64-7) - - li r9,16 - li r10,32 - li r11,48 - - lvsl vr16,0,r4 /* Setup permute control vector */ -err3; lvx vr0,0,r4 - addi r4,r4,16 - - bf cr7*4+3,5f -err3; lvx vr1,r0,r4 - vperm vr8,vr0,vr1,vr16 - addi r4,r4,16 -err3; stvx vr8,r0,r3 - addi r3,r3,16 - vor vr0,vr1,vr1 - -5: bf cr7*4+2,6f -err3; lvx vr1,r0,r4 - vperm vr8,vr0,vr1,vr16 -err3; lvx vr0,r4,r9 - vperm vr9,vr1,vr0,vr16 - addi r4,r4,32 -err3; stvx vr8,r0,r3 -err3; stvx vr9,r3,r9 - addi r3,r3,32 - -6: bf cr7*4+1,7f -err3; lvx vr3,r0,r4 - vperm vr8,vr0,vr3,vr16 -err3; lvx vr2,r4,r9 - vperm vr9,vr3,vr2,vr16 -err3; lvx vr1,r4,r10 - vperm vr10,vr2,vr1,vr16 -err3; lvx vr0,r4,r11 - vperm vr11,vr1,vr0,vr16 - addi r4,r4,64 -err3; stvx vr8,r0,r3 -err3; stvx vr9,r3,r9 -err3; stvx vr10,r3,r10 -err3; stvx vr11,r3,r11 - addi r3,r3,64 - -7: sub r5,r5,r6 - srdi r6,r5,7 - - std r14,STK_REG(r14)(r1) - std r15,STK_REG(r15)(r1) - std r16,STK_REG(r16)(r1) - - li r12,64 - li r14,80 - li r15,96 - li r16,112 - - mtctr r6 - - /* - * Now do cacheline sized loads and stores. By this stage the - * cacheline stores are also cacheline aligned. - */ - .align 5 -8: -err4; lvx vr7,r0,r4 - vperm vr8,vr0,vr7,vr16 -err4; lvx vr6,r4,r9 - vperm vr9,vr7,vr6,vr16 -err4; lvx vr5,r4,r10 - vperm vr10,vr6,vr5,vr16 -err4; lvx vr4,r4,r11 - vperm vr11,vr5,vr4,vr16 -err4; lvx vr3,r4,r12 - vperm vr12,vr4,vr3,vr16 -err4; lvx vr2,r4,r14 - vperm vr13,vr3,vr2,vr16 -err4; lvx vr1,r4,r15 - vperm vr14,vr2,vr1,vr16 -err4; lvx vr0,r4,r16 - vperm vr15,vr1,vr0,vr16 - addi r4,r4,128 -err4; stvx vr8,r0,r3 -err4; stvx vr9,r3,r9 -err4; stvx vr10,r3,r10 -err4; stvx vr11,r3,r11 -err4; stvx vr12,r3,r12 -err4; stvx vr13,r3,r14 -err4; stvx vr14,r3,r15 -err4; stvx vr15,r3,r16 - addi r3,r3,128 - bdnz 8b - - ld r14,STK_REG(r14)(r1) - ld r15,STK_REG(r15)(r1) - ld r16,STK_REG(r16)(r1) - - /* Up to 127B to go */ - clrldi r5,r5,(64-7) - srdi r6,r5,4 - mtocrf 0x01,r6 - - bf cr7*4+1,9f -err3; lvx vr3,r0,r4 - vperm vr8,vr0,vr3,vr16 -err3; lvx vr2,r4,r9 - vperm vr9,vr3,vr2,vr16 -err3; lvx vr1,r4,r10 - vperm vr10,vr2,vr1,vr16 -err3; lvx vr0,r4,r11 - vperm vr11,vr1,vr0,vr16 - addi r4,r4,64 -err3; stvx vr8,r0,r3 -err3; stvx vr9,r3,r9 -err3; stvx vr10,r3,r10 -err3; stvx vr11,r3,r11 - addi r3,r3,64 - -9: bf cr7*4+2,10f -err3; lvx vr1,r0,r4 - vperm vr8,vr0,vr1,vr16 -err3; lvx vr0,r4,r9 - vperm vr9,vr1,vr0,vr16 - addi r4,r4,32 -err3; stvx vr8,r0,r3 -err3; stvx vr9,r3,r9 - addi r3,r3,32 - -10: bf cr7*4+3,11f -err3; lvx vr1,r0,r4 - vperm vr8,vr0,vr1,vr16 - addi r4,r4,16 -err3; stvx vr8,r0,r3 - addi r3,r3,16 - - /* Up to 15B to go */ -11: clrldi r5,r5,(64-4) - addi r4,r4,-16 /* Unwind the +16 load offset */ - mtocrf 0x01,r5 - bf cr7*4+0,12f -err3; lwz r0,0(r4) /* Less chance of a reject with word ops */ -err3; lwz r6,4(r4) - addi r4,r4,8 -err3; stw r0,0(r3) -err3; stw r6,4(r3) - addi r3,r3,8 - -12: bf cr7*4+1,13f -err3; lwz r0,0(r4) - addi r4,r4,4 -err3; stw r0,0(r3) - addi r3,r3,4 - -13: bf cr7*4+2,14f -err3; lhz r0,0(r4) - addi r4,r4,2 -err3; sth r0,0(r3) - addi r3,r3,2 - -14: bf cr7*4+3,15f -err3; lbz r0,0(r4) -err3; stb r0,0(r3) - -15: addi r1,r1,STACKFRAMESIZE - b .exit_vmx_copy /* tail call optimise */ -#endif /* CONFiG_ALTIVEC */ |
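The deleted file is the POWER7-tuned __copy_tofrom_user implementation. Every user-space load and store in it is wrapped by one of the err1-err4 macros, which place a local label over the access and record a (label, fixup) pair in the __ex_table section; when an access faults, execution resumes at the matching fixup (.Ldo_err1-.Ldo_err4), which restores any saved non-volatile registers, leaves VMX if it was entered, and falls back to __copy_tofrom_user_base. The following is a minimal C sketch of that table-lookup idea only; struct ex_entry and find_fixup are illustrative names, not the kernel's own types or API.

#include <stddef.h>
#include <stdint.h>

/* Illustrative only: models the (faulting-instruction, fixup) pairs that
 * the err1-err4 macros emit into the __ex_table section. The struct and
 * function names here are hypothetical, not kernel interfaces. */
struct ex_entry {
	uintptr_t insn;   /* address of the guarded user-space load/store    */
	uintptr_t fixup;  /* handler to resume at, e.g. .Ldo_err1 / .Ldo_err2 */
};

/* Linear scan standing in for the kernel's exception-table search: given
 * the faulting program counter, return where to resume, or 0 if the fault
 * happened outside any guarded access and is therefore fatal. */
uintptr_t find_fixup(const struct ex_entry *table, size_t count, uintptr_t pc)
{
	for (size_t i = 0; i < count; i++)
		if (table[i].insn == pc)
			return table[i].fixup;
	return 0;
}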
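Structurally, the routine first aligns the source with byte/halfword/word moves, then copies one full 128-byte cacheline per iteration of the unrolled main loop (spilling r14-r22 to a 256-byte stack frame so sixteen 8-byte registers can be in flight at once), then drains 64-, 32- and 16-byte tails and a final 0-15 byte remainder; copies larger than 4096 bytes take the Altivec path instead, which uses lvsl/vperm to handle buffers that are not relatively aligned. The C sketch below mirrors only that block/tail shape and assumes no faults; blocked_copy is a hypothetical helper, not the kernel routine, and memcpy stands in for the hand-scheduled loads and stores.

#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* Illustrative only: align the source, move whole 128-byte cachelines,
 * then shrinking tails. The real routine also returns how many bytes it
 * could not copy when a user access faults. */
void blocked_copy(void *dst, const void *src, size_t n)
{
	unsigned char *d = dst;
	const unsigned char *s = src;
	size_t head = (uintptr_t)s & 7;

	/* Prologue: 1/2/4-byte moves until the source is 8-byte aligned. */
	if (head) {
		head = 8 - head;
		if (head > n)
			head = n;
		memcpy(d, s, head);
		d += head; s += head; n -= head;
	}

	/* Main loop: one 128-byte cacheline per iteration (label "4:"). */
	while (n >= 128) {
		memcpy(d, s, 128);
		d += 128; s += 128; n -= 128;
	}

	/* 64-, 32- and 16-byte tails, then the final 0-15 bytes. */
	for (size_t step = 64; step >= 16; step >>= 1) {
		if (n >= step) {
			memcpy(d, s, step);
			d += step; s += step; n -= step;
		}
	}
	memcpy(d, s, n);
}

int main(void)
{
	char src[300], dst[300];
	for (int i = 0; i < 300; i++)
		src[i] = (char)i;
	blocked_copy(dst, src, sizeof(src));
	printf("%s\n", memcmp(dst, src, sizeof(src)) == 0 ? "ok" : "mismatch");
	return 0;
}

The one-cacheline-per-iteration blocking is what lets the assembly issue all sixteen loads of a line before any of its stores, which is the reason r14-r22 are spilled in the first place; the C sketch cannot express that scheduling, only the sizes involved.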