author | Srikant Patnaik | 2015-01-11 12:28:04 +0530 |
---|---|---|
committer | Srikant Patnaik | 2015-01-11 12:28:04 +0530 |
commit | 871480933a1c28f8a9fed4c4d34d06c439a7a422 (patch) | |
tree | 8718f573808810c2a1e8cb8fb6ac469093ca2784 /ANDROID_3.4.5/arch/alpha/lib/ev6-copy_page.S | |
parent | 9d40ac5867b9aefe0722bc1f110b965ff294d30d (diff) | |
download | FOSSEE-netbook-kernel-source-871480933a1c28f8a9fed4c4d34d06c439a7a422.tar.gz FOSSEE-netbook-kernel-source-871480933a1c28f8a9fed4c4d34d06c439a7a422.tar.bz2 FOSSEE-netbook-kernel-source-871480933a1c28f8a9fed4c4d34d06c439a7a422.zip |
Moved, renamed, and deleted files
The original directory structure was scattered and unorganized.
The changes essentially reorganize the tree to match the standard kernel source layout.
Diffstat (limited to 'ANDROID_3.4.5/arch/alpha/lib/ev6-copy_page.S')
-rw-r--r-- | ANDROID_3.4.5/arch/alpha/lib/ev6-copy_page.S | 203 |
1 file changed, 0 insertions, 203 deletions
diff --git a/ANDROID_3.4.5/arch/alpha/lib/ev6-copy_page.S b/ANDROID_3.4.5/arch/alpha/lib/ev6-copy_page.S
deleted file mode 100644
index b789db19..00000000
--- a/ANDROID_3.4.5/arch/alpha/lib/ev6-copy_page.S
+++ /dev/null
@@ -1,203 +0,0 @@
-/*
- * arch/alpha/lib/ev6-copy_page.S
- *
- * Copy an entire page.
- */
-
-/* The following comparison of this routine vs the normal copy_page.S
-   was written by an unnamed ev6 hardware designer and forwarded to me
-   via Steven Hobbs <hobbs@steven.zko.dec.com>.
-
-   First Problem: STQ overflows.
-   -----------------------------
-
-	It would be nice if EV6 handled every resource overflow efficiently,
-	but for some it doesn't. Including store queue overflows. It causes
-	a trap and a restart of the pipe.
-
-	To get around this we sometimes use (to borrow a term from a VSSAD
-	researcher) "aeration". The idea is to slow the rate at which the
-	processor receives valid instructions by inserting nops in the fetch
-	path. In doing so, you can prevent the overflow and actually make
-	the code run faster. You can, of course, take advantage of the fact
-	that the processor can fetch at most 4 aligned instructions per cycle.
-
-	I inserted enough nops to force it to take 10 cycles to fetch the
-	loop code. In theory, EV6 should be able to execute this loop in
-	9 cycles but I was not able to get it to run that fast -- the initial
-	conditions were such that I could not reach this optimum rate on
-	(chaotic) EV6. I wrote the code such that everything would issue
-	in order.
-
-   Second Problem: Dcache index matches.
-   -------------------------------------
-
-	If you are going to use this routine on random aligned pages, there
-	is a 25% chance that the pages will be at the same dcache indices.
-	This results in many nasty memory traps without care.
-
-	The solution is to schedule the prefetches to avoid the memory
-	conflicts. I schedule the wh64 prefetches farther ahead of the
-	read prefetches to avoid this problem.
-
-   Third Problem: Needs more prefetching.
-   --------------------------------------
-
-	In order to improve the code I added deeper prefetching to take the
-	most advantage of EV6's bandwidth.
-
-	I also prefetched the read stream. Note that adding the read prefetch
-	forced me to add another cycle to the inner-most kernel - up to 11
-	from the original 8 cycles per iteration. We could improve performance
-	further by unrolling the loop and doing multiple prefetches per cycle.
-
-   I think that the code below will be very robust and fast code for the
-   purposes of copying aligned pages. It is slower when both source and
-   destination pages are in the dcache, but it is my guess that this is
-   less important than the dcache miss case. */
-
-
-	.text
-	.align 4
-	.global copy_page
-	.ent copy_page
-copy_page:
-	.prologue 0
-
-	/* Prefetch 5 read cachelines; write-hint 10 cache lines. */
-	wh64 ($16)
-	ldl $31,0($17)
-	ldl $31,64($17)
-	lda $1,1*64($16)
-
-	wh64 ($1)
-	ldl $31,128($17)
-	ldl $31,192($17)
-	lda $1,2*64($16)
-
-	wh64 ($1)
-	ldl $31,256($17)
-	lda $18,118
-	lda $1,3*64($16)
-
-	wh64 ($1)
-	nop
-	lda $1,4*64($16)
-	lda $2,5*64($16)
-
-	wh64 ($1)
-	wh64 ($2)
-	lda $1,6*64($16)
-	lda $2,7*64($16)
-
-	wh64 ($1)
-	wh64 ($2)
-	lda $1,8*64($16)
-	lda $2,9*64($16)
-
-	wh64 ($1)
-	wh64 ($2)
-	lda $19,10*64($16)
-	nop
-
-	/* Main prefetching/write-hinting loop. */
-1:	ldq $0,0($17)
-	ldq $1,8($17)
-	unop
-	unop
-
-	unop
-	unop
-	ldq $2,16($17)
-	ldq $3,24($17)
-
-	ldq $4,32($17)
-	ldq $5,40($17)
-	unop
-	unop
-
-	unop
-	unop
-	ldq $6,48($17)
-	ldq $7,56($17)
-
-	ldl $31,320($17)
-	unop
-	unop
-	unop
-
-	/* This gives the extra cycle of aeration above the minimum. */
-	unop
-	unop
-	unop
-	unop
-
-	wh64 ($19)
-	unop
-	unop
-	unop
-
-	stq $0,0($16)
-	subq $18,1,$18
-	stq $1,8($16)
-	unop
-
-	unop
-	stq $2,16($16)
-	addq $17,64,$17
-	stq $3,24($16)
-
-	stq $4,32($16)
-	stq $5,40($16)
-	addq $19,64,$19
-	unop
-
-	stq $6,48($16)
-	stq $7,56($16)
-	addq $16,64,$16
-	bne $18, 1b
-
-	/* Prefetch the final 5 cache lines of the read stream. */
-	lda $18,10
-	ldl $31,320($17)
-	ldl $31,384($17)
-	ldl $31,448($17)
-
-	ldl $31,512($17)
-	ldl $31,576($17)
-	nop
-	nop
-
-	/* Non-prefetching, non-write-hinting cleanup loop for the
-	   final 10 cache lines. */
-2:	ldq $0,0($17)
-	ldq $1,8($17)
-	ldq $2,16($17)
-	ldq $3,24($17)
-
-	ldq $4,32($17)
-	ldq $5,40($17)
-	ldq $6,48($17)
-	ldq $7,56($17)
-
-	stq $0,0($16)
-	subq $18,1,$18
-	stq $1,8($16)
-	addq $17,64,$17
-
-	stq $2,16($16)
-	stq $3,24($16)
-	stq $4,32($16)
-	stq $5,40($16)
-
-	stq $6,48($16)
-	stq $7,56($16)
-	addq $16,64,$16
-	bne $18, 2b
-
-	ret
-	nop
-	unop
-	nop
-
-	.end copy_page
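The deleted routine copies one Alpha page (8 KB) in 64-byte cache-line chunks: the main loop runs 118 iterations (lda $18,118) and the cleanup loop 10 more (lda $18,10), so 128 × 64 = 8192 bytes in total, while the read stream is prefetched a few lines ahead (the ldl $31 loads) and the write stream is hinted with wh64. The C sketch below is only a rough, hypothetical illustration of that structure, not the kernel's implementation: PAGE_SIZE_BYTES, LINE, PREFETCH_AHEAD, and copy_page_sketch are made-up names, and GCC's __builtin_prefetch stands in for the Alpha read prefetch; the wh64 write hint and the EV6-specific "aeration" nops have no portable counterpart here.

```c
/*
 * Hypothetical sketch only: copy one 8 KB page in 64-byte cache-line
 * chunks while keeping the read stream prefetched a few lines ahead,
 * roughly mirroring the shape of the deleted ev6-copy_page.S.
 * PAGE_SIZE_BYTES, LINE, PREFETCH_AHEAD, and copy_page_sketch are
 * illustrative names; __builtin_prefetch() only approximates the Alpha
 * "ldl $31" read prefetch, and the "wh64" write hint is not modeled.
 */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define PAGE_SIZE_BYTES 8192u  /* Alpha pages are 8 KB = 128 x 64-byte lines */
#define LINE            64u    /* EV6 cache-line size */
#define PREFETCH_AHEAD  5u     /* read stream stays ~5 lines ahead, as in the .S */

static void copy_page_sketch(void *restrict dst, const void *restrict src)
{
    unsigned char *d = dst;
    const unsigned char *s = src;
    unsigned off;

    /* Warm up the read stream, like the initial block of ldl $31 prefetches. */
    for (off = 0; off < PREFETCH_AHEAD * LINE; off += LINE)
        __builtin_prefetch(s + off, 0 /* read */, 0 /* low temporal locality */);

    for (off = 0; off < PAGE_SIZE_BYTES; off += LINE) {
        /* Keep the read prefetch ahead of the copy, bounded to the page. */
        if (off + PREFETCH_AHEAD * LINE < PAGE_SIZE_BYTES)
            __builtin_prefetch(s + off + PREFETCH_AHEAD * LINE, 0, 0);

        /* One cache line per iteration; stands in for the 8 ldq/stq pairs. */
        memcpy(d + off, s + off, LINE);
    }
}

int main(void)
{
    unsigned char *src = malloc(PAGE_SIZE_BYTES);
    unsigned char *dst = malloc(PAGE_SIZE_BYTES);
    unsigned i;

    if (!src || !dst)
        return 1;
    for (i = 0; i < PAGE_SIZE_BYTES; i++)
        src[i] = (unsigned char)i;
    copy_page_sketch(dst, src);
    printf("pages match: %s\n",
           memcmp(dst, src, PAGE_SIZE_BYTES) == 0 ? "yes" : "no");
    free(src);
    free(dst);
    return 0;
}
```

The scheduling details that the original comment spends most of its time on (pacing instruction fetch with unops so the store queue never overflows, and spacing the wh64 hints away from the read prefetches to dodge dcache index conflicts) are exactly the parts a portable sketch like this cannot express; they only exist in the EV6 assembly above.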