diff options
Diffstat (limited to 'arch/x86/lib/memset_64.S')
-rw-r--r-- | arch/x86/lib/memset_64.S | 154 |
1 files changed, 154 insertions, 0 deletions
diff --git a/arch/x86/lib/memset_64.S b/arch/x86/lib/memset_64.S new file mode 100644 index 00000000..2dcb3808 --- /dev/null +++ b/arch/x86/lib/memset_64.S @@ -0,0 +1,154 @@ +/* Copyright 2002 Andi Kleen, SuSE Labs */ + +#include <linux/linkage.h> +#include <asm/dwarf2.h> +#include <asm/cpufeature.h> +#include <asm/alternative-asm.h> + +/* + * ISO C memset - set a memory block to a byte value. This function uses fast + * string to get better performance than the original function. The code is + * simpler and shorter than the orignal function as well. + * + * rdi destination + * rsi value (char) + * rdx count (bytes) + * + * rax original destination + */ + .section .altinstr_replacement, "ax", @progbits +.Lmemset_c: + movq %rdi,%r9 + movq %rdx,%rcx + andl $7,%edx + shrq $3,%rcx + /* expand byte value */ + movzbl %sil,%esi + movabs $0x0101010101010101,%rax + imulq %rsi,%rax + rep stosq + movl %edx,%ecx + rep stosb + movq %r9,%rax + ret +.Lmemset_e: + .previous + +/* + * ISO C memset - set a memory block to a byte value. This function uses + * enhanced rep stosb to override the fast string function. + * The code is simpler and shorter than the fast string function as well. + * + * rdi destination + * rsi value (char) + * rdx count (bytes) + * + * rax original destination + */ + .section .altinstr_replacement, "ax", @progbits +.Lmemset_c_e: + movq %rdi,%r9 + movb %sil,%al + movq %rdx,%rcx + rep stosb + movq %r9,%rax + ret +.Lmemset_e_e: + .previous + +ENTRY(memset) +ENTRY(__memset) + CFI_STARTPROC + movq %rdi,%r10 + + /* expand byte value */ + movzbl %sil,%ecx + movabs $0x0101010101010101,%rax + imulq %rcx,%rax + + /* align dst */ + movl %edi,%r9d + andl $7,%r9d + jnz .Lbad_alignment + CFI_REMEMBER_STATE +.Lafter_bad_alignment: + + movq %rdx,%rcx + shrq $6,%rcx + jz .Lhandle_tail + + .p2align 4 +.Lloop_64: + decq %rcx + movq %rax,(%rdi) + movq %rax,8(%rdi) + movq %rax,16(%rdi) + movq %rax,24(%rdi) + movq %rax,32(%rdi) + movq %rax,40(%rdi) + movq %rax,48(%rdi) + movq %rax,56(%rdi) + leaq 64(%rdi),%rdi + jnz .Lloop_64 + + /* Handle tail in loops. The loops should be faster than hard + to predict jump tables. */ + .p2align 4 +.Lhandle_tail: + movl %edx,%ecx + andl $63&(~7),%ecx + jz .Lhandle_7 + shrl $3,%ecx + .p2align 4 +.Lloop_8: + decl %ecx + movq %rax,(%rdi) + leaq 8(%rdi),%rdi + jnz .Lloop_8 + +.Lhandle_7: + andl $7,%edx + jz .Lende + .p2align 4 +.Lloop_1: + decl %edx + movb %al,(%rdi) + leaq 1(%rdi),%rdi + jnz .Lloop_1 + +.Lende: + movq %r10,%rax + ret + + CFI_RESTORE_STATE +.Lbad_alignment: + cmpq $7,%rdx + jbe .Lhandle_7 + movq %rax,(%rdi) /* unaligned store */ + movq $8,%r8 + subq %r9,%r8 + addq %r8,%rdi + subq %r8,%rdx + jmp .Lafter_bad_alignment +.Lfinal: + CFI_ENDPROC +ENDPROC(memset) +ENDPROC(__memset) + + /* Some CPUs support enhanced REP MOVSB/STOSB feature. + * It is recommended to use this when possible. + * + * If enhanced REP MOVSB/STOSB feature is not available, use fast string + * instructions. + * + * Otherwise, use original memset function. + * + * In .altinstructions section, ERMS feature is placed after REG_GOOD + * feature to implement the right patch order. + */ + .section .altinstructions,"a" + altinstruction_entry memset,.Lmemset_c,X86_FEATURE_REP_GOOD,\ + .Lfinal-memset,.Lmemset_e-.Lmemset_c + altinstruction_entry memset,.Lmemset_c_e,X86_FEATURE_ERMS, \ + .Lfinal-memset,.Lmemset_e_e-.Lmemset_c_e + .previous |