diff options
Diffstat (limited to 'mm/madvise.c')
-rw-r--r-- | mm/madvise.c | 451 |
1 files changed, 451 insertions, 0 deletions
diff --git a/mm/madvise.c b/mm/madvise.c new file mode 100644 index 00000000..55f645c8 --- /dev/null +++ b/mm/madvise.c @@ -0,0 +1,451 @@ +/* + * linux/mm/madvise.c + * + * Copyright (C) 1999 Linus Torvalds + * Copyright (C) 2002 Christoph Hellwig + */ + +#include <linux/mman.h> +#include <linux/pagemap.h> +#include <linux/syscalls.h> +#include <linux/mempolicy.h> +#include <linux/page-isolation.h> +#include <linux/hugetlb.h> +#include <linux/sched.h> +#include <linux/ksm.h> +#include <linux/file.h> + +/* + * Any behaviour which results in changes to the vma->vm_flags needs to + * take mmap_sem for writing. Others, which simply traverse vmas, need + * to only take it for reading. + */ +static int madvise_need_mmap_write(int behavior) +{ + switch (behavior) { + case MADV_REMOVE: + case MADV_WILLNEED: + case MADV_DONTNEED: + return 0; + default: + /* be safe, default to 1. list exceptions explicitly */ + return 1; + } +} + +/* + * We can potentially split a vm area into separate + * areas, each area with its own behavior. + */ +static long madvise_behavior(struct vm_area_struct * vma, + struct vm_area_struct **prev, + unsigned long start, unsigned long end, int behavior) +{ + struct mm_struct * mm = vma->vm_mm; + int error = 0; + pgoff_t pgoff; + unsigned long new_flags = vma->vm_flags; + + switch (behavior) { + case MADV_NORMAL: + new_flags = new_flags & ~VM_RAND_READ & ~VM_SEQ_READ; + break; + case MADV_SEQUENTIAL: + new_flags = (new_flags & ~VM_RAND_READ) | VM_SEQ_READ; + break; + case MADV_RANDOM: + new_flags = (new_flags & ~VM_SEQ_READ) | VM_RAND_READ; + break; + case MADV_DONTFORK: + new_flags |= VM_DONTCOPY; + break; + case MADV_DOFORK: + if (vma->vm_flags & VM_IO) { + error = -EINVAL; + goto out; + } + new_flags &= ~VM_DONTCOPY; + break; + case MADV_DONTDUMP: + new_flags |= VM_NODUMP; + break; + case MADV_DODUMP: + new_flags &= ~VM_NODUMP; + break; + case MADV_MERGEABLE: + case MADV_UNMERGEABLE: + error = ksm_madvise(vma, start, end, behavior, &new_flags); + if (error) + goto out; + break; + case MADV_HUGEPAGE: + case MADV_NOHUGEPAGE: + error = hugepage_madvise(vma, &new_flags, behavior); + if (error) + goto out; + break; + } + + if (new_flags == vma->vm_flags) { + *prev = vma; + goto out; + } + + pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT); + *prev = vma_merge(mm, *prev, start, end, new_flags, vma->anon_vma, + vma->vm_file, pgoff, vma_policy(vma)); + if (*prev) { + vma = *prev; + goto success; + } + + *prev = vma; + + if (start != vma->vm_start) { + error = split_vma(mm, vma, start, 1); + if (error) + goto out; + } + + if (end != vma->vm_end) { + error = split_vma(mm, vma, end, 0); + if (error) + goto out; + } + +success: + /* + * vm_flags is protected by the mmap_sem held in write mode. + */ + vma->vm_flags = new_flags; + +out: + if (error == -ENOMEM) + error = -EAGAIN; + return error; +} + +/* + * Schedule all required I/O operations. Do not wait for completion. + */ +static long madvise_willneed(struct vm_area_struct * vma, + struct vm_area_struct ** prev, + unsigned long start, unsigned long end) +{ + struct file *file = vma->vm_file; + + if (!file) + return -EBADF; + + if (file->f_mapping->a_ops->get_xip_mem) { + /* no bad return value, but ignore advice */ + return 0; + } + + *prev = vma; + start = ((start - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff; + if (end > vma->vm_end) + end = vma->vm_end; + end = ((end - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff; + + force_page_cache_readahead(file->f_mapping, file, start, end - start); + return 0; +} + +/* + * Application no longer needs these pages. If the pages are dirty, + * it's OK to just throw them away. The app will be more careful about + * data it wants to keep. Be sure to free swap resources too. The + * zap_page_range call sets things up for shrink_active_list to actually free + * these pages later if no one else has touched them in the meantime, + * although we could add these pages to a global reuse list for + * shrink_active_list to pick up before reclaiming other pages. + * + * NB: This interface discards data rather than pushes it out to swap, + * as some implementations do. This has performance implications for + * applications like large transactional databases which want to discard + * pages in anonymous maps after committing to backing store the data + * that was kept in them. There is no reason to write this data out to + * the swap area if the application is discarding it. + * + * An interface that causes the system to free clean pages and flush + * dirty pages is already available as msync(MS_INVALIDATE). + */ +static long madvise_dontneed(struct vm_area_struct * vma, + struct vm_area_struct ** prev, + unsigned long start, unsigned long end) +{ + *prev = vma; + if (vma->vm_flags & (VM_LOCKED|VM_HUGETLB|VM_PFNMAP)) + return -EINVAL; + + if (unlikely(vma->vm_flags & VM_NONLINEAR)) { + struct zap_details details = { + .nonlinear_vma = vma, + .last_index = ULONG_MAX, + }; + zap_page_range(vma, start, end - start, &details); + } else + zap_page_range(vma, start, end - start, NULL); + return 0; +} + +/* + * Application wants to free up the pages and associated backing store. + * This is effectively punching a hole into the middle of a file. + * + * NOTE: Currently, only shmfs/tmpfs is supported for this operation. + * Other filesystems return -ENOSYS. + */ +static long madvise_remove(struct vm_area_struct *vma, + struct vm_area_struct **prev, + unsigned long start, unsigned long end) +{ + struct address_space *mapping; + loff_t offset, endoff; + int error; + struct file *f; + + *prev = NULL; /* tell sys_madvise we drop mmap_sem */ + + if (vma->vm_flags & (VM_LOCKED|VM_NONLINEAR|VM_HUGETLB)) + return -EINVAL; + + f = vma->vm_file; + + if (!f || !f->f_mapping || !f->f_mapping->host) { + return -EINVAL; + } + + if ((vma->vm_flags & (VM_SHARED|VM_WRITE)) != (VM_SHARED|VM_WRITE)) + return -EACCES; + + mapping = vma->vm_file->f_mapping; + + offset = (loff_t)(start - vma->vm_start) + + ((loff_t)vma->vm_pgoff << PAGE_SHIFT); + endoff = (loff_t)(end - vma->vm_start - 1) + + ((loff_t)vma->vm_pgoff << PAGE_SHIFT); + + /* + * vmtruncate_range may need to take i_mutex. We need to + * explicitly grab a reference because the vma (and hence the + * vma's reference to the file) can go away as soon as we drop + * mmap_sem. + */ + get_file(f); + up_read(¤t->mm->mmap_sem); + error = vmtruncate_range(mapping->host, offset, endoff); + fput(f); + down_read(¤t->mm->mmap_sem); + return error; +} + +#ifdef CONFIG_MEMORY_FAILURE +/* + * Error injection support for memory error handling. + */ +static int madvise_hwpoison(int bhv, unsigned long start, unsigned long end) +{ + int ret = 0; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + for (; start < end; start += PAGE_SIZE) { + struct page *p; + int ret = get_user_pages_fast(start, 1, 0, &p); + if (ret != 1) + return ret; + if (bhv == MADV_SOFT_OFFLINE) { + printk(KERN_INFO "Soft offlining page %lx at %lx\n", + page_to_pfn(p), start); + ret = soft_offline_page(p, MF_COUNT_INCREASED); + if (ret) + break; + continue; + } + printk(KERN_INFO "Injecting memory failure for page %lx at %lx\n", + page_to_pfn(p), start); + /* Ignore return value for now */ + memory_failure(page_to_pfn(p), 0, MF_COUNT_INCREASED); + } + return ret; +} +#endif + +static long +madvise_vma(struct vm_area_struct *vma, struct vm_area_struct **prev, + unsigned long start, unsigned long end, int behavior) +{ + switch (behavior) { + case MADV_REMOVE: + return madvise_remove(vma, prev, start, end); + case MADV_WILLNEED: + return madvise_willneed(vma, prev, start, end); + case MADV_DONTNEED: + return madvise_dontneed(vma, prev, start, end); + default: + return madvise_behavior(vma, prev, start, end, behavior); + } +} + +static int +madvise_behavior_valid(int behavior) +{ + switch (behavior) { + case MADV_DOFORK: + case MADV_DONTFORK: + case MADV_NORMAL: + case MADV_SEQUENTIAL: + case MADV_RANDOM: + case MADV_REMOVE: + case MADV_WILLNEED: + case MADV_DONTNEED: +#ifdef CONFIG_KSM + case MADV_MERGEABLE: + case MADV_UNMERGEABLE: +#endif +#ifdef CONFIG_TRANSPARENT_HUGEPAGE + case MADV_HUGEPAGE: + case MADV_NOHUGEPAGE: +#endif + case MADV_DONTDUMP: + case MADV_DODUMP: + return 1; + + default: + return 0; + } +} + +/* + * The madvise(2) system call. + * + * Applications can use madvise() to advise the kernel how it should + * handle paging I/O in this VM area. The idea is to help the kernel + * use appropriate read-ahead and caching techniques. The information + * provided is advisory only, and can be safely disregarded by the + * kernel without affecting the correct operation of the application. + * + * behavior values: + * MADV_NORMAL - the default behavior is to read clusters. This + * results in some read-ahead and read-behind. + * MADV_RANDOM - the system should read the minimum amount of data + * on any access, since it is unlikely that the appli- + * cation will need more than what it asks for. + * MADV_SEQUENTIAL - pages in the given range will probably be accessed + * once, so they can be aggressively read ahead, and + * can be freed soon after they are accessed. + * MADV_WILLNEED - the application is notifying the system to read + * some pages ahead. + * MADV_DONTNEED - the application is finished with the given range, + * so the kernel can free resources associated with it. + * MADV_REMOVE - the application wants to free up the given range of + * pages and associated backing store. + * MADV_DONTFORK - omit this area from child's address space when forking: + * typically, to avoid COWing pages pinned by get_user_pages(). + * MADV_DOFORK - cancel MADV_DONTFORK: no longer omit this area when forking. + * MADV_MERGEABLE - the application recommends that KSM try to merge pages in + * this area with pages of identical content from other such areas. + * MADV_UNMERGEABLE- cancel MADV_MERGEABLE: no longer merge pages with others. + * + * return values: + * zero - success + * -EINVAL - start + len < 0, start is not page-aligned, + * "behavior" is not a valid value, or application + * is attempting to release locked or shared pages. + * -ENOMEM - addresses in the specified range are not currently + * mapped, or are outside the AS of the process. + * -EIO - an I/O error occurred while paging in data. + * -EBADF - map exists, but area maps something that isn't a file. + * -EAGAIN - a kernel resource was temporarily unavailable. + */ +SYSCALL_DEFINE3(madvise, unsigned long, start, size_t, len_in, int, behavior) +{ + unsigned long end, tmp; + struct vm_area_struct * vma, *prev; + int unmapped_error = 0; + int error = -EINVAL; + int write; + size_t len; + +#ifdef CONFIG_MEMORY_FAILURE + if (behavior == MADV_HWPOISON || behavior == MADV_SOFT_OFFLINE) + return madvise_hwpoison(behavior, start, start+len_in); +#endif + if (!madvise_behavior_valid(behavior)) + return error; + + write = madvise_need_mmap_write(behavior); + if (write) + down_write(¤t->mm->mmap_sem); + else + down_read(¤t->mm->mmap_sem); + + if (start & ~PAGE_MASK) + goto out; + len = (len_in + ~PAGE_MASK) & PAGE_MASK; + + /* Check to see whether len was rounded up from small -ve to zero */ + if (len_in && !len) + goto out; + + end = start + len; + if (end < start) + goto out; + + error = 0; + if (end == start) + goto out; + + /* + * If the interval [start,end) covers some unmapped address + * ranges, just ignore them, but return -ENOMEM at the end. + * - different from the way of handling in mlock etc. + */ + vma = find_vma_prev(current->mm, start, &prev); + if (vma && start > vma->vm_start) + prev = vma; + + for (;;) { + /* Still start < end. */ + error = -ENOMEM; + if (!vma) + goto out; + + /* Here start < (end|vma->vm_end). */ + if (start < vma->vm_start) { + unmapped_error = -ENOMEM; + start = vma->vm_start; + if (start >= end) + goto out; + } + + /* Here vma->vm_start <= start < (end|vma->vm_end) */ + tmp = vma->vm_end; + if (end < tmp) + tmp = end; + + /* Here vma->vm_start <= start < tmp <= (end|vma->vm_end). */ + error = madvise_vma(vma, &prev, start, tmp, behavior); + if (error) + goto out; + start = tmp; + if (prev && start < prev->vm_end) + start = prev->vm_end; + error = unmapped_error; + if (start >= end) + goto out; + if (prev) + vma = prev->vm_next; + else /* madvise_remove dropped mmap_sem */ + vma = find_vma(current->mm, start); + } +out: + if (write) + up_write(¤t->mm->mmap_sem); + else + up_read(¤t->mm->mmap_sem); + + return error; +} |