Diffstat (limited to 'ANDROID_3.4.5/fs/btrfs')
 68 files changed, 0 insertions(+), 77746 deletions(-)
diff --git a/ANDROID_3.4.5/fs/btrfs/Kconfig b/ANDROID_3.4.5/fs/btrfs/Kconfig deleted file mode 100644 index d33f01c0..00000000 --- a/ANDROID_3.4.5/fs/btrfs/Kconfig +++ /dev/null @@ -1,52 +0,0 @@ -config BTRFS_FS - tristate "Btrfs filesystem (EXPERIMENTAL) Unstable disk format" - depends on EXPERIMENTAL - select LIBCRC32C - select ZLIB_INFLATE - select ZLIB_DEFLATE - select LZO_COMPRESS - select LZO_DECOMPRESS - help - Btrfs is a new filesystem with extents, writable snapshotting, - support for multiple devices and many more features. - - Btrfs is highly experimental, and THE DISK FORMAT IS NOT YET - FINALIZED. You should say N here unless you are interested in - testing Btrfs with non-critical data. - - To compile this file system support as a module, choose M here. The - module will be called btrfs. - - If unsure, say N. - -config BTRFS_FS_POSIX_ACL - bool "Btrfs POSIX Access Control Lists" - depends on BTRFS_FS - select FS_POSIX_ACL - help - POSIX Access Control Lists (ACLs) support permissions for users and - groups beyond the owner/group/world scheme. - - To learn more about Access Control Lists, visit the POSIX ACLs for - Linux website <http://acl.bestbits.at/>. - - If you don't know what Access Control Lists are, say N - -config BTRFS_FS_CHECK_INTEGRITY - bool "Btrfs with integrity check tool compiled in (DANGEROUS)" - depends on BTRFS_FS - help - Adds code that examines all block write requests (including - writes of the super block). The goal is to verify that the - state of the filesystem on disk is always consistent, i.e., - after a power-loss or kernel panic event the filesystem is - in a consistent state. - - If the integrity check tool is included and activated in - the mount options, plenty of kernel memory is used, and - plenty of additional CPU cycles are spent. Enabling this - functionality is not intended for normal use. - - In most cases, unless you are a btrfs developer who needs - to verify the integrity of (super)-block write requests - during the run of a regression test, say N diff --git a/ANDROID_3.4.5/fs/btrfs/Makefile b/ANDROID_3.4.5/fs/btrfs/Makefile deleted file mode 100644 index 0c4fa2be..00000000 --- a/ANDROID_3.4.5/fs/btrfs/Makefile +++ /dev/null @@ -1,14 +0,0 @@ - -obj-$(CONFIG_BTRFS_FS) := btrfs.o - -btrfs-y += super.o ctree.o extent-tree.o print-tree.o root-tree.o dir-item.o \ - file-item.o inode-item.o inode-map.o disk-io.o \ - transaction.o inode.o file.o tree-defrag.o \ - extent_map.o sysfs.o struct-funcs.o xattr.o ordered-data.o \ - extent_io.o volumes.o async-thread.o ioctl.o locking.o orphan.o \ - export.o tree-log.o free-space-cache.o zlib.o lzo.o \ - compression.o delayed-ref.o relocation.o delayed-inode.o scrub.o \ - reada.o backref.o ulist.o - -btrfs-$(CONFIG_BTRFS_FS_POSIX_ACL) += acl.o -btrfs-$(CONFIG_BTRFS_FS_CHECK_INTEGRITY) += check-integrity.o diff --git a/ANDROID_3.4.5/fs/btrfs/acl.c b/ANDROID_3.4.5/fs/btrfs/acl.c deleted file mode 100644 index 89b156d8..00000000 --- a/ANDROID_3.4.5/fs/btrfs/acl.c +++ /dev/null @@ -1,273 +0,0 @@ -/* - * Copyright (C) 2007 Red Hat. All rights reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public - * License v2 as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. 
- * - * You should have received a copy of the GNU General Public - * License along with this program; if not, write to the - * Free Software Foundation, Inc., 59 Temple Place - Suite 330, - * Boston, MA 021110-1307, USA. - */ - -#include <linux/fs.h> -#include <linux/string.h> -#include <linux/xattr.h> -#include <linux/posix_acl_xattr.h> -#include <linux/posix_acl.h> -#include <linux/sched.h> -#include <linux/slab.h> - -#include "ctree.h" -#include "btrfs_inode.h" -#include "xattr.h" - -struct posix_acl *btrfs_get_acl(struct inode *inode, int type) -{ - int size; - const char *name; - char *value = NULL; - struct posix_acl *acl; - - if (!IS_POSIXACL(inode)) - return NULL; - - acl = get_cached_acl(inode, type); - if (acl != ACL_NOT_CACHED) - return acl; - - switch (type) { - case ACL_TYPE_ACCESS: - name = POSIX_ACL_XATTR_ACCESS; - break; - case ACL_TYPE_DEFAULT: - name = POSIX_ACL_XATTR_DEFAULT; - break; - default: - BUG(); - } - - size = __btrfs_getxattr(inode, name, "", 0); - if (size > 0) { - value = kzalloc(size, GFP_NOFS); - if (!value) - return ERR_PTR(-ENOMEM); - size = __btrfs_getxattr(inode, name, value, size); - } - if (size > 0) { - acl = posix_acl_from_xattr(value, size); - } else if (size == -ENOENT || size == -ENODATA || size == 0) { - /* FIXME, who returns -ENOENT? I think nobody */ - acl = NULL; - } else { - acl = ERR_PTR(-EIO); - } - kfree(value); - - if (!IS_ERR(acl)) - set_cached_acl(inode, type, acl); - - return acl; -} - -static int btrfs_xattr_acl_get(struct dentry *dentry, const char *name, - void *value, size_t size, int type) -{ - struct posix_acl *acl; - int ret = 0; - - if (!IS_POSIXACL(dentry->d_inode)) - return -EOPNOTSUPP; - - acl = btrfs_get_acl(dentry->d_inode, type); - - if (IS_ERR(acl)) - return PTR_ERR(acl); - if (acl == NULL) - return -ENODATA; - ret = posix_acl_to_xattr(acl, value, size); - posix_acl_release(acl); - - return ret; -} - -/* - * Needs to be called with fs_mutex held - */ -static int btrfs_set_acl(struct btrfs_trans_handle *trans, - struct inode *inode, struct posix_acl *acl, int type) -{ - int ret, size = 0; - const char *name; - char *value = NULL; - - if (acl) { - ret = posix_acl_valid(acl); - if (ret < 0) - return ret; - ret = 0; - } - - switch (type) { - case ACL_TYPE_ACCESS: - name = POSIX_ACL_XATTR_ACCESS; - if (acl) { - ret = posix_acl_equiv_mode(acl, &inode->i_mode); - if (ret < 0) - return ret; - } - ret = 0; - break; - case ACL_TYPE_DEFAULT: - if (!S_ISDIR(inode->i_mode)) - return acl ? 
-EINVAL : 0; - name = POSIX_ACL_XATTR_DEFAULT; - break; - default: - return -EINVAL; - } - - if (acl) { - size = posix_acl_xattr_size(acl->a_count); - value = kmalloc(size, GFP_NOFS); - if (!value) { - ret = -ENOMEM; - goto out; - } - - ret = posix_acl_to_xattr(acl, value, size); - if (ret < 0) - goto out; - } - - ret = __btrfs_setxattr(trans, inode, name, value, size, 0); -out: - kfree(value); - - if (!ret) - set_cached_acl(inode, type, acl); - - return ret; -} - -static int btrfs_xattr_acl_set(struct dentry *dentry, const char *name, - const void *value, size_t size, int flags, int type) -{ - int ret; - struct posix_acl *acl = NULL; - - if (!inode_owner_or_capable(dentry->d_inode)) - return -EPERM; - - if (!IS_POSIXACL(dentry->d_inode)) - return -EOPNOTSUPP; - - if (value) { - acl = posix_acl_from_xattr(value, size); - if (IS_ERR(acl)) - return PTR_ERR(acl); - - if (acl) { - ret = posix_acl_valid(acl); - if (ret) - goto out; - } - } - - ret = btrfs_set_acl(NULL, dentry->d_inode, acl, type); -out: - posix_acl_release(acl); - - return ret; -} - -/* - * btrfs_init_acl is already generally called under fs_mutex, so the locking - * stuff has been fixed to work with that. If the locking stuff changes, we - * need to re-evaluate the acl locking stuff. - */ -int btrfs_init_acl(struct btrfs_trans_handle *trans, - struct inode *inode, struct inode *dir) -{ - struct posix_acl *acl = NULL; - int ret = 0; - - /* this happens with subvols */ - if (!dir) - return 0; - - if (!S_ISLNK(inode->i_mode)) { - if (IS_POSIXACL(dir)) { - acl = btrfs_get_acl(dir, ACL_TYPE_DEFAULT); - if (IS_ERR(acl)) - return PTR_ERR(acl); - } - - if (!acl) - inode->i_mode &= ~current_umask(); - } - - if (IS_POSIXACL(dir) && acl) { - if (S_ISDIR(inode->i_mode)) { - ret = btrfs_set_acl(trans, inode, acl, - ACL_TYPE_DEFAULT); - if (ret) - goto failed; - } - ret = posix_acl_create(&acl, GFP_NOFS, &inode->i_mode); - if (ret < 0) - return ret; - - if (ret > 0) { - /* we need an acl */ - ret = btrfs_set_acl(trans, inode, acl, ACL_TYPE_ACCESS); - } - } -failed: - posix_acl_release(acl); - - return ret; -} - -int btrfs_acl_chmod(struct inode *inode) -{ - struct posix_acl *acl; - int ret = 0; - - if (S_ISLNK(inode->i_mode)) - return -EOPNOTSUPP; - - if (!IS_POSIXACL(inode)) - return 0; - - acl = btrfs_get_acl(inode, ACL_TYPE_ACCESS); - if (IS_ERR_OR_NULL(acl)) - return PTR_ERR(acl); - - ret = posix_acl_chmod(&acl, GFP_KERNEL, inode->i_mode); - if (ret) - return ret; - ret = btrfs_set_acl(NULL, inode, acl, ACL_TYPE_ACCESS); - posix_acl_release(acl); - return ret; -} - -const struct xattr_handler btrfs_xattr_acl_default_handler = { - .prefix = POSIX_ACL_XATTR_DEFAULT, - .flags = ACL_TYPE_DEFAULT, - .get = btrfs_xattr_acl_get, - .set = btrfs_xattr_acl_set, -}; - -const struct xattr_handler btrfs_xattr_acl_access_handler = { - .prefix = POSIX_ACL_XATTR_ACCESS, - .flags = ACL_TYPE_ACCESS, - .get = btrfs_xattr_acl_get, - .set = btrfs_xattr_acl_set, -}; diff --git a/ANDROID_3.4.5/fs/btrfs/async-thread.c b/ANDROID_3.4.5/fs/btrfs/async-thread.c deleted file mode 100644 index 42704149..00000000 --- a/ANDROID_3.4.5/fs/btrfs/async-thread.c +++ /dev/null @@ -1,707 +0,0 @@ -/* - * Copyright (C) 2007 Oracle. All rights reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public - * License v2 as published by the Free Software Foundation. 
- * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public - * License along with this program; if not, write to the - * Free Software Foundation, Inc., 59 Temple Place - Suite 330, - * Boston, MA 021110-1307, USA. - */ - -#include <linux/kthread.h> -#include <linux/slab.h> -#include <linux/list.h> -#include <linux/spinlock.h> -#include <linux/freezer.h> -#include "async-thread.h" - -#define WORK_QUEUED_BIT 0 -#define WORK_DONE_BIT 1 -#define WORK_ORDER_DONE_BIT 2 -#define WORK_HIGH_PRIO_BIT 3 - -/* - * container for the kthread task pointer and the list of pending work - * One of these is allocated per thread. - */ -struct btrfs_worker_thread { - /* pool we belong to */ - struct btrfs_workers *workers; - - /* list of struct btrfs_work that are waiting for service */ - struct list_head pending; - struct list_head prio_pending; - - /* list of worker threads from struct btrfs_workers */ - struct list_head worker_list; - - /* kthread */ - struct task_struct *task; - - /* number of things on the pending list */ - atomic_t num_pending; - - /* reference counter for this struct */ - atomic_t refs; - - unsigned long sequence; - - /* protects the pending list. */ - spinlock_t lock; - - /* set to non-zero when this thread is already awake and kicking */ - int working; - - /* are we currently idle */ - int idle; -}; - -static int __btrfs_start_workers(struct btrfs_workers *workers); - -/* - * btrfs_start_workers uses kthread_run, which can block waiting for memory - * for a very long time. It will actually throttle on page writeback, - * and so it may not make progress until after our btrfs worker threads - * process all of the pending work structs in their queue - * - * This means we can't use btrfs_start_workers from inside a btrfs worker - * thread that is used as part of cleaning dirty memory, which pretty much - * involves all of the worker threads. - * - * Instead we have a helper queue who never has more than one thread - * where we scheduler thread start operations. This worker_start struct - * is used to contain the work and hold a pointer to the queue that needs - * another worker. - */ -struct worker_start { - struct btrfs_work work; - struct btrfs_workers *queue; -}; - -static void start_new_worker_func(struct btrfs_work *work) -{ - struct worker_start *start; - start = container_of(work, struct worker_start, work); - __btrfs_start_workers(start->queue); - kfree(start); -} - -/* - * helper function to move a thread onto the idle list after it - * has finished some requests. - */ -static void check_idle_worker(struct btrfs_worker_thread *worker) -{ - if (!worker->idle && atomic_read(&worker->num_pending) < - worker->workers->idle_thresh / 2) { - unsigned long flags; - spin_lock_irqsave(&worker->workers->lock, flags); - worker->idle = 1; - - /* the list may be empty if the worker is just starting */ - if (!list_empty(&worker->worker_list)) { - list_move(&worker->worker_list, - &worker->workers->idle_list); - } - spin_unlock_irqrestore(&worker->workers->lock, flags); - } -} - -/* - * helper function to move a thread off the idle list after new - * pending work is added. 
- */ -static void check_busy_worker(struct btrfs_worker_thread *worker) -{ - if (worker->idle && atomic_read(&worker->num_pending) >= - worker->workers->idle_thresh) { - unsigned long flags; - spin_lock_irqsave(&worker->workers->lock, flags); - worker->idle = 0; - - if (!list_empty(&worker->worker_list)) { - list_move_tail(&worker->worker_list, - &worker->workers->worker_list); - } - spin_unlock_irqrestore(&worker->workers->lock, flags); - } -} - -static void check_pending_worker_creates(struct btrfs_worker_thread *worker) -{ - struct btrfs_workers *workers = worker->workers; - struct worker_start *start; - unsigned long flags; - - rmb(); - if (!workers->atomic_start_pending) - return; - - start = kzalloc(sizeof(*start), GFP_NOFS); - if (!start) - return; - - start->work.func = start_new_worker_func; - start->queue = workers; - - spin_lock_irqsave(&workers->lock, flags); - if (!workers->atomic_start_pending) - goto out; - - workers->atomic_start_pending = 0; - if (workers->num_workers + workers->num_workers_starting >= - workers->max_workers) - goto out; - - workers->num_workers_starting += 1; - spin_unlock_irqrestore(&workers->lock, flags); - btrfs_queue_worker(workers->atomic_worker_start, &start->work); - return; - -out: - kfree(start); - spin_unlock_irqrestore(&workers->lock, flags); -} - -static noinline void run_ordered_completions(struct btrfs_workers *workers, - struct btrfs_work *work) -{ - if (!workers->ordered) - return; - - set_bit(WORK_DONE_BIT, &work->flags); - - spin_lock(&workers->order_lock); - - while (1) { - if (!list_empty(&workers->prio_order_list)) { - work = list_entry(workers->prio_order_list.next, - struct btrfs_work, order_list); - } else if (!list_empty(&workers->order_list)) { - work = list_entry(workers->order_list.next, - struct btrfs_work, order_list); - } else { - break; - } - if (!test_bit(WORK_DONE_BIT, &work->flags)) - break; - - /* we are going to call the ordered done function, but - * we leave the work item on the list as a barrier so - * that later work items that are done don't have their - * functions called before this one returns - */ - if (test_and_set_bit(WORK_ORDER_DONE_BIT, &work->flags)) - break; - - spin_unlock(&workers->order_lock); - - work->ordered_func(work); - - /* now take the lock again and call the freeing code */ - spin_lock(&workers->order_lock); - list_del(&work->order_list); - work->ordered_free(work); - } - - spin_unlock(&workers->order_lock); -} - -static void put_worker(struct btrfs_worker_thread *worker) -{ - if (atomic_dec_and_test(&worker->refs)) - kfree(worker); -} - -static int try_worker_shutdown(struct btrfs_worker_thread *worker) -{ - int freeit = 0; - - spin_lock_irq(&worker->lock); - spin_lock(&worker->workers->lock); - if (worker->workers->num_workers > 1 && - worker->idle && - !worker->working && - !list_empty(&worker->worker_list) && - list_empty(&worker->prio_pending) && - list_empty(&worker->pending) && - atomic_read(&worker->num_pending) == 0) { - freeit = 1; - list_del_init(&worker->worker_list); - worker->workers->num_workers--; - } - spin_unlock(&worker->workers->lock); - spin_unlock_irq(&worker->lock); - - if (freeit) - put_worker(worker); - return freeit; -} - -static struct btrfs_work *get_next_work(struct btrfs_worker_thread *worker, - struct list_head *prio_head, - struct list_head *head) -{ - struct btrfs_work *work = NULL; - struct list_head *cur = NULL; - - if(!list_empty(prio_head)) - cur = prio_head->next; - - smp_mb(); - if (!list_empty(&worker->prio_pending)) - goto refill; - - if 
(!list_empty(head)) - cur = head->next; - - if (cur) - goto out; - -refill: - spin_lock_irq(&worker->lock); - list_splice_tail_init(&worker->prio_pending, prio_head); - list_splice_tail_init(&worker->pending, head); - - if (!list_empty(prio_head)) - cur = prio_head->next; - else if (!list_empty(head)) - cur = head->next; - spin_unlock_irq(&worker->lock); - - if (!cur) - goto out_fail; - -out: - work = list_entry(cur, struct btrfs_work, list); - -out_fail: - return work; -} - -/* - * main loop for servicing work items - */ -static int worker_loop(void *arg) -{ - struct btrfs_worker_thread *worker = arg; - struct list_head head; - struct list_head prio_head; - struct btrfs_work *work; - - INIT_LIST_HEAD(&head); - INIT_LIST_HEAD(&prio_head); - - do { -again: - while (1) { - - - work = get_next_work(worker, &prio_head, &head); - if (!work) - break; - - list_del(&work->list); - clear_bit(WORK_QUEUED_BIT, &work->flags); - - work->worker = worker; - - work->func(work); - - atomic_dec(&worker->num_pending); - /* - * unless this is an ordered work queue, - * 'work' was probably freed by func above. - */ - run_ordered_completions(worker->workers, work); - - check_pending_worker_creates(worker); - cond_resched(); - } - - spin_lock_irq(&worker->lock); - check_idle_worker(worker); - - if (freezing(current)) { - worker->working = 0; - spin_unlock_irq(&worker->lock); - try_to_freeze(); - } else { - spin_unlock_irq(&worker->lock); - if (!kthread_should_stop()) { - cpu_relax(); - /* - * we've dropped the lock, did someone else - * jump_in? - */ - smp_mb(); - if (!list_empty(&worker->pending) || - !list_empty(&worker->prio_pending)) - continue; - - /* - * this short schedule allows more work to - * come in without the queue functions - * needing to go through wake_up_process() - * - * worker->working is still 1, so nobody - * is going to try and wake us up - */ - schedule_timeout(1); - smp_mb(); - if (!list_empty(&worker->pending) || - !list_empty(&worker->prio_pending)) - continue; - - if (kthread_should_stop()) - break; - - /* still no more work?, sleep for real */ - spin_lock_irq(&worker->lock); - set_current_state(TASK_INTERRUPTIBLE); - if (!list_empty(&worker->pending) || - !list_empty(&worker->prio_pending)) { - spin_unlock_irq(&worker->lock); - set_current_state(TASK_RUNNING); - goto again; - } - - /* - * this makes sure we get a wakeup when someone - * adds something new to the queue - */ - worker->working = 0; - spin_unlock_irq(&worker->lock); - - if (!kthread_should_stop()) { - schedule_timeout(HZ * 120); - if (!worker->working && - try_worker_shutdown(worker)) { - return 0; - } - } - } - __set_current_state(TASK_RUNNING); - } - } while (!kthread_should_stop()); - return 0; -} - -/* - * this will wait for all the worker threads to shutdown - */ -void btrfs_stop_workers(struct btrfs_workers *workers) -{ - struct list_head *cur; - struct btrfs_worker_thread *worker; - int can_stop; - - spin_lock_irq(&workers->lock); - list_splice_init(&workers->idle_list, &workers->worker_list); - while (!list_empty(&workers->worker_list)) { - cur = workers->worker_list.next; - worker = list_entry(cur, struct btrfs_worker_thread, - worker_list); - - atomic_inc(&worker->refs); - workers->num_workers -= 1; - if (!list_empty(&worker->worker_list)) { - list_del_init(&worker->worker_list); - put_worker(worker); - can_stop = 1; - } else - can_stop = 0; - spin_unlock_irq(&workers->lock); - if (can_stop) - kthread_stop(worker->task); - spin_lock_irq(&workers->lock); - put_worker(worker); - } - 
spin_unlock_irq(&workers->lock); -} - -/* - * simple init on struct btrfs_workers - */ -void btrfs_init_workers(struct btrfs_workers *workers, char *name, int max, - struct btrfs_workers *async_helper) -{ - workers->num_workers = 0; - workers->num_workers_starting = 0; - INIT_LIST_HEAD(&workers->worker_list); - INIT_LIST_HEAD(&workers->idle_list); - INIT_LIST_HEAD(&workers->order_list); - INIT_LIST_HEAD(&workers->prio_order_list); - spin_lock_init(&workers->lock); - spin_lock_init(&workers->order_lock); - workers->max_workers = max; - workers->idle_thresh = 32; - workers->name = name; - workers->ordered = 0; - workers->atomic_start_pending = 0; - workers->atomic_worker_start = async_helper; -} - -/* - * starts new worker threads. This does not enforce the max worker - * count in case you need to temporarily go past it. - */ -static int __btrfs_start_workers(struct btrfs_workers *workers) -{ - struct btrfs_worker_thread *worker; - int ret = 0; - - worker = kzalloc(sizeof(*worker), GFP_NOFS); - if (!worker) { - ret = -ENOMEM; - goto fail; - } - - INIT_LIST_HEAD(&worker->pending); - INIT_LIST_HEAD(&worker->prio_pending); - INIT_LIST_HEAD(&worker->worker_list); - spin_lock_init(&worker->lock); - - atomic_set(&worker->num_pending, 0); - atomic_set(&worker->refs, 1); - worker->workers = workers; - worker->task = kthread_run(worker_loop, worker, - "btrfs-%s-%d", workers->name, - workers->num_workers + 1); - if (IS_ERR(worker->task)) { - ret = PTR_ERR(worker->task); - kfree(worker); - goto fail; - } - spin_lock_irq(&workers->lock); - list_add_tail(&worker->worker_list, &workers->idle_list); - worker->idle = 1; - workers->num_workers++; - workers->num_workers_starting--; - WARN_ON(workers->num_workers_starting < 0); - spin_unlock_irq(&workers->lock); - - return 0; -fail: - spin_lock_irq(&workers->lock); - workers->num_workers_starting--; - spin_unlock_irq(&workers->lock); - return ret; -} - -int btrfs_start_workers(struct btrfs_workers *workers) -{ - spin_lock_irq(&workers->lock); - workers->num_workers_starting++; - spin_unlock_irq(&workers->lock); - return __btrfs_start_workers(workers); -} - -/* - * run through the list and find a worker thread that doesn't have a lot - * to do right now. This can return null if we aren't yet at the thread - * count limit and all of the threads are busy. - */ -static struct btrfs_worker_thread *next_worker(struct btrfs_workers *workers) -{ - struct btrfs_worker_thread *worker; - struct list_head *next; - int enforce_min; - - enforce_min = (workers->num_workers + workers->num_workers_starting) < - workers->max_workers; - - /* - * if we find an idle thread, don't move it to the end of the - * idle list. This improves the chance that the next submission - * will reuse the same thread, and maybe catch it while it is still - * working - */ - if (!list_empty(&workers->idle_list)) { - next = workers->idle_list.next; - worker = list_entry(next, struct btrfs_worker_thread, - worker_list); - return worker; - } - if (enforce_min || list_empty(&workers->worker_list)) - return NULL; - - /* - * if we pick a busy task, move the task to the end of the list. - * hopefully this will keep things somewhat evenly balanced. - * Do the move in batches based on the sequence number. This groups - * requests submitted at roughly the same time onto the same worker. 
- */ - next = workers->worker_list.next; - worker = list_entry(next, struct btrfs_worker_thread, worker_list); - worker->sequence++; - - if (worker->sequence % workers->idle_thresh == 0) - list_move_tail(next, &workers->worker_list); - return worker; -} - -/* - * selects a worker thread to take the next job. This will either find - * an idle worker, start a new worker up to the max count, or just return - * one of the existing busy workers. - */ -static struct btrfs_worker_thread *find_worker(struct btrfs_workers *workers) -{ - struct btrfs_worker_thread *worker; - unsigned long flags; - struct list_head *fallback; - int ret; - - spin_lock_irqsave(&workers->lock, flags); -again: - worker = next_worker(workers); - - if (!worker) { - if (workers->num_workers + workers->num_workers_starting >= - workers->max_workers) { - goto fallback; - } else if (workers->atomic_worker_start) { - workers->atomic_start_pending = 1; - goto fallback; - } else { - workers->num_workers_starting++; - spin_unlock_irqrestore(&workers->lock, flags); - /* we're below the limit, start another worker */ - ret = __btrfs_start_workers(workers); - spin_lock_irqsave(&workers->lock, flags); - if (ret) - goto fallback; - goto again; - } - } - goto found; - -fallback: - fallback = NULL; - /* - * we have failed to find any workers, just - * return the first one we can find. - */ - if (!list_empty(&workers->worker_list)) - fallback = workers->worker_list.next; - if (!list_empty(&workers->idle_list)) - fallback = workers->idle_list.next; - BUG_ON(!fallback); - worker = list_entry(fallback, - struct btrfs_worker_thread, worker_list); -found: - /* - * this makes sure the worker doesn't exit before it is placed - * onto a busy/idle list - */ - atomic_inc(&worker->num_pending); - spin_unlock_irqrestore(&workers->lock, flags); - return worker; -} - -/* - * btrfs_requeue_work just puts the work item back on the tail of the list - * it was taken from. It is intended for use with long running work functions - * that make some progress and want to give the cpu up for others. 
- */ -void btrfs_requeue_work(struct btrfs_work *work) -{ - struct btrfs_worker_thread *worker = work->worker; - unsigned long flags; - int wake = 0; - - if (test_and_set_bit(WORK_QUEUED_BIT, &work->flags)) - return; - - spin_lock_irqsave(&worker->lock, flags); - if (test_bit(WORK_HIGH_PRIO_BIT, &work->flags)) - list_add_tail(&work->list, &worker->prio_pending); - else - list_add_tail(&work->list, &worker->pending); - atomic_inc(&worker->num_pending); - - /* by definition we're busy, take ourselves off the idle - * list - */ - if (worker->idle) { - spin_lock(&worker->workers->lock); - worker->idle = 0; - list_move_tail(&worker->worker_list, - &worker->workers->worker_list); - spin_unlock(&worker->workers->lock); - } - if (!worker->working) { - wake = 1; - worker->working = 1; - } - - if (wake) - wake_up_process(worker->task); - spin_unlock_irqrestore(&worker->lock, flags); -} - -void btrfs_set_work_high_prio(struct btrfs_work *work) -{ - set_bit(WORK_HIGH_PRIO_BIT, &work->flags); -} - -/* - * places a struct btrfs_work into the pending queue of one of the kthreads - */ -void btrfs_queue_worker(struct btrfs_workers *workers, struct btrfs_work *work) -{ - struct btrfs_worker_thread *worker; - unsigned long flags; - int wake = 0; - - /* don't requeue something already on a list */ - if (test_and_set_bit(WORK_QUEUED_BIT, &work->flags)) - return; - - worker = find_worker(workers); - if (workers->ordered) { - /* - * you're not allowed to do ordered queues from an - * interrupt handler - */ - spin_lock(&workers->order_lock); - if (test_bit(WORK_HIGH_PRIO_BIT, &work->flags)) { - list_add_tail(&work->order_list, - &workers->prio_order_list); - } else { - list_add_tail(&work->order_list, &workers->order_list); - } - spin_unlock(&workers->order_lock); - } else { - INIT_LIST_HEAD(&work->order_list); - } - - spin_lock_irqsave(&worker->lock, flags); - - if (test_bit(WORK_HIGH_PRIO_BIT, &work->flags)) - list_add_tail(&work->list, &worker->prio_pending); - else - list_add_tail(&work->list, &worker->pending); - check_busy_worker(worker); - - /* - * avoid calling into wake_up_process if this thread has already - * been kicked - */ - if (!worker->working) - wake = 1; - worker->working = 1; - - if (wake) - wake_up_process(worker->task); - spin_unlock_irqrestore(&worker->lock, flags); -} diff --git a/ANDROID_3.4.5/fs/btrfs/async-thread.h b/ANDROID_3.4.5/fs/btrfs/async-thread.h deleted file mode 100644 index 063698b9..00000000 --- a/ANDROID_3.4.5/fs/btrfs/async-thread.h +++ /dev/null @@ -1,119 +0,0 @@ -/* - * Copyright (C) 2007 Oracle. All rights reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public - * License v2 as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public - * License along with this program; if not, write to the - * Free Software Foundation, Inc., 59 Temple Place - Suite 330, - * Boston, MA 021110-1307, USA. - */ - -#ifndef __BTRFS_ASYNC_THREAD_ -#define __BTRFS_ASYNC_THREAD_ - -struct btrfs_worker_thread; - -/* - * This is similar to a workqueue, but it is meant to spread the operations - * across all available cpus instead of just the CPU that was used to - * queue the work. 
There is also some batching introduced to try and - * cut down on context switches. - * - * By default threads are added on demand up to 2 * the number of cpus. - * Changing struct btrfs_workers->max_workers is one way to prevent - * demand creation of kthreads. - * - * the basic model of these worker threads is to embed a btrfs_work - * structure in your own data struct, and use container_of in a - * work function to get back to your data struct. - */ -struct btrfs_work { - /* - * func should be set to the function you want called - * your work struct is passed as the only arg - * - * ordered_func must be set for work sent to an ordered work queue, - * and it is called to complete a given work item in the same - * order they were sent to the queue. - */ - void (*func)(struct btrfs_work *work); - void (*ordered_func)(struct btrfs_work *work); - void (*ordered_free)(struct btrfs_work *work); - - /* - * flags should be set to zero. It is used to make sure the - * struct is only inserted once into the list. - */ - unsigned long flags; - - /* don't touch these */ - struct btrfs_worker_thread *worker; - struct list_head list; - struct list_head order_list; -}; - -struct btrfs_workers { - /* current number of running workers */ - int num_workers; - - int num_workers_starting; - - /* max number of workers allowed. changed by btrfs_start_workers */ - int max_workers; - - /* once a worker has this many requests or fewer, it is idle */ - int idle_thresh; - - /* force completions in the order they were queued */ - int ordered; - - /* more workers required, but in an interrupt handler */ - int atomic_start_pending; - - /* - * are we allowed to sleep while starting workers or are we required - * to start them at a later time? If we can't sleep, this indicates - * which queue we need to use to schedule thread creation. - */ - struct btrfs_workers *atomic_worker_start; - - /* list with all the work threads. The workers on the idle thread - * may be actively servicing jobs, but they haven't yet hit the - * idle thresh limit above. - */ - struct list_head worker_list; - struct list_head idle_list; - - /* - * when operating in ordered mode, this maintains the list - * of work items waiting for completion - */ - struct list_head order_list; - struct list_head prio_order_list; - - /* lock for finding the next worker thread to queue on */ - spinlock_t lock; - - /* lock for the ordered lists */ - spinlock_t order_lock; - - /* extra name for this worker, used for current->name */ - char *name; -}; - -void btrfs_queue_worker(struct btrfs_workers *workers, struct btrfs_work *work); -int btrfs_start_workers(struct btrfs_workers *workers); -void btrfs_stop_workers(struct btrfs_workers *workers); -void btrfs_init_workers(struct btrfs_workers *workers, char *name, int max, - struct btrfs_workers *async_starter); -void btrfs_requeue_work(struct btrfs_work *work); -void btrfs_set_work_high_prio(struct btrfs_work *work); -#endif diff --git a/ANDROID_3.4.5/fs/btrfs/backref.c b/ANDROID_3.4.5/fs/btrfs/backref.c deleted file mode 100644 index bcec0675..00000000 --- a/ANDROID_3.4.5/fs/btrfs/backref.c +++ /dev/null @@ -1,1432 +0,0 @@ -/* - * Copyright (C) 2011 STRATO. All rights reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public - * License v2 as published by the Free Software Foundation. 
- * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public - * License along with this program; if not, write to the - * Free Software Foundation, Inc., 59 Temple Place - Suite 330, - * Boston, MA 021110-1307, USA. - */ - -#include "ctree.h" -#include "disk-io.h" -#include "backref.h" -#include "ulist.h" -#include "transaction.h" -#include "delayed-ref.h" -#include "locking.h" - -/* - * this structure records all encountered refs on the way up to the root - */ -struct __prelim_ref { - struct list_head list; - u64 root_id; - struct btrfs_key key; - int level; - int count; - u64 parent; - u64 wanted_disk_byte; -}; - -static int __add_prelim_ref(struct list_head *head, u64 root_id, - struct btrfs_key *key, int level, u64 parent, - u64 wanted_disk_byte, int count) -{ - struct __prelim_ref *ref; - - /* in case we're adding delayed refs, we're holding the refs spinlock */ - ref = kmalloc(sizeof(*ref), GFP_ATOMIC); - if (!ref) - return -ENOMEM; - - ref->root_id = root_id; - if (key) - ref->key = *key; - else - memset(&ref->key, 0, sizeof(ref->key)); - - ref->level = level; - ref->count = count; - ref->parent = parent; - ref->wanted_disk_byte = wanted_disk_byte; - list_add_tail(&ref->list, head); - - return 0; -} - -static int add_all_parents(struct btrfs_root *root, struct btrfs_path *path, - struct ulist *parents, - struct extent_buffer *eb, int level, - u64 wanted_objectid, u64 wanted_disk_byte) -{ - int ret; - int slot; - struct btrfs_file_extent_item *fi; - struct btrfs_key key; - u64 disk_byte; - -add_parent: - ret = ulist_add(parents, eb->start, 0, GFP_NOFS); - if (ret < 0) - return ret; - - if (level != 0) - return 0; - - /* - * if the current leaf is full with EXTENT_DATA items, we must - * check the next one if that holds a reference as well. - * ref->count cannot be used to skip this check. - * repeat this until we don't find any additional EXTENT_DATA items. 
- */ - while (1) { - ret = btrfs_next_leaf(root, path); - if (ret < 0) - return ret; - if (ret) - return 0; - - eb = path->nodes[0]; - for (slot = 0; slot < btrfs_header_nritems(eb); ++slot) { - btrfs_item_key_to_cpu(eb, &key, slot); - if (key.objectid != wanted_objectid || - key.type != BTRFS_EXTENT_DATA_KEY) - return 0; - fi = btrfs_item_ptr(eb, slot, - struct btrfs_file_extent_item); - disk_byte = btrfs_file_extent_disk_bytenr(eb, fi); - if (disk_byte == wanted_disk_byte) - goto add_parent; - } - } - - return 0; -} - -/* - * resolve an indirect backref in the form (root_id, key, level) - * to a logical address - */ -static int __resolve_indirect_ref(struct btrfs_fs_info *fs_info, - int search_commit_root, - struct __prelim_ref *ref, - struct ulist *parents) -{ - struct btrfs_path *path; - struct btrfs_root *root; - struct btrfs_key root_key; - struct btrfs_key key = {0}; - struct extent_buffer *eb; - int ret = 0; - int root_level; - int level = ref->level; - - path = btrfs_alloc_path(); - if (!path) - return -ENOMEM; - path->search_commit_root = !!search_commit_root; - - root_key.objectid = ref->root_id; - root_key.type = BTRFS_ROOT_ITEM_KEY; - root_key.offset = (u64)-1; - root = btrfs_read_fs_root_no_name(fs_info, &root_key); - if (IS_ERR(root)) { - ret = PTR_ERR(root); - goto out; - } - - rcu_read_lock(); - root_level = btrfs_header_level(root->node); - rcu_read_unlock(); - - if (root_level + 1 == level) - goto out; - - path->lowest_level = level; - ret = btrfs_search_slot(NULL, root, &ref->key, path, 0, 0); - pr_debug("search slot in root %llu (level %d, ref count %d) returned " - "%d for key (%llu %u %llu)\n", - (unsigned long long)ref->root_id, level, ref->count, ret, - (unsigned long long)ref->key.objectid, ref->key.type, - (unsigned long long)ref->key.offset); - if (ret < 0) - goto out; - - eb = path->nodes[level]; - if (!eb) { - WARN_ON(1); - ret = 1; - goto out; - } - - if (level == 0) { - if (ret == 1 && path->slots[0] >= btrfs_header_nritems(eb)) { - ret = btrfs_next_leaf(root, path); - if (ret) - goto out; - eb = path->nodes[0]; - } - - btrfs_item_key_to_cpu(eb, &key, path->slots[0]); - } - - /* the last two parameters will only be used for level == 0 */ - ret = add_all_parents(root, path, parents, eb, level, key.objectid, - ref->wanted_disk_byte); -out: - btrfs_free_path(path); - return ret; -} - -/* - * resolve all indirect backrefs from the list - */ -static int __resolve_indirect_refs(struct btrfs_fs_info *fs_info, - int search_commit_root, - struct list_head *head) -{ - int err; - int ret = 0; - struct __prelim_ref *ref; - struct __prelim_ref *ref_safe; - struct __prelim_ref *new_ref; - struct ulist *parents; - struct ulist_node *node; - - parents = ulist_alloc(GFP_NOFS); - if (!parents) - return -ENOMEM; - - /* - * _safe allows us to insert directly after the current item without - * iterating over the newly inserted items. - * we're also allowed to re-assign ref during iteration. - */ - list_for_each_entry_safe(ref, ref_safe, head, list) { - if (ref->parent) /* already direct */ - continue; - if (ref->count == 0) - continue; - err = __resolve_indirect_ref(fs_info, search_commit_root, - ref, parents); - if (err) { - if (ret == 0) - ret = err; - continue; - } - - /* we put the first parent into the ref at hand */ - node = ulist_next(parents, NULL); - ref->parent = node ? 
node->val : 0; - - /* additional parents require new refs being added here */ - while ((node = ulist_next(parents, node))) { - new_ref = kmalloc(sizeof(*new_ref), GFP_NOFS); - if (!new_ref) { - ret = -ENOMEM; - break; - } - memcpy(new_ref, ref, sizeof(*ref)); - new_ref->parent = node->val; - list_add(&new_ref->list, &ref->list); - } - ulist_reinit(parents); - } - - ulist_free(parents); - return ret; -} - -/* - * merge two lists of backrefs and adjust counts accordingly - * - * mode = 1: merge identical keys, if key is set - * mode = 2: merge identical parents - */ -static int __merge_refs(struct list_head *head, int mode) -{ - struct list_head *pos1; - - list_for_each(pos1, head) { - struct list_head *n2; - struct list_head *pos2; - struct __prelim_ref *ref1; - - ref1 = list_entry(pos1, struct __prelim_ref, list); - - if (mode == 1 && ref1->key.type == 0) - continue; - for (pos2 = pos1->next, n2 = pos2->next; pos2 != head; - pos2 = n2, n2 = pos2->next) { - struct __prelim_ref *ref2; - - ref2 = list_entry(pos2, struct __prelim_ref, list); - - if (mode == 1) { - if (memcmp(&ref1->key, &ref2->key, - sizeof(ref1->key)) || - ref1->level != ref2->level || - ref1->root_id != ref2->root_id) - continue; - ref1->count += ref2->count; - } else { - if (ref1->parent != ref2->parent) - continue; - ref1->count += ref2->count; - } - list_del(&ref2->list); - kfree(ref2); - } - - } - return 0; -} - -/* - * add all currently queued delayed refs from this head whose seq nr is - * smaller or equal that seq to the list - */ -static int __add_delayed_refs(struct btrfs_delayed_ref_head *head, u64 seq, - struct btrfs_key *info_key, - struct list_head *prefs) -{ - struct btrfs_delayed_extent_op *extent_op = head->extent_op; - struct rb_node *n = &head->node.rb_node; - int sgn; - int ret = 0; - - if (extent_op && extent_op->update_key) - btrfs_disk_key_to_cpu(info_key, &extent_op->key); - - while ((n = rb_prev(n))) { - struct btrfs_delayed_ref_node *node; - node = rb_entry(n, struct btrfs_delayed_ref_node, - rb_node); - if (node->bytenr != head->node.bytenr) - break; - WARN_ON(node->is_head); - - if (node->seq > seq) - continue; - - switch (node->action) { - case BTRFS_ADD_DELAYED_EXTENT: - case BTRFS_UPDATE_DELAYED_HEAD: - WARN_ON(1); - continue; - case BTRFS_ADD_DELAYED_REF: - sgn = 1; - break; - case BTRFS_DROP_DELAYED_REF: - sgn = -1; - break; - default: - BUG_ON(1); - } - switch (node->type) { - case BTRFS_TREE_BLOCK_REF_KEY: { - struct btrfs_delayed_tree_ref *ref; - - ref = btrfs_delayed_node_to_tree_ref(node); - ret = __add_prelim_ref(prefs, ref->root, info_key, - ref->level + 1, 0, node->bytenr, - node->ref_mod * sgn); - break; - } - case BTRFS_SHARED_BLOCK_REF_KEY: { - struct btrfs_delayed_tree_ref *ref; - - ref = btrfs_delayed_node_to_tree_ref(node); - ret = __add_prelim_ref(prefs, ref->root, info_key, - ref->level + 1, ref->parent, - node->bytenr, - node->ref_mod * sgn); - break; - } - case BTRFS_EXTENT_DATA_REF_KEY: { - struct btrfs_delayed_data_ref *ref; - struct btrfs_key key; - - ref = btrfs_delayed_node_to_data_ref(node); - - key.objectid = ref->objectid; - key.type = BTRFS_EXTENT_DATA_KEY; - key.offset = ref->offset; - ret = __add_prelim_ref(prefs, ref->root, &key, 0, 0, - node->bytenr, - node->ref_mod * sgn); - break; - } - case BTRFS_SHARED_DATA_REF_KEY: { - struct btrfs_delayed_data_ref *ref; - struct btrfs_key key; - - ref = btrfs_delayed_node_to_data_ref(node); - - key.objectid = ref->objectid; - key.type = BTRFS_EXTENT_DATA_KEY; - key.offset = ref->offset; - ret = __add_prelim_ref(prefs, 
ref->root, &key, 0, - ref->parent, node->bytenr, - node->ref_mod * sgn); - break; - } - default: - WARN_ON(1); - } - BUG_ON(ret); - } - - return 0; -} - -/* - * add all inline backrefs for bytenr to the list - */ -static int __add_inline_refs(struct btrfs_fs_info *fs_info, - struct btrfs_path *path, u64 bytenr, - struct btrfs_key *info_key, int *info_level, - struct list_head *prefs) -{ - int ret = 0; - int slot; - struct extent_buffer *leaf; - struct btrfs_key key; - unsigned long ptr; - unsigned long end; - struct btrfs_extent_item *ei; - u64 flags; - u64 item_size; - - /* - * enumerate all inline refs - */ - leaf = path->nodes[0]; - slot = path->slots[0] - 1; - - item_size = btrfs_item_size_nr(leaf, slot); - BUG_ON(item_size < sizeof(*ei)); - - ei = btrfs_item_ptr(leaf, slot, struct btrfs_extent_item); - flags = btrfs_extent_flags(leaf, ei); - - ptr = (unsigned long)(ei + 1); - end = (unsigned long)ei + item_size; - - if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) { - struct btrfs_tree_block_info *info; - struct btrfs_disk_key disk_key; - - info = (struct btrfs_tree_block_info *)ptr; - *info_level = btrfs_tree_block_level(leaf, info); - btrfs_tree_block_key(leaf, info, &disk_key); - btrfs_disk_key_to_cpu(info_key, &disk_key); - ptr += sizeof(struct btrfs_tree_block_info); - BUG_ON(ptr > end); - } else { - BUG_ON(!(flags & BTRFS_EXTENT_FLAG_DATA)); - } - - while (ptr < end) { - struct btrfs_extent_inline_ref *iref; - u64 offset; - int type; - - iref = (struct btrfs_extent_inline_ref *)ptr; - type = btrfs_extent_inline_ref_type(leaf, iref); - offset = btrfs_extent_inline_ref_offset(leaf, iref); - - switch (type) { - case BTRFS_SHARED_BLOCK_REF_KEY: - ret = __add_prelim_ref(prefs, 0, info_key, - *info_level + 1, offset, - bytenr, 1); - break; - case BTRFS_SHARED_DATA_REF_KEY: { - struct btrfs_shared_data_ref *sdref; - int count; - - sdref = (struct btrfs_shared_data_ref *)(iref + 1); - count = btrfs_shared_data_ref_count(leaf, sdref); - ret = __add_prelim_ref(prefs, 0, NULL, 0, offset, - bytenr, count); - break; - } - case BTRFS_TREE_BLOCK_REF_KEY: - ret = __add_prelim_ref(prefs, offset, info_key, - *info_level + 1, 0, bytenr, 1); - break; - case BTRFS_EXTENT_DATA_REF_KEY: { - struct btrfs_extent_data_ref *dref; - int count; - u64 root; - - dref = (struct btrfs_extent_data_ref *)(&iref->offset); - count = btrfs_extent_data_ref_count(leaf, dref); - key.objectid = btrfs_extent_data_ref_objectid(leaf, - dref); - key.type = BTRFS_EXTENT_DATA_KEY; - key.offset = btrfs_extent_data_ref_offset(leaf, dref); - root = btrfs_extent_data_ref_root(leaf, dref); - ret = __add_prelim_ref(prefs, root, &key, 0, 0, bytenr, - count); - break; - } - default: - WARN_ON(1); - } - BUG_ON(ret); - ptr += btrfs_extent_inline_ref_size(type); - } - - return 0; -} - -/* - * add all non-inline backrefs for bytenr to the list - */ -static int __add_keyed_refs(struct btrfs_fs_info *fs_info, - struct btrfs_path *path, u64 bytenr, - struct btrfs_key *info_key, int info_level, - struct list_head *prefs) -{ - struct btrfs_root *extent_root = fs_info->extent_root; - int ret; - int slot; - struct extent_buffer *leaf; - struct btrfs_key key; - - while (1) { - ret = btrfs_next_item(extent_root, path); - if (ret < 0) - break; - if (ret) { - ret = 0; - break; - } - - slot = path->slots[0]; - leaf = path->nodes[0]; - btrfs_item_key_to_cpu(leaf, &key, slot); - - if (key.objectid != bytenr) - break; - if (key.type < BTRFS_TREE_BLOCK_REF_KEY) - continue; - if (key.type > BTRFS_SHARED_DATA_REF_KEY) - break; - - switch (key.type) { - case 
BTRFS_SHARED_BLOCK_REF_KEY: - ret = __add_prelim_ref(prefs, 0, info_key, - info_level + 1, key.offset, - bytenr, 1); - break; - case BTRFS_SHARED_DATA_REF_KEY: { - struct btrfs_shared_data_ref *sdref; - int count; - - sdref = btrfs_item_ptr(leaf, slot, - struct btrfs_shared_data_ref); - count = btrfs_shared_data_ref_count(leaf, sdref); - ret = __add_prelim_ref(prefs, 0, NULL, 0, key.offset, - bytenr, count); - break; - } - case BTRFS_TREE_BLOCK_REF_KEY: - ret = __add_prelim_ref(prefs, key.offset, info_key, - info_level + 1, 0, bytenr, 1); - break; - case BTRFS_EXTENT_DATA_REF_KEY: { - struct btrfs_extent_data_ref *dref; - int count; - u64 root; - - dref = btrfs_item_ptr(leaf, slot, - struct btrfs_extent_data_ref); - count = btrfs_extent_data_ref_count(leaf, dref); - key.objectid = btrfs_extent_data_ref_objectid(leaf, - dref); - key.type = BTRFS_EXTENT_DATA_KEY; - key.offset = btrfs_extent_data_ref_offset(leaf, dref); - root = btrfs_extent_data_ref_root(leaf, dref); - ret = __add_prelim_ref(prefs, root, &key, 0, 0, - bytenr, count); - break; - } - default: - WARN_ON(1); - } - BUG_ON(ret); - } - - return ret; -} - -/* - * this adds all existing backrefs (inline backrefs, backrefs and delayed - * refs) for the given bytenr to the refs list, merges duplicates and resolves - * indirect refs to their parent bytenr. - * When roots are found, they're added to the roots list - * - * FIXME some caching might speed things up - */ -static int find_parent_nodes(struct btrfs_trans_handle *trans, - struct btrfs_fs_info *fs_info, u64 bytenr, - u64 seq, struct ulist *refs, struct ulist *roots) -{ - struct btrfs_key key; - struct btrfs_path *path; - struct btrfs_key info_key = { 0 }; - struct btrfs_delayed_ref_root *delayed_refs = NULL; - struct btrfs_delayed_ref_head *head; - int info_level = 0; - int ret; - int search_commit_root = (trans == BTRFS_BACKREF_SEARCH_COMMIT_ROOT); - struct list_head prefs_delayed; - struct list_head prefs; - struct __prelim_ref *ref; - - INIT_LIST_HEAD(&prefs); - INIT_LIST_HEAD(&prefs_delayed); - - key.objectid = bytenr; - key.type = BTRFS_EXTENT_ITEM_KEY; - key.offset = (u64)-1; - - path = btrfs_alloc_path(); - if (!path) - return -ENOMEM; - path->search_commit_root = !!search_commit_root; - - /* - * grab both a lock on the path and a lock on the delayed ref head. 
- * We need both to get a consistent picture of how the refs look - * at a specified point in time - */ -again: - head = NULL; - - ret = btrfs_search_slot(trans, fs_info->extent_root, &key, path, 0, 0); - if (ret < 0) - goto out; - BUG_ON(ret == 0); - - if (trans != BTRFS_BACKREF_SEARCH_COMMIT_ROOT) { - /* - * look if there are updates for this ref queued and lock the - * head - */ - delayed_refs = &trans->transaction->delayed_refs; - spin_lock(&delayed_refs->lock); - head = btrfs_find_delayed_ref_head(trans, bytenr); - if (head) { - if (!mutex_trylock(&head->mutex)) { - atomic_inc(&head->node.refs); - spin_unlock(&delayed_refs->lock); - - btrfs_release_path(path); - - /* - * Mutex was contended, block until it's - * released and try again - */ - mutex_lock(&head->mutex); - mutex_unlock(&head->mutex); - btrfs_put_delayed_ref(&head->node); - goto again; - } - ret = __add_delayed_refs(head, seq, &info_key, - &prefs_delayed); - if (ret) { - spin_unlock(&delayed_refs->lock); - goto out; - } - } - spin_unlock(&delayed_refs->lock); - } - - if (path->slots[0]) { - struct extent_buffer *leaf; - int slot; - - leaf = path->nodes[0]; - slot = path->slots[0] - 1; - btrfs_item_key_to_cpu(leaf, &key, slot); - if (key.objectid == bytenr && - key.type == BTRFS_EXTENT_ITEM_KEY) { - ret = __add_inline_refs(fs_info, path, bytenr, - &info_key, &info_level, &prefs); - if (ret) - goto out; - ret = __add_keyed_refs(fs_info, path, bytenr, &info_key, - info_level, &prefs); - if (ret) - goto out; - } - } - btrfs_release_path(path); - - /* - * when adding the delayed refs above, the info_key might not have - * been known yet. Go over the list and replace the missing keys - */ - list_for_each_entry(ref, &prefs_delayed, list) { - if ((ref->key.offset | ref->key.type | ref->key.objectid) == 0) - memcpy(&ref->key, &info_key, sizeof(ref->key)); - } - list_splice_init(&prefs_delayed, &prefs); - - ret = __merge_refs(&prefs, 1); - if (ret) - goto out; - - ret = __resolve_indirect_refs(fs_info, search_commit_root, &prefs); - if (ret) - goto out; - - ret = __merge_refs(&prefs, 2); - if (ret) - goto out; - - while (!list_empty(&prefs)) { - ref = list_first_entry(&prefs, struct __prelim_ref, list); - list_del(&ref->list); - if (ref->count < 0) - WARN_ON(1); - if (ref->count && ref->root_id && ref->parent == 0) { - /* no parent == root of tree */ - ret = ulist_add(roots, ref->root_id, 0, GFP_NOFS); - BUG_ON(ret < 0); - } - if (ref->count && ref->parent) { - ret = ulist_add(refs, ref->parent, 0, GFP_NOFS); - BUG_ON(ret < 0); - } - kfree(ref); - } - -out: - if (head) - mutex_unlock(&head->mutex); - btrfs_free_path(path); - while (!list_empty(&prefs)) { - ref = list_first_entry(&prefs, struct __prelim_ref, list); - list_del(&ref->list); - kfree(ref); - } - while (!list_empty(&prefs_delayed)) { - ref = list_first_entry(&prefs_delayed, struct __prelim_ref, - list); - list_del(&ref->list); - kfree(ref); - } - - return ret; -} - -/* - * Finds all leafs with a reference to the specified combination of bytenr and - * offset. key_list_head will point to a list of corresponding keys (caller must - * free each list element). The leafs will be stored in the leafs ulist, which - * must be freed with ulist_free. 
- * - * returns 0 on success, <0 on error - */ -static int btrfs_find_all_leafs(struct btrfs_trans_handle *trans, - struct btrfs_fs_info *fs_info, u64 bytenr, - u64 num_bytes, u64 seq, struct ulist **leafs) -{ - struct ulist *tmp; - int ret; - - tmp = ulist_alloc(GFP_NOFS); - if (!tmp) - return -ENOMEM; - *leafs = ulist_alloc(GFP_NOFS); - if (!*leafs) { - ulist_free(tmp); - return -ENOMEM; - } - - ret = find_parent_nodes(trans, fs_info, bytenr, seq, *leafs, tmp); - ulist_free(tmp); - - if (ret < 0 && ret != -ENOENT) { - ulist_free(*leafs); - return ret; - } - - return 0; -} - -/* - * walk all backrefs for a given extent to find all roots that reference this - * extent. Walking a backref means finding all extents that reference this - * extent and in turn walk the backrefs of those, too. Naturally this is a - * recursive process, but here it is implemented in an iterative fashion: We - * find all referencing extents for the extent in question and put them on a - * list. In turn, we find all referencing extents for those, further appending - * to the list. The way we iterate the list allows adding more elements after - * the current while iterating. The process stops when we reach the end of the - * list. Found roots are added to the roots list. - * - * returns 0 on success, < 0 on error. - */ -int btrfs_find_all_roots(struct btrfs_trans_handle *trans, - struct btrfs_fs_info *fs_info, u64 bytenr, - u64 num_bytes, u64 seq, struct ulist **roots) -{ - struct ulist *tmp; - struct ulist_node *node = NULL; - int ret; - - tmp = ulist_alloc(GFP_NOFS); - if (!tmp) - return -ENOMEM; - *roots = ulist_alloc(GFP_NOFS); - if (!*roots) { - ulist_free(tmp); - return -ENOMEM; - } - - while (1) { - ret = find_parent_nodes(trans, fs_info, bytenr, seq, - tmp, *roots); - if (ret < 0 && ret != -ENOENT) { - ulist_free(tmp); - ulist_free(*roots); - return ret; - } - node = ulist_next(tmp, node); - if (!node) - break; - bytenr = node->val; - } - - ulist_free(tmp); - return 0; -} - - -static int __inode_info(u64 inum, u64 ioff, u8 key_type, - struct btrfs_root *fs_root, struct btrfs_path *path, - struct btrfs_key *found_key) -{ - int ret; - struct btrfs_key key; - struct extent_buffer *eb; - - key.type = key_type; - key.objectid = inum; - key.offset = ioff; - - ret = btrfs_search_slot(NULL, fs_root, &key, path, 0, 0); - if (ret < 0) - return ret; - - eb = path->nodes[0]; - if (ret && path->slots[0] >= btrfs_header_nritems(eb)) { - ret = btrfs_next_leaf(fs_root, path); - if (ret) - return ret; - eb = path->nodes[0]; - } - - btrfs_item_key_to_cpu(eb, found_key, path->slots[0]); - if (found_key->type != key.type || found_key->objectid != key.objectid) - return 1; - - return 0; -} - -/* - * this makes the path point to (inum INODE_ITEM ioff) - */ -int inode_item_info(u64 inum, u64 ioff, struct btrfs_root *fs_root, - struct btrfs_path *path) -{ - struct btrfs_key key; - return __inode_info(inum, ioff, BTRFS_INODE_ITEM_KEY, fs_root, path, - &key); -} - -static int inode_ref_info(u64 inum, u64 ioff, struct btrfs_root *fs_root, - struct btrfs_path *path, - struct btrfs_key *found_key) -{ - return __inode_info(inum, ioff, BTRFS_INODE_REF_KEY, fs_root, path, - found_key); -} - -/* - * this iterates to turn a btrfs_inode_ref into a full filesystem path. elements - * of the path are separated by '/' and the path is guaranteed to be - * 0-terminated. the path is only given within the current file system. - * Therefore, it never starts with a '/'. the caller is responsible to provide - * "size" bytes in "dest". 
the dest buffer will be filled backwards. finally, - * the start point of the resulting string is returned. this pointer is within - * dest, normally. - * in case the path buffer would overflow, the pointer is decremented further - * as if output was written to the buffer, though no more output is actually - * generated. that way, the caller can determine how much space would be - * required for the path to fit into the buffer. in that case, the returned - * value will be smaller than dest. callers must check this! - */ -static char *iref_to_path(struct btrfs_root *fs_root, struct btrfs_path *path, - struct btrfs_inode_ref *iref, - struct extent_buffer *eb_in, u64 parent, - char *dest, u32 size) -{ - u32 len; - int slot; - u64 next_inum; - int ret; - s64 bytes_left = size - 1; - struct extent_buffer *eb = eb_in; - struct btrfs_key found_key; - int leave_spinning = path->leave_spinning; - - if (bytes_left >= 0) - dest[bytes_left] = '\0'; - - path->leave_spinning = 1; - while (1) { - len = btrfs_inode_ref_name_len(eb, iref); - bytes_left -= len; - if (bytes_left >= 0) - read_extent_buffer(eb, dest + bytes_left, - (unsigned long)(iref + 1), len); - if (eb != eb_in) { - btrfs_tree_read_unlock_blocking(eb); - free_extent_buffer(eb); - } - ret = inode_ref_info(parent, 0, fs_root, path, &found_key); - if (ret > 0) - ret = -ENOENT; - if (ret) - break; - next_inum = found_key.offset; - - /* regular exit ahead */ - if (parent == next_inum) - break; - - slot = path->slots[0]; - eb = path->nodes[0]; - /* make sure we can use eb after releasing the path */ - if (eb != eb_in) { - atomic_inc(&eb->refs); - btrfs_tree_read_lock(eb); - btrfs_set_lock_blocking_rw(eb, BTRFS_READ_LOCK); - } - btrfs_release_path(path); - - iref = btrfs_item_ptr(eb, slot, struct btrfs_inode_ref); - parent = next_inum; - --bytes_left; - if (bytes_left >= 0) - dest[bytes_left] = '/'; - } - - btrfs_release_path(path); - path->leave_spinning = leave_spinning; - - if (ret) - return ERR_PTR(ret); - - return dest + bytes_left; -} - -/* - * this makes the path point to (logical EXTENT_ITEM *) - * returns BTRFS_EXTENT_FLAG_DATA for data, BTRFS_EXTENT_FLAG_TREE_BLOCK for - * tree blocks and <0 on error. 
- */ -int extent_from_logical(struct btrfs_fs_info *fs_info, u64 logical, - struct btrfs_path *path, struct btrfs_key *found_key) -{ - int ret; - u64 flags; - u32 item_size; - struct extent_buffer *eb; - struct btrfs_extent_item *ei; - struct btrfs_key key; - - key.type = BTRFS_EXTENT_ITEM_KEY; - key.objectid = logical; - key.offset = (u64)-1; - - ret = btrfs_search_slot(NULL, fs_info->extent_root, &key, path, 0, 0); - if (ret < 0) - return ret; - ret = btrfs_previous_item(fs_info->extent_root, path, - 0, BTRFS_EXTENT_ITEM_KEY); - if (ret < 0) - return ret; - - btrfs_item_key_to_cpu(path->nodes[0], found_key, path->slots[0]); - if (found_key->type != BTRFS_EXTENT_ITEM_KEY || - found_key->objectid > logical || - found_key->objectid + found_key->offset <= logical) { - pr_debug("logical %llu is not within any extent\n", - (unsigned long long)logical); - return -ENOENT; - } - - eb = path->nodes[0]; - item_size = btrfs_item_size_nr(eb, path->slots[0]); - BUG_ON(item_size < sizeof(*ei)); - - ei = btrfs_item_ptr(eb, path->slots[0], struct btrfs_extent_item); - flags = btrfs_extent_flags(eb, ei); - - pr_debug("logical %llu is at position %llu within the extent (%llu " - "EXTENT_ITEM %llu) flags %#llx size %u\n", - (unsigned long long)logical, - (unsigned long long)(logical - found_key->objectid), - (unsigned long long)found_key->objectid, - (unsigned long long)found_key->offset, - (unsigned long long)flags, item_size); - if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) - return BTRFS_EXTENT_FLAG_TREE_BLOCK; - if (flags & BTRFS_EXTENT_FLAG_DATA) - return BTRFS_EXTENT_FLAG_DATA; - - return -EIO; -} - -/* - * helper function to iterate extent inline refs. ptr must point to a 0 value - * for the first call and may be modified. it is used to track state. - * if more refs exist, 0 is returned and the next call to - * __get_extent_inline_ref must pass the modified ptr parameter to get the - * next ref. after the last ref was processed, 1 is returned. - * returns <0 on error - */ -static int __get_extent_inline_ref(unsigned long *ptr, struct extent_buffer *eb, - struct btrfs_extent_item *ei, u32 item_size, - struct btrfs_extent_inline_ref **out_eiref, - int *out_type) -{ - unsigned long end; - u64 flags; - struct btrfs_tree_block_info *info; - - if (!*ptr) { - /* first call */ - flags = btrfs_extent_flags(eb, ei); - if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) { - info = (struct btrfs_tree_block_info *)(ei + 1); - *out_eiref = - (struct btrfs_extent_inline_ref *)(info + 1); - } else { - *out_eiref = (struct btrfs_extent_inline_ref *)(ei + 1); - } - *ptr = (unsigned long)*out_eiref; - if ((void *)*ptr >= (void *)ei + item_size) - return -ENOENT; - } - - end = (unsigned long)ei + item_size; - *out_eiref = (struct btrfs_extent_inline_ref *)*ptr; - *out_type = btrfs_extent_inline_ref_type(eb, *out_eiref); - - *ptr += btrfs_extent_inline_ref_size(*out_type); - WARN_ON(*ptr > end); - if (*ptr == end) - return 1; /* last */ - - return 0; -} - -/* - * reads the tree block backref for an extent. tree level and root are returned - * through out_level and out_root. ptr must point to a 0 value for the first - * call and may be modified (see __get_extent_inline_ref comment). - * returns 0 if data was provided, 1 if there was no more data to provide or - * <0 on error. 
- */ -int tree_backref_for_extent(unsigned long *ptr, struct extent_buffer *eb, - struct btrfs_extent_item *ei, u32 item_size, - u64 *out_root, u8 *out_level) -{ - int ret; - int type; - struct btrfs_tree_block_info *info; - struct btrfs_extent_inline_ref *eiref; - - if (*ptr == (unsigned long)-1) - return 1; - - while (1) { - ret = __get_extent_inline_ref(ptr, eb, ei, item_size, - &eiref, &type); - if (ret < 0) - return ret; - - if (type == BTRFS_TREE_BLOCK_REF_KEY || - type == BTRFS_SHARED_BLOCK_REF_KEY) - break; - - if (ret == 1) - return 1; - } - - /* we can treat both ref types equally here */ - info = (struct btrfs_tree_block_info *)(ei + 1); - *out_root = btrfs_extent_inline_ref_offset(eb, eiref); - *out_level = btrfs_tree_block_level(eb, info); - - if (ret == 1) - *ptr = (unsigned long)-1; - - return 0; -} - -static int iterate_leaf_refs(struct btrfs_fs_info *fs_info, u64 logical, - u64 orig_extent_item_objectid, - u64 extent_item_pos, u64 root, - iterate_extent_inodes_t *iterate, void *ctx) -{ - u64 disk_byte; - struct btrfs_key key; - struct btrfs_file_extent_item *fi; - struct extent_buffer *eb; - int slot; - int nritems; - int ret = 0; - int extent_type; - u64 data_offset; - u64 data_len; - - eb = read_tree_block(fs_info->tree_root, logical, - fs_info->tree_root->leafsize, 0); - if (!eb) - return -EIO; - - /* - * from the shared data ref, we only have the leaf but we need - * the key. thus, we must look into all items and see that we - * find one (some) with a reference to our extent item. - */ - nritems = btrfs_header_nritems(eb); - for (slot = 0; slot < nritems; ++slot) { - btrfs_item_key_to_cpu(eb, &key, slot); - if (key.type != BTRFS_EXTENT_DATA_KEY) - continue; - fi = btrfs_item_ptr(eb, slot, struct btrfs_file_extent_item); - extent_type = btrfs_file_extent_type(eb, fi); - if (extent_type == BTRFS_FILE_EXTENT_INLINE) - continue; - /* don't skip BTRFS_FILE_EXTENT_PREALLOC, we can handle that */ - disk_byte = btrfs_file_extent_disk_bytenr(eb, fi); - if (disk_byte != orig_extent_item_objectid) - continue; - - data_offset = btrfs_file_extent_offset(eb, fi); - data_len = btrfs_file_extent_num_bytes(eb, fi); - - if (extent_item_pos < data_offset || - extent_item_pos >= data_offset + data_len) - continue; - - pr_debug("ref for %llu resolved, key (%llu EXTEND_DATA %llu), " - "root %llu\n", orig_extent_item_objectid, - key.objectid, key.offset, root); - ret = iterate(key.objectid, - key.offset + (extent_item_pos - data_offset), - root, ctx); - if (ret) { - pr_debug("stopping iteration because ret=%d\n", ret); - break; - } - } - - free_extent_buffer(eb); - - return ret; -} - -/* - * calls iterate() for every inode that references the extent identified by - * the given parameters. - * when the iterator function returns a non-zero value, iteration stops. 
- */ -int iterate_extent_inodes(struct btrfs_fs_info *fs_info, - u64 extent_item_objectid, u64 extent_item_pos, - int search_commit_root, - iterate_extent_inodes_t *iterate, void *ctx) -{ - int ret; - struct list_head data_refs = LIST_HEAD_INIT(data_refs); - struct list_head shared_refs = LIST_HEAD_INIT(shared_refs); - struct btrfs_trans_handle *trans; - struct ulist *refs = NULL; - struct ulist *roots = NULL; - struct ulist_node *ref_node = NULL; - struct ulist_node *root_node = NULL; - struct seq_list seq_elem; - struct btrfs_delayed_ref_root *delayed_refs = NULL; - - pr_debug("resolving all inodes for extent %llu\n", - extent_item_objectid); - - if (search_commit_root) { - trans = BTRFS_BACKREF_SEARCH_COMMIT_ROOT; - } else { - trans = btrfs_join_transaction(fs_info->extent_root); - if (IS_ERR(trans)) - return PTR_ERR(trans); - - delayed_refs = &trans->transaction->delayed_refs; - spin_lock(&delayed_refs->lock); - btrfs_get_delayed_seq(delayed_refs, &seq_elem); - spin_unlock(&delayed_refs->lock); - } - - ret = btrfs_find_all_leafs(trans, fs_info, extent_item_objectid, - extent_item_pos, seq_elem.seq, - &refs); - - if (ret) - goto out; - - while (!ret && (ref_node = ulist_next(refs, ref_node))) { - ret = btrfs_find_all_roots(trans, fs_info, ref_node->val, -1, - seq_elem.seq, &roots); - if (ret) - break; - while (!ret && (root_node = ulist_next(roots, root_node))) { - pr_debug("root %llu references leaf %llu\n", - root_node->val, ref_node->val); - ret = iterate_leaf_refs(fs_info, ref_node->val, - extent_item_objectid, - extent_item_pos, root_node->val, - iterate, ctx); - } - } - - ulist_free(refs); - ulist_free(roots); -out: - if (!search_commit_root) { - btrfs_put_delayed_seq(delayed_refs, &seq_elem); - btrfs_end_transaction(trans, fs_info->extent_root); - } - - return ret; -} - -int iterate_inodes_from_logical(u64 logical, struct btrfs_fs_info *fs_info, - struct btrfs_path *path, - iterate_extent_inodes_t *iterate, void *ctx) -{ - int ret; - u64 extent_item_pos; - struct btrfs_key found_key; - int search_commit_root = path->search_commit_root; - - ret = extent_from_logical(fs_info, logical, path, - &found_key); - btrfs_release_path(path); - if (ret & BTRFS_EXTENT_FLAG_TREE_BLOCK) - ret = -EINVAL; - if (ret < 0) - return ret; - - extent_item_pos = logical - found_key.objectid; - ret = iterate_extent_inodes(fs_info, found_key.objectid, - extent_item_pos, search_commit_root, - iterate, ctx); - - return ret; -} - -static int iterate_irefs(u64 inum, struct btrfs_root *fs_root, - struct btrfs_path *path, - iterate_irefs_t *iterate, void *ctx) -{ - int ret = 0; - int slot; - u32 cur; - u32 len; - u32 name_len; - u64 parent = 0; - int found = 0; - struct extent_buffer *eb; - struct btrfs_item *item; - struct btrfs_inode_ref *iref; - struct btrfs_key found_key; - - while (!ret) { - path->leave_spinning = 1; - ret = inode_ref_info(inum, parent ? parent+1 : 0, fs_root, path, - &found_key); - if (ret < 0) - break; - if (ret) { - ret = found ? 0 : -ENOENT; - break; - } - ++found; - - parent = found_key.offset; - slot = path->slots[0]; - eb = path->nodes[0]; - /* make sure we can use eb after releasing the path */ - atomic_inc(&eb->refs); - btrfs_tree_read_lock(eb); - btrfs_set_lock_blocking_rw(eb, BTRFS_READ_LOCK); - btrfs_release_path(path); - - item = btrfs_item_nr(eb, slot); - iref = btrfs_item_ptr(eb, slot, struct btrfs_inode_ref); - - for (cur = 0; cur < btrfs_item_size(eb, item); cur += len) { - name_len = btrfs_inode_ref_name_len(eb, iref); - /* path must be released before calling iterate()! 
*/ - pr_debug("following ref at offset %u for inode %llu in " - "tree %llu\n", cur, - (unsigned long long)found_key.objectid, - (unsigned long long)fs_root->objectid); - ret = iterate(parent, iref, eb, ctx); - if (ret) - break; - len = sizeof(*iref) + name_len; - iref = (struct btrfs_inode_ref *)((char *)iref + len); - } - btrfs_tree_read_unlock_blocking(eb); - free_extent_buffer(eb); - } - - btrfs_release_path(path); - - return ret; -} - -/* - * returns 0 if the path could be dumped (probably truncated) - * returns <0 in case of an error - */ -static int inode_to_path(u64 inum, struct btrfs_inode_ref *iref, - struct extent_buffer *eb, void *ctx) -{ - struct inode_fs_paths *ipath = ctx; - char *fspath; - char *fspath_min; - int i = ipath->fspath->elem_cnt; - const int s_ptr = sizeof(char *); - u32 bytes_left; - - bytes_left = ipath->fspath->bytes_left > s_ptr ? - ipath->fspath->bytes_left - s_ptr : 0; - - fspath_min = (char *)ipath->fspath->val + (i + 1) * s_ptr; - fspath = iref_to_path(ipath->fs_root, ipath->btrfs_path, iref, eb, - inum, fspath_min, bytes_left); - if (IS_ERR(fspath)) - return PTR_ERR(fspath); - - if (fspath > fspath_min) { - pr_debug("path resolved: %s\n", fspath); - ipath->fspath->val[i] = (u64)(unsigned long)fspath; - ++ipath->fspath->elem_cnt; - ipath->fspath->bytes_left = fspath - fspath_min; - } else { - pr_debug("missed path, not enough space. missing bytes: %lu, " - "constructed so far: %s\n", - (unsigned long)(fspath_min - fspath), fspath_min); - ++ipath->fspath->elem_missed; - ipath->fspath->bytes_missing += fspath_min - fspath; - ipath->fspath->bytes_left = 0; - } - - return 0; -} - -/* - * this dumps all file system paths to the inode into the ipath struct, provided - * it has been created large enough. each path is zero-terminated and accessed - * from ipath->fspath->val[i]. - * when it returns, there are ipath->fspath->elem_cnt number of paths available - * in ipath->fspath->val[]. when the allocated space wasn't sufficient, the - * number of missed paths is recorded in ipath->fspath->elem_missed, otherwise, - * it's zero. ipath->fspath->bytes_missing holds the number of bytes that would - * have been needed to return all paths. - */ -int paths_from_inode(u64 inum, struct inode_fs_paths *ipath) -{ - return iterate_irefs(inum, ipath->fs_root, ipath->btrfs_path, - inode_to_path, ipath); -} - -struct btrfs_data_container *init_data_container(u32 total_bytes) -{ - struct btrfs_data_container *data; - size_t alloc_bytes; - - alloc_bytes = max_t(size_t, total_bytes, sizeof(*data)); - data = kmalloc(alloc_bytes, GFP_NOFS); - if (!data) - return ERR_PTR(-ENOMEM); - - if (total_bytes >= sizeof(*data)) { - data->bytes_left = total_bytes - sizeof(*data); - data->bytes_missing = 0; - } else { - data->bytes_missing = sizeof(*data) - total_bytes; - data->bytes_left = 0; - } - - data->elem_cnt = 0; - data->elem_missed = 0; - - return data; -} - -/* - * allocates space to return multiple file system paths for an inode. - * total_bytes to allocate are passed, note that space usable for actual path - * information will be total_bytes - sizeof(struct inode_fs_paths). - * the returned pointer must be freed with free_ipath() in the end. 
- */ -struct inode_fs_paths *init_ipath(s32 total_bytes, struct btrfs_root *fs_root, - struct btrfs_path *path) -{ - struct inode_fs_paths *ifp; - struct btrfs_data_container *fspath; - - fspath = init_data_container(total_bytes); - if (IS_ERR(fspath)) - return (void *)fspath; - - ifp = kmalloc(sizeof(*ifp), GFP_NOFS); - if (!ifp) { - kfree(fspath); - return ERR_PTR(-ENOMEM); - } - - ifp->btrfs_path = path; - ifp->fspath = fspath; - ifp->fs_root = fs_root; - - return ifp; -} - -void free_ipath(struct inode_fs_paths *ipath) -{ - if (!ipath) - return; - kfree(ipath->fspath); - kfree(ipath); -} diff --git a/ANDROID_3.4.5/fs/btrfs/backref.h b/ANDROID_3.4.5/fs/btrfs/backref.h deleted file mode 100644 index 57ea2e95..00000000 --- a/ANDROID_3.4.5/fs/btrfs/backref.h +++ /dev/null @@ -1,68 +0,0 @@ -/* - * Copyright (C) 2011 STRATO. All rights reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public - * License v2 as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public - * License along with this program; if not, write to the - * Free Software Foundation, Inc., 59 Temple Place - Suite 330, - * Boston, MA 021110-1307, USA. - */ - -#ifndef __BTRFS_BACKREF__ -#define __BTRFS_BACKREF__ - -#include "ioctl.h" -#include "ulist.h" - -#define BTRFS_BACKREF_SEARCH_COMMIT_ROOT ((struct btrfs_trans_handle *)0) - -struct inode_fs_paths { - struct btrfs_path *btrfs_path; - struct btrfs_root *fs_root; - struct btrfs_data_container *fspath; -}; - -typedef int (iterate_extent_inodes_t)(u64 inum, u64 offset, u64 root, - void *ctx); -typedef int (iterate_irefs_t)(u64 parent, struct btrfs_inode_ref *iref, - struct extent_buffer *eb, void *ctx); - -int inode_item_info(u64 inum, u64 ioff, struct btrfs_root *fs_root, - struct btrfs_path *path); - -int extent_from_logical(struct btrfs_fs_info *fs_info, u64 logical, - struct btrfs_path *path, struct btrfs_key *found_key); - -int tree_backref_for_extent(unsigned long *ptr, struct extent_buffer *eb, - struct btrfs_extent_item *ei, u32 item_size, - u64 *out_root, u8 *out_level); - -int iterate_extent_inodes(struct btrfs_fs_info *fs_info, - u64 extent_item_objectid, - u64 extent_offset, int search_commit_root, - iterate_extent_inodes_t *iterate, void *ctx); - -int iterate_inodes_from_logical(u64 logical, struct btrfs_fs_info *fs_info, - struct btrfs_path *path, - iterate_extent_inodes_t *iterate, void *ctx); - -int paths_from_inode(u64 inum, struct inode_fs_paths *ipath); - -int btrfs_find_all_roots(struct btrfs_trans_handle *trans, - struct btrfs_fs_info *fs_info, u64 bytenr, - u64 num_bytes, u64 seq, struct ulist **roots); - -struct btrfs_data_container *init_data_container(u32 total_bytes); -struct inode_fs_paths *init_ipath(s32 total_bytes, struct btrfs_root *fs_root, - struct btrfs_path *path); -void free_ipath(struct inode_fs_paths *ipath); - -#endif diff --git a/ANDROID_3.4.5/fs/btrfs/btrfs_inode.h b/ANDROID_3.4.5/fs/btrfs/btrfs_inode.h deleted file mode 100644 index 9b9b15fd..00000000 --- a/ANDROID_3.4.5/fs/btrfs/btrfs_inode.h +++ /dev/null @@ -1,205 +0,0 @@ -/* - * Copyright (C) 2007 Oracle. All rights reserved. 
- * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public - * License v2 as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public - * License along with this program; if not, write to the - * Free Software Foundation, Inc., 59 Temple Place - Suite 330, - * Boston, MA 021110-1307, USA. - */ - -#ifndef __BTRFS_I__ -#define __BTRFS_I__ - -#include "extent_map.h" -#include "extent_io.h" -#include "ordered-data.h" -#include "delayed-inode.h" - -/* in memory btrfs inode */ -struct btrfs_inode { - /* which subvolume this inode belongs to */ - struct btrfs_root *root; - - /* key used to find this inode on disk. This is used by the code - * to read in roots of subvolumes - */ - struct btrfs_key location; - - /* Lock for counters */ - spinlock_t lock; - - /* the extent_tree has caches of all the extent mappings to disk */ - struct extent_map_tree extent_tree; - - /* the io_tree does range state (DIRTY, LOCKED etc) */ - struct extent_io_tree io_tree; - - /* special utility tree used to record which mirrors have already been - * tried when checksums fail for a given block - */ - struct extent_io_tree io_failure_tree; - - /* held while logging the inode in tree-log.c */ - struct mutex log_mutex; - - /* held while doing delalloc reservations */ - struct mutex delalloc_mutex; - - /* used to order data wrt metadata */ - struct btrfs_ordered_inode_tree ordered_tree; - - /* for keeping track of orphaned inodes */ - struct list_head i_orphan; - - /* list of all the delalloc inodes in the FS. There are times we need - * to write all the delalloc pages to disk, and this list is used - * to walk them all. - */ - struct list_head delalloc_inodes; - - /* - * list for tracking inodes that must be sent to disk before a - * rename or truncate commit - */ - struct list_head ordered_operations; - - /* node for the red-black tree that links inodes in subvolume root */ - struct rb_node rb_node; - - /* the space_info for where this inode's data allocations are done */ - struct btrfs_space_info *space_info; - - /* full 64 bit generation number, struct vfs_inode doesn't have a big - * enough field for this. - */ - u64 generation; - - /* sequence number for NFS changes */ - u64 sequence; - - /* - * transid of the trans_handle that last modified this inode - */ - u64 last_trans; - - /* - * log transid when this inode was last modified - */ - u64 last_sub_trans; - - /* - * transid that last logged this inode - */ - u64 logged_trans; - - /* total number of bytes pending delalloc, used by stat to calc the - * real block usage of the file - */ - u64 delalloc_bytes; - - /* - * the size of the file stored in the metadata on disk. data=ordered - * means the in-memory i_size might be larger than the size on disk - * because not all the blocks are written yet. - */ - u64 disk_i_size; - - /* - * if this is a directory then index_cnt is the counter for the index - * number for new files that are created - */ - u64 index_cnt; - - /* the fsync log has some corner cases that mean we have to check - * directories to see if any unlinks have been done before - * the directory was logged. 
See tree-log.c for all the - * details - */ - u64 last_unlink_trans; - - /* - * Number of bytes outstanding that are going to need csums. This is - * used in ENOSPC accounting. - */ - u64 csum_bytes; - - /* flags field from the on disk inode */ - u32 flags; - - /* - * Counters to keep track of the number of extent item's we may use due - * to delalloc and such. outstanding_extents is the number of extent - * items we think we'll end up using, and reserved_extents is the number - * of extent items we've reserved metadata for. - */ - unsigned outstanding_extents; - unsigned reserved_extents; - - /* - * ordered_data_close is set by truncate when a file that used - * to have good data has been truncated to zero. When it is set - * the btrfs file release call will add this inode to the - * ordered operations list so that we make sure to flush out any - * new data the application may have written before commit. - */ - unsigned ordered_data_close:1; - unsigned orphan_meta_reserved:1; - unsigned dummy_inode:1; - unsigned in_defrag:1; - unsigned delalloc_meta_reserved:1; - - /* - * always compress this one file - */ - unsigned force_compress:4; - - struct btrfs_delayed_node *delayed_node; - - struct inode vfs_inode; -}; - -extern unsigned char btrfs_filetype_table[]; - -static inline struct btrfs_inode *BTRFS_I(struct inode *inode) -{ - return container_of(inode, struct btrfs_inode, vfs_inode); -} - -static inline u64 btrfs_ino(struct inode *inode) -{ - u64 ino = BTRFS_I(inode)->location.objectid; - - /* - * !ino: btree_inode - * type == BTRFS_ROOT_ITEM_KEY: subvol dir - */ - if (!ino || BTRFS_I(inode)->location.type == BTRFS_ROOT_ITEM_KEY) - ino = inode->i_ino; - return ino; -} - -static inline void btrfs_i_size_write(struct inode *inode, u64 size) -{ - i_size_write(inode, size); - BTRFS_I(inode)->disk_i_size = size; -} - -static inline bool btrfs_is_free_space_inode(struct btrfs_root *root, - struct inode *inode) -{ - if (root == root->fs_info->tree_root || - BTRFS_I(inode)->location.objectid == BTRFS_FREE_INO_OBJECTID) - return true; - return false; -} - -#endif diff --git a/ANDROID_3.4.5/fs/btrfs/check-integrity.c b/ANDROID_3.4.5/fs/btrfs/check-integrity.c deleted file mode 100644 index c053e90f..00000000 --- a/ANDROID_3.4.5/fs/btrfs/check-integrity.c +++ /dev/null @@ -1,3068 +0,0 @@ -/* - * Copyright (C) STRATO AG 2011. All rights reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public - * License v2 as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public - * License along with this program; if not, write to the - * Free Software Foundation, Inc., 59 Temple Place - Suite 330, - * Boston, MA 021110-1307, USA. - */ - -/* - * This module can be used to catch cases when the btrfs kernel - * code executes write requests to the disk that bring the file - * system in an inconsistent state. In such a state, a power-loss - * or kernel panic event would cause that the data on disk is - * lost or at least damaged. - * - * Code is added that examines all block write requests during - * runtime (including writes of the super block). Three rules - * are verified and an error is printed on violation of the - * rules: - * 1. 
It is not allowed to write a disk block which is - * currently referenced by the super block (either directly - * or indirectly). - * 2. When a super block is written, it is verified that all - * referenced (directly or indirectly) blocks fulfill the - * following requirements: - * 2a. All referenced blocks have either been present when - * the file system was mounted, (i.e., they have been - * referenced by the super block) or they have been - * written since then and the write completion callback - * was called and a FLUSH request to the device where - * these blocks are located was received and completed. - * 2b. All referenced blocks need to have a generation - * number which is equal to the parent's number. - * - * One issue that was found using this module was that the log - * tree on disk became temporarily corrupted because disk blocks - * that had been in use for the log tree had been freed and - * reused too early, while being referenced by the written super - * block. - * - * The search term in the kernel log that can be used to filter - * on the existence of detected integrity issues is - * "btrfs: attempt". - * - * The integrity check is enabled via mount options. These - * mount options are only supported if the integrity check - * tool is compiled by defining BTRFS_FS_CHECK_INTEGRITY. - * - * Example #1, apply integrity checks to all metadata: - * mount /dev/sdb1 /mnt -o check_int - * - * Example #2, apply integrity checks to all metadata and - * to data extents: - * mount /dev/sdb1 /mnt -o check_int_data - * - * Example #3, apply integrity checks to all metadata and dump - * the tree that the super block references to kernel messages - * each time after a super block was written: - * mount /dev/sdb1 /mnt -o check_int,check_int_print_mask=263 - * - * If the integrity check tool is included and activated in - * the mount options, plenty of kernel memory is used, and - * plenty of additional CPU cycles are spent. Enabling this - * functionality is not intended for normal use. In most - * cases, unless you are a btrfs developer who needs to verify - * the integrity of (super)-block write requests, do not - * enable the config option BTRFS_FS_CHECK_INTEGRITY to - * include and compile the integrity check tool. - */ - -#include <linux/sched.h> -#include <linux/slab.h> -#include <linux/buffer_head.h> -#include <linux/mutex.h> -#include <linux/crc32c.h> -#include <linux/genhd.h> -#include <linux/blkdev.h> -#include "ctree.h" -#include "disk-io.h" -#include "transaction.h" -#include "extent_io.h" -#include "volumes.h" -#include "print-tree.h" -#include "locking.h" -#include "check-integrity.h" - -#define BTRFSIC_BLOCK_HASHTABLE_SIZE 0x10000 -#define BTRFSIC_BLOCK_LINK_HASHTABLE_SIZE 0x10000 -#define BTRFSIC_DEV2STATE_HASHTABLE_SIZE 0x100 -#define BTRFSIC_BLOCK_MAGIC_NUMBER 0x14491051 -#define BTRFSIC_BLOCK_LINK_MAGIC_NUMBER 0x11070807 -#define BTRFSIC_DEV2STATE_MAGIC_NUMBER 0x20111530 -#define BTRFSIC_BLOCK_STACK_FRAME_MAGIC_NUMBER 20111300 -#define BTRFSIC_TREE_DUMP_MAX_INDENT_LEVEL (200 - 6) /* in characters, - * excluding " [...]" */ -#define BTRFSIC_BLOCK_SIZE PAGE_SIZE - -#define BTRFSIC_GENERATION_UNKNOWN ((u64)-1) - -/* - * The definition of the bitmask fields for the print_mask. - * They are specified with the mount option check_integrity_print_mask. 
- */ -#define BTRFSIC_PRINT_MASK_SUPERBLOCK_WRITE 0x00000001 -#define BTRFSIC_PRINT_MASK_ROOT_CHUNK_LOG_TREE_LOCATION 0x00000002 -#define BTRFSIC_PRINT_MASK_TREE_AFTER_SB_WRITE 0x00000004 -#define BTRFSIC_PRINT_MASK_TREE_BEFORE_SB_WRITE 0x00000008 -#define BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH 0x00000010 -#define BTRFSIC_PRINT_MASK_END_IO_BIO_BH 0x00000020 -#define BTRFSIC_PRINT_MASK_VERBOSE 0x00000040 -#define BTRFSIC_PRINT_MASK_VERY_VERBOSE 0x00000080 -#define BTRFSIC_PRINT_MASK_INITIAL_TREE 0x00000100 -#define BTRFSIC_PRINT_MASK_INITIAL_ALL_TREES 0x00000200 -#define BTRFSIC_PRINT_MASK_INITIAL_DATABASE 0x00000400 -#define BTRFSIC_PRINT_MASK_NUM_COPIES 0x00000800 -#define BTRFSIC_PRINT_MASK_TREE_WITH_ALL_MIRRORS 0x00001000 - -struct btrfsic_dev_state; -struct btrfsic_state; - -struct btrfsic_block { - u32 magic_num; /* only used for debug purposes */ - unsigned int is_metadata:1; /* if it is meta-data, not data-data */ - unsigned int is_superblock:1; /* if it is one of the superblocks */ - unsigned int is_iodone:1; /* if it is done by lower subsystem */ - unsigned int iodone_w_error:1; /* error was indicated to endio */ - unsigned int never_written:1; /* block was added because it was - * referenced, not because it was - * written */ - unsigned int mirror_num:2; /* large enough to hold - * BTRFS_SUPER_MIRROR_MAX */ - struct btrfsic_dev_state *dev_state; - u64 dev_bytenr; /* key, physical byte num on disk */ - u64 logical_bytenr; /* logical byte num on disk */ - u64 generation; - struct btrfs_disk_key disk_key; /* extra info to print in case of - * issues, will not always be correct */ - struct list_head collision_resolving_node; /* list node */ - struct list_head all_blocks_node; /* list node */ - - /* the following two lists contain block_link items */ - struct list_head ref_to_list; /* list */ - struct list_head ref_from_list; /* list */ - struct btrfsic_block *next_in_same_bio; - void *orig_bio_bh_private; - union { - bio_end_io_t *bio; - bh_end_io_t *bh; - } orig_bio_bh_end_io; - int submit_bio_bh_rw; - u64 flush_gen; /* only valid if !never_written */ -}; - -/* - * Elements of this type are allocated dynamically and required because - * each block object can refer to and can be referenced from multiple blocks. - * The key to look them up in the hashtable is the dev_bytenr of - * the block referred to plus the one from the block referred from. - * The fact that they are searchable via a hashtable and that a - * ref_cnt is maintained is not required for the btrfs integrity - * check algorithm itself, it is only used to make the output more - * beautiful in case an error is detected (an error is defined - * as a write operation to a block while that block is still referenced). 
- */ -struct btrfsic_block_link { - u32 magic_num; /* only used for debug purposes */ - u32 ref_cnt; - struct list_head node_ref_to; /* list node */ - struct list_head node_ref_from; /* list node */ - struct list_head collision_resolving_node; /* list node */ - struct btrfsic_block *block_ref_to; - struct btrfsic_block *block_ref_from; - u64 parent_generation; -}; - -struct btrfsic_dev_state { - u32 magic_num; /* only used for debug purposes */ - struct block_device *bdev; - struct btrfsic_state *state; - struct list_head collision_resolving_node; /* list node */ - struct btrfsic_block dummy_block_for_bio_bh_flush; - u64 last_flush_gen; - char name[BDEVNAME_SIZE]; -}; - -struct btrfsic_block_hashtable { - struct list_head table[BTRFSIC_BLOCK_HASHTABLE_SIZE]; -}; - -struct btrfsic_block_link_hashtable { - struct list_head table[BTRFSIC_BLOCK_LINK_HASHTABLE_SIZE]; -}; - -struct btrfsic_dev_state_hashtable { - struct list_head table[BTRFSIC_DEV2STATE_HASHTABLE_SIZE]; -}; - -struct btrfsic_block_data_ctx { - u64 start; /* virtual bytenr */ - u64 dev_bytenr; /* physical bytenr on device */ - u32 len; - struct btrfsic_dev_state *dev; - char *data; - struct buffer_head *bh; /* do not use if set to NULL */ -}; - -/* This structure is used to implement recursion without occupying - * any stack space, refer to btrfsic_process_metablock() */ -struct btrfsic_stack_frame { - u32 magic; - u32 nr; - int error; - int i; - int limit_nesting; - int num_copies; - int mirror_num; - struct btrfsic_block *block; - struct btrfsic_block_data_ctx *block_ctx; - struct btrfsic_block *next_block; - struct btrfsic_block_data_ctx next_block_ctx; - struct btrfs_header *hdr; - struct btrfsic_stack_frame *prev; -}; - -/* Some state per mounted filesystem */ -struct btrfsic_state { - u32 print_mask; - int include_extent_data; - int csum_size; - struct list_head all_blocks_list; - struct btrfsic_block_hashtable block_hashtable; - struct btrfsic_block_link_hashtable block_link_hashtable; - struct btrfs_root *root; - u64 max_superblock_generation; - struct btrfsic_block *latest_superblock; -}; - -static void btrfsic_block_init(struct btrfsic_block *b); -static struct btrfsic_block *btrfsic_block_alloc(void); -static void btrfsic_block_free(struct btrfsic_block *b); -static void btrfsic_block_link_init(struct btrfsic_block_link *n); -static struct btrfsic_block_link *btrfsic_block_link_alloc(void); -static void btrfsic_block_link_free(struct btrfsic_block_link *n); -static void btrfsic_dev_state_init(struct btrfsic_dev_state *ds); -static struct btrfsic_dev_state *btrfsic_dev_state_alloc(void); -static void btrfsic_dev_state_free(struct btrfsic_dev_state *ds); -static void btrfsic_block_hashtable_init(struct btrfsic_block_hashtable *h); -static void btrfsic_block_hashtable_add(struct btrfsic_block *b, - struct btrfsic_block_hashtable *h); -static void btrfsic_block_hashtable_remove(struct btrfsic_block *b); -static struct btrfsic_block *btrfsic_block_hashtable_lookup( - struct block_device *bdev, - u64 dev_bytenr, - struct btrfsic_block_hashtable *h); -static void btrfsic_block_link_hashtable_init( - struct btrfsic_block_link_hashtable *h); -static void btrfsic_block_link_hashtable_add( - struct btrfsic_block_link *l, - struct btrfsic_block_link_hashtable *h); -static void btrfsic_block_link_hashtable_remove(struct btrfsic_block_link *l); -static struct btrfsic_block_link *btrfsic_block_link_hashtable_lookup( - struct block_device *bdev_ref_to, - u64 dev_bytenr_ref_to, - struct block_device *bdev_ref_from, - u64 
dev_bytenr_ref_from, - struct btrfsic_block_link_hashtable *h); -static void btrfsic_dev_state_hashtable_init( - struct btrfsic_dev_state_hashtable *h); -static void btrfsic_dev_state_hashtable_add( - struct btrfsic_dev_state *ds, - struct btrfsic_dev_state_hashtable *h); -static void btrfsic_dev_state_hashtable_remove(struct btrfsic_dev_state *ds); -static struct btrfsic_dev_state *btrfsic_dev_state_hashtable_lookup( - struct block_device *bdev, - struct btrfsic_dev_state_hashtable *h); -static struct btrfsic_stack_frame *btrfsic_stack_frame_alloc(void); -static void btrfsic_stack_frame_free(struct btrfsic_stack_frame *sf); -static int btrfsic_process_superblock(struct btrfsic_state *state, - struct btrfs_fs_devices *fs_devices); -static int btrfsic_process_metablock(struct btrfsic_state *state, - struct btrfsic_block *block, - struct btrfsic_block_data_ctx *block_ctx, - struct btrfs_header *hdr, - int limit_nesting, int force_iodone_flag); -static int btrfsic_create_link_to_next_block( - struct btrfsic_state *state, - struct btrfsic_block *block, - struct btrfsic_block_data_ctx - *block_ctx, u64 next_bytenr, - int limit_nesting, - struct btrfsic_block_data_ctx *next_block_ctx, - struct btrfsic_block **next_blockp, - int force_iodone_flag, - int *num_copiesp, int *mirror_nump, - struct btrfs_disk_key *disk_key, - u64 parent_generation); -static int btrfsic_handle_extent_data(struct btrfsic_state *state, - struct btrfsic_block *block, - struct btrfsic_block_data_ctx *block_ctx, - u32 item_offset, int force_iodone_flag); -static int btrfsic_map_block(struct btrfsic_state *state, u64 bytenr, u32 len, - struct btrfsic_block_data_ctx *block_ctx_out, - int mirror_num); -static int btrfsic_map_superblock(struct btrfsic_state *state, u64 bytenr, - u32 len, struct block_device *bdev, - struct btrfsic_block_data_ctx *block_ctx_out); -static void btrfsic_release_block_ctx(struct btrfsic_block_data_ctx *block_ctx); -static int btrfsic_read_block(struct btrfsic_state *state, - struct btrfsic_block_data_ctx *block_ctx); -static void btrfsic_dump_database(struct btrfsic_state *state); -static int btrfsic_test_for_metadata(struct btrfsic_state *state, - const u8 *data, unsigned int size); -static void btrfsic_process_written_block(struct btrfsic_dev_state *dev_state, - u64 dev_bytenr, u8 *mapped_data, - unsigned int len, struct bio *bio, - int *bio_is_patched, - struct buffer_head *bh, - int submit_bio_bh_rw); -static int btrfsic_process_written_superblock( - struct btrfsic_state *state, - struct btrfsic_block *const block, - struct btrfs_super_block *const super_hdr); -static void btrfsic_bio_end_io(struct bio *bp, int bio_error_status); -static void btrfsic_bh_end_io(struct buffer_head *bh, int uptodate); -static int btrfsic_is_block_ref_by_superblock(const struct btrfsic_state *state, - const struct btrfsic_block *block, - int recursion_level); -static int btrfsic_check_all_ref_blocks(struct btrfsic_state *state, - struct btrfsic_block *const block, - int recursion_level); -static void btrfsic_print_add_link(const struct btrfsic_state *state, - const struct btrfsic_block_link *l); -static void btrfsic_print_rem_link(const struct btrfsic_state *state, - const struct btrfsic_block_link *l); -static char btrfsic_get_block_type(const struct btrfsic_state *state, - const struct btrfsic_block *block); -static void btrfsic_dump_tree(const struct btrfsic_state *state); -static void btrfsic_dump_tree_sub(const struct btrfsic_state *state, - const struct btrfsic_block *block, - int indent_level); -static struct 
btrfsic_block_link *btrfsic_block_link_lookup_or_add( - struct btrfsic_state *state, - struct btrfsic_block_data_ctx *next_block_ctx, - struct btrfsic_block *next_block, - struct btrfsic_block *from_block, - u64 parent_generation); -static struct btrfsic_block *btrfsic_block_lookup_or_add( - struct btrfsic_state *state, - struct btrfsic_block_data_ctx *block_ctx, - const char *additional_string, - int is_metadata, - int is_iodone, - int never_written, - int mirror_num, - int *was_created); -static int btrfsic_process_superblock_dev_mirror( - struct btrfsic_state *state, - struct btrfsic_dev_state *dev_state, - struct btrfs_device *device, - int superblock_mirror_num, - struct btrfsic_dev_state **selected_dev_state, - struct btrfs_super_block *selected_super); -static struct btrfsic_dev_state *btrfsic_dev_state_lookup( - struct block_device *bdev); -static void btrfsic_cmp_log_and_dev_bytenr(struct btrfsic_state *state, - u64 bytenr, - struct btrfsic_dev_state *dev_state, - u64 dev_bytenr, char *data); - -static struct mutex btrfsic_mutex; -static int btrfsic_is_initialized; -static struct btrfsic_dev_state_hashtable btrfsic_dev_state_hashtable; - - -static void btrfsic_block_init(struct btrfsic_block *b) -{ - b->magic_num = BTRFSIC_BLOCK_MAGIC_NUMBER; - b->dev_state = NULL; - b->dev_bytenr = 0; - b->logical_bytenr = 0; - b->generation = BTRFSIC_GENERATION_UNKNOWN; - b->disk_key.objectid = 0; - b->disk_key.type = 0; - b->disk_key.offset = 0; - b->is_metadata = 0; - b->is_superblock = 0; - b->is_iodone = 0; - b->iodone_w_error = 0; - b->never_written = 0; - b->mirror_num = 0; - b->next_in_same_bio = NULL; - b->orig_bio_bh_private = NULL; - b->orig_bio_bh_end_io.bio = NULL; - INIT_LIST_HEAD(&b->collision_resolving_node); - INIT_LIST_HEAD(&b->all_blocks_node); - INIT_LIST_HEAD(&b->ref_to_list); - INIT_LIST_HEAD(&b->ref_from_list); - b->submit_bio_bh_rw = 0; - b->flush_gen = 0; -} - -static struct btrfsic_block *btrfsic_block_alloc(void) -{ - struct btrfsic_block *b; - - b = kzalloc(sizeof(*b), GFP_NOFS); - if (NULL != b) - btrfsic_block_init(b); - - return b; -} - -static void btrfsic_block_free(struct btrfsic_block *b) -{ - BUG_ON(!(NULL == b || BTRFSIC_BLOCK_MAGIC_NUMBER == b->magic_num)); - kfree(b); -} - -static void btrfsic_block_link_init(struct btrfsic_block_link *l) -{ - l->magic_num = BTRFSIC_BLOCK_LINK_MAGIC_NUMBER; - l->ref_cnt = 1; - INIT_LIST_HEAD(&l->node_ref_to); - INIT_LIST_HEAD(&l->node_ref_from); - INIT_LIST_HEAD(&l->collision_resolving_node); - l->block_ref_to = NULL; - l->block_ref_from = NULL; -} - -static struct btrfsic_block_link *btrfsic_block_link_alloc(void) -{ - struct btrfsic_block_link *l; - - l = kzalloc(sizeof(*l), GFP_NOFS); - if (NULL != l) - btrfsic_block_link_init(l); - - return l; -} - -static void btrfsic_block_link_free(struct btrfsic_block_link *l) -{ - BUG_ON(!(NULL == l || BTRFSIC_BLOCK_LINK_MAGIC_NUMBER == l->magic_num)); - kfree(l); -} - -static void btrfsic_dev_state_init(struct btrfsic_dev_state *ds) -{ - ds->magic_num = BTRFSIC_DEV2STATE_MAGIC_NUMBER; - ds->bdev = NULL; - ds->state = NULL; - ds->name[0] = '\0'; - INIT_LIST_HEAD(&ds->collision_resolving_node); - ds->last_flush_gen = 0; - btrfsic_block_init(&ds->dummy_block_for_bio_bh_flush); - ds->dummy_block_for_bio_bh_flush.is_iodone = 1; - ds->dummy_block_for_bio_bh_flush.dev_state = ds; -} - -static struct btrfsic_dev_state *btrfsic_dev_state_alloc(void) -{ - struct btrfsic_dev_state *ds; - - ds = kzalloc(sizeof(*ds), GFP_NOFS); - if (NULL != ds) - btrfsic_dev_state_init(ds); - - return ds; -} - 
-static void btrfsic_dev_state_free(struct btrfsic_dev_state *ds) -{ - BUG_ON(!(NULL == ds || - BTRFSIC_DEV2STATE_MAGIC_NUMBER == ds->magic_num)); - kfree(ds); -} - -static void btrfsic_block_hashtable_init(struct btrfsic_block_hashtable *h) -{ - int i; - - for (i = 0; i < BTRFSIC_BLOCK_HASHTABLE_SIZE; i++) - INIT_LIST_HEAD(h->table + i); -} - -static void btrfsic_block_hashtable_add(struct btrfsic_block *b, - struct btrfsic_block_hashtable *h) -{ - const unsigned int hashval = - (((unsigned int)(b->dev_bytenr >> 16)) ^ - ((unsigned int)((uintptr_t)b->dev_state->bdev))) & - (BTRFSIC_BLOCK_HASHTABLE_SIZE - 1); - - list_add(&b->collision_resolving_node, h->table + hashval); -} - -static void btrfsic_block_hashtable_remove(struct btrfsic_block *b) -{ - list_del(&b->collision_resolving_node); -} - -static struct btrfsic_block *btrfsic_block_hashtable_lookup( - struct block_device *bdev, - u64 dev_bytenr, - struct btrfsic_block_hashtable *h) -{ - const unsigned int hashval = - (((unsigned int)(dev_bytenr >> 16)) ^ - ((unsigned int)((uintptr_t)bdev))) & - (BTRFSIC_BLOCK_HASHTABLE_SIZE - 1); - struct list_head *elem; - - list_for_each(elem, h->table + hashval) { - struct btrfsic_block *const b = - list_entry(elem, struct btrfsic_block, - collision_resolving_node); - - if (b->dev_state->bdev == bdev && b->dev_bytenr == dev_bytenr) - return b; - } - - return NULL; -} - -static void btrfsic_block_link_hashtable_init( - struct btrfsic_block_link_hashtable *h) -{ - int i; - - for (i = 0; i < BTRFSIC_BLOCK_LINK_HASHTABLE_SIZE; i++) - INIT_LIST_HEAD(h->table + i); -} - -static void btrfsic_block_link_hashtable_add( - struct btrfsic_block_link *l, - struct btrfsic_block_link_hashtable *h) -{ - const unsigned int hashval = - (((unsigned int)(l->block_ref_to->dev_bytenr >> 16)) ^ - ((unsigned int)(l->block_ref_from->dev_bytenr >> 16)) ^ - ((unsigned int)((uintptr_t)l->block_ref_to->dev_state->bdev)) ^ - ((unsigned int)((uintptr_t)l->block_ref_from->dev_state->bdev))) - & (BTRFSIC_BLOCK_LINK_HASHTABLE_SIZE - 1); - - BUG_ON(NULL == l->block_ref_to); - BUG_ON(NULL == l->block_ref_from); - list_add(&l->collision_resolving_node, h->table + hashval); -} - -static void btrfsic_block_link_hashtable_remove(struct btrfsic_block_link *l) -{ - list_del(&l->collision_resolving_node); -} - -static struct btrfsic_block_link *btrfsic_block_link_hashtable_lookup( - struct block_device *bdev_ref_to, - u64 dev_bytenr_ref_to, - struct block_device *bdev_ref_from, - u64 dev_bytenr_ref_from, - struct btrfsic_block_link_hashtable *h) -{ - const unsigned int hashval = - (((unsigned int)(dev_bytenr_ref_to >> 16)) ^ - ((unsigned int)(dev_bytenr_ref_from >> 16)) ^ - ((unsigned int)((uintptr_t)bdev_ref_to)) ^ - ((unsigned int)((uintptr_t)bdev_ref_from))) & - (BTRFSIC_BLOCK_LINK_HASHTABLE_SIZE - 1); - struct list_head *elem; - - list_for_each(elem, h->table + hashval) { - struct btrfsic_block_link *const l = - list_entry(elem, struct btrfsic_block_link, - collision_resolving_node); - - BUG_ON(NULL == l->block_ref_to); - BUG_ON(NULL == l->block_ref_from); - if (l->block_ref_to->dev_state->bdev == bdev_ref_to && - l->block_ref_to->dev_bytenr == dev_bytenr_ref_to && - l->block_ref_from->dev_state->bdev == bdev_ref_from && - l->block_ref_from->dev_bytenr == dev_bytenr_ref_from) - return l; - } - - return NULL; -} - -static void btrfsic_dev_state_hashtable_init( - struct btrfsic_dev_state_hashtable *h) -{ - int i; - - for (i = 0; i < BTRFSIC_DEV2STATE_HASHTABLE_SIZE; i++) - INIT_LIST_HEAD(h->table + i); -} - -static void 
btrfsic_dev_state_hashtable_add( - struct btrfsic_dev_state *ds, - struct btrfsic_dev_state_hashtable *h) -{ - const unsigned int hashval = - (((unsigned int)((uintptr_t)ds->bdev)) & - (BTRFSIC_DEV2STATE_HASHTABLE_SIZE - 1)); - - list_add(&ds->collision_resolving_node, h->table + hashval); -} - -static void btrfsic_dev_state_hashtable_remove(struct btrfsic_dev_state *ds) -{ - list_del(&ds->collision_resolving_node); -} - -static struct btrfsic_dev_state *btrfsic_dev_state_hashtable_lookup( - struct block_device *bdev, - struct btrfsic_dev_state_hashtable *h) -{ - const unsigned int hashval = - (((unsigned int)((uintptr_t)bdev)) & - (BTRFSIC_DEV2STATE_HASHTABLE_SIZE - 1)); - struct list_head *elem; - - list_for_each(elem, h->table + hashval) { - struct btrfsic_dev_state *const ds = - list_entry(elem, struct btrfsic_dev_state, - collision_resolving_node); - - if (ds->bdev == bdev) - return ds; - } - - return NULL; -} - -static int btrfsic_process_superblock(struct btrfsic_state *state, - struct btrfs_fs_devices *fs_devices) -{ - int ret = 0; - struct btrfs_super_block *selected_super; - struct list_head *dev_head = &fs_devices->devices; - struct btrfs_device *device; - struct btrfsic_dev_state *selected_dev_state = NULL; - int pass; - - BUG_ON(NULL == state); - selected_super = kmalloc(sizeof(*selected_super), GFP_NOFS); - if (NULL == selected_super) { - printk(KERN_INFO "btrfsic: error, kmalloc failed!\n"); - return -1; - } - - list_for_each_entry(device, dev_head, dev_list) { - int i; - struct btrfsic_dev_state *dev_state; - - if (!device->bdev || !device->name) - continue; - - dev_state = btrfsic_dev_state_lookup(device->bdev); - BUG_ON(NULL == dev_state); - for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) { - ret = btrfsic_process_superblock_dev_mirror( - state, dev_state, device, i, - &selected_dev_state, selected_super); - if (0 != ret && 0 == i) { - kfree(selected_super); - return ret; - } - } - } - - if (NULL == state->latest_superblock) { - printk(KERN_INFO "btrfsic: no superblock found!\n"); - kfree(selected_super); - return -1; - } - - state->csum_size = btrfs_super_csum_size(selected_super); - - for (pass = 0; pass < 3; pass++) { - int num_copies; - int mirror_num; - u64 next_bytenr; - - switch (pass) { - case 0: - next_bytenr = btrfs_super_root(selected_super); - if (state->print_mask & - BTRFSIC_PRINT_MASK_ROOT_CHUNK_LOG_TREE_LOCATION) - printk(KERN_INFO "root@%llu\n", - (unsigned long long)next_bytenr); - break; - case 1: - next_bytenr = btrfs_super_chunk_root(selected_super); - if (state->print_mask & - BTRFSIC_PRINT_MASK_ROOT_CHUNK_LOG_TREE_LOCATION) - printk(KERN_INFO "chunk@%llu\n", - (unsigned long long)next_bytenr); - break; - case 2: - next_bytenr = btrfs_super_log_root(selected_super); - if (0 == next_bytenr) - continue; - if (state->print_mask & - BTRFSIC_PRINT_MASK_ROOT_CHUNK_LOG_TREE_LOCATION) - printk(KERN_INFO "log@%llu\n", - (unsigned long long)next_bytenr); - break; - } - - num_copies = - btrfs_num_copies(&state->root->fs_info->mapping_tree, - next_bytenr, PAGE_SIZE); - if (state->print_mask & BTRFSIC_PRINT_MASK_NUM_COPIES) - printk(KERN_INFO "num_copies(log_bytenr=%llu) = %d\n", - (unsigned long long)next_bytenr, num_copies); - - for (mirror_num = 1; mirror_num <= num_copies; mirror_num++) { - struct btrfsic_block *next_block; - struct btrfsic_block_data_ctx tmp_next_block_ctx; - struct btrfsic_block_link *l; - struct btrfs_header *hdr; - - ret = btrfsic_map_block(state, next_bytenr, PAGE_SIZE, - &tmp_next_block_ctx, - mirror_num); - if (ret) { - printk(KERN_INFO 
"btrfsic:" - " btrfsic_map_block(root @%llu," - " mirror %d) failed!\n", - (unsigned long long)next_bytenr, - mirror_num); - kfree(selected_super); - return -1; - } - - next_block = btrfsic_block_hashtable_lookup( - tmp_next_block_ctx.dev->bdev, - tmp_next_block_ctx.dev_bytenr, - &state->block_hashtable); - BUG_ON(NULL == next_block); - - l = btrfsic_block_link_hashtable_lookup( - tmp_next_block_ctx.dev->bdev, - tmp_next_block_ctx.dev_bytenr, - state->latest_superblock->dev_state-> - bdev, - state->latest_superblock->dev_bytenr, - &state->block_link_hashtable); - BUG_ON(NULL == l); - - ret = btrfsic_read_block(state, &tmp_next_block_ctx); - if (ret < (int)BTRFSIC_BLOCK_SIZE) { - printk(KERN_INFO - "btrfsic: read @logical %llu failed!\n", - (unsigned long long) - tmp_next_block_ctx.start); - btrfsic_release_block_ctx(&tmp_next_block_ctx); - kfree(selected_super); - return -1; - } - - hdr = (struct btrfs_header *)tmp_next_block_ctx.data; - ret = btrfsic_process_metablock(state, - next_block, - &tmp_next_block_ctx, - hdr, - BTRFS_MAX_LEVEL + 3, 1); - btrfsic_release_block_ctx(&tmp_next_block_ctx); - } - } - - kfree(selected_super); - return ret; -} - -static int btrfsic_process_superblock_dev_mirror( - struct btrfsic_state *state, - struct btrfsic_dev_state *dev_state, - struct btrfs_device *device, - int superblock_mirror_num, - struct btrfsic_dev_state **selected_dev_state, - struct btrfs_super_block *selected_super) -{ - struct btrfs_super_block *super_tmp; - u64 dev_bytenr; - struct buffer_head *bh; - struct btrfsic_block *superblock_tmp; - int pass; - struct block_device *const superblock_bdev = device->bdev; - - /* super block bytenr is always the unmapped device bytenr */ - dev_bytenr = btrfs_sb_offset(superblock_mirror_num); - bh = __bread(superblock_bdev, dev_bytenr / 4096, 4096); - if (NULL == bh) - return -1; - super_tmp = (struct btrfs_super_block *) - (bh->b_data + (dev_bytenr & 4095)); - - if (btrfs_super_bytenr(super_tmp) != dev_bytenr || - strncmp((char *)(&(super_tmp->magic)), BTRFS_MAGIC, - sizeof(super_tmp->magic)) || - memcmp(device->uuid, super_tmp->dev_item.uuid, BTRFS_UUID_SIZE)) { - brelse(bh); - return 0; - } - - superblock_tmp = - btrfsic_block_hashtable_lookup(superblock_bdev, - dev_bytenr, - &state->block_hashtable); - if (NULL == superblock_tmp) { - superblock_tmp = btrfsic_block_alloc(); - if (NULL == superblock_tmp) { - printk(KERN_INFO "btrfsic: error, kmalloc failed!\n"); - brelse(bh); - return -1; - } - /* for superblock, only the dev_bytenr makes sense */ - superblock_tmp->dev_bytenr = dev_bytenr; - superblock_tmp->dev_state = dev_state; - superblock_tmp->logical_bytenr = dev_bytenr; - superblock_tmp->generation = btrfs_super_generation(super_tmp); - superblock_tmp->is_metadata = 1; - superblock_tmp->is_superblock = 1; - superblock_tmp->is_iodone = 1; - superblock_tmp->never_written = 0; - superblock_tmp->mirror_num = 1 + superblock_mirror_num; - if (state->print_mask & BTRFSIC_PRINT_MASK_SUPERBLOCK_WRITE) - printk(KERN_INFO "New initial S-block (bdev %p, %s)" - " @%llu (%s/%llu/%d)\n", - superblock_bdev, device->name, - (unsigned long long)dev_bytenr, - dev_state->name, - (unsigned long long)dev_bytenr, - superblock_mirror_num); - list_add(&superblock_tmp->all_blocks_node, - &state->all_blocks_list); - btrfsic_block_hashtable_add(superblock_tmp, - &state->block_hashtable); - } - - /* select the one with the highest generation field */ - if (btrfs_super_generation(super_tmp) > - state->max_superblock_generation || - 0 == state->max_superblock_generation) { - 
memcpy(selected_super, super_tmp, sizeof(*selected_super)); - *selected_dev_state = dev_state; - state->max_superblock_generation = - btrfs_super_generation(super_tmp); - state->latest_superblock = superblock_tmp; - } - - for (pass = 0; pass < 3; pass++) { - u64 next_bytenr; - int num_copies; - int mirror_num; - const char *additional_string = NULL; - struct btrfs_disk_key tmp_disk_key; - - tmp_disk_key.type = BTRFS_ROOT_ITEM_KEY; - tmp_disk_key.offset = 0; - switch (pass) { - case 0: - tmp_disk_key.objectid = - cpu_to_le64(BTRFS_ROOT_TREE_OBJECTID); - additional_string = "initial root "; - next_bytenr = btrfs_super_root(super_tmp); - break; - case 1: - tmp_disk_key.objectid = - cpu_to_le64(BTRFS_CHUNK_TREE_OBJECTID); - additional_string = "initial chunk "; - next_bytenr = btrfs_super_chunk_root(super_tmp); - break; - case 2: - tmp_disk_key.objectid = - cpu_to_le64(BTRFS_TREE_LOG_OBJECTID); - additional_string = "initial log "; - next_bytenr = btrfs_super_log_root(super_tmp); - if (0 == next_bytenr) - continue; - break; - } - - num_copies = - btrfs_num_copies(&state->root->fs_info->mapping_tree, - next_bytenr, PAGE_SIZE); - if (state->print_mask & BTRFSIC_PRINT_MASK_NUM_COPIES) - printk(KERN_INFO "num_copies(log_bytenr=%llu) = %d\n", - (unsigned long long)next_bytenr, num_copies); - for (mirror_num = 1; mirror_num <= num_copies; mirror_num++) { - struct btrfsic_block *next_block; - struct btrfsic_block_data_ctx tmp_next_block_ctx; - struct btrfsic_block_link *l; - - if (btrfsic_map_block(state, next_bytenr, PAGE_SIZE, - &tmp_next_block_ctx, - mirror_num)) { - printk(KERN_INFO "btrfsic: btrfsic_map_block(" - "bytenr @%llu, mirror %d) failed!\n", - (unsigned long long)next_bytenr, - mirror_num); - brelse(bh); - return -1; - } - - next_block = btrfsic_block_lookup_or_add( - state, &tmp_next_block_ctx, - additional_string, 1, 1, 0, - mirror_num, NULL); - if (NULL == next_block) { - btrfsic_release_block_ctx(&tmp_next_block_ctx); - brelse(bh); - return -1; - } - - next_block->disk_key = tmp_disk_key; - next_block->generation = BTRFSIC_GENERATION_UNKNOWN; - l = btrfsic_block_link_lookup_or_add( - state, &tmp_next_block_ctx, - next_block, superblock_tmp, - BTRFSIC_GENERATION_UNKNOWN); - btrfsic_release_block_ctx(&tmp_next_block_ctx); - if (NULL == l) { - brelse(bh); - return -1; - } - } - } - if (state->print_mask & BTRFSIC_PRINT_MASK_INITIAL_ALL_TREES) - btrfsic_dump_tree_sub(state, superblock_tmp, 0); - - brelse(bh); - return 0; -} - -static struct btrfsic_stack_frame *btrfsic_stack_frame_alloc(void) -{ - struct btrfsic_stack_frame *sf; - - sf = kzalloc(sizeof(*sf), GFP_NOFS); - if (NULL == sf) - printk(KERN_INFO "btrfsic: alloc memory failed!\n"); - else - sf->magic = BTRFSIC_BLOCK_STACK_FRAME_MAGIC_NUMBER; - return sf; -} - -static void btrfsic_stack_frame_free(struct btrfsic_stack_frame *sf) -{ - BUG_ON(!(NULL == sf || - BTRFSIC_BLOCK_STACK_FRAME_MAGIC_NUMBER == sf->magic)); - kfree(sf); -} - -static int btrfsic_process_metablock( - struct btrfsic_state *state, - struct btrfsic_block *const first_block, - struct btrfsic_block_data_ctx *const first_block_ctx, - struct btrfs_header *const first_hdr, - int first_limit_nesting, int force_iodone_flag) -{ - struct btrfsic_stack_frame initial_stack_frame = { 0 }; - struct btrfsic_stack_frame *sf; - struct btrfsic_stack_frame *next_stack; - - sf = &initial_stack_frame; - sf->error = 0; - sf->i = -1; - sf->limit_nesting = first_limit_nesting; - sf->block = first_block; - sf->block_ctx = first_block_ctx; - sf->next_block = NULL; - sf->hdr = first_hdr; 
- sf->prev = NULL; - -continue_with_new_stack_frame: - sf->block->generation = le64_to_cpu(sf->hdr->generation); - if (0 == sf->hdr->level) { - struct btrfs_leaf *const leafhdr = - (struct btrfs_leaf *)sf->hdr; - - if (-1 == sf->i) { - sf->nr = le32_to_cpu(leafhdr->header.nritems); - - if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE) - printk(KERN_INFO - "leaf %llu items %d generation %llu" - " owner %llu\n", - (unsigned long long) - sf->block_ctx->start, - sf->nr, - (unsigned long long) - le64_to_cpu(leafhdr->header.generation), - (unsigned long long) - le64_to_cpu(leafhdr->header.owner)); - } - -continue_with_current_leaf_stack_frame: - if (0 == sf->num_copies || sf->mirror_num > sf->num_copies) { - sf->i++; - sf->num_copies = 0; - } - - if (sf->i < sf->nr) { - struct btrfs_item *disk_item = leafhdr->items + sf->i; - struct btrfs_disk_key *disk_key = &disk_item->key; - u8 type; - const u32 item_offset = le32_to_cpu(disk_item->offset); - - type = disk_key->type; - - if (BTRFS_ROOT_ITEM_KEY == type) { - const struct btrfs_root_item *const root_item = - (struct btrfs_root_item *) - (sf->block_ctx->data + - offsetof(struct btrfs_leaf, items) + - item_offset); - const u64 next_bytenr = - le64_to_cpu(root_item->bytenr); - - sf->error = - btrfsic_create_link_to_next_block( - state, - sf->block, - sf->block_ctx, - next_bytenr, - sf->limit_nesting, - &sf->next_block_ctx, - &sf->next_block, - force_iodone_flag, - &sf->num_copies, - &sf->mirror_num, - disk_key, - le64_to_cpu(root_item-> - generation)); - if (sf->error) - goto one_stack_frame_backwards; - - if (NULL != sf->next_block) { - struct btrfs_header *const next_hdr = - (struct btrfs_header *) - sf->next_block_ctx.data; - - next_stack = - btrfsic_stack_frame_alloc(); - if (NULL == next_stack) { - btrfsic_release_block_ctx( - &sf-> - next_block_ctx); - goto one_stack_frame_backwards; - } - - next_stack->i = -1; - next_stack->block = sf->next_block; - next_stack->block_ctx = - &sf->next_block_ctx; - next_stack->next_block = NULL; - next_stack->hdr = next_hdr; - next_stack->limit_nesting = - sf->limit_nesting - 1; - next_stack->prev = sf; - sf = next_stack; - goto continue_with_new_stack_frame; - } - } else if (BTRFS_EXTENT_DATA_KEY == type && - state->include_extent_data) { - sf->error = btrfsic_handle_extent_data( - state, - sf->block, - sf->block_ctx, - item_offset, - force_iodone_flag); - if (sf->error) - goto one_stack_frame_backwards; - } - - goto continue_with_current_leaf_stack_frame; - } - } else { - struct btrfs_node *const nodehdr = (struct btrfs_node *)sf->hdr; - - if (-1 == sf->i) { - sf->nr = le32_to_cpu(nodehdr->header.nritems); - - if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE) - printk(KERN_INFO "node %llu level %d items %d" - " generation %llu owner %llu\n", - (unsigned long long) - sf->block_ctx->start, - nodehdr->header.level, sf->nr, - (unsigned long long) - le64_to_cpu(nodehdr->header.generation), - (unsigned long long) - le64_to_cpu(nodehdr->header.owner)); - } - -continue_with_current_node_stack_frame: - if (0 == sf->num_copies || sf->mirror_num > sf->num_copies) { - sf->i++; - sf->num_copies = 0; - } - - if (sf->i < sf->nr) { - struct btrfs_key_ptr *disk_key_ptr = - nodehdr->ptrs + sf->i; - const u64 next_bytenr = - le64_to_cpu(disk_key_ptr->blockptr); - - sf->error = btrfsic_create_link_to_next_block( - state, - sf->block, - sf->block_ctx, - next_bytenr, - sf->limit_nesting, - &sf->next_block_ctx, - &sf->next_block, - force_iodone_flag, - &sf->num_copies, - &sf->mirror_num, - &disk_key_ptr->key, - 
le64_to_cpu(disk_key_ptr->generation)); - if (sf->error) - goto one_stack_frame_backwards; - - if (NULL != sf->next_block) { - struct btrfs_header *const next_hdr = - (struct btrfs_header *) - sf->next_block_ctx.data; - - next_stack = btrfsic_stack_frame_alloc(); - if (NULL == next_stack) - goto one_stack_frame_backwards; - - next_stack->i = -1; - next_stack->block = sf->next_block; - next_stack->block_ctx = &sf->next_block_ctx; - next_stack->next_block = NULL; - next_stack->hdr = next_hdr; - next_stack->limit_nesting = - sf->limit_nesting - 1; - next_stack->prev = sf; - sf = next_stack; - goto continue_with_new_stack_frame; - } - - goto continue_with_current_node_stack_frame; - } - } - -one_stack_frame_backwards: - if (NULL != sf->prev) { - struct btrfsic_stack_frame *const prev = sf->prev; - - /* the one for the initial block is freed in the caller */ - btrfsic_release_block_ctx(sf->block_ctx); - - if (sf->error) { - prev->error = sf->error; - btrfsic_stack_frame_free(sf); - sf = prev; - goto one_stack_frame_backwards; - } - - btrfsic_stack_frame_free(sf); - sf = prev; - goto continue_with_new_stack_frame; - } else { - BUG_ON(&initial_stack_frame != sf); - } - - return sf->error; -} - -static int btrfsic_create_link_to_next_block( - struct btrfsic_state *state, - struct btrfsic_block *block, - struct btrfsic_block_data_ctx *block_ctx, - u64 next_bytenr, - int limit_nesting, - struct btrfsic_block_data_ctx *next_block_ctx, - struct btrfsic_block **next_blockp, - int force_iodone_flag, - int *num_copiesp, int *mirror_nump, - struct btrfs_disk_key *disk_key, - u64 parent_generation) -{ - struct btrfsic_block *next_block = NULL; - int ret; - struct btrfsic_block_link *l; - int did_alloc_block_link; - int block_was_created; - - *next_blockp = NULL; - if (0 == *num_copiesp) { - *num_copiesp = - btrfs_num_copies(&state->root->fs_info->mapping_tree, - next_bytenr, PAGE_SIZE); - if (state->print_mask & BTRFSIC_PRINT_MASK_NUM_COPIES) - printk(KERN_INFO "num_copies(log_bytenr=%llu) = %d\n", - (unsigned long long)next_bytenr, *num_copiesp); - *mirror_nump = 1; - } - - if (*mirror_nump > *num_copiesp) - return 0; - - if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE) - printk(KERN_INFO - "btrfsic_create_link_to_next_block(mirror_num=%d)\n", - *mirror_nump); - ret = btrfsic_map_block(state, next_bytenr, - BTRFSIC_BLOCK_SIZE, - next_block_ctx, *mirror_nump); - if (ret) { - printk(KERN_INFO - "btrfsic: btrfsic_map_block(@%llu, mirror=%d) failed!\n", - (unsigned long long)next_bytenr, *mirror_nump); - btrfsic_release_block_ctx(next_block_ctx); - *next_blockp = NULL; - return -1; - } - - next_block = btrfsic_block_lookup_or_add(state, - next_block_ctx, "referenced ", - 1, force_iodone_flag, - !force_iodone_flag, - *mirror_nump, - &block_was_created); - if (NULL == next_block) { - btrfsic_release_block_ctx(next_block_ctx); - *next_blockp = NULL; - return -1; - } - if (block_was_created) { - l = NULL; - next_block->generation = BTRFSIC_GENERATION_UNKNOWN; - } else { - if (next_block->logical_bytenr != next_bytenr && - !(!next_block->is_metadata && - 0 == next_block->logical_bytenr)) { - printk(KERN_INFO - "Referenced block @%llu (%s/%llu/%d)" - " found in hash table, %c," - " bytenr mismatch (!= stored %llu).\n", - (unsigned long long)next_bytenr, - next_block_ctx->dev->name, - (unsigned long long)next_block_ctx->dev_bytenr, - *mirror_nump, - btrfsic_get_block_type(state, next_block), - (unsigned long long)next_block->logical_bytenr); - } else if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE) - 
printk(KERN_INFO - "Referenced block @%llu (%s/%llu/%d)" - " found in hash table, %c.\n", - (unsigned long long)next_bytenr, - next_block_ctx->dev->name, - (unsigned long long)next_block_ctx->dev_bytenr, - *mirror_nump, - btrfsic_get_block_type(state, next_block)); - next_block->logical_bytenr = next_bytenr; - - next_block->mirror_num = *mirror_nump; - l = btrfsic_block_link_hashtable_lookup( - next_block_ctx->dev->bdev, - next_block_ctx->dev_bytenr, - block_ctx->dev->bdev, - block_ctx->dev_bytenr, - &state->block_link_hashtable); - } - - next_block->disk_key = *disk_key; - if (NULL == l) { - l = btrfsic_block_link_alloc(); - if (NULL == l) { - printk(KERN_INFO "btrfsic: error, kmalloc failed!\n"); - btrfsic_release_block_ctx(next_block_ctx); - *next_blockp = NULL; - return -1; - } - - did_alloc_block_link = 1; - l->block_ref_to = next_block; - l->block_ref_from = block; - l->ref_cnt = 1; - l->parent_generation = parent_generation; - - if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE) - btrfsic_print_add_link(state, l); - - list_add(&l->node_ref_to, &block->ref_to_list); - list_add(&l->node_ref_from, &next_block->ref_from_list); - - btrfsic_block_link_hashtable_add(l, - &state->block_link_hashtable); - } else { - did_alloc_block_link = 0; - if (0 == limit_nesting) { - l->ref_cnt++; - l->parent_generation = parent_generation; - if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE) - btrfsic_print_add_link(state, l); - } - } - - if (limit_nesting > 0 && did_alloc_block_link) { - ret = btrfsic_read_block(state, next_block_ctx); - if (ret < (int)BTRFSIC_BLOCK_SIZE) { - printk(KERN_INFO - "btrfsic: read block @logical %llu failed!\n", - (unsigned long long)next_bytenr); - btrfsic_release_block_ctx(next_block_ctx); - *next_blockp = NULL; - return -1; - } - - *next_blockp = next_block; - } else { - *next_blockp = NULL; - } - (*mirror_nump)++; - - return 0; -} - -static int btrfsic_handle_extent_data( - struct btrfsic_state *state, - struct btrfsic_block *block, - struct btrfsic_block_data_ctx *block_ctx, - u32 item_offset, int force_iodone_flag) -{ - int ret; - struct btrfs_file_extent_item *file_extent_item = - (struct btrfs_file_extent_item *)(block_ctx->data + - offsetof(struct btrfs_leaf, - items) + item_offset); - u64 next_bytenr = - le64_to_cpu(file_extent_item->disk_bytenr) + - le64_to_cpu(file_extent_item->offset); - u64 num_bytes = le64_to_cpu(file_extent_item->num_bytes); - u64 generation = le64_to_cpu(file_extent_item->generation); - struct btrfsic_block_link *l; - - if (state->print_mask & BTRFSIC_PRINT_MASK_VERY_VERBOSE) - printk(KERN_INFO "extent_data: type %u, disk_bytenr = %llu," - " offset = %llu, num_bytes = %llu\n", - file_extent_item->type, - (unsigned long long) - le64_to_cpu(file_extent_item->disk_bytenr), - (unsigned long long) - le64_to_cpu(file_extent_item->offset), - (unsigned long long) - le64_to_cpu(file_extent_item->num_bytes)); - if (BTRFS_FILE_EXTENT_REG != file_extent_item->type || - ((u64)0) == le64_to_cpu(file_extent_item->disk_bytenr)) - return 0; - while (num_bytes > 0) { - u32 chunk_len; - int num_copies; - int mirror_num; - - if (num_bytes > BTRFSIC_BLOCK_SIZE) - chunk_len = BTRFSIC_BLOCK_SIZE; - else - chunk_len = num_bytes; - - num_copies = - btrfs_num_copies(&state->root->fs_info->mapping_tree, - next_bytenr, PAGE_SIZE); - if (state->print_mask & BTRFSIC_PRINT_MASK_NUM_COPIES) - printk(KERN_INFO "num_copies(log_bytenr=%llu) = %d\n", - (unsigned long long)next_bytenr, num_copies); - for (mirror_num = 1; mirror_num <= num_copies; mirror_num++) { - struct 
btrfsic_block_data_ctx next_block_ctx; - struct btrfsic_block *next_block; - int block_was_created; - - if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE) - printk(KERN_INFO "btrfsic_handle_extent_data(" - "mirror_num=%d)\n", mirror_num); - if (state->print_mask & BTRFSIC_PRINT_MASK_VERY_VERBOSE) - printk(KERN_INFO - "\tdisk_bytenr = %llu, num_bytes %u\n", - (unsigned long long)next_bytenr, - chunk_len); - ret = btrfsic_map_block(state, next_bytenr, - chunk_len, &next_block_ctx, - mirror_num); - if (ret) { - printk(KERN_INFO - "btrfsic: btrfsic_map_block(@%llu," - " mirror=%d) failed!\n", - (unsigned long long)next_bytenr, - mirror_num); - return -1; - } - - next_block = btrfsic_block_lookup_or_add( - state, - &next_block_ctx, - "referenced ", - 0, - force_iodone_flag, - !force_iodone_flag, - mirror_num, - &block_was_created); - if (NULL == next_block) { - printk(KERN_INFO - "btrfsic: error, kmalloc failed!\n"); - btrfsic_release_block_ctx(&next_block_ctx); - return -1; - } - if (!block_was_created) { - if (next_block->logical_bytenr != next_bytenr && - !(!next_block->is_metadata && - 0 == next_block->logical_bytenr)) { - printk(KERN_INFO - "Referenced block" - " @%llu (%s/%llu/%d)" - " found in hash table, D," - " bytenr mismatch" - " (!= stored %llu).\n", - (unsigned long long)next_bytenr, - next_block_ctx.dev->name, - (unsigned long long) - next_block_ctx.dev_bytenr, - mirror_num, - (unsigned long long) - next_block->logical_bytenr); - } - next_block->logical_bytenr = next_bytenr; - next_block->mirror_num = mirror_num; - } - - l = btrfsic_block_link_lookup_or_add(state, - &next_block_ctx, - next_block, block, - generation); - btrfsic_release_block_ctx(&next_block_ctx); - if (NULL == l) - return -1; - } - - next_bytenr += chunk_len; - num_bytes -= chunk_len; - } - - return 0; -} - -static int btrfsic_map_block(struct btrfsic_state *state, u64 bytenr, u32 len, - struct btrfsic_block_data_ctx *block_ctx_out, - int mirror_num) -{ - int ret; - u64 length; - struct btrfs_bio *multi = NULL; - struct btrfs_device *device; - - length = len; - ret = btrfs_map_block(&state->root->fs_info->mapping_tree, READ, - bytenr, &length, &multi, mirror_num); - - device = multi->stripes[0].dev; - block_ctx_out->dev = btrfsic_dev_state_lookup(device->bdev); - block_ctx_out->dev_bytenr = multi->stripes[0].physical; - block_ctx_out->start = bytenr; - block_ctx_out->len = len; - block_ctx_out->data = NULL; - block_ctx_out->bh = NULL; - - if (0 == ret) - kfree(multi); - if (NULL == block_ctx_out->dev) { - ret = -ENXIO; - printk(KERN_INFO "btrfsic: error, cannot lookup dev (#1)!\n"); - } - - return ret; -} - -static int btrfsic_map_superblock(struct btrfsic_state *state, u64 bytenr, - u32 len, struct block_device *bdev, - struct btrfsic_block_data_ctx *block_ctx_out) -{ - block_ctx_out->dev = btrfsic_dev_state_lookup(bdev); - block_ctx_out->dev_bytenr = bytenr; - block_ctx_out->start = bytenr; - block_ctx_out->len = len; - block_ctx_out->data = NULL; - block_ctx_out->bh = NULL; - if (NULL != block_ctx_out->dev) { - return 0; - } else { - printk(KERN_INFO "btrfsic: error, cannot lookup dev (#2)!\n"); - return -ENXIO; - } -} - -static void btrfsic_release_block_ctx(struct btrfsic_block_data_ctx *block_ctx) -{ - if (NULL != block_ctx->bh) { - brelse(block_ctx->bh); - block_ctx->bh = NULL; - } -} - -static int btrfsic_read_block(struct btrfsic_state *state, - struct btrfsic_block_data_ctx *block_ctx) -{ - block_ctx->bh = NULL; - if (block_ctx->dev_bytenr & 4095) { - printk(KERN_INFO - "btrfsic: read_block() with 
unaligned bytenr %llu\n", - (unsigned long long)block_ctx->dev_bytenr); - return -1; - } - if (block_ctx->len > 4096) { - printk(KERN_INFO - "btrfsic: read_block() with too huge size %d\n", - block_ctx->len); - return -1; - } - - block_ctx->bh = __bread(block_ctx->dev->bdev, - block_ctx->dev_bytenr >> 12, 4096); - if (NULL == block_ctx->bh) - return -1; - block_ctx->data = block_ctx->bh->b_data; - - return block_ctx->len; -} - -static void btrfsic_dump_database(struct btrfsic_state *state) -{ - struct list_head *elem_all; - - BUG_ON(NULL == state); - - printk(KERN_INFO "all_blocks_list:\n"); - list_for_each(elem_all, &state->all_blocks_list) { - const struct btrfsic_block *const b_all = - list_entry(elem_all, struct btrfsic_block, - all_blocks_node); - struct list_head *elem_ref_to; - struct list_head *elem_ref_from; - - printk(KERN_INFO "%c-block @%llu (%s/%llu/%d)\n", - btrfsic_get_block_type(state, b_all), - (unsigned long long)b_all->logical_bytenr, - b_all->dev_state->name, - (unsigned long long)b_all->dev_bytenr, - b_all->mirror_num); - - list_for_each(elem_ref_to, &b_all->ref_to_list) { - const struct btrfsic_block_link *const l = - list_entry(elem_ref_to, - struct btrfsic_block_link, - node_ref_to); - - printk(KERN_INFO " %c @%llu (%s/%llu/%d)" - " refers %u* to" - " %c @%llu (%s/%llu/%d)\n", - btrfsic_get_block_type(state, b_all), - (unsigned long long)b_all->logical_bytenr, - b_all->dev_state->name, - (unsigned long long)b_all->dev_bytenr, - b_all->mirror_num, - l->ref_cnt, - btrfsic_get_block_type(state, l->block_ref_to), - (unsigned long long) - l->block_ref_to->logical_bytenr, - l->block_ref_to->dev_state->name, - (unsigned long long)l->block_ref_to->dev_bytenr, - l->block_ref_to->mirror_num); - } - - list_for_each(elem_ref_from, &b_all->ref_from_list) { - const struct btrfsic_block_link *const l = - list_entry(elem_ref_from, - struct btrfsic_block_link, - node_ref_from); - - printk(KERN_INFO " %c @%llu (%s/%llu/%d)" - " is ref %u* from" - " %c @%llu (%s/%llu/%d)\n", - btrfsic_get_block_type(state, b_all), - (unsigned long long)b_all->logical_bytenr, - b_all->dev_state->name, - (unsigned long long)b_all->dev_bytenr, - b_all->mirror_num, - l->ref_cnt, - btrfsic_get_block_type(state, l->block_ref_from), - (unsigned long long) - l->block_ref_from->logical_bytenr, - l->block_ref_from->dev_state->name, - (unsigned long long) - l->block_ref_from->dev_bytenr, - l->block_ref_from->mirror_num); - } - - printk(KERN_INFO "\n"); - } -} - -/* - * Test whether the disk block contains a tree block (leaf or node) - * (note that this test fails for the super block) - */ -static int btrfsic_test_for_metadata(struct btrfsic_state *state, - const u8 *data, unsigned int size) -{ - struct btrfs_header *h; - u8 csum[BTRFS_CSUM_SIZE]; - u32 crc = ~(u32)0; - int fail = 0; - int crc_fail = 0; - - h = (struct btrfs_header *)data; - - if (memcmp(h->fsid, state->root->fs_info->fsid, BTRFS_UUID_SIZE)) - fail++; - - crc = crc32c(crc, data + BTRFS_CSUM_SIZE, PAGE_SIZE - BTRFS_CSUM_SIZE); - btrfs_csum_final(crc, csum); - if (memcmp(csum, h->csum, state->csum_size)) - crc_fail++; - - return fail || crc_fail; -} - -static void btrfsic_process_written_block(struct btrfsic_dev_state *dev_state, - u64 dev_bytenr, - u8 *mapped_data, unsigned int len, - struct bio *bio, - int *bio_is_patched, - struct buffer_head *bh, - int submit_bio_bh_rw) -{ - int is_metadata; - struct btrfsic_block *block; - struct btrfsic_block_data_ctx block_ctx; - int ret; - struct btrfsic_state *state = dev_state->state; - struct 
block_device *bdev = dev_state->bdev; - - WARN_ON(len > PAGE_SIZE); - is_metadata = (0 == btrfsic_test_for_metadata(state, mapped_data, len)); - if (NULL != bio_is_patched) - *bio_is_patched = 0; - - block = btrfsic_block_hashtable_lookup(bdev, dev_bytenr, - &state->block_hashtable); - if (NULL != block) { - u64 bytenr = 0; - struct list_head *elem_ref_to; - struct list_head *tmp_ref_to; - - if (block->is_superblock) { - bytenr = le64_to_cpu(((struct btrfs_super_block *) - mapped_data)->bytenr); - is_metadata = 1; - if (state->print_mask & - BTRFSIC_PRINT_MASK_TREE_BEFORE_SB_WRITE) { - printk(KERN_INFO - "[before new superblock is written]:\n"); - btrfsic_dump_tree_sub(state, block, 0); - } - } - if (is_metadata) { - if (!block->is_superblock) { - bytenr = le64_to_cpu(((struct btrfs_header *) - mapped_data)->bytenr); - btrfsic_cmp_log_and_dev_bytenr(state, bytenr, - dev_state, - dev_bytenr, - mapped_data); - } - if (block->logical_bytenr != bytenr) { - printk(KERN_INFO - "Written block @%llu (%s/%llu/%d)" - " found in hash table, %c," - " bytenr mismatch" - " (!= stored %llu).\n", - (unsigned long long)bytenr, - dev_state->name, - (unsigned long long)dev_bytenr, - block->mirror_num, - btrfsic_get_block_type(state, block), - (unsigned long long) - block->logical_bytenr); - block->logical_bytenr = bytenr; - } else if (state->print_mask & - BTRFSIC_PRINT_MASK_VERBOSE) - printk(KERN_INFO - "Written block @%llu (%s/%llu/%d)" - " found in hash table, %c.\n", - (unsigned long long)bytenr, - dev_state->name, - (unsigned long long)dev_bytenr, - block->mirror_num, - btrfsic_get_block_type(state, block)); - } else { - bytenr = block->logical_bytenr; - if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE) - printk(KERN_INFO - "Written block @%llu (%s/%llu/%d)" - " found in hash table, %c.\n", - (unsigned long long)bytenr, - dev_state->name, - (unsigned long long)dev_bytenr, - block->mirror_num, - btrfsic_get_block_type(state, block)); - } - - if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE) - printk(KERN_INFO - "ref_to_list: %cE, ref_from_list: %cE\n", - list_empty(&block->ref_to_list) ? ' ' : '!', - list_empty(&block->ref_from_list) ? 
' ' : '!'); - if (btrfsic_is_block_ref_by_superblock(state, block, 0)) { - printk(KERN_INFO "btrfs: attempt to overwrite %c-block" - " @%llu (%s/%llu/%d), old(gen=%llu," - " objectid=%llu, type=%d, offset=%llu)," - " new(gen=%llu)," - " which is referenced by most recent superblock" - " (superblockgen=%llu)!\n", - btrfsic_get_block_type(state, block), - (unsigned long long)bytenr, - dev_state->name, - (unsigned long long)dev_bytenr, - block->mirror_num, - (unsigned long long)block->generation, - (unsigned long long) - le64_to_cpu(block->disk_key.objectid), - block->disk_key.type, - (unsigned long long) - le64_to_cpu(block->disk_key.offset), - (unsigned long long) - le64_to_cpu(((struct btrfs_header *) - mapped_data)->generation), - (unsigned long long) - state->max_superblock_generation); - btrfsic_dump_tree(state); - } - - if (!block->is_iodone && !block->never_written) { - printk(KERN_INFO "btrfs: attempt to overwrite %c-block" - " @%llu (%s/%llu/%d), oldgen=%llu, newgen=%llu," - " which is not yet iodone!\n", - btrfsic_get_block_type(state, block), - (unsigned long long)bytenr, - dev_state->name, - (unsigned long long)dev_bytenr, - block->mirror_num, - (unsigned long long)block->generation, - (unsigned long long) - le64_to_cpu(((struct btrfs_header *) - mapped_data)->generation)); - /* it would not be safe to go on */ - btrfsic_dump_tree(state); - return; - } - - /* - * Clear all references of this block. Do not free - * the block itself even if is not referenced anymore - * because it still carries valueable information - * like whether it was ever written and IO completed. - */ - list_for_each_safe(elem_ref_to, tmp_ref_to, - &block->ref_to_list) { - struct btrfsic_block_link *const l = - list_entry(elem_ref_to, - struct btrfsic_block_link, - node_ref_to); - - if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE) - btrfsic_print_rem_link(state, l); - l->ref_cnt--; - if (0 == l->ref_cnt) { - list_del(&l->node_ref_to); - list_del(&l->node_ref_from); - btrfsic_block_link_hashtable_remove(l); - btrfsic_block_link_free(l); - } - } - - if (block->is_superblock) - ret = btrfsic_map_superblock(state, bytenr, len, - bdev, &block_ctx); - else - ret = btrfsic_map_block(state, bytenr, len, - &block_ctx, 0); - if (ret) { - printk(KERN_INFO - "btrfsic: btrfsic_map_block(root @%llu)" - " failed!\n", (unsigned long long)bytenr); - return; - } - block_ctx.data = mapped_data; - /* the following is required in case of writes to mirrors, - * use the same that was used for the lookup */ - block_ctx.dev = dev_state; - block_ctx.dev_bytenr = dev_bytenr; - - if (is_metadata || state->include_extent_data) { - block->never_written = 0; - block->iodone_w_error = 0; - if (NULL != bio) { - block->is_iodone = 0; - BUG_ON(NULL == bio_is_patched); - if (!*bio_is_patched) { - block->orig_bio_bh_private = - bio->bi_private; - block->orig_bio_bh_end_io.bio = - bio->bi_end_io; - block->next_in_same_bio = NULL; - bio->bi_private = block; - bio->bi_end_io = btrfsic_bio_end_io; - *bio_is_patched = 1; - } else { - struct btrfsic_block *chained_block = - (struct btrfsic_block *) - bio->bi_private; - - BUG_ON(NULL == chained_block); - block->orig_bio_bh_private = - chained_block->orig_bio_bh_private; - block->orig_bio_bh_end_io.bio = - chained_block->orig_bio_bh_end_io. 
- bio; - block->next_in_same_bio = chained_block; - bio->bi_private = block; - } - } else if (NULL != bh) { - block->is_iodone = 0; - block->orig_bio_bh_private = bh->b_private; - block->orig_bio_bh_end_io.bh = bh->b_end_io; - block->next_in_same_bio = NULL; - bh->b_private = block; - bh->b_end_io = btrfsic_bh_end_io; - } else { - block->is_iodone = 1; - block->orig_bio_bh_private = NULL; - block->orig_bio_bh_end_io.bio = NULL; - block->next_in_same_bio = NULL; - } - } - - block->flush_gen = dev_state->last_flush_gen + 1; - block->submit_bio_bh_rw = submit_bio_bh_rw; - if (is_metadata) { - block->logical_bytenr = bytenr; - block->is_metadata = 1; - if (block->is_superblock) { - ret = btrfsic_process_written_superblock( - state, - block, - (struct btrfs_super_block *) - mapped_data); - if (state->print_mask & - BTRFSIC_PRINT_MASK_TREE_AFTER_SB_WRITE) { - printk(KERN_INFO - "[after new superblock is written]:\n"); - btrfsic_dump_tree_sub(state, block, 0); - } - } else { - block->mirror_num = 0; /* unknown */ - ret = btrfsic_process_metablock( - state, - block, - &block_ctx, - (struct btrfs_header *) - block_ctx.data, - 0, 0); - } - if (ret) - printk(KERN_INFO - "btrfsic: btrfsic_process_metablock" - "(root @%llu) failed!\n", - (unsigned long long)dev_bytenr); - } else { - block->is_metadata = 0; - block->mirror_num = 0; /* unknown */ - block->generation = BTRFSIC_GENERATION_UNKNOWN; - if (!state->include_extent_data - && list_empty(&block->ref_from_list)) { - /* - * disk block is overwritten with extent - * data (not meta data) and we are configured - * to not include extent data: take the - * chance and free the block's memory - */ - btrfsic_block_hashtable_remove(block); - list_del(&block->all_blocks_node); - btrfsic_block_free(block); - } - } - btrfsic_release_block_ctx(&block_ctx); - } else { - /* block has not been found in hash table */ - u64 bytenr; - - if (!is_metadata) { - if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE) - printk(KERN_INFO "Written block (%s/%llu/?)" - " !found in hash table, D.\n", - dev_state->name, - (unsigned long long)dev_bytenr); - if (!state->include_extent_data) - return; /* ignore that written D block */ - - /* this is getting ugly for the - * include_extent_data case... 
*/ - bytenr = 0; /* unknown */ - block_ctx.start = bytenr; - block_ctx.len = len; - block_ctx.bh = NULL; - } else { - bytenr = le64_to_cpu(((struct btrfs_header *) - mapped_data)->bytenr); - btrfsic_cmp_log_and_dev_bytenr(state, bytenr, dev_state, - dev_bytenr, - mapped_data); - if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE) - printk(KERN_INFO - "Written block @%llu (%s/%llu/?)" - " !found in hash table, M.\n", - (unsigned long long)bytenr, - dev_state->name, - (unsigned long long)dev_bytenr); - - ret = btrfsic_map_block(state, bytenr, len, &block_ctx, - 0); - if (ret) { - printk(KERN_INFO - "btrfsic: btrfsic_map_block(root @%llu)" - " failed!\n", - (unsigned long long)dev_bytenr); - return; - } - } - block_ctx.data = mapped_data; - /* the following is required in case of writes to mirrors, - * use the same that was used for the lookup */ - block_ctx.dev = dev_state; - block_ctx.dev_bytenr = dev_bytenr; - - block = btrfsic_block_alloc(); - if (NULL == block) { - printk(KERN_INFO "btrfsic: error, kmalloc failed!\n"); - btrfsic_release_block_ctx(&block_ctx); - return; - } - block->dev_state = dev_state; - block->dev_bytenr = dev_bytenr; - block->logical_bytenr = bytenr; - block->is_metadata = is_metadata; - block->never_written = 0; - block->iodone_w_error = 0; - block->mirror_num = 0; /* unknown */ - block->flush_gen = dev_state->last_flush_gen + 1; - block->submit_bio_bh_rw = submit_bio_bh_rw; - if (NULL != bio) { - block->is_iodone = 0; - BUG_ON(NULL == bio_is_patched); - if (!*bio_is_patched) { - block->orig_bio_bh_private = bio->bi_private; - block->orig_bio_bh_end_io.bio = bio->bi_end_io; - block->next_in_same_bio = NULL; - bio->bi_private = block; - bio->bi_end_io = btrfsic_bio_end_io; - *bio_is_patched = 1; - } else { - struct btrfsic_block *chained_block = - (struct btrfsic_block *) - bio->bi_private; - - BUG_ON(NULL == chained_block); - block->orig_bio_bh_private = - chained_block->orig_bio_bh_private; - block->orig_bio_bh_end_io.bio = - chained_block->orig_bio_bh_end_io.bio; - block->next_in_same_bio = chained_block; - bio->bi_private = block; - } - } else if (NULL != bh) { - block->is_iodone = 0; - block->orig_bio_bh_private = bh->b_private; - block->orig_bio_bh_end_io.bh = bh->b_end_io; - block->next_in_same_bio = NULL; - bh->b_private = block; - bh->b_end_io = btrfsic_bh_end_io; - } else { - block->is_iodone = 1; - block->orig_bio_bh_private = NULL; - block->orig_bio_bh_end_io.bio = NULL; - block->next_in_same_bio = NULL; - } - if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE) - printk(KERN_INFO - "New written %c-block @%llu (%s/%llu/%d)\n", - is_metadata ? 'M' : 'D', - (unsigned long long)block->logical_bytenr, - block->dev_state->name, - (unsigned long long)block->dev_bytenr, - block->mirror_num); - list_add(&block->all_blocks_node, &state->all_blocks_list); - btrfsic_block_hashtable_add(block, &state->block_hashtable); - - if (is_metadata) { - ret = btrfsic_process_metablock(state, block, - &block_ctx, - (struct btrfs_header *) - block_ctx.data, 0, 0); - if (ret) - printk(KERN_INFO - "btrfsic: process_metablock(root @%llu)" - " failed!\n", - (unsigned long long)dev_bytenr); - } - btrfsic_release_block_ctx(&block_ctx); - } -} - -static void btrfsic_bio_end_io(struct bio *bp, int bio_error_status) -{ - struct btrfsic_block *block = (struct btrfsic_block *)bp->bi_private; - int iodone_w_error; - - /* mutex is not held! 
This is not save if IO is not yet completed - * on umount */ - iodone_w_error = 0; - if (bio_error_status) - iodone_w_error = 1; - - BUG_ON(NULL == block); - bp->bi_private = block->orig_bio_bh_private; - bp->bi_end_io = block->orig_bio_bh_end_io.bio; - - do { - struct btrfsic_block *next_block; - struct btrfsic_dev_state *const dev_state = block->dev_state; - - if ((dev_state->state->print_mask & - BTRFSIC_PRINT_MASK_END_IO_BIO_BH)) - printk(KERN_INFO - "bio_end_io(err=%d) for %c @%llu (%s/%llu/%d)\n", - bio_error_status, - btrfsic_get_block_type(dev_state->state, block), - (unsigned long long)block->logical_bytenr, - dev_state->name, - (unsigned long long)block->dev_bytenr, - block->mirror_num); - next_block = block->next_in_same_bio; - block->iodone_w_error = iodone_w_error; - if (block->submit_bio_bh_rw & REQ_FLUSH) { - dev_state->last_flush_gen++; - if ((dev_state->state->print_mask & - BTRFSIC_PRINT_MASK_END_IO_BIO_BH)) - printk(KERN_INFO - "bio_end_io() new %s flush_gen=%llu\n", - dev_state->name, - (unsigned long long) - dev_state->last_flush_gen); - } - if (block->submit_bio_bh_rw & REQ_FUA) - block->flush_gen = 0; /* FUA completed means block is - * on disk */ - block->is_iodone = 1; /* for FLUSH, this releases the block */ - block = next_block; - } while (NULL != block); - - bp->bi_end_io(bp, bio_error_status); -} - -static void btrfsic_bh_end_io(struct buffer_head *bh, int uptodate) -{ - struct btrfsic_block *block = (struct btrfsic_block *)bh->b_private; - int iodone_w_error = !uptodate; - struct btrfsic_dev_state *dev_state; - - BUG_ON(NULL == block); - dev_state = block->dev_state; - if ((dev_state->state->print_mask & BTRFSIC_PRINT_MASK_END_IO_BIO_BH)) - printk(KERN_INFO - "bh_end_io(error=%d) for %c @%llu (%s/%llu/%d)\n", - iodone_w_error, - btrfsic_get_block_type(dev_state->state, block), - (unsigned long long)block->logical_bytenr, - block->dev_state->name, - (unsigned long long)block->dev_bytenr, - block->mirror_num); - - block->iodone_w_error = iodone_w_error; - if (block->submit_bio_bh_rw & REQ_FLUSH) { - dev_state->last_flush_gen++; - if ((dev_state->state->print_mask & - BTRFSIC_PRINT_MASK_END_IO_BIO_BH)) - printk(KERN_INFO - "bh_end_io() new %s flush_gen=%llu\n", - dev_state->name, - (unsigned long long)dev_state->last_flush_gen); - } - if (block->submit_bio_bh_rw & REQ_FUA) - block->flush_gen = 0; /* FUA completed means block is on disk */ - - bh->b_private = block->orig_bio_bh_private; - bh->b_end_io = block->orig_bio_bh_end_io.bh; - block->is_iodone = 1; /* for FLUSH, this releases the block */ - bh->b_end_io(bh, uptodate); -} - -static int btrfsic_process_written_superblock( - struct btrfsic_state *state, - struct btrfsic_block *const superblock, - struct btrfs_super_block *const super_hdr) -{ - int pass; - - superblock->generation = btrfs_super_generation(super_hdr); - if (!(superblock->generation > state->max_superblock_generation || - 0 == state->max_superblock_generation)) { - if (state->print_mask & BTRFSIC_PRINT_MASK_SUPERBLOCK_WRITE) - printk(KERN_INFO - "btrfsic: superblock @%llu (%s/%llu/%d)" - " with old gen %llu <= %llu\n", - (unsigned long long)superblock->logical_bytenr, - superblock->dev_state->name, - (unsigned long long)superblock->dev_bytenr, - superblock->mirror_num, - (unsigned long long) - btrfs_super_generation(super_hdr), - (unsigned long long) - state->max_superblock_generation); - } else { - if (state->print_mask & BTRFSIC_PRINT_MASK_SUPERBLOCK_WRITE) - printk(KERN_INFO - "btrfsic: got new superblock @%llu (%s/%llu/%d)" - " with new gen 
%llu > %llu\n", - (unsigned long long)superblock->logical_bytenr, - superblock->dev_state->name, - (unsigned long long)superblock->dev_bytenr, - superblock->mirror_num, - (unsigned long long) - btrfs_super_generation(super_hdr), - (unsigned long long) - state->max_superblock_generation); - - state->max_superblock_generation = - btrfs_super_generation(super_hdr); - state->latest_superblock = superblock; - } - - for (pass = 0; pass < 3; pass++) { - int ret; - u64 next_bytenr; - struct btrfsic_block *next_block; - struct btrfsic_block_data_ctx tmp_next_block_ctx; - struct btrfsic_block_link *l; - int num_copies; - int mirror_num; - const char *additional_string = NULL; - struct btrfs_disk_key tmp_disk_key; - - tmp_disk_key.type = BTRFS_ROOT_ITEM_KEY; - tmp_disk_key.offset = 0; - - switch (pass) { - case 0: - tmp_disk_key.objectid = - cpu_to_le64(BTRFS_ROOT_TREE_OBJECTID); - additional_string = "root "; - next_bytenr = btrfs_super_root(super_hdr); - if (state->print_mask & - BTRFSIC_PRINT_MASK_ROOT_CHUNK_LOG_TREE_LOCATION) - printk(KERN_INFO "root@%llu\n", - (unsigned long long)next_bytenr); - break; - case 1: - tmp_disk_key.objectid = - cpu_to_le64(BTRFS_CHUNK_TREE_OBJECTID); - additional_string = "chunk "; - next_bytenr = btrfs_super_chunk_root(super_hdr); - if (state->print_mask & - BTRFSIC_PRINT_MASK_ROOT_CHUNK_LOG_TREE_LOCATION) - printk(KERN_INFO "chunk@%llu\n", - (unsigned long long)next_bytenr); - break; - case 2: - tmp_disk_key.objectid = - cpu_to_le64(BTRFS_TREE_LOG_OBJECTID); - additional_string = "log "; - next_bytenr = btrfs_super_log_root(super_hdr); - if (0 == next_bytenr) - continue; - if (state->print_mask & - BTRFSIC_PRINT_MASK_ROOT_CHUNK_LOG_TREE_LOCATION) - printk(KERN_INFO "log@%llu\n", - (unsigned long long)next_bytenr); - break; - } - - num_copies = - btrfs_num_copies(&state->root->fs_info->mapping_tree, - next_bytenr, PAGE_SIZE); - if (state->print_mask & BTRFSIC_PRINT_MASK_NUM_COPIES) - printk(KERN_INFO "num_copies(log_bytenr=%llu) = %d\n", - (unsigned long long)next_bytenr, num_copies); - for (mirror_num = 1; mirror_num <= num_copies; mirror_num++) { - int was_created; - - if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE) - printk(KERN_INFO - "btrfsic_process_written_superblock(" - "mirror_num=%d)\n", mirror_num); - ret = btrfsic_map_block(state, next_bytenr, PAGE_SIZE, - &tmp_next_block_ctx, - mirror_num); - if (ret) { - printk(KERN_INFO - "btrfsic: btrfsic_map_block(@%llu," - " mirror=%d) failed!\n", - (unsigned long long)next_bytenr, - mirror_num); - return -1; - } - - next_block = btrfsic_block_lookup_or_add( - state, - &tmp_next_block_ctx, - additional_string, - 1, 0, 1, - mirror_num, - &was_created); - if (NULL == next_block) { - printk(KERN_INFO - "btrfsic: error, kmalloc failed!\n"); - btrfsic_release_block_ctx(&tmp_next_block_ctx); - return -1; - } - - next_block->disk_key = tmp_disk_key; - if (was_created) - next_block->generation = - BTRFSIC_GENERATION_UNKNOWN; - l = btrfsic_block_link_lookup_or_add( - state, - &tmp_next_block_ctx, - next_block, - superblock, - BTRFSIC_GENERATION_UNKNOWN); - btrfsic_release_block_ctx(&tmp_next_block_ctx); - if (NULL == l) - return -1; - } - } - - if (-1 == btrfsic_check_all_ref_blocks(state, superblock, 0)) { - WARN_ON(1); - btrfsic_dump_tree(state); - } - - return 0; -} - -static int btrfsic_check_all_ref_blocks(struct btrfsic_state *state, - struct btrfsic_block *const block, - int recursion_level) -{ - struct list_head *elem_ref_to; - int ret = 0; - - if (recursion_level >= 3 + BTRFS_MAX_LEVEL) { - /* - * Note that 
this situation can happen and does not - * indicate an error in regular cases. It happens - * when disk blocks are freed and later reused. - * The check-integrity module is not aware of any - * block free operations, it just recognizes block - * write operations. Therefore it keeps the linkage - * information for a block until a block is - * rewritten. This can temporarily cause incorrect - * and even circular linkage informations. This - * causes no harm unless such blocks are referenced - * by the most recent super block. - */ - if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE) - printk(KERN_INFO - "btrfsic: abort cyclic linkage (case 1).\n"); - - return ret; - } - - /* - * This algorithm is recursive because the amount of used stack - * space is very small and the max recursion depth is limited. - */ - list_for_each(elem_ref_to, &block->ref_to_list) { - const struct btrfsic_block_link *const l = - list_entry(elem_ref_to, struct btrfsic_block_link, - node_ref_to); - - if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE) - printk(KERN_INFO - "rl=%d, %c @%llu (%s/%llu/%d)" - " %u* refers to %c @%llu (%s/%llu/%d)\n", - recursion_level, - btrfsic_get_block_type(state, block), - (unsigned long long)block->logical_bytenr, - block->dev_state->name, - (unsigned long long)block->dev_bytenr, - block->mirror_num, - l->ref_cnt, - btrfsic_get_block_type(state, l->block_ref_to), - (unsigned long long) - l->block_ref_to->logical_bytenr, - l->block_ref_to->dev_state->name, - (unsigned long long)l->block_ref_to->dev_bytenr, - l->block_ref_to->mirror_num); - if (l->block_ref_to->never_written) { - printk(KERN_INFO "btrfs: attempt to write superblock" - " which references block %c @%llu (%s/%llu/%d)" - " which is never written!\n", - btrfsic_get_block_type(state, l->block_ref_to), - (unsigned long long) - l->block_ref_to->logical_bytenr, - l->block_ref_to->dev_state->name, - (unsigned long long)l->block_ref_to->dev_bytenr, - l->block_ref_to->mirror_num); - ret = -1; - } else if (!l->block_ref_to->is_iodone) { - printk(KERN_INFO "btrfs: attempt to write superblock" - " which references block %c @%llu (%s/%llu/%d)" - " which is not yet iodone!\n", - btrfsic_get_block_type(state, l->block_ref_to), - (unsigned long long) - l->block_ref_to->logical_bytenr, - l->block_ref_to->dev_state->name, - (unsigned long long)l->block_ref_to->dev_bytenr, - l->block_ref_to->mirror_num); - ret = -1; - } else if (l->parent_generation != - l->block_ref_to->generation && - BTRFSIC_GENERATION_UNKNOWN != - l->parent_generation && - BTRFSIC_GENERATION_UNKNOWN != - l->block_ref_to->generation) { - printk(KERN_INFO "btrfs: attempt to write superblock" - " which references block %c @%llu (%s/%llu/%d)" - " with generation %llu !=" - " parent generation %llu!\n", - btrfsic_get_block_type(state, l->block_ref_to), - (unsigned long long) - l->block_ref_to->logical_bytenr, - l->block_ref_to->dev_state->name, - (unsigned long long)l->block_ref_to->dev_bytenr, - l->block_ref_to->mirror_num, - (unsigned long long)l->block_ref_to->generation, - (unsigned long long)l->parent_generation); - ret = -1; - } else if (l->block_ref_to->flush_gen > - l->block_ref_to->dev_state->last_flush_gen) { - printk(KERN_INFO "btrfs: attempt to write superblock" - " which references block %c @%llu (%s/%llu/%d)" - " which is not flushed out of disk's write cache" - " (block flush_gen=%llu," - " dev->flush_gen=%llu)!\n", - btrfsic_get_block_type(state, l->block_ref_to), - (unsigned long long) - l->block_ref_to->logical_bytenr, - l->block_ref_to->dev_state->name, - 
(unsigned long long)l->block_ref_to->dev_bytenr, - l->block_ref_to->mirror_num, - (unsigned long long)block->flush_gen, - (unsigned long long) - l->block_ref_to->dev_state->last_flush_gen); - ret = -1; - } else if (-1 == btrfsic_check_all_ref_blocks(state, - l->block_ref_to, - recursion_level + - 1)) { - ret = -1; - } - } - - return ret; -} - -static int btrfsic_is_block_ref_by_superblock( - const struct btrfsic_state *state, - const struct btrfsic_block *block, - int recursion_level) -{ - struct list_head *elem_ref_from; - - if (recursion_level >= 3 + BTRFS_MAX_LEVEL) { - /* refer to comment at "abort cyclic linkage (case 1)" */ - if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE) - printk(KERN_INFO - "btrfsic: abort cyclic linkage (case 2).\n"); - - return 0; - } - - /* - * This algorithm is recursive because the amount of used stack space - * is very small and the max recursion depth is limited. - */ - list_for_each(elem_ref_from, &block->ref_from_list) { - const struct btrfsic_block_link *const l = - list_entry(elem_ref_from, struct btrfsic_block_link, - node_ref_from); - - if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE) - printk(KERN_INFO - "rl=%d, %c @%llu (%s/%llu/%d)" - " is ref %u* from %c @%llu (%s/%llu/%d)\n", - recursion_level, - btrfsic_get_block_type(state, block), - (unsigned long long)block->logical_bytenr, - block->dev_state->name, - (unsigned long long)block->dev_bytenr, - block->mirror_num, - l->ref_cnt, - btrfsic_get_block_type(state, l->block_ref_from), - (unsigned long long) - l->block_ref_from->logical_bytenr, - l->block_ref_from->dev_state->name, - (unsigned long long) - l->block_ref_from->dev_bytenr, - l->block_ref_from->mirror_num); - if (l->block_ref_from->is_superblock && - state->latest_superblock->dev_bytenr == - l->block_ref_from->dev_bytenr && - state->latest_superblock->dev_state->bdev == - l->block_ref_from->dev_state->bdev) - return 1; - else if (btrfsic_is_block_ref_by_superblock(state, - l->block_ref_from, - recursion_level + - 1)) - return 1; - } - - return 0; -} - -static void btrfsic_print_add_link(const struct btrfsic_state *state, - const struct btrfsic_block_link *l) -{ - printk(KERN_INFO - "Add %u* link from %c @%llu (%s/%llu/%d)" - " to %c @%llu (%s/%llu/%d).\n", - l->ref_cnt, - btrfsic_get_block_type(state, l->block_ref_from), - (unsigned long long)l->block_ref_from->logical_bytenr, - l->block_ref_from->dev_state->name, - (unsigned long long)l->block_ref_from->dev_bytenr, - l->block_ref_from->mirror_num, - btrfsic_get_block_type(state, l->block_ref_to), - (unsigned long long)l->block_ref_to->logical_bytenr, - l->block_ref_to->dev_state->name, - (unsigned long long)l->block_ref_to->dev_bytenr, - l->block_ref_to->mirror_num); -} - -static void btrfsic_print_rem_link(const struct btrfsic_state *state, - const struct btrfsic_block_link *l) -{ - printk(KERN_INFO - "Rem %u* link from %c @%llu (%s/%llu/%d)" - " to %c @%llu (%s/%llu/%d).\n", - l->ref_cnt, - btrfsic_get_block_type(state, l->block_ref_from), - (unsigned long long)l->block_ref_from->logical_bytenr, - l->block_ref_from->dev_state->name, - (unsigned long long)l->block_ref_from->dev_bytenr, - l->block_ref_from->mirror_num, - btrfsic_get_block_type(state, l->block_ref_to), - (unsigned long long)l->block_ref_to->logical_bytenr, - l->block_ref_to->dev_state->name, - (unsigned long long)l->block_ref_to->dev_bytenr, - l->block_ref_to->mirror_num); -} - -static char btrfsic_get_block_type(const struct btrfsic_state *state, - const struct btrfsic_block *block) -{ - if (block->is_superblock && 
- state->latest_superblock->dev_bytenr == block->dev_bytenr && - state->latest_superblock->dev_state->bdev == block->dev_state->bdev) - return 'S'; - else if (block->is_superblock) - return 's'; - else if (block->is_metadata) - return 'M'; - else - return 'D'; -} - -static void btrfsic_dump_tree(const struct btrfsic_state *state) -{ - btrfsic_dump_tree_sub(state, state->latest_superblock, 0); -} - -static void btrfsic_dump_tree_sub(const struct btrfsic_state *state, - const struct btrfsic_block *block, - int indent_level) -{ - struct list_head *elem_ref_to; - int indent_add; - static char buf[80]; - int cursor_position; - - /* - * Should better fill an on-stack buffer with a complete line and - * dump it at once when it is time to print a newline character. - */ - - /* - * This algorithm is recursive because the amount of used stack space - * is very small and the max recursion depth is limited. - */ - indent_add = sprintf(buf, "%c-%llu(%s/%llu/%d)", - btrfsic_get_block_type(state, block), - (unsigned long long)block->logical_bytenr, - block->dev_state->name, - (unsigned long long)block->dev_bytenr, - block->mirror_num); - if (indent_level + indent_add > BTRFSIC_TREE_DUMP_MAX_INDENT_LEVEL) { - printk("[...]\n"); - return; - } - printk(buf); - indent_level += indent_add; - if (list_empty(&block->ref_to_list)) { - printk("\n"); - return; - } - if (block->mirror_num > 1 && - !(state->print_mask & BTRFSIC_PRINT_MASK_TREE_WITH_ALL_MIRRORS)) { - printk(" [...]\n"); - return; - } - - cursor_position = indent_level; - list_for_each(elem_ref_to, &block->ref_to_list) { - const struct btrfsic_block_link *const l = - list_entry(elem_ref_to, struct btrfsic_block_link, - node_ref_to); - - while (cursor_position < indent_level) { - printk(" "); - cursor_position++; - } - if (l->ref_cnt > 1) - indent_add = sprintf(buf, " %d*--> ", l->ref_cnt); - else - indent_add = sprintf(buf, " --> "); - if (indent_level + indent_add > - BTRFSIC_TREE_DUMP_MAX_INDENT_LEVEL) { - printk("[...]\n"); - cursor_position = 0; - continue; - } - - printk(buf); - - btrfsic_dump_tree_sub(state, l->block_ref_to, - indent_level + indent_add); - cursor_position = 0; - } -} - -static struct btrfsic_block_link *btrfsic_block_link_lookup_or_add( - struct btrfsic_state *state, - struct btrfsic_block_data_ctx *next_block_ctx, - struct btrfsic_block *next_block, - struct btrfsic_block *from_block, - u64 parent_generation) -{ - struct btrfsic_block_link *l; - - l = btrfsic_block_link_hashtable_lookup(next_block_ctx->dev->bdev, - next_block_ctx->dev_bytenr, - from_block->dev_state->bdev, - from_block->dev_bytenr, - &state->block_link_hashtable); - if (NULL == l) { - l = btrfsic_block_link_alloc(); - if (NULL == l) { - printk(KERN_INFO - "btrfsic: error, kmalloc" " failed!\n"); - return NULL; - } - - l->block_ref_to = next_block; - l->block_ref_from = from_block; - l->ref_cnt = 1; - l->parent_generation = parent_generation; - - if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE) - btrfsic_print_add_link(state, l); - - list_add(&l->node_ref_to, &from_block->ref_to_list); - list_add(&l->node_ref_from, &next_block->ref_from_list); - - btrfsic_block_link_hashtable_add(l, - &state->block_link_hashtable); - } else { - l->ref_cnt++; - l->parent_generation = parent_generation; - if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE) - btrfsic_print_add_link(state, l); - } - - return l; -} - -static struct btrfsic_block *btrfsic_block_lookup_or_add( - struct btrfsic_state *state, - struct btrfsic_block_data_ctx *block_ctx, - const char *additional_string, 
- int is_metadata, - int is_iodone, - int never_written, - int mirror_num, - int *was_created) -{ - struct btrfsic_block *block; - - block = btrfsic_block_hashtable_lookup(block_ctx->dev->bdev, - block_ctx->dev_bytenr, - &state->block_hashtable); - if (NULL == block) { - struct btrfsic_dev_state *dev_state; - - block = btrfsic_block_alloc(); - if (NULL == block) { - printk(KERN_INFO "btrfsic: error, kmalloc failed!\n"); - return NULL; - } - dev_state = btrfsic_dev_state_lookup(block_ctx->dev->bdev); - if (NULL == dev_state) { - printk(KERN_INFO - "btrfsic: error, lookup dev_state failed!\n"); - btrfsic_block_free(block); - return NULL; - } - block->dev_state = dev_state; - block->dev_bytenr = block_ctx->dev_bytenr; - block->logical_bytenr = block_ctx->start; - block->is_metadata = is_metadata; - block->is_iodone = is_iodone; - block->never_written = never_written; - block->mirror_num = mirror_num; - if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE) - printk(KERN_INFO - "New %s%c-block @%llu (%s/%llu/%d)\n", - additional_string, - btrfsic_get_block_type(state, block), - (unsigned long long)block->logical_bytenr, - dev_state->name, - (unsigned long long)block->dev_bytenr, - mirror_num); - list_add(&block->all_blocks_node, &state->all_blocks_list); - btrfsic_block_hashtable_add(block, &state->block_hashtable); - if (NULL != was_created) - *was_created = 1; - } else { - if (NULL != was_created) - *was_created = 0; - } - - return block; -} - -static void btrfsic_cmp_log_and_dev_bytenr(struct btrfsic_state *state, - u64 bytenr, - struct btrfsic_dev_state *dev_state, - u64 dev_bytenr, char *data) -{ - int num_copies; - int mirror_num; - int ret; - struct btrfsic_block_data_ctx block_ctx; - int match = 0; - - num_copies = btrfs_num_copies(&state->root->fs_info->mapping_tree, - bytenr, PAGE_SIZE); - - for (mirror_num = 1; mirror_num <= num_copies; mirror_num++) { - ret = btrfsic_map_block(state, bytenr, PAGE_SIZE, - &block_ctx, mirror_num); - if (ret) { - printk(KERN_INFO "btrfsic:" - " btrfsic_map_block(logical @%llu," - " mirror %d) failed!\n", - (unsigned long long)bytenr, mirror_num); - continue; - } - - if (dev_state->bdev == block_ctx.dev->bdev && - dev_bytenr == block_ctx.dev_bytenr) { - match++; - btrfsic_release_block_ctx(&block_ctx); - break; - } - btrfsic_release_block_ctx(&block_ctx); - } - - if (!match) { - printk(KERN_INFO "btrfs: attempt to write M-block which contains logical bytenr that doesn't map to dev+physical bytenr of submit_bio," - " buffer->log_bytenr=%llu, submit_bio(bdev=%s," - " phys_bytenr=%llu)!\n", - (unsigned long long)bytenr, dev_state->name, - (unsigned long long)dev_bytenr); - for (mirror_num = 1; mirror_num <= num_copies; mirror_num++) { - ret = btrfsic_map_block(state, bytenr, PAGE_SIZE, - &block_ctx, mirror_num); - if (ret) - continue; - - printk(KERN_INFO "Read logical bytenr @%llu maps to" - " (%s/%llu/%d)\n", - (unsigned long long)bytenr, - block_ctx.dev->name, - (unsigned long long)block_ctx.dev_bytenr, - mirror_num); - } - WARN_ON(1); - } -} - -static struct btrfsic_dev_state *btrfsic_dev_state_lookup( - struct block_device *bdev) -{ - struct btrfsic_dev_state *ds; - - ds = btrfsic_dev_state_hashtable_lookup(bdev, - &btrfsic_dev_state_hashtable); - return ds; -} - -int btrfsic_submit_bh(int rw, struct buffer_head *bh) -{ - struct btrfsic_dev_state *dev_state; - - if (!btrfsic_is_initialized) - return submit_bh(rw, bh); - - mutex_lock(&btrfsic_mutex); - /* since btrfsic_submit_bh() might also be called before - * btrfsic_mount(), this might return NULL */ 
- dev_state = btrfsic_dev_state_lookup(bh->b_bdev); - - /* Only called to write the superblock (incl. FLUSH/FUA) */ - if (NULL != dev_state && - (rw & WRITE) && bh->b_size > 0) { - u64 dev_bytenr; - - dev_bytenr = 4096 * bh->b_blocknr; - if (dev_state->state->print_mask & - BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH) - printk(KERN_INFO - "submit_bh(rw=0x%x, blocknr=%lu (bytenr %llu)," - " size=%lu, data=%p, bdev=%p)\n", - rw, (unsigned long)bh->b_blocknr, - (unsigned long long)dev_bytenr, - (unsigned long)bh->b_size, bh->b_data, - bh->b_bdev); - btrfsic_process_written_block(dev_state, dev_bytenr, - bh->b_data, bh->b_size, NULL, - NULL, bh, rw); - } else if (NULL != dev_state && (rw & REQ_FLUSH)) { - if (dev_state->state->print_mask & - BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH) - printk(KERN_INFO - "submit_bh(rw=0x%x) FLUSH, bdev=%p)\n", - rw, bh->b_bdev); - if (!dev_state->dummy_block_for_bio_bh_flush.is_iodone) { - if ((dev_state->state->print_mask & - (BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH | - BTRFSIC_PRINT_MASK_VERBOSE))) - printk(KERN_INFO - "btrfsic_submit_bh(%s) with FLUSH" - " but dummy block already in use" - " (ignored)!\n", - dev_state->name); - } else { - struct btrfsic_block *const block = - &dev_state->dummy_block_for_bio_bh_flush; - - block->is_iodone = 0; - block->never_written = 0; - block->iodone_w_error = 0; - block->flush_gen = dev_state->last_flush_gen + 1; - block->submit_bio_bh_rw = rw; - block->orig_bio_bh_private = bh->b_private; - block->orig_bio_bh_end_io.bh = bh->b_end_io; - block->next_in_same_bio = NULL; - bh->b_private = block; - bh->b_end_io = btrfsic_bh_end_io; - } - } - mutex_unlock(&btrfsic_mutex); - return submit_bh(rw, bh); -} - -void btrfsic_submit_bio(int rw, struct bio *bio) -{ - struct btrfsic_dev_state *dev_state; - - if (!btrfsic_is_initialized) { - submit_bio(rw, bio); - return; - } - - mutex_lock(&btrfsic_mutex); - /* since btrfsic_submit_bio() is also called before - * btrfsic_mount(), this might return NULL */ - dev_state = btrfsic_dev_state_lookup(bio->bi_bdev); - if (NULL != dev_state && - (rw & WRITE) && NULL != bio->bi_io_vec) { - unsigned int i; - u64 dev_bytenr; - int bio_is_patched; - - dev_bytenr = 512 * bio->bi_sector; - bio_is_patched = 0; - if (dev_state->state->print_mask & - BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH) - printk(KERN_INFO - "submit_bio(rw=0x%x, bi_vcnt=%u," - " bi_sector=%lu (bytenr %llu), bi_bdev=%p)\n", - rw, bio->bi_vcnt, (unsigned long)bio->bi_sector, - (unsigned long long)dev_bytenr, - bio->bi_bdev); - - for (i = 0; i < bio->bi_vcnt; i++) { - u8 *mapped_data; - - mapped_data = kmap(bio->bi_io_vec[i].bv_page); - if ((BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH | - BTRFSIC_PRINT_MASK_VERBOSE) == - (dev_state->state->print_mask & - (BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH | - BTRFSIC_PRINT_MASK_VERBOSE))) - printk(KERN_INFO - "#%u: page=%p, mapped=%p, len=%u," - " offset=%u\n", - i, bio->bi_io_vec[i].bv_page, - mapped_data, - bio->bi_io_vec[i].bv_len, - bio->bi_io_vec[i].bv_offset); - btrfsic_process_written_block(dev_state, dev_bytenr, - mapped_data, - bio->bi_io_vec[i].bv_len, - bio, &bio_is_patched, - NULL, rw); - kunmap(bio->bi_io_vec[i].bv_page); - dev_bytenr += bio->bi_io_vec[i].bv_len; - } - } else if (NULL != dev_state && (rw & REQ_FLUSH)) { - if (dev_state->state->print_mask & - BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH) - printk(KERN_INFO - "submit_bio(rw=0x%x) FLUSH, bdev=%p)\n", - rw, bio->bi_bdev); - if (!dev_state->dummy_block_for_bio_bh_flush.is_iodone) { - if ((dev_state->state->print_mask & - (BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH | - 
BTRFSIC_PRINT_MASK_VERBOSE))) - printk(KERN_INFO - "btrfsic_submit_bio(%s) with FLUSH" - " but dummy block already in use" - " (ignored)!\n", - dev_state->name); - } else { - struct btrfsic_block *const block = - &dev_state->dummy_block_for_bio_bh_flush; - - block->is_iodone = 0; - block->never_written = 0; - block->iodone_w_error = 0; - block->flush_gen = dev_state->last_flush_gen + 1; - block->submit_bio_bh_rw = rw; - block->orig_bio_bh_private = bio->bi_private; - block->orig_bio_bh_end_io.bio = bio->bi_end_io; - block->next_in_same_bio = NULL; - bio->bi_private = block; - bio->bi_end_io = btrfsic_bio_end_io; - } - } - mutex_unlock(&btrfsic_mutex); - - submit_bio(rw, bio); -} - -int btrfsic_mount(struct btrfs_root *root, - struct btrfs_fs_devices *fs_devices, - int including_extent_data, u32 print_mask) -{ - int ret; - struct btrfsic_state *state; - struct list_head *dev_head = &fs_devices->devices; - struct btrfs_device *device; - - state = kzalloc(sizeof(*state), GFP_NOFS); - if (NULL == state) { - printk(KERN_INFO "btrfs check-integrity: kmalloc() failed!\n"); - return -1; - } - - if (!btrfsic_is_initialized) { - mutex_init(&btrfsic_mutex); - btrfsic_dev_state_hashtable_init(&btrfsic_dev_state_hashtable); - btrfsic_is_initialized = 1; - } - mutex_lock(&btrfsic_mutex); - state->root = root; - state->print_mask = print_mask; - state->include_extent_data = including_extent_data; - state->csum_size = 0; - INIT_LIST_HEAD(&state->all_blocks_list); - btrfsic_block_hashtable_init(&state->block_hashtable); - btrfsic_block_link_hashtable_init(&state->block_link_hashtable); - state->max_superblock_generation = 0; - state->latest_superblock = NULL; - - list_for_each_entry(device, dev_head, dev_list) { - struct btrfsic_dev_state *ds; - char *p; - - if (!device->bdev || !device->name) - continue; - - ds = btrfsic_dev_state_alloc(); - if (NULL == ds) { - printk(KERN_INFO - "btrfs check-integrity: kmalloc() failed!\n"); - mutex_unlock(&btrfsic_mutex); - return -1; - } - ds->bdev = device->bdev; - ds->state = state; - bdevname(ds->bdev, ds->name); - ds->name[BDEVNAME_SIZE - 1] = '\0'; - for (p = ds->name; *p != '\0'; p++); - while (p > ds->name && *p != '/') - p--; - if (*p == '/') - p++; - strlcpy(ds->name, p, sizeof(ds->name)); - btrfsic_dev_state_hashtable_add(ds, - &btrfsic_dev_state_hashtable); - } - - ret = btrfsic_process_superblock(state, fs_devices); - if (0 != ret) { - mutex_unlock(&btrfsic_mutex); - btrfsic_unmount(root, fs_devices); - return ret; - } - - if (state->print_mask & BTRFSIC_PRINT_MASK_INITIAL_DATABASE) - btrfsic_dump_database(state); - if (state->print_mask & BTRFSIC_PRINT_MASK_INITIAL_TREE) - btrfsic_dump_tree(state); - - mutex_unlock(&btrfsic_mutex); - return 0; -} - -void btrfsic_unmount(struct btrfs_root *root, - struct btrfs_fs_devices *fs_devices) -{ - struct list_head *elem_all; - struct list_head *tmp_all; - struct btrfsic_state *state; - struct list_head *dev_head = &fs_devices->devices; - struct btrfs_device *device; - - if (!btrfsic_is_initialized) - return; - - mutex_lock(&btrfsic_mutex); - - state = NULL; - list_for_each_entry(device, dev_head, dev_list) { - struct btrfsic_dev_state *ds; - - if (!device->bdev || !device->name) - continue; - - ds = btrfsic_dev_state_hashtable_lookup( - device->bdev, - &btrfsic_dev_state_hashtable); - if (NULL != ds) { - state = ds->state; - btrfsic_dev_state_hashtable_remove(ds); - btrfsic_dev_state_free(ds); - } - } - - if (NULL == state) { - printk(KERN_INFO - "btrfsic: error, cannot find state information" - " on umount!\n"); - 
mutex_unlock(&btrfsic_mutex); - return; - } - - /* - * Don't care about keeping the lists' state up to date, - * just free all memory that was allocated dynamically. - * Free the blocks and the block_links. - */ - list_for_each_safe(elem_all, tmp_all, &state->all_blocks_list) { - struct btrfsic_block *const b_all = - list_entry(elem_all, struct btrfsic_block, - all_blocks_node); - struct list_head *elem_ref_to; - struct list_head *tmp_ref_to; - - list_for_each_safe(elem_ref_to, tmp_ref_to, - &b_all->ref_to_list) { - struct btrfsic_block_link *const l = - list_entry(elem_ref_to, - struct btrfsic_block_link, - node_ref_to); - - if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE) - btrfsic_print_rem_link(state, l); - - l->ref_cnt--; - if (0 == l->ref_cnt) - btrfsic_block_link_free(l); - } - - if (b_all->is_iodone) - btrfsic_block_free(b_all); - else - printk(KERN_INFO "btrfs: attempt to free %c-block" - " @%llu (%s/%llu/%d) on umount which is" - " not yet iodone!\n", - btrfsic_get_block_type(state, b_all), - (unsigned long long)b_all->logical_bytenr, - b_all->dev_state->name, - (unsigned long long)b_all->dev_bytenr, - b_all->mirror_num); - } - - mutex_unlock(&btrfsic_mutex); - - kfree(state); -} diff --git a/ANDROID_3.4.5/fs/btrfs/check-integrity.h b/ANDROID_3.4.5/fs/btrfs/check-integrity.h deleted file mode 100644 index 8b59175c..00000000 --- a/ANDROID_3.4.5/fs/btrfs/check-integrity.h +++ /dev/null @@ -1,36 +0,0 @@ -/* - * Copyright (C) STRATO AG 2011. All rights reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public - * License v2 as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public - * License along with this program; if not, write to the - * Free Software Foundation, Inc., 59 Temple Place - Suite 330, - * Boston, MA 021110-1307, USA. - */ - -#if !defined(__BTRFS_CHECK_INTEGRITY__) -#define __BTRFS_CHECK_INTEGRITY__ - -#ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY -int btrfsic_submit_bh(int rw, struct buffer_head *bh); -void btrfsic_submit_bio(int rw, struct bio *bio); -#else -#define btrfsic_submit_bh submit_bh -#define btrfsic_submit_bio submit_bio -#endif - -int btrfsic_mount(struct btrfs_root *root, - struct btrfs_fs_devices *fs_devices, - int including_extent_data, u32 print_mask); -void btrfsic_unmount(struct btrfs_root *root, - struct btrfs_fs_devices *fs_devices); - -#endif diff --git a/ANDROID_3.4.5/fs/btrfs/compat.h b/ANDROID_3.4.5/fs/btrfs/compat.h deleted file mode 100644 index 7c4503ef..00000000 --- a/ANDROID_3.4.5/fs/btrfs/compat.h +++ /dev/null @@ -1,7 +0,0 @@ -#ifndef _COMPAT_H_ -#define _COMPAT_H_ - -#define btrfs_drop_nlink(inode) drop_nlink(inode) -#define btrfs_inc_nlink(inode) inc_nlink(inode) - -#endif /* _COMPAT_H_ */ diff --git a/ANDROID_3.4.5/fs/btrfs/compression.c b/ANDROID_3.4.5/fs/btrfs/compression.c deleted file mode 100644 index 86eff48d..00000000 --- a/ANDROID_3.4.5/fs/btrfs/compression.c +++ /dev/null @@ -1,1038 +0,0 @@ -/* - * Copyright (C) 2008 Oracle. All rights reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public - * License v2 as published by the Free Software Foundation. 
- * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public - * License along with this program; if not, write to the - * Free Software Foundation, Inc., 59 Temple Place - Suite 330, - * Boston, MA 021110-1307, USA. - */ - -#include <linux/kernel.h> -#include <linux/bio.h> -#include <linux/buffer_head.h> -#include <linux/file.h> -#include <linux/fs.h> -#include <linux/pagemap.h> -#include <linux/highmem.h> -#include <linux/time.h> -#include <linux/init.h> -#include <linux/string.h> -#include <linux/backing-dev.h> -#include <linux/mpage.h> -#include <linux/swap.h> -#include <linux/writeback.h> -#include <linux/bit_spinlock.h> -#include <linux/slab.h> -#include "compat.h" -#include "ctree.h" -#include "disk-io.h" -#include "transaction.h" -#include "btrfs_inode.h" -#include "volumes.h" -#include "ordered-data.h" -#include "compression.h" -#include "extent_io.h" -#include "extent_map.h" - -struct compressed_bio { - /* number of bios pending for this compressed extent */ - atomic_t pending_bios; - - /* the pages with the compressed data on them */ - struct page **compressed_pages; - - /* inode that owns this data */ - struct inode *inode; - - /* starting offset in the inode for our pages */ - u64 start; - - /* number of bytes in the inode we're working on */ - unsigned long len; - - /* number of bytes on disk */ - unsigned long compressed_len; - - /* the compression algorithm for this bio */ - int compress_type; - - /* number of compressed pages in the array */ - unsigned long nr_pages; - - /* IO errors */ - int errors; - int mirror_num; - - /* for reads, this is the bio we are copying the data into */ - struct bio *orig_bio; - - /* - * the start of a variable length array of checksums only - * used by reads - */ - u32 sums; -}; - -static inline int compressed_bio_size(struct btrfs_root *root, - unsigned long disk_size) -{ - u16 csum_size = btrfs_super_csum_size(root->fs_info->super_copy); - - return sizeof(struct compressed_bio) + - ((disk_size + root->sectorsize - 1) / root->sectorsize) * - csum_size; -} - -static struct bio *compressed_bio_alloc(struct block_device *bdev, - u64 first_byte, gfp_t gfp_flags) -{ - int nr_vecs; - - nr_vecs = bio_get_nr_vecs(bdev); - return btrfs_bio_alloc(bdev, first_byte >> 9, nr_vecs, gfp_flags); -} - -static int check_compressed_csum(struct inode *inode, - struct compressed_bio *cb, - u64 disk_start) -{ - int ret; - struct btrfs_root *root = BTRFS_I(inode)->root; - struct page *page; - unsigned long i; - char *kaddr; - u32 csum; - u32 *cb_sum = &cb->sums; - - if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM) - return 0; - - for (i = 0; i < cb->nr_pages; i++) { - page = cb->compressed_pages[i]; - csum = ~(u32)0; - - kaddr = kmap_atomic(page); - csum = btrfs_csum_data(root, kaddr, csum, PAGE_CACHE_SIZE); - btrfs_csum_final(csum, (char *)&csum); - kunmap_atomic(kaddr); - - if (csum != *cb_sum) { - printk(KERN_INFO "btrfs csum failed ino %llu " - "extent %llu csum %u " - "wanted %u mirror %d\n", - (unsigned long long)btrfs_ino(inode), - (unsigned long long)disk_start, - csum, *cb_sum, cb->mirror_num); - ret = -EIO; - goto fail; - } - cb_sum++; - - } - ret = 0; -fail: - return ret; -} - -/* when we finish reading compressed pages from the disk, we - * decompress them and then run the bio end_io routines on 
the - * decompressed pages (in the inode address space). - * - * This allows the checksumming and other IO error handling routines - * to work normally - * - * The compressed pages are freed here, and it must be run - * in process context - */ -static void end_compressed_bio_read(struct bio *bio, int err) -{ - struct compressed_bio *cb = bio->bi_private; - struct inode *inode; - struct page *page; - unsigned long index; - int ret; - - if (err) - cb->errors = 1; - - /* if there are more bios still pending for this compressed - * extent, just exit - */ - if (!atomic_dec_and_test(&cb->pending_bios)) - goto out; - - inode = cb->inode; - ret = check_compressed_csum(inode, cb, (u64)bio->bi_sector << 9); - if (ret) - goto csum_failed; - - /* ok, we're the last bio for this extent, lets start - * the decompression. - */ - ret = btrfs_decompress_biovec(cb->compress_type, - cb->compressed_pages, - cb->start, - cb->orig_bio->bi_io_vec, - cb->orig_bio->bi_vcnt, - cb->compressed_len); -csum_failed: - if (ret) - cb->errors = 1; - - /* release the compressed pages */ - index = 0; - for (index = 0; index < cb->nr_pages; index++) { - page = cb->compressed_pages[index]; - page->mapping = NULL; - page_cache_release(page); - } - - /* do io completion on the original bio */ - if (cb->errors) { - bio_io_error(cb->orig_bio); - } else { - int bio_index = 0; - struct bio_vec *bvec = cb->orig_bio->bi_io_vec; - - /* - * we have verified the checksum already, set page - * checked so the end_io handlers know about it - */ - while (bio_index < cb->orig_bio->bi_vcnt) { - SetPageChecked(bvec->bv_page); - bvec++; - bio_index++; - } - bio_endio(cb->orig_bio, 0); - } - - /* finally free the cb struct */ - kfree(cb->compressed_pages); - kfree(cb); -out: - bio_put(bio); -} - -/* - * Clear the writeback bits on all of the file - * pages for a compressed write - */ -static noinline void end_compressed_writeback(struct inode *inode, u64 start, - unsigned long ram_size) -{ - unsigned long index = start >> PAGE_CACHE_SHIFT; - unsigned long end_index = (start + ram_size - 1) >> PAGE_CACHE_SHIFT; - struct page *pages[16]; - unsigned long nr_pages = end_index - index + 1; - int i; - int ret; - - while (nr_pages > 0) { - ret = find_get_pages_contig(inode->i_mapping, index, - min_t(unsigned long, - nr_pages, ARRAY_SIZE(pages)), pages); - if (ret == 0) { - nr_pages -= 1; - index += 1; - continue; - } - for (i = 0; i < ret; i++) { - end_page_writeback(pages[i]); - page_cache_release(pages[i]); - } - nr_pages -= ret; - index += ret; - } - /* the inode may be gone now */ -} - -/* - * do the cleanup once all the compressed pages hit the disk. - * This will clear writeback on the file pages and free the compressed - * pages. - * - * This also calls the writeback end hooks for the file pages so that - * metadata and checksums can be updated in the file. 
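The cleanup described above only runs once every outstanding bio for the extent has completed; the pending_bios counter enforces that: a reference is taken before each bio is submitted and dropped in its completion handler, and only the completion that drops the last reference does the teardown. A minimal user-space sketch of that idiom, with hypothetical names and C11 atomics, single-threaded for brevity (the kernel side uses atomic_inc()/atomic_dec_and_test()):

	/* Sketch of the pending-I/O reference pattern (not btrfs code). */
	#include <stdatomic.h>
	#include <stdio.h>
	#include <stdlib.h>

	struct ctx {
		atomic_int pending;	/* one reference per in-flight piece, plus the submitter's */
		int errors;
	};

	/* completion handler: only the last piece to finish tears the context down */
	static void complete_part(struct ctx *c, int err)
	{
		if (err)
			c->errors = 1;
		if (atomic_fetch_sub(&c->pending, 1) == 1) {	/* count just hit zero */
			printf("all parts done, errors=%d\n", c->errors);
			free(c);
		}
	}

	int main(void)
	{
		struct ctx *c = malloc(sizeof(*c));
		int i, nparts = 4;

		c->errors = 0;
		atomic_init(&c->pending, 1);		/* submitter keeps one reference while queueing */
		for (i = 0; i < nparts; i++) {
			atomic_fetch_add(&c->pending, 1);	/* take the ref *before* "submitting" */
			complete_part(c, 0);			/* pretend this piece finished immediately */
		}
		complete_part(c, 0);	/* drop the submitter's reference; frees ctx here */
		return 0;
	}

This is the common variant where the submitter holds its own reference until everything is queued; the effect is the same as the inc-before-submit pattern in the functions above.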
- */ -static void end_compressed_bio_write(struct bio *bio, int err) -{ - struct extent_io_tree *tree; - struct compressed_bio *cb = bio->bi_private; - struct inode *inode; - struct page *page; - unsigned long index; - - if (err) - cb->errors = 1; - - /* if there are more bios still pending for this compressed - * extent, just exit - */ - if (!atomic_dec_and_test(&cb->pending_bios)) - goto out; - - /* ok, we're the last bio for this extent, step one is to - * call back into the FS and do all the end_io operations - */ - inode = cb->inode; - tree = &BTRFS_I(inode)->io_tree; - cb->compressed_pages[0]->mapping = cb->inode->i_mapping; - tree->ops->writepage_end_io_hook(cb->compressed_pages[0], - cb->start, - cb->start + cb->len - 1, - NULL, 1); - cb->compressed_pages[0]->mapping = NULL; - - end_compressed_writeback(inode, cb->start, cb->len); - /* note, our inode could be gone now */ - - /* - * release the compressed pages, these came from alloc_page and - * are not attached to the inode at all - */ - index = 0; - for (index = 0; index < cb->nr_pages; index++) { - page = cb->compressed_pages[index]; - page->mapping = NULL; - page_cache_release(page); - } - - /* finally free the cb struct */ - kfree(cb->compressed_pages); - kfree(cb); -out: - bio_put(bio); -} - -/* - * worker function to build and submit bios for previously compressed pages. - * The corresponding pages in the inode should be marked for writeback - * and the compressed pages should have a reference on them for dropping - * when the IO is complete. - * - * This also checksums the file bytes and gets things ready for - * the end io hooks. - */ -int btrfs_submit_compressed_write(struct inode *inode, u64 start, - unsigned long len, u64 disk_start, - unsigned long compressed_len, - struct page **compressed_pages, - unsigned long nr_pages) -{ - struct bio *bio = NULL; - struct btrfs_root *root = BTRFS_I(inode)->root; - struct compressed_bio *cb; - unsigned long bytes_left; - struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; - int pg_index = 0; - struct page *page; - u64 first_byte = disk_start; - struct block_device *bdev; - int ret; - int skip_sum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM; - - WARN_ON(start & ((u64)PAGE_CACHE_SIZE - 1)); - cb = kmalloc(compressed_bio_size(root, compressed_len), GFP_NOFS); - if (!cb) - return -ENOMEM; - atomic_set(&cb->pending_bios, 0); - cb->errors = 0; - cb->inode = inode; - cb->start = start; - cb->len = len; - cb->mirror_num = 0; - cb->compressed_pages = compressed_pages; - cb->compressed_len = compressed_len; - cb->orig_bio = NULL; - cb->nr_pages = nr_pages; - - bdev = BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev; - - bio = compressed_bio_alloc(bdev, first_byte, GFP_NOFS); - if(!bio) { - kfree(cb); - return -ENOMEM; - } - bio->bi_private = cb; - bio->bi_end_io = end_compressed_bio_write; - atomic_inc(&cb->pending_bios); - - /* create and submit bios for the compressed pages */ - bytes_left = compressed_len; - for (pg_index = 0; pg_index < cb->nr_pages; pg_index++) { - page = compressed_pages[pg_index]; - page->mapping = inode->i_mapping; - if (bio->bi_size) - ret = io_tree->ops->merge_bio_hook(page, 0, - PAGE_CACHE_SIZE, - bio, 0); - else - ret = 0; - - page->mapping = NULL; - if (ret || bio_add_page(bio, page, PAGE_CACHE_SIZE, 0) < - PAGE_CACHE_SIZE) { - bio_get(bio); - - /* - * inc the count before we submit the bio so - * we know the end IO handler won't happen before - * we inc the count. 
Otherwise, the cb might get - * freed before we're done setting it up - */ - atomic_inc(&cb->pending_bios); - ret = btrfs_bio_wq_end_io(root->fs_info, bio, 0); - BUG_ON(ret); /* -ENOMEM */ - - if (!skip_sum) { - ret = btrfs_csum_one_bio(root, inode, bio, - start, 1); - BUG_ON(ret); /* -ENOMEM */ - } - - ret = btrfs_map_bio(root, WRITE, bio, 0, 1); - BUG_ON(ret); /* -ENOMEM */ - - bio_put(bio); - - bio = compressed_bio_alloc(bdev, first_byte, GFP_NOFS); - BUG_ON(!bio); - bio->bi_private = cb; - bio->bi_end_io = end_compressed_bio_write; - bio_add_page(bio, page, PAGE_CACHE_SIZE, 0); - } - if (bytes_left < PAGE_CACHE_SIZE) { - printk("bytes left %lu compress len %lu nr %lu\n", - bytes_left, cb->compressed_len, cb->nr_pages); - } - bytes_left -= PAGE_CACHE_SIZE; - first_byte += PAGE_CACHE_SIZE; - cond_resched(); - } - bio_get(bio); - - ret = btrfs_bio_wq_end_io(root->fs_info, bio, 0); - BUG_ON(ret); /* -ENOMEM */ - - if (!skip_sum) { - ret = btrfs_csum_one_bio(root, inode, bio, start, 1); - BUG_ON(ret); /* -ENOMEM */ - } - - ret = btrfs_map_bio(root, WRITE, bio, 0, 1); - BUG_ON(ret); /* -ENOMEM */ - - bio_put(bio); - return 0; -} - -static noinline int add_ra_bio_pages(struct inode *inode, - u64 compressed_end, - struct compressed_bio *cb) -{ - unsigned long end_index; - unsigned long pg_index; - u64 last_offset; - u64 isize = i_size_read(inode); - int ret; - struct page *page; - unsigned long nr_pages = 0; - struct extent_map *em; - struct address_space *mapping = inode->i_mapping; - struct extent_map_tree *em_tree; - struct extent_io_tree *tree; - u64 end; - int misses = 0; - - page = cb->orig_bio->bi_io_vec[cb->orig_bio->bi_vcnt - 1].bv_page; - last_offset = (page_offset(page) + PAGE_CACHE_SIZE); - em_tree = &BTRFS_I(inode)->extent_tree; - tree = &BTRFS_I(inode)->io_tree; - - if (isize == 0) - return 0; - - end_index = (i_size_read(inode) - 1) >> PAGE_CACHE_SHIFT; - - while (last_offset < compressed_end) { - pg_index = last_offset >> PAGE_CACHE_SHIFT; - - if (pg_index > end_index) - break; - - rcu_read_lock(); - page = radix_tree_lookup(&mapping->page_tree, pg_index); - rcu_read_unlock(); - if (page) { - misses++; - if (misses > 4) - break; - goto next; - } - - page = __page_cache_alloc(mapping_gfp_mask(mapping) & - ~__GFP_FS); - if (!page) - break; - - if (add_to_page_cache_lru(page, mapping, pg_index, - GFP_NOFS)) { - page_cache_release(page); - goto next; - } - - end = last_offset + PAGE_CACHE_SIZE - 1; - /* - * at this point, we have a locked page in the page cache - * for these bytes in the file. But, we have to make - * sure they map to this compressed extent on disk. 
- */ - set_page_extent_mapped(page); - lock_extent(tree, last_offset, end); - read_lock(&em_tree->lock); - em = lookup_extent_mapping(em_tree, last_offset, - PAGE_CACHE_SIZE); - read_unlock(&em_tree->lock); - - if (!em || last_offset < em->start || - (last_offset + PAGE_CACHE_SIZE > extent_map_end(em)) || - (em->block_start >> 9) != cb->orig_bio->bi_sector) { - free_extent_map(em); - unlock_extent(tree, last_offset, end); - unlock_page(page); - page_cache_release(page); - break; - } - free_extent_map(em); - - if (page->index == end_index) { - char *userpage; - size_t zero_offset = isize & (PAGE_CACHE_SIZE - 1); - - if (zero_offset) { - int zeros; - zeros = PAGE_CACHE_SIZE - zero_offset; - userpage = kmap_atomic(page); - memset(userpage + zero_offset, 0, zeros); - flush_dcache_page(page); - kunmap_atomic(userpage); - } - } - - ret = bio_add_page(cb->orig_bio, page, - PAGE_CACHE_SIZE, 0); - - if (ret == PAGE_CACHE_SIZE) { - nr_pages++; - page_cache_release(page); - } else { - unlock_extent(tree, last_offset, end); - unlock_page(page); - page_cache_release(page); - break; - } -next: - last_offset += PAGE_CACHE_SIZE; - } - return 0; -} - -/* - * for a compressed read, the bio we get passed has all the inode pages - * in it. We don't actually do IO on those pages but allocate new ones - * to hold the compressed pages on disk. - * - * bio->bi_sector points to the compressed extent on disk - * bio->bi_io_vec points to all of the inode pages - * bio->bi_vcnt is a count of pages - * - * After the compressed pages are read, we copy the bytes into the - * bio we were passed and then call the bio end_io calls - */ -int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, - int mirror_num, unsigned long bio_flags) -{ - struct extent_io_tree *tree; - struct extent_map_tree *em_tree; - struct compressed_bio *cb; - struct btrfs_root *root = BTRFS_I(inode)->root; - unsigned long uncompressed_len = bio->bi_vcnt * PAGE_CACHE_SIZE; - unsigned long compressed_len; - unsigned long nr_pages; - unsigned long pg_index; - struct page *page; - struct block_device *bdev; - struct bio *comp_bio; - u64 cur_disk_byte = (u64)bio->bi_sector << 9; - u64 em_len; - u64 em_start; - struct extent_map *em; - int ret = -ENOMEM; - u32 *sums; - - tree = &BTRFS_I(inode)->io_tree; - em_tree = &BTRFS_I(inode)->extent_tree; - - /* we need the actual starting offset of this extent in the file */ - read_lock(&em_tree->lock); - em = lookup_extent_mapping(em_tree, - page_offset(bio->bi_io_vec->bv_page), - PAGE_CACHE_SIZE); - read_unlock(&em_tree->lock); - if (!em) - return -EIO; - - compressed_len = em->block_len; - cb = kmalloc(compressed_bio_size(root, compressed_len), GFP_NOFS); - if (!cb) - goto out; - - atomic_set(&cb->pending_bios, 0); - cb->errors = 0; - cb->inode = inode; - cb->mirror_num = mirror_num; - sums = &cb->sums; - - cb->start = em->orig_start; - em_len = em->len; - em_start = em->start; - - free_extent_map(em); - em = NULL; - - cb->len = uncompressed_len; - cb->compressed_len = compressed_len; - cb->compress_type = extent_compress_type(bio_flags); - cb->orig_bio = bio; - - nr_pages = (compressed_len + PAGE_CACHE_SIZE - 1) / - PAGE_CACHE_SIZE; - cb->compressed_pages = kzalloc(sizeof(struct page *) * nr_pages, - GFP_NOFS); - if (!cb->compressed_pages) - goto fail1; - - bdev = BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev; - - for (pg_index = 0; pg_index < nr_pages; pg_index++) { - cb->compressed_pages[pg_index] = alloc_page(GFP_NOFS | - __GFP_HIGHMEM); - if (!cb->compressed_pages[pg_index]) - goto 
fail2; - } - cb->nr_pages = nr_pages; - - add_ra_bio_pages(inode, em_start + em_len, cb); - - /* include any pages we added in add_ra-bio_pages */ - uncompressed_len = bio->bi_vcnt * PAGE_CACHE_SIZE; - cb->len = uncompressed_len; - - comp_bio = compressed_bio_alloc(bdev, cur_disk_byte, GFP_NOFS); - if (!comp_bio) - goto fail2; - comp_bio->bi_private = cb; - comp_bio->bi_end_io = end_compressed_bio_read; - atomic_inc(&cb->pending_bios); - - for (pg_index = 0; pg_index < nr_pages; pg_index++) { - page = cb->compressed_pages[pg_index]; - page->mapping = inode->i_mapping; - page->index = em_start >> PAGE_CACHE_SHIFT; - - if (comp_bio->bi_size) - ret = tree->ops->merge_bio_hook(page, 0, - PAGE_CACHE_SIZE, - comp_bio, 0); - else - ret = 0; - - page->mapping = NULL; - if (ret || bio_add_page(comp_bio, page, PAGE_CACHE_SIZE, 0) < - PAGE_CACHE_SIZE) { - bio_get(comp_bio); - - ret = btrfs_bio_wq_end_io(root->fs_info, comp_bio, 0); - BUG_ON(ret); /* -ENOMEM */ - - /* - * inc the count before we submit the bio so - * we know the end IO handler won't happen before - * we inc the count. Otherwise, the cb might get - * freed before we're done setting it up - */ - atomic_inc(&cb->pending_bios); - - if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)) { - ret = btrfs_lookup_bio_sums(root, inode, - comp_bio, sums); - BUG_ON(ret); /* -ENOMEM */ - } - sums += (comp_bio->bi_size + root->sectorsize - 1) / - root->sectorsize; - - ret = btrfs_map_bio(root, READ, comp_bio, - mirror_num, 0); - BUG_ON(ret); /* -ENOMEM */ - - bio_put(comp_bio); - - comp_bio = compressed_bio_alloc(bdev, cur_disk_byte, - GFP_NOFS); - BUG_ON(!comp_bio); - comp_bio->bi_private = cb; - comp_bio->bi_end_io = end_compressed_bio_read; - - bio_add_page(comp_bio, page, PAGE_CACHE_SIZE, 0); - } - cur_disk_byte += PAGE_CACHE_SIZE; - } - bio_get(comp_bio); - - ret = btrfs_bio_wq_end_io(root->fs_info, comp_bio, 0); - BUG_ON(ret); /* -ENOMEM */ - - if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)) { - ret = btrfs_lookup_bio_sums(root, inode, comp_bio, sums); - BUG_ON(ret); /* -ENOMEM */ - } - - ret = btrfs_map_bio(root, READ, comp_bio, mirror_num, 0); - BUG_ON(ret); /* -ENOMEM */ - - bio_put(comp_bio); - return 0; - -fail2: - for (pg_index = 0; pg_index < nr_pages; pg_index++) - free_page((unsigned long)cb->compressed_pages[pg_index]); - - kfree(cb->compressed_pages); -fail1: - kfree(cb); -out: - free_extent_map(em); - return ret; -} - -static struct list_head comp_idle_workspace[BTRFS_COMPRESS_TYPES]; -static spinlock_t comp_workspace_lock[BTRFS_COMPRESS_TYPES]; -static int comp_num_workspace[BTRFS_COMPRESS_TYPES]; -static atomic_t comp_alloc_workspace[BTRFS_COMPRESS_TYPES]; -static wait_queue_head_t comp_workspace_wait[BTRFS_COMPRESS_TYPES]; - -struct btrfs_compress_op *btrfs_compress_op[] = { - &btrfs_zlib_compress, - &btrfs_lzo_compress, -}; - -void __init btrfs_init_compress(void) -{ - int i; - - for (i = 0; i < BTRFS_COMPRESS_TYPES; i++) { - INIT_LIST_HEAD(&comp_idle_workspace[i]); - spin_lock_init(&comp_workspace_lock[i]); - atomic_set(&comp_alloc_workspace[i], 0); - init_waitqueue_head(&comp_workspace_wait[i]); - } -} - -/* - * this finds an available workspace or allocates a new one - * ERR_PTR is returned if things go bad. 
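find_workspace()/free_workspace() below implement a bounded per-type pool: reuse an idle workspace when one is parked, grow the pool while no more than about one workspace per online CPU is allocated, otherwise sleep until one is returned. A rough user-space analogue with pthreads; the names and the fixed max_alloc cap are illustrative, not the btrfs API:

	#include <pthread.h>
	#include <stdlib.h>

	struct workspace {
		struct workspace *next;
		char buf[1 << 16];		/* stand-in for a compression scratch buffer */
	};

	static struct workspace *idle_list;	/* LIFO of idle workspaces */
	static int num_idle;
	static int num_alloc;			/* total currently allocated */
	static const int max_alloc = 4;		/* the kernel uses num_online_cpus() */
	static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
	static pthread_cond_t wait_ws = PTHREAD_COND_INITIALIZER;

	static struct workspace *find_workspace(void)
	{
		struct workspace *ws;

		pthread_mutex_lock(&lock);
		for (;;) {
			if (idle_list) {			/* reuse an idle one */
				ws = idle_list;
				idle_list = ws->next;
				num_idle--;
				pthread_mutex_unlock(&lock);
				return ws;
			}
			if (num_alloc < max_alloc)		/* allowed to grow the pool */
				break;
			pthread_cond_wait(&wait_ws, &lock);	/* too many in flight: wait */
		}
		num_alloc++;
		pthread_mutex_unlock(&lock);
		return calloc(1, sizeof(struct workspace));
	}

	static void free_workspace(struct workspace *ws)
	{
		pthread_mutex_lock(&lock);
		if (num_idle < max_alloc) {		/* keep it around for reuse */
			ws->next = idle_list;
			idle_list = ws;
			num_idle++;
			ws = NULL;
		} else {
			num_alloc--;
		}
		pthread_mutex_unlock(&lock);
		free(ws);				/* no-op when it was parked on the list */
		pthread_cond_signal(&wait_ws);
	}

	int main(void)
	{
		struct workspace *ws = find_workspace();
		free_workspace(ws);
		return 0;
	}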
- */ -static struct list_head *find_workspace(int type) -{ - struct list_head *workspace; - int cpus = num_online_cpus(); - int idx = type - 1; - - struct list_head *idle_workspace = &comp_idle_workspace[idx]; - spinlock_t *workspace_lock = &comp_workspace_lock[idx]; - atomic_t *alloc_workspace = &comp_alloc_workspace[idx]; - wait_queue_head_t *workspace_wait = &comp_workspace_wait[idx]; - int *num_workspace = &comp_num_workspace[idx]; -again: - spin_lock(workspace_lock); - if (!list_empty(idle_workspace)) { - workspace = idle_workspace->next; - list_del(workspace); - (*num_workspace)--; - spin_unlock(workspace_lock); - return workspace; - - } - if (atomic_read(alloc_workspace) > cpus) { - DEFINE_WAIT(wait); - - spin_unlock(workspace_lock); - prepare_to_wait(workspace_wait, &wait, TASK_UNINTERRUPTIBLE); - if (atomic_read(alloc_workspace) > cpus && !*num_workspace) - schedule(); - finish_wait(workspace_wait, &wait); - goto again; - } - atomic_inc(alloc_workspace); - spin_unlock(workspace_lock); - - workspace = btrfs_compress_op[idx]->alloc_workspace(); - if (IS_ERR(workspace)) { - atomic_dec(alloc_workspace); - wake_up(workspace_wait); - } - return workspace; -} - -/* - * put a workspace struct back on the list or free it if we have enough - * idle ones sitting around - */ -static void free_workspace(int type, struct list_head *workspace) -{ - int idx = type - 1; - struct list_head *idle_workspace = &comp_idle_workspace[idx]; - spinlock_t *workspace_lock = &comp_workspace_lock[idx]; - atomic_t *alloc_workspace = &comp_alloc_workspace[idx]; - wait_queue_head_t *workspace_wait = &comp_workspace_wait[idx]; - int *num_workspace = &comp_num_workspace[idx]; - - spin_lock(workspace_lock); - if (*num_workspace < num_online_cpus()) { - list_add_tail(workspace, idle_workspace); - (*num_workspace)++; - spin_unlock(workspace_lock); - goto wake; - } - spin_unlock(workspace_lock); - - btrfs_compress_op[idx]->free_workspace(workspace); - atomic_dec(alloc_workspace); -wake: - if (waitqueue_active(workspace_wait)) - wake_up(workspace_wait); -} - -/* - * cleanup function for module exit - */ -static void free_workspaces(void) -{ - struct list_head *workspace; - int i; - - for (i = 0; i < BTRFS_COMPRESS_TYPES; i++) { - while (!list_empty(&comp_idle_workspace[i])) { - workspace = comp_idle_workspace[i].next; - list_del(workspace); - btrfs_compress_op[i]->free_workspace(workspace); - atomic_dec(&comp_alloc_workspace[i]); - } - } -} - -/* - * given an address space and start/len, compress the bytes. - * - * pages are allocated to hold the compressed result and stored - * in 'pages' - * - * out_pages is used to return the number of pages allocated. There - * may be pages allocated even if we return an error - * - * total_in is used to return the number of bytes actually read. It - * may be smaller then len if we had to exit early because we - * ran out of room in the pages array or because we cross the - * max_out threshold. 
- * - * total_out is used to return the total number of compressed bytes - * - * max_out tells us the max number of bytes that we're allowed to - * stuff into pages - */ -int btrfs_compress_pages(int type, struct address_space *mapping, - u64 start, unsigned long len, - struct page **pages, - unsigned long nr_dest_pages, - unsigned long *out_pages, - unsigned long *total_in, - unsigned long *total_out, - unsigned long max_out) -{ - struct list_head *workspace; - int ret; - - workspace = find_workspace(type); - if (IS_ERR(workspace)) - return -1; - - ret = btrfs_compress_op[type-1]->compress_pages(workspace, mapping, - start, len, pages, - nr_dest_pages, out_pages, - total_in, total_out, - max_out); - free_workspace(type, workspace); - return ret; -} - -/* - * pages_in is an array of pages with compressed data. - * - * disk_start is the starting logical offset of this array in the file - * - * bvec is a bio_vec of pages from the file that we want to decompress into - * - * vcnt is the count of pages in the biovec - * - * srclen is the number of bytes in pages_in - * - * The basic idea is that we have a bio that was created by readpages. - * The pages in the bio are for the uncompressed data, and they may not - * be contiguous. They all correspond to the range of bytes covered by - * the compressed extent. - */ -int btrfs_decompress_biovec(int type, struct page **pages_in, u64 disk_start, - struct bio_vec *bvec, int vcnt, size_t srclen) -{ - struct list_head *workspace; - int ret; - - workspace = find_workspace(type); - if (IS_ERR(workspace)) - return -ENOMEM; - - ret = btrfs_compress_op[type-1]->decompress_biovec(workspace, pages_in, - disk_start, - bvec, vcnt, srclen); - free_workspace(type, workspace); - return ret; -} - -/* - * a less complex decompression routine. Our compressed data fits in a - * single page, and we want to read a single page out of it. - * start_byte tells us the offset into the compressed data we're interested in - */ -int btrfs_decompress(int type, unsigned char *data_in, struct page *dest_page, - unsigned long start_byte, size_t srclen, size_t destlen) -{ - struct list_head *workspace; - int ret; - - workspace = find_workspace(type); - if (IS_ERR(workspace)) - return -ENOMEM; - - ret = btrfs_compress_op[type-1]->decompress(workspace, data_in, - dest_page, start_byte, - srclen, destlen); - - free_workspace(type, workspace); - return ret; -} - -void btrfs_exit_compress(void) -{ - free_workspaces(); -} - -/* - * Copy uncompressed data from working buffer to pages. - * - * buf_start is the byte offset we're of the start of our workspace buffer. - * - * total_out is the last byte of the buffer - */ -int btrfs_decompress_buf2page(char *buf, unsigned long buf_start, - unsigned long total_out, u64 disk_start, - struct bio_vec *bvec, int vcnt, - unsigned long *pg_index, - unsigned long *pg_offset) -{ - unsigned long buf_offset; - unsigned long current_buf_start; - unsigned long start_byte; - unsigned long working_bytes = total_out - buf_start; - unsigned long bytes; - char *kaddr; - struct page *page_out = bvec[*pg_index].bv_page; - - /* - * start byte is the first byte of the page we're currently - * copying into relative to the start of the compressed data. 
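The copy loop below is essentially interval arithmetic: the decompressor has produced the window [buf_start, total_out) of the uncompressed stream, and only the overlap of that window with each destination page is copied. A stand-alone sketch of the same arithmetic over a contiguous page array; PAGE_SIZE, copy_window and the sample offsets are made up, and the real code walks a bio_vec whose pages need not be adjacent:

	#include <stdio.h>
	#include <string.h>

	#define PAGE_SIZE 8UL

	static void copy_window(const char *buf, unsigned long buf_start,
				unsigned long total_out,
				char pages[][PAGE_SIZE], unsigned long npages)
	{
		unsigned long pos = buf_start;

		while (pos < total_out) {
			unsigned long pg = pos / PAGE_SIZE;		/* destination page index */
			unsigned long pg_off = pos % PAGE_SIZE;		/* offset inside that page */
			unsigned long bytes = PAGE_SIZE - pg_off;	/* room left in the page */

			if (pg >= npages)
				return;
			if (bytes > total_out - pos)			/* don't run past the window */
				bytes = total_out - pos;
			memcpy(&pages[pg][pg_off], buf + (pos - buf_start), bytes);
			pos += bytes;
		}
	}

	int main(void)
	{
		char pages[3][PAGE_SIZE] = { { 0 } };
		const char *chunk = "ABCDEFGHIJ";	/* bytes 5..14 of the uncompressed stream */

		copy_window(chunk, 5, 15, pages, 3);
		printf("%.3s|%.7s\n", pages[0] + 5, pages[1]);	/* prints "ABC|DEFGHIJ": the chunk straddles a page boundary */
		return 0;
	}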
- */ - start_byte = page_offset(page_out) - disk_start; - - /* we haven't yet hit data corresponding to this page */ - if (total_out <= start_byte) - return 1; - - /* - * the start of the data we care about is offset into - * the middle of our working buffer - */ - if (total_out > start_byte && buf_start < start_byte) { - buf_offset = start_byte - buf_start; - working_bytes -= buf_offset; - } else { - buf_offset = 0; - } - current_buf_start = buf_start; - - /* copy bytes from the working buffer into the pages */ - while (working_bytes > 0) { - bytes = min(PAGE_CACHE_SIZE - *pg_offset, - PAGE_CACHE_SIZE - buf_offset); - bytes = min(bytes, working_bytes); - kaddr = kmap_atomic(page_out); - memcpy(kaddr + *pg_offset, buf + buf_offset, bytes); - kunmap_atomic(kaddr); - flush_dcache_page(page_out); - - *pg_offset += bytes; - buf_offset += bytes; - working_bytes -= bytes; - current_buf_start += bytes; - - /* check if we need to pick another page */ - if (*pg_offset == PAGE_CACHE_SIZE) { - (*pg_index)++; - if (*pg_index >= vcnt) - return 0; - - page_out = bvec[*pg_index].bv_page; - *pg_offset = 0; - start_byte = page_offset(page_out) - disk_start; - - /* - * make sure our new page is covered by this - * working buffer - */ - if (total_out <= start_byte) - return 1; - - /* - * the next page in the biovec might not be adjacent - * to the last page, but it might still be found - * inside this working buffer. bump our offset pointer - */ - if (total_out > start_byte && - current_buf_start < start_byte) { - buf_offset = start_byte - buf_start; - working_bytes = total_out - start_byte; - current_buf_start = buf_start + buf_offset; - } - } - } - - return 1; -} diff --git a/ANDROID_3.4.5/fs/btrfs/compression.h b/ANDROID_3.4.5/fs/btrfs/compression.h deleted file mode 100644 index 9afb0a62..00000000 --- a/ANDROID_3.4.5/fs/btrfs/compression.h +++ /dev/null @@ -1,83 +0,0 @@ -/* - * Copyright (C) 2008 Oracle. All rights reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public - * License v2 as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public - * License along with this program; if not, write to the - * Free Software Foundation, Inc., 59 Temple Place - Suite 330, - * Boston, MA 021110-1307, USA. 
- */ - -#ifndef __BTRFS_COMPRESSION_ -#define __BTRFS_COMPRESSION_ - -void btrfs_init_compress(void); -void btrfs_exit_compress(void); - -int btrfs_compress_pages(int type, struct address_space *mapping, - u64 start, unsigned long len, - struct page **pages, - unsigned long nr_dest_pages, - unsigned long *out_pages, - unsigned long *total_in, - unsigned long *total_out, - unsigned long max_out); -int btrfs_decompress_biovec(int type, struct page **pages_in, u64 disk_start, - struct bio_vec *bvec, int vcnt, size_t srclen); -int btrfs_decompress(int type, unsigned char *data_in, struct page *dest_page, - unsigned long start_byte, size_t srclen, size_t destlen); -int btrfs_decompress_buf2page(char *buf, unsigned long buf_start, - unsigned long total_out, u64 disk_start, - struct bio_vec *bvec, int vcnt, - unsigned long *pg_index, - unsigned long *pg_offset); - -int btrfs_submit_compressed_write(struct inode *inode, u64 start, - unsigned long len, u64 disk_start, - unsigned long compressed_len, - struct page **compressed_pages, - unsigned long nr_pages); -int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, - int mirror_num, unsigned long bio_flags); - -struct btrfs_compress_op { - struct list_head *(*alloc_workspace)(void); - - void (*free_workspace)(struct list_head *workspace); - - int (*compress_pages)(struct list_head *workspace, - struct address_space *mapping, - u64 start, unsigned long len, - struct page **pages, - unsigned long nr_dest_pages, - unsigned long *out_pages, - unsigned long *total_in, - unsigned long *total_out, - unsigned long max_out); - - int (*decompress_biovec)(struct list_head *workspace, - struct page **pages_in, - u64 disk_start, - struct bio_vec *bvec, - int vcnt, - size_t srclen); - - int (*decompress)(struct list_head *workspace, - unsigned char *data_in, - struct page *dest_page, - unsigned long start_byte, - size_t srclen, size_t destlen); -}; - -extern struct btrfs_compress_op btrfs_zlib_compress; -extern struct btrfs_compress_op btrfs_lzo_compress; - -#endif diff --git a/ANDROID_3.4.5/fs/btrfs/ctree.c b/ANDROID_3.4.5/fs/btrfs/ctree.c deleted file mode 100644 index 4106264f..00000000 --- a/ANDROID_3.4.5/fs/btrfs/ctree.c +++ /dev/null @@ -1,4382 +0,0 @@ -/* - * Copyright (C) 2007,2008 Oracle. All rights reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public - * License v2 as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public - * License along with this program; if not, write to the - * Free Software Foundation, Inc., 59 Temple Place - Suite 330, - * Boston, MA 021110-1307, USA. 
- */ - -#include <linux/sched.h> -#include <linux/slab.h> -#include "ctree.h" -#include "disk-io.h" -#include "transaction.h" -#include "print-tree.h" -#include "locking.h" - -static int split_node(struct btrfs_trans_handle *trans, struct btrfs_root - *root, struct btrfs_path *path, int level); -static int split_leaf(struct btrfs_trans_handle *trans, struct btrfs_root - *root, struct btrfs_key *ins_key, - struct btrfs_path *path, int data_size, int extend); -static int push_node_left(struct btrfs_trans_handle *trans, - struct btrfs_root *root, struct extent_buffer *dst, - struct extent_buffer *src, int empty); -static int balance_node_right(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct extent_buffer *dst_buf, - struct extent_buffer *src_buf); -static void del_ptr(struct btrfs_trans_handle *trans, struct btrfs_root *root, - struct btrfs_path *path, int level, int slot); - -struct btrfs_path *btrfs_alloc_path(void) -{ - struct btrfs_path *path; - path = kmem_cache_zalloc(btrfs_path_cachep, GFP_NOFS); - return path; -} - -/* - * set all locked nodes in the path to blocking locks. This should - * be done before scheduling - */ -noinline void btrfs_set_path_blocking(struct btrfs_path *p) -{ - int i; - for (i = 0; i < BTRFS_MAX_LEVEL; i++) { - if (!p->nodes[i] || !p->locks[i]) - continue; - btrfs_set_lock_blocking_rw(p->nodes[i], p->locks[i]); - if (p->locks[i] == BTRFS_READ_LOCK) - p->locks[i] = BTRFS_READ_LOCK_BLOCKING; - else if (p->locks[i] == BTRFS_WRITE_LOCK) - p->locks[i] = BTRFS_WRITE_LOCK_BLOCKING; - } -} - -/* - * reset all the locked nodes in the patch to spinning locks. - * - * held is used to keep lockdep happy, when lockdep is enabled - * we set held to a blocking lock before we go around and - * retake all the spinlocks in the path. You can safely use NULL - * for held - */ -noinline void btrfs_clear_path_blocking(struct btrfs_path *p, - struct extent_buffer *held, int held_rw) -{ - int i; - -#ifdef CONFIG_DEBUG_LOCK_ALLOC - /* lockdep really cares that we take all of these spinlocks - * in the right order. If any of the locks in the path are not - * currently blocking, it is going to complain. So, make really - * really sure by forcing the path to blocking before we clear - * the path blocking. - */ - if (held) { - btrfs_set_lock_blocking_rw(held, held_rw); - if (held_rw == BTRFS_WRITE_LOCK) - held_rw = BTRFS_WRITE_LOCK_BLOCKING; - else if (held_rw == BTRFS_READ_LOCK) - held_rw = BTRFS_READ_LOCK_BLOCKING; - } - btrfs_set_path_blocking(p); -#endif - - for (i = BTRFS_MAX_LEVEL - 1; i >= 0; i--) { - if (p->nodes[i] && p->locks[i]) { - btrfs_clear_lock_blocking_rw(p->nodes[i], p->locks[i]); - if (p->locks[i] == BTRFS_WRITE_LOCK_BLOCKING) - p->locks[i] = BTRFS_WRITE_LOCK; - else if (p->locks[i] == BTRFS_READ_LOCK_BLOCKING) - p->locks[i] = BTRFS_READ_LOCK; - } - } - -#ifdef CONFIG_DEBUG_LOCK_ALLOC - if (held) - btrfs_clear_lock_blocking_rw(held, held_rw); -#endif -} - -/* this also releases the path */ -void btrfs_free_path(struct btrfs_path *p) -{ - if (!p) - return; - btrfs_release_path(p); - kmem_cache_free(btrfs_path_cachep, p); -} - -/* - * path release drops references on the extent buffers in the path - * and it drops any locks held by this path - * - * It is safe to call this on paths that no locks or extent buffers held. 
- */ -noinline void btrfs_release_path(struct btrfs_path *p) -{ - int i; - - for (i = 0; i < BTRFS_MAX_LEVEL; i++) { - p->slots[i] = 0; - if (!p->nodes[i]) - continue; - if (p->locks[i]) { - btrfs_tree_unlock_rw(p->nodes[i], p->locks[i]); - p->locks[i] = 0; - } - free_extent_buffer(p->nodes[i]); - p->nodes[i] = NULL; - } -} - -/* - * safely gets a reference on the root node of a tree. A lock - * is not taken, so a concurrent writer may put a different node - * at the root of the tree. See btrfs_lock_root_node for the - * looping required. - * - * The extent buffer returned by this has a reference taken, so - * it won't disappear. It may stop being the root of the tree - * at any time because there are no locks held. - */ -struct extent_buffer *btrfs_root_node(struct btrfs_root *root) -{ - struct extent_buffer *eb; - - while (1) { - rcu_read_lock(); - eb = rcu_dereference(root->node); - - /* - * RCU really hurts here, we could free up the root node because - * it was cow'ed but we may not get the new root node yet so do - * the inc_not_zero dance and if it doesn't work then - * synchronize_rcu and try again. - */ - if (atomic_inc_not_zero(&eb->refs)) { - rcu_read_unlock(); - break; - } - rcu_read_unlock(); - synchronize_rcu(); - } - return eb; -} - -/* loop around taking references on and locking the root node of the - * tree until you end up with a lock on the root. A locked buffer - * is returned, with a reference held. - */ -struct extent_buffer *btrfs_lock_root_node(struct btrfs_root *root) -{ - struct extent_buffer *eb; - - while (1) { - eb = btrfs_root_node(root); - btrfs_tree_lock(eb); - if (eb == root->node) - break; - btrfs_tree_unlock(eb); - free_extent_buffer(eb); - } - return eb; -} - -/* loop around taking references on and locking the root node of the - * tree until you end up with a lock on the root. A locked buffer - * is returned, with a reference held. - */ -struct extent_buffer *btrfs_read_lock_root_node(struct btrfs_root *root) -{ - struct extent_buffer *eb; - - while (1) { - eb = btrfs_root_node(root); - btrfs_tree_read_lock(eb); - if (eb == root->node) - break; - btrfs_tree_read_unlock(eb); - free_extent_buffer(eb); - } - return eb; -} - -/* cowonly root (everything not a reference counted cow subvolume), just get - * put onto a simple dirty list. transaction.c walks this to make sure they - * get properly updated on disk. - */ -static void add_root_to_dirty_list(struct btrfs_root *root) -{ - spin_lock(&root->fs_info->trans_lock); - if (root->track_dirty && list_empty(&root->dirty_list)) { - list_add(&root->dirty_list, - &root->fs_info->dirty_cowonly_roots); - } - spin_unlock(&root->fs_info->trans_lock); -} - -/* - * used by snapshot creation to make a copy of a root for a tree with - * a given objectid. The buffer with the new root node is returned in - * cow_ret, and this func returns zero on success or a negative error code. 
- */ -int btrfs_copy_root(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct extent_buffer *buf, - struct extent_buffer **cow_ret, u64 new_root_objectid) -{ - struct extent_buffer *cow; - int ret = 0; - int level; - struct btrfs_disk_key disk_key; - - WARN_ON(root->ref_cows && trans->transid != - root->fs_info->running_transaction->transid); - WARN_ON(root->ref_cows && trans->transid != root->last_trans); - - level = btrfs_header_level(buf); - if (level == 0) - btrfs_item_key(buf, &disk_key, 0); - else - btrfs_node_key(buf, &disk_key, 0); - - cow = btrfs_alloc_free_block(trans, root, buf->len, 0, - new_root_objectid, &disk_key, level, - buf->start, 0, 1); - if (IS_ERR(cow)) - return PTR_ERR(cow); - - copy_extent_buffer(cow, buf, 0, 0, cow->len); - btrfs_set_header_bytenr(cow, cow->start); - btrfs_set_header_generation(cow, trans->transid); - btrfs_set_header_backref_rev(cow, BTRFS_MIXED_BACKREF_REV); - btrfs_clear_header_flag(cow, BTRFS_HEADER_FLAG_WRITTEN | - BTRFS_HEADER_FLAG_RELOC); - if (new_root_objectid == BTRFS_TREE_RELOC_OBJECTID) - btrfs_set_header_flag(cow, BTRFS_HEADER_FLAG_RELOC); - else - btrfs_set_header_owner(cow, new_root_objectid); - - write_extent_buffer(cow, root->fs_info->fsid, - (unsigned long)btrfs_header_fsid(cow), - BTRFS_FSID_SIZE); - - WARN_ON(btrfs_header_generation(buf) > trans->transid); - if (new_root_objectid == BTRFS_TREE_RELOC_OBJECTID) - ret = btrfs_inc_ref(trans, root, cow, 1, 1); - else - ret = btrfs_inc_ref(trans, root, cow, 0, 1); - - if (ret) - return ret; - - btrfs_mark_buffer_dirty(cow); - *cow_ret = cow; - return 0; -} - -/* - * check if the tree block can be shared by multiple trees - */ -int btrfs_block_can_be_shared(struct btrfs_root *root, - struct extent_buffer *buf) -{ - /* - * Tree blocks not in refernece counted trees and tree roots - * are never shared. If a block was allocated after the last - * snapshot and the block was not allocated by tree relocation, - * we know the block is not shared. - */ - if (root->ref_cows && - buf != root->node && buf != root->commit_root && - (btrfs_header_generation(buf) <= - btrfs_root_last_snapshot(&root->root_item) || - btrfs_header_flag(buf, BTRFS_HEADER_FLAG_RELOC))) - return 1; -#ifdef BTRFS_COMPAT_EXTENT_TREE_V0 - if (root->ref_cows && - btrfs_header_backref_rev(buf) < BTRFS_MIXED_BACKREF_REV) - return 1; -#endif - return 0; -} - -static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct extent_buffer *buf, - struct extent_buffer *cow, - int *last_ref) -{ - u64 refs; - u64 owner; - u64 flags; - u64 new_flags = 0; - int ret; - - /* - * Backrefs update rules: - * - * Always use full backrefs for extent pointers in tree block - * allocated by tree relocation. - * - * If a shared tree block is no longer referenced by its owner - * tree (btrfs_header_owner(buf) == root->root_key.objectid), - * use full backrefs for extent pointers in tree block. - * - * If a tree block is been relocating - * (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID), - * use full backrefs for extent pointers in tree block. - * The reason for this is some operations (such as drop tree) - * are only allowed for blocks use full backrefs. 
- */ - - if (btrfs_block_can_be_shared(root, buf)) { - ret = btrfs_lookup_extent_info(trans, root, buf->start, - buf->len, &refs, &flags); - if (ret) - return ret; - if (refs == 0) { - ret = -EROFS; - btrfs_std_error(root->fs_info, ret); - return ret; - } - } else { - refs = 1; - if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID || - btrfs_header_backref_rev(buf) < BTRFS_MIXED_BACKREF_REV) - flags = BTRFS_BLOCK_FLAG_FULL_BACKREF; - else - flags = 0; - } - - owner = btrfs_header_owner(buf); - BUG_ON(owner == BTRFS_TREE_RELOC_OBJECTID && - !(flags & BTRFS_BLOCK_FLAG_FULL_BACKREF)); - - if (refs > 1) { - if ((owner == root->root_key.objectid || - root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) && - !(flags & BTRFS_BLOCK_FLAG_FULL_BACKREF)) { - ret = btrfs_inc_ref(trans, root, buf, 1, 1); - BUG_ON(ret); /* -ENOMEM */ - - if (root->root_key.objectid == - BTRFS_TREE_RELOC_OBJECTID) { - ret = btrfs_dec_ref(trans, root, buf, 0, 1); - BUG_ON(ret); /* -ENOMEM */ - ret = btrfs_inc_ref(trans, root, cow, 1, 1); - BUG_ON(ret); /* -ENOMEM */ - } - new_flags |= BTRFS_BLOCK_FLAG_FULL_BACKREF; - } else { - - if (root->root_key.objectid == - BTRFS_TREE_RELOC_OBJECTID) - ret = btrfs_inc_ref(trans, root, cow, 1, 1); - else - ret = btrfs_inc_ref(trans, root, cow, 0, 1); - BUG_ON(ret); /* -ENOMEM */ - } - if (new_flags != 0) { - ret = btrfs_set_disk_extent_flags(trans, root, - buf->start, - buf->len, - new_flags, 0); - if (ret) - return ret; - } - } else { - if (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF) { - if (root->root_key.objectid == - BTRFS_TREE_RELOC_OBJECTID) - ret = btrfs_inc_ref(trans, root, cow, 1, 1); - else - ret = btrfs_inc_ref(trans, root, cow, 0, 1); - BUG_ON(ret); /* -ENOMEM */ - ret = btrfs_dec_ref(trans, root, buf, 1, 1); - BUG_ON(ret); /* -ENOMEM */ - } - clean_tree_block(trans, root, buf); - *last_ref = 1; - } - return 0; -} - -/* - * does the dirty work in cow of a single block. The parent block (if - * supplied) is updated to point to the new cow copy. The new buffer is marked - * dirty and returned locked. If you modify the block it needs to be marked - * dirty again. - * - * search_start -- an allocation hint for the new block - * - * empty_size -- a hint that you plan on doing more cow. This is the size in - * bytes the allocator should try to find free next to the block it returns. - * This is just a hint and may be ignored by the allocator. 
- */ -static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct extent_buffer *buf, - struct extent_buffer *parent, int parent_slot, - struct extent_buffer **cow_ret, - u64 search_start, u64 empty_size) -{ - struct btrfs_disk_key disk_key; - struct extent_buffer *cow; - int level, ret; - int last_ref = 0; - int unlock_orig = 0; - u64 parent_start; - - if (*cow_ret == buf) - unlock_orig = 1; - - btrfs_assert_tree_locked(buf); - - WARN_ON(root->ref_cows && trans->transid != - root->fs_info->running_transaction->transid); - WARN_ON(root->ref_cows && trans->transid != root->last_trans); - - level = btrfs_header_level(buf); - - if (level == 0) - btrfs_item_key(buf, &disk_key, 0); - else - btrfs_node_key(buf, &disk_key, 0); - - if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) { - if (parent) - parent_start = parent->start; - else - parent_start = 0; - } else - parent_start = 0; - - cow = btrfs_alloc_free_block(trans, root, buf->len, parent_start, - root->root_key.objectid, &disk_key, - level, search_start, empty_size, 1); - if (IS_ERR(cow)) - return PTR_ERR(cow); - - /* cow is set to blocking by btrfs_init_new_buffer */ - - copy_extent_buffer(cow, buf, 0, 0, cow->len); - btrfs_set_header_bytenr(cow, cow->start); - btrfs_set_header_generation(cow, trans->transid); - btrfs_set_header_backref_rev(cow, BTRFS_MIXED_BACKREF_REV); - btrfs_clear_header_flag(cow, BTRFS_HEADER_FLAG_WRITTEN | - BTRFS_HEADER_FLAG_RELOC); - if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) - btrfs_set_header_flag(cow, BTRFS_HEADER_FLAG_RELOC); - else - btrfs_set_header_owner(cow, root->root_key.objectid); - - write_extent_buffer(cow, root->fs_info->fsid, - (unsigned long)btrfs_header_fsid(cow), - BTRFS_FSID_SIZE); - - ret = update_ref_for_cow(trans, root, buf, cow, &last_ref); - if (ret) { - btrfs_abort_transaction(trans, root, ret); - return ret; - } - - if (root->ref_cows) - btrfs_reloc_cow_block(trans, root, buf, cow); - - if (buf == root->node) { - WARN_ON(parent && parent != buf); - if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID || - btrfs_header_backref_rev(buf) < BTRFS_MIXED_BACKREF_REV) - parent_start = buf->start; - else - parent_start = 0; - - extent_buffer_get(cow); - rcu_assign_pointer(root->node, cow); - - btrfs_free_tree_block(trans, root, buf, parent_start, - last_ref, 1); - free_extent_buffer(buf); - add_root_to_dirty_list(root); - } else { - if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) - parent_start = parent->start; - else - parent_start = 0; - - WARN_ON(trans->transid != btrfs_header_generation(parent)); - btrfs_set_node_blockptr(parent, parent_slot, - cow->start); - btrfs_set_node_ptr_generation(parent, parent_slot, - trans->transid); - btrfs_mark_buffer_dirty(parent); - btrfs_free_tree_block(trans, root, buf, parent_start, - last_ref, 1); - } - if (unlock_orig) - btrfs_tree_unlock(buf); - free_extent_buffer_stale(buf); - btrfs_mark_buffer_dirty(cow); - *cow_ret = cow; - return 0; -} - -static inline int should_cow_block(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct extent_buffer *buf) -{ - /* ensure we can see the force_cow */ - smp_rmb(); - - /* - * We do not need to cow a block if - * 1) this block is not created or changed in this transaction; - * 2) this block does not belong to TREE_RELOC tree; - * 3) the root is not forced COW. 
- * - * What is forced COW: - * when we create snapshot during commiting the transaction, - * after we've finished coping src root, we must COW the shared - * block to ensure the metadata consistency. - */ - if (btrfs_header_generation(buf) == trans->transid && - !btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN) && - !(root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID && - btrfs_header_flag(buf, BTRFS_HEADER_FLAG_RELOC)) && - !root->force_cow) - return 0; - return 1; -} - -/* - * cows a single block, see __btrfs_cow_block for the real work. - * This version of it has extra checks so that a block isn't cow'd more than - * once per transaction, as long as it hasn't been written yet - */ -noinline int btrfs_cow_block(struct btrfs_trans_handle *trans, - struct btrfs_root *root, struct extent_buffer *buf, - struct extent_buffer *parent, int parent_slot, - struct extent_buffer **cow_ret) -{ - u64 search_start; - int ret; - - if (trans->transaction != root->fs_info->running_transaction) { - printk(KERN_CRIT "trans %llu running %llu\n", - (unsigned long long)trans->transid, - (unsigned long long) - root->fs_info->running_transaction->transid); - WARN_ON(1); - } - if (trans->transid != root->fs_info->generation) { - printk(KERN_CRIT "trans %llu running %llu\n", - (unsigned long long)trans->transid, - (unsigned long long)root->fs_info->generation); - WARN_ON(1); - } - - if (!should_cow_block(trans, root, buf)) { - *cow_ret = buf; - return 0; - } - - search_start = buf->start & ~((u64)(1024 * 1024 * 1024) - 1); - - if (parent) - btrfs_set_lock_blocking(parent); - btrfs_set_lock_blocking(buf); - - ret = __btrfs_cow_block(trans, root, buf, parent, - parent_slot, cow_ret, search_start, 0); - - trace_btrfs_cow_block(root, buf, *cow_ret); - - return ret; -} - -/* - * helper function for defrag to decide if two blocks pointed to by a - * node are actually close by - */ -static int close_blocks(u64 blocknr, u64 other, u32 blocksize) -{ - if (blocknr < other && other - (blocknr + blocksize) < 32768) - return 1; - if (blocknr > other && blocknr - (other + blocksize) < 32768) - return 1; - return 0; -} - -/* - * compare two keys in a memcmp fashion - */ -static int comp_keys(struct btrfs_disk_key *disk, struct btrfs_key *k2) -{ - struct btrfs_key k1; - - btrfs_disk_key_to_cpu(&k1, disk); - - return btrfs_comp_cpu_keys(&k1, k2); -} - -/* - * same as comp_keys only with two btrfs_key's - */ -int btrfs_comp_cpu_keys(struct btrfs_key *k1, struct btrfs_key *k2) -{ - if (k1->objectid > k2->objectid) - return 1; - if (k1->objectid < k2->objectid) - return -1; - if (k1->type > k2->type) - return 1; - if (k1->type < k2->type) - return -1; - if (k1->offset > k2->offset) - return 1; - if (k1->offset < k2->offset) - return -1; - return 0; -} - -/* - * this is used by the defrag code to go through all the - * leaves pointed to by a node and reallocate them so that - * disk order is close to key order - */ -int btrfs_realloc_node(struct btrfs_trans_handle *trans, - struct btrfs_root *root, struct extent_buffer *parent, - int start_slot, int cache_only, u64 *last_ret, - struct btrfs_key *progress) -{ - struct extent_buffer *cur; - u64 blocknr; - u64 gen; - u64 search_start = *last_ret; - u64 last_block = 0; - u64 other; - u32 parent_nritems; - int end_slot; - int i; - int err = 0; - int parent_level; - int uptodate; - u32 blocksize; - int progress_passed = 0; - struct btrfs_disk_key disk_key; - - parent_level = btrfs_header_level(parent); - if (cache_only && parent_level != 1) - return 0; - - if (trans->transaction != 
root->fs_info->running_transaction) - WARN_ON(1); - if (trans->transid != root->fs_info->generation) - WARN_ON(1); - - parent_nritems = btrfs_header_nritems(parent); - blocksize = btrfs_level_size(root, parent_level - 1); - end_slot = parent_nritems; - - if (parent_nritems == 1) - return 0; - - btrfs_set_lock_blocking(parent); - - for (i = start_slot; i < end_slot; i++) { - int close = 1; - - btrfs_node_key(parent, &disk_key, i); - if (!progress_passed && comp_keys(&disk_key, progress) < 0) - continue; - - progress_passed = 1; - blocknr = btrfs_node_blockptr(parent, i); - gen = btrfs_node_ptr_generation(parent, i); - if (last_block == 0) - last_block = blocknr; - - if (i > 0) { - other = btrfs_node_blockptr(parent, i - 1); - close = close_blocks(blocknr, other, blocksize); - } - if (!close && i < end_slot - 2) { - other = btrfs_node_blockptr(parent, i + 1); - close = close_blocks(blocknr, other, blocksize); - } - if (close) { - last_block = blocknr; - continue; - } - - cur = btrfs_find_tree_block(root, blocknr, blocksize); - if (cur) - uptodate = btrfs_buffer_uptodate(cur, gen, 0); - else - uptodate = 0; - if (!cur || !uptodate) { - if (cache_only) { - free_extent_buffer(cur); - continue; - } - if (!cur) { - cur = read_tree_block(root, blocknr, - blocksize, gen); - if (!cur) - return -EIO; - } else if (!uptodate) { - btrfs_read_buffer(cur, gen); - } - } - if (search_start == 0) - search_start = last_block; - - btrfs_tree_lock(cur); - btrfs_set_lock_blocking(cur); - err = __btrfs_cow_block(trans, root, cur, parent, i, - &cur, search_start, - min(16 * blocksize, - (end_slot - i) * blocksize)); - if (err) { - btrfs_tree_unlock(cur); - free_extent_buffer(cur); - break; - } - search_start = cur->start; - last_block = cur->start; - *last_ret = search_start; - btrfs_tree_unlock(cur); - free_extent_buffer(cur); - } - return err; -} - -/* - * The leaf data grows from end-to-front in the node. - * this returns the address of the start of the last item, - * which is the stop of the leaf data stack - */ -static inline unsigned int leaf_data_end(struct btrfs_root *root, - struct extent_buffer *leaf) -{ - u32 nr = btrfs_header_nritems(leaf); - if (nr == 0) - return BTRFS_LEAF_DATA_SIZE(root); - return btrfs_item_offset_nr(leaf, nr - 1); -} - - -/* - * search for key in the extent_buffer. The items start at offset p, - * and they are item_size apart. There are 'max' items in p. - * - * the slot in the array is returned via slot, and it points to - * the place where you would insert key if it is not found in - * the array. 
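generic_bin_search() follows the usual "return the insertion slot on a miss" contract: 0 and the matching slot on an exact hit, otherwise 1 with *slot set to where the key would be inserted. A minimal sketch of that contract over plain integers (the real code compares struct btrfs_disk_key items mapped out of the extent buffer):

	#include <stdio.h>

	static int bin_search(const int *keys, int nr, int key, int *slot)
	{
		int low = 0, high = nr;

		while (low < high) {
			int mid = low + (high - low) / 2;

			if (keys[mid] < key)
				low = mid + 1;
			else if (keys[mid] > key)
				high = mid;
			else {
				*slot = mid;	/* exact match */
				return 0;
			}
		}
		*slot = low;			/* insertion point, possibly == nr */
		return 1;
	}

	int main(void)
	{
		const int keys[] = { 10, 20, 30, 40 };
		int slot;

		printf("%d %d\n", bin_search(keys, 4, 30, &slot), slot);	/* 0 2: exact match */
		printf("%d %d\n", bin_search(keys, 4, 25, &slot), slot);	/* 1 2: insert before 30 */
		printf("%d %d\n", bin_search(keys, 4, 99, &slot), slot);	/* 1 4: past the end */
		return 0;
	}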
- * - * slot may point to max if the key is bigger than all of the keys - */ -static noinline int generic_bin_search(struct extent_buffer *eb, - unsigned long p, - int item_size, struct btrfs_key *key, - int max, int *slot) -{ - int low = 0; - int high = max; - int mid; - int ret; - struct btrfs_disk_key *tmp = NULL; - struct btrfs_disk_key unaligned; - unsigned long offset; - char *kaddr = NULL; - unsigned long map_start = 0; - unsigned long map_len = 0; - int err; - - while (low < high) { - mid = (low + high) / 2; - offset = p + mid * item_size; - - if (!kaddr || offset < map_start || - (offset + sizeof(struct btrfs_disk_key)) > - map_start + map_len) { - - err = map_private_extent_buffer(eb, offset, - sizeof(struct btrfs_disk_key), - &kaddr, &map_start, &map_len); - - if (!err) { - tmp = (struct btrfs_disk_key *)(kaddr + offset - - map_start); - } else { - read_extent_buffer(eb, &unaligned, - offset, sizeof(unaligned)); - tmp = &unaligned; - } - - } else { - tmp = (struct btrfs_disk_key *)(kaddr + offset - - map_start); - } - ret = comp_keys(tmp, key); - - if (ret < 0) - low = mid + 1; - else if (ret > 0) - high = mid; - else { - *slot = mid; - return 0; - } - } - *slot = low; - return 1; -} - -/* - * simple bin_search frontend that does the right thing for - * leaves vs nodes - */ -static int bin_search(struct extent_buffer *eb, struct btrfs_key *key, - int level, int *slot) -{ - if (level == 0) { - return generic_bin_search(eb, - offsetof(struct btrfs_leaf, items), - sizeof(struct btrfs_item), - key, btrfs_header_nritems(eb), - slot); - } else { - return generic_bin_search(eb, - offsetof(struct btrfs_node, ptrs), - sizeof(struct btrfs_key_ptr), - key, btrfs_header_nritems(eb), - slot); - } - return -1; -} - -int btrfs_bin_search(struct extent_buffer *eb, struct btrfs_key *key, - int level, int *slot) -{ - return bin_search(eb, key, level, slot); -} - -static void root_add_used(struct btrfs_root *root, u32 size) -{ - spin_lock(&root->accounting_lock); - btrfs_set_root_used(&root->root_item, - btrfs_root_used(&root->root_item) + size); - spin_unlock(&root->accounting_lock); -} - -static void root_sub_used(struct btrfs_root *root, u32 size) -{ - spin_lock(&root->accounting_lock); - btrfs_set_root_used(&root->root_item, - btrfs_root_used(&root->root_item) - size); - spin_unlock(&root->accounting_lock); -} - -/* given a node and slot number, this reads the blocks it points to. The - * extent buffer is returned with a reference taken (but unlocked). - * NULL is returned on error. - */ -static noinline struct extent_buffer *read_node_slot(struct btrfs_root *root, - struct extent_buffer *parent, int slot) -{ - int level = btrfs_header_level(parent); - if (slot < 0) - return NULL; - if (slot >= btrfs_header_nritems(parent)) - return NULL; - - BUG_ON(level == 0); - - return read_tree_block(root, btrfs_node_blockptr(parent, slot), - btrfs_level_size(root, level - 1), - btrfs_node_ptr_generation(parent, slot)); -} - -/* - * node level balancing, used to make sure nodes are in proper order for - * item deletion. We balance from the top down, so we have to make sure - * that a deletion won't leave an node completely empty later on. 
- */ -static noinline int balance_level(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct btrfs_path *path, int level) -{ - struct extent_buffer *right = NULL; - struct extent_buffer *mid; - struct extent_buffer *left = NULL; - struct extent_buffer *parent = NULL; - int ret = 0; - int wret; - int pslot; - int orig_slot = path->slots[level]; - u64 orig_ptr; - - if (level == 0) - return 0; - - mid = path->nodes[level]; - - WARN_ON(path->locks[level] != BTRFS_WRITE_LOCK && - path->locks[level] != BTRFS_WRITE_LOCK_BLOCKING); - WARN_ON(btrfs_header_generation(mid) != trans->transid); - - orig_ptr = btrfs_node_blockptr(mid, orig_slot); - - if (level < BTRFS_MAX_LEVEL - 1) { - parent = path->nodes[level + 1]; - pslot = path->slots[level + 1]; - } - - /* - * deal with the case where there is only one pointer in the root - * by promoting the node below to a root - */ - if (!parent) { - struct extent_buffer *child; - - if (btrfs_header_nritems(mid) != 1) - return 0; - - /* promote the child to a root */ - child = read_node_slot(root, mid, 0); - if (!child) { - ret = -EROFS; - btrfs_std_error(root->fs_info, ret); - goto enospc; - } - - btrfs_tree_lock(child); - btrfs_set_lock_blocking(child); - ret = btrfs_cow_block(trans, root, child, mid, 0, &child); - if (ret) { - btrfs_tree_unlock(child); - free_extent_buffer(child); - goto enospc; - } - - rcu_assign_pointer(root->node, child); - - add_root_to_dirty_list(root); - btrfs_tree_unlock(child); - - path->locks[level] = 0; - path->nodes[level] = NULL; - clean_tree_block(trans, root, mid); - btrfs_tree_unlock(mid); - /* once for the path */ - free_extent_buffer(mid); - - root_sub_used(root, mid->len); - btrfs_free_tree_block(trans, root, mid, 0, 1, 0); - /* once for the root ptr */ - free_extent_buffer_stale(mid); - return 0; - } - if (btrfs_header_nritems(mid) > - BTRFS_NODEPTRS_PER_BLOCK(root) / 4) - return 0; - - btrfs_header_nritems(mid); - - left = read_node_slot(root, parent, pslot - 1); - if (left) { - btrfs_tree_lock(left); - btrfs_set_lock_blocking(left); - wret = btrfs_cow_block(trans, root, left, - parent, pslot - 1, &left); - if (wret) { - ret = wret; - goto enospc; - } - } - right = read_node_slot(root, parent, pslot + 1); - if (right) { - btrfs_tree_lock(right); - btrfs_set_lock_blocking(right); - wret = btrfs_cow_block(trans, root, right, - parent, pslot + 1, &right); - if (wret) { - ret = wret; - goto enospc; - } - } - - /* first, try to make some room in the middle buffer */ - if (left) { - orig_slot += btrfs_header_nritems(left); - wret = push_node_left(trans, root, left, mid, 1); - if (wret < 0) - ret = wret; - btrfs_header_nritems(mid); - } - - /* - * then try to empty the right most buffer into the middle - */ - if (right) { - wret = push_node_left(trans, root, mid, right, 1); - if (wret < 0 && wret != -ENOSPC) - ret = wret; - if (btrfs_header_nritems(right) == 0) { - clean_tree_block(trans, root, right); - btrfs_tree_unlock(right); - del_ptr(trans, root, path, level + 1, pslot + 1); - root_sub_used(root, right->len); - btrfs_free_tree_block(trans, root, right, 0, 1, 0); - free_extent_buffer_stale(right); - right = NULL; - } else { - struct btrfs_disk_key right_key; - btrfs_node_key(right, &right_key, 0); - btrfs_set_node_key(parent, &right_key, pslot + 1); - btrfs_mark_buffer_dirty(parent); - } - } - if (btrfs_header_nritems(mid) == 1) { - /* - * we're not allowed to leave a node with one item in the - * tree during a delete. A deletion from lower in the tree - * could try to delete the only pointer in this node. 
- * So, pull some keys from the left. - * There has to be a left pointer at this point because - * otherwise we would have pulled some pointers from the - * right - */ - if (!left) { - ret = -EROFS; - btrfs_std_error(root->fs_info, ret); - goto enospc; - } - wret = balance_node_right(trans, root, mid, left); - if (wret < 0) { - ret = wret; - goto enospc; - } - if (wret == 1) { - wret = push_node_left(trans, root, left, mid, 1); - if (wret < 0) - ret = wret; - } - BUG_ON(wret == 1); - } - if (btrfs_header_nritems(mid) == 0) { - clean_tree_block(trans, root, mid); - btrfs_tree_unlock(mid); - del_ptr(trans, root, path, level + 1, pslot); - root_sub_used(root, mid->len); - btrfs_free_tree_block(trans, root, mid, 0, 1, 0); - free_extent_buffer_stale(mid); - mid = NULL; - } else { - /* update the parent key to reflect our changes */ - struct btrfs_disk_key mid_key; - btrfs_node_key(mid, &mid_key, 0); - btrfs_set_node_key(parent, &mid_key, pslot); - btrfs_mark_buffer_dirty(parent); - } - - /* update the path */ - if (left) { - if (btrfs_header_nritems(left) > orig_slot) { - extent_buffer_get(left); - /* left was locked after cow */ - path->nodes[level] = left; - path->slots[level + 1] -= 1; - path->slots[level] = orig_slot; - if (mid) { - btrfs_tree_unlock(mid); - free_extent_buffer(mid); - } - } else { - orig_slot -= btrfs_header_nritems(left); - path->slots[level] = orig_slot; - } - } - /* double check we haven't messed things up */ - if (orig_ptr != - btrfs_node_blockptr(path->nodes[level], path->slots[level])) - BUG(); -enospc: - if (right) { - btrfs_tree_unlock(right); - free_extent_buffer(right); - } - if (left) { - if (path->nodes[level] != left) - btrfs_tree_unlock(left); - free_extent_buffer(left); - } - return ret; -} - -/* Node balancing for insertion. Here we only split or push nodes around - * when they are completely full. This is also done top down, so we - * have to be pessimistic. 
- */ -static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct btrfs_path *path, int level) -{ - struct extent_buffer *right = NULL; - struct extent_buffer *mid; - struct extent_buffer *left = NULL; - struct extent_buffer *parent = NULL; - int ret = 0; - int wret; - int pslot; - int orig_slot = path->slots[level]; - - if (level == 0) - return 1; - - mid = path->nodes[level]; - WARN_ON(btrfs_header_generation(mid) != trans->transid); - - if (level < BTRFS_MAX_LEVEL - 1) { - parent = path->nodes[level + 1]; - pslot = path->slots[level + 1]; - } - - if (!parent) - return 1; - - left = read_node_slot(root, parent, pslot - 1); - - /* first, try to make some room in the middle buffer */ - if (left) { - u32 left_nr; - - btrfs_tree_lock(left); - btrfs_set_lock_blocking(left); - - left_nr = btrfs_header_nritems(left); - if (left_nr >= BTRFS_NODEPTRS_PER_BLOCK(root) - 1) { - wret = 1; - } else { - ret = btrfs_cow_block(trans, root, left, parent, - pslot - 1, &left); - if (ret) - wret = 1; - else { - wret = push_node_left(trans, root, - left, mid, 0); - } - } - if (wret < 0) - ret = wret; - if (wret == 0) { - struct btrfs_disk_key disk_key; - orig_slot += left_nr; - btrfs_node_key(mid, &disk_key, 0); - btrfs_set_node_key(parent, &disk_key, pslot); - btrfs_mark_buffer_dirty(parent); - if (btrfs_header_nritems(left) > orig_slot) { - path->nodes[level] = left; - path->slots[level + 1] -= 1; - path->slots[level] = orig_slot; - btrfs_tree_unlock(mid); - free_extent_buffer(mid); - } else { - orig_slot -= - btrfs_header_nritems(left); - path->slots[level] = orig_slot; - btrfs_tree_unlock(left); - free_extent_buffer(left); - } - return 0; - } - btrfs_tree_unlock(left); - free_extent_buffer(left); - } - right = read_node_slot(root, parent, pslot + 1); - - /* - * then try to empty the right most buffer into the middle - */ - if (right) { - u32 right_nr; - - btrfs_tree_lock(right); - btrfs_set_lock_blocking(right); - - right_nr = btrfs_header_nritems(right); - if (right_nr >= BTRFS_NODEPTRS_PER_BLOCK(root) - 1) { - wret = 1; - } else { - ret = btrfs_cow_block(trans, root, right, - parent, pslot + 1, - &right); - if (ret) - wret = 1; - else { - wret = balance_node_right(trans, root, - right, mid); - } - } - if (wret < 0) - ret = wret; - if (wret == 0) { - struct btrfs_disk_key disk_key; - - btrfs_node_key(right, &disk_key, 0); - btrfs_set_node_key(parent, &disk_key, pslot + 1); - btrfs_mark_buffer_dirty(parent); - - if (btrfs_header_nritems(mid) <= orig_slot) { - path->nodes[level] = right; - path->slots[level + 1] += 1; - path->slots[level] = orig_slot - - btrfs_header_nritems(mid); - btrfs_tree_unlock(mid); - free_extent_buffer(mid); - } else { - btrfs_tree_unlock(right); - free_extent_buffer(right); - } - return 0; - } - btrfs_tree_unlock(right); - free_extent_buffer(right); - } - return 1; -} - -/* - * readahead one full node of leaves, finding things that are close - * to the block in 'slot', and triggering ra on them. 
- */ -static void reada_for_search(struct btrfs_root *root, - struct btrfs_path *path, - int level, int slot, u64 objectid) -{ - struct extent_buffer *node; - struct btrfs_disk_key disk_key; - u32 nritems; - u64 search; - u64 target; - u64 nread = 0; - u64 gen; - int direction = path->reada; - struct extent_buffer *eb; - u32 nr; - u32 blocksize; - u32 nscan = 0; - - if (level != 1) - return; - - if (!path->nodes[level]) - return; - - node = path->nodes[level]; - - search = btrfs_node_blockptr(node, slot); - blocksize = btrfs_level_size(root, level - 1); - eb = btrfs_find_tree_block(root, search, blocksize); - if (eb) { - free_extent_buffer(eb); - return; - } - - target = search; - - nritems = btrfs_header_nritems(node); - nr = slot; - - while (1) { - if (direction < 0) { - if (nr == 0) - break; - nr--; - } else if (direction > 0) { - nr++; - if (nr >= nritems) - break; - } - if (path->reada < 0 && objectid) { - btrfs_node_key(node, &disk_key, nr); - if (btrfs_disk_key_objectid(&disk_key) != objectid) - break; - } - search = btrfs_node_blockptr(node, nr); - if ((search <= target && target - search <= 65536) || - (search > target && search - target <= 65536)) { - gen = btrfs_node_ptr_generation(node, nr); - readahead_tree_block(root, search, blocksize, gen); - nread += blocksize; - } - nscan++; - if ((nread > 65536 || nscan > 32)) - break; - } -} - -/* - * returns -EAGAIN if it had to drop the path, or zero if everything was in - * cache - */ -static noinline int reada_for_balance(struct btrfs_root *root, - struct btrfs_path *path, int level) -{ - int slot; - int nritems; - struct extent_buffer *parent; - struct extent_buffer *eb; - u64 gen; - u64 block1 = 0; - u64 block2 = 0; - int ret = 0; - int blocksize; - - parent = path->nodes[level + 1]; - if (!parent) - return 0; - - nritems = btrfs_header_nritems(parent); - slot = path->slots[level + 1]; - blocksize = btrfs_level_size(root, level); - - if (slot > 0) { - block1 = btrfs_node_blockptr(parent, slot - 1); - gen = btrfs_node_ptr_generation(parent, slot - 1); - eb = btrfs_find_tree_block(root, block1, blocksize); - /* - * if we get -eagain from btrfs_buffer_uptodate, we - * don't want to return eagain here. That will loop - * forever - */ - if (eb && btrfs_buffer_uptodate(eb, gen, 1) != 0) - block1 = 0; - free_extent_buffer(eb); - } - if (slot + 1 < nritems) { - block2 = btrfs_node_blockptr(parent, slot + 1); - gen = btrfs_node_ptr_generation(parent, slot + 1); - eb = btrfs_find_tree_block(root, block2, blocksize); - if (eb && btrfs_buffer_uptodate(eb, gen, 1) != 0) - block2 = 0; - free_extent_buffer(eb); - } - if (block1 || block2) { - ret = -EAGAIN; - - /* release the whole path */ - btrfs_release_path(path); - - /* read the blocks */ - if (block1) - readahead_tree_block(root, block1, blocksize, 0); - if (block2) - readahead_tree_block(root, block2, blocksize, 0); - - if (block1) { - eb = read_tree_block(root, block1, blocksize, 0); - free_extent_buffer(eb); - } - if (block2) { - eb = read_tree_block(root, block2, blocksize, 0); - free_extent_buffer(eb); - } - } - return ret; -} - - -/* - * when we walk down the tree, it is usually safe to unlock the higher layers - * in the tree. The exceptions are when our path goes through slot 0, because - * operations on the tree might require changing key pointers higher up in the - * tree. - * - * callers might also have set path->keep_locks, which tells this code to keep - * the lock if the path points to the last slot in the block. 
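/*
 * Minimal sketch of the readahead window used by reada_for_search() above:
 * neighbouring child blocks are only prefetched while they sit within 64KiB
 * of the block about to be read, and the walk stops after 64KiB of prefetch
 * or 32 scanned slots.  sketch_* names are made up.
 */
#include <stdint.h>
#include <stdio.h>

static int sketch_in_reada_window(uint64_t target, uint64_t candidate)
{
	uint64_t dist = candidate > target ? candidate - target
					   : target - candidate;
	return dist <= 65536;
}

int main(void)
{
	uint64_t target = 1024 * 1024;

	printf("near: %d, far: %d\n",
	       sketch_in_reada_window(target, target + 32768),
	       sketch_in_reada_window(target, target + 262144));
	return 0;
}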
This is part of - * walking through the tree, and selecting the next slot in the higher block. - * - * lowest_unlock sets the lowest level in the tree we're allowed to unlock. so - * if lowest_unlock is 1, level 0 won't be unlocked - */ -static noinline void unlock_up(struct btrfs_path *path, int level, - int lowest_unlock, int min_write_lock_level, - int *write_lock_level) -{ - int i; - int skip_level = level; - int no_skips = 0; - struct extent_buffer *t; - - for (i = level; i < BTRFS_MAX_LEVEL; i++) { - if (!path->nodes[i]) - break; - if (!path->locks[i]) - break; - if (!no_skips && path->slots[i] == 0) { - skip_level = i + 1; - continue; - } - if (!no_skips && path->keep_locks) { - u32 nritems; - t = path->nodes[i]; - nritems = btrfs_header_nritems(t); - if (nritems < 1 || path->slots[i] >= nritems - 1) { - skip_level = i + 1; - continue; - } - } - if (skip_level < i && i >= lowest_unlock) - no_skips = 1; - - t = path->nodes[i]; - if (i >= lowest_unlock && i > skip_level && path->locks[i]) { - btrfs_tree_unlock_rw(t, path->locks[i]); - path->locks[i] = 0; - if (write_lock_level && - i > min_write_lock_level && - i <= *write_lock_level) { - *write_lock_level = i - 1; - } - } - } -} - -/* - * This releases any locks held in the path starting at level and - * going all the way up to the root. - * - * btrfs_search_slot will keep the lock held on higher nodes in a few - * corner cases, such as COW of the block at slot zero in the node. This - * ignores those rules, and it should only be called when there are no - * more updates to be done higher up in the tree. - */ -noinline void btrfs_unlock_up_safe(struct btrfs_path *path, int level) -{ - int i; - - if (path->keep_locks) - return; - - for (i = level; i < BTRFS_MAX_LEVEL; i++) { - if (!path->nodes[i]) - continue; - if (!path->locks[i]) - continue; - btrfs_tree_unlock_rw(path->nodes[i], path->locks[i]); - path->locks[i] = 0; - } -} - -/* - * helper function for btrfs_search_slot. The goal is to find a block - * in cache without setting the path to blocking. If we find the block - * we return zero and the path is unchanged. - * - * If we can't find the block, we set the path blocking and do some - * reada. -EAGAIN is returned and the search must be repeated. - */ -static int -read_block_for_search(struct btrfs_trans_handle *trans, - struct btrfs_root *root, struct btrfs_path *p, - struct extent_buffer **eb_ret, int level, int slot, - struct btrfs_key *key) -{ - u64 blocknr; - u64 gen; - u32 blocksize; - struct extent_buffer *b = *eb_ret; - struct extent_buffer *tmp; - int ret; - - blocknr = btrfs_node_blockptr(b, slot); - gen = btrfs_node_ptr_generation(b, slot); - blocksize = btrfs_level_size(root, level - 1); - - tmp = btrfs_find_tree_block(root, blocknr, blocksize); - if (tmp) { - /* first we do an atomic uptodate check */ - if (btrfs_buffer_uptodate(tmp, 0, 1) > 0) { - if (btrfs_buffer_uptodate(tmp, gen, 1) > 0) { - /* - * we found an up to date block without - * sleeping, return - * right away - */ - *eb_ret = tmp; - return 0; - } - /* the pages were up to date, but we failed - * the generation number check. Do a full - * read for the generation number that is correct. 
- * We must do this without dropping locks so - * we can trust our generation number - */ - free_extent_buffer(tmp); - btrfs_set_path_blocking(p); - - /* now we're allowed to do a blocking uptodate check */ - tmp = read_tree_block(root, blocknr, blocksize, gen); - if (tmp && btrfs_buffer_uptodate(tmp, gen, 0) > 0) { - *eb_ret = tmp; - return 0; - } - free_extent_buffer(tmp); - btrfs_release_path(p); - return -EIO; - } - } - - /* - * reduce lock contention at high levels - * of the btree by dropping locks before - * we read. Don't release the lock on the current - * level because we need to walk this node to figure - * out which blocks to read. - */ - btrfs_unlock_up_safe(p, level + 1); - btrfs_set_path_blocking(p); - - free_extent_buffer(tmp); - if (p->reada) - reada_for_search(root, p, level, slot, key->objectid); - - btrfs_release_path(p); - - ret = -EAGAIN; - tmp = read_tree_block(root, blocknr, blocksize, 0); - if (tmp) { - /* - * If the read above didn't mark this buffer up to date, - * it will never end up being up to date. Set ret to EIO now - * and give up so that our caller doesn't loop forever - * on our EAGAINs. - */ - if (!btrfs_buffer_uptodate(tmp, 0, 0)) - ret = -EIO; - free_extent_buffer(tmp); - } - return ret; -} - -/* - * helper function for btrfs_search_slot. This does all of the checks - * for node-level blocks and does any balancing required based on - * the ins_len. - * - * If no extra work was required, zero is returned. If we had to - * drop the path, -EAGAIN is returned and btrfs_search_slot must - * start over - */ -static int -setup_nodes_for_search(struct btrfs_trans_handle *trans, - struct btrfs_root *root, struct btrfs_path *p, - struct extent_buffer *b, int level, int ins_len, - int *write_lock_level) -{ - int ret; - if ((p->search_for_split || ins_len > 0) && btrfs_header_nritems(b) >= - BTRFS_NODEPTRS_PER_BLOCK(root) - 3) { - int sret; - - if (*write_lock_level < level + 1) { - *write_lock_level = level + 1; - btrfs_release_path(p); - goto again; - } - - sret = reada_for_balance(root, p, level); - if (sret) - goto again; - - btrfs_set_path_blocking(p); - sret = split_node(trans, root, p, level); - btrfs_clear_path_blocking(p, NULL, 0); - - BUG_ON(sret > 0); - if (sret) { - ret = sret; - goto done; - } - b = p->nodes[level]; - } else if (ins_len < 0 && btrfs_header_nritems(b) < - BTRFS_NODEPTRS_PER_BLOCK(root) / 2) { - int sret; - - if (*write_lock_level < level + 1) { - *write_lock_level = level + 1; - btrfs_release_path(p); - goto again; - } - - sret = reada_for_balance(root, p, level); - if (sret) - goto again; - - btrfs_set_path_blocking(p); - sret = balance_level(trans, root, p, level); - btrfs_clear_path_blocking(p, NULL, 0); - - if (sret) { - ret = sret; - goto done; - } - b = p->nodes[level]; - if (!b) { - btrfs_release_path(p); - goto again; - } - BUG_ON(btrfs_header_nritems(b) == 1); - } - return 0; - -again: - ret = -EAGAIN; -done: - return ret; -} - -/* - * look for key in the tree. path is filled in with nodes along the way - * if key is found, we return zero and you can find the item in the leaf - * level of the path (level 0) - * - * If the key isn't found, the path points to the slot where it should - * be inserted, and 1 is returned. If there are other errors during the - * search a negative error number is returned. - * - * if ins_len > 0, nodes and leaves will be split as we walk down the - * tree. 
if ins_len < 0, nodes will be merged as we walk down the tree (if - * possible) - */ -int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root - *root, struct btrfs_key *key, struct btrfs_path *p, int - ins_len, int cow) -{ - struct extent_buffer *b; - int slot; - int ret; - int err; - int level; - int lowest_unlock = 1; - int root_lock; - /* everything at write_lock_level or lower must be write locked */ - int write_lock_level = 0; - u8 lowest_level = 0; - int min_write_lock_level; - - lowest_level = p->lowest_level; - WARN_ON(lowest_level && ins_len > 0); - WARN_ON(p->nodes[0] != NULL); - - if (ins_len < 0) { - lowest_unlock = 2; - - /* when we are removing items, we might have to go up to level - * two as we update tree pointers Make sure we keep write - * for those levels as well - */ - write_lock_level = 2; - } else if (ins_len > 0) { - /* - * for inserting items, make sure we have a write lock on - * level 1 so we can update keys - */ - write_lock_level = 1; - } - - if (!cow) - write_lock_level = -1; - - if (cow && (p->keep_locks || p->lowest_level)) - write_lock_level = BTRFS_MAX_LEVEL; - - min_write_lock_level = write_lock_level; - -again: - /* - * we try very hard to do read locks on the root - */ - root_lock = BTRFS_READ_LOCK; - level = 0; - if (p->search_commit_root) { - /* - * the commit roots are read only - * so we always do read locks - */ - b = root->commit_root; - extent_buffer_get(b); - level = btrfs_header_level(b); - if (!p->skip_locking) - btrfs_tree_read_lock(b); - } else { - if (p->skip_locking) { - b = btrfs_root_node(root); - level = btrfs_header_level(b); - } else { - /* we don't know the level of the root node - * until we actually have it read locked - */ - b = btrfs_read_lock_root_node(root); - level = btrfs_header_level(b); - if (level <= write_lock_level) { - /* whoops, must trade for write lock */ - btrfs_tree_read_unlock(b); - free_extent_buffer(b); - b = btrfs_lock_root_node(root); - root_lock = BTRFS_WRITE_LOCK; - - /* the level might have changed, check again */ - level = btrfs_header_level(b); - } - } - } - p->nodes[level] = b; - if (!p->skip_locking) - p->locks[level] = root_lock; - - while (b) { - level = btrfs_header_level(b); - - /* - * setup the path here so we can release it under lock - * contention with the cow code - */ - if (cow) { - /* - * if we don't really need to cow this block - * then we don't want to set the path blocking, - * so we test it here - */ - if (!should_cow_block(trans, root, b)) - goto cow_done; - - btrfs_set_path_blocking(p); - - /* - * must have write locks on this node and the - * parent - */ - if (level + 1 > write_lock_level) { - write_lock_level = level + 1; - btrfs_release_path(p); - goto again; - } - - err = btrfs_cow_block(trans, root, b, - p->nodes[level + 1], - p->slots[level + 1], &b); - if (err) { - ret = err; - goto done; - } - } -cow_done: - BUG_ON(!cow && ins_len); - - p->nodes[level] = b; - btrfs_clear_path_blocking(p, NULL, 0); - - /* - * we have a lock on b and as long as we aren't changing - * the tree, there is no way to for the items in b to change. - * It is safe to drop the lock on our parent before we - * go through the expensive btree search on b. - * - * If cow is true, then we might be changing slot zero, - * which may require changing the parent. So, we can't - * drop the lock until after we know which slot we're - * operating on. 
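/*
 * Sketch of the lock-depth rule applied at the top of btrfs_search_slot():
 * deletions may rewrite keys two levels up, insertions one level up, plain
 * lookups never trade for write locks, and cow searches with keep_locks or a
 * lowest_level request write-lock the whole path.  Names are illustrative;
 * 8 stands in for BTRFS_MAX_LEVEL.
 */
#include <stdio.h>

static int sketch_initial_write_lock_level(int ins_len, int cow,
					   int keep_locks, int lowest_level)
{
	int level = 0;

	if (ins_len < 0)
		level = 2;	/* deletes may merge nodes two levels up */
	else if (ins_len > 0)
		level = 1;	/* inserts may update keys in the parent */
	if (!cow)
		level = -1;	/* read-only search: no write locks at all */
	if (cow && (keep_locks || lowest_level))
		level = 8;	/* BTRFS_MAX_LEVEL: lock everything */
	return level;
}

int main(void)
{
	printf("delete: %d, insert: %d, lookup: %d\n",
	       sketch_initial_write_lock_level(-1, 1, 0, 0),
	       sketch_initial_write_lock_level(1, 1, 0, 0),
	       sketch_initial_write_lock_level(0, 0, 0, 0));
	return 0;
}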
- */ - if (!cow) - btrfs_unlock_up_safe(p, level + 1); - - ret = bin_search(b, key, level, &slot); - - if (level != 0) { - int dec = 0; - if (ret && slot > 0) { - dec = 1; - slot -= 1; - } - p->slots[level] = slot; - err = setup_nodes_for_search(trans, root, p, b, level, - ins_len, &write_lock_level); - if (err == -EAGAIN) - goto again; - if (err) { - ret = err; - goto done; - } - b = p->nodes[level]; - slot = p->slots[level]; - - /* - * slot 0 is special, if we change the key - * we have to update the parent pointer - * which means we must have a write lock - * on the parent - */ - if (slot == 0 && cow && - write_lock_level < level + 1) { - write_lock_level = level + 1; - btrfs_release_path(p); - goto again; - } - - unlock_up(p, level, lowest_unlock, - min_write_lock_level, &write_lock_level); - - if (level == lowest_level) { - if (dec) - p->slots[level]++; - goto done; - } - - err = read_block_for_search(trans, root, p, - &b, level, slot, key); - if (err == -EAGAIN) - goto again; - if (err) { - ret = err; - goto done; - } - - if (!p->skip_locking) { - level = btrfs_header_level(b); - if (level <= write_lock_level) { - err = btrfs_try_tree_write_lock(b); - if (!err) { - btrfs_set_path_blocking(p); - btrfs_tree_lock(b); - btrfs_clear_path_blocking(p, b, - BTRFS_WRITE_LOCK); - } - p->locks[level] = BTRFS_WRITE_LOCK; - } else { - err = btrfs_try_tree_read_lock(b); - if (!err) { - btrfs_set_path_blocking(p); - btrfs_tree_read_lock(b); - btrfs_clear_path_blocking(p, b, - BTRFS_READ_LOCK); - } - p->locks[level] = BTRFS_READ_LOCK; - } - p->nodes[level] = b; - } - } else { - p->slots[level] = slot; - if (ins_len > 0 && - btrfs_leaf_free_space(root, b) < ins_len) { - if (write_lock_level < 1) { - write_lock_level = 1; - btrfs_release_path(p); - goto again; - } - - btrfs_set_path_blocking(p); - err = split_leaf(trans, root, key, - p, ins_len, ret == 0); - btrfs_clear_path_blocking(p, NULL, 0); - - BUG_ON(err > 0); - if (err) { - ret = err; - goto done; - } - } - if (!p->search_for_split) - unlock_up(p, level, lowest_unlock, - min_write_lock_level, &write_lock_level); - goto done; - } - } - ret = 1; -done: - /* - * we don't really know what they plan on doing with the path - * from here on, so for now just mark it as blocking - */ - if (!p->leave_spinning) - btrfs_set_path_blocking(p); - if (ret < 0) - btrfs_release_path(p); - return ret; -} - -/* - * adjust the pointers going up the tree, starting at level - * making sure the right key of each node is points to 'key'. - * This is used after shifting pointers to the left, so it stops - * fixing up pointers when a given leaf/node is not in slot 0 of the - * higher levels - * - */ -static void fixup_low_keys(struct btrfs_trans_handle *trans, - struct btrfs_root *root, struct btrfs_path *path, - struct btrfs_disk_key *key, int level) -{ - int i; - struct extent_buffer *t; - - for (i = level; i < BTRFS_MAX_LEVEL; i++) { - int tslot = path->slots[i]; - if (!path->nodes[i]) - break; - t = path->nodes[i]; - btrfs_set_node_key(t, key, tslot); - btrfs_mark_buffer_dirty(path->nodes[i]); - if (tslot != 0) - break; - } -} - -/* - * update item key. - * - * This function isn't completely safe. 
It's the caller's responsibility - * that the new key won't break the order - */ -void btrfs_set_item_key_safe(struct btrfs_trans_handle *trans, - struct btrfs_root *root, struct btrfs_path *path, - struct btrfs_key *new_key) -{ - struct btrfs_disk_key disk_key; - struct extent_buffer *eb; - int slot; - - eb = path->nodes[0]; - slot = path->slots[0]; - if (slot > 0) { - btrfs_item_key(eb, &disk_key, slot - 1); - BUG_ON(comp_keys(&disk_key, new_key) >= 0); - } - if (slot < btrfs_header_nritems(eb) - 1) { - btrfs_item_key(eb, &disk_key, slot + 1); - BUG_ON(comp_keys(&disk_key, new_key) <= 0); - } - - btrfs_cpu_key_to_disk(&disk_key, new_key); - btrfs_set_item_key(eb, &disk_key, slot); - btrfs_mark_buffer_dirty(eb); - if (slot == 0) - fixup_low_keys(trans, root, path, &disk_key, 1); -} - -/* - * try to push data from one node into the next node left in the - * tree. - * - * returns 0 if some ptrs were pushed left, < 0 if there was some horrible - * error, and > 0 if there was no room in the left hand block. - */ -static int push_node_left(struct btrfs_trans_handle *trans, - struct btrfs_root *root, struct extent_buffer *dst, - struct extent_buffer *src, int empty) -{ - int push_items = 0; - int src_nritems; - int dst_nritems; - int ret = 0; - - src_nritems = btrfs_header_nritems(src); - dst_nritems = btrfs_header_nritems(dst); - push_items = BTRFS_NODEPTRS_PER_BLOCK(root) - dst_nritems; - WARN_ON(btrfs_header_generation(src) != trans->transid); - WARN_ON(btrfs_header_generation(dst) != trans->transid); - - if (!empty && src_nritems <= 8) - return 1; - - if (push_items <= 0) - return 1; - - if (empty) { - push_items = min(src_nritems, push_items); - if (push_items < src_nritems) { - /* leave at least 8 pointers in the node if - * we aren't going to empty it - */ - if (src_nritems - push_items < 8) { - if (push_items <= 8) - return 1; - push_items -= 8; - } - } - } else - push_items = min(src_nritems - 8, push_items); - - copy_extent_buffer(dst, src, - btrfs_node_key_ptr_offset(dst_nritems), - btrfs_node_key_ptr_offset(0), - push_items * sizeof(struct btrfs_key_ptr)); - - if (push_items < src_nritems) { - memmove_extent_buffer(src, btrfs_node_key_ptr_offset(0), - btrfs_node_key_ptr_offset(push_items), - (src_nritems - push_items) * - sizeof(struct btrfs_key_ptr)); - } - btrfs_set_header_nritems(src, src_nritems - push_items); - btrfs_set_header_nritems(dst, dst_nritems + push_items); - btrfs_mark_buffer_dirty(src); - btrfs_mark_buffer_dirty(dst); - - return ret; -} - -/* - * try to push data from one node into the next node right in the - * tree. - * - * returns 0 if some ptrs were pushed, < 0 if there was some horrible - * error, and > 0 if there was no room in the right hand block. 
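/*
 * Sketch of the key ordering that the BUG_ON()s in btrfs_set_item_key_safe()
 * re-check: tree keys sort by objectid, then type, then offset, the same
 * order comp_keys()/btrfs_comp_cpu_keys() use.  sketch_* names are made up.
 */
#include <stdint.h>
#include <stdio.h>

struct sketch_key {
	uint64_t objectid;
	uint8_t  type;
	uint64_t offset;
};

static int sketch_comp_keys(const struct sketch_key *a,
			    const struct sketch_key *b)
{
	if (a->objectid != b->objectid)
		return a->objectid < b->objectid ? -1 : 1;
	if (a->type != b->type)
		return a->type < b->type ? -1 : 1;
	if (a->offset != b->offset)
		return a->offset < b->offset ? -1 : 1;
	return 0;
}

int main(void)
{
	struct sketch_key prev    = { 256, 108, 0 };	/* an EXTENT_DATA key */
	struct sketch_key new_key = { 256, 108, 4096 };

	/* a "safe" update requires prev < new (and new < next, not shown) */
	printf("prev < new: %d\n", sketch_comp_keys(&prev, &new_key) < 0);
	return 0;
}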
- * - * this will only push up to 1/2 the contents of the left node over - */ -static int balance_node_right(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct extent_buffer *dst, - struct extent_buffer *src) -{ - int push_items = 0; - int max_push; - int src_nritems; - int dst_nritems; - int ret = 0; - - WARN_ON(btrfs_header_generation(src) != trans->transid); - WARN_ON(btrfs_header_generation(dst) != trans->transid); - - src_nritems = btrfs_header_nritems(src); - dst_nritems = btrfs_header_nritems(dst); - push_items = BTRFS_NODEPTRS_PER_BLOCK(root) - dst_nritems; - if (push_items <= 0) - return 1; - - if (src_nritems < 4) - return 1; - - max_push = src_nritems / 2 + 1; - /* don't try to empty the node */ - if (max_push >= src_nritems) - return 1; - - if (max_push < push_items) - push_items = max_push; - - memmove_extent_buffer(dst, btrfs_node_key_ptr_offset(push_items), - btrfs_node_key_ptr_offset(0), - (dst_nritems) * - sizeof(struct btrfs_key_ptr)); - - copy_extent_buffer(dst, src, - btrfs_node_key_ptr_offset(0), - btrfs_node_key_ptr_offset(src_nritems - push_items), - push_items * sizeof(struct btrfs_key_ptr)); - - btrfs_set_header_nritems(src, src_nritems - push_items); - btrfs_set_header_nritems(dst, dst_nritems + push_items); - - btrfs_mark_buffer_dirty(src); - btrfs_mark_buffer_dirty(dst); - - return ret; -} - -/* - * helper function to insert a new root level in the tree. - * A new node is allocated, and a single item is inserted to - * point to the existing root - * - * returns zero on success or < 0 on failure. - */ -static noinline int insert_new_root(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct btrfs_path *path, int level) -{ - u64 lower_gen; - struct extent_buffer *lower; - struct extent_buffer *c; - struct extent_buffer *old; - struct btrfs_disk_key lower_key; - - BUG_ON(path->nodes[level]); - BUG_ON(path->nodes[level-1] != root->node); - - lower = path->nodes[level-1]; - if (level == 1) - btrfs_item_key(lower, &lower_key, 0); - else - btrfs_node_key(lower, &lower_key, 0); - - c = btrfs_alloc_free_block(trans, root, root->nodesize, 0, - root->root_key.objectid, &lower_key, - level, root->node->start, 0, 0); - if (IS_ERR(c)) - return PTR_ERR(c); - - root_add_used(root, root->nodesize); - - memset_extent_buffer(c, 0, 0, sizeof(struct btrfs_header)); - btrfs_set_header_nritems(c, 1); - btrfs_set_header_level(c, level); - btrfs_set_header_bytenr(c, c->start); - btrfs_set_header_generation(c, trans->transid); - btrfs_set_header_backref_rev(c, BTRFS_MIXED_BACKREF_REV); - btrfs_set_header_owner(c, root->root_key.objectid); - - write_extent_buffer(c, root->fs_info->fsid, - (unsigned long)btrfs_header_fsid(c), - BTRFS_FSID_SIZE); - - write_extent_buffer(c, root->fs_info->chunk_tree_uuid, - (unsigned long)btrfs_header_chunk_tree_uuid(c), - BTRFS_UUID_SIZE); - - btrfs_set_node_key(c, &lower_key, 0); - btrfs_set_node_blockptr(c, 0, lower->start); - lower_gen = btrfs_header_generation(lower); - WARN_ON(lower_gen != trans->transid); - - btrfs_set_node_ptr_generation(c, 0, lower_gen); - - btrfs_mark_buffer_dirty(c); - - old = root->node; - rcu_assign_pointer(root->node, c); - - /* the super has an extra ref to root->node */ - free_extent_buffer(old); - - add_root_to_dirty_list(root); - extent_buffer_get(c); - path->nodes[level] = c; - path->locks[level] = BTRFS_WRITE_LOCK; - path->slots[level] = 0; - return 0; -} - -/* - * worker function to insert a single pointer in a node. 
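/*
 * Sketch of the push limits used by push_node_left() and balance_node_right()
 * above: a push can never exceed the free slots in the destination, a
 * non-emptying left push keeps at least 8 pointers in the source, and a right
 * push moves at most half the source plus one and never empties it.
 * sketch_* names are made up.
 */
#include <stdint.h>
#include <stdio.h>

static uint32_t sketch_min(uint32_t a, uint32_t b) { return a < b ? a : b; }

/* non-emptying variant of push_node_left(): how many pointers move left */
static uint32_t sketch_push_left(uint32_t src, uint32_t dst, uint32_t per_block)
{
	uint32_t room = per_block > dst ? per_block - dst : 0;

	if (src <= 8 || room == 0)
		return 0;
	return sketch_min(src - 8, room);
}

/* balance_node_right(): how many pointers move right */
static uint32_t sketch_push_right(uint32_t src, uint32_t dst, uint32_t per_block)
{
	uint32_t room = per_block > dst ? per_block - dst : 0;
	uint32_t max_push = src / 2 + 1;

	if (room == 0 || src < 4 || max_push >= src)
		return 0;
	return sketch_min(max_push, room);
}

int main(void)
{
	printf("left: %u, right: %u\n",
	       (unsigned)sketch_push_left(100, 60, 121),
	       (unsigned)sketch_push_right(100, 60, 121));
	return 0;
}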
- * the node should have enough room for the pointer already - * - * slot and level indicate where you want the key to go, and - * blocknr is the block the key points to. - */ -static void insert_ptr(struct btrfs_trans_handle *trans, - struct btrfs_root *root, struct btrfs_path *path, - struct btrfs_disk_key *key, u64 bytenr, - int slot, int level) -{ - struct extent_buffer *lower; - int nritems; - - BUG_ON(!path->nodes[level]); - btrfs_assert_tree_locked(path->nodes[level]); - lower = path->nodes[level]; - nritems = btrfs_header_nritems(lower); - BUG_ON(slot > nritems); - BUG_ON(nritems == BTRFS_NODEPTRS_PER_BLOCK(root)); - if (slot != nritems) { - memmove_extent_buffer(lower, - btrfs_node_key_ptr_offset(slot + 1), - btrfs_node_key_ptr_offset(slot), - (nritems - slot) * sizeof(struct btrfs_key_ptr)); - } - btrfs_set_node_key(lower, key, slot); - btrfs_set_node_blockptr(lower, slot, bytenr); - WARN_ON(trans->transid == 0); - btrfs_set_node_ptr_generation(lower, slot, trans->transid); - btrfs_set_header_nritems(lower, nritems + 1); - btrfs_mark_buffer_dirty(lower); -} - -/* - * split the node at the specified level in path in two. - * The path is corrected to point to the appropriate node after the split - * - * Before splitting this tries to make some room in the node by pushing - * left and right, if either one works, it returns right away. - * - * returns 0 on success and < 0 on failure - */ -static noinline int split_node(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct btrfs_path *path, int level) -{ - struct extent_buffer *c; - struct extent_buffer *split; - struct btrfs_disk_key disk_key; - int mid; - int ret; - u32 c_nritems; - - c = path->nodes[level]; - WARN_ON(btrfs_header_generation(c) != trans->transid); - if (c == root->node) { - /* trying to split the root, lets make a new one */ - ret = insert_new_root(trans, root, path, level + 1); - if (ret) - return ret; - } else { - ret = push_nodes_for_insert(trans, root, path, level); - c = path->nodes[level]; - if (!ret && btrfs_header_nritems(c) < - BTRFS_NODEPTRS_PER_BLOCK(root) - 3) - return 0; - if (ret < 0) - return ret; - } - - c_nritems = btrfs_header_nritems(c); - mid = (c_nritems + 1) / 2; - btrfs_node_key(c, &disk_key, mid); - - split = btrfs_alloc_free_block(trans, root, root->nodesize, 0, - root->root_key.objectid, - &disk_key, level, c->start, 0, 0); - if (IS_ERR(split)) - return PTR_ERR(split); - - root_add_used(root, root->nodesize); - - memset_extent_buffer(split, 0, 0, sizeof(struct btrfs_header)); - btrfs_set_header_level(split, btrfs_header_level(c)); - btrfs_set_header_bytenr(split, split->start); - btrfs_set_header_generation(split, trans->transid); - btrfs_set_header_backref_rev(split, BTRFS_MIXED_BACKREF_REV); - btrfs_set_header_owner(split, root->root_key.objectid); - write_extent_buffer(split, root->fs_info->fsid, - (unsigned long)btrfs_header_fsid(split), - BTRFS_FSID_SIZE); - write_extent_buffer(split, root->fs_info->chunk_tree_uuid, - (unsigned long)btrfs_header_chunk_tree_uuid(split), - BTRFS_UUID_SIZE); - - - copy_extent_buffer(split, c, - btrfs_node_key_ptr_offset(0), - btrfs_node_key_ptr_offset(mid), - (c_nritems - mid) * sizeof(struct btrfs_key_ptr)); - btrfs_set_header_nritems(split, c_nritems - mid); - btrfs_set_header_nritems(c, mid); - ret = 0; - - btrfs_mark_buffer_dirty(c); - btrfs_mark_buffer_dirty(split); - - insert_ptr(trans, root, path, &disk_key, split->start, - path->slots[level + 1] + 1, level + 1); - - if (path->slots[level] >= mid) { - path->slots[level] -= mid; - 
btrfs_tree_unlock(c); - free_extent_buffer(c); - path->nodes[level] = split; - path->slots[level + 1] += 1; - } else { - btrfs_tree_unlock(split); - free_extent_buffer(split); - } - return ret; -} - -/* - * how many bytes are required to store the items in a leaf. start - * and nr indicate which items in the leaf to check. This totals up the - * space used both by the item structs and the item data - */ -static int leaf_space_used(struct extent_buffer *l, int start, int nr) -{ - int data_len; - int nritems = btrfs_header_nritems(l); - int end = min(nritems, start + nr) - 1; - - if (!nr) - return 0; - data_len = btrfs_item_end_nr(l, start); - data_len = data_len - btrfs_item_offset_nr(l, end); - data_len += sizeof(struct btrfs_item) * nr; - WARN_ON(data_len < 0); - return data_len; -} - -/* - * The space between the end of the leaf items and - * the start of the leaf data. IOW, how much room - * the leaf has left for both items and data - */ -noinline int btrfs_leaf_free_space(struct btrfs_root *root, - struct extent_buffer *leaf) -{ - int nritems = btrfs_header_nritems(leaf); - int ret; - ret = BTRFS_LEAF_DATA_SIZE(root) - leaf_space_used(leaf, 0, nritems); - if (ret < 0) { - printk(KERN_CRIT "leaf free space ret %d, leaf data size %lu, " - "used %d nritems %d\n", - ret, (unsigned long) BTRFS_LEAF_DATA_SIZE(root), - leaf_space_used(leaf, 0, nritems), nritems); - } - return ret; -} - -/* - * min slot controls the lowest index we're willing to push to the - * right. We'll push up to and including min_slot, but no lower - */ -static noinline int __push_leaf_right(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct btrfs_path *path, - int data_size, int empty, - struct extent_buffer *right, - int free_space, u32 left_nritems, - u32 min_slot) -{ - struct extent_buffer *left = path->nodes[0]; - struct extent_buffer *upper = path->nodes[1]; - struct btrfs_map_token token; - struct btrfs_disk_key disk_key; - int slot; - u32 i; - int push_space = 0; - int push_items = 0; - struct btrfs_item *item; - u32 nr; - u32 right_nritems; - u32 data_end; - u32 this_item_size; - - btrfs_init_map_token(&token); - - if (empty) - nr = 0; - else - nr = max_t(u32, 1, min_slot); - - if (path->slots[0] >= left_nritems) - push_space += data_size; - - slot = path->slots[1]; - i = left_nritems - 1; - while (i >= nr) { - item = btrfs_item_nr(left, i); - - if (!empty && push_items > 0) { - if (path->slots[0] > i) - break; - if (path->slots[0] == i) { - int space = btrfs_leaf_free_space(root, left); - if (space + push_space * 2 > free_space) - break; - } - } - - if (path->slots[0] == i) - push_space += data_size; - - this_item_size = btrfs_item_size(left, item); - if (this_item_size + sizeof(*item) + push_space > free_space) - break; - - push_items++; - push_space += this_item_size + sizeof(*item); - if (i == 0) - break; - i--; - } - - if (push_items == 0) - goto out_unlock; - - if (!empty && push_items == left_nritems) - WARN_ON(1); - - /* push left to right */ - right_nritems = btrfs_header_nritems(right); - - push_space = btrfs_item_end_nr(left, left_nritems - push_items); - push_space -= leaf_data_end(root, left); - - /* make room in the right data area */ - data_end = leaf_data_end(root, right); - memmove_extent_buffer(right, - btrfs_leaf_data(right) + data_end - push_space, - btrfs_leaf_data(right) + data_end, - BTRFS_LEAF_DATA_SIZE(root) - data_end); - - /* copy from the left data area */ - copy_extent_buffer(right, left, btrfs_leaf_data(right) + - BTRFS_LEAF_DATA_SIZE(root) - push_space, - 
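/*
 * Simplified sketch of the leaf space accounting in leaf_space_used() and
 * btrfs_leaf_free_space() above: item headers grow from the front of the
 * data area, item data grows from the back, and free space is whatever is
 * left between them.  The 101/25 byte sizes are assumptions matching the
 * packed on-disk structs of this era; sketch_* names are made up.
 */
#include <stdint.h>
#include <stdio.h>

#define SKETCH_HEADER_SIZE	101u	/* assumed sizeof(struct btrfs_header) */
#define SKETCH_ITEM_SIZE	 25u	/* assumed sizeof(struct btrfs_item)  */

static int32_t sketch_leaf_free_space(uint32_t leafsize, uint32_t nritems,
				      uint32_t data_bytes)
{
	uint32_t data_area = leafsize - SKETCH_HEADER_SIZE;	/* BTRFS_LEAF_DATA_SIZE */
	uint32_t used = nritems * SKETCH_ITEM_SIZE + data_bytes;

	return (int32_t)data_area - (int32_t)used;	/* negative means a corrupt leaf */
}

int main(void)
{
	/* a 4KiB leaf holding 10 items with 2000 bytes of item data */
	printf("free: %d\n", sketch_leaf_free_space(4096, 10, 2000));
	return 0;
}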
btrfs_leaf_data(left) + leaf_data_end(root, left), - push_space); - - memmove_extent_buffer(right, btrfs_item_nr_offset(push_items), - btrfs_item_nr_offset(0), - right_nritems * sizeof(struct btrfs_item)); - - /* copy the items from left to right */ - copy_extent_buffer(right, left, btrfs_item_nr_offset(0), - btrfs_item_nr_offset(left_nritems - push_items), - push_items * sizeof(struct btrfs_item)); - - /* update the item pointers */ - right_nritems += push_items; - btrfs_set_header_nritems(right, right_nritems); - push_space = BTRFS_LEAF_DATA_SIZE(root); - for (i = 0; i < right_nritems; i++) { - item = btrfs_item_nr(right, i); - push_space -= btrfs_token_item_size(right, item, &token); - btrfs_set_token_item_offset(right, item, push_space, &token); - } - - left_nritems -= push_items; - btrfs_set_header_nritems(left, left_nritems); - - if (left_nritems) - btrfs_mark_buffer_dirty(left); - else - clean_tree_block(trans, root, left); - - btrfs_mark_buffer_dirty(right); - - btrfs_item_key(right, &disk_key, 0); - btrfs_set_node_key(upper, &disk_key, slot + 1); - btrfs_mark_buffer_dirty(upper); - - /* then fixup the leaf pointer in the path */ - if (path->slots[0] >= left_nritems) { - path->slots[0] -= left_nritems; - if (btrfs_header_nritems(path->nodes[0]) == 0) - clean_tree_block(trans, root, path->nodes[0]); - btrfs_tree_unlock(path->nodes[0]); - free_extent_buffer(path->nodes[0]); - path->nodes[0] = right; - path->slots[1] += 1; - } else { - btrfs_tree_unlock(right); - free_extent_buffer(right); - } - return 0; - -out_unlock: - btrfs_tree_unlock(right); - free_extent_buffer(right); - return 1; -} - -/* - * push some data in the path leaf to the right, trying to free up at - * least data_size bytes. returns zero if the push worked, nonzero otherwise - * - * returns 1 if the push failed because the other node didn't have enough - * room, 0 if everything worked out and < 0 if there were major errors. - * - * this will push starting from min_slot to the end of the leaf. It won't - * push any slot lower than min_slot - */ -static int push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_root - *root, struct btrfs_path *path, - int min_data_size, int data_size, - int empty, u32 min_slot) -{ - struct extent_buffer *left = path->nodes[0]; - struct extent_buffer *right; - struct extent_buffer *upper; - int slot; - int free_space; - u32 left_nritems; - int ret; - - if (!path->nodes[1]) - return 1; - - slot = path->slots[1]; - upper = path->nodes[1]; - if (slot >= btrfs_header_nritems(upper) - 1) - return 1; - - btrfs_assert_tree_locked(path->nodes[1]); - - right = read_node_slot(root, upper, slot + 1); - if (right == NULL) - return 1; - - btrfs_tree_lock(right); - btrfs_set_lock_blocking(right); - - free_space = btrfs_leaf_free_space(root, right); - if (free_space < data_size) - goto out_unlock; - - /* cow and double check */ - ret = btrfs_cow_block(trans, root, right, upper, - slot + 1, &right); - if (ret) - goto out_unlock; - - free_space = btrfs_leaf_free_space(root, right); - if (free_space < data_size) - goto out_unlock; - - left_nritems = btrfs_header_nritems(left); - if (left_nritems == 0) - goto out_unlock; - - return __push_leaf_right(trans, root, path, min_data_size, empty, - right, free_space, left_nritems, min_slot); -out_unlock: - btrfs_tree_unlock(right); - free_extent_buffer(right); - return 1; -} - -/* - * push some data in the path leaf to the left, trying to free up at - * least data_size bytes. 
returns zero if the push worked, nonzero otherwise - * - * max_slot can put a limit on how far into the leaf we'll push items. The - * item at 'max_slot' won't be touched. Use (u32)-1 to make us do all the - * items - */ -static noinline int __push_leaf_left(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct btrfs_path *path, int data_size, - int empty, struct extent_buffer *left, - int free_space, u32 right_nritems, - u32 max_slot) -{ - struct btrfs_disk_key disk_key; - struct extent_buffer *right = path->nodes[0]; - int i; - int push_space = 0; - int push_items = 0; - struct btrfs_item *item; - u32 old_left_nritems; - u32 nr; - int ret = 0; - u32 this_item_size; - u32 old_left_item_size; - struct btrfs_map_token token; - - btrfs_init_map_token(&token); - - if (empty) - nr = min(right_nritems, max_slot); - else - nr = min(right_nritems - 1, max_slot); - - for (i = 0; i < nr; i++) { - item = btrfs_item_nr(right, i); - - if (!empty && push_items > 0) { - if (path->slots[0] < i) - break; - if (path->slots[0] == i) { - int space = btrfs_leaf_free_space(root, right); - if (space + push_space * 2 > free_space) - break; - } - } - - if (path->slots[0] == i) - push_space += data_size; - - this_item_size = btrfs_item_size(right, item); - if (this_item_size + sizeof(*item) + push_space > free_space) - break; - - push_items++; - push_space += this_item_size + sizeof(*item); - } - - if (push_items == 0) { - ret = 1; - goto out; - } - if (!empty && push_items == btrfs_header_nritems(right)) - WARN_ON(1); - - /* push data from right to left */ - copy_extent_buffer(left, right, - btrfs_item_nr_offset(btrfs_header_nritems(left)), - btrfs_item_nr_offset(0), - push_items * sizeof(struct btrfs_item)); - - push_space = BTRFS_LEAF_DATA_SIZE(root) - - btrfs_item_offset_nr(right, push_items - 1); - - copy_extent_buffer(left, right, btrfs_leaf_data(left) + - leaf_data_end(root, left) - push_space, - btrfs_leaf_data(right) + - btrfs_item_offset_nr(right, push_items - 1), - push_space); - old_left_nritems = btrfs_header_nritems(left); - BUG_ON(old_left_nritems <= 0); - - old_left_item_size = btrfs_item_offset_nr(left, old_left_nritems - 1); - for (i = old_left_nritems; i < old_left_nritems + push_items; i++) { - u32 ioff; - - item = btrfs_item_nr(left, i); - - ioff = btrfs_token_item_offset(left, item, &token); - btrfs_set_token_item_offset(left, item, - ioff - (BTRFS_LEAF_DATA_SIZE(root) - old_left_item_size), - &token); - } - btrfs_set_header_nritems(left, old_left_nritems + push_items); - - /* fixup right node */ - if (push_items > right_nritems) { - printk(KERN_CRIT "push items %d nr %u\n", push_items, - right_nritems); - WARN_ON(1); - } - - if (push_items < right_nritems) { - push_space = btrfs_item_offset_nr(right, push_items - 1) - - leaf_data_end(root, right); - memmove_extent_buffer(right, btrfs_leaf_data(right) + - BTRFS_LEAF_DATA_SIZE(root) - push_space, - btrfs_leaf_data(right) + - leaf_data_end(root, right), push_space); - - memmove_extent_buffer(right, btrfs_item_nr_offset(0), - btrfs_item_nr_offset(push_items), - (btrfs_header_nritems(right) - push_items) * - sizeof(struct btrfs_item)); - } - right_nritems -= push_items; - btrfs_set_header_nritems(right, right_nritems); - push_space = BTRFS_LEAF_DATA_SIZE(root); - for (i = 0; i < right_nritems; i++) { - item = btrfs_item_nr(right, i); - - push_space = push_space - btrfs_token_item_size(right, - item, &token); - btrfs_set_token_item_offset(right, item, push_space, &token); - } - - btrfs_mark_buffer_dirty(left); - if (right_nritems) 
- btrfs_mark_buffer_dirty(right); - else - clean_tree_block(trans, root, right); - - btrfs_item_key(right, &disk_key, 0); - fixup_low_keys(trans, root, path, &disk_key, 1); - - /* then fixup the leaf pointer in the path */ - if (path->slots[0] < push_items) { - path->slots[0] += old_left_nritems; - btrfs_tree_unlock(path->nodes[0]); - free_extent_buffer(path->nodes[0]); - path->nodes[0] = left; - path->slots[1] -= 1; - } else { - btrfs_tree_unlock(left); - free_extent_buffer(left); - path->slots[0] -= push_items; - } - BUG_ON(path->slots[0] < 0); - return ret; -out: - btrfs_tree_unlock(left); - free_extent_buffer(left); - return ret; -} - -/* - * push some data in the path leaf to the left, trying to free up at - * least data_size bytes. returns zero if the push worked, nonzero otherwise - * - * max_slot can put a limit on how far into the leaf we'll push items. The - * item at 'max_slot' won't be touched. Use (u32)-1 to make us push all the - * items - */ -static int push_leaf_left(struct btrfs_trans_handle *trans, struct btrfs_root - *root, struct btrfs_path *path, int min_data_size, - int data_size, int empty, u32 max_slot) -{ - struct extent_buffer *right = path->nodes[0]; - struct extent_buffer *left; - int slot; - int free_space; - u32 right_nritems; - int ret = 0; - - slot = path->slots[1]; - if (slot == 0) - return 1; - if (!path->nodes[1]) - return 1; - - right_nritems = btrfs_header_nritems(right); - if (right_nritems == 0) - return 1; - - btrfs_assert_tree_locked(path->nodes[1]); - - left = read_node_slot(root, path->nodes[1], slot - 1); - if (left == NULL) - return 1; - - btrfs_tree_lock(left); - btrfs_set_lock_blocking(left); - - free_space = btrfs_leaf_free_space(root, left); - if (free_space < data_size) { - ret = 1; - goto out; - } - - /* cow and double check */ - ret = btrfs_cow_block(trans, root, left, - path->nodes[1], slot - 1, &left); - if (ret) { - /* we hit -ENOSPC, but it isn't fatal here */ - if (ret == -ENOSPC) - ret = 1; - goto out; - } - - free_space = btrfs_leaf_free_space(root, left); - if (free_space < data_size) { - ret = 1; - goto out; - } - - return __push_leaf_left(trans, root, path, min_data_size, - empty, left, free_space, right_nritems, - max_slot); -out: - btrfs_tree_unlock(left); - free_extent_buffer(left); - return ret; -} - -/* - * split the path's leaf in two, making sure there is at least data_size - * available for the resulting leaf level of the path. 
- */ -static noinline void copy_for_split(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct btrfs_path *path, - struct extent_buffer *l, - struct extent_buffer *right, - int slot, int mid, int nritems) -{ - int data_copy_size; - int rt_data_off; - int i; - struct btrfs_disk_key disk_key; - struct btrfs_map_token token; - - btrfs_init_map_token(&token); - - nritems = nritems - mid; - btrfs_set_header_nritems(right, nritems); - data_copy_size = btrfs_item_end_nr(l, mid) - leaf_data_end(root, l); - - copy_extent_buffer(right, l, btrfs_item_nr_offset(0), - btrfs_item_nr_offset(mid), - nritems * sizeof(struct btrfs_item)); - - copy_extent_buffer(right, l, - btrfs_leaf_data(right) + BTRFS_LEAF_DATA_SIZE(root) - - data_copy_size, btrfs_leaf_data(l) + - leaf_data_end(root, l), data_copy_size); - - rt_data_off = BTRFS_LEAF_DATA_SIZE(root) - - btrfs_item_end_nr(l, mid); - - for (i = 0; i < nritems; i++) { - struct btrfs_item *item = btrfs_item_nr(right, i); - u32 ioff; - - ioff = btrfs_token_item_offset(right, item, &token); - btrfs_set_token_item_offset(right, item, - ioff + rt_data_off, &token); - } - - btrfs_set_header_nritems(l, mid); - btrfs_item_key(right, &disk_key, 0); - insert_ptr(trans, root, path, &disk_key, right->start, - path->slots[1] + 1, 1); - - btrfs_mark_buffer_dirty(right); - btrfs_mark_buffer_dirty(l); - BUG_ON(path->slots[0] != slot); - - if (mid <= slot) { - btrfs_tree_unlock(path->nodes[0]); - free_extent_buffer(path->nodes[0]); - path->nodes[0] = right; - path->slots[0] -= mid; - path->slots[1] += 1; - } else { - btrfs_tree_unlock(right); - free_extent_buffer(right); - } - - BUG_ON(path->slots[0] < 0); -} - -/* - * double splits happen when we need to insert a big item in the middle - * of a leaf. A double split can leave us with 3 mostly empty leaves: - * leaf: [ slots 0 - N] [ our target ] [ N + 1 - total in leaf ] - * A B C - * - * We avoid this by trying to push the items on either side of our target - * into the adjacent leaves. If all goes well we can avoid the double split - * completely. - */ -static noinline int push_for_double_split(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct btrfs_path *path, - int data_size) -{ - int ret; - int progress = 0; - int slot; - u32 nritems; - - slot = path->slots[0]; - - /* - * try to push all the items after our slot into the - * right leaf - */ - ret = push_leaf_right(trans, root, path, 1, data_size, 0, slot); - if (ret < 0) - return ret; - - if (ret == 0) - progress++; - - nritems = btrfs_header_nritems(path->nodes[0]); - /* - * our goal is to get our slot at the start or end of a leaf. If - * we've done so we're done - */ - if (path->slots[0] == 0 || path->slots[0] == nritems) - return 0; - - if (btrfs_leaf_free_space(root, path->nodes[0]) >= data_size) - return 0; - - /* try to push all the items before our slot into the next leaf */ - slot = path->slots[0]; - ret = push_leaf_left(trans, root, path, 1, data_size, 0, slot); - if (ret < 0) - return ret; - - if (ret == 0) - progress++; - - if (progress) - return 0; - return 1; -} - -/* - * split the path's leaf in two, making sure there is at least data_size - * available for the resulting leaf level of the path. - * - * returns 0 if all went well and < 0 on failure. 
- */ -static noinline int split_leaf(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct btrfs_key *ins_key, - struct btrfs_path *path, int data_size, - int extend) -{ - struct btrfs_disk_key disk_key; - struct extent_buffer *l; - u32 nritems; - int mid; - int slot; - struct extent_buffer *right; - int ret = 0; - int wret; - int split; - int num_doubles = 0; - int tried_avoid_double = 0; - - l = path->nodes[0]; - slot = path->slots[0]; - if (extend && data_size + btrfs_item_size_nr(l, slot) + - sizeof(struct btrfs_item) > BTRFS_LEAF_DATA_SIZE(root)) - return -EOVERFLOW; - - /* first try to make some room by pushing left and right */ - if (data_size) { - wret = push_leaf_right(trans, root, path, data_size, - data_size, 0, 0); - if (wret < 0) - return wret; - if (wret) { - wret = push_leaf_left(trans, root, path, data_size, - data_size, 0, (u32)-1); - if (wret < 0) - return wret; - } - l = path->nodes[0]; - - /* did the pushes work? */ - if (btrfs_leaf_free_space(root, l) >= data_size) - return 0; - } - - if (!path->nodes[1]) { - ret = insert_new_root(trans, root, path, 1); - if (ret) - return ret; - } -again: - split = 1; - l = path->nodes[0]; - slot = path->slots[0]; - nritems = btrfs_header_nritems(l); - mid = (nritems + 1) / 2; - - if (mid <= slot) { - if (nritems == 1 || - leaf_space_used(l, mid, nritems - mid) + data_size > - BTRFS_LEAF_DATA_SIZE(root)) { - if (slot >= nritems) { - split = 0; - } else { - mid = slot; - if (mid != nritems && - leaf_space_used(l, mid, nritems - mid) + - data_size > BTRFS_LEAF_DATA_SIZE(root)) { - if (data_size && !tried_avoid_double) - goto push_for_double; - split = 2; - } - } - } - } else { - if (leaf_space_used(l, 0, mid) + data_size > - BTRFS_LEAF_DATA_SIZE(root)) { - if (!extend && data_size && slot == 0) { - split = 0; - } else if ((extend || !data_size) && slot == 0) { - mid = 1; - } else { - mid = slot; - if (mid != nritems && - leaf_space_used(l, mid, nritems - mid) + - data_size > BTRFS_LEAF_DATA_SIZE(root)) { - if (data_size && !tried_avoid_double) - goto push_for_double; - split = 2 ; - } - } - } - } - - if (split == 0) - btrfs_cpu_key_to_disk(&disk_key, ins_key); - else - btrfs_item_key(l, &disk_key, mid); - - right = btrfs_alloc_free_block(trans, root, root->leafsize, 0, - root->root_key.objectid, - &disk_key, 0, l->start, 0, 0); - if (IS_ERR(right)) - return PTR_ERR(right); - - root_add_used(root, root->leafsize); - - memset_extent_buffer(right, 0, 0, sizeof(struct btrfs_header)); - btrfs_set_header_bytenr(right, right->start); - btrfs_set_header_generation(right, trans->transid); - btrfs_set_header_backref_rev(right, BTRFS_MIXED_BACKREF_REV); - btrfs_set_header_owner(right, root->root_key.objectid); - btrfs_set_header_level(right, 0); - write_extent_buffer(right, root->fs_info->fsid, - (unsigned long)btrfs_header_fsid(right), - BTRFS_FSID_SIZE); - - write_extent_buffer(right, root->fs_info->chunk_tree_uuid, - (unsigned long)btrfs_header_chunk_tree_uuid(right), - BTRFS_UUID_SIZE); - - if (split == 0) { - if (mid <= slot) { - btrfs_set_header_nritems(right, 0); - insert_ptr(trans, root, path, &disk_key, right->start, - path->slots[1] + 1, 1); - btrfs_tree_unlock(path->nodes[0]); - free_extent_buffer(path->nodes[0]); - path->nodes[0] = right; - path->slots[0] = 0; - path->slots[1] += 1; - } else { - btrfs_set_header_nritems(right, 0); - insert_ptr(trans, root, path, &disk_key, right->start, - path->slots[1], 1); - btrfs_tree_unlock(path->nodes[0]); - free_extent_buffer(path->nodes[0]); - path->nodes[0] = right; - 
path->slots[0] = 0; - if (path->slots[1] == 0) - fixup_low_keys(trans, root, path, - &disk_key, 1); - } - btrfs_mark_buffer_dirty(right); - return ret; - } - - copy_for_split(trans, root, path, l, right, slot, mid, nritems); - - if (split == 2) { - BUG_ON(num_doubles != 0); - num_doubles++; - goto again; - } - - return 0; - -push_for_double: - push_for_double_split(trans, root, path, data_size); - tried_avoid_double = 1; - if (btrfs_leaf_free_space(root, path->nodes[0]) >= data_size) - return 0; - goto again; -} - -static noinline int setup_leaf_for_split(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct btrfs_path *path, int ins_len) -{ - struct btrfs_key key; - struct extent_buffer *leaf; - struct btrfs_file_extent_item *fi; - u64 extent_len = 0; - u32 item_size; - int ret; - - leaf = path->nodes[0]; - btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); - - BUG_ON(key.type != BTRFS_EXTENT_DATA_KEY && - key.type != BTRFS_EXTENT_CSUM_KEY); - - if (btrfs_leaf_free_space(root, leaf) >= ins_len) - return 0; - - item_size = btrfs_item_size_nr(leaf, path->slots[0]); - if (key.type == BTRFS_EXTENT_DATA_KEY) { - fi = btrfs_item_ptr(leaf, path->slots[0], - struct btrfs_file_extent_item); - extent_len = btrfs_file_extent_num_bytes(leaf, fi); - } - btrfs_release_path(path); - - path->keep_locks = 1; - path->search_for_split = 1; - ret = btrfs_search_slot(trans, root, &key, path, 0, 1); - path->search_for_split = 0; - if (ret < 0) - goto err; - - ret = -EAGAIN; - leaf = path->nodes[0]; - /* if our item isn't there or got smaller, return now */ - if (ret > 0 || item_size != btrfs_item_size_nr(leaf, path->slots[0])) - goto err; - - /* the leaf has changed, it now has room. return now */ - if (btrfs_leaf_free_space(root, path->nodes[0]) >= ins_len) - goto err; - - if (key.type == BTRFS_EXTENT_DATA_KEY) { - fi = btrfs_item_ptr(leaf, path->slots[0], - struct btrfs_file_extent_item); - if (extent_len != btrfs_file_extent_num_bytes(leaf, fi)) - goto err; - } - - btrfs_set_path_blocking(path); - ret = split_leaf(trans, root, &key, path, ins_len, 1); - if (ret) - goto err; - - path->keep_locks = 0; - btrfs_unlock_up_safe(path, 1); - return 0; -err: - path->keep_locks = 0; - return ret; -} - -static noinline int split_item(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct btrfs_path *path, - struct btrfs_key *new_key, - unsigned long split_offset) -{ - struct extent_buffer *leaf; - struct btrfs_item *item; - struct btrfs_item *new_item; - int slot; - char *buf; - u32 nritems; - u32 item_size; - u32 orig_offset; - struct btrfs_disk_key disk_key; - - leaf = path->nodes[0]; - BUG_ON(btrfs_leaf_free_space(root, leaf) < sizeof(struct btrfs_item)); - - btrfs_set_path_blocking(path); - - item = btrfs_item_nr(leaf, path->slots[0]); - orig_offset = btrfs_item_offset(leaf, item); - item_size = btrfs_item_size(leaf, item); - - buf = kmalloc(item_size, GFP_NOFS); - if (!buf) - return -ENOMEM; - - read_extent_buffer(leaf, buf, btrfs_item_ptr_offset(leaf, - path->slots[0]), item_size); - - slot = path->slots[0] + 1; - nritems = btrfs_header_nritems(leaf); - if (slot != nritems) { - /* shift the items */ - memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot + 1), - btrfs_item_nr_offset(slot), - (nritems - slot) * sizeof(struct btrfs_item)); - } - - btrfs_cpu_key_to_disk(&disk_key, new_key); - btrfs_set_item_key(leaf, &disk_key, slot); - - new_item = btrfs_item_nr(leaf, slot); - - btrfs_set_item_offset(leaf, new_item, orig_offset); - btrfs_set_item_size(leaf, new_item, item_size - 
split_offset); - - btrfs_set_item_offset(leaf, item, - orig_offset + item_size - split_offset); - btrfs_set_item_size(leaf, item, split_offset); - - btrfs_set_header_nritems(leaf, nritems + 1); - - /* write the data for the start of the original item */ - write_extent_buffer(leaf, buf, - btrfs_item_ptr_offset(leaf, path->slots[0]), - split_offset); - - /* write the data for the new item */ - write_extent_buffer(leaf, buf + split_offset, - btrfs_item_ptr_offset(leaf, slot), - item_size - split_offset); - btrfs_mark_buffer_dirty(leaf); - - BUG_ON(btrfs_leaf_free_space(root, leaf) < 0); - kfree(buf); - return 0; -} - -/* - * This function splits a single item into two items, - * giving 'new_key' to the new item and splitting the - * old one at split_offset (from the start of the item). - * - * The path may be released by this operation. After - * the split, the path is pointing to the old item. The - * new item is going to be in the same node as the old one. - * - * Note, the item being split must be smaller enough to live alone on - * a tree block with room for one extra struct btrfs_item - * - * This allows us to split the item in place, keeping a lock on the - * leaf the entire time. - */ -int btrfs_split_item(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct btrfs_path *path, - struct btrfs_key *new_key, - unsigned long split_offset) -{ - int ret; - ret = setup_leaf_for_split(trans, root, path, - sizeof(struct btrfs_item)); - if (ret) - return ret; - - ret = split_item(trans, root, path, new_key, split_offset); - return ret; -} - -/* - * This function duplicate a item, giving 'new_key' to the new item. - * It guarantees both items live in the same tree leaf and the new item - * is contiguous with the original item. - * - * This allows us to split file extent in place, keeping a lock on the - * leaf the entire time. - */ -int btrfs_duplicate_item(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct btrfs_path *path, - struct btrfs_key *new_key) -{ - struct extent_buffer *leaf; - int ret; - u32 item_size; - - leaf = path->nodes[0]; - item_size = btrfs_item_size_nr(leaf, path->slots[0]); - ret = setup_leaf_for_split(trans, root, path, - item_size + sizeof(struct btrfs_item)); - if (ret) - return ret; - - path->slots[0]++; - setup_items_for_insert(trans, root, path, new_key, &item_size, - item_size, item_size + - sizeof(struct btrfs_item), 1); - leaf = path->nodes[0]; - memcpy_extent_buffer(leaf, - btrfs_item_ptr_offset(leaf, path->slots[0]), - btrfs_item_ptr_offset(leaf, path->slots[0] - 1), - item_size); - return 0; -} - -/* - * make the item pointed to by the path smaller. new_size indicates - * how small to make it, and from_end tells us if we just chop bytes - * off the end of the item or if we shift the item to chop bytes off - * the front. 
- */ -void btrfs_truncate_item(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct btrfs_path *path, - u32 new_size, int from_end) -{ - int slot; - struct extent_buffer *leaf; - struct btrfs_item *item; - u32 nritems; - unsigned int data_end; - unsigned int old_data_start; - unsigned int old_size; - unsigned int size_diff; - int i; - struct btrfs_map_token token; - - btrfs_init_map_token(&token); - - leaf = path->nodes[0]; - slot = path->slots[0]; - - old_size = btrfs_item_size_nr(leaf, slot); - if (old_size == new_size) - return; - - nritems = btrfs_header_nritems(leaf); - data_end = leaf_data_end(root, leaf); - - old_data_start = btrfs_item_offset_nr(leaf, slot); - - size_diff = old_size - new_size; - - BUG_ON(slot < 0); - BUG_ON(slot >= nritems); - - /* - * item0..itemN ... dataN.offset..dataN.size .. data0.size - */ - /* first correct the data pointers */ - for (i = slot; i < nritems; i++) { - u32 ioff; - item = btrfs_item_nr(leaf, i); - - ioff = btrfs_token_item_offset(leaf, item, &token); - btrfs_set_token_item_offset(leaf, item, - ioff + size_diff, &token); - } - - /* shift the data */ - if (from_end) { - memmove_extent_buffer(leaf, btrfs_leaf_data(leaf) + - data_end + size_diff, btrfs_leaf_data(leaf) + - data_end, old_data_start + new_size - data_end); - } else { - struct btrfs_disk_key disk_key; - u64 offset; - - btrfs_item_key(leaf, &disk_key, slot); - - if (btrfs_disk_key_type(&disk_key) == BTRFS_EXTENT_DATA_KEY) { - unsigned long ptr; - struct btrfs_file_extent_item *fi; - - fi = btrfs_item_ptr(leaf, slot, - struct btrfs_file_extent_item); - fi = (struct btrfs_file_extent_item *)( - (unsigned long)fi - size_diff); - - if (btrfs_file_extent_type(leaf, fi) == - BTRFS_FILE_EXTENT_INLINE) { - ptr = btrfs_item_ptr_offset(leaf, slot); - memmove_extent_buffer(leaf, ptr, - (unsigned long)fi, - offsetof(struct btrfs_file_extent_item, - disk_bytenr)); - } - } - - memmove_extent_buffer(leaf, btrfs_leaf_data(leaf) + - data_end + size_diff, btrfs_leaf_data(leaf) + - data_end, old_data_start - data_end); - - offset = btrfs_disk_key_offset(&disk_key); - btrfs_set_disk_key_offset(&disk_key, offset + size_diff); - btrfs_set_item_key(leaf, &disk_key, slot); - if (slot == 0) - fixup_low_keys(trans, root, path, &disk_key, 1); - } - - item = btrfs_item_nr(leaf, slot); - btrfs_set_item_size(leaf, item, new_size); - btrfs_mark_buffer_dirty(leaf); - - if (btrfs_leaf_free_space(root, leaf) < 0) { - btrfs_print_leaf(root, leaf); - BUG(); - } -} - -/* - * make the item pointed to by the path bigger, data_size is the new size. - */ -void btrfs_extend_item(struct btrfs_trans_handle *trans, - struct btrfs_root *root, struct btrfs_path *path, - u32 data_size) -{ - int slot; - struct extent_buffer *leaf; - struct btrfs_item *item; - u32 nritems; - unsigned int data_end; - unsigned int old_data; - unsigned int old_size; - int i; - struct btrfs_map_token token; - - btrfs_init_map_token(&token); - - leaf = path->nodes[0]; - - nritems = btrfs_header_nritems(leaf); - data_end = leaf_data_end(root, leaf); - - if (btrfs_leaf_free_space(root, leaf) < data_size) { - btrfs_print_leaf(root, leaf); - BUG(); - } - slot = path->slots[0]; - old_data = btrfs_item_end_nr(leaf, slot); - - BUG_ON(slot < 0); - if (slot >= nritems) { - btrfs_print_leaf(root, leaf); - printk(KERN_CRIT "slot %d too large, nritems %d\n", - slot, nritems); - BUG_ON(1); - } - - /* - * item0..itemN ... dataN.offset..dataN.size .. 
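/*
 * Sketch of the offset arithmetic shared by btrfs_truncate_item() and
 * btrfs_extend_item() above: item data is packed at the tail of the leaf, so
 * shrinking item 'slot' by size_diff slides its data and the data of every
 * later item towards the tail (offsets grow), while extending slides them
 * towards the front (offsets shrink).  sketch_* names are made up.
 */
#include <stdint.h>
#include <stdio.h>

static void sketch_shift_item_offsets(uint32_t *offsets, uint32_t nritems,
				      uint32_t slot, int32_t size_diff)
{
	for (uint32_t i = slot; i < nritems; i++)
		offsets[i] = (uint32_t)((int32_t)offsets[i] + size_diff);
}

int main(void)
{
	/* three items whose data currently starts at these leaf offsets */
	uint32_t offsets[3] = { 3900, 3700, 3400 };

	/* shrink item 1 by 100 bytes: items 1 and 2 move towards the tail */
	sketch_shift_item_offsets(offsets, 3, 1, 100);
	printf("%u %u %u\n", (unsigned)offsets[0], (unsigned)offsets[1],
	       (unsigned)offsets[2]);
	return 0;
}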
data0.size - */ - /* first correct the data pointers */ - for (i = slot; i < nritems; i++) { - u32 ioff; - item = btrfs_item_nr(leaf, i); - - ioff = btrfs_token_item_offset(leaf, item, &token); - btrfs_set_token_item_offset(leaf, item, - ioff - data_size, &token); - } - - /* shift the data */ - memmove_extent_buffer(leaf, btrfs_leaf_data(leaf) + - data_end - data_size, btrfs_leaf_data(leaf) + - data_end, old_data - data_end); - - data_end = old_data; - old_size = btrfs_item_size_nr(leaf, slot); - item = btrfs_item_nr(leaf, slot); - btrfs_set_item_size(leaf, item, old_size + data_size); - btrfs_mark_buffer_dirty(leaf); - - if (btrfs_leaf_free_space(root, leaf) < 0) { - btrfs_print_leaf(root, leaf); - BUG(); - } -} - -/* - * Given a key and some data, insert items into the tree. - * This does all the path init required, making room in the tree if needed. - * Returns the number of keys that were inserted. - */ -int btrfs_insert_some_items(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct btrfs_path *path, - struct btrfs_key *cpu_key, u32 *data_size, - int nr) -{ - struct extent_buffer *leaf; - struct btrfs_item *item; - int ret = 0; - int slot; - int i; - u32 nritems; - u32 total_data = 0; - u32 total_size = 0; - unsigned int data_end; - struct btrfs_disk_key disk_key; - struct btrfs_key found_key; - struct btrfs_map_token token; - - btrfs_init_map_token(&token); - - for (i = 0; i < nr; i++) { - if (total_size + data_size[i] + sizeof(struct btrfs_item) > - BTRFS_LEAF_DATA_SIZE(root)) { - break; - nr = i; - } - total_data += data_size[i]; - total_size += data_size[i] + sizeof(struct btrfs_item); - } - BUG_ON(nr == 0); - - ret = btrfs_search_slot(trans, root, cpu_key, path, total_size, 1); - if (ret == 0) - return -EEXIST; - if (ret < 0) - goto out; - - leaf = path->nodes[0]; - - nritems = btrfs_header_nritems(leaf); - data_end = leaf_data_end(root, leaf); - - if (btrfs_leaf_free_space(root, leaf) < total_size) { - for (i = nr; i >= 0; i--) { - total_data -= data_size[i]; - total_size -= data_size[i] + sizeof(struct btrfs_item); - if (total_size < btrfs_leaf_free_space(root, leaf)) - break; - } - nr = i; - } - - slot = path->slots[0]; - BUG_ON(slot < 0); - - if (slot != nritems) { - unsigned int old_data = btrfs_item_end_nr(leaf, slot); - - item = btrfs_item_nr(leaf, slot); - btrfs_item_key_to_cpu(leaf, &found_key, slot); - - /* figure out how many keys we can insert in here */ - total_data = data_size[0]; - for (i = 1; i < nr; i++) { - if (btrfs_comp_cpu_keys(&found_key, cpu_key + i) <= 0) - break; - total_data += data_size[i]; - } - nr = i; - - if (old_data < data_end) { - btrfs_print_leaf(root, leaf); - printk(KERN_CRIT "slot %d old_data %d data_end %d\n", - slot, old_data, data_end); - BUG_ON(1); - } - /* - * item0..itemN ... dataN.offset..dataN.size .. 
data0.size - */ - /* first correct the data pointers */ - for (i = slot; i < nritems; i++) { - u32 ioff; - - item = btrfs_item_nr(leaf, i); - ioff = btrfs_token_item_offset(leaf, item, &token); - btrfs_set_token_item_offset(leaf, item, - ioff - total_data, &token); - } - /* shift the items */ - memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot + nr), - btrfs_item_nr_offset(slot), - (nritems - slot) * sizeof(struct btrfs_item)); - - /* shift the data */ - memmove_extent_buffer(leaf, btrfs_leaf_data(leaf) + - data_end - total_data, btrfs_leaf_data(leaf) + - data_end, old_data - data_end); - data_end = old_data; - } else { - /* - * this sucks but it has to be done, if we are inserting at - * the end of the leaf only insert 1 of the items, since we - * have no way of knowing whats on the next leaf and we'd have - * to drop our current locks to figure it out - */ - nr = 1; - } - - /* setup the item for the new data */ - for (i = 0; i < nr; i++) { - btrfs_cpu_key_to_disk(&disk_key, cpu_key + i); - btrfs_set_item_key(leaf, &disk_key, slot + i); - item = btrfs_item_nr(leaf, slot + i); - btrfs_set_token_item_offset(leaf, item, - data_end - data_size[i], &token); - data_end -= data_size[i]; - btrfs_set_token_item_size(leaf, item, data_size[i], &token); - } - btrfs_set_header_nritems(leaf, nritems + nr); - btrfs_mark_buffer_dirty(leaf); - - ret = 0; - if (slot == 0) { - btrfs_cpu_key_to_disk(&disk_key, cpu_key); - fixup_low_keys(trans, root, path, &disk_key, 1); - } - - if (btrfs_leaf_free_space(root, leaf) < 0) { - btrfs_print_leaf(root, leaf); - BUG(); - } -out: - if (!ret) - ret = nr; - return ret; -} - -/* - * this is a helper for btrfs_insert_empty_items, the main goal here is - * to save stack depth by doing the bulk of the work in a function - * that doesn't call btrfs_search_slot - */ -void setup_items_for_insert(struct btrfs_trans_handle *trans, - struct btrfs_root *root, struct btrfs_path *path, - struct btrfs_key *cpu_key, u32 *data_size, - u32 total_data, u32 total_size, int nr) -{ - struct btrfs_item *item; - int i; - u32 nritems; - unsigned int data_end; - struct btrfs_disk_key disk_key; - struct extent_buffer *leaf; - int slot; - struct btrfs_map_token token; - - btrfs_init_map_token(&token); - - leaf = path->nodes[0]; - slot = path->slots[0]; - - nritems = btrfs_header_nritems(leaf); - data_end = leaf_data_end(root, leaf); - - if (btrfs_leaf_free_space(root, leaf) < total_size) { - btrfs_print_leaf(root, leaf); - printk(KERN_CRIT "not enough freespace need %u have %d\n", - total_size, btrfs_leaf_free_space(root, leaf)); - BUG(); - } - - if (slot != nritems) { - unsigned int old_data = btrfs_item_end_nr(leaf, slot); - - if (old_data < data_end) { - btrfs_print_leaf(root, leaf); - printk(KERN_CRIT "slot %d old_data %d data_end %d\n", - slot, old_data, data_end); - BUG_ON(1); - } - /* - * item0..itemN ... dataN.offset..dataN.size .. 
data0.size - */ - /* first correct the data pointers */ - for (i = slot; i < nritems; i++) { - u32 ioff; - - item = btrfs_item_nr(leaf, i); - ioff = btrfs_token_item_offset(leaf, item, &token); - btrfs_set_token_item_offset(leaf, item, - ioff - total_data, &token); - } - /* shift the items */ - memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot + nr), - btrfs_item_nr_offset(slot), - (nritems - slot) * sizeof(struct btrfs_item)); - - /* shift the data */ - memmove_extent_buffer(leaf, btrfs_leaf_data(leaf) + - data_end - total_data, btrfs_leaf_data(leaf) + - data_end, old_data - data_end); - data_end = old_data; - } - - /* setup the item for the new data */ - for (i = 0; i < nr; i++) { - btrfs_cpu_key_to_disk(&disk_key, cpu_key + i); - btrfs_set_item_key(leaf, &disk_key, slot + i); - item = btrfs_item_nr(leaf, slot + i); - btrfs_set_token_item_offset(leaf, item, - data_end - data_size[i], &token); - data_end -= data_size[i]; - btrfs_set_token_item_size(leaf, item, data_size[i], &token); - } - - btrfs_set_header_nritems(leaf, nritems + nr); - - if (slot == 0) { - btrfs_cpu_key_to_disk(&disk_key, cpu_key); - fixup_low_keys(trans, root, path, &disk_key, 1); - } - btrfs_unlock_up_safe(path, 1); - btrfs_mark_buffer_dirty(leaf); - - if (btrfs_leaf_free_space(root, leaf) < 0) { - btrfs_print_leaf(root, leaf); - BUG(); - } -} - -/* - * Given a key and some data, insert items into the tree. - * This does all the path init required, making room in the tree if needed. - */ -int btrfs_insert_empty_items(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct btrfs_path *path, - struct btrfs_key *cpu_key, u32 *data_size, - int nr) -{ - int ret = 0; - int slot; - int i; - u32 total_size = 0; - u32 total_data = 0; - - for (i = 0; i < nr; i++) - total_data += data_size[i]; - - total_size = total_data + (nr * sizeof(struct btrfs_item)); - ret = btrfs_search_slot(trans, root, cpu_key, path, total_size, 1); - if (ret == 0) - return -EEXIST; - if (ret < 0) - return ret; - - slot = path->slots[0]; - BUG_ON(slot < 0); - - setup_items_for_insert(trans, root, path, cpu_key, data_size, - total_data, total_size, nr); - return 0; -} - -/* - * Given a key and some data, insert an item into the tree. - * This does all the path init required, making room in the tree if needed. - */ -int btrfs_insert_item(struct btrfs_trans_handle *trans, struct btrfs_root - *root, struct btrfs_key *cpu_key, void *data, u32 - data_size) -{ - int ret = 0; - struct btrfs_path *path; - struct extent_buffer *leaf; - unsigned long ptr; - - path = btrfs_alloc_path(); - if (!path) - return -ENOMEM; - ret = btrfs_insert_empty_item(trans, root, path, cpu_key, data_size); - if (!ret) { - leaf = path->nodes[0]; - ptr = btrfs_item_ptr_offset(leaf, path->slots[0]); - write_extent_buffer(leaf, data, ptr, data_size); - btrfs_mark_buffer_dirty(leaf); - } - btrfs_free_path(path); - return ret; -} - -/* - * delete the pointer from a given node. - * - * the tree should have been previously balanced so the deletion does not - * empty a node. 
- */ -static void del_ptr(struct btrfs_trans_handle *trans, struct btrfs_root *root, - struct btrfs_path *path, int level, int slot) -{ - struct extent_buffer *parent = path->nodes[level]; - u32 nritems; - - nritems = btrfs_header_nritems(parent); - if (slot != nritems - 1) { - memmove_extent_buffer(parent, - btrfs_node_key_ptr_offset(slot), - btrfs_node_key_ptr_offset(slot + 1), - sizeof(struct btrfs_key_ptr) * - (nritems - slot - 1)); - } - nritems--; - btrfs_set_header_nritems(parent, nritems); - if (nritems == 0 && parent == root->node) { - BUG_ON(btrfs_header_level(root->node) != 1); - /* just turn the root into a leaf and break */ - btrfs_set_header_level(root->node, 0); - } else if (slot == 0) { - struct btrfs_disk_key disk_key; - - btrfs_node_key(parent, &disk_key, 0); - fixup_low_keys(trans, root, path, &disk_key, level + 1); - } - btrfs_mark_buffer_dirty(parent); -} - -/* - * a helper function to delete the leaf pointed to by path->slots[1] and - * path->nodes[1]. - * - * This deletes the pointer in path->nodes[1] and frees the leaf - * block extent. zero is returned if it all worked out, < 0 otherwise. - * - * The path must have already been setup for deleting the leaf, including - * all the proper balancing. path->nodes[1] must be locked. - */ -static noinline void btrfs_del_leaf(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct btrfs_path *path, - struct extent_buffer *leaf) -{ - WARN_ON(btrfs_header_generation(leaf) != trans->transid); - del_ptr(trans, root, path, 1, path->slots[1]); - - /* - * btrfs_free_extent is expensive, we want to make sure we - * aren't holding any locks when we call it - */ - btrfs_unlock_up_safe(path, 0); - - root_sub_used(root, leaf->len); - - extent_buffer_get(leaf); - btrfs_free_tree_block(trans, root, leaf, 0, 1, 0); - free_extent_buffer_stale(leaf); -} -/* - * delete the item at the leaf level in path. 
If that empties - * the leaf, remove it from the tree - */ -int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root, - struct btrfs_path *path, int slot, int nr) -{ - struct extent_buffer *leaf; - struct btrfs_item *item; - int last_off; - int dsize = 0; - int ret = 0; - int wret; - int i; - u32 nritems; - struct btrfs_map_token token; - - btrfs_init_map_token(&token); - - leaf = path->nodes[0]; - last_off = btrfs_item_offset_nr(leaf, slot + nr - 1); - - for (i = 0; i < nr; i++) - dsize += btrfs_item_size_nr(leaf, slot + i); - - nritems = btrfs_header_nritems(leaf); - - if (slot + nr != nritems) { - int data_end = leaf_data_end(root, leaf); - - memmove_extent_buffer(leaf, btrfs_leaf_data(leaf) + - data_end + dsize, - btrfs_leaf_data(leaf) + data_end, - last_off - data_end); - - for (i = slot + nr; i < nritems; i++) { - u32 ioff; - - item = btrfs_item_nr(leaf, i); - ioff = btrfs_token_item_offset(leaf, item, &token); - btrfs_set_token_item_offset(leaf, item, - ioff + dsize, &token); - } - - memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot), - btrfs_item_nr_offset(slot + nr), - sizeof(struct btrfs_item) * - (nritems - slot - nr)); - } - btrfs_set_header_nritems(leaf, nritems - nr); - nritems -= nr; - - /* delete the leaf if we've emptied it */ - if (nritems == 0) { - if (leaf == root->node) { - btrfs_set_header_level(leaf, 0); - } else { - btrfs_set_path_blocking(path); - clean_tree_block(trans, root, leaf); - btrfs_del_leaf(trans, root, path, leaf); - } - } else { - int used = leaf_space_used(leaf, 0, nritems); - if (slot == 0) { - struct btrfs_disk_key disk_key; - - btrfs_item_key(leaf, &disk_key, 0); - fixup_low_keys(trans, root, path, &disk_key, 1); - } - - /* delete the leaf if it is mostly empty */ - if (used < BTRFS_LEAF_DATA_SIZE(root) / 3) { - /* push_leaf_left fixes the path. - * make sure the path still points to our leaf - * for possible call to del_ptr below - */ - slot = path->slots[1]; - extent_buffer_get(leaf); - - btrfs_set_path_blocking(path); - wret = push_leaf_left(trans, root, path, 1, 1, - 1, (u32)-1); - if (wret < 0 && wret != -ENOSPC) - ret = wret; - - if (path->nodes[0] == leaf && - btrfs_header_nritems(leaf)) { - wret = push_leaf_right(trans, root, path, 1, - 1, 1, 0); - if (wret < 0 && wret != -ENOSPC) - ret = wret; - } - - if (btrfs_header_nritems(leaf) == 0) { - path->slots[1] = slot; - btrfs_del_leaf(trans, root, path, leaf); - free_extent_buffer(leaf); - ret = 0; - } else { - /* if we're still in the path, make sure - * we're dirty. Otherwise, one of the - * push_leaf functions must have already - * dirtied this buffer - */ - if (path->nodes[0] == leaf) - btrfs_mark_buffer_dirty(leaf); - free_extent_buffer(leaf); - } - } else { - btrfs_mark_buffer_dirty(leaf); - } - } - return ret; -} - -/* - * search the tree again to find a leaf with lesser keys - * returns 0 if it found something or 1 if there are no lesser leaves. - * returns < 0 on io errors. - * - * This may release the path, and so you may lose any locks held at the - * time you call it. 
- */ -int btrfs_prev_leaf(struct btrfs_root *root, struct btrfs_path *path) -{ - struct btrfs_key key; - struct btrfs_disk_key found_key; - int ret; - - btrfs_item_key_to_cpu(path->nodes[0], &key, 0); - - if (key.offset > 0) - key.offset--; - else if (key.type > 0) - key.type--; - else if (key.objectid > 0) - key.objectid--; - else - return 1; - - btrfs_release_path(path); - ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); - if (ret < 0) - return ret; - btrfs_item_key(path->nodes[0], &found_key, 0); - ret = comp_keys(&found_key, &key); - if (ret < 0) - return 0; - return 1; -} - -/* - * A helper function to walk down the tree starting at min_key, and looking - * for nodes or leaves that are either in cache or have a minimum - * transaction id. This is used by the btree defrag code, and tree logging - * - * This does not cow, but it does stuff the starting key it finds back - * into min_key, so you can call btrfs_search_slot with cow=1 on the - * key and get a writable path. - * - * This does lock as it descends, and path->keep_locks should be set - * to 1 by the caller. - * - * This honors path->lowest_level to prevent descent past a given level - * of the tree. - * - * min_trans indicates the oldest transaction that you are interested - * in walking through. Any nodes or leaves older than min_trans are - * skipped over (without reading them). - * - * returns zero if something useful was found, < 0 on error and 1 if there - * was nothing in the tree that matched the search criteria. - */ -int btrfs_search_forward(struct btrfs_root *root, struct btrfs_key *min_key, - struct btrfs_key *max_key, - struct btrfs_path *path, int cache_only, - u64 min_trans) -{ - struct extent_buffer *cur; - struct btrfs_key found_key; - int slot; - int sret; - u32 nritems; - int level; - int ret = 1; - - WARN_ON(!path->keep_locks); -again: - cur = btrfs_read_lock_root_node(root); - level = btrfs_header_level(cur); - WARN_ON(path->nodes[level]); - path->nodes[level] = cur; - path->locks[level] = BTRFS_READ_LOCK; - - if (btrfs_header_generation(cur) < min_trans) { - ret = 1; - goto out; - } - while (1) { - nritems = btrfs_header_nritems(cur); - level = btrfs_header_level(cur); - sret = bin_search(cur, min_key, level, &slot); - - /* at the lowest level, we're done, setup the path and exit */ - if (level == path->lowest_level) { - if (slot >= nritems) - goto find_next_key; - ret = 0; - path->slots[level] = slot; - btrfs_item_key_to_cpu(cur, &found_key, slot); - goto out; - } - if (sret && slot > 0) - slot--; - /* - * check this node pointer against the cache_only and - * min_trans parameters. If it isn't in cache or is too - * old, skip to the next one. 
- */ - while (slot < nritems) { - u64 blockptr; - u64 gen; - struct extent_buffer *tmp; - struct btrfs_disk_key disk_key; - - blockptr = btrfs_node_blockptr(cur, slot); - gen = btrfs_node_ptr_generation(cur, slot); - if (gen < min_trans) { - slot++; - continue; - } - if (!cache_only) - break; - - if (max_key) { - btrfs_node_key(cur, &disk_key, slot); - if (comp_keys(&disk_key, max_key) >= 0) { - ret = 1; - goto out; - } - } - - tmp = btrfs_find_tree_block(root, blockptr, - btrfs_level_size(root, level - 1)); - - if (tmp && btrfs_buffer_uptodate(tmp, gen, 1) > 0) { - free_extent_buffer(tmp); - break; - } - if (tmp) - free_extent_buffer(tmp); - slot++; - } -find_next_key: - /* - * we didn't find a candidate key in this node, walk forward - * and find another one - */ - if (slot >= nritems) { - path->slots[level] = slot; - btrfs_set_path_blocking(path); - sret = btrfs_find_next_key(root, path, min_key, level, - cache_only, min_trans); - if (sret == 0) { - btrfs_release_path(path); - goto again; - } else { - goto out; - } - } - /* save our key for returning back */ - btrfs_node_key_to_cpu(cur, &found_key, slot); - path->slots[level] = slot; - if (level == path->lowest_level) { - ret = 0; - unlock_up(path, level, 1, 0, NULL); - goto out; - } - btrfs_set_path_blocking(path); - cur = read_node_slot(root, cur, slot); - BUG_ON(!cur); /* -ENOMEM */ - - btrfs_tree_read_lock(cur); - - path->locks[level - 1] = BTRFS_READ_LOCK; - path->nodes[level - 1] = cur; - unlock_up(path, level, 1, 0, NULL); - btrfs_clear_path_blocking(path, NULL, 0); - } -out: - if (ret == 0) - memcpy(min_key, &found_key, sizeof(found_key)); - btrfs_set_path_blocking(path); - return ret; -} - -/* - * this is similar to btrfs_next_leaf, but does not try to preserve - * and fixup the path. It looks for and returns the next key in the - * tree based on the current path and the cache_only and min_trans - * parameters. - * - * 0 is returned if another key is found, < 0 if there are any errors - * and 1 is returned if there are no higher keys in the tree - * - * path->keep_locks should be set to 1 on the search made before - * calling this function. 
- */ -int btrfs_find_next_key(struct btrfs_root *root, struct btrfs_path *path, - struct btrfs_key *key, int level, - int cache_only, u64 min_trans) -{ - int slot; - struct extent_buffer *c; - - WARN_ON(!path->keep_locks); - while (level < BTRFS_MAX_LEVEL) { - if (!path->nodes[level]) - return 1; - - slot = path->slots[level] + 1; - c = path->nodes[level]; -next: - if (slot >= btrfs_header_nritems(c)) { - int ret; - int orig_lowest; - struct btrfs_key cur_key; - if (level + 1 >= BTRFS_MAX_LEVEL || - !path->nodes[level + 1]) - return 1; - - if (path->locks[level + 1]) { - level++; - continue; - } - - slot = btrfs_header_nritems(c) - 1; - if (level == 0) - btrfs_item_key_to_cpu(c, &cur_key, slot); - else - btrfs_node_key_to_cpu(c, &cur_key, slot); - - orig_lowest = path->lowest_level; - btrfs_release_path(path); - path->lowest_level = level; - ret = btrfs_search_slot(NULL, root, &cur_key, path, - 0, 0); - path->lowest_level = orig_lowest; - if (ret < 0) - return ret; - - c = path->nodes[level]; - slot = path->slots[level]; - if (ret == 0) - slot++; - goto next; - } - - if (level == 0) - btrfs_item_key_to_cpu(c, key, slot); - else { - u64 blockptr = btrfs_node_blockptr(c, slot); - u64 gen = btrfs_node_ptr_generation(c, slot); - - if (cache_only) { - struct extent_buffer *cur; - cur = btrfs_find_tree_block(root, blockptr, - btrfs_level_size(root, level - 1)); - if (!cur || - btrfs_buffer_uptodate(cur, gen, 1) <= 0) { - slot++; - if (cur) - free_extent_buffer(cur); - goto next; - } - free_extent_buffer(cur); - } - if (gen < min_trans) { - slot++; - goto next; - } - btrfs_node_key_to_cpu(c, key, slot); - } - return 0; - } - return 1; -} - -/* - * search the tree again to find a leaf with greater keys - * returns 0 if it found something or 1 if there are no greater leaves. - * returns < 0 on io errors. - */ -int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path) -{ - int slot; - int level; - struct extent_buffer *c; - struct extent_buffer *next; - struct btrfs_key key; - u32 nritems; - int ret; - int old_spinning = path->leave_spinning; - int next_rw_lock = 0; - - nritems = btrfs_header_nritems(path->nodes[0]); - if (nritems == 0) - return 1; - - btrfs_item_key_to_cpu(path->nodes[0], &key, nritems - 1); -again: - level = 1; - next = NULL; - next_rw_lock = 0; - btrfs_release_path(path); - - path->keep_locks = 1; - path->leave_spinning = 1; - - ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); - path->keep_locks = 0; - - if (ret < 0) - return ret; - - nritems = btrfs_header_nritems(path->nodes[0]); - /* - * by releasing the path above we dropped all our locks. A balance - * could have added more items next to the key that used to be - * at the very end of the block. So, check again here and - * advance the path if there are now more items available. 
- */ - if (nritems > 0 && path->slots[0] < nritems - 1) { - if (ret == 0) - path->slots[0]++; - ret = 0; - goto done; - } - - while (level < BTRFS_MAX_LEVEL) { - if (!path->nodes[level]) { - ret = 1; - goto done; - } - - slot = path->slots[level] + 1; - c = path->nodes[level]; - if (slot >= btrfs_header_nritems(c)) { - level++; - if (level == BTRFS_MAX_LEVEL) { - ret = 1; - goto done; - } - continue; - } - - if (next) { - btrfs_tree_unlock_rw(next, next_rw_lock); - free_extent_buffer(next); - } - - next = c; - next_rw_lock = path->locks[level]; - ret = read_block_for_search(NULL, root, path, &next, level, - slot, &key); - if (ret == -EAGAIN) - goto again; - - if (ret < 0) { - btrfs_release_path(path); - goto done; - } - - if (!path->skip_locking) { - ret = btrfs_try_tree_read_lock(next); - if (!ret) { - btrfs_set_path_blocking(path); - btrfs_tree_read_lock(next); - btrfs_clear_path_blocking(path, next, - BTRFS_READ_LOCK); - } - next_rw_lock = BTRFS_READ_LOCK; - } - break; - } - path->slots[level] = slot; - while (1) { - level--; - c = path->nodes[level]; - if (path->locks[level]) - btrfs_tree_unlock_rw(c, path->locks[level]); - - free_extent_buffer(c); - path->nodes[level] = next; - path->slots[level] = 0; - if (!path->skip_locking) - path->locks[level] = next_rw_lock; - if (!level) - break; - - ret = read_block_for_search(NULL, root, path, &next, level, - 0, &key); - if (ret == -EAGAIN) - goto again; - - if (ret < 0) { - btrfs_release_path(path); - goto done; - } - - if (!path->skip_locking) { - ret = btrfs_try_tree_read_lock(next); - if (!ret) { - btrfs_set_path_blocking(path); - btrfs_tree_read_lock(next); - btrfs_clear_path_blocking(path, next, - BTRFS_READ_LOCK); - } - next_rw_lock = BTRFS_READ_LOCK; - } - } - ret = 0; -done: - unlock_up(path, 0, 1, 0, NULL); - path->leave_spinning = old_spinning; - if (!old_spinning) - btrfs_set_path_blocking(path); - - return ret; -} - -/* - * this uses btrfs_prev_leaf to walk backwards in the tree, and keeps - * searching until it gets past min_objectid or finds an item of 'type' - * - * returns 0 if something is found, 1 if nothing was found and < 0 on error - */ -int btrfs_previous_item(struct btrfs_root *root, - struct btrfs_path *path, u64 min_objectid, - int type) -{ - struct btrfs_key found_key; - struct extent_buffer *leaf; - u32 nritems; - int ret; - - while (1) { - if (path->slots[0] == 0) { - btrfs_set_path_blocking(path); - ret = btrfs_prev_leaf(root, path); - if (ret != 0) - return ret; - } else { - path->slots[0]--; - } - leaf = path->nodes[0]; - nritems = btrfs_header_nritems(leaf); - if (nritems == 0) - return 1; - if (path->slots[0] == nritems) - path->slots[0]--; - - btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); - if (found_key.objectid < min_objectid) - break; - if (found_key.type == type) - return 0; - if (found_key.objectid == min_objectid && - found_key.type < type) - break; - } - return 1; -} diff --git a/ANDROID_3.4.5/fs/btrfs/ctree.h b/ANDROID_3.4.5/fs/btrfs/ctree.h deleted file mode 100644 index 8fd72331..00000000 --- a/ANDROID_3.4.5/fs/btrfs/ctree.h +++ /dev/null @@ -1,3101 +0,0 @@ -/* - * Copyright (C) 2007 Oracle. All rights reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public - * License v2 as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public - * License along with this program; if not, write to the - * Free Software Foundation, Inc., 59 Temple Place - Suite 330, - * Boston, MA 021110-1307, USA. - */ - -#ifndef __BTRFS_CTREE__ -#define __BTRFS_CTREE__ - -#include <linux/mm.h> -#include <linux/highmem.h> -#include <linux/fs.h> -#include <linux/rwsem.h> -#include <linux/completion.h> -#include <linux/backing-dev.h> -#include <linux/wait.h> -#include <linux/slab.h> -#include <linux/kobject.h> -#include <trace/events/btrfs.h> -#include <asm/kmap_types.h> -#include <linux/pagemap.h> -#include "extent_io.h" -#include "extent_map.h" -#include "async-thread.h" -#include "ioctl.h" - -struct btrfs_trans_handle; -struct btrfs_transaction; -struct btrfs_pending_snapshot; -extern struct kmem_cache *btrfs_trans_handle_cachep; -extern struct kmem_cache *btrfs_transaction_cachep; -extern struct kmem_cache *btrfs_bit_radix_cachep; -extern struct kmem_cache *btrfs_path_cachep; -extern struct kmem_cache *btrfs_free_space_cachep; -struct btrfs_ordered_sum; - -#define BTRFS_MAGIC "_BHRfS_M" - -#define BTRFS_MAX_MIRRORS 2 - -#define BTRFS_MAX_LEVEL 8 - -#define BTRFS_COMPAT_EXTENT_TREE_V0 - -/* - * files bigger than this get some pre-flushing when they are added - * to the ordered operations list. That way we limit the total - * work done by the commit - */ -#define BTRFS_ORDERED_OPERATIONS_FLUSH_LIMIT (8 * 1024 * 1024) - -/* holds pointers to all of the tree roots */ -#define BTRFS_ROOT_TREE_OBJECTID 1ULL - -/* stores information about which extents are in use, and reference counts */ -#define BTRFS_EXTENT_TREE_OBJECTID 2ULL - -/* - * chunk tree stores translations from logical -> physical block numbering - * the super block points to the chunk tree - */ -#define BTRFS_CHUNK_TREE_OBJECTID 3ULL - -/* - * stores information about which areas of a given device are in use. - * one per device. The tree of tree roots points to the device tree - */ -#define BTRFS_DEV_TREE_OBJECTID 4ULL - -/* one per subvolume, storing files and directories */ -#define BTRFS_FS_TREE_OBJECTID 5ULL - -/* directory objectid inside the root tree */ -#define BTRFS_ROOT_TREE_DIR_OBJECTID 6ULL - -/* holds checksums of all the data extents */ -#define BTRFS_CSUM_TREE_OBJECTID 7ULL - -/* for storing balance parameters in the root tree */ -#define BTRFS_BALANCE_OBJECTID -4ULL - -/* orphan objectid for tracking unlinked/truncated files */ -#define BTRFS_ORPHAN_OBJECTID -5ULL - -/* does write ahead logging to speed up fsyncs */ -#define BTRFS_TREE_LOG_OBJECTID -6ULL -#define BTRFS_TREE_LOG_FIXUP_OBJECTID -7ULL - -/* for space balancing */ -#define BTRFS_TREE_RELOC_OBJECTID -8ULL -#define BTRFS_DATA_RELOC_TREE_OBJECTID -9ULL - -/* - * extent checksums all have this objectid - * this allows them to share the logging tree - * for fsyncs - */ -#define BTRFS_EXTENT_CSUM_OBJECTID -10ULL - -/* For storing free space cache */ -#define BTRFS_FREE_SPACE_OBJECTID -11ULL - -/* - * The inode number assigned to the special inode for storing - * free ino cache - */ -#define BTRFS_FREE_INO_OBJECTID -12ULL - -/* dummy objectid represents multiple objectids */ -#define BTRFS_MULTIPLE_OBJECTIDS -255ULL - -/* - * All files have objectids in this range. - */ -#define BTRFS_FIRST_FREE_OBJECTID 256ULL -#define BTRFS_LAST_FREE_OBJECTID -256ULL -#define BTRFS_FIRST_CHUNK_TREE_OBJECTID 256ULL - - -/* - * the device items go into the chunk tree.
The key is in the form - * [ 1 BTRFS_DEV_ITEM_KEY device_id ] - */ -#define BTRFS_DEV_ITEMS_OBJECTID 1ULL - -#define BTRFS_BTREE_INODE_OBJECTID 1 - -#define BTRFS_EMPTY_SUBVOL_DIR_OBJECTID 2 - -/* - * the max metadata block size. This limit is somewhat artificial, - * but the memmove costs go through the roof for larger blocks. - */ -#define BTRFS_MAX_METADATA_BLOCKSIZE 65536 - -/* - * we can actually store much bigger names, but lets not confuse the rest - * of linux - */ -#define BTRFS_NAME_LEN 255 - -/* 32 bytes in various csum fields */ -#define BTRFS_CSUM_SIZE 32 - -/* csum types */ -#define BTRFS_CSUM_TYPE_CRC32 0 - -static int btrfs_csum_sizes[] = { 4, 0 }; - -/* four bytes for CRC32 */ -#define BTRFS_EMPTY_DIR_SIZE 0 - -#define BTRFS_FT_UNKNOWN 0 -#define BTRFS_FT_REG_FILE 1 -#define BTRFS_FT_DIR 2 -#define BTRFS_FT_CHRDEV 3 -#define BTRFS_FT_BLKDEV 4 -#define BTRFS_FT_FIFO 5 -#define BTRFS_FT_SOCK 6 -#define BTRFS_FT_SYMLINK 7 -#define BTRFS_FT_XATTR 8 -#define BTRFS_FT_MAX 9 - -/* - * The key defines the order in the tree, and so it also defines (optimal) - * block layout. - * - * objectid corresponds to the inode number. - * - * type tells us things about the object, and is a kind of stream selector. - * so for a given inode, keys with type of 1 might refer to the inode data, - * type of 2 may point to file data in the btree and type == 3 may point to - * extents. - * - * offset is the starting byte offset for this key in the stream. - * - * btrfs_disk_key is in disk byte order. struct btrfs_key is always - * in cpu native order. Otherwise they are identical and their sizes - * should be the same (ie both packed) - */ -struct btrfs_disk_key { - __le64 objectid; - u8 type; - __le64 offset; -} __attribute__ ((__packed__)); - -struct btrfs_key { - u64 objectid; - u8 type; - u64 offset; -} __attribute__ ((__packed__)); - -struct btrfs_mapping_tree { - struct extent_map_tree map_tree; -}; - -struct btrfs_dev_item { - /* the internal btrfs device id */ - __le64 devid; - - /* size of the device */ - __le64 total_bytes; - - /* bytes used */ - __le64 bytes_used; - - /* optimal io alignment for this device */ - __le32 io_align; - - /* optimal io width for this device */ - __le32 io_width; - - /* minimal io size for this device */ - __le32 sector_size; - - /* type and info about this device */ - __le64 type; - - /* expected generation for this device */ - __le64 generation; - - /* - * starting byte of this partition on the device, - * to allow for stripe alignment in the future - */ - __le64 start_offset; - - /* grouping information for allocation decisions */ - __le32 dev_group; - - /* seek speed 0-100 where 100 is fastest */ - u8 seek_speed; - - /* bandwidth 0-100 where 100 is fastest */ - u8 bandwidth; - - /* btrfs generated uuid for this device */ - u8 uuid[BTRFS_UUID_SIZE]; - - /* uuid of FS who owns this device */ - u8 fsid[BTRFS_UUID_SIZE]; -} __attribute__ ((__packed__)); - -struct btrfs_stripe { - __le64 devid; - __le64 offset; - u8 dev_uuid[BTRFS_UUID_SIZE]; -} __attribute__ ((__packed__)); - -struct btrfs_chunk { - /* size of this chunk in bytes */ - __le64 length; - - /* objectid of the root referencing this chunk */ - __le64 owner; - - __le64 stripe_len; - __le64 type; - - /* optimal io alignment for this chunk */ - __le32 io_align; - - /* optimal io width for this chunk */ - __le32 io_width; - - /* minimal io size for this chunk */ - __le32 sector_size; - - /* 2^16 stripes is quite a lot, a second limit is the size of a single - * item in the btree - */ - __le16 num_stripes; 
- - /* sub stripes only matter for raid10 */ - __le16 sub_stripes; - struct btrfs_stripe stripe; - /* additional stripes go here */ -} __attribute__ ((__packed__)); - -#define BTRFS_FREE_SPACE_EXTENT 1 -#define BTRFS_FREE_SPACE_BITMAP 2 - -struct btrfs_free_space_entry { - __le64 offset; - __le64 bytes; - u8 type; -} __attribute__ ((__packed__)); - -struct btrfs_free_space_header { - struct btrfs_disk_key location; - __le64 generation; - __le64 num_entries; - __le64 num_bitmaps; -} __attribute__ ((__packed__)); - -static inline unsigned long btrfs_chunk_item_size(int num_stripes) -{ - BUG_ON(num_stripes == 0); - return sizeof(struct btrfs_chunk) + - sizeof(struct btrfs_stripe) * (num_stripes - 1); -} - -#define BTRFS_HEADER_FLAG_WRITTEN (1ULL << 0) -#define BTRFS_HEADER_FLAG_RELOC (1ULL << 1) - -/* - * File system states - */ - -/* Errors detected */ -#define BTRFS_SUPER_FLAG_ERROR (1ULL << 2) - -#define BTRFS_SUPER_FLAG_SEEDING (1ULL << 32) -#define BTRFS_SUPER_FLAG_METADUMP (1ULL << 33) - -#define BTRFS_BACKREF_REV_MAX 256 -#define BTRFS_BACKREF_REV_SHIFT 56 -#define BTRFS_BACKREF_REV_MASK (((u64)BTRFS_BACKREF_REV_MAX - 1) << \ - BTRFS_BACKREF_REV_SHIFT) - -#define BTRFS_OLD_BACKREF_REV 0 -#define BTRFS_MIXED_BACKREF_REV 1 - -/* - * every tree block (leaf or node) starts with this header. - */ -struct btrfs_header { - /* these first four must match the super block */ - u8 csum[BTRFS_CSUM_SIZE]; - u8 fsid[BTRFS_FSID_SIZE]; /* FS specific uuid */ - __le64 bytenr; /* which block this node is supposed to live in */ - __le64 flags; - - /* allowed to be different from the super from here on down */ - u8 chunk_tree_uuid[BTRFS_UUID_SIZE]; - __le64 generation; - __le64 owner; - __le32 nritems; - u8 level; -} __attribute__ ((__packed__)); - -#define BTRFS_NODEPTRS_PER_BLOCK(r) (((r)->nodesize - \ - sizeof(struct btrfs_header)) / \ - sizeof(struct btrfs_key_ptr)) -#define __BTRFS_LEAF_DATA_SIZE(bs) ((bs) - sizeof(struct btrfs_header)) -#define BTRFS_LEAF_DATA_SIZE(r) (__BTRFS_LEAF_DATA_SIZE(r->leafsize)) -#define BTRFS_MAX_INLINE_DATA_SIZE(r) (BTRFS_LEAF_DATA_SIZE(r) - \ - sizeof(struct btrfs_item) - \ - sizeof(struct btrfs_file_extent_item)) -#define BTRFS_MAX_XATTR_SIZE(r) (BTRFS_LEAF_DATA_SIZE(r) - \ - sizeof(struct btrfs_item) -\ - sizeof(struct btrfs_dir_item)) - - -/* - * this is a very generous portion of the super block, giving us - * room to translate 14 chunks with 3 stripes each. - */ -#define BTRFS_SYSTEM_CHUNK_ARRAY_SIZE 2048 -#define BTRFS_LABEL_SIZE 256 - -/* - * just in case we somehow lose the roots and are not able to mount, - * we store an array of the roots from previous transactions - * in the super. 
- */ -#define BTRFS_NUM_BACKUP_ROOTS 4 -struct btrfs_root_backup { - __le64 tree_root; - __le64 tree_root_gen; - - __le64 chunk_root; - __le64 chunk_root_gen; - - __le64 extent_root; - __le64 extent_root_gen; - - __le64 fs_root; - __le64 fs_root_gen; - - __le64 dev_root; - __le64 dev_root_gen; - - __le64 csum_root; - __le64 csum_root_gen; - - __le64 total_bytes; - __le64 bytes_used; - __le64 num_devices; - /* future */ - __le64 unsed_64[4]; - - u8 tree_root_level; - u8 chunk_root_level; - u8 extent_root_level; - u8 fs_root_level; - u8 dev_root_level; - u8 csum_root_level; - /* future and to align */ - u8 unused_8[10]; -} __attribute__ ((__packed__)); - -/* - * the super block basically lists the main trees of the FS - * it currently lacks any block count etc etc - */ -struct btrfs_super_block { - u8 csum[BTRFS_CSUM_SIZE]; - /* the first 4 fields must match struct btrfs_header */ - u8 fsid[BTRFS_FSID_SIZE]; /* FS specific uuid */ - __le64 bytenr; /* this block number */ - __le64 flags; - - /* allowed to be different from the btrfs_header from here own down */ - __le64 magic; - __le64 generation; - __le64 root; - __le64 chunk_root; - __le64 log_root; - - /* this will help find the new super based on the log root */ - __le64 log_root_transid; - __le64 total_bytes; - __le64 bytes_used; - __le64 root_dir_objectid; - __le64 num_devices; - __le32 sectorsize; - __le32 nodesize; - __le32 leafsize; - __le32 stripesize; - __le32 sys_chunk_array_size; - __le64 chunk_root_generation; - __le64 compat_flags; - __le64 compat_ro_flags; - __le64 incompat_flags; - __le16 csum_type; - u8 root_level; - u8 chunk_root_level; - u8 log_root_level; - struct btrfs_dev_item dev_item; - - char label[BTRFS_LABEL_SIZE]; - - __le64 cache_generation; - - /* future expansion */ - __le64 reserved[31]; - u8 sys_chunk_array[BTRFS_SYSTEM_CHUNK_ARRAY_SIZE]; - struct btrfs_root_backup super_roots[BTRFS_NUM_BACKUP_ROOTS]; -} __attribute__ ((__packed__)); - -/* - * Compat flags that we support. If any incompat flags are set other than the - * ones specified below then we will fail to mount - */ -#define BTRFS_FEATURE_INCOMPAT_MIXED_BACKREF (1ULL << 0) -#define BTRFS_FEATURE_INCOMPAT_DEFAULT_SUBVOL (1ULL << 1) -#define BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS (1ULL << 2) -#define BTRFS_FEATURE_INCOMPAT_COMPRESS_LZO (1ULL << 3) -/* - * some patches floated around with a second compression method - * lets save that incompat here for when they do get in - * Note we don't actually support it, we're just reserving the - * number - */ -#define BTRFS_FEATURE_INCOMPAT_COMPRESS_LZOv2 (1ULL << 4) - -/* - * older kernels tried to do bigger metadata blocks, but the - * code was pretty buggy. Lets not let them try anymore. - */ -#define BTRFS_FEATURE_INCOMPAT_BIG_METADATA (1ULL << 5) - -#define BTRFS_FEATURE_COMPAT_SUPP 0ULL -#define BTRFS_FEATURE_COMPAT_RO_SUPP 0ULL -#define BTRFS_FEATURE_INCOMPAT_SUPP \ - (BTRFS_FEATURE_INCOMPAT_MIXED_BACKREF | \ - BTRFS_FEATURE_INCOMPAT_DEFAULT_SUBVOL | \ - BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS | \ - BTRFS_FEATURE_INCOMPAT_BIG_METADATA | \ - BTRFS_FEATURE_INCOMPAT_COMPRESS_LZO) - -/* - * A leaf is full of items. 
offset and size tell us where to find - * the item in the leaf (relative to the start of the data area) - */ -struct btrfs_item { - struct btrfs_disk_key key; - __le32 offset; - __le32 size; -} __attribute__ ((__packed__)); - -/* - * leaves have an item area and a data area: - * [item0, item1....itemN] [free space] [dataN...data1, data0] - * - * The data is separate from the items to get the keys closer together - * during searches. - */ -struct btrfs_leaf { - struct btrfs_header header; - struct btrfs_item items[]; -} __attribute__ ((__packed__)); - -/* - * all non-leaf blocks are nodes, they hold only keys and pointers to - * other blocks - */ -struct btrfs_key_ptr { - struct btrfs_disk_key key; - __le64 blockptr; - __le64 generation; -} __attribute__ ((__packed__)); - -struct btrfs_node { - struct btrfs_header header; - struct btrfs_key_ptr ptrs[]; -} __attribute__ ((__packed__)); - -/* - * btrfs_paths remember the path taken from the root down to the leaf. - * level 0 is always the leaf, and nodes[1...BTRFS_MAX_LEVEL] will point - * to any other levels that are present. - * - * The slots array records the index of the item or block pointer - * used while walking the tree. - */ -struct btrfs_path { - struct extent_buffer *nodes[BTRFS_MAX_LEVEL]; - int slots[BTRFS_MAX_LEVEL]; - /* if there is real range locking, this locks field will change */ - int locks[BTRFS_MAX_LEVEL]; - int reada; - /* keep some upper locks as we walk down */ - int lowest_level; - - /* - * set by btrfs_split_item, tells search_slot to keep all locks - * and to force calls to keep space in the nodes - */ - unsigned int search_for_split:1; - unsigned int keep_locks:1; - unsigned int skip_locking:1; - unsigned int leave_spinning:1; - unsigned int search_commit_root:1; -}; - -/* - * items in the extent btree are used to record the objectid of the - * owner of the block and the number of references - */ - -struct btrfs_extent_item { - __le64 refs; - __le64 generation; - __le64 flags; -} __attribute__ ((__packed__)); - -struct btrfs_extent_item_v0 { - __le32 refs; -} __attribute__ ((__packed__)); - -#define BTRFS_MAX_EXTENT_ITEM_SIZE(r) ((BTRFS_LEAF_DATA_SIZE(r) >> 4) - \ - sizeof(struct btrfs_item)) - -#define BTRFS_EXTENT_FLAG_DATA (1ULL << 0) -#define BTRFS_EXTENT_FLAG_TREE_BLOCK (1ULL << 1) - -/* following flags only apply to tree blocks */ - -/* use full backrefs for extent pointers in the block */ -#define BTRFS_BLOCK_FLAG_FULL_BACKREF (1ULL << 8) - -/* - * this flag is only used internally by scrub and may be changed at any time - * it is only declared here to avoid collisions - */ -#define BTRFS_EXTENT_FLAG_SUPER (1ULL << 48) - -struct btrfs_tree_block_info { - struct btrfs_disk_key key; - u8 level; -} __attribute__ ((__packed__)); - -struct btrfs_extent_data_ref { - __le64 root; - __le64 objectid; - __le64 offset; - __le32 count; -} __attribute__ ((__packed__)); - -struct btrfs_shared_data_ref { - __le32 count; -} __attribute__ ((__packed__)); - -struct btrfs_extent_inline_ref { - u8 type; - __le64 offset; -} __attribute__ ((__packed__)); - -/* old style backrefs item */ -struct btrfs_extent_ref_v0 { - __le64 root; - __le64 generation; - __le64 objectid; - __le32 count; -} __attribute__ ((__packed__)); - - -/* dev extents record free space on individual devices. The owner - * field points back to the chunk allocation mapping tree that allocated - * the extent. 
The chunk tree uuid field is a way to double check the owner - */ -struct btrfs_dev_extent { - __le64 chunk_tree; - __le64 chunk_objectid; - __le64 chunk_offset; - __le64 length; - u8 chunk_tree_uuid[BTRFS_UUID_SIZE]; -} __attribute__ ((__packed__)); - -struct btrfs_inode_ref { - __le64 index; - __le16 name_len; - /* name goes here */ -} __attribute__ ((__packed__)); - -struct btrfs_timespec { - __le64 sec; - __le32 nsec; -} __attribute__ ((__packed__)); - -enum btrfs_compression_type { - BTRFS_COMPRESS_NONE = 0, - BTRFS_COMPRESS_ZLIB = 1, - BTRFS_COMPRESS_LZO = 2, - BTRFS_COMPRESS_TYPES = 2, - BTRFS_COMPRESS_LAST = 3, -}; - -struct btrfs_inode_item { - /* nfs style generation number */ - __le64 generation; - /* transid that last touched this inode */ - __le64 transid; - __le64 size; - __le64 nbytes; - __le64 block_group; - __le32 nlink; - __le32 uid; - __le32 gid; - __le32 mode; - __le64 rdev; - __le64 flags; - - /* modification sequence number for NFS */ - __le64 sequence; - - /* - * a little future expansion, for more than this we can - * just grow the inode item and version it - */ - __le64 reserved[4]; - struct btrfs_timespec atime; - struct btrfs_timespec ctime; - struct btrfs_timespec mtime; - struct btrfs_timespec otime; -} __attribute__ ((__packed__)); - -struct btrfs_dir_log_item { - __le64 end; -} __attribute__ ((__packed__)); - -struct btrfs_dir_item { - struct btrfs_disk_key location; - __le64 transid; - __le16 data_len; - __le16 name_len; - u8 type; -} __attribute__ ((__packed__)); - -#define BTRFS_ROOT_SUBVOL_RDONLY (1ULL << 0) - -struct btrfs_root_item { - struct btrfs_inode_item inode; - __le64 generation; - __le64 root_dirid; - __le64 bytenr; - __le64 byte_limit; - __le64 bytes_used; - __le64 last_snapshot; - __le64 flags; - __le32 refs; - struct btrfs_disk_key drop_progress; - u8 drop_level; - u8 level; -} __attribute__ ((__packed__)); - -/* - * this is used for both forward and backward root refs - */ -struct btrfs_root_ref { - __le64 dirid; - __le64 sequence; - __le16 name_len; -} __attribute__ ((__packed__)); - -struct btrfs_disk_balance_args { - /* - * profiles to operate on, single is denoted by - * BTRFS_AVAIL_ALLOC_BIT_SINGLE - */ - __le64 profiles; - - /* usage filter */ - __le64 usage; - - /* devid filter */ - __le64 devid; - - /* devid subset filter [pstart..pend) */ - __le64 pstart; - __le64 pend; - - /* btrfs virtual address space subset filter [vstart..vend) */ - __le64 vstart; - __le64 vend; - - /* - * profile to convert to, single is denoted by - * BTRFS_AVAIL_ALLOC_BIT_SINGLE - */ - __le64 target; - - /* BTRFS_BALANCE_ARGS_* */ - __le64 flags; - - __le64 unused[8]; -} __attribute__ ((__packed__)); - -/* - * store balance parameters to disk so that balance can be properly - * resumed after crash or unmount - */ -struct btrfs_balance_item { - /* BTRFS_BALANCE_* */ - __le64 flags; - - struct btrfs_disk_balance_args data; - struct btrfs_disk_balance_args meta; - struct btrfs_disk_balance_args sys; - - __le64 unused[4]; -} __attribute__ ((__packed__)); - -#define BTRFS_FILE_EXTENT_INLINE 0 -#define BTRFS_FILE_EXTENT_REG 1 -#define BTRFS_FILE_EXTENT_PREALLOC 2 - -struct btrfs_file_extent_item { - /* - * transaction id that created this extent - */ - __le64 generation; - /* - * max number of bytes to hold this extent in ram - * when we split a compressed extent we can't know how big - * each of the resulting pieces will be. So, this is - * an upper limit on the size of the extent in ram instead of - * an exact limit. 
- */ - __le64 ram_bytes; - - /* - * 32 bits for the various ways we might encode the data, - * including compression and encryption. If any of these - * are set to something a given disk format doesn't understand - * it is treated like an incompat flag for reading and writing, - * but not for stat. - */ - u8 compression; - u8 encryption; - __le16 other_encoding; /* spare for later use */ - - /* are we inline data or a real extent? */ - u8 type; - - /* - * disk space consumed by the extent, checksum blocks are included - * in these numbers - */ - __le64 disk_bytenr; - __le64 disk_num_bytes; - /* - * the logical offset in file blocks (no csums) - * this extent record is for. This allows a file extent to point - * into the middle of an existing extent on disk, sharing it - * between two snapshots (useful if some bytes in the middle of the - * extent have changed - */ - __le64 offset; - /* - * the logical number of file blocks (no csums included). This - * always reflects the size uncompressed and without encoding. - */ - __le64 num_bytes; - -} __attribute__ ((__packed__)); - -struct btrfs_csum_item { - u8 csum; -} __attribute__ ((__packed__)); - -/* different types of block groups (and chunks) */ -#define BTRFS_BLOCK_GROUP_DATA (1ULL << 0) -#define BTRFS_BLOCK_GROUP_SYSTEM (1ULL << 1) -#define BTRFS_BLOCK_GROUP_METADATA (1ULL << 2) -#define BTRFS_BLOCK_GROUP_RAID0 (1ULL << 3) -#define BTRFS_BLOCK_GROUP_RAID1 (1ULL << 4) -#define BTRFS_BLOCK_GROUP_DUP (1ULL << 5) -#define BTRFS_BLOCK_GROUP_RAID10 (1ULL << 6) -#define BTRFS_BLOCK_GROUP_RESERVED BTRFS_AVAIL_ALLOC_BIT_SINGLE -#define BTRFS_NR_RAID_TYPES 5 - -#define BTRFS_BLOCK_GROUP_TYPE_MASK (BTRFS_BLOCK_GROUP_DATA | \ - BTRFS_BLOCK_GROUP_SYSTEM | \ - BTRFS_BLOCK_GROUP_METADATA) - -#define BTRFS_BLOCK_GROUP_PROFILE_MASK (BTRFS_BLOCK_GROUP_RAID0 | \ - BTRFS_BLOCK_GROUP_RAID1 | \ - BTRFS_BLOCK_GROUP_DUP | \ - BTRFS_BLOCK_GROUP_RAID10) -/* - * We need a bit for restriper to be able to tell when chunks of type - * SINGLE are available. This "extended" profile format is used in - * fs_info->avail_*_alloc_bits (in-memory) and balance item fields - * (on-disk). The corresponding on-disk bit in chunk.type is reserved - * to avoid remappings between two formats in future. 
- */ -#define BTRFS_AVAIL_ALLOC_BIT_SINGLE (1ULL << 48) - -#define BTRFS_EXTENDED_PROFILE_MASK (BTRFS_BLOCK_GROUP_PROFILE_MASK | \ - BTRFS_AVAIL_ALLOC_BIT_SINGLE) - -static inline u64 chunk_to_extended(u64 flags) -{ - if ((flags & BTRFS_BLOCK_GROUP_PROFILE_MASK) == 0) - flags |= BTRFS_AVAIL_ALLOC_BIT_SINGLE; - - return flags; -} -static inline u64 extended_to_chunk(u64 flags) -{ - return flags & ~BTRFS_AVAIL_ALLOC_BIT_SINGLE; -} - -struct btrfs_block_group_item { - __le64 used; - __le64 chunk_objectid; - __le64 flags; -} __attribute__ ((__packed__)); - -struct btrfs_space_info { - u64 flags; - - u64 total_bytes; /* total bytes in the space, - this doesn't take mirrors into account */ - u64 bytes_used; /* total bytes used, - this doesn't take mirrors into account */ - u64 bytes_pinned; /* total bytes pinned, will be freed when the - transaction finishes */ - u64 bytes_reserved; /* total bytes the allocator has reserved for - current allocations */ - u64 bytes_readonly; /* total bytes that are read only */ - - u64 bytes_may_use; /* number of bytes that may be used for - delalloc/allocations */ - u64 disk_used; /* total bytes used on disk */ - u64 disk_total; /* total bytes on disk, takes mirrors into - account */ - - /* - * we bump reservation progress every time we decrement - * bytes_reserved. This way people waiting for reservations - * know something good has happened and they can check - * for progress. The number here isn't to be trusted, it - * just shows reclaim activity - */ - unsigned long reservation_progress; - - unsigned int full:1; /* indicates that we cannot allocate any more - chunks for this space */ - unsigned int chunk_alloc:1; /* set if we are allocating a chunk */ - - unsigned int flush:1; /* set if we are trying to make space */ - - unsigned int force_alloc; /* set if we need to force a chunk - alloc for this space */ - - struct list_head list; - - /* for block groups in our same type */ - struct list_head block_groups[BTRFS_NR_RAID_TYPES]; - spinlock_t lock; - struct rw_semaphore groups_sem; - wait_queue_head_t wait; -}; - -struct btrfs_block_rsv { - u64 size; - u64 reserved; - struct btrfs_space_info *space_info; - spinlock_t lock; - unsigned int full; -}; - -/* - * free clusters are used to claim free space in relatively large chunks, - * allowing us to do less seeky writes. They are used for all metadata - * allocations and data allocations in ssd mode. - */ -struct btrfs_free_cluster { - spinlock_t lock; - spinlock_t refill_lock; - struct rb_root root; - - /* largest extent in this cluster */ - u64 max_size; - - /* first extent starting offset */ - u64 window_start; - - struct btrfs_block_group_cache *block_group; - /* - * when a cluster is allocated from a block group, we put the - * cluster onto a list in the block group so that it can - * be freed before the block group is freed. 
- */ - struct list_head block_group_list; -}; - -enum btrfs_caching_type { - BTRFS_CACHE_NO = 0, - BTRFS_CACHE_STARTED = 1, - BTRFS_CACHE_FAST = 2, - BTRFS_CACHE_FINISHED = 3, -}; - -enum btrfs_disk_cache_state { - BTRFS_DC_WRITTEN = 0, - BTRFS_DC_ERROR = 1, - BTRFS_DC_CLEAR = 2, - BTRFS_DC_SETUP = 3, - BTRFS_DC_NEED_WRITE = 4, -}; - -struct btrfs_caching_control { - struct list_head list; - struct mutex mutex; - wait_queue_head_t wait; - struct btrfs_work work; - struct btrfs_block_group_cache *block_group; - u64 progress; - atomic_t count; -}; - -struct btrfs_block_group_cache { - struct btrfs_key key; - struct btrfs_block_group_item item; - struct btrfs_fs_info *fs_info; - struct inode *inode; - spinlock_t lock; - u64 pinned; - u64 reserved; - u64 bytes_super; - u64 flags; - u64 sectorsize; - u64 cache_generation; - unsigned int ro:1; - unsigned int dirty:1; - unsigned int iref:1; - - int disk_cache_state; - - /* cache tracking stuff */ - int cached; - struct btrfs_caching_control *caching_ctl; - u64 last_byte_to_unpin; - - struct btrfs_space_info *space_info; - - /* free space cache stuff */ - struct btrfs_free_space_ctl *free_space_ctl; - - /* block group cache stuff */ - struct rb_node cache_node; - - /* for block groups in the same raid type */ - struct list_head list; - - /* usage count */ - atomic_t count; - - /* List of struct btrfs_free_clusters for this block group. - * Today it will only have one thing on it, but that may change - */ - struct list_head cluster_list; -}; - -struct reloc_control; -struct btrfs_device; -struct btrfs_fs_devices; -struct btrfs_balance_control; -struct btrfs_delayed_root; -struct btrfs_fs_info { - u8 fsid[BTRFS_FSID_SIZE]; - u8 chunk_tree_uuid[BTRFS_UUID_SIZE]; - struct btrfs_root *extent_root; - struct btrfs_root *tree_root; - struct btrfs_root *chunk_root; - struct btrfs_root *dev_root; - struct btrfs_root *fs_root; - struct btrfs_root *csum_root; - - /* the log root tree is a directory of all the other log roots */ - struct btrfs_root *log_root_tree; - - spinlock_t fs_roots_radix_lock; - struct radix_tree_root fs_roots_radix; - - /* block group cache stuff */ - spinlock_t block_group_cache_lock; - struct rb_root block_group_cache_tree; - - /* keep track of unallocated space */ - spinlock_t free_chunk_lock; - u64 free_chunk_space; - - struct extent_io_tree freed_extents[2]; - struct extent_io_tree *pinned_extents; - - /* logical->physical extent mapping */ - struct btrfs_mapping_tree mapping_tree; - - /* - * block reservation for extent, checksum, root tree and - * delayed dir index item - */ - struct btrfs_block_rsv global_block_rsv; - /* block reservation for delay allocation */ - struct btrfs_block_rsv delalloc_block_rsv; - /* block reservation for metadata operations */ - struct btrfs_block_rsv trans_block_rsv; - /* block reservation for chunk tree */ - struct btrfs_block_rsv chunk_block_rsv; - /* block reservation for delayed operations */ - struct btrfs_block_rsv delayed_block_rsv; - - struct btrfs_block_rsv empty_block_rsv; - - u64 generation; - u64 last_trans_committed; - - /* - * this is updated to the current trans every time a full commit - * is required instead of the faster short fsync log commits - */ - u64 last_trans_log_full_commit; - unsigned long mount_opt; - unsigned long compress_type:4; - u64 max_inline; - u64 alloc_start; - struct btrfs_transaction *running_transaction; - wait_queue_head_t transaction_throttle; - wait_queue_head_t transaction_wait; - wait_queue_head_t transaction_blocked_wait; - wait_queue_head_t 
async_submit_wait; - - struct btrfs_super_block *super_copy; - struct btrfs_super_block *super_for_commit; - struct block_device *__bdev; - struct super_block *sb; - struct inode *btree_inode; - struct backing_dev_info bdi; - struct mutex tree_log_mutex; - struct mutex transaction_kthread_mutex; - struct mutex cleaner_mutex; - struct mutex chunk_mutex; - struct mutex volume_mutex; - /* - * this protects the ordered operations list only while we are - * processing all of the entries on it. This way we make - * sure the commit code doesn't find the list temporarily empty - * because another function happens to be doing non-waiting preflush - * before jumping into the main commit. - */ - struct mutex ordered_operations_mutex; - struct rw_semaphore extent_commit_sem; - - struct rw_semaphore cleanup_work_sem; - - struct rw_semaphore subvol_sem; - struct srcu_struct subvol_srcu; - - spinlock_t trans_lock; - /* - * the reloc mutex goes with the trans lock, it is taken - * during commit to protect us from the relocation code - */ - struct mutex reloc_mutex; - - struct list_head trans_list; - struct list_head hashers; - struct list_head dead_roots; - struct list_head caching_block_groups; - - spinlock_t delayed_iput_lock; - struct list_head delayed_iputs; - - atomic_t nr_async_submits; - atomic_t async_submit_draining; - atomic_t nr_async_bios; - atomic_t async_delalloc_pages; - atomic_t open_ioctl_trans; - - /* - * this is used by the balancing code to wait for all the pending - * ordered extents - */ - spinlock_t ordered_extent_lock; - - /* - * all of the data=ordered extents pending writeback - * these can span multiple transactions and basically include - * every dirty data page that isn't from nodatacow - */ - struct list_head ordered_extents; - - /* - * all of the inodes that have delalloc bytes. It is possible for - * this list to be empty even when there is still dirty data=ordered - * extents waiting to finish IO. - */ - struct list_head delalloc_inodes; - - /* - * special rename and truncate targets that must be on disk before - * we're allowed to commit. This is basically the ext3 style - * data=ordered list. - */ - struct list_head ordered_operations; - - /* - * there is a pool of worker threads for checksumming during writes - * and a pool for checksumming after reads. This is because readers - * can run with FS locks held, and the writers may be waiting for - * those locks. We don't want ordering in the pending list to cause - * deadlocks, and so the two are serviced separately. - * - * A third pool does submit_bio to avoid deadlocking with the other - * two - */ - struct btrfs_workers generic_worker; - struct btrfs_workers workers; - struct btrfs_workers delalloc_workers; - struct btrfs_workers endio_workers; - struct btrfs_workers endio_meta_workers; - struct btrfs_workers endio_meta_write_workers; - struct btrfs_workers endio_write_workers; - struct btrfs_workers endio_freespace_worker; - struct btrfs_workers submit_workers; - struct btrfs_workers caching_workers; - struct btrfs_workers readahead_workers; - - /* - * fixup workers take dirty pages that didn't properly go through - * the cow mechanism and make them safe to write. 
It happens - * for the sys_munmap function call path - */ - struct btrfs_workers fixup_workers; - struct btrfs_workers delayed_workers; - struct task_struct *transaction_kthread; - struct task_struct *cleaner_kthread; - int thread_pool_size; - - struct kobject super_kobj; - struct completion kobj_unregister; - int do_barriers; - int closing; - int log_root_recovering; - int enospc_unlink; - int trans_no_join; - - u64 total_pinned; - - /* protected by the delalloc lock, used to keep from writing - * metadata until there is a nice batch - */ - u64 dirty_metadata_bytes; - struct list_head dirty_cowonly_roots; - - struct btrfs_fs_devices *fs_devices; - - /* - * the space_info list is almost entirely read only. It only changes - * when we add a new raid type to the FS, and that happens - * very rarely. RCU is used to protect it. - */ - struct list_head space_info; - - struct reloc_control *reloc_ctl; - - spinlock_t delalloc_lock; - u64 delalloc_bytes; - - /* data_alloc_cluster is only used in ssd mode */ - struct btrfs_free_cluster data_alloc_cluster; - - /* all metadata allocations go through this cluster */ - struct btrfs_free_cluster meta_alloc_cluster; - - /* auto defrag inodes go here */ - spinlock_t defrag_inodes_lock; - struct rb_root defrag_inodes; - atomic_t defrag_running; - - spinlock_t ref_cache_lock; - u64 total_ref_cache_size; - - /* - * these three are in extended format (availability of single - * chunks is denoted by BTRFS_AVAIL_ALLOC_BIT_SINGLE bit, other - * types are denoted by corresponding BTRFS_BLOCK_GROUP_* bits) - */ - u64 avail_data_alloc_bits; - u64 avail_metadata_alloc_bits; - u64 avail_system_alloc_bits; - - /* restriper state */ - spinlock_t balance_lock; - struct mutex balance_mutex; - atomic_t balance_running; - atomic_t balance_pause_req; - atomic_t balance_cancel_req; - struct btrfs_balance_control *balance_ctl; - wait_queue_head_t balance_wait_q; - - unsigned data_chunk_allocations; - unsigned metadata_ratio; - - void *bdev_holder; - - /* private scrub information */ - struct mutex scrub_lock; - atomic_t scrubs_running; - atomic_t scrub_pause_req; - atomic_t scrubs_paused; - atomic_t scrub_cancel_req; - wait_queue_head_t scrub_pause_wait; - struct rw_semaphore scrub_super_lock; - int scrub_workers_refcnt; - struct btrfs_workers scrub_workers; - -#ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY - u32 check_integrity_print_mask; -#endif - - /* filesystem state */ - u64 fs_state; - - struct btrfs_delayed_root *delayed_root; - - /* readahead tree */ - spinlock_t reada_lock; - struct radix_tree_root reada_tree; - - /* next backup root to be overwritten */ - int backup_root_index; -}; - -/* - * in ram representation of the tree. extent_root is used for all allocations - * and for the extent tree extent_root root. 
- */ -struct btrfs_root { - struct extent_buffer *node; - - struct extent_buffer *commit_root; - struct btrfs_root *log_root; - struct btrfs_root *reloc_root; - - struct btrfs_root_item root_item; - struct btrfs_key root_key; - struct btrfs_fs_info *fs_info; - struct extent_io_tree dirty_log_pages; - - struct kobject root_kobj; - struct completion kobj_unregister; - struct mutex objectid_mutex; - - spinlock_t accounting_lock; - struct btrfs_block_rsv *block_rsv; - - /* free ino cache stuff */ - struct mutex fs_commit_mutex; - struct btrfs_free_space_ctl *free_ino_ctl; - enum btrfs_caching_type cached; - spinlock_t cache_lock; - wait_queue_head_t cache_wait; - struct btrfs_free_space_ctl *free_ino_pinned; - u64 cache_progress; - struct inode *cache_inode; - - struct mutex log_mutex; - wait_queue_head_t log_writer_wait; - wait_queue_head_t log_commit_wait[2]; - atomic_t log_writers; - atomic_t log_commit[2]; - unsigned long log_transid; - unsigned long last_log_commit; - unsigned long log_batch; - pid_t log_start_pid; - bool log_multiple_pids; - - u64 objectid; - u64 last_trans; - - /* data allocations are done in sectorsize units */ - u32 sectorsize; - - /* node allocations are done in nodesize units */ - u32 nodesize; - - /* leaf allocations are done in leafsize units */ - u32 leafsize; - - u32 stripesize; - - u32 type; - - u64 highest_objectid; - - /* btrfs_record_root_in_trans is a multi-step process, - * and it can race with the balancing code. But the - * race is very small, and only the first time the root - * is added to each transaction. So in_trans_setup - * is used to tell us when more checks are required - */ - unsigned long in_trans_setup; - int ref_cows; - int track_dirty; - int in_radix; - - u64 defrag_trans_start; - struct btrfs_key defrag_progress; - struct btrfs_key defrag_max; - int defrag_running; - char *name; - - /* the dirty list is only used by non-reference counted roots */ - struct list_head dirty_list; - - struct list_head root_list; - - spinlock_t orphan_lock; - struct list_head orphan_list; - struct btrfs_block_rsv *orphan_block_rsv; - int orphan_item_inserted; - int orphan_cleanup_state; - - spinlock_t inode_lock; - /* red-black tree that keeps track of in-memory inodes */ - struct rb_root inode_tree; - - /* - * radix tree that keeps track of delayed nodes of every inode, - * protected by inode_lock - */ - struct radix_tree_root delayed_nodes_tree; - /* - * right now this just gets used so that a root has its own devid - * for stat. It may be used for more later - */ - dev_t anon_dev; - - int force_cow; -}; - -struct btrfs_ioctl_defrag_range_args { - /* start of the defrag operation */ - __u64 start; - - /* number of bytes to defrag, use (u64)-1 to say all */ - __u64 len; - - /* - * flags for the operation, which can include turning - * on compression for this one defrag - */ - __u64 flags; - - /* - * any extent bigger than this will be considered - * already defragged. Use 0 to take the kernel default - * Use 1 to say every single extent must be rewritten - */ - __u32 extent_thresh; - - /* - * which compression method to use if turning on compression - * for this defrag operation. If unspecified, zlib will - * be used - */ - __u32 compress_type; - - /* spare for later */ - __u32 unused[4]; -}; - - -/* - * inode items have the data typically returned from stat and store other - * info about object characteristics. 
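The defrag range arguments above map directly onto a userspace ioctl call. A minimal sketch, assuming the struct and the BTRFS_IOC_DEFRAG_RANGE request number are picked up from the btrfs ioctl header (neither is defined at this point in the file); zero in extent_thresh takes the kernel default, as the field comment says:

#include <string.h>
#include <sys/ioctl.h>

static int defrag_whole_file(int fd)
{
	struct btrfs_ioctl_defrag_range_args args;

	memset(&args, 0, sizeof(args));
	args.start = 0;			/* start of the file */
	args.len = (__u64)-1;		/* "all", per the field comment */
	args.flags = 0;			/* no compression requested */
	args.extent_thresh = 0;		/* 0 = kernel default threshold */

	return ioctl(fd, BTRFS_IOC_DEFRAG_RANGE, &args);
}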
There is one for every file and dir in - * the FS - */ -#define BTRFS_INODE_ITEM_KEY 1 -#define BTRFS_INODE_REF_KEY 12 -#define BTRFS_XATTR_ITEM_KEY 24 -#define BTRFS_ORPHAN_ITEM_KEY 48 -/* reserve 2-15 close to the inode for later flexibility */ - -/* - * dir items are the name -> inode pointers in a directory. There is one - * for every name in a directory. - */ -#define BTRFS_DIR_LOG_ITEM_KEY 60 -#define BTRFS_DIR_LOG_INDEX_KEY 72 -#define BTRFS_DIR_ITEM_KEY 84 -#define BTRFS_DIR_INDEX_KEY 96 -/* - * extent data is for file data - */ -#define BTRFS_EXTENT_DATA_KEY 108 - -/* - * extent csums are stored in a separate tree and hold csums for - * an entire extent on disk. - */ -#define BTRFS_EXTENT_CSUM_KEY 128 - -/* - * root items point to tree roots. They are typically in the root - * tree used by the super block to find all the other trees - */ -#define BTRFS_ROOT_ITEM_KEY 132 - -/* - * root backrefs tie subvols and snapshots to the directory entries that - * reference them - */ -#define BTRFS_ROOT_BACKREF_KEY 144 - -/* - * root refs make a fast index for listing all of the snapshots and - * subvolumes referenced by a given root. They point directly to the - * directory item in the root that references the subvol - */ -#define BTRFS_ROOT_REF_KEY 156 - -/* - * extent items are in the extent map tree. These record which blocks - * are used, and how many references there are to each block - */ -#define BTRFS_EXTENT_ITEM_KEY 168 - -#define BTRFS_TREE_BLOCK_REF_KEY 176 - -#define BTRFS_EXTENT_DATA_REF_KEY 178 - -#define BTRFS_EXTENT_REF_V0_KEY 180 - -#define BTRFS_SHARED_BLOCK_REF_KEY 182 - -#define BTRFS_SHARED_DATA_REF_KEY 184 - -/* - * block groups give us hints into the extent allocation trees. Which - * blocks are free etc etc - */ -#define BTRFS_BLOCK_GROUP_ITEM_KEY 192 - -#define BTRFS_DEV_EXTENT_KEY 204 -#define BTRFS_DEV_ITEM_KEY 216 -#define BTRFS_CHUNK_ITEM_KEY 228 - -#define BTRFS_BALANCE_ITEM_KEY 248 - -/* - * string items are for debugging. They just store a short string of - * data in the FS - */ -#define BTRFS_STRING_ITEM_KEY 253 - -/* - * Flags for mount options. 
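The key type constants above are one third of a btrfs key: items within a tree are sorted by the (objectid, type, offset) triple, and the type selects which kind of item the key names. As an illustrative sketch only (btrfs_set_key_type() and btrfs_search_slot() are declared further down in this header; a NULL transaction with ins_len == 0 and cow == 0 is a read-only search):

static int lookup_inode_item(struct btrfs_root *root, u64 objectid,
			     struct btrfs_path *path)
{
	struct btrfs_key key;

	key.objectid = objectid;
	btrfs_set_key_type(&key, BTRFS_INODE_ITEM_KEY);
	key.offset = 0;

	/* 0 == found, > 0 == not found, < 0 == error */
	return btrfs_search_slot(NULL, root, &key, path, 0, 0);
}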
- * - * Note: don't forget to add new options to btrfs_show_options() - */ -#define BTRFS_MOUNT_NODATASUM (1 << 0) -#define BTRFS_MOUNT_NODATACOW (1 << 1) -#define BTRFS_MOUNT_NOBARRIER (1 << 2) -#define BTRFS_MOUNT_SSD (1 << 3) -#define BTRFS_MOUNT_DEGRADED (1 << 4) -#define BTRFS_MOUNT_COMPRESS (1 << 5) -#define BTRFS_MOUNT_NOTREELOG (1 << 6) -#define BTRFS_MOUNT_FLUSHONCOMMIT (1 << 7) -#define BTRFS_MOUNT_SSD_SPREAD (1 << 8) -#define BTRFS_MOUNT_NOSSD (1 << 9) -#define BTRFS_MOUNT_DISCARD (1 << 10) -#define BTRFS_MOUNT_FORCE_COMPRESS (1 << 11) -#define BTRFS_MOUNT_SPACE_CACHE (1 << 12) -#define BTRFS_MOUNT_CLEAR_CACHE (1 << 13) -#define BTRFS_MOUNT_USER_SUBVOL_RM_ALLOWED (1 << 14) -#define BTRFS_MOUNT_ENOSPC_DEBUG (1 << 15) -#define BTRFS_MOUNT_AUTO_DEFRAG (1 << 16) -#define BTRFS_MOUNT_INODE_MAP_CACHE (1 << 17) -#define BTRFS_MOUNT_RECOVERY (1 << 18) -#define BTRFS_MOUNT_SKIP_BALANCE (1 << 19) -#define BTRFS_MOUNT_CHECK_INTEGRITY (1 << 20) -#define BTRFS_MOUNT_CHECK_INTEGRITY_INCLUDING_EXTENT_DATA (1 << 21) -#define BTRFS_MOUNT_PANIC_ON_FATAL_ERROR (1 << 22) - -#define btrfs_clear_opt(o, opt) ((o) &= ~BTRFS_MOUNT_##opt) -#define btrfs_set_opt(o, opt) ((o) |= BTRFS_MOUNT_##opt) -#define btrfs_test_opt(root, opt) ((root)->fs_info->mount_opt & \ - BTRFS_MOUNT_##opt) -/* - * Inode flags - */ -#define BTRFS_INODE_NODATASUM (1 << 0) -#define BTRFS_INODE_NODATACOW (1 << 1) -#define BTRFS_INODE_READONLY (1 << 2) -#define BTRFS_INODE_NOCOMPRESS (1 << 3) -#define BTRFS_INODE_PREALLOC (1 << 4) -#define BTRFS_INODE_SYNC (1 << 5) -#define BTRFS_INODE_IMMUTABLE (1 << 6) -#define BTRFS_INODE_APPEND (1 << 7) -#define BTRFS_INODE_NODUMP (1 << 8) -#define BTRFS_INODE_NOATIME (1 << 9) -#define BTRFS_INODE_DIRSYNC (1 << 10) -#define BTRFS_INODE_COMPRESS (1 << 11) - -#define BTRFS_INODE_ROOT_ITEM_INIT (1 << 31) - -struct btrfs_map_token { - struct extent_buffer *eb; - char *kaddr; - unsigned long offset; -}; - -static inline void btrfs_init_map_token (struct btrfs_map_token *token) -{ - memset(token, 0, sizeof(*token)); -} - -/* some macros to generate set/get funcs for the struct fields. 
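 * (Illustrative expansion, not from the original header: the stack variant
 * invoked further down as BTRFS_SETGET_STACK_FUNCS(root_level, struct
 * btrfs_root_item, level, 8) generates btrfs_root_level() and
 * btrfs_set_root_level(), which read and write the little-endian on-disk
 * field of an in-memory copy of the item via le8_to_cpu()/cpu_to_le8().)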
This - * assumes there is a lefoo_to_cpu for every type, so lets make a simple - * one for u8: - */ -#define le8_to_cpu(v) (v) -#define cpu_to_le8(v) (v) -#define __le8 u8 - -#define read_eb_member(eb, ptr, type, member, result) ( \ - read_extent_buffer(eb, (char *)(result), \ - ((unsigned long)(ptr)) + \ - offsetof(type, member), \ - sizeof(((type *)0)->member))) - -#define write_eb_member(eb, ptr, type, member, result) ( \ - write_extent_buffer(eb, (char *)(result), \ - ((unsigned long)(ptr)) + \ - offsetof(type, member), \ - sizeof(((type *)0)->member))) - -#ifndef BTRFS_SETGET_FUNCS -#define BTRFS_SETGET_FUNCS(name, type, member, bits) \ -u##bits btrfs_##name(struct extent_buffer *eb, type *s); \ -u##bits btrfs_token_##name(struct extent_buffer *eb, type *s, struct btrfs_map_token *token); \ -void btrfs_set_token_##name(struct extent_buffer *eb, type *s, u##bits val, struct btrfs_map_token *token);\ -void btrfs_set_##name(struct extent_buffer *eb, type *s, u##bits val); -#endif - -#define BTRFS_SETGET_HEADER_FUNCS(name, type, member, bits) \ -static inline u##bits btrfs_##name(struct extent_buffer *eb) \ -{ \ - type *p = page_address(eb->pages[0]); \ - u##bits res = le##bits##_to_cpu(p->member); \ - return res; \ -} \ -static inline void btrfs_set_##name(struct extent_buffer *eb, \ - u##bits val) \ -{ \ - type *p = page_address(eb->pages[0]); \ - p->member = cpu_to_le##bits(val); \ -} - -#define BTRFS_SETGET_STACK_FUNCS(name, type, member, bits) \ -static inline u##bits btrfs_##name(type *s) \ -{ \ - return le##bits##_to_cpu(s->member); \ -} \ -static inline void btrfs_set_##name(type *s, u##bits val) \ -{ \ - s->member = cpu_to_le##bits(val); \ -} - -BTRFS_SETGET_FUNCS(device_type, struct btrfs_dev_item, type, 64); -BTRFS_SETGET_FUNCS(device_total_bytes, struct btrfs_dev_item, total_bytes, 64); -BTRFS_SETGET_FUNCS(device_bytes_used, struct btrfs_dev_item, bytes_used, 64); -BTRFS_SETGET_FUNCS(device_io_align, struct btrfs_dev_item, io_align, 32); -BTRFS_SETGET_FUNCS(device_io_width, struct btrfs_dev_item, io_width, 32); -BTRFS_SETGET_FUNCS(device_start_offset, struct btrfs_dev_item, - start_offset, 64); -BTRFS_SETGET_FUNCS(device_sector_size, struct btrfs_dev_item, sector_size, 32); -BTRFS_SETGET_FUNCS(device_id, struct btrfs_dev_item, devid, 64); -BTRFS_SETGET_FUNCS(device_group, struct btrfs_dev_item, dev_group, 32); -BTRFS_SETGET_FUNCS(device_seek_speed, struct btrfs_dev_item, seek_speed, 8); -BTRFS_SETGET_FUNCS(device_bandwidth, struct btrfs_dev_item, bandwidth, 8); -BTRFS_SETGET_FUNCS(device_generation, struct btrfs_dev_item, generation, 64); - -BTRFS_SETGET_STACK_FUNCS(stack_device_type, struct btrfs_dev_item, type, 64); -BTRFS_SETGET_STACK_FUNCS(stack_device_total_bytes, struct btrfs_dev_item, - total_bytes, 64); -BTRFS_SETGET_STACK_FUNCS(stack_device_bytes_used, struct btrfs_dev_item, - bytes_used, 64); -BTRFS_SETGET_STACK_FUNCS(stack_device_io_align, struct btrfs_dev_item, - io_align, 32); -BTRFS_SETGET_STACK_FUNCS(stack_device_io_width, struct btrfs_dev_item, - io_width, 32); -BTRFS_SETGET_STACK_FUNCS(stack_device_sector_size, struct btrfs_dev_item, - sector_size, 32); -BTRFS_SETGET_STACK_FUNCS(stack_device_id, struct btrfs_dev_item, devid, 64); -BTRFS_SETGET_STACK_FUNCS(stack_device_group, struct btrfs_dev_item, - dev_group, 32); -BTRFS_SETGET_STACK_FUNCS(stack_device_seek_speed, struct btrfs_dev_item, - seek_speed, 8); -BTRFS_SETGET_STACK_FUNCS(stack_device_bandwidth, struct btrfs_dev_item, - bandwidth, 8); -BTRFS_SETGET_STACK_FUNCS(stack_device_generation, struct 
btrfs_dev_item, - generation, 64); - -static inline char *btrfs_device_uuid(struct btrfs_dev_item *d) -{ - return (char *)d + offsetof(struct btrfs_dev_item, uuid); -} - -static inline char *btrfs_device_fsid(struct btrfs_dev_item *d) -{ - return (char *)d + offsetof(struct btrfs_dev_item, fsid); -} - -BTRFS_SETGET_FUNCS(chunk_length, struct btrfs_chunk, length, 64); -BTRFS_SETGET_FUNCS(chunk_owner, struct btrfs_chunk, owner, 64); -BTRFS_SETGET_FUNCS(chunk_stripe_len, struct btrfs_chunk, stripe_len, 64); -BTRFS_SETGET_FUNCS(chunk_io_align, struct btrfs_chunk, io_align, 32); -BTRFS_SETGET_FUNCS(chunk_io_width, struct btrfs_chunk, io_width, 32); -BTRFS_SETGET_FUNCS(chunk_sector_size, struct btrfs_chunk, sector_size, 32); -BTRFS_SETGET_FUNCS(chunk_type, struct btrfs_chunk, type, 64); -BTRFS_SETGET_FUNCS(chunk_num_stripes, struct btrfs_chunk, num_stripes, 16); -BTRFS_SETGET_FUNCS(chunk_sub_stripes, struct btrfs_chunk, sub_stripes, 16); -BTRFS_SETGET_FUNCS(stripe_devid, struct btrfs_stripe, devid, 64); -BTRFS_SETGET_FUNCS(stripe_offset, struct btrfs_stripe, offset, 64); - -static inline char *btrfs_stripe_dev_uuid(struct btrfs_stripe *s) -{ - return (char *)s + offsetof(struct btrfs_stripe, dev_uuid); -} - -BTRFS_SETGET_STACK_FUNCS(stack_chunk_length, struct btrfs_chunk, length, 64); -BTRFS_SETGET_STACK_FUNCS(stack_chunk_owner, struct btrfs_chunk, owner, 64); -BTRFS_SETGET_STACK_FUNCS(stack_chunk_stripe_len, struct btrfs_chunk, - stripe_len, 64); -BTRFS_SETGET_STACK_FUNCS(stack_chunk_io_align, struct btrfs_chunk, - io_align, 32); -BTRFS_SETGET_STACK_FUNCS(stack_chunk_io_width, struct btrfs_chunk, - io_width, 32); -BTRFS_SETGET_STACK_FUNCS(stack_chunk_sector_size, struct btrfs_chunk, - sector_size, 32); -BTRFS_SETGET_STACK_FUNCS(stack_chunk_type, struct btrfs_chunk, type, 64); -BTRFS_SETGET_STACK_FUNCS(stack_chunk_num_stripes, struct btrfs_chunk, - num_stripes, 16); -BTRFS_SETGET_STACK_FUNCS(stack_chunk_sub_stripes, struct btrfs_chunk, - sub_stripes, 16); -BTRFS_SETGET_STACK_FUNCS(stack_stripe_devid, struct btrfs_stripe, devid, 64); -BTRFS_SETGET_STACK_FUNCS(stack_stripe_offset, struct btrfs_stripe, offset, 64); - -static inline struct btrfs_stripe *btrfs_stripe_nr(struct btrfs_chunk *c, - int nr) -{ - unsigned long offset = (unsigned long)c; - offset += offsetof(struct btrfs_chunk, stripe); - offset += nr * sizeof(struct btrfs_stripe); - return (struct btrfs_stripe *)offset; -} - -static inline char *btrfs_stripe_dev_uuid_nr(struct btrfs_chunk *c, int nr) -{ - return btrfs_stripe_dev_uuid(btrfs_stripe_nr(c, nr)); -} - -static inline u64 btrfs_stripe_offset_nr(struct extent_buffer *eb, - struct btrfs_chunk *c, int nr) -{ - return btrfs_stripe_offset(eb, btrfs_stripe_nr(c, nr)); -} - -static inline u64 btrfs_stripe_devid_nr(struct extent_buffer *eb, - struct btrfs_chunk *c, int nr) -{ - return btrfs_stripe_devid(eb, btrfs_stripe_nr(c, nr)); -} - -/* struct btrfs_block_group_item */ -BTRFS_SETGET_STACK_FUNCS(block_group_used, struct btrfs_block_group_item, - used, 64); -BTRFS_SETGET_FUNCS(disk_block_group_used, struct btrfs_block_group_item, - used, 64); -BTRFS_SETGET_STACK_FUNCS(block_group_chunk_objectid, - struct btrfs_block_group_item, chunk_objectid, 64); - -BTRFS_SETGET_FUNCS(disk_block_group_chunk_objectid, - struct btrfs_block_group_item, chunk_objectid, 64); -BTRFS_SETGET_FUNCS(disk_block_group_flags, - struct btrfs_block_group_item, flags, 64); -BTRFS_SETGET_STACK_FUNCS(block_group_flags, - struct btrfs_block_group_item, flags, 64); - -/* struct btrfs_inode_ref */ 
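The chunk and stripe accessors above are enough to walk a chunk item's stripe array straight out of an extent buffer. A minimal sketch (the printk and its formatting are illustrative only):

static void print_chunk_stripes(struct extent_buffer *eb,
				struct btrfs_chunk *chunk)
{
	int num_stripes = btrfs_chunk_num_stripes(eb, chunk);
	int i;

	for (i = 0; i < num_stripes; i++) {
		u64 devid = btrfs_stripe_devid_nr(eb, chunk, i);
		u64 offset = btrfs_stripe_offset_nr(eb, chunk, i);

		printk(KERN_INFO "stripe %d: devid %llu offset %llu\n",
		       i, (unsigned long long)devid,
		       (unsigned long long)offset);
	}
}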
-BTRFS_SETGET_FUNCS(inode_ref_name_len, struct btrfs_inode_ref, name_len, 16); -BTRFS_SETGET_FUNCS(inode_ref_index, struct btrfs_inode_ref, index, 64); - -/* struct btrfs_inode_item */ -BTRFS_SETGET_FUNCS(inode_generation, struct btrfs_inode_item, generation, 64); -BTRFS_SETGET_FUNCS(inode_sequence, struct btrfs_inode_item, sequence, 64); -BTRFS_SETGET_FUNCS(inode_transid, struct btrfs_inode_item, transid, 64); -BTRFS_SETGET_FUNCS(inode_size, struct btrfs_inode_item, size, 64); -BTRFS_SETGET_FUNCS(inode_nbytes, struct btrfs_inode_item, nbytes, 64); -BTRFS_SETGET_FUNCS(inode_block_group, struct btrfs_inode_item, block_group, 64); -BTRFS_SETGET_FUNCS(inode_nlink, struct btrfs_inode_item, nlink, 32); -BTRFS_SETGET_FUNCS(inode_uid, struct btrfs_inode_item, uid, 32); -BTRFS_SETGET_FUNCS(inode_gid, struct btrfs_inode_item, gid, 32); -BTRFS_SETGET_FUNCS(inode_mode, struct btrfs_inode_item, mode, 32); -BTRFS_SETGET_FUNCS(inode_rdev, struct btrfs_inode_item, rdev, 64); -BTRFS_SETGET_FUNCS(inode_flags, struct btrfs_inode_item, flags, 64); - -static inline struct btrfs_timespec * -btrfs_inode_atime(struct btrfs_inode_item *inode_item) -{ - unsigned long ptr = (unsigned long)inode_item; - ptr += offsetof(struct btrfs_inode_item, atime); - return (struct btrfs_timespec *)ptr; -} - -static inline struct btrfs_timespec * -btrfs_inode_mtime(struct btrfs_inode_item *inode_item) -{ - unsigned long ptr = (unsigned long)inode_item; - ptr += offsetof(struct btrfs_inode_item, mtime); - return (struct btrfs_timespec *)ptr; -} - -static inline struct btrfs_timespec * -btrfs_inode_ctime(struct btrfs_inode_item *inode_item) -{ - unsigned long ptr = (unsigned long)inode_item; - ptr += offsetof(struct btrfs_inode_item, ctime); - return (struct btrfs_timespec *)ptr; -} - -BTRFS_SETGET_FUNCS(timespec_sec, struct btrfs_timespec, sec, 64); -BTRFS_SETGET_FUNCS(timespec_nsec, struct btrfs_timespec, nsec, 32); - -/* struct btrfs_dev_extent */ -BTRFS_SETGET_FUNCS(dev_extent_chunk_tree, struct btrfs_dev_extent, - chunk_tree, 64); -BTRFS_SETGET_FUNCS(dev_extent_chunk_objectid, struct btrfs_dev_extent, - chunk_objectid, 64); -BTRFS_SETGET_FUNCS(dev_extent_chunk_offset, struct btrfs_dev_extent, - chunk_offset, 64); -BTRFS_SETGET_FUNCS(dev_extent_length, struct btrfs_dev_extent, length, 64); - -static inline u8 *btrfs_dev_extent_chunk_tree_uuid(struct btrfs_dev_extent *dev) -{ - unsigned long ptr = offsetof(struct btrfs_dev_extent, chunk_tree_uuid); - return (u8 *)((unsigned long)dev + ptr); -} - -BTRFS_SETGET_FUNCS(extent_refs, struct btrfs_extent_item, refs, 64); -BTRFS_SETGET_FUNCS(extent_generation, struct btrfs_extent_item, - generation, 64); -BTRFS_SETGET_FUNCS(extent_flags, struct btrfs_extent_item, flags, 64); - -BTRFS_SETGET_FUNCS(extent_refs_v0, struct btrfs_extent_item_v0, refs, 32); - - -BTRFS_SETGET_FUNCS(tree_block_level, struct btrfs_tree_block_info, level, 8); - -static inline void btrfs_tree_block_key(struct extent_buffer *eb, - struct btrfs_tree_block_info *item, - struct btrfs_disk_key *key) -{ - read_eb_member(eb, item, struct btrfs_tree_block_info, key, key); -} - -static inline void btrfs_set_tree_block_key(struct extent_buffer *eb, - struct btrfs_tree_block_info *item, - struct btrfs_disk_key *key) -{ - write_eb_member(eb, item, struct btrfs_tree_block_info, key, key); -} - -BTRFS_SETGET_FUNCS(extent_data_ref_root, struct btrfs_extent_data_ref, - root, 64); -BTRFS_SETGET_FUNCS(extent_data_ref_objectid, struct btrfs_extent_data_ref, - objectid, 64); -BTRFS_SETGET_FUNCS(extent_data_ref_offset, struct 
btrfs_extent_data_ref, - offset, 64); -BTRFS_SETGET_FUNCS(extent_data_ref_count, struct btrfs_extent_data_ref, - count, 32); - -BTRFS_SETGET_FUNCS(shared_data_ref_count, struct btrfs_shared_data_ref, - count, 32); - -BTRFS_SETGET_FUNCS(extent_inline_ref_type, struct btrfs_extent_inline_ref, - type, 8); -BTRFS_SETGET_FUNCS(extent_inline_ref_offset, struct btrfs_extent_inline_ref, - offset, 64); - -static inline u32 btrfs_extent_inline_ref_size(int type) -{ - if (type == BTRFS_TREE_BLOCK_REF_KEY || - type == BTRFS_SHARED_BLOCK_REF_KEY) - return sizeof(struct btrfs_extent_inline_ref); - if (type == BTRFS_SHARED_DATA_REF_KEY) - return sizeof(struct btrfs_shared_data_ref) + - sizeof(struct btrfs_extent_inline_ref); - if (type == BTRFS_EXTENT_DATA_REF_KEY) - return sizeof(struct btrfs_extent_data_ref) + - offsetof(struct btrfs_extent_inline_ref, offset); - BUG(); - return 0; -} - -BTRFS_SETGET_FUNCS(ref_root_v0, struct btrfs_extent_ref_v0, root, 64); -BTRFS_SETGET_FUNCS(ref_generation_v0, struct btrfs_extent_ref_v0, - generation, 64); -BTRFS_SETGET_FUNCS(ref_objectid_v0, struct btrfs_extent_ref_v0, objectid, 64); -BTRFS_SETGET_FUNCS(ref_count_v0, struct btrfs_extent_ref_v0, count, 32); - -/* struct btrfs_node */ -BTRFS_SETGET_FUNCS(key_blockptr, struct btrfs_key_ptr, blockptr, 64); -BTRFS_SETGET_FUNCS(key_generation, struct btrfs_key_ptr, generation, 64); - -static inline u64 btrfs_node_blockptr(struct extent_buffer *eb, int nr) -{ - unsigned long ptr; - ptr = offsetof(struct btrfs_node, ptrs) + - sizeof(struct btrfs_key_ptr) * nr; - return btrfs_key_blockptr(eb, (struct btrfs_key_ptr *)ptr); -} - -static inline void btrfs_set_node_blockptr(struct extent_buffer *eb, - int nr, u64 val) -{ - unsigned long ptr; - ptr = offsetof(struct btrfs_node, ptrs) + - sizeof(struct btrfs_key_ptr) * nr; - btrfs_set_key_blockptr(eb, (struct btrfs_key_ptr *)ptr, val); -} - -static inline u64 btrfs_node_ptr_generation(struct extent_buffer *eb, int nr) -{ - unsigned long ptr; - ptr = offsetof(struct btrfs_node, ptrs) + - sizeof(struct btrfs_key_ptr) * nr; - return btrfs_key_generation(eb, (struct btrfs_key_ptr *)ptr); -} - -static inline void btrfs_set_node_ptr_generation(struct extent_buffer *eb, - int nr, u64 val) -{ - unsigned long ptr; - ptr = offsetof(struct btrfs_node, ptrs) + - sizeof(struct btrfs_key_ptr) * nr; - btrfs_set_key_generation(eb, (struct btrfs_key_ptr *)ptr, val); -} - -static inline unsigned long btrfs_node_key_ptr_offset(int nr) -{ - return offsetof(struct btrfs_node, ptrs) + - sizeof(struct btrfs_key_ptr) * nr; -} - -void btrfs_node_key(struct extent_buffer *eb, - struct btrfs_disk_key *disk_key, int nr); - -static inline void btrfs_set_node_key(struct extent_buffer *eb, - struct btrfs_disk_key *disk_key, int nr) -{ - unsigned long ptr; - ptr = btrfs_node_key_ptr_offset(nr); - write_eb_member(eb, (struct btrfs_key_ptr *)ptr, - struct btrfs_key_ptr, key, disk_key); -} - -/* struct btrfs_item */ -BTRFS_SETGET_FUNCS(item_offset, struct btrfs_item, offset, 32); -BTRFS_SETGET_FUNCS(item_size, struct btrfs_item, size, 32); - -static inline unsigned long btrfs_item_nr_offset(int nr) -{ - return offsetof(struct btrfs_leaf, items) + - sizeof(struct btrfs_item) * nr; -} - -static inline struct btrfs_item *btrfs_item_nr(struct extent_buffer *eb, - int nr) -{ - return (struct btrfs_item *)btrfs_item_nr_offset(nr); -} - -static inline u32 btrfs_item_end(struct extent_buffer *eb, - struct btrfs_item *item) -{ - return btrfs_item_offset(eb, item) + btrfs_item_size(eb, item); -} - -static inline u32 
btrfs_item_end_nr(struct extent_buffer *eb, int nr) -{ - return btrfs_item_end(eb, btrfs_item_nr(eb, nr)); -} - -static inline u32 btrfs_item_offset_nr(struct extent_buffer *eb, int nr) -{ - return btrfs_item_offset(eb, btrfs_item_nr(eb, nr)); -} - -static inline u32 btrfs_item_size_nr(struct extent_buffer *eb, int nr) -{ - return btrfs_item_size(eb, btrfs_item_nr(eb, nr)); -} - -static inline void btrfs_item_key(struct extent_buffer *eb, - struct btrfs_disk_key *disk_key, int nr) -{ - struct btrfs_item *item = btrfs_item_nr(eb, nr); - read_eb_member(eb, item, struct btrfs_item, key, disk_key); -} - -static inline void btrfs_set_item_key(struct extent_buffer *eb, - struct btrfs_disk_key *disk_key, int nr) -{ - struct btrfs_item *item = btrfs_item_nr(eb, nr); - write_eb_member(eb, item, struct btrfs_item, key, disk_key); -} - -BTRFS_SETGET_FUNCS(dir_log_end, struct btrfs_dir_log_item, end, 64); - -/* - * struct btrfs_root_ref - */ -BTRFS_SETGET_FUNCS(root_ref_dirid, struct btrfs_root_ref, dirid, 64); -BTRFS_SETGET_FUNCS(root_ref_sequence, struct btrfs_root_ref, sequence, 64); -BTRFS_SETGET_FUNCS(root_ref_name_len, struct btrfs_root_ref, name_len, 16); - -/* struct btrfs_dir_item */ -BTRFS_SETGET_FUNCS(dir_data_len, struct btrfs_dir_item, data_len, 16); -BTRFS_SETGET_FUNCS(dir_type, struct btrfs_dir_item, type, 8); -BTRFS_SETGET_FUNCS(dir_name_len, struct btrfs_dir_item, name_len, 16); -BTRFS_SETGET_FUNCS(dir_transid, struct btrfs_dir_item, transid, 64); - -static inline void btrfs_dir_item_key(struct extent_buffer *eb, - struct btrfs_dir_item *item, - struct btrfs_disk_key *key) -{ - read_eb_member(eb, item, struct btrfs_dir_item, location, key); -} - -static inline void btrfs_set_dir_item_key(struct extent_buffer *eb, - struct btrfs_dir_item *item, - struct btrfs_disk_key *key) -{ - write_eb_member(eb, item, struct btrfs_dir_item, location, key); -} - -BTRFS_SETGET_FUNCS(free_space_entries, struct btrfs_free_space_header, - num_entries, 64); -BTRFS_SETGET_FUNCS(free_space_bitmaps, struct btrfs_free_space_header, - num_bitmaps, 64); -BTRFS_SETGET_FUNCS(free_space_generation, struct btrfs_free_space_header, - generation, 64); - -static inline void btrfs_free_space_key(struct extent_buffer *eb, - struct btrfs_free_space_header *h, - struct btrfs_disk_key *key) -{ - read_eb_member(eb, h, struct btrfs_free_space_header, location, key); -} - -static inline void btrfs_set_free_space_key(struct extent_buffer *eb, - struct btrfs_free_space_header *h, - struct btrfs_disk_key *key) -{ - write_eb_member(eb, h, struct btrfs_free_space_header, location, key); -} - -/* struct btrfs_disk_key */ -BTRFS_SETGET_STACK_FUNCS(disk_key_objectid, struct btrfs_disk_key, - objectid, 64); -BTRFS_SETGET_STACK_FUNCS(disk_key_offset, struct btrfs_disk_key, offset, 64); -BTRFS_SETGET_STACK_FUNCS(disk_key_type, struct btrfs_disk_key, type, 8); - -static inline void btrfs_disk_key_to_cpu(struct btrfs_key *cpu, - struct btrfs_disk_key *disk) -{ - cpu->offset = le64_to_cpu(disk->offset); - cpu->type = disk->type; - cpu->objectid = le64_to_cpu(disk->objectid); -} - -static inline void btrfs_cpu_key_to_disk(struct btrfs_disk_key *disk, - struct btrfs_key *cpu) -{ - disk->offset = cpu_to_le64(cpu->offset); - disk->type = cpu->type; - disk->objectid = cpu_to_le64(cpu->objectid); -} - -static inline void btrfs_node_key_to_cpu(struct extent_buffer *eb, - struct btrfs_key *key, int nr) -{ - struct btrfs_disk_key disk_key; - btrfs_node_key(eb, &disk_key, nr); - btrfs_disk_key_to_cpu(key, &disk_key); -} - -static inline void 
btrfs_item_key_to_cpu(struct extent_buffer *eb, - struct btrfs_key *key, int nr) -{ - struct btrfs_disk_key disk_key; - btrfs_item_key(eb, &disk_key, nr); - btrfs_disk_key_to_cpu(key, &disk_key); -} - -static inline void btrfs_dir_item_key_to_cpu(struct extent_buffer *eb, - struct btrfs_dir_item *item, - struct btrfs_key *key) -{ - struct btrfs_disk_key disk_key; - btrfs_dir_item_key(eb, item, &disk_key); - btrfs_disk_key_to_cpu(key, &disk_key); -} - - -static inline u8 btrfs_key_type(struct btrfs_key *key) -{ - return key->type; -} - -static inline void btrfs_set_key_type(struct btrfs_key *key, u8 val) -{ - key->type = val; -} - -/* struct btrfs_header */ -BTRFS_SETGET_HEADER_FUNCS(header_bytenr, struct btrfs_header, bytenr, 64); -BTRFS_SETGET_HEADER_FUNCS(header_generation, struct btrfs_header, - generation, 64); -BTRFS_SETGET_HEADER_FUNCS(header_owner, struct btrfs_header, owner, 64); -BTRFS_SETGET_HEADER_FUNCS(header_nritems, struct btrfs_header, nritems, 32); -BTRFS_SETGET_HEADER_FUNCS(header_flags, struct btrfs_header, flags, 64); -BTRFS_SETGET_HEADER_FUNCS(header_level, struct btrfs_header, level, 8); - -static inline int btrfs_header_flag(struct extent_buffer *eb, u64 flag) -{ - return (btrfs_header_flags(eb) & flag) == flag; -} - -static inline int btrfs_set_header_flag(struct extent_buffer *eb, u64 flag) -{ - u64 flags = btrfs_header_flags(eb); - btrfs_set_header_flags(eb, flags | flag); - return (flags & flag) == flag; -} - -static inline int btrfs_clear_header_flag(struct extent_buffer *eb, u64 flag) -{ - u64 flags = btrfs_header_flags(eb); - btrfs_set_header_flags(eb, flags & ~flag); - return (flags & flag) == flag; -} - -static inline int btrfs_header_backref_rev(struct extent_buffer *eb) -{ - u64 flags = btrfs_header_flags(eb); - return flags >> BTRFS_BACKREF_REV_SHIFT; -} - -static inline void btrfs_set_header_backref_rev(struct extent_buffer *eb, - int rev) -{ - u64 flags = btrfs_header_flags(eb); - flags &= ~BTRFS_BACKREF_REV_MASK; - flags |= (u64)rev << BTRFS_BACKREF_REV_SHIFT; - btrfs_set_header_flags(eb, flags); -} - -static inline u8 *btrfs_header_fsid(struct extent_buffer *eb) -{ - unsigned long ptr = offsetof(struct btrfs_header, fsid); - return (u8 *)ptr; -} - -static inline u8 *btrfs_header_chunk_tree_uuid(struct extent_buffer *eb) -{ - unsigned long ptr = offsetof(struct btrfs_header, chunk_tree_uuid); - return (u8 *)ptr; -} - -static inline int btrfs_is_leaf(struct extent_buffer *eb) -{ - return btrfs_header_level(eb) == 0; -} - -/* struct btrfs_root_item */ -BTRFS_SETGET_FUNCS(disk_root_generation, struct btrfs_root_item, - generation, 64); -BTRFS_SETGET_FUNCS(disk_root_refs, struct btrfs_root_item, refs, 32); -BTRFS_SETGET_FUNCS(disk_root_bytenr, struct btrfs_root_item, bytenr, 64); -BTRFS_SETGET_FUNCS(disk_root_level, struct btrfs_root_item, level, 8); - -BTRFS_SETGET_STACK_FUNCS(root_generation, struct btrfs_root_item, - generation, 64); -BTRFS_SETGET_STACK_FUNCS(root_bytenr, struct btrfs_root_item, bytenr, 64); -BTRFS_SETGET_STACK_FUNCS(root_level, struct btrfs_root_item, level, 8); -BTRFS_SETGET_STACK_FUNCS(root_dirid, struct btrfs_root_item, root_dirid, 64); -BTRFS_SETGET_STACK_FUNCS(root_refs, struct btrfs_root_item, refs, 32); -BTRFS_SETGET_STACK_FUNCS(root_flags, struct btrfs_root_item, flags, 64); -BTRFS_SETGET_STACK_FUNCS(root_used, struct btrfs_root_item, bytes_used, 64); -BTRFS_SETGET_STACK_FUNCS(root_limit, struct btrfs_root_item, byte_limit, 64); -BTRFS_SETGET_STACK_FUNCS(root_last_snapshot, struct btrfs_root_item, - last_snapshot, 64); - -static 
inline bool btrfs_root_readonly(struct btrfs_root *root) -{ - return (root->root_item.flags & cpu_to_le64(BTRFS_ROOT_SUBVOL_RDONLY)) != 0; -} - -/* struct btrfs_root_backup */ -BTRFS_SETGET_STACK_FUNCS(backup_tree_root, struct btrfs_root_backup, - tree_root, 64); -BTRFS_SETGET_STACK_FUNCS(backup_tree_root_gen, struct btrfs_root_backup, - tree_root_gen, 64); -BTRFS_SETGET_STACK_FUNCS(backup_tree_root_level, struct btrfs_root_backup, - tree_root_level, 8); - -BTRFS_SETGET_STACK_FUNCS(backup_chunk_root, struct btrfs_root_backup, - chunk_root, 64); -BTRFS_SETGET_STACK_FUNCS(backup_chunk_root_gen, struct btrfs_root_backup, - chunk_root_gen, 64); -BTRFS_SETGET_STACK_FUNCS(backup_chunk_root_level, struct btrfs_root_backup, - chunk_root_level, 8); - -BTRFS_SETGET_STACK_FUNCS(backup_extent_root, struct btrfs_root_backup, - extent_root, 64); -BTRFS_SETGET_STACK_FUNCS(backup_extent_root_gen, struct btrfs_root_backup, - extent_root_gen, 64); -BTRFS_SETGET_STACK_FUNCS(backup_extent_root_level, struct btrfs_root_backup, - extent_root_level, 8); - -BTRFS_SETGET_STACK_FUNCS(backup_fs_root, struct btrfs_root_backup, - fs_root, 64); -BTRFS_SETGET_STACK_FUNCS(backup_fs_root_gen, struct btrfs_root_backup, - fs_root_gen, 64); -BTRFS_SETGET_STACK_FUNCS(backup_fs_root_level, struct btrfs_root_backup, - fs_root_level, 8); - -BTRFS_SETGET_STACK_FUNCS(backup_dev_root, struct btrfs_root_backup, - dev_root, 64); -BTRFS_SETGET_STACK_FUNCS(backup_dev_root_gen, struct btrfs_root_backup, - dev_root_gen, 64); -BTRFS_SETGET_STACK_FUNCS(backup_dev_root_level, struct btrfs_root_backup, - dev_root_level, 8); - -BTRFS_SETGET_STACK_FUNCS(backup_csum_root, struct btrfs_root_backup, - csum_root, 64); -BTRFS_SETGET_STACK_FUNCS(backup_csum_root_gen, struct btrfs_root_backup, - csum_root_gen, 64); -BTRFS_SETGET_STACK_FUNCS(backup_csum_root_level, struct btrfs_root_backup, - csum_root_level, 8); -BTRFS_SETGET_STACK_FUNCS(backup_total_bytes, struct btrfs_root_backup, - total_bytes, 64); -BTRFS_SETGET_STACK_FUNCS(backup_bytes_used, struct btrfs_root_backup, - bytes_used, 64); -BTRFS_SETGET_STACK_FUNCS(backup_num_devices, struct btrfs_root_backup, - num_devices, 64); - -/* struct btrfs_balance_item */ -BTRFS_SETGET_FUNCS(balance_flags, struct btrfs_balance_item, flags, 64); - -static inline void btrfs_balance_data(struct extent_buffer *eb, - struct btrfs_balance_item *bi, - struct btrfs_disk_balance_args *ba) -{ - read_eb_member(eb, bi, struct btrfs_balance_item, data, ba); -} - -static inline void btrfs_set_balance_data(struct extent_buffer *eb, - struct btrfs_balance_item *bi, - struct btrfs_disk_balance_args *ba) -{ - write_eb_member(eb, bi, struct btrfs_balance_item, data, ba); -} - -static inline void btrfs_balance_meta(struct extent_buffer *eb, - struct btrfs_balance_item *bi, - struct btrfs_disk_balance_args *ba) -{ - read_eb_member(eb, bi, struct btrfs_balance_item, meta, ba); -} - -static inline void btrfs_set_balance_meta(struct extent_buffer *eb, - struct btrfs_balance_item *bi, - struct btrfs_disk_balance_args *ba) -{ - write_eb_member(eb, bi, struct btrfs_balance_item, meta, ba); -} - -static inline void btrfs_balance_sys(struct extent_buffer *eb, - struct btrfs_balance_item *bi, - struct btrfs_disk_balance_args *ba) -{ - read_eb_member(eb, bi, struct btrfs_balance_item, sys, ba); -} - -static inline void btrfs_set_balance_sys(struct extent_buffer *eb, - struct btrfs_balance_item *bi, - struct btrfs_disk_balance_args *ba) -{ - write_eb_member(eb, bi, struct btrfs_balance_item, sys, ba); -} - -static inline void 
-btrfs_disk_balance_args_to_cpu(struct btrfs_balance_args *cpu, - struct btrfs_disk_balance_args *disk) -{ - memset(cpu, 0, sizeof(*cpu)); - - cpu->profiles = le64_to_cpu(disk->profiles); - cpu->usage = le64_to_cpu(disk->usage); - cpu->devid = le64_to_cpu(disk->devid); - cpu->pstart = le64_to_cpu(disk->pstart); - cpu->pend = le64_to_cpu(disk->pend); - cpu->vstart = le64_to_cpu(disk->vstart); - cpu->vend = le64_to_cpu(disk->vend); - cpu->target = le64_to_cpu(disk->target); - cpu->flags = le64_to_cpu(disk->flags); -} - -static inline void -btrfs_cpu_balance_args_to_disk(struct btrfs_disk_balance_args *disk, - struct btrfs_balance_args *cpu) -{ - memset(disk, 0, sizeof(*disk)); - - disk->profiles = cpu_to_le64(cpu->profiles); - disk->usage = cpu_to_le64(cpu->usage); - disk->devid = cpu_to_le64(cpu->devid); - disk->pstart = cpu_to_le64(cpu->pstart); - disk->pend = cpu_to_le64(cpu->pend); - disk->vstart = cpu_to_le64(cpu->vstart); - disk->vend = cpu_to_le64(cpu->vend); - disk->target = cpu_to_le64(cpu->target); - disk->flags = cpu_to_le64(cpu->flags); -} - -/* struct btrfs_super_block */ -BTRFS_SETGET_STACK_FUNCS(super_bytenr, struct btrfs_super_block, bytenr, 64); -BTRFS_SETGET_STACK_FUNCS(super_flags, struct btrfs_super_block, flags, 64); -BTRFS_SETGET_STACK_FUNCS(super_generation, struct btrfs_super_block, - generation, 64); -BTRFS_SETGET_STACK_FUNCS(super_root, struct btrfs_super_block, root, 64); -BTRFS_SETGET_STACK_FUNCS(super_sys_array_size, - struct btrfs_super_block, sys_chunk_array_size, 32); -BTRFS_SETGET_STACK_FUNCS(super_chunk_root_generation, - struct btrfs_super_block, chunk_root_generation, 64); -BTRFS_SETGET_STACK_FUNCS(super_root_level, struct btrfs_super_block, - root_level, 8); -BTRFS_SETGET_STACK_FUNCS(super_chunk_root, struct btrfs_super_block, - chunk_root, 64); -BTRFS_SETGET_STACK_FUNCS(super_chunk_root_level, struct btrfs_super_block, - chunk_root_level, 8); -BTRFS_SETGET_STACK_FUNCS(super_log_root, struct btrfs_super_block, - log_root, 64); -BTRFS_SETGET_STACK_FUNCS(super_log_root_transid, struct btrfs_super_block, - log_root_transid, 64); -BTRFS_SETGET_STACK_FUNCS(super_log_root_level, struct btrfs_super_block, - log_root_level, 8); -BTRFS_SETGET_STACK_FUNCS(super_total_bytes, struct btrfs_super_block, - total_bytes, 64); -BTRFS_SETGET_STACK_FUNCS(super_bytes_used, struct btrfs_super_block, - bytes_used, 64); -BTRFS_SETGET_STACK_FUNCS(super_sectorsize, struct btrfs_super_block, - sectorsize, 32); -BTRFS_SETGET_STACK_FUNCS(super_nodesize, struct btrfs_super_block, - nodesize, 32); -BTRFS_SETGET_STACK_FUNCS(super_leafsize, struct btrfs_super_block, - leafsize, 32); -BTRFS_SETGET_STACK_FUNCS(super_stripesize, struct btrfs_super_block, - stripesize, 32); -BTRFS_SETGET_STACK_FUNCS(super_root_dir, struct btrfs_super_block, - root_dir_objectid, 64); -BTRFS_SETGET_STACK_FUNCS(super_num_devices, struct btrfs_super_block, - num_devices, 64); -BTRFS_SETGET_STACK_FUNCS(super_compat_flags, struct btrfs_super_block, - compat_flags, 64); -BTRFS_SETGET_STACK_FUNCS(super_compat_ro_flags, struct btrfs_super_block, - compat_ro_flags, 64); -BTRFS_SETGET_STACK_FUNCS(super_incompat_flags, struct btrfs_super_block, - incompat_flags, 64); -BTRFS_SETGET_STACK_FUNCS(super_csum_type, struct btrfs_super_block, - csum_type, 16); -BTRFS_SETGET_STACK_FUNCS(super_cache_generation, struct btrfs_super_block, - cache_generation, 64); - -static inline int btrfs_super_csum_size(struct btrfs_super_block *s) -{ - int t = btrfs_super_csum_type(s); - BUG_ON(t >= ARRAY_SIZE(btrfs_csum_sizes)); - return 
btrfs_csum_sizes[t]; -} - -static inline unsigned long btrfs_leaf_data(struct extent_buffer *l) -{ - return offsetof(struct btrfs_leaf, items); -} - -/* struct btrfs_file_extent_item */ -BTRFS_SETGET_FUNCS(file_extent_type, struct btrfs_file_extent_item, type, 8); - -static inline unsigned long -btrfs_file_extent_inline_start(struct btrfs_file_extent_item *e) -{ - unsigned long offset = (unsigned long)e; - offset += offsetof(struct btrfs_file_extent_item, disk_bytenr); - return offset; -} - -static inline u32 btrfs_file_extent_calc_inline_size(u32 datasize) -{ - return offsetof(struct btrfs_file_extent_item, disk_bytenr) + datasize; -} - -BTRFS_SETGET_FUNCS(file_extent_disk_bytenr, struct btrfs_file_extent_item, - disk_bytenr, 64); -BTRFS_SETGET_FUNCS(file_extent_generation, struct btrfs_file_extent_item, - generation, 64); -BTRFS_SETGET_FUNCS(file_extent_disk_num_bytes, struct btrfs_file_extent_item, - disk_num_bytes, 64); -BTRFS_SETGET_FUNCS(file_extent_offset, struct btrfs_file_extent_item, - offset, 64); -BTRFS_SETGET_FUNCS(file_extent_num_bytes, struct btrfs_file_extent_item, - num_bytes, 64); -BTRFS_SETGET_FUNCS(file_extent_ram_bytes, struct btrfs_file_extent_item, - ram_bytes, 64); -BTRFS_SETGET_FUNCS(file_extent_compression, struct btrfs_file_extent_item, - compression, 8); -BTRFS_SETGET_FUNCS(file_extent_encryption, struct btrfs_file_extent_item, - encryption, 8); -BTRFS_SETGET_FUNCS(file_extent_other_encoding, struct btrfs_file_extent_item, - other_encoding, 16); - -/* this returns the number of file bytes represented by the inline item. - * If an item is compressed, this is the uncompressed size - */ -static inline u32 btrfs_file_extent_inline_len(struct extent_buffer *eb, - struct btrfs_file_extent_item *e) -{ - return btrfs_file_extent_ram_bytes(eb, e); -} - -/* - * this returns the number of bytes used by the item on disk, minus the - * size of any extent headers. If a file is compressed on disk, this is - * the compressed size - */ -static inline u32 btrfs_file_extent_inline_item_len(struct extent_buffer *eb, - struct btrfs_item *e) -{ - unsigned long offset; - offset = offsetof(struct btrfs_file_extent_item, disk_bytenr); - return btrfs_item_size(eb, e) - offset; -} - -static inline struct btrfs_fs_info *btrfs_sb(struct super_block *sb) -{ - return sb->s_fs_info; -} - -static inline u32 btrfs_level_size(struct btrfs_root *root, int level) -{ - if (level == 0) - return root->leafsize; - return root->nodesize; -} - -/* helper function to cast into the data area of the leaf. */ -#define btrfs_item_ptr(leaf, slot, type) \ - ((type *)(btrfs_leaf_data(leaf) + \ - btrfs_item_offset_nr(leaf, slot))) - -#define btrfs_item_ptr_offset(leaf, slot) \ - ((unsigned long)(btrfs_leaf_data(leaf) + \ - btrfs_item_offset_nr(leaf, slot))) - -static inline struct dentry *fdentry(struct file *file) -{ - return file->f_path.dentry; -} - -static inline bool btrfs_mixed_space_info(struct btrfs_space_info *space_info) -{ - return ((space_info->flags & BTRFS_BLOCK_GROUP_METADATA) && - (space_info->flags & BTRFS_BLOCK_GROUP_DATA)); -} - -static inline gfp_t btrfs_alloc_write_mask(struct address_space *mapping) -{ - return mapping_gfp_mask(mapping) & ~__GFP_FS; -} - -/* extent-tree.c */ -static inline u64 btrfs_calc_trans_metadata_size(struct btrfs_root *root, - unsigned num_items) -{ - return (root->leafsize + root->nodesize * (BTRFS_MAX_LEVEL - 1)) * - 3 * num_items; -} - -/* - * Doing a truncate won't result in new nodes or leaves, just what we need for - * COW. 
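 * As a worked example (assuming BTRFS_MAX_LEVEL is 8, as defined earlier in
 * this header): with 4K leaves and nodes, btrfs_calc_trans_metadata_size()
 * above reserves (4096 + 4096 * 7) * 3 = 96K per item, while the truncate
 * variant below reserves a third of that, 32K per item.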
- */ -static inline u64 btrfs_calc_trunc_metadata_size(struct btrfs_root *root, - unsigned num_items) -{ - return (root->leafsize + root->nodesize * (BTRFS_MAX_LEVEL - 1)) * - num_items; -} - -void btrfs_put_block_group(struct btrfs_block_group_cache *cache); -int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans, - struct btrfs_root *root, unsigned long count); -int btrfs_lookup_extent(struct btrfs_root *root, u64 start, u64 len); -int btrfs_lookup_extent_info(struct btrfs_trans_handle *trans, - struct btrfs_root *root, u64 bytenr, - u64 num_bytes, u64 *refs, u64 *flags); -int btrfs_pin_extent(struct btrfs_root *root, - u64 bytenr, u64 num, int reserved); -int btrfs_pin_extent_for_log_replay(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - u64 bytenr, u64 num_bytes); -int btrfs_cross_ref_exist(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - u64 objectid, u64 offset, u64 bytenr); -struct btrfs_block_group_cache *btrfs_lookup_block_group( - struct btrfs_fs_info *info, - u64 bytenr); -void btrfs_put_block_group(struct btrfs_block_group_cache *cache); -u64 btrfs_find_block_group(struct btrfs_root *root, - u64 search_start, u64 search_hint, int owner); -struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans, - struct btrfs_root *root, u32 blocksize, - u64 parent, u64 root_objectid, - struct btrfs_disk_key *key, int level, - u64 hint, u64 empty_size, int for_cow); -void btrfs_free_tree_block(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct extent_buffer *buf, - u64 parent, int last_ref, int for_cow); -struct extent_buffer *btrfs_init_new_buffer(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - u64 bytenr, u32 blocksize, - int level); -int btrfs_alloc_reserved_file_extent(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - u64 root_objectid, u64 owner, - u64 offset, struct btrfs_key *ins); -int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - u64 root_objectid, u64 owner, u64 offset, - struct btrfs_key *ins); -int btrfs_reserve_extent(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - u64 num_bytes, u64 min_alloc_size, - u64 empty_size, u64 hint_byte, - struct btrfs_key *ins, u64 data); -int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, - struct extent_buffer *buf, int full_backref, int for_cow); -int btrfs_dec_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, - struct extent_buffer *buf, int full_backref, int for_cow); -int btrfs_set_disk_extent_flags(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - u64 bytenr, u64 num_bytes, u64 flags, - int is_data); -int btrfs_free_extent(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - u64 bytenr, u64 num_bytes, u64 parent, u64 root_objectid, - u64 owner, u64 offset, int for_cow); - -int btrfs_free_reserved_extent(struct btrfs_root *root, u64 start, u64 len); -int btrfs_free_and_pin_reserved_extent(struct btrfs_root *root, - u64 start, u64 len); -void btrfs_prepare_extent_commit(struct btrfs_trans_handle *trans, - struct btrfs_root *root); -int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans, - struct btrfs_root *root); -int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - u64 bytenr, u64 num_bytes, u64 parent, - u64 root_objectid, u64 owner, u64 offset, int for_cow); - -int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans, - struct btrfs_root *root); -int 
btrfs_extent_readonly(struct btrfs_root *root, u64 bytenr); -int btrfs_free_block_groups(struct btrfs_fs_info *info); -int btrfs_read_block_groups(struct btrfs_root *root); -int btrfs_can_relocate(struct btrfs_root *root, u64 bytenr); -int btrfs_make_block_group(struct btrfs_trans_handle *trans, - struct btrfs_root *root, u64 bytes_used, - u64 type, u64 chunk_objectid, u64 chunk_offset, - u64 size); -int btrfs_remove_block_group(struct btrfs_trans_handle *trans, - struct btrfs_root *root, u64 group_start); -u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags); -u64 btrfs_get_alloc_profile(struct btrfs_root *root, int data); -void btrfs_set_inode_space_info(struct btrfs_root *root, struct inode *ionde); -void btrfs_clear_space_info_full(struct btrfs_fs_info *info); -int btrfs_check_data_free_space(struct inode *inode, u64 bytes); -void btrfs_free_reserved_data_space(struct inode *inode, u64 bytes); -void btrfs_trans_release_metadata(struct btrfs_trans_handle *trans, - struct btrfs_root *root); -int btrfs_orphan_reserve_metadata(struct btrfs_trans_handle *trans, - struct inode *inode); -void btrfs_orphan_release_metadata(struct inode *inode); -int btrfs_snap_reserve_metadata(struct btrfs_trans_handle *trans, - struct btrfs_pending_snapshot *pending); -int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes); -void btrfs_delalloc_release_metadata(struct inode *inode, u64 num_bytes); -int btrfs_delalloc_reserve_space(struct inode *inode, u64 num_bytes); -void btrfs_delalloc_release_space(struct inode *inode, u64 num_bytes); -void btrfs_init_block_rsv(struct btrfs_block_rsv *rsv); -struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_root *root); -void btrfs_free_block_rsv(struct btrfs_root *root, - struct btrfs_block_rsv *rsv); -int btrfs_block_rsv_add(struct btrfs_root *root, - struct btrfs_block_rsv *block_rsv, - u64 num_bytes); -int btrfs_block_rsv_add_noflush(struct btrfs_root *root, - struct btrfs_block_rsv *block_rsv, - u64 num_bytes); -int btrfs_block_rsv_check(struct btrfs_root *root, - struct btrfs_block_rsv *block_rsv, int min_factor); -int btrfs_block_rsv_refill(struct btrfs_root *root, - struct btrfs_block_rsv *block_rsv, - u64 min_reserved); -int btrfs_block_rsv_refill_noflush(struct btrfs_root *root, - struct btrfs_block_rsv *block_rsv, - u64 min_reserved); -int btrfs_block_rsv_migrate(struct btrfs_block_rsv *src_rsv, - struct btrfs_block_rsv *dst_rsv, - u64 num_bytes); -void btrfs_block_rsv_release(struct btrfs_root *root, - struct btrfs_block_rsv *block_rsv, - u64 num_bytes); -int btrfs_set_block_group_ro(struct btrfs_root *root, - struct btrfs_block_group_cache *cache); -void btrfs_set_block_group_rw(struct btrfs_root *root, - struct btrfs_block_group_cache *cache); -void btrfs_put_block_group_cache(struct btrfs_fs_info *info); -u64 btrfs_account_ro_block_groups_free_space(struct btrfs_space_info *sinfo); -int btrfs_error_unpin_extent_range(struct btrfs_root *root, - u64 start, u64 end); -int btrfs_error_discard_extent(struct btrfs_root *root, u64 bytenr, - u64 num_bytes, u64 *actual_bytes); -int btrfs_force_chunk_alloc(struct btrfs_trans_handle *trans, - struct btrfs_root *root, u64 type); -int btrfs_trim_fs(struct btrfs_root *root, struct fstrim_range *range); - -int btrfs_init_space_info(struct btrfs_fs_info *fs_info); -/* ctree.c */ -int btrfs_bin_search(struct extent_buffer *eb, struct btrfs_key *key, - int level, int *slot); -int btrfs_comp_cpu_keys(struct btrfs_key *k1, struct btrfs_key *k2); -int btrfs_previous_item(struct btrfs_root 
*root, - struct btrfs_path *path, u64 min_objectid, - int type); -void btrfs_set_item_key_safe(struct btrfs_trans_handle *trans, - struct btrfs_root *root, struct btrfs_path *path, - struct btrfs_key *new_key); -struct extent_buffer *btrfs_root_node(struct btrfs_root *root); -struct extent_buffer *btrfs_lock_root_node(struct btrfs_root *root); -int btrfs_find_next_key(struct btrfs_root *root, struct btrfs_path *path, - struct btrfs_key *key, int lowest_level, - int cache_only, u64 min_trans); -int btrfs_search_forward(struct btrfs_root *root, struct btrfs_key *min_key, - struct btrfs_key *max_key, - struct btrfs_path *path, int cache_only, - u64 min_trans); -int btrfs_cow_block(struct btrfs_trans_handle *trans, - struct btrfs_root *root, struct extent_buffer *buf, - struct extent_buffer *parent, int parent_slot, - struct extent_buffer **cow_ret); -int btrfs_copy_root(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct extent_buffer *buf, - struct extent_buffer **cow_ret, u64 new_root_objectid); -int btrfs_block_can_be_shared(struct btrfs_root *root, - struct extent_buffer *buf); -void btrfs_extend_item(struct btrfs_trans_handle *trans, - struct btrfs_root *root, struct btrfs_path *path, - u32 data_size); -void btrfs_truncate_item(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct btrfs_path *path, - u32 new_size, int from_end); -int btrfs_split_item(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct btrfs_path *path, - struct btrfs_key *new_key, - unsigned long split_offset); -int btrfs_duplicate_item(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct btrfs_path *path, - struct btrfs_key *new_key); -int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root - *root, struct btrfs_key *key, struct btrfs_path *p, int - ins_len, int cow); -int btrfs_realloc_node(struct btrfs_trans_handle *trans, - struct btrfs_root *root, struct extent_buffer *parent, - int start_slot, int cache_only, u64 *last_ret, - struct btrfs_key *progress); -void btrfs_release_path(struct btrfs_path *p); -struct btrfs_path *btrfs_alloc_path(void); -void btrfs_free_path(struct btrfs_path *p); -void btrfs_set_path_blocking(struct btrfs_path *p); -void btrfs_clear_path_blocking(struct btrfs_path *p, - struct extent_buffer *held, int held_rw); -void btrfs_unlock_up_safe(struct btrfs_path *p, int level); - -int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root, - struct btrfs_path *path, int slot, int nr); -static inline int btrfs_del_item(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct btrfs_path *path) -{ - return btrfs_del_items(trans, root, path, path->slots[0], 1); -} - -void setup_items_for_insert(struct btrfs_trans_handle *trans, - struct btrfs_root *root, struct btrfs_path *path, - struct btrfs_key *cpu_key, u32 *data_size, - u32 total_data, u32 total_size, int nr); -int btrfs_insert_item(struct btrfs_trans_handle *trans, struct btrfs_root - *root, struct btrfs_key *key, void *data, u32 data_size); -int btrfs_insert_empty_items(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct btrfs_path *path, - struct btrfs_key *cpu_key, u32 *data_size, int nr); - -static inline int btrfs_insert_empty_item(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct btrfs_path *path, - struct btrfs_key *key, - u32 data_size) -{ - return btrfs_insert_empty_items(trans, root, path, key, &data_size, 1); -} - -int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path 
*path); -static inline int btrfs_next_item(struct btrfs_root *root, struct btrfs_path *p) -{ - ++p->slots[0]; - if (p->slots[0] >= btrfs_header_nritems(p->nodes[0])) - return btrfs_next_leaf(root, p); - return 0; -} -int btrfs_prev_leaf(struct btrfs_root *root, struct btrfs_path *path); -int btrfs_leaf_free_space(struct btrfs_root *root, struct extent_buffer *leaf); -int __must_check btrfs_drop_snapshot(struct btrfs_root *root, - struct btrfs_block_rsv *block_rsv, - int update_ref, int for_reloc); -int btrfs_drop_subtree(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct extent_buffer *node, - struct extent_buffer *parent); -static inline int btrfs_fs_closing(struct btrfs_fs_info *fs_info) -{ - /* - * Get synced with close_ctree() - */ - smp_mb(); - return fs_info->closing; -} -static inline void free_fs_info(struct btrfs_fs_info *fs_info) -{ - kfree(fs_info->balance_ctl); - kfree(fs_info->delayed_root); - kfree(fs_info->extent_root); - kfree(fs_info->tree_root); - kfree(fs_info->chunk_root); - kfree(fs_info->dev_root); - kfree(fs_info->csum_root); - kfree(fs_info->super_copy); - kfree(fs_info->super_for_commit); - kfree(fs_info); -} - -/* root-item.c */ -int btrfs_find_root_ref(struct btrfs_root *tree_root, - struct btrfs_path *path, - u64 root_id, u64 ref_id); -int btrfs_add_root_ref(struct btrfs_trans_handle *trans, - struct btrfs_root *tree_root, - u64 root_id, u64 ref_id, u64 dirid, u64 sequence, - const char *name, int name_len); -int btrfs_del_root_ref(struct btrfs_trans_handle *trans, - struct btrfs_root *tree_root, - u64 root_id, u64 ref_id, u64 dirid, u64 *sequence, - const char *name, int name_len); -int btrfs_del_root(struct btrfs_trans_handle *trans, struct btrfs_root *root, - struct btrfs_key *key); -int btrfs_insert_root(struct btrfs_trans_handle *trans, struct btrfs_root - *root, struct btrfs_key *key, struct btrfs_root_item - *item); -int __must_check btrfs_update_root(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct btrfs_key *key, - struct btrfs_root_item *item); -int btrfs_find_last_root(struct btrfs_root *root, u64 objectid, struct - btrfs_root_item *item, struct btrfs_key *key); -int btrfs_find_dead_roots(struct btrfs_root *root, u64 objectid); -int btrfs_find_orphan_roots(struct btrfs_root *tree_root); -void btrfs_set_root_node(struct btrfs_root_item *item, - struct extent_buffer *node); -void btrfs_check_and_init_root_item(struct btrfs_root_item *item); - -/* dir-item.c */ -int btrfs_insert_dir_item(struct btrfs_trans_handle *trans, - struct btrfs_root *root, const char *name, - int name_len, struct inode *dir, - struct btrfs_key *location, u8 type, u64 index); -struct btrfs_dir_item *btrfs_lookup_dir_item(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct btrfs_path *path, u64 dir, - const char *name, int name_len, - int mod); -struct btrfs_dir_item * -btrfs_lookup_dir_index_item(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct btrfs_path *path, u64 dir, - u64 objectid, const char *name, int name_len, - int mod); -struct btrfs_dir_item * -btrfs_search_dir_index_item(struct btrfs_root *root, - struct btrfs_path *path, u64 dirid, - const char *name, int name_len); -struct btrfs_dir_item *btrfs_match_dir_item_name(struct btrfs_root *root, - struct btrfs_path *path, - const char *name, int name_len); -int btrfs_delete_one_dir_name(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct btrfs_path *path, - struct btrfs_dir_item *di); -int btrfs_insert_xattr_item(struct 
btrfs_trans_handle *trans, - struct btrfs_root *root, - struct btrfs_path *path, u64 objectid, - const char *name, u16 name_len, - const void *data, u16 data_len); -struct btrfs_dir_item *btrfs_lookup_xattr(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct btrfs_path *path, u64 dir, - const char *name, u16 name_len, - int mod); -int verify_dir_item(struct btrfs_root *root, - struct extent_buffer *leaf, - struct btrfs_dir_item *dir_item); - -/* orphan.c */ -int btrfs_insert_orphan_item(struct btrfs_trans_handle *trans, - struct btrfs_root *root, u64 offset); -int btrfs_del_orphan_item(struct btrfs_trans_handle *trans, - struct btrfs_root *root, u64 offset); -int btrfs_find_orphan_item(struct btrfs_root *root, u64 offset); - -/* inode-item.c */ -int btrfs_insert_inode_ref(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - const char *name, int name_len, - u64 inode_objectid, u64 ref_objectid, u64 index); -int btrfs_del_inode_ref(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - const char *name, int name_len, - u64 inode_objectid, u64 ref_objectid, u64 *index); -struct btrfs_inode_ref * -btrfs_lookup_inode_ref(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct btrfs_path *path, - const char *name, int name_len, - u64 inode_objectid, u64 ref_objectid, int mod); -int btrfs_insert_empty_inode(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct btrfs_path *path, u64 objectid); -int btrfs_lookup_inode(struct btrfs_trans_handle *trans, struct btrfs_root - *root, struct btrfs_path *path, - struct btrfs_key *location, int mod); - -/* file-item.c */ -int btrfs_del_csums(struct btrfs_trans_handle *trans, - struct btrfs_root *root, u64 bytenr, u64 len); -int btrfs_lookup_bio_sums(struct btrfs_root *root, struct inode *inode, - struct bio *bio, u32 *dst); -int btrfs_lookup_bio_sums_dio(struct btrfs_root *root, struct inode *inode, - struct bio *bio, u64 logical_offset, u32 *dst); -int btrfs_insert_file_extent(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - u64 objectid, u64 pos, - u64 disk_offset, u64 disk_num_bytes, - u64 num_bytes, u64 offset, u64 ram_bytes, - u8 compression, u8 encryption, u16 other_encoding); -int btrfs_lookup_file_extent(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct btrfs_path *path, u64 objectid, - u64 bytenr, int mod); -int btrfs_csum_file_blocks(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct btrfs_ordered_sum *sums); -int btrfs_csum_one_bio(struct btrfs_root *root, struct inode *inode, - struct bio *bio, u64 file_start, int contig); -struct btrfs_csum_item *btrfs_lookup_csum(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct btrfs_path *path, - u64 bytenr, int cow); -int btrfs_csum_truncate(struct btrfs_trans_handle *trans, - struct btrfs_root *root, struct btrfs_path *path, - u64 isize); -int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end, - struct list_head *list, int search_commit); -/* inode.c */ -struct extent_map *btrfs_get_extent_fiemap(struct inode *inode, struct page *page, - size_t pg_offset, u64 start, u64 len, - int create); - -/* RHEL and EL kernels have a patch that renames PG_checked to FsMisc */ -#if defined(ClearPageFsMisc) && !defined(ClearPageChecked) -#define ClearPageChecked ClearPageFsMisc -#define SetPageChecked SetPageFsMisc -#define PageChecked PageFsMisc -#endif - -/* This forces readahead on a given range of bytes in an inode */ -static inline void btrfs_force_ra(struct 
address_space *mapping, - struct file_ra_state *ra, struct file *file, - pgoff_t offset, unsigned long req_size) -{ - page_cache_sync_readahead(mapping, ra, file, offset, req_size); -} - -struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry); -int btrfs_set_inode_index(struct inode *dir, u64 *index); -int btrfs_unlink_inode(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct inode *dir, struct inode *inode, - const char *name, int name_len); -int btrfs_add_link(struct btrfs_trans_handle *trans, - struct inode *parent_inode, struct inode *inode, - const char *name, int name_len, int add_backref, u64 index); -int btrfs_unlink_subvol(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct inode *dir, u64 objectid, - const char *name, int name_len); -int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct inode *inode, u64 new_size, - u32 min_type); - -int btrfs_start_delalloc_inodes(struct btrfs_root *root, int delay_iput); -int btrfs_set_extent_delalloc(struct inode *inode, u64 start, u64 end, - struct extent_state **cached_state); -int btrfs_writepages(struct address_space *mapping, - struct writeback_control *wbc); -int btrfs_create_subvol_root(struct btrfs_trans_handle *trans, - struct btrfs_root *new_root, u64 new_dirid); -int btrfs_merge_bio_hook(struct page *page, unsigned long offset, - size_t size, struct bio *bio, unsigned long bio_flags); - -int btrfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf); -int btrfs_readpage(struct file *file, struct page *page); -void btrfs_evict_inode(struct inode *inode); -int btrfs_write_inode(struct inode *inode, struct writeback_control *wbc); -int btrfs_dirty_inode(struct inode *inode); -int btrfs_update_time(struct file *file); -struct inode *btrfs_alloc_inode(struct super_block *sb); -void btrfs_destroy_inode(struct inode *inode); -int btrfs_drop_inode(struct inode *inode); -int btrfs_init_cachep(void); -void btrfs_destroy_cachep(void); -long btrfs_ioctl_trans_end(struct file *file); -struct inode *btrfs_iget(struct super_block *s, struct btrfs_key *location, - struct btrfs_root *root, int *was_new); -struct extent_map *btrfs_get_extent(struct inode *inode, struct page *page, - size_t pg_offset, u64 start, u64 end, - int create); -int btrfs_update_inode(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct inode *inode); -int btrfs_orphan_add(struct btrfs_trans_handle *trans, struct inode *inode); -int btrfs_orphan_del(struct btrfs_trans_handle *trans, struct inode *inode); -int btrfs_orphan_cleanup(struct btrfs_root *root); -void btrfs_orphan_commit_root(struct btrfs_trans_handle *trans, - struct btrfs_root *root); -int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size); -void btrfs_invalidate_inodes(struct btrfs_root *root); -void btrfs_add_delayed_iput(struct inode *inode); -void btrfs_run_delayed_iputs(struct btrfs_root *root); -int btrfs_prealloc_file_range(struct inode *inode, int mode, - u64 start, u64 num_bytes, u64 min_size, - loff_t actual_len, u64 *alloc_hint); -int btrfs_prealloc_file_range_trans(struct inode *inode, - struct btrfs_trans_handle *trans, int mode, - u64 start, u64 num_bytes, u64 min_size, - loff_t actual_len, u64 *alloc_hint); -extern const struct dentry_operations btrfs_dentry_operations; - -/* ioctl.c */ -long btrfs_ioctl(struct file *file, unsigned int cmd, unsigned long arg); -void btrfs_update_iflags(struct inode *inode); -void btrfs_inherit_iflags(struct inode *inode, 
struct inode *dir); -int btrfs_defrag_file(struct inode *inode, struct file *file, - struct btrfs_ioctl_defrag_range_args *range, - u64 newer_than, unsigned long max_pages); -/* file.c */ -int btrfs_add_inode_defrag(struct btrfs_trans_handle *trans, - struct inode *inode); -int btrfs_run_defrag_inodes(struct btrfs_fs_info *fs_info); -int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync); -int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end, - int skip_pinned); -extern const struct file_operations btrfs_file_operations; -int btrfs_drop_extents(struct btrfs_trans_handle *trans, struct inode *inode, - u64 start, u64 end, u64 *hint_byte, int drop_cache); -int btrfs_mark_extent_written(struct btrfs_trans_handle *trans, - struct inode *inode, u64 start, u64 end); -int btrfs_release_file(struct inode *inode, struct file *file); -void btrfs_drop_pages(struct page **pages, size_t num_pages); -int btrfs_dirty_pages(struct btrfs_root *root, struct inode *inode, - struct page **pages, size_t num_pages, - loff_t pos, size_t write_bytes, - struct extent_state **cached); - -/* tree-defrag.c */ -int btrfs_defrag_leaves(struct btrfs_trans_handle *trans, - struct btrfs_root *root, int cache_only); - -/* sysfs.c */ -int btrfs_init_sysfs(void); -void btrfs_exit_sysfs(void); - -/* xattr.c */ -ssize_t btrfs_listxattr(struct dentry *dentry, char *buffer, size_t size); - -/* super.c */ -int btrfs_parse_options(struct btrfs_root *root, char *options); -int btrfs_sync_fs(struct super_block *sb, int wait); -void btrfs_printk(struct btrfs_fs_info *fs_info, const char *fmt, ...); -void __btrfs_std_error(struct btrfs_fs_info *fs_info, const char *function, - unsigned int line, int errno, const char *fmt, ...); - -void __btrfs_abort_transaction(struct btrfs_trans_handle *trans, - struct btrfs_root *root, const char *function, - unsigned int line, int errno); - -#define btrfs_abort_transaction(trans, root, errno) \ -do { \ - __btrfs_abort_transaction(trans, root, __func__, \ - __LINE__, errno); \ -} while (0) - -#define btrfs_std_error(fs_info, errno) \ -do { \ - if ((errno)) \ - __btrfs_std_error((fs_info), __func__, \ - __LINE__, (errno), NULL); \ -} while (0) - -#define btrfs_error(fs_info, errno, fmt, args...) \ -do { \ - __btrfs_std_error((fs_info), __func__, __LINE__, \ - (errno), fmt, ##args); \ -} while (0) - -void __btrfs_panic(struct btrfs_fs_info *fs_info, const char *function, - unsigned int line, int errno, const char *fmt, ...); - -#define btrfs_panic(fs_info, errno, fmt, args...) 
\ -do { \ - struct btrfs_fs_info *_i = (fs_info); \ - __btrfs_panic(_i, __func__, __LINE__, errno, fmt, ##args); \ - BUG_ON(!(_i->mount_opt & BTRFS_MOUNT_PANIC_ON_FATAL_ERROR)); \ -} while (0) - -/* acl.c */ -#ifdef CONFIG_BTRFS_FS_POSIX_ACL -struct posix_acl *btrfs_get_acl(struct inode *inode, int type); -int btrfs_init_acl(struct btrfs_trans_handle *trans, - struct inode *inode, struct inode *dir); -int btrfs_acl_chmod(struct inode *inode); -#else -#define btrfs_get_acl NULL -static inline int btrfs_init_acl(struct btrfs_trans_handle *trans, - struct inode *inode, struct inode *dir) -{ - return 0; -} -static inline int btrfs_acl_chmod(struct inode *inode) -{ - return 0; -} -#endif - -/* relocation.c */ -int btrfs_relocate_block_group(struct btrfs_root *root, u64 group_start); -int btrfs_init_reloc_root(struct btrfs_trans_handle *trans, - struct btrfs_root *root); -int btrfs_update_reloc_root(struct btrfs_trans_handle *trans, - struct btrfs_root *root); -int btrfs_recover_relocation(struct btrfs_root *root); -int btrfs_reloc_clone_csums(struct inode *inode, u64 file_pos, u64 len); -void btrfs_reloc_cow_block(struct btrfs_trans_handle *trans, - struct btrfs_root *root, struct extent_buffer *buf, - struct extent_buffer *cow); -void btrfs_reloc_pre_snapshot(struct btrfs_trans_handle *trans, - struct btrfs_pending_snapshot *pending, - u64 *bytes_to_reserve); -int btrfs_reloc_post_snapshot(struct btrfs_trans_handle *trans, - struct btrfs_pending_snapshot *pending); - -/* scrub.c */ -int btrfs_scrub_dev(struct btrfs_root *root, u64 devid, u64 start, u64 end, - struct btrfs_scrub_progress *progress, int readonly); -void btrfs_scrub_pause(struct btrfs_root *root); -void btrfs_scrub_pause_super(struct btrfs_root *root); -void btrfs_scrub_continue(struct btrfs_root *root); -void btrfs_scrub_continue_super(struct btrfs_root *root); -int __btrfs_scrub_cancel(struct btrfs_fs_info *info); -int btrfs_scrub_cancel(struct btrfs_root *root); -int btrfs_scrub_cancel_dev(struct btrfs_root *root, struct btrfs_device *dev); -int btrfs_scrub_cancel_devid(struct btrfs_root *root, u64 devid); -int btrfs_scrub_progress(struct btrfs_root *root, u64 devid, - struct btrfs_scrub_progress *progress); - -/* reada.c */ -struct reada_control { - struct btrfs_root *root; /* tree to prefetch */ - struct btrfs_key key_start; - struct btrfs_key key_end; /* exclusive */ - atomic_t elems; - struct kref refcnt; - wait_queue_head_t wait; -}; -struct reada_control *btrfs_reada_add(struct btrfs_root *root, - struct btrfs_key *start, struct btrfs_key *end); -int btrfs_reada_wait(void *handle); -void btrfs_reada_detach(void *handle); -int btree_readahead_hook(struct btrfs_root *root, struct extent_buffer *eb, - u64 start, int err); - -#endif diff --git a/ANDROID_3.4.5/fs/btrfs/delayed-inode.c b/ANDROID_3.4.5/fs/btrfs/delayed-inode.c deleted file mode 100644 index 03e3748d..00000000 --- a/ANDROID_3.4.5/fs/btrfs/delayed-inode.c +++ /dev/null @@ -1,1881 +0,0 @@ -/* - * Copyright (C) 2011 Fujitsu. All rights reserved. - * Written by Miao Xie <miaox@cn.fujitsu.com> - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public - * License v2 as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. 
- * - * You should have received a copy of the GNU General Public - * License along with this program; if not, write to the - * Free Software Foundation, Inc., 59 Temple Place - Suite 330, - * Boston, MA 021110-1307, USA. - */ - -#include <linux/slab.h> -#include "delayed-inode.h" -#include "disk-io.h" -#include "transaction.h" - -#define BTRFS_DELAYED_WRITEBACK 400 -#define BTRFS_DELAYED_BACKGROUND 100 - -static struct kmem_cache *delayed_node_cache; - -int __init btrfs_delayed_inode_init(void) -{ - delayed_node_cache = kmem_cache_create("delayed_node", - sizeof(struct btrfs_delayed_node), - 0, - SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, - NULL); - if (!delayed_node_cache) - return -ENOMEM; - return 0; -} - -void btrfs_delayed_inode_exit(void) -{ - if (delayed_node_cache) - kmem_cache_destroy(delayed_node_cache); -} - -static inline void btrfs_init_delayed_node( - struct btrfs_delayed_node *delayed_node, - struct btrfs_root *root, u64 inode_id) -{ - delayed_node->root = root; - delayed_node->inode_id = inode_id; - atomic_set(&delayed_node->refs, 0); - delayed_node->count = 0; - delayed_node->in_list = 0; - delayed_node->inode_dirty = 0; - delayed_node->ins_root = RB_ROOT; - delayed_node->del_root = RB_ROOT; - mutex_init(&delayed_node->mutex); - delayed_node->index_cnt = 0; - INIT_LIST_HEAD(&delayed_node->n_list); - INIT_LIST_HEAD(&delayed_node->p_list); - delayed_node->bytes_reserved = 0; -} - -static inline int btrfs_is_continuous_delayed_item( - struct btrfs_delayed_item *item1, - struct btrfs_delayed_item *item2) -{ - if (item1->key.type == BTRFS_DIR_INDEX_KEY && - item1->key.objectid == item2->key.objectid && - item1->key.type == item2->key.type && - item1->key.offset + 1 == item2->key.offset) - return 1; - return 0; -} - -static inline struct btrfs_delayed_root *btrfs_get_delayed_root( - struct btrfs_root *root) -{ - return root->fs_info->delayed_root; -} - -static struct btrfs_delayed_node *btrfs_get_delayed_node(struct inode *inode) -{ - struct btrfs_inode *btrfs_inode = BTRFS_I(inode); - struct btrfs_root *root = btrfs_inode->root; - u64 ino = btrfs_ino(inode); - struct btrfs_delayed_node *node; - - node = ACCESS_ONCE(btrfs_inode->delayed_node); - if (node) { - atomic_inc(&node->refs); - return node; - } - - spin_lock(&root->inode_lock); - node = radix_tree_lookup(&root->delayed_nodes_tree, ino); - if (node) { - if (btrfs_inode->delayed_node) { - atomic_inc(&node->refs); /* can be accessed */ - BUG_ON(btrfs_inode->delayed_node != node); - spin_unlock(&root->inode_lock); - return node; - } - btrfs_inode->delayed_node = node; - atomic_inc(&node->refs); /* can be accessed */ - atomic_inc(&node->refs); /* cached in the inode */ - spin_unlock(&root->inode_lock); - return node; - } - spin_unlock(&root->inode_lock); - - return NULL; -} - -/* Will return either the node or PTR_ERR(-ENOMEM) */ -static struct btrfs_delayed_node *btrfs_get_or_create_delayed_node( - struct inode *inode) -{ - struct btrfs_delayed_node *node; - struct btrfs_inode *btrfs_inode = BTRFS_I(inode); - struct btrfs_root *root = btrfs_inode->root; - u64 ino = btrfs_ino(inode); - int ret; - -again: - node = btrfs_get_delayed_node(inode); - if (node) - return node; - - node = kmem_cache_alloc(delayed_node_cache, GFP_NOFS); - if (!node) - return ERR_PTR(-ENOMEM); - btrfs_init_delayed_node(node, root, ino); - - atomic_inc(&node->refs); /* cached in the btrfs inode */ - atomic_inc(&node->refs); /* can be accessed */ - - ret = radix_tree_preload(GFP_NOFS & ~__GFP_HIGHMEM); - if (ret) { - kmem_cache_free(delayed_node_cache, 
node); - return ERR_PTR(ret); - } - - spin_lock(&root->inode_lock); - ret = radix_tree_insert(&root->delayed_nodes_tree, ino, node); - if (ret == -EEXIST) { - kmem_cache_free(delayed_node_cache, node); - spin_unlock(&root->inode_lock); - radix_tree_preload_end(); - goto again; - } - btrfs_inode->delayed_node = node; - spin_unlock(&root->inode_lock); - radix_tree_preload_end(); - - return node; -} - -/* - * Call it when holding delayed_node->mutex - * - * If mod = 1, add this node into the prepared list. - */ -static void btrfs_queue_delayed_node(struct btrfs_delayed_root *root, - struct btrfs_delayed_node *node, - int mod) -{ - spin_lock(&root->lock); - if (node->in_list) { - if (!list_empty(&node->p_list)) - list_move_tail(&node->p_list, &root->prepare_list); - else if (mod) - list_add_tail(&node->p_list, &root->prepare_list); - } else { - list_add_tail(&node->n_list, &root->node_list); - list_add_tail(&node->p_list, &root->prepare_list); - atomic_inc(&node->refs); /* inserted into list */ - root->nodes++; - node->in_list = 1; - } - spin_unlock(&root->lock); -} - -/* Call it when holding delayed_node->mutex */ -static void btrfs_dequeue_delayed_node(struct btrfs_delayed_root *root, - struct btrfs_delayed_node *node) -{ - spin_lock(&root->lock); - if (node->in_list) { - root->nodes--; - atomic_dec(&node->refs); /* not in the list */ - list_del_init(&node->n_list); - if (!list_empty(&node->p_list)) - list_del_init(&node->p_list); - node->in_list = 0; - } - spin_unlock(&root->lock); -} - -struct btrfs_delayed_node *btrfs_first_delayed_node( - struct btrfs_delayed_root *delayed_root) -{ - struct list_head *p; - struct btrfs_delayed_node *node = NULL; - - spin_lock(&delayed_root->lock); - if (list_empty(&delayed_root->node_list)) - goto out; - - p = delayed_root->node_list.next; - node = list_entry(p, struct btrfs_delayed_node, n_list); - atomic_inc(&node->refs); -out: - spin_unlock(&delayed_root->lock); - - return node; -} - -struct btrfs_delayed_node *btrfs_next_delayed_node( - struct btrfs_delayed_node *node) -{ - struct btrfs_delayed_root *delayed_root; - struct list_head *p; - struct btrfs_delayed_node *next = NULL; - - delayed_root = node->root->fs_info->delayed_root; - spin_lock(&delayed_root->lock); - if (!node->in_list) { /* not in the list */ - if (list_empty(&delayed_root->node_list)) - goto out; - p = delayed_root->node_list.next; - } else if (list_is_last(&node->n_list, &delayed_root->node_list)) - goto out; - else - p = node->n_list.next; - - next = list_entry(p, struct btrfs_delayed_node, n_list); - atomic_inc(&next->refs); -out: - spin_unlock(&delayed_root->lock); - - return next; -} - -static void __btrfs_release_delayed_node( - struct btrfs_delayed_node *delayed_node, - int mod) -{ - struct btrfs_delayed_root *delayed_root; - - if (!delayed_node) - return; - - delayed_root = delayed_node->root->fs_info->delayed_root; - - mutex_lock(&delayed_node->mutex); - if (delayed_node->count) - btrfs_queue_delayed_node(delayed_root, delayed_node, mod); - else - btrfs_dequeue_delayed_node(delayed_root, delayed_node); - mutex_unlock(&delayed_node->mutex); - - if (atomic_dec_and_test(&delayed_node->refs)) { - struct btrfs_root *root = delayed_node->root; - spin_lock(&root->inode_lock); - if (atomic_read(&delayed_node->refs) == 0) { - radix_tree_delete(&root->delayed_nodes_tree, - delayed_node->inode_id); - kmem_cache_free(delayed_node_cache, delayed_node); - } - spin_unlock(&root->inode_lock); - } -} - -static inline void btrfs_release_delayed_node(struct btrfs_delayed_node *node) -{ - 
__btrfs_release_delayed_node(node, 0); -} - -struct btrfs_delayed_node *btrfs_first_prepared_delayed_node( - struct btrfs_delayed_root *delayed_root) -{ - struct list_head *p; - struct btrfs_delayed_node *node = NULL; - - spin_lock(&delayed_root->lock); - if (list_empty(&delayed_root->prepare_list)) - goto out; - - p = delayed_root->prepare_list.next; - list_del_init(p); - node = list_entry(p, struct btrfs_delayed_node, p_list); - atomic_inc(&node->refs); -out: - spin_unlock(&delayed_root->lock); - - return node; -} - -static inline void btrfs_release_prepared_delayed_node( - struct btrfs_delayed_node *node) -{ - __btrfs_release_delayed_node(node, 1); -} - -struct btrfs_delayed_item *btrfs_alloc_delayed_item(u32 data_len) -{ - struct btrfs_delayed_item *item; - item = kmalloc(sizeof(*item) + data_len, GFP_NOFS); - if (item) { - item->data_len = data_len; - item->ins_or_del = 0; - item->bytes_reserved = 0; - item->delayed_node = NULL; - atomic_set(&item->refs, 1); - } - return item; -} - -/* - * __btrfs_lookup_delayed_item - look up the delayed item by key - * @delayed_node: pointer to the delayed node - * @key: the key to look up - * @prev: used to store the prev item if the right item isn't found - * @next: used to store the next item if the right item isn't found - * - * Note: if we don't find the right item, we will return the prev item and - * the next item. - */ -static struct btrfs_delayed_item *__btrfs_lookup_delayed_item( - struct rb_root *root, - struct btrfs_key *key, - struct btrfs_delayed_item **prev, - struct btrfs_delayed_item **next) -{ - struct rb_node *node, *prev_node = NULL; - struct btrfs_delayed_item *delayed_item = NULL; - int ret = 0; - - node = root->rb_node; - - while (node) { - delayed_item = rb_entry(node, struct btrfs_delayed_item, - rb_node); - prev_node = node; - ret = btrfs_comp_cpu_keys(&delayed_item->key, key); - if (ret < 0) - node = node->rb_right; - else if (ret > 0) - node = node->rb_left; - else - return delayed_item; - } - - if (prev) { - if (!prev_node) - *prev = NULL; - else if (ret < 0) - *prev = delayed_item; - else if ((node = rb_prev(prev_node)) != NULL) { - *prev = rb_entry(node, struct btrfs_delayed_item, - rb_node); - } else - *prev = NULL; - } - - if (next) { - if (!prev_node) - *next = NULL; - else if (ret > 0) - *next = delayed_item; - else if ((node = rb_next(prev_node)) != NULL) { - *next = rb_entry(node, struct btrfs_delayed_item, - rb_node); - } else - *next = NULL; - } - return NULL; -} - -struct btrfs_delayed_item *__btrfs_lookup_delayed_insertion_item( - struct btrfs_delayed_node *delayed_node, - struct btrfs_key *key) -{ - struct btrfs_delayed_item *item; - - item = __btrfs_lookup_delayed_item(&delayed_node->ins_root, key, - NULL, NULL); - return item; -} - -struct btrfs_delayed_item *__btrfs_lookup_delayed_deletion_item( - struct btrfs_delayed_node *delayed_node, - struct btrfs_key *key) -{ - struct btrfs_delayed_item *item; - - item = __btrfs_lookup_delayed_item(&delayed_node->del_root, key, - NULL, NULL); - return item; -} - -struct btrfs_delayed_item *__btrfs_search_delayed_insertion_item( - struct btrfs_delayed_node *delayed_node, - struct btrfs_key *key) -{ - struct btrfs_delayed_item *item, *next; - - item = __btrfs_lookup_delayed_item(&delayed_node->ins_root, key, - NULL, &next); - if (!item) - item = next; - - return item; -} - -struct btrfs_delayed_item *__btrfs_search_delayed_deletion_item( - struct btrfs_delayed_node *delayed_node, - struct btrfs_key *key) -{ - struct btrfs_delayed_item *item, *next; - - item = 
__btrfs_lookup_delayed_item(&delayed_node->del_root, key, - NULL, &next); - if (!item) - item = next; - - return item; -} - -static int __btrfs_add_delayed_item(struct btrfs_delayed_node *delayed_node, - struct btrfs_delayed_item *ins, - int action) -{ - struct rb_node **p, *node; - struct rb_node *parent_node = NULL; - struct rb_root *root; - struct btrfs_delayed_item *item; - int cmp; - - if (action == BTRFS_DELAYED_INSERTION_ITEM) - root = &delayed_node->ins_root; - else if (action == BTRFS_DELAYED_DELETION_ITEM) - root = &delayed_node->del_root; - else - BUG(); - p = &root->rb_node; - node = &ins->rb_node; - - while (*p) { - parent_node = *p; - item = rb_entry(parent_node, struct btrfs_delayed_item, - rb_node); - - cmp = btrfs_comp_cpu_keys(&item->key, &ins->key); - if (cmp < 0) - p = &(*p)->rb_right; - else if (cmp > 0) - p = &(*p)->rb_left; - else - return -EEXIST; - } - - rb_link_node(node, parent_node, p); - rb_insert_color(node, root); - ins->delayed_node = delayed_node; - ins->ins_or_del = action; - - if (ins->key.type == BTRFS_DIR_INDEX_KEY && - action == BTRFS_DELAYED_INSERTION_ITEM && - ins->key.offset >= delayed_node->index_cnt) - delayed_node->index_cnt = ins->key.offset + 1; - - delayed_node->count++; - atomic_inc(&delayed_node->root->fs_info->delayed_root->items); - return 0; -} - -static int __btrfs_add_delayed_insertion_item(struct btrfs_delayed_node *node, - struct btrfs_delayed_item *item) -{ - return __btrfs_add_delayed_item(node, item, - BTRFS_DELAYED_INSERTION_ITEM); -} - -static int __btrfs_add_delayed_deletion_item(struct btrfs_delayed_node *node, - struct btrfs_delayed_item *item) -{ - return __btrfs_add_delayed_item(node, item, - BTRFS_DELAYED_DELETION_ITEM); -} - -static void __btrfs_remove_delayed_item(struct btrfs_delayed_item *delayed_item) -{ - struct rb_root *root; - struct btrfs_delayed_root *delayed_root; - - delayed_root = delayed_item->delayed_node->root->fs_info->delayed_root; - - BUG_ON(!delayed_root); - BUG_ON(delayed_item->ins_or_del != BTRFS_DELAYED_DELETION_ITEM && - delayed_item->ins_or_del != BTRFS_DELAYED_INSERTION_ITEM); - - if (delayed_item->ins_or_del == BTRFS_DELAYED_INSERTION_ITEM) - root = &delayed_item->delayed_node->ins_root; - else - root = &delayed_item->delayed_node->del_root; - - rb_erase(&delayed_item->rb_node, root); - delayed_item->delayed_node->count--; - atomic_dec(&delayed_root->items); - if (atomic_read(&delayed_root->items) < BTRFS_DELAYED_BACKGROUND && - waitqueue_active(&delayed_root->wait)) - wake_up(&delayed_root->wait); -} - -static void btrfs_release_delayed_item(struct btrfs_delayed_item *item) -{ - if (item) { - __btrfs_remove_delayed_item(item); - if (atomic_dec_and_test(&item->refs)) - kfree(item); - } -} - -struct btrfs_delayed_item *__btrfs_first_delayed_insertion_item( - struct btrfs_delayed_node *delayed_node) -{ - struct rb_node *p; - struct btrfs_delayed_item *item = NULL; - - p = rb_first(&delayed_node->ins_root); - if (p) - item = rb_entry(p, struct btrfs_delayed_item, rb_node); - - return item; -} - -struct btrfs_delayed_item *__btrfs_first_delayed_deletion_item( - struct btrfs_delayed_node *delayed_node) -{ - struct rb_node *p; - struct btrfs_delayed_item *item = NULL; - - p = rb_first(&delayed_node->del_root); - if (p) - item = rb_entry(p, struct btrfs_delayed_item, rb_node); - - return item; -} - -struct btrfs_delayed_item *__btrfs_next_delayed_item( - struct btrfs_delayed_item *item) -{ - struct rb_node *p; - struct btrfs_delayed_item *next = NULL; - - p = rb_next(&item->rb_node); - if (p) - next = 
rb_entry(p, struct btrfs_delayed_item, rb_node); - - return next; -} - -static inline struct btrfs_root *btrfs_get_fs_root(struct btrfs_root *root, - u64 root_id) -{ - struct btrfs_key root_key; - - if (root->objectid == root_id) - return root; - - root_key.objectid = root_id; - root_key.type = BTRFS_ROOT_ITEM_KEY; - root_key.offset = (u64)-1; - return btrfs_read_fs_root_no_name(root->fs_info, &root_key); -} - -static int btrfs_delayed_item_reserve_metadata(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct btrfs_delayed_item *item) -{ - struct btrfs_block_rsv *src_rsv; - struct btrfs_block_rsv *dst_rsv; - u64 num_bytes; - int ret; - - if (!trans->bytes_reserved) - return 0; - - src_rsv = trans->block_rsv; - dst_rsv = &root->fs_info->delayed_block_rsv; - - num_bytes = btrfs_calc_trans_metadata_size(root, 1); - ret = btrfs_block_rsv_migrate(src_rsv, dst_rsv, num_bytes); - if (!ret) { - trace_btrfs_space_reservation(root->fs_info, "delayed_item", - item->key.objectid, - num_bytes, 1); - item->bytes_reserved = num_bytes; - } - - return ret; -} - -static void btrfs_delayed_item_release_metadata(struct btrfs_root *root, - struct btrfs_delayed_item *item) -{ - struct btrfs_block_rsv *rsv; - - if (!item->bytes_reserved) - return; - - rsv = &root->fs_info->delayed_block_rsv; - trace_btrfs_space_reservation(root->fs_info, "delayed_item", - item->key.objectid, item->bytes_reserved, - 0); - btrfs_block_rsv_release(root, rsv, - item->bytes_reserved); -} - -static int btrfs_delayed_inode_reserve_metadata( - struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct inode *inode, - struct btrfs_delayed_node *node) -{ - struct btrfs_block_rsv *src_rsv; - struct btrfs_block_rsv *dst_rsv; - u64 num_bytes; - int ret; - bool release = false; - - src_rsv = trans->block_rsv; - dst_rsv = &root->fs_info->delayed_block_rsv; - - num_bytes = btrfs_calc_trans_metadata_size(root, 1); - - /* - * btrfs_dirty_inode will update the inode under btrfs_join_transaction - * which doesn't reserve space for speed. This is a problem since we - * still need to reserve space for this update, so try to reserve the - * space. - * - * Now if src_rsv == delalloc_block_rsv we'll let it just steal since - * we're accounted for. - */ - if (!src_rsv || (!trans->bytes_reserved && - src_rsv != &root->fs_info->delalloc_block_rsv)) { - ret = btrfs_block_rsv_add_noflush(root, dst_rsv, num_bytes); - /* - * Since we're under a transaction reserve_metadata_bytes could - * try to commit the transaction which will make it return - * EAGAIN to make us stop the transaction we have, so return - * ENOSPC instead so that btrfs_dirty_inode knows what to do. - */ - if (ret == -EAGAIN) - ret = -ENOSPC; - if (!ret) { - node->bytes_reserved = num_bytes; - trace_btrfs_space_reservation(root->fs_info, - "delayed_inode", - btrfs_ino(inode), - num_bytes, 1); - } - return ret; - } else if (src_rsv == &root->fs_info->delalloc_block_rsv) { - spin_lock(&BTRFS_I(inode)->lock); - if (BTRFS_I(inode)->delalloc_meta_reserved) { - BTRFS_I(inode)->delalloc_meta_reserved = 0; - spin_unlock(&BTRFS_I(inode)->lock); - release = true; - goto migrate; - } - spin_unlock(&BTRFS_I(inode)->lock); - - /* Ok we didn't have space pre-reserved. This shouldn't happen - * too often but it can happen if we do delalloc to an existing - * inode which gets dirtied because of the time update, and then - * isn't touched again until after the transaction commits and - * then we try to write out the data. 
First try to be nice and - * reserve something strictly for us. If not be a pain and try - * to steal from the delalloc block rsv. - */ - ret = btrfs_block_rsv_add_noflush(root, dst_rsv, num_bytes); - if (!ret) - goto out; - - ret = btrfs_block_rsv_migrate(src_rsv, dst_rsv, num_bytes); - if (!ret) - goto out; - - /* - * Ok this is a problem, let's just steal from the global rsv - * since this really shouldn't happen that often. - */ - WARN_ON(1); - ret = btrfs_block_rsv_migrate(&root->fs_info->global_block_rsv, - dst_rsv, num_bytes); - goto out; - } - -migrate: - ret = btrfs_block_rsv_migrate(src_rsv, dst_rsv, num_bytes); - -out: - /* - * Migrate only takes a reservation, it doesn't touch the size of the - * block_rsv. This is to simplify people who don't normally have things - * migrated from their block rsv. If they go to release their - * reservation, that will decrease the size as well, so if migrate - * reduced size we'd end up with a negative size. But for the - * delalloc_meta_reserved stuff we will only know to drop 1 reservation, - * but we could in fact do this reserve/migrate dance several times - * between the time we did the original reservation and we'd clean it - * up. So to take care of this, release the space for the meta - * reservation here. I think it may be time for a documentation page on - * how block rsvs. work. - */ - if (!ret) { - trace_btrfs_space_reservation(root->fs_info, "delayed_inode", - btrfs_ino(inode), num_bytes, 1); - node->bytes_reserved = num_bytes; - } - - if (release) { - trace_btrfs_space_reservation(root->fs_info, "delalloc", - btrfs_ino(inode), num_bytes, 0); - btrfs_block_rsv_release(root, src_rsv, num_bytes); - } - - return ret; -} - -static void btrfs_delayed_inode_release_metadata(struct btrfs_root *root, - struct btrfs_delayed_node *node) -{ - struct btrfs_block_rsv *rsv; - - if (!node->bytes_reserved) - return; - - rsv = &root->fs_info->delayed_block_rsv; - trace_btrfs_space_reservation(root->fs_info, "delayed_inode", - node->inode_id, node->bytes_reserved, 0); - btrfs_block_rsv_release(root, rsv, - node->bytes_reserved); - node->bytes_reserved = 0; -} - -/* - * This helper will insert some continuous items into the same leaf according - * to the free space of the leaf. - */ -static int btrfs_batch_insert_items(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct btrfs_path *path, - struct btrfs_delayed_item *item) -{ - struct btrfs_delayed_item *curr, *next; - int free_space; - int total_data_size = 0, total_size = 0; - struct extent_buffer *leaf; - char *data_ptr; - struct btrfs_key *keys; - u32 *data_size; - struct list_head head; - int slot; - int nitems; - int i; - int ret = 0; - - BUG_ON(!path->nodes[0]); - - leaf = path->nodes[0]; - free_space = btrfs_leaf_free_space(root, leaf); - INIT_LIST_HEAD(&head); - - next = item; - nitems = 0; - - /* - * count the number of the continuous items that we can insert in batch - */ - while (total_size + next->data_len + sizeof(struct btrfs_item) <= - free_space) { - total_data_size += next->data_len; - total_size += next->data_len + sizeof(struct btrfs_item); - list_add_tail(&next->tree_list, &head); - nitems++; - - curr = next; - next = __btrfs_next_delayed_item(curr); - if (!next) - break; - - if (!btrfs_is_continuous_delayed_item(curr, next)) - break; - } - - if (!nitems) { - ret = 0; - goto out; - } - - /* - * we need allocate some memory space, but it might cause the task - * to sleep, so we set all locked nodes in the path to blocking locks - * first. 
- */ - btrfs_set_path_blocking(path); - - keys = kmalloc(sizeof(struct btrfs_key) * nitems, GFP_NOFS); - if (!keys) { - ret = -ENOMEM; - goto out; - } - - data_size = kmalloc(sizeof(u32) * nitems, GFP_NOFS); - if (!data_size) { - ret = -ENOMEM; - goto error; - } - - /* get keys of all the delayed items */ - i = 0; - list_for_each_entry(next, &head, tree_list) { - keys[i] = next->key; - data_size[i] = next->data_len; - i++; - } - - /* reset all the locked nodes in the patch to spinning locks. */ - btrfs_clear_path_blocking(path, NULL, 0); - - /* insert the keys of the items */ - setup_items_for_insert(trans, root, path, keys, data_size, - total_data_size, total_size, nitems); - - /* insert the dir index items */ - slot = path->slots[0]; - list_for_each_entry_safe(curr, next, &head, tree_list) { - data_ptr = btrfs_item_ptr(leaf, slot, char); - write_extent_buffer(leaf, &curr->data, - (unsigned long)data_ptr, - curr->data_len); - slot++; - - btrfs_delayed_item_release_metadata(root, curr); - - list_del(&curr->tree_list); - btrfs_release_delayed_item(curr); - } - -error: - kfree(data_size); - kfree(keys); -out: - return ret; -} - -/* - * This helper can just do simple insertion that needn't extend item for new - * data, such as directory name index insertion, inode insertion. - */ -static int btrfs_insert_delayed_item(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct btrfs_path *path, - struct btrfs_delayed_item *delayed_item) -{ - struct extent_buffer *leaf; - struct btrfs_item *item; - char *ptr; - int ret; - - ret = btrfs_insert_empty_item(trans, root, path, &delayed_item->key, - delayed_item->data_len); - if (ret < 0 && ret != -EEXIST) - return ret; - - leaf = path->nodes[0]; - - item = btrfs_item_nr(leaf, path->slots[0]); - ptr = btrfs_item_ptr(leaf, path->slots[0], char); - - write_extent_buffer(leaf, delayed_item->data, (unsigned long)ptr, - delayed_item->data_len); - btrfs_mark_buffer_dirty(leaf); - - btrfs_delayed_item_release_metadata(root, delayed_item); - return 0; -} - -/* - * we insert an item first, then if there are some continuous items, we try - * to insert those items into the same leaf. 
- */ -static int btrfs_insert_delayed_items(struct btrfs_trans_handle *trans, - struct btrfs_path *path, - struct btrfs_root *root, - struct btrfs_delayed_node *node) -{ - struct btrfs_delayed_item *curr, *prev; - int ret = 0; - -do_again: - mutex_lock(&node->mutex); - curr = __btrfs_first_delayed_insertion_item(node); - if (!curr) - goto insert_end; - - ret = btrfs_insert_delayed_item(trans, root, path, curr); - if (ret < 0) { - btrfs_release_path(path); - goto insert_end; - } - - prev = curr; - curr = __btrfs_next_delayed_item(prev); - if (curr && btrfs_is_continuous_delayed_item(prev, curr)) { - /* insert the continuous items into the same leaf */ - path->slots[0]++; - btrfs_batch_insert_items(trans, root, path, curr); - } - btrfs_release_delayed_item(prev); - btrfs_mark_buffer_dirty(path->nodes[0]); - - btrfs_release_path(path); - mutex_unlock(&node->mutex); - goto do_again; - -insert_end: - mutex_unlock(&node->mutex); - return ret; -} - -static int btrfs_batch_delete_items(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct btrfs_path *path, - struct btrfs_delayed_item *item) -{ - struct btrfs_delayed_item *curr, *next; - struct extent_buffer *leaf; - struct btrfs_key key; - struct list_head head; - int nitems, i, last_item; - int ret = 0; - - BUG_ON(!path->nodes[0]); - - leaf = path->nodes[0]; - - i = path->slots[0]; - last_item = btrfs_header_nritems(leaf) - 1; - if (i > last_item) - return -ENOENT; /* FIXME: Is errno suitable? */ - - next = item; - INIT_LIST_HEAD(&head); - btrfs_item_key_to_cpu(leaf, &key, i); - nitems = 0; - /* - * count the number of the dir index items that we can delete in batch - */ - while (btrfs_comp_cpu_keys(&next->key, &key) == 0) { - list_add_tail(&next->tree_list, &head); - nitems++; - - curr = next; - next = __btrfs_next_delayed_item(curr); - if (!next) - break; - - if (!btrfs_is_continuous_delayed_item(curr, next)) - break; - - i++; - if (i > last_item) - break; - btrfs_item_key_to_cpu(leaf, &key, i); - } - - if (!nitems) - return 0; - - ret = btrfs_del_items(trans, root, path, path->slots[0], nitems); - if (ret) - goto out; - - list_for_each_entry_safe(curr, next, &head, tree_list) { - btrfs_delayed_item_release_metadata(root, curr); - list_del(&curr->tree_list); - btrfs_release_delayed_item(curr); - } - -out: - return ret; -} - -static int btrfs_delete_delayed_items(struct btrfs_trans_handle *trans, - struct btrfs_path *path, - struct btrfs_root *root, - struct btrfs_delayed_node *node) -{ - struct btrfs_delayed_item *curr, *prev; - int ret = 0; - -do_again: - mutex_lock(&node->mutex); - curr = __btrfs_first_delayed_deletion_item(node); - if (!curr) - goto delete_fail; - - ret = btrfs_search_slot(trans, root, &curr->key, path, -1, 1); - if (ret < 0) - goto delete_fail; - else if (ret > 0) { - /* - * can't find the item which the node points to, so this node - * is invalid, just drop it. 
- */ - prev = curr; - curr = __btrfs_next_delayed_item(prev); - btrfs_release_delayed_item(prev); - ret = 0; - btrfs_release_path(path); - if (curr) - goto do_again; - else - goto delete_fail; - } - - btrfs_batch_delete_items(trans, root, path, curr); - btrfs_release_path(path); - mutex_unlock(&node->mutex); - goto do_again; - -delete_fail: - btrfs_release_path(path); - mutex_unlock(&node->mutex); - return ret; -} - -static void btrfs_release_delayed_inode(struct btrfs_delayed_node *delayed_node) -{ - struct btrfs_delayed_root *delayed_root; - - if (delayed_node && delayed_node->inode_dirty) { - BUG_ON(!delayed_node->root); - delayed_node->inode_dirty = 0; - delayed_node->count--; - - delayed_root = delayed_node->root->fs_info->delayed_root; - atomic_dec(&delayed_root->items); - if (atomic_read(&delayed_root->items) < - BTRFS_DELAYED_BACKGROUND && - waitqueue_active(&delayed_root->wait)) - wake_up(&delayed_root->wait); - } -} - -static int btrfs_update_delayed_inode(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct btrfs_path *path, - struct btrfs_delayed_node *node) -{ - struct btrfs_key key; - struct btrfs_inode_item *inode_item; - struct extent_buffer *leaf; - int ret; - - mutex_lock(&node->mutex); - if (!node->inode_dirty) { - mutex_unlock(&node->mutex); - return 0; - } - - key.objectid = node->inode_id; - btrfs_set_key_type(&key, BTRFS_INODE_ITEM_KEY); - key.offset = 0; - ret = btrfs_lookup_inode(trans, root, path, &key, 1); - if (ret > 0) { - btrfs_release_path(path); - mutex_unlock(&node->mutex); - return -ENOENT; - } else if (ret < 0) { - mutex_unlock(&node->mutex); - return ret; - } - - btrfs_unlock_up_safe(path, 1); - leaf = path->nodes[0]; - inode_item = btrfs_item_ptr(leaf, path->slots[0], - struct btrfs_inode_item); - write_extent_buffer(leaf, &node->inode_item, (unsigned long)inode_item, - sizeof(struct btrfs_inode_item)); - btrfs_mark_buffer_dirty(leaf); - btrfs_release_path(path); - - btrfs_delayed_inode_release_metadata(root, node); - btrfs_release_delayed_inode(node); - mutex_unlock(&node->mutex); - - return 0; -} - -/* - * Called when committing the transaction. - * Returns 0 on success. - * Returns < 0 on error and returns with an aborted transaction with any - * outstanding delayed items cleaned up. 
- */ -int btrfs_run_delayed_items(struct btrfs_trans_handle *trans, - struct btrfs_root *root) -{ - struct btrfs_root *curr_root = root; - struct btrfs_delayed_root *delayed_root; - struct btrfs_delayed_node *curr_node, *prev_node; - struct btrfs_path *path; - struct btrfs_block_rsv *block_rsv; - int ret = 0; - - if (trans->aborted) - return -EIO; - - path = btrfs_alloc_path(); - if (!path) - return -ENOMEM; - path->leave_spinning = 1; - - block_rsv = trans->block_rsv; - trans->block_rsv = &root->fs_info->delayed_block_rsv; - - delayed_root = btrfs_get_delayed_root(root); - - curr_node = btrfs_first_delayed_node(delayed_root); - while (curr_node) { - curr_root = curr_node->root; - ret = btrfs_insert_delayed_items(trans, path, curr_root, - curr_node); - if (!ret) - ret = btrfs_delete_delayed_items(trans, path, - curr_root, curr_node); - if (!ret) - ret = btrfs_update_delayed_inode(trans, curr_root, - path, curr_node); - if (ret) { - btrfs_release_delayed_node(curr_node); - btrfs_abort_transaction(trans, root, ret); - break; - } - - prev_node = curr_node; - curr_node = btrfs_next_delayed_node(curr_node); - btrfs_release_delayed_node(prev_node); - } - - btrfs_free_path(path); - trans->block_rsv = block_rsv; - - return ret; -} - -static int __btrfs_commit_inode_delayed_items(struct btrfs_trans_handle *trans, - struct btrfs_delayed_node *node) -{ - struct btrfs_path *path; - struct btrfs_block_rsv *block_rsv; - int ret; - - path = btrfs_alloc_path(); - if (!path) - return -ENOMEM; - path->leave_spinning = 1; - - block_rsv = trans->block_rsv; - trans->block_rsv = &node->root->fs_info->delayed_block_rsv; - - ret = btrfs_insert_delayed_items(trans, path, node->root, node); - if (!ret) - ret = btrfs_delete_delayed_items(trans, path, node->root, node); - if (!ret) - ret = btrfs_update_delayed_inode(trans, node->root, path, node); - btrfs_free_path(path); - - trans->block_rsv = block_rsv; - return ret; -} - -int btrfs_commit_inode_delayed_items(struct btrfs_trans_handle *trans, - struct inode *inode) -{ - struct btrfs_delayed_node *delayed_node = btrfs_get_delayed_node(inode); - int ret; - - if (!delayed_node) - return 0; - - mutex_lock(&delayed_node->mutex); - if (!delayed_node->count) { - mutex_unlock(&delayed_node->mutex); - btrfs_release_delayed_node(delayed_node); - return 0; - } - mutex_unlock(&delayed_node->mutex); - - ret = __btrfs_commit_inode_delayed_items(trans, delayed_node); - btrfs_release_delayed_node(delayed_node); - return ret; -} - -void btrfs_remove_delayed_node(struct inode *inode) -{ - struct btrfs_delayed_node *delayed_node; - - delayed_node = ACCESS_ONCE(BTRFS_I(inode)->delayed_node); - if (!delayed_node) - return; - - BTRFS_I(inode)->delayed_node = NULL; - btrfs_release_delayed_node(delayed_node); -} - -struct btrfs_async_delayed_node { - struct btrfs_root *root; - struct btrfs_delayed_node *delayed_node; - struct btrfs_work work; -}; - -static void btrfs_async_run_delayed_node_done(struct btrfs_work *work) -{ - struct btrfs_async_delayed_node *async_node; - struct btrfs_trans_handle *trans; - struct btrfs_path *path; - struct btrfs_delayed_node *delayed_node = NULL; - struct btrfs_root *root; - struct btrfs_block_rsv *block_rsv; - unsigned long nr = 0; - int need_requeue = 0; - int ret; - - async_node = container_of(work, struct btrfs_async_delayed_node, work); - - path = btrfs_alloc_path(); - if (!path) - goto out; - path->leave_spinning = 1; - - delayed_node = async_node->delayed_node; - root = delayed_node->root; - - trans = btrfs_join_transaction(root); - if 
(IS_ERR(trans)) - goto free_path; - - block_rsv = trans->block_rsv; - trans->block_rsv = &root->fs_info->delayed_block_rsv; - - ret = btrfs_insert_delayed_items(trans, path, root, delayed_node); - if (!ret) - ret = btrfs_delete_delayed_items(trans, path, root, - delayed_node); - - if (!ret) - btrfs_update_delayed_inode(trans, root, path, delayed_node); - - /* - * Maybe new delayed items have been inserted, so we need requeue - * the work. Besides that, we must dequeue the empty delayed nodes - * to avoid the race between delayed items balance and the worker. - * The race like this: - * Task1 Worker thread - * count == 0, needn't requeue - * also needn't insert the - * delayed node into prepare - * list again. - * add lots of delayed items - * queue the delayed node - * already in the list, - * and not in the prepare - * list, it means the delayed - * node is being dealt with - * by the worker. - * do delayed items balance - * the delayed node is being - * dealt with by the worker - * now, just wait. - * the worker goto idle. - * Task1 will sleep until the transaction is commited. - */ - mutex_lock(&delayed_node->mutex); - if (delayed_node->count) - need_requeue = 1; - else - btrfs_dequeue_delayed_node(root->fs_info->delayed_root, - delayed_node); - mutex_unlock(&delayed_node->mutex); - - nr = trans->blocks_used; - - trans->block_rsv = block_rsv; - btrfs_end_transaction_dmeta(trans, root); - __btrfs_btree_balance_dirty(root, nr); -free_path: - btrfs_free_path(path); -out: - if (need_requeue) - btrfs_requeue_work(&async_node->work); - else { - btrfs_release_prepared_delayed_node(delayed_node); - kfree(async_node); - } -} - -static int btrfs_wq_run_delayed_node(struct btrfs_delayed_root *delayed_root, - struct btrfs_root *root, int all) -{ - struct btrfs_async_delayed_node *async_node; - struct btrfs_delayed_node *curr; - int count = 0; - -again: - curr = btrfs_first_prepared_delayed_node(delayed_root); - if (!curr) - return 0; - - async_node = kmalloc(sizeof(*async_node), GFP_NOFS); - if (!async_node) { - btrfs_release_prepared_delayed_node(curr); - return -ENOMEM; - } - - async_node->root = root; - async_node->delayed_node = curr; - - async_node->work.func = btrfs_async_run_delayed_node_done; - async_node->work.flags = 0; - - btrfs_queue_worker(&root->fs_info->delayed_workers, &async_node->work); - count++; - - if (all || count < 4) - goto again; - - return 0; -} - -void btrfs_assert_delayed_root_empty(struct btrfs_root *root) -{ - struct btrfs_delayed_root *delayed_root; - delayed_root = btrfs_get_delayed_root(root); - WARN_ON(btrfs_first_delayed_node(delayed_root)); -} - -void btrfs_balance_delayed_items(struct btrfs_root *root) -{ - struct btrfs_delayed_root *delayed_root; - - delayed_root = btrfs_get_delayed_root(root); - - if (atomic_read(&delayed_root->items) < BTRFS_DELAYED_BACKGROUND) - return; - - if (atomic_read(&delayed_root->items) >= BTRFS_DELAYED_WRITEBACK) { - int ret; - ret = btrfs_wq_run_delayed_node(delayed_root, root, 1); - if (ret) - return; - - wait_event_interruptible_timeout( - delayed_root->wait, - (atomic_read(&delayed_root->items) < - BTRFS_DELAYED_BACKGROUND), - HZ); - return; - } - - btrfs_wq_run_delayed_node(delayed_root, root, 0); -} - -/* Will return 0 or -ENOMEM */ -int btrfs_insert_delayed_dir_index(struct btrfs_trans_handle *trans, - struct btrfs_root *root, const char *name, - int name_len, struct inode *dir, - struct btrfs_disk_key *disk_key, u8 type, - u64 index) -{ - struct btrfs_delayed_node *delayed_node; - struct btrfs_delayed_item *delayed_item; - 
struct btrfs_dir_item *dir_item; - int ret; - - delayed_node = btrfs_get_or_create_delayed_node(dir); - if (IS_ERR(delayed_node)) - return PTR_ERR(delayed_node); - - delayed_item = btrfs_alloc_delayed_item(sizeof(*dir_item) + name_len); - if (!delayed_item) { - ret = -ENOMEM; - goto release_node; - } - - delayed_item->key.objectid = btrfs_ino(dir); - btrfs_set_key_type(&delayed_item->key, BTRFS_DIR_INDEX_KEY); - delayed_item->key.offset = index; - - dir_item = (struct btrfs_dir_item *)delayed_item->data; - dir_item->location = *disk_key; - dir_item->transid = cpu_to_le64(trans->transid); - dir_item->data_len = 0; - dir_item->name_len = cpu_to_le16(name_len); - dir_item->type = type; - memcpy((char *)(dir_item + 1), name, name_len); - - ret = btrfs_delayed_item_reserve_metadata(trans, root, delayed_item); - /* - * we have reserved enough space when we start a new transaction, - * so reserving metadata failure is impossible - */ - BUG_ON(ret); - - - mutex_lock(&delayed_node->mutex); - ret = __btrfs_add_delayed_insertion_item(delayed_node, delayed_item); - if (unlikely(ret)) { - printk(KERN_ERR "err add delayed dir index item(name: %s) into " - "the insertion tree of the delayed node" - "(root id: %llu, inode id: %llu, errno: %d)\n", - name, - (unsigned long long)delayed_node->root->objectid, - (unsigned long long)delayed_node->inode_id, - ret); - BUG(); - } - mutex_unlock(&delayed_node->mutex); - -release_node: - btrfs_release_delayed_node(delayed_node); - return ret; -} - -static int btrfs_delete_delayed_insertion_item(struct btrfs_root *root, - struct btrfs_delayed_node *node, - struct btrfs_key *key) -{ - struct btrfs_delayed_item *item; - - mutex_lock(&node->mutex); - item = __btrfs_lookup_delayed_insertion_item(node, key); - if (!item) { - mutex_unlock(&node->mutex); - return 1; - } - - btrfs_delayed_item_release_metadata(root, item); - btrfs_release_delayed_item(item); - mutex_unlock(&node->mutex); - return 0; -} - -int btrfs_delete_delayed_dir_index(struct btrfs_trans_handle *trans, - struct btrfs_root *root, struct inode *dir, - u64 index) -{ - struct btrfs_delayed_node *node; - struct btrfs_delayed_item *item; - struct btrfs_key item_key; - int ret; - - node = btrfs_get_or_create_delayed_node(dir); - if (IS_ERR(node)) - return PTR_ERR(node); - - item_key.objectid = btrfs_ino(dir); - btrfs_set_key_type(&item_key, BTRFS_DIR_INDEX_KEY); - item_key.offset = index; - - ret = btrfs_delete_delayed_insertion_item(root, node, &item_key); - if (!ret) - goto end; - - item = btrfs_alloc_delayed_item(0); - if (!item) { - ret = -ENOMEM; - goto end; - } - - item->key = item_key; - - ret = btrfs_delayed_item_reserve_metadata(trans, root, item); - /* - * we have reserved enough space when we start a new transaction, - * so reserving metadata failure is impossible. 
- */ - BUG_ON(ret); - - mutex_lock(&node->mutex); - ret = __btrfs_add_delayed_deletion_item(node, item); - if (unlikely(ret)) { - printk(KERN_ERR "err add delayed dir index item(index: %llu) " - "into the deletion tree of the delayed node" - "(root id: %llu, inode id: %llu, errno: %d)\n", - (unsigned long long)index, - (unsigned long long)node->root->objectid, - (unsigned long long)node->inode_id, - ret); - BUG(); - } - mutex_unlock(&node->mutex); -end: - btrfs_release_delayed_node(node); - return ret; -} - -int btrfs_inode_delayed_dir_index_count(struct inode *inode) -{ - struct btrfs_delayed_node *delayed_node = btrfs_get_delayed_node(inode); - - if (!delayed_node) - return -ENOENT; - - /* - * Since we have held i_mutex of this directory, it is impossible that - * a new directory index is added into the delayed node and index_cnt - * is updated now. So we needn't lock the delayed node. - */ - if (!delayed_node->index_cnt) { - btrfs_release_delayed_node(delayed_node); - return -EINVAL; - } - - BTRFS_I(inode)->index_cnt = delayed_node->index_cnt; - btrfs_release_delayed_node(delayed_node); - return 0; -} - -void btrfs_get_delayed_items(struct inode *inode, struct list_head *ins_list, - struct list_head *del_list) -{ - struct btrfs_delayed_node *delayed_node; - struct btrfs_delayed_item *item; - - delayed_node = btrfs_get_delayed_node(inode); - if (!delayed_node) - return; - - mutex_lock(&delayed_node->mutex); - item = __btrfs_first_delayed_insertion_item(delayed_node); - while (item) { - atomic_inc(&item->refs); - list_add_tail(&item->readdir_list, ins_list); - item = __btrfs_next_delayed_item(item); - } - - item = __btrfs_first_delayed_deletion_item(delayed_node); - while (item) { - atomic_inc(&item->refs); - list_add_tail(&item->readdir_list, del_list); - item = __btrfs_next_delayed_item(item); - } - mutex_unlock(&delayed_node->mutex); - /* - * This delayed node is still cached in the btrfs inode, so refs - * must be > 1 now, and we needn't check it is going to be freed - * or not. - * - * Besides that, this function is used to read dir, we do not - * insert/delete delayed items in this period. So we also needn't - * requeue or dequeue this delayed node. 
- */ - atomic_dec(&delayed_node->refs); -} - -void btrfs_put_delayed_items(struct list_head *ins_list, - struct list_head *del_list) -{ - struct btrfs_delayed_item *curr, *next; - - list_for_each_entry_safe(curr, next, ins_list, readdir_list) { - list_del(&curr->readdir_list); - if (atomic_dec_and_test(&curr->refs)) - kfree(curr); - } - - list_for_each_entry_safe(curr, next, del_list, readdir_list) { - list_del(&curr->readdir_list); - if (atomic_dec_and_test(&curr->refs)) - kfree(curr); - } -} - -int btrfs_should_delete_dir_index(struct list_head *del_list, - u64 index) -{ - struct btrfs_delayed_item *curr, *next; - int ret; - - if (list_empty(del_list)) - return 0; - - list_for_each_entry_safe(curr, next, del_list, readdir_list) { - if (curr->key.offset > index) - break; - - list_del(&curr->readdir_list); - ret = (curr->key.offset == index); - - if (atomic_dec_and_test(&curr->refs)) - kfree(curr); - - if (ret) - return 1; - else - continue; - } - return 0; -} - -/* - * btrfs_readdir_delayed_dir_index - read dir info stored in the delayed tree - * - */ -int btrfs_readdir_delayed_dir_index(struct file *filp, void *dirent, - filldir_t filldir, - struct list_head *ins_list) -{ - struct btrfs_dir_item *di; - struct btrfs_delayed_item *curr, *next; - struct btrfs_key location; - char *name; - int name_len; - int over = 0; - unsigned char d_type; - - if (list_empty(ins_list)) - return 0; - - /* - * Changing the data of the delayed item is impossible. So - * we needn't lock them. And we have held i_mutex of the - * directory, nobody can delete any directory indexes now. - */ - list_for_each_entry_safe(curr, next, ins_list, readdir_list) { - list_del(&curr->readdir_list); - - if (curr->key.offset < filp->f_pos) { - if (atomic_dec_and_test(&curr->refs)) - kfree(curr); - continue; - } - - filp->f_pos = curr->key.offset; - - di = (struct btrfs_dir_item *)curr->data; - name = (char *)(di + 1); - name_len = le16_to_cpu(di->name_len); - - d_type = btrfs_filetype_table[di->type]; - btrfs_disk_key_to_cpu(&location, &di->location); - - over = filldir(dirent, name, name_len, curr->key.offset, - location.objectid, d_type); - - if (atomic_dec_and_test(&curr->refs)) - kfree(curr); - - if (over) - return 1; - } - return 0; -} - -BTRFS_SETGET_STACK_FUNCS(stack_inode_generation, struct btrfs_inode_item, - generation, 64); -BTRFS_SETGET_STACK_FUNCS(stack_inode_sequence, struct btrfs_inode_item, - sequence, 64); -BTRFS_SETGET_STACK_FUNCS(stack_inode_transid, struct btrfs_inode_item, - transid, 64); -BTRFS_SETGET_STACK_FUNCS(stack_inode_size, struct btrfs_inode_item, size, 64); -BTRFS_SETGET_STACK_FUNCS(stack_inode_nbytes, struct btrfs_inode_item, - nbytes, 64); -BTRFS_SETGET_STACK_FUNCS(stack_inode_block_group, struct btrfs_inode_item, - block_group, 64); -BTRFS_SETGET_STACK_FUNCS(stack_inode_nlink, struct btrfs_inode_item, nlink, 32); -BTRFS_SETGET_STACK_FUNCS(stack_inode_uid, struct btrfs_inode_item, uid, 32); -BTRFS_SETGET_STACK_FUNCS(stack_inode_gid, struct btrfs_inode_item, gid, 32); -BTRFS_SETGET_STACK_FUNCS(stack_inode_mode, struct btrfs_inode_item, mode, 32); -BTRFS_SETGET_STACK_FUNCS(stack_inode_rdev, struct btrfs_inode_item, rdev, 64); -BTRFS_SETGET_STACK_FUNCS(stack_inode_flags, struct btrfs_inode_item, flags, 64); - -BTRFS_SETGET_STACK_FUNCS(stack_timespec_sec, struct btrfs_timespec, sec, 64); -BTRFS_SETGET_STACK_FUNCS(stack_timespec_nsec, struct btrfs_timespec, nsec, 32); - -static void fill_stack_inode_item(struct btrfs_trans_handle *trans, - struct btrfs_inode_item *inode_item, - struct inode *inode) 
-{ - btrfs_set_stack_inode_uid(inode_item, inode->i_uid); - btrfs_set_stack_inode_gid(inode_item, inode->i_gid); - btrfs_set_stack_inode_size(inode_item, BTRFS_I(inode)->disk_i_size); - btrfs_set_stack_inode_mode(inode_item, inode->i_mode); - btrfs_set_stack_inode_nlink(inode_item, inode->i_nlink); - btrfs_set_stack_inode_nbytes(inode_item, inode_get_bytes(inode)); - btrfs_set_stack_inode_generation(inode_item, - BTRFS_I(inode)->generation); - btrfs_set_stack_inode_sequence(inode_item, BTRFS_I(inode)->sequence); - btrfs_set_stack_inode_transid(inode_item, trans->transid); - btrfs_set_stack_inode_rdev(inode_item, inode->i_rdev); - btrfs_set_stack_inode_flags(inode_item, BTRFS_I(inode)->flags); - btrfs_set_stack_inode_block_group(inode_item, 0); - - btrfs_set_stack_timespec_sec(btrfs_inode_atime(inode_item), - inode->i_atime.tv_sec); - btrfs_set_stack_timespec_nsec(btrfs_inode_atime(inode_item), - inode->i_atime.tv_nsec); - - btrfs_set_stack_timespec_sec(btrfs_inode_mtime(inode_item), - inode->i_mtime.tv_sec); - btrfs_set_stack_timespec_nsec(btrfs_inode_mtime(inode_item), - inode->i_mtime.tv_nsec); - - btrfs_set_stack_timespec_sec(btrfs_inode_ctime(inode_item), - inode->i_ctime.tv_sec); - btrfs_set_stack_timespec_nsec(btrfs_inode_ctime(inode_item), - inode->i_ctime.tv_nsec); -} - -int btrfs_fill_inode(struct inode *inode, u32 *rdev) -{ - struct btrfs_delayed_node *delayed_node; - struct btrfs_inode_item *inode_item; - struct btrfs_timespec *tspec; - - delayed_node = btrfs_get_delayed_node(inode); - if (!delayed_node) - return -ENOENT; - - mutex_lock(&delayed_node->mutex); - if (!delayed_node->inode_dirty) { - mutex_unlock(&delayed_node->mutex); - btrfs_release_delayed_node(delayed_node); - return -ENOENT; - } - - inode_item = &delayed_node->inode_item; - - inode->i_uid = btrfs_stack_inode_uid(inode_item); - inode->i_gid = btrfs_stack_inode_gid(inode_item); - btrfs_i_size_write(inode, btrfs_stack_inode_size(inode_item)); - inode->i_mode = btrfs_stack_inode_mode(inode_item); - set_nlink(inode, btrfs_stack_inode_nlink(inode_item)); - inode_set_bytes(inode, btrfs_stack_inode_nbytes(inode_item)); - BTRFS_I(inode)->generation = btrfs_stack_inode_generation(inode_item); - BTRFS_I(inode)->sequence = btrfs_stack_inode_sequence(inode_item); - inode->i_rdev = 0; - *rdev = btrfs_stack_inode_rdev(inode_item); - BTRFS_I(inode)->flags = btrfs_stack_inode_flags(inode_item); - - tspec = btrfs_inode_atime(inode_item); - inode->i_atime.tv_sec = btrfs_stack_timespec_sec(tspec); - inode->i_atime.tv_nsec = btrfs_stack_timespec_nsec(tspec); - - tspec = btrfs_inode_mtime(inode_item); - inode->i_mtime.tv_sec = btrfs_stack_timespec_sec(tspec); - inode->i_mtime.tv_nsec = btrfs_stack_timespec_nsec(tspec); - - tspec = btrfs_inode_ctime(inode_item); - inode->i_ctime.tv_sec = btrfs_stack_timespec_sec(tspec); - inode->i_ctime.tv_nsec = btrfs_stack_timespec_nsec(tspec); - - inode->i_generation = BTRFS_I(inode)->generation; - BTRFS_I(inode)->index_cnt = (u64)-1; - - mutex_unlock(&delayed_node->mutex); - btrfs_release_delayed_node(delayed_node); - return 0; -} - -int btrfs_delayed_update_inode(struct btrfs_trans_handle *trans, - struct btrfs_root *root, struct inode *inode) -{ - struct btrfs_delayed_node *delayed_node; - int ret = 0; - - delayed_node = btrfs_get_or_create_delayed_node(inode); - if (IS_ERR(delayed_node)) - return PTR_ERR(delayed_node); - - mutex_lock(&delayed_node->mutex); - if (delayed_node->inode_dirty) { - fill_stack_inode_item(trans, &delayed_node->inode_item, inode); - goto release_node; - } - - ret = 
btrfs_delayed_inode_reserve_metadata(trans, root, inode, - delayed_node); - if (ret) - goto release_node; - - fill_stack_inode_item(trans, &delayed_node->inode_item, inode); - delayed_node->inode_dirty = 1; - delayed_node->count++; - atomic_inc(&root->fs_info->delayed_root->items); -release_node: - mutex_unlock(&delayed_node->mutex); - btrfs_release_delayed_node(delayed_node); - return ret; -} - -static void __btrfs_kill_delayed_node(struct btrfs_delayed_node *delayed_node) -{ - struct btrfs_root *root = delayed_node->root; - struct btrfs_delayed_item *curr_item, *prev_item; - - mutex_lock(&delayed_node->mutex); - curr_item = __btrfs_first_delayed_insertion_item(delayed_node); - while (curr_item) { - btrfs_delayed_item_release_metadata(root, curr_item); - prev_item = curr_item; - curr_item = __btrfs_next_delayed_item(prev_item); - btrfs_release_delayed_item(prev_item); - } - - curr_item = __btrfs_first_delayed_deletion_item(delayed_node); - while (curr_item) { - btrfs_delayed_item_release_metadata(root, curr_item); - prev_item = curr_item; - curr_item = __btrfs_next_delayed_item(prev_item); - btrfs_release_delayed_item(prev_item); - } - - if (delayed_node->inode_dirty) { - btrfs_delayed_inode_release_metadata(root, delayed_node); - btrfs_release_delayed_inode(delayed_node); - } - mutex_unlock(&delayed_node->mutex); -} - -void btrfs_kill_delayed_inode_items(struct inode *inode) -{ - struct btrfs_delayed_node *delayed_node; - - delayed_node = btrfs_get_delayed_node(inode); - if (!delayed_node) - return; - - __btrfs_kill_delayed_node(delayed_node); - btrfs_release_delayed_node(delayed_node); -} - -void btrfs_kill_all_delayed_nodes(struct btrfs_root *root) -{ - u64 inode_id = 0; - struct btrfs_delayed_node *delayed_nodes[8]; - int i, n; - - while (1) { - spin_lock(&root->inode_lock); - n = radix_tree_gang_lookup(&root->delayed_nodes_tree, - (void **)delayed_nodes, inode_id, - ARRAY_SIZE(delayed_nodes)); - if (!n) { - spin_unlock(&root->inode_lock); - break; - } - - inode_id = delayed_nodes[n - 1]->inode_id + 1; - - for (i = 0; i < n; i++) - atomic_inc(&delayed_nodes[i]->refs); - spin_unlock(&root->inode_lock); - - for (i = 0; i < n; i++) { - __btrfs_kill_delayed_node(delayed_nodes[i]); - btrfs_release_delayed_node(delayed_nodes[i]); - } - } -} diff --git a/ANDROID_3.4.5/fs/btrfs/delayed-inode.h b/ANDROID_3.4.5/fs/btrfs/delayed-inode.h deleted file mode 100644 index 7083d08b..00000000 --- a/ANDROID_3.4.5/fs/btrfs/delayed-inode.h +++ /dev/null @@ -1,145 +0,0 @@ -/* - * Copyright (C) 2011 Fujitsu. All rights reserved. - * Written by Miao Xie <miaox@cn.fujitsu.com> - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public - * License v2 as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public - * License along with this program; if not, write to the - * Free Software Foundation, Inc., 59 Temple Place - Suite 330, - * Boston, MA 021110-1307, USA. 
- */ - -#ifndef __DELAYED_TREE_OPERATION_H -#define __DELAYED_TREE_OPERATION_H - -#include <linux/rbtree.h> -#include <linux/spinlock.h> -#include <linux/mutex.h> -#include <linux/list.h> -#include <linux/wait.h> -#include <linux/atomic.h> - -#include "ctree.h" - -/* types of the delayed item */ -#define BTRFS_DELAYED_INSERTION_ITEM 1 -#define BTRFS_DELAYED_DELETION_ITEM 2 - -struct btrfs_delayed_root { - spinlock_t lock; - struct list_head node_list; - /* - * Used for delayed nodes which is waiting to be dealt with by the - * worker. If the delayed node is inserted into the work queue, we - * drop it from this list. - */ - struct list_head prepare_list; - atomic_t items; /* for delayed items */ - int nodes; /* for delayed nodes */ - wait_queue_head_t wait; -}; - -struct btrfs_delayed_node { - u64 inode_id; - u64 bytes_reserved; - struct btrfs_root *root; - /* Used to add the node into the delayed root's node list. */ - struct list_head n_list; - /* - * Used to add the node into the prepare list, the nodes in this list - * is waiting to be dealt with by the async worker. - */ - struct list_head p_list; - struct rb_root ins_root; - struct rb_root del_root; - struct mutex mutex; - struct btrfs_inode_item inode_item; - atomic_t refs; - u64 index_cnt; - bool in_list; - bool inode_dirty; - int count; -}; - -struct btrfs_delayed_item { - struct rb_node rb_node; - struct btrfs_key key; - struct list_head tree_list; /* used for batch insert/delete items */ - struct list_head readdir_list; /* used for readdir items */ - u64 bytes_reserved; - struct btrfs_delayed_node *delayed_node; - atomic_t refs; - int ins_or_del; - u32 data_len; - char data[0]; -}; - -static inline void btrfs_init_delayed_root( - struct btrfs_delayed_root *delayed_root) -{ - atomic_set(&delayed_root->items, 0); - delayed_root->nodes = 0; - spin_lock_init(&delayed_root->lock); - init_waitqueue_head(&delayed_root->wait); - INIT_LIST_HEAD(&delayed_root->node_list); - INIT_LIST_HEAD(&delayed_root->prepare_list); -} - -int btrfs_insert_delayed_dir_index(struct btrfs_trans_handle *trans, - struct btrfs_root *root, const char *name, - int name_len, struct inode *dir, - struct btrfs_disk_key *disk_key, u8 type, - u64 index); - -int btrfs_delete_delayed_dir_index(struct btrfs_trans_handle *trans, - struct btrfs_root *root, struct inode *dir, - u64 index); - -int btrfs_inode_delayed_dir_index_count(struct inode *inode); - -int btrfs_run_delayed_items(struct btrfs_trans_handle *trans, - struct btrfs_root *root); - -void btrfs_balance_delayed_items(struct btrfs_root *root); - -int btrfs_commit_inode_delayed_items(struct btrfs_trans_handle *trans, - struct inode *inode); -/* Used for evicting the inode. 
*/ -void btrfs_remove_delayed_node(struct inode *inode); -void btrfs_kill_delayed_inode_items(struct inode *inode); - - -int btrfs_delayed_update_inode(struct btrfs_trans_handle *trans, - struct btrfs_root *root, struct inode *inode); -int btrfs_fill_inode(struct inode *inode, u32 *rdev); - -/* Used for drop dead root */ -void btrfs_kill_all_delayed_nodes(struct btrfs_root *root); - -/* Used for readdir() */ -void btrfs_get_delayed_items(struct inode *inode, struct list_head *ins_list, - struct list_head *del_list); -void btrfs_put_delayed_items(struct list_head *ins_list, - struct list_head *del_list); -int btrfs_should_delete_dir_index(struct list_head *del_list, - u64 index); -int btrfs_readdir_delayed_dir_index(struct file *filp, void *dirent, - filldir_t filldir, - struct list_head *ins_list); - -/* for init */ -int __init btrfs_delayed_inode_init(void); -void btrfs_delayed_inode_exit(void); - -/* for debugging */ -void btrfs_assert_delayed_root_empty(struct btrfs_root *root); - -#endif diff --git a/ANDROID_3.4.5/fs/btrfs/delayed-ref.c b/ANDROID_3.4.5/fs/btrfs/delayed-ref.c deleted file mode 100644 index 69f22e3a..00000000 --- a/ANDROID_3.4.5/fs/btrfs/delayed-ref.c +++ /dev/null @@ -1,759 +0,0 @@ -/* - * Copyright (C) 2009 Oracle. All rights reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public - * License v2 as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public - * License along with this program; if not, write to the - * Free Software Foundation, Inc., 59 Temple Place - Suite 330, - * Boston, MA 021110-1307, USA. - */ - -#include <linux/sched.h> -#include <linux/slab.h> -#include <linux/sort.h> -#include "ctree.h" -#include "delayed-ref.h" -#include "transaction.h" - -/* - * delayed back reference update tracking. For subvolume trees - * we queue up extent allocations and backref maintenance for - * delayed processing. This avoids deep call chains where we - * add extents in the middle of btrfs_search_slot, and it allows - * us to buffer up frequently modified backrefs in an rb tree instead - * of hammering updates on the extent allocation tree. 
- */ - -/* - * compare two delayed tree backrefs with same bytenr and type - */ -static int comp_tree_refs(struct btrfs_delayed_tree_ref *ref2, - struct btrfs_delayed_tree_ref *ref1) -{ - if (ref1->node.type == BTRFS_TREE_BLOCK_REF_KEY) { - if (ref1->root < ref2->root) - return -1; - if (ref1->root > ref2->root) - return 1; - } else { - if (ref1->parent < ref2->parent) - return -1; - if (ref1->parent > ref2->parent) - return 1; - } - return 0; -} - -/* - * compare two delayed data backrefs with same bytenr and type - */ -static int comp_data_refs(struct btrfs_delayed_data_ref *ref2, - struct btrfs_delayed_data_ref *ref1) -{ - if (ref1->node.type == BTRFS_EXTENT_DATA_REF_KEY) { - if (ref1->root < ref2->root) - return -1; - if (ref1->root > ref2->root) - return 1; - if (ref1->objectid < ref2->objectid) - return -1; - if (ref1->objectid > ref2->objectid) - return 1; - if (ref1->offset < ref2->offset) - return -1; - if (ref1->offset > ref2->offset) - return 1; - } else { - if (ref1->parent < ref2->parent) - return -1; - if (ref1->parent > ref2->parent) - return 1; - } - return 0; -} - -/* - * entries in the rb tree are ordered by the byte number of the extent, - * type of the delayed backrefs and content of delayed backrefs. - */ -static int comp_entry(struct btrfs_delayed_ref_node *ref2, - struct btrfs_delayed_ref_node *ref1) -{ - if (ref1->bytenr < ref2->bytenr) - return -1; - if (ref1->bytenr > ref2->bytenr) - return 1; - if (ref1->is_head && ref2->is_head) - return 0; - if (ref2->is_head) - return -1; - if (ref1->is_head) - return 1; - if (ref1->type < ref2->type) - return -1; - if (ref1->type > ref2->type) - return 1; - /* merging of sequenced refs is not allowed */ - if (ref1->seq < ref2->seq) - return -1; - if (ref1->seq > ref2->seq) - return 1; - if (ref1->type == BTRFS_TREE_BLOCK_REF_KEY || - ref1->type == BTRFS_SHARED_BLOCK_REF_KEY) { - return comp_tree_refs(btrfs_delayed_node_to_tree_ref(ref2), - btrfs_delayed_node_to_tree_ref(ref1)); - } else if (ref1->type == BTRFS_EXTENT_DATA_REF_KEY || - ref1->type == BTRFS_SHARED_DATA_REF_KEY) { - return comp_data_refs(btrfs_delayed_node_to_data_ref(ref2), - btrfs_delayed_node_to_data_ref(ref1)); - } - BUG(); - return 0; -} - -/* - * insert a new ref into the rbtree. This returns any existing refs - * for the same (bytenr,parent) tuple, or NULL if the new node was properly - * inserted. - */ -static struct btrfs_delayed_ref_node *tree_insert(struct rb_root *root, - struct rb_node *node) -{ - struct rb_node **p = &root->rb_node; - struct rb_node *parent_node = NULL; - struct btrfs_delayed_ref_node *entry; - struct btrfs_delayed_ref_node *ins; - int cmp; - - ins = rb_entry(node, struct btrfs_delayed_ref_node, rb_node); - while (*p) { - parent_node = *p; - entry = rb_entry(parent_node, struct btrfs_delayed_ref_node, - rb_node); - - cmp = comp_entry(entry, ins); - if (cmp < 0) - p = &(*p)->rb_left; - else if (cmp > 0) - p = &(*p)->rb_right; - else - return entry; - } - - rb_link_node(node, parent_node, p); - rb_insert_color(node, root); - return NULL; -} - -/* - * find an head entry based on bytenr. This returns the delayed ref - * head if it was able to find one, or NULL if nothing was in that spot. - * If return_bigger is given, the next bigger entry is returned if no exact - * match is found. 
- */ -static struct btrfs_delayed_ref_node *find_ref_head(struct rb_root *root, - u64 bytenr, - struct btrfs_delayed_ref_node **last, - int return_bigger) -{ - struct rb_node *n; - struct btrfs_delayed_ref_node *entry; - int cmp = 0; - -again: - n = root->rb_node; - entry = NULL; - while (n) { - entry = rb_entry(n, struct btrfs_delayed_ref_node, rb_node); - WARN_ON(!entry->in_tree); - if (last) - *last = entry; - - if (bytenr < entry->bytenr) - cmp = -1; - else if (bytenr > entry->bytenr) - cmp = 1; - else if (!btrfs_delayed_ref_is_head(entry)) - cmp = 1; - else - cmp = 0; - - if (cmp < 0) - n = n->rb_left; - else if (cmp > 0) - n = n->rb_right; - else - return entry; - } - if (entry && return_bigger) { - if (cmp > 0) { - n = rb_next(&entry->rb_node); - if (!n) - n = rb_first(root); - entry = rb_entry(n, struct btrfs_delayed_ref_node, - rb_node); - bytenr = entry->bytenr; - return_bigger = 0; - goto again; - } - return entry; - } - return NULL; -} - -int btrfs_delayed_ref_lock(struct btrfs_trans_handle *trans, - struct btrfs_delayed_ref_head *head) -{ - struct btrfs_delayed_ref_root *delayed_refs; - - delayed_refs = &trans->transaction->delayed_refs; - assert_spin_locked(&delayed_refs->lock); - if (mutex_trylock(&head->mutex)) - return 0; - - atomic_inc(&head->node.refs); - spin_unlock(&delayed_refs->lock); - - mutex_lock(&head->mutex); - spin_lock(&delayed_refs->lock); - if (!head->node.in_tree) { - mutex_unlock(&head->mutex); - btrfs_put_delayed_ref(&head->node); - return -EAGAIN; - } - btrfs_put_delayed_ref(&head->node); - return 0; -} - -int btrfs_check_delayed_seq(struct btrfs_delayed_ref_root *delayed_refs, - u64 seq) -{ - struct seq_list *elem; - - assert_spin_locked(&delayed_refs->lock); - if (list_empty(&delayed_refs->seq_head)) - return 0; - - elem = list_first_entry(&delayed_refs->seq_head, struct seq_list, list); - if (seq >= elem->seq) { - pr_debug("holding back delayed_ref %llu, lowest is %llu (%p)\n", - seq, elem->seq, delayed_refs); - return 1; - } - return 0; -} - -int btrfs_find_ref_cluster(struct btrfs_trans_handle *trans, - struct list_head *cluster, u64 start) -{ - int count = 0; - struct btrfs_delayed_ref_root *delayed_refs; - struct rb_node *node; - struct btrfs_delayed_ref_node *ref; - struct btrfs_delayed_ref_head *head; - - delayed_refs = &trans->transaction->delayed_refs; - if (start == 0) { - node = rb_first(&delayed_refs->root); - } else { - ref = NULL; - find_ref_head(&delayed_refs->root, start + 1, &ref, 1); - if (ref) { - node = &ref->rb_node; - } else - node = rb_first(&delayed_refs->root); - } -again: - while (node && count < 32) { - ref = rb_entry(node, struct btrfs_delayed_ref_node, rb_node); - if (btrfs_delayed_ref_is_head(ref)) { - head = btrfs_delayed_node_to_head(ref); - if (list_empty(&head->cluster)) { - list_add_tail(&head->cluster, cluster); - delayed_refs->run_delayed_start = - head->node.bytenr; - count++; - - WARN_ON(delayed_refs->num_heads_ready == 0); - delayed_refs->num_heads_ready--; - } else if (count) { - /* the goal of the clustering is to find extents - * that are likely to end up in the same extent - * leaf on disk. So, we don't want them spread - * all over the tree. Stop now if we've hit - * a head that was already in use - */ - break; - } - } - node = rb_next(node); - } - if (count) { - return 0; - } else if (start) { - /* - * we've gone to the end of the rbtree without finding any - * clusters. 
start from the beginning and try again - */ - start = 0; - node = rb_first(&delayed_refs->root); - goto again; - } - return 1; -} - -/* - * helper function to update an extent delayed ref in the - * rbtree. existing and update must both have the same - * bytenr and parent - * - * This may free existing if the update cancels out whatever - * operation it was doing. - */ -static noinline void -update_existing_ref(struct btrfs_trans_handle *trans, - struct btrfs_delayed_ref_root *delayed_refs, - struct btrfs_delayed_ref_node *existing, - struct btrfs_delayed_ref_node *update) -{ - if (update->action != existing->action) { - /* - * this is effectively undoing either an add or a - * drop. We decrement the ref_mod, and if it goes - * down to zero we just delete the entry without - * every changing the extent allocation tree. - */ - existing->ref_mod--; - if (existing->ref_mod == 0) { - rb_erase(&existing->rb_node, - &delayed_refs->root); - existing->in_tree = 0; - btrfs_put_delayed_ref(existing); - delayed_refs->num_entries--; - if (trans->delayed_ref_updates) - trans->delayed_ref_updates--; - } else { - WARN_ON(existing->type == BTRFS_TREE_BLOCK_REF_KEY || - existing->type == BTRFS_SHARED_BLOCK_REF_KEY); - } - } else { - WARN_ON(existing->type == BTRFS_TREE_BLOCK_REF_KEY || - existing->type == BTRFS_SHARED_BLOCK_REF_KEY); - /* - * the action on the existing ref matches - * the action on the ref we're trying to add. - * Bump the ref_mod by one so the backref that - * is eventually added/removed has the correct - * reference count - */ - existing->ref_mod += update->ref_mod; - } -} - -/* - * helper function to update the accounting in the head ref - * existing and update must have the same bytenr - */ -static noinline void -update_existing_head_ref(struct btrfs_delayed_ref_node *existing, - struct btrfs_delayed_ref_node *update) -{ - struct btrfs_delayed_ref_head *existing_ref; - struct btrfs_delayed_ref_head *ref; - - existing_ref = btrfs_delayed_node_to_head(existing); - ref = btrfs_delayed_node_to_head(update); - BUG_ON(existing_ref->is_data != ref->is_data); - - if (ref->must_insert_reserved) { - /* if the extent was freed and then - * reallocated before the delayed ref - * entries were processed, we can end up - * with an existing head ref without - * the must_insert_reserved flag set. - * Set it again here - */ - existing_ref->must_insert_reserved = ref->must_insert_reserved; - - /* - * update the num_bytes so we make sure the accounting - * is done correctly - */ - existing->num_bytes = update->num_bytes; - - } - - if (ref->extent_op) { - if (!existing_ref->extent_op) { - existing_ref->extent_op = ref->extent_op; - } else { - if (ref->extent_op->update_key) { - memcpy(&existing_ref->extent_op->key, - &ref->extent_op->key, - sizeof(ref->extent_op->key)); - existing_ref->extent_op->update_key = 1; - } - if (ref->extent_op->update_flags) { - existing_ref->extent_op->flags_to_set |= - ref->extent_op->flags_to_set; - existing_ref->extent_op->update_flags = 1; - } - kfree(ref->extent_op); - } - } - /* - * update the reference mod on the head to reflect this new operation - */ - existing->ref_mod += update->ref_mod; -} - -/* - * helper function to actually insert a head node into the rbtree. - * this does all the dirty work in terms of maintaining the correct - * overall modification count. 
- */ -static noinline void add_delayed_ref_head(struct btrfs_fs_info *fs_info, - struct btrfs_trans_handle *trans, - struct btrfs_delayed_ref_node *ref, - u64 bytenr, u64 num_bytes, - int action, int is_data) -{ - struct btrfs_delayed_ref_node *existing; - struct btrfs_delayed_ref_head *head_ref = NULL; - struct btrfs_delayed_ref_root *delayed_refs; - int count_mod = 1; - int must_insert_reserved = 0; - - /* - * the head node stores the sum of all the mods, so dropping a ref - * should drop the sum in the head node by one. - */ - if (action == BTRFS_UPDATE_DELAYED_HEAD) - count_mod = 0; - else if (action == BTRFS_DROP_DELAYED_REF) - count_mod = -1; - - /* - * BTRFS_ADD_DELAYED_EXTENT means that we need to update - * the reserved accounting when the extent is finally added, or - * if a later modification deletes the delayed ref without ever - * inserting the extent into the extent allocation tree. - * ref->must_insert_reserved is the flag used to record - * that accounting mods are required. - * - * Once we record must_insert_reserved, switch the action to - * BTRFS_ADD_DELAYED_REF because other special casing is not required. - */ - if (action == BTRFS_ADD_DELAYED_EXTENT) - must_insert_reserved = 1; - else - must_insert_reserved = 0; - - delayed_refs = &trans->transaction->delayed_refs; - - /* first set the basic ref node struct up */ - atomic_set(&ref->refs, 1); - ref->bytenr = bytenr; - ref->num_bytes = num_bytes; - ref->ref_mod = count_mod; - ref->type = 0; - ref->action = 0; - ref->is_head = 1; - ref->in_tree = 1; - ref->seq = 0; - - head_ref = btrfs_delayed_node_to_head(ref); - head_ref->must_insert_reserved = must_insert_reserved; - head_ref->is_data = is_data; - - INIT_LIST_HEAD(&head_ref->cluster); - mutex_init(&head_ref->mutex); - - trace_btrfs_delayed_ref_head(ref, head_ref, action); - - existing = tree_insert(&delayed_refs->root, &ref->rb_node); - - if (existing) { - update_existing_head_ref(existing, ref); - /* - * we've updated the existing ref, free the newly - * allocated ref - */ - kfree(head_ref); - } else { - delayed_refs->num_heads++; - delayed_refs->num_heads_ready++; - delayed_refs->num_entries++; - trans->delayed_ref_updates++; - } -} - -/* - * helper to insert a delayed tree ref into the rbtree. 
- */ -static noinline void add_delayed_tree_ref(struct btrfs_fs_info *fs_info, - struct btrfs_trans_handle *trans, - struct btrfs_delayed_ref_node *ref, - u64 bytenr, u64 num_bytes, u64 parent, - u64 ref_root, int level, int action, - int for_cow) -{ - struct btrfs_delayed_ref_node *existing; - struct btrfs_delayed_tree_ref *full_ref; - struct btrfs_delayed_ref_root *delayed_refs; - u64 seq = 0; - - if (action == BTRFS_ADD_DELAYED_EXTENT) - action = BTRFS_ADD_DELAYED_REF; - - delayed_refs = &trans->transaction->delayed_refs; - - /* first set the basic ref node struct up */ - atomic_set(&ref->refs, 1); - ref->bytenr = bytenr; - ref->num_bytes = num_bytes; - ref->ref_mod = 1; - ref->action = action; - ref->is_head = 0; - ref->in_tree = 1; - - if (need_ref_seq(for_cow, ref_root)) - seq = inc_delayed_seq(delayed_refs); - ref->seq = seq; - - full_ref = btrfs_delayed_node_to_tree_ref(ref); - full_ref->parent = parent; - full_ref->root = ref_root; - if (parent) - ref->type = BTRFS_SHARED_BLOCK_REF_KEY; - else - ref->type = BTRFS_TREE_BLOCK_REF_KEY; - full_ref->level = level; - - trace_btrfs_delayed_tree_ref(ref, full_ref, action); - - existing = tree_insert(&delayed_refs->root, &ref->rb_node); - - if (existing) { - update_existing_ref(trans, delayed_refs, existing, ref); - /* - * we've updated the existing ref, free the newly - * allocated ref - */ - kfree(full_ref); - } else { - delayed_refs->num_entries++; - trans->delayed_ref_updates++; - } -} - -/* - * helper to insert a delayed data ref into the rbtree. - */ -static noinline void add_delayed_data_ref(struct btrfs_fs_info *fs_info, - struct btrfs_trans_handle *trans, - struct btrfs_delayed_ref_node *ref, - u64 bytenr, u64 num_bytes, u64 parent, - u64 ref_root, u64 owner, u64 offset, - int action, int for_cow) -{ - struct btrfs_delayed_ref_node *existing; - struct btrfs_delayed_data_ref *full_ref; - struct btrfs_delayed_ref_root *delayed_refs; - u64 seq = 0; - - if (action == BTRFS_ADD_DELAYED_EXTENT) - action = BTRFS_ADD_DELAYED_REF; - - delayed_refs = &trans->transaction->delayed_refs; - - /* first set the basic ref node struct up */ - atomic_set(&ref->refs, 1); - ref->bytenr = bytenr; - ref->num_bytes = num_bytes; - ref->ref_mod = 1; - ref->action = action; - ref->is_head = 0; - ref->in_tree = 1; - - if (need_ref_seq(for_cow, ref_root)) - seq = inc_delayed_seq(delayed_refs); - ref->seq = seq; - - full_ref = btrfs_delayed_node_to_data_ref(ref); - full_ref->parent = parent; - full_ref->root = ref_root; - if (parent) - ref->type = BTRFS_SHARED_DATA_REF_KEY; - else - ref->type = BTRFS_EXTENT_DATA_REF_KEY; - - full_ref->objectid = owner; - full_ref->offset = offset; - - trace_btrfs_delayed_data_ref(ref, full_ref, action); - - existing = tree_insert(&delayed_refs->root, &ref->rb_node); - - if (existing) { - update_existing_ref(trans, delayed_refs, existing, ref); - /* - * we've updated the existing ref, free the newly - * allocated ref - */ - kfree(full_ref); - } else { - delayed_refs->num_entries++; - trans->delayed_ref_updates++; - } -} - -/* - * add a delayed tree ref. This does all of the accounting required - * to make sure the delayed ref is eventually processed before this - * transaction commits. 
- */ -int btrfs_add_delayed_tree_ref(struct btrfs_fs_info *fs_info, - struct btrfs_trans_handle *trans, - u64 bytenr, u64 num_bytes, u64 parent, - u64 ref_root, int level, int action, - struct btrfs_delayed_extent_op *extent_op, - int for_cow) -{ - struct btrfs_delayed_tree_ref *ref; - struct btrfs_delayed_ref_head *head_ref; - struct btrfs_delayed_ref_root *delayed_refs; - - BUG_ON(extent_op && extent_op->is_data); - ref = kmalloc(sizeof(*ref), GFP_NOFS); - if (!ref) - return -ENOMEM; - - head_ref = kmalloc(sizeof(*head_ref), GFP_NOFS); - if (!head_ref) { - kfree(ref); - return -ENOMEM; - } - - head_ref->extent_op = extent_op; - - delayed_refs = &trans->transaction->delayed_refs; - spin_lock(&delayed_refs->lock); - - /* - * insert both the head node and the new ref without dropping - * the spin lock - */ - add_delayed_ref_head(fs_info, trans, &head_ref->node, bytenr, - num_bytes, action, 0); - - add_delayed_tree_ref(fs_info, trans, &ref->node, bytenr, - num_bytes, parent, ref_root, level, action, - for_cow); - if (!need_ref_seq(for_cow, ref_root) && - waitqueue_active(&delayed_refs->seq_wait)) - wake_up(&delayed_refs->seq_wait); - spin_unlock(&delayed_refs->lock); - return 0; -} - -/* - * add a delayed data ref. it's similar to btrfs_add_delayed_tree_ref. - */ -int btrfs_add_delayed_data_ref(struct btrfs_fs_info *fs_info, - struct btrfs_trans_handle *trans, - u64 bytenr, u64 num_bytes, - u64 parent, u64 ref_root, - u64 owner, u64 offset, int action, - struct btrfs_delayed_extent_op *extent_op, - int for_cow) -{ - struct btrfs_delayed_data_ref *ref; - struct btrfs_delayed_ref_head *head_ref; - struct btrfs_delayed_ref_root *delayed_refs; - - BUG_ON(extent_op && !extent_op->is_data); - ref = kmalloc(sizeof(*ref), GFP_NOFS); - if (!ref) - return -ENOMEM; - - head_ref = kmalloc(sizeof(*head_ref), GFP_NOFS); - if (!head_ref) { - kfree(ref); - return -ENOMEM; - } - - head_ref->extent_op = extent_op; - - delayed_refs = &trans->transaction->delayed_refs; - spin_lock(&delayed_refs->lock); - - /* - * insert both the head node and the new ref without dropping - * the spin lock - */ - add_delayed_ref_head(fs_info, trans, &head_ref->node, bytenr, - num_bytes, action, 1); - - add_delayed_data_ref(fs_info, trans, &ref->node, bytenr, - num_bytes, parent, ref_root, owner, offset, - action, for_cow); - if (!need_ref_seq(for_cow, ref_root) && - waitqueue_active(&delayed_refs->seq_wait)) - wake_up(&delayed_refs->seq_wait); - spin_unlock(&delayed_refs->lock); - return 0; -} - -int btrfs_add_delayed_extent_op(struct btrfs_fs_info *fs_info, - struct btrfs_trans_handle *trans, - u64 bytenr, u64 num_bytes, - struct btrfs_delayed_extent_op *extent_op) -{ - struct btrfs_delayed_ref_head *head_ref; - struct btrfs_delayed_ref_root *delayed_refs; - - head_ref = kmalloc(sizeof(*head_ref), GFP_NOFS); - if (!head_ref) - return -ENOMEM; - - head_ref->extent_op = extent_op; - - delayed_refs = &trans->transaction->delayed_refs; - spin_lock(&delayed_refs->lock); - - add_delayed_ref_head(fs_info, trans, &head_ref->node, bytenr, - num_bytes, BTRFS_UPDATE_DELAYED_HEAD, - extent_op->is_data); - - if (waitqueue_active(&delayed_refs->seq_wait)) - wake_up(&delayed_refs->seq_wait); - spin_unlock(&delayed_refs->lock); - return 0; -} - -/* - * this does a simple search for the head node for a given extent. - * It must be called with the delayed ref spinlock held, and it returns - * the head node if any where found, or NULL if not. 
- */ -struct btrfs_delayed_ref_head * -btrfs_find_delayed_ref_head(struct btrfs_trans_handle *trans, u64 bytenr) -{ - struct btrfs_delayed_ref_node *ref; - struct btrfs_delayed_ref_root *delayed_refs; - - delayed_refs = &trans->transaction->delayed_refs; - ref = find_ref_head(&delayed_refs->root, bytenr, NULL, 0); - if (ref) - return btrfs_delayed_node_to_head(ref); - return NULL; -} diff --git a/ANDROID_3.4.5/fs/btrfs/delayed-ref.h b/ANDROID_3.4.5/fs/btrfs/delayed-ref.h deleted file mode 100644 index d8f244d9..00000000 --- a/ANDROID_3.4.5/fs/btrfs/delayed-ref.h +++ /dev/null @@ -1,283 +0,0 @@ -/* - * Copyright (C) 2008 Oracle. All rights reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public - * License v2 as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public - * License along with this program; if not, write to the - * Free Software Foundation, Inc., 59 Temple Place - Suite 330, - * Boston, MA 021110-1307, USA. - */ -#ifndef __DELAYED_REF__ -#define __DELAYED_REF__ - -/* these are the possible values of struct btrfs_delayed_ref->action */ -#define BTRFS_ADD_DELAYED_REF 1 /* add one backref to the tree */ -#define BTRFS_DROP_DELAYED_REF 2 /* delete one backref from the tree */ -#define BTRFS_ADD_DELAYED_EXTENT 3 /* record a full extent allocation */ -#define BTRFS_UPDATE_DELAYED_HEAD 4 /* not changing ref count on head ref */ - -struct btrfs_delayed_ref_node { - struct rb_node rb_node; - - /* the starting bytenr of the extent */ - u64 bytenr; - - /* the size of the extent */ - u64 num_bytes; - - /* seq number to keep track of insertion order */ - u64 seq; - - /* ref count on this data structure */ - atomic_t refs; - - /* - * how many refs is this entry adding or deleting. For - * head refs, this may be a negative number because it is keeping - * track of the total mods done to the reference count. - * For individual refs, this will always be a positive number - * - * It may be more than one, since it is possible for a single - * parent to have more than one ref on an extent - */ - int ref_mod; - - unsigned int action:8; - unsigned int type:8; - /* is this node still in the rbtree? */ - unsigned int is_head:1; - unsigned int in_tree:1; -}; - -struct btrfs_delayed_extent_op { - struct btrfs_disk_key key; - u64 flags_to_set; - unsigned int update_key:1; - unsigned int update_flags:1; - unsigned int is_data:1; -}; - -/* - * the head refs are used to hold a lock on a given extent, which allows us - * to make sure that only one process is running the delayed refs - * at a time for a single extent. They also store the sum of all the - * reference count modifications we've queued up. - */ -struct btrfs_delayed_ref_head { - struct btrfs_delayed_ref_node node; - - /* - * the mutex is held while running the refs, and it is also - * held when checking the sum of reference modifications. - */ - struct mutex mutex; - - struct list_head cluster; - - struct btrfs_delayed_extent_op *extent_op; - /* - * when a new extent is allocated, it is just reserved in memory - * The actual extent isn't inserted into the extent allocation tree - * until the delayed ref is processed. 
must_insert_reserved is - * used to flag a delayed ref so the accounting can be updated - * when a full insert is done. - * - * It is possible the extent will be freed before it is ever - * inserted into the extent allocation tree. In this case - * we need to update the in ram accounting to properly reflect - * the free has happened. - */ - unsigned int must_insert_reserved:1; - unsigned int is_data:1; -}; - -struct btrfs_delayed_tree_ref { - struct btrfs_delayed_ref_node node; - u64 root; - u64 parent; - int level; -}; - -struct btrfs_delayed_data_ref { - struct btrfs_delayed_ref_node node; - u64 root; - u64 parent; - u64 objectid; - u64 offset; -}; - -struct btrfs_delayed_ref_root { - struct rb_root root; - - /* this spin lock protects the rbtree and the entries inside */ - spinlock_t lock; - - /* how many delayed ref updates we've queued, used by the - * throttling code - */ - unsigned long num_entries; - - /* total number of head nodes in tree */ - unsigned long num_heads; - - /* total number of head nodes ready for processing */ - unsigned long num_heads_ready; - - /* - * set when the tree is flushing before a transaction commit, - * used by the throttling code to decide if new updates need - * to be run right away - */ - int flushing; - - u64 run_delayed_start; - - /* - * seq number of delayed refs. We need to know if a backref was being - * added before the currently processed ref or afterwards. - */ - u64 seq; - - /* - * seq_list holds a list of all seq numbers that are currently being - * added to the list. While walking backrefs (btrfs_find_all_roots, - * qgroups), which might take some time, no newer ref must be processed, - * as it might influence the outcome of the walk. - */ - struct list_head seq_head; - - /* - * when the only refs we have in the list must not be processed, we want - * to wait for more refs to show up or for the end of backref walking. 
- */ - wait_queue_head_t seq_wait; -}; - -static inline void btrfs_put_delayed_ref(struct btrfs_delayed_ref_node *ref) -{ - WARN_ON(atomic_read(&ref->refs) == 0); - if (atomic_dec_and_test(&ref->refs)) { - WARN_ON(ref->in_tree); - kfree(ref); - } -} - -int btrfs_add_delayed_tree_ref(struct btrfs_fs_info *fs_info, - struct btrfs_trans_handle *trans, - u64 bytenr, u64 num_bytes, u64 parent, - u64 ref_root, int level, int action, - struct btrfs_delayed_extent_op *extent_op, - int for_cow); -int btrfs_add_delayed_data_ref(struct btrfs_fs_info *fs_info, - struct btrfs_trans_handle *trans, - u64 bytenr, u64 num_bytes, - u64 parent, u64 ref_root, - u64 owner, u64 offset, int action, - struct btrfs_delayed_extent_op *extent_op, - int for_cow); -int btrfs_add_delayed_extent_op(struct btrfs_fs_info *fs_info, - struct btrfs_trans_handle *trans, - u64 bytenr, u64 num_bytes, - struct btrfs_delayed_extent_op *extent_op); - -struct btrfs_delayed_ref_head * -btrfs_find_delayed_ref_head(struct btrfs_trans_handle *trans, u64 bytenr); -int btrfs_delayed_ref_lock(struct btrfs_trans_handle *trans, - struct btrfs_delayed_ref_head *head); -int btrfs_find_ref_cluster(struct btrfs_trans_handle *trans, - struct list_head *cluster, u64 search_start); - -struct seq_list { - struct list_head list; - u64 seq; -}; - -static inline u64 inc_delayed_seq(struct btrfs_delayed_ref_root *delayed_refs) -{ - assert_spin_locked(&delayed_refs->lock); - ++delayed_refs->seq; - return delayed_refs->seq; -} - -static inline void -btrfs_get_delayed_seq(struct btrfs_delayed_ref_root *delayed_refs, - struct seq_list *elem) -{ - assert_spin_locked(&delayed_refs->lock); - elem->seq = delayed_refs->seq; - list_add_tail(&elem->list, &delayed_refs->seq_head); -} - -static inline void -btrfs_put_delayed_seq(struct btrfs_delayed_ref_root *delayed_refs, - struct seq_list *elem) -{ - spin_lock(&delayed_refs->lock); - list_del(&elem->list); - wake_up(&delayed_refs->seq_wait); - spin_unlock(&delayed_refs->lock); -} - -int btrfs_check_delayed_seq(struct btrfs_delayed_ref_root *delayed_refs, - u64 seq); - -/* - * delayed refs with a ref_seq > 0 must be held back during backref walking. - * this only applies to items in one of the fs-trees. for_cow items never need - * to be held back, so they won't get a ref_seq number. - */ -static inline int need_ref_seq(int for_cow, u64 rootid) -{ - if (for_cow) - return 0; - - if (rootid == BTRFS_FS_TREE_OBJECTID) - return 1; - - if ((s64)rootid >= (s64)BTRFS_FIRST_FREE_OBJECTID) - return 1; - - return 0; -} - -/* - * a node might live in a head or a regular ref, this lets you - * test for the proper type to use. 
- */ -static int btrfs_delayed_ref_is_head(struct btrfs_delayed_ref_node *node) -{ - return node->is_head; -} - -/* - * helper functions to cast a node into its container - */ -static inline struct btrfs_delayed_tree_ref * -btrfs_delayed_node_to_tree_ref(struct btrfs_delayed_ref_node *node) -{ - WARN_ON(btrfs_delayed_ref_is_head(node)); - return container_of(node, struct btrfs_delayed_tree_ref, node); -} - -static inline struct btrfs_delayed_data_ref * -btrfs_delayed_node_to_data_ref(struct btrfs_delayed_ref_node *node) -{ - WARN_ON(btrfs_delayed_ref_is_head(node)); - return container_of(node, struct btrfs_delayed_data_ref, node); -} - -static inline struct btrfs_delayed_ref_head * -btrfs_delayed_node_to_head(struct btrfs_delayed_ref_node *node) -{ - WARN_ON(!btrfs_delayed_ref_is_head(node)); - return container_of(node, struct btrfs_delayed_ref_head, node); -} -#endif diff --git a/ANDROID_3.4.5/fs/btrfs/dir-item.c b/ANDROID_3.4.5/fs/btrfs/dir-item.c deleted file mode 100644 index c1a074d0..00000000 --- a/ANDROID_3.4.5/fs/btrfs/dir-item.c +++ /dev/null @@ -1,422 +0,0 @@ -/* - * Copyright (C) 2007 Oracle. All rights reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public - * License v2 as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public - * License along with this program; if not, write to the - * Free Software Foundation, Inc., 59 Temple Place - Suite 330, - * Boston, MA 021110-1307, USA. - */ - -#include "ctree.h" -#include "disk-io.h" -#include "hash.h" -#include "transaction.h" - -/* - * insert a name into a directory, doing overflow properly if there is a hash - * collision. data_size indicates how big the item inserted should be. On - * success a struct btrfs_dir_item pointer is returned, otherwise it is - * an ERR_PTR. - * - * The name is not copied into the dir item, you have to do that yourself. 
- */ -static struct btrfs_dir_item *insert_with_overflow(struct btrfs_trans_handle - *trans, - struct btrfs_root *root, - struct btrfs_path *path, - struct btrfs_key *cpu_key, - u32 data_size, - const char *name, - int name_len) -{ - int ret; - char *ptr; - struct btrfs_item *item; - struct extent_buffer *leaf; - - ret = btrfs_insert_empty_item(trans, root, path, cpu_key, data_size); - if (ret == -EEXIST) { - struct btrfs_dir_item *di; - di = btrfs_match_dir_item_name(root, path, name, name_len); - if (di) - return ERR_PTR(-EEXIST); - btrfs_extend_item(trans, root, path, data_size); - } else if (ret < 0) - return ERR_PTR(ret); - WARN_ON(ret > 0); - leaf = path->nodes[0]; - item = btrfs_item_nr(leaf, path->slots[0]); - ptr = btrfs_item_ptr(leaf, path->slots[0], char); - BUG_ON(data_size > btrfs_item_size(leaf, item)); - ptr += btrfs_item_size(leaf, item) - data_size; - return (struct btrfs_dir_item *)ptr; -} - -/* - * xattrs work a lot like directories, this inserts an xattr item - * into the tree - */ -int btrfs_insert_xattr_item(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct btrfs_path *path, u64 objectid, - const char *name, u16 name_len, - const void *data, u16 data_len) -{ - int ret = 0; - struct btrfs_dir_item *dir_item; - unsigned long name_ptr, data_ptr; - struct btrfs_key key, location; - struct btrfs_disk_key disk_key; - struct extent_buffer *leaf; - u32 data_size; - - BUG_ON(name_len + data_len > BTRFS_MAX_XATTR_SIZE(root)); - - key.objectid = objectid; - btrfs_set_key_type(&key, BTRFS_XATTR_ITEM_KEY); - key.offset = btrfs_name_hash(name, name_len); - - data_size = sizeof(*dir_item) + name_len + data_len; - dir_item = insert_with_overflow(trans, root, path, &key, data_size, - name, name_len); - if (IS_ERR(dir_item)) - return PTR_ERR(dir_item); - memset(&location, 0, sizeof(location)); - - leaf = path->nodes[0]; - btrfs_cpu_key_to_disk(&disk_key, &location); - btrfs_set_dir_item_key(leaf, dir_item, &disk_key); - btrfs_set_dir_type(leaf, dir_item, BTRFS_FT_XATTR); - btrfs_set_dir_name_len(leaf, dir_item, name_len); - btrfs_set_dir_transid(leaf, dir_item, trans->transid); - btrfs_set_dir_data_len(leaf, dir_item, data_len); - name_ptr = (unsigned long)(dir_item + 1); - data_ptr = (unsigned long)((char *)name_ptr + name_len); - - write_extent_buffer(leaf, name, name_ptr, name_len); - write_extent_buffer(leaf, data, data_ptr, data_len); - btrfs_mark_buffer_dirty(path->nodes[0]); - - return ret; -} - -/* - * insert a directory item in the tree, doing all the magic for - * both indexes. 'dir' indicates which objectid to insert it into, - * 'location' is the key to stuff into the directory item, 'type' is the - * type of the inode we're pointing to, and 'index' is the sequence number - * to use for the second index (if one is created). 
- * Will return 0 or -ENOMEM - */ -int btrfs_insert_dir_item(struct btrfs_trans_handle *trans, struct btrfs_root - *root, const char *name, int name_len, - struct inode *dir, struct btrfs_key *location, - u8 type, u64 index) -{ - int ret = 0; - int ret2 = 0; - struct btrfs_path *path; - struct btrfs_dir_item *dir_item; - struct extent_buffer *leaf; - unsigned long name_ptr; - struct btrfs_key key; - struct btrfs_disk_key disk_key; - u32 data_size; - - key.objectid = btrfs_ino(dir); - btrfs_set_key_type(&key, BTRFS_DIR_ITEM_KEY); - key.offset = btrfs_name_hash(name, name_len); - - path = btrfs_alloc_path(); - if (!path) - return -ENOMEM; - path->leave_spinning = 1; - - btrfs_cpu_key_to_disk(&disk_key, location); - - data_size = sizeof(*dir_item) + name_len; - dir_item = insert_with_overflow(trans, root, path, &key, data_size, - name, name_len); - if (IS_ERR(dir_item)) { - ret = PTR_ERR(dir_item); - if (ret == -EEXIST) - goto second_insert; - goto out_free; - } - - leaf = path->nodes[0]; - btrfs_set_dir_item_key(leaf, dir_item, &disk_key); - btrfs_set_dir_type(leaf, dir_item, type); - btrfs_set_dir_data_len(leaf, dir_item, 0); - btrfs_set_dir_name_len(leaf, dir_item, name_len); - btrfs_set_dir_transid(leaf, dir_item, trans->transid); - name_ptr = (unsigned long)(dir_item + 1); - - write_extent_buffer(leaf, name, name_ptr, name_len); - btrfs_mark_buffer_dirty(leaf); - -second_insert: - /* FIXME, use some real flag for selecting the extra index */ - if (root == root->fs_info->tree_root) { - ret = 0; - goto out_free; - } - btrfs_release_path(path); - - ret2 = btrfs_insert_delayed_dir_index(trans, root, name, name_len, dir, - &disk_key, type, index); -out_free: - btrfs_free_path(path); - if (ret) - return ret; - if (ret2) - return ret2; - return 0; -} - -/* - * lookup a directory item based on name. 'dir' is the objectid - * we're searching in, and 'mod' tells us if you plan on deleting the - * item (use mod < 0) or changing the options (use mod > 0) - */ -struct btrfs_dir_item *btrfs_lookup_dir_item(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct btrfs_path *path, u64 dir, - const char *name, int name_len, - int mod) -{ - int ret; - struct btrfs_key key; - int ins_len = mod < 0 ? -1 : 0; - int cow = mod != 0; - - key.objectid = dir; - btrfs_set_key_type(&key, BTRFS_DIR_ITEM_KEY); - - key.offset = btrfs_name_hash(name, name_len); - - ret = btrfs_search_slot(trans, root, &key, path, ins_len, cow); - if (ret < 0) - return ERR_PTR(ret); - if (ret > 0) - return NULL; - - return btrfs_match_dir_item_name(root, path, name, name_len); -} - -/* - * lookup a directory item based on index. 'dir' is the objectid - * we're searching in, and 'mod' tells us if you plan on deleting the - * item (use mod < 0) or changing the options (use mod > 0) - * - * The name is used to make sure the index really points to the name you were - * looking for. - */ -struct btrfs_dir_item * -btrfs_lookup_dir_index_item(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct btrfs_path *path, u64 dir, - u64 objectid, const char *name, int name_len, - int mod) -{ - int ret; - struct btrfs_key key; - int ins_len = mod < 0 ? 
-1 : 0; - int cow = mod != 0; - - key.objectid = dir; - btrfs_set_key_type(&key, BTRFS_DIR_INDEX_KEY); - key.offset = objectid; - - ret = btrfs_search_slot(trans, root, &key, path, ins_len, cow); - if (ret < 0) - return ERR_PTR(ret); - if (ret > 0) - return ERR_PTR(-ENOENT); - return btrfs_match_dir_item_name(root, path, name, name_len); -} - -struct btrfs_dir_item * -btrfs_search_dir_index_item(struct btrfs_root *root, - struct btrfs_path *path, u64 dirid, - const char *name, int name_len) -{ - struct extent_buffer *leaf; - struct btrfs_dir_item *di; - struct btrfs_key key; - u32 nritems; - int ret; - - key.objectid = dirid; - key.type = BTRFS_DIR_INDEX_KEY; - key.offset = 0; - - ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); - if (ret < 0) - return ERR_PTR(ret); - - leaf = path->nodes[0]; - nritems = btrfs_header_nritems(leaf); - - while (1) { - if (path->slots[0] >= nritems) { - ret = btrfs_next_leaf(root, path); - if (ret < 0) - return ERR_PTR(ret); - if (ret > 0) - break; - leaf = path->nodes[0]; - nritems = btrfs_header_nritems(leaf); - continue; - } - - btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); - if (key.objectid != dirid || key.type != BTRFS_DIR_INDEX_KEY) - break; - - di = btrfs_match_dir_item_name(root, path, name, name_len); - if (di) - return di; - - path->slots[0]++; - } - return NULL; -} - -struct btrfs_dir_item *btrfs_lookup_xattr(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct btrfs_path *path, u64 dir, - const char *name, u16 name_len, - int mod) -{ - int ret; - struct btrfs_key key; - int ins_len = mod < 0 ? -1 : 0; - int cow = mod != 0; - - key.objectid = dir; - btrfs_set_key_type(&key, BTRFS_XATTR_ITEM_KEY); - key.offset = btrfs_name_hash(name, name_len); - ret = btrfs_search_slot(trans, root, &key, path, ins_len, cow); - if (ret < 0) - return ERR_PTR(ret); - if (ret > 0) - return NULL; - - return btrfs_match_dir_item_name(root, path, name, name_len); -} - -/* - * helper function to look at the directory item pointed to by 'path' - * this walks through all the entries in a dir item and finds one - * for a specific name. - */ -struct btrfs_dir_item *btrfs_match_dir_item_name(struct btrfs_root *root, - struct btrfs_path *path, - const char *name, int name_len) -{ - struct btrfs_dir_item *dir_item; - unsigned long name_ptr; - u32 total_len; - u32 cur = 0; - u32 this_len; - struct extent_buffer *leaf; - - leaf = path->nodes[0]; - dir_item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dir_item); - if (verify_dir_item(root, leaf, dir_item)) - return NULL; - - total_len = btrfs_item_size_nr(leaf, path->slots[0]); - while (cur < total_len) { - this_len = sizeof(*dir_item) + - btrfs_dir_name_len(leaf, dir_item) + - btrfs_dir_data_len(leaf, dir_item); - name_ptr = (unsigned long)(dir_item + 1); - - if (btrfs_dir_name_len(leaf, dir_item) == name_len && - memcmp_extent_buffer(leaf, name, name_ptr, name_len) == 0) - return dir_item; - - cur += this_len; - dir_item = (struct btrfs_dir_item *)((char *)dir_item + - this_len); - } - return NULL; -} - -/* - * given a pointer into a directory item, delete it. This - * handles items that have more than one entry in them. 
- */ -int btrfs_delete_one_dir_name(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct btrfs_path *path, - struct btrfs_dir_item *di) -{ - - struct extent_buffer *leaf; - u32 sub_item_len; - u32 item_len; - int ret = 0; - - leaf = path->nodes[0]; - sub_item_len = sizeof(*di) + btrfs_dir_name_len(leaf, di) + - btrfs_dir_data_len(leaf, di); - item_len = btrfs_item_size_nr(leaf, path->slots[0]); - if (sub_item_len == item_len) { - ret = btrfs_del_item(trans, root, path); - } else { - /* MARKER */ - unsigned long ptr = (unsigned long)di; - unsigned long start; - - start = btrfs_item_ptr_offset(leaf, path->slots[0]); - memmove_extent_buffer(leaf, ptr, ptr + sub_item_len, - item_len - (ptr + sub_item_len - start)); - btrfs_truncate_item(trans, root, path, - item_len - sub_item_len, 1); - } - return ret; -} - -int verify_dir_item(struct btrfs_root *root, - struct extent_buffer *leaf, - struct btrfs_dir_item *dir_item) -{ - u16 namelen = BTRFS_NAME_LEN; - u8 type = btrfs_dir_type(leaf, dir_item); - - if (type >= BTRFS_FT_MAX) { - printk(KERN_CRIT "btrfs: invalid dir item type: %d\n", - (int)type); - return 1; - } - - if (type == BTRFS_FT_XATTR) - namelen = XATTR_NAME_MAX; - - if (btrfs_dir_name_len(leaf, dir_item) > namelen) { - printk(KERN_CRIT "btrfs: invalid dir item name len: %u\n", - (unsigned)btrfs_dir_data_len(leaf, dir_item)); - return 1; - } - - /* BTRFS_MAX_XATTR_SIZE is the same for all dir items */ - if (btrfs_dir_data_len(leaf, dir_item) > BTRFS_MAX_XATTR_SIZE(root)) { - printk(KERN_CRIT "btrfs: invalid dir item data len: %u\n", - (unsigned)btrfs_dir_data_len(leaf, dir_item)); - return 1; - } - - return 0; -} diff --git a/ANDROID_3.4.5/fs/btrfs/disk-io.c b/ANDROID_3.4.5/fs/btrfs/disk-io.c deleted file mode 100644 index a7ffc88a..00000000 --- a/ANDROID_3.4.5/fs/btrfs/disk-io.c +++ /dev/null @@ -1,3693 +0,0 @@ -/* - * Copyright (C) 2007 Oracle. All rights reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public - * License v2 as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public - * License along with this program; if not, write to the - * Free Software Foundation, Inc., 59 Temple Place - Suite 330, - * Boston, MA 021110-1307, USA. 
- */ - -#include <linux/fs.h> -#include <linux/blkdev.h> -#include <linux/scatterlist.h> -#include <linux/swap.h> -#include <linux/radix-tree.h> -#include <linux/writeback.h> -#include <linux/buffer_head.h> -#include <linux/workqueue.h> -#include <linux/kthread.h> -#include <linux/freezer.h> -#include <linux/crc32c.h> -#include <linux/slab.h> -#include <linux/migrate.h> -#include <linux/ratelimit.h> -#include <asm/unaligned.h> -#include "compat.h" -#include "ctree.h" -#include "disk-io.h" -#include "transaction.h" -#include "btrfs_inode.h" -#include "volumes.h" -#include "print-tree.h" -#include "async-thread.h" -#include "locking.h" -#include "tree-log.h" -#include "free-space-cache.h" -#include "inode-map.h" -#include "check-integrity.h" - -static struct extent_io_ops btree_extent_io_ops; -static void end_workqueue_fn(struct btrfs_work *work); -static void free_fs_root(struct btrfs_root *root); -static int btrfs_check_super_valid(struct btrfs_fs_info *fs_info, - int read_only); -static void btrfs_destroy_ordered_operations(struct btrfs_root *root); -static void btrfs_destroy_ordered_extents(struct btrfs_root *root); -static int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans, - struct btrfs_root *root); -static void btrfs_destroy_pending_snapshots(struct btrfs_transaction *t); -static void btrfs_destroy_delalloc_inodes(struct btrfs_root *root); -static int btrfs_destroy_marked_extents(struct btrfs_root *root, - struct extent_io_tree *dirty_pages, - int mark); -static int btrfs_destroy_pinned_extent(struct btrfs_root *root, - struct extent_io_tree *pinned_extents); - -/* - * end_io_wq structs are used to do processing in task context when an IO is - * complete. This is used during reads to verify checksums, and it is used - * by writes to insert metadata for new file extents after IO is complete. - */ -struct end_io_wq { - struct bio *bio; - bio_end_io_t *end_io; - void *private; - struct btrfs_fs_info *info; - int error; - int metadata; - struct list_head list; - struct btrfs_work work; -}; - -/* - * async submit bios are used to offload expensive checksumming - * onto the worker threads. They checksum file and metadata bios - * just before they are sent down the IO stack. - */ -struct async_submit_bio { - struct inode *inode; - struct bio *bio; - struct list_head list; - extent_submit_bio_hook_t *submit_bio_start; - extent_submit_bio_hook_t *submit_bio_done; - int rw; - int mirror_num; - unsigned long bio_flags; - /* - * bio_offset is optional, can be used if the pages in the bio - * can't tell us where in the file the bio should go - */ - u64 bio_offset; - struct btrfs_work work; - int error; -}; - -/* - * Lockdep class keys for extent_buffer->lock's in this root. For a given - * eb, the lockdep key is determined by the btrfs_root it belongs to and - * the level the eb occupies in the tree. - * - * Different roots are used for different purposes and may nest inside each - * other and they require separate keysets. As lockdep keys should be - * static, assign keysets according to the purpose of the root as indicated - * by btrfs_root->objectid. This ensures that all special purpose roots - * have separate keysets. - * - * Lock-nesting across peer nodes is always done with the immediate parent - * node locked thus preventing deadlock. As lockdep doesn't know this, use - * subclass to avoid triggering lockdep warning in such cases. - * - * The key is set by the readpage_end_io_hook after the buffer has passed - * csum validation but before the pages are unlocked. 
It is also set by - * btrfs_init_new_buffer on freshly allocated blocks. - * - * We also add a check to make sure the highest level of the tree is the - * same as our lockdep setup here. If BTRFS_MAX_LEVEL changes, this code - * needs update as well. - */ -#ifdef CONFIG_DEBUG_LOCK_ALLOC -# if BTRFS_MAX_LEVEL != 8 -# error -# endif - -static struct btrfs_lockdep_keyset { - u64 id; /* root objectid */ - const char *name_stem; /* lock name stem */ - char names[BTRFS_MAX_LEVEL + 1][20]; - struct lock_class_key keys[BTRFS_MAX_LEVEL + 1]; -} btrfs_lockdep_keysets[] = { - { .id = BTRFS_ROOT_TREE_OBJECTID, .name_stem = "root" }, - { .id = BTRFS_EXTENT_TREE_OBJECTID, .name_stem = "extent" }, - { .id = BTRFS_CHUNK_TREE_OBJECTID, .name_stem = "chunk" }, - { .id = BTRFS_DEV_TREE_OBJECTID, .name_stem = "dev" }, - { .id = BTRFS_FS_TREE_OBJECTID, .name_stem = "fs" }, - { .id = BTRFS_CSUM_TREE_OBJECTID, .name_stem = "csum" }, - { .id = BTRFS_ORPHAN_OBJECTID, .name_stem = "orphan" }, - { .id = BTRFS_TREE_LOG_OBJECTID, .name_stem = "log" }, - { .id = BTRFS_TREE_RELOC_OBJECTID, .name_stem = "treloc" }, - { .id = BTRFS_DATA_RELOC_TREE_OBJECTID, .name_stem = "dreloc" }, - { .id = 0, .name_stem = "tree" }, -}; - -void __init btrfs_init_lockdep(void) -{ - int i, j; - - /* initialize lockdep class names */ - for (i = 0; i < ARRAY_SIZE(btrfs_lockdep_keysets); i++) { - struct btrfs_lockdep_keyset *ks = &btrfs_lockdep_keysets[i]; - - for (j = 0; j < ARRAY_SIZE(ks->names); j++) - snprintf(ks->names[j], sizeof(ks->names[j]), - "btrfs-%s-%02d", ks->name_stem, j); - } -} - -void btrfs_set_buffer_lockdep_class(u64 objectid, struct extent_buffer *eb, - int level) -{ - struct btrfs_lockdep_keyset *ks; - - BUG_ON(level >= ARRAY_SIZE(ks->keys)); - - /* find the matching keyset, id 0 is the default entry */ - for (ks = btrfs_lockdep_keysets; ks->id; ks++) - if (ks->id == objectid) - break; - - lockdep_set_class_and_name(&eb->lock, - &ks->keys[level], ks->names[level]); -} - -#endif - -/* - * extents on the btree inode are pretty simple, there's one extent - * that covers the entire device - */ -static struct extent_map *btree_get_extent(struct inode *inode, - struct page *page, size_t pg_offset, u64 start, u64 len, - int create) -{ - struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree; - struct extent_map *em; - int ret; - - read_lock(&em_tree->lock); - em = lookup_extent_mapping(em_tree, start, len); - if (em) { - em->bdev = - BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev; - read_unlock(&em_tree->lock); - goto out; - } - read_unlock(&em_tree->lock); - - em = alloc_extent_map(); - if (!em) { - em = ERR_PTR(-ENOMEM); - goto out; - } - em->start = 0; - em->len = (u64)-1; - em->block_len = (u64)-1; - em->block_start = 0; - em->bdev = BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev; - - write_lock(&em_tree->lock); - ret = add_extent_mapping(em_tree, em); - if (ret == -EEXIST) { - u64 failed_start = em->start; - u64 failed_len = em->len; - - free_extent_map(em); - em = lookup_extent_mapping(em_tree, start, len); - if (em) { - ret = 0; - } else { - em = lookup_extent_mapping(em_tree, failed_start, - failed_len); - ret = -EIO; - } - } else if (ret) { - free_extent_map(em); - em = NULL; - } - write_unlock(&em_tree->lock); - - if (ret) - em = ERR_PTR(ret); -out: - return em; -} - -u32 btrfs_csum_data(struct btrfs_root *root, char *data, u32 seed, size_t len) -{ - return crc32c(seed, data, len); -} - -void btrfs_csum_final(u32 crc, char *result) -{ - put_unaligned_le32(~crc, result); -} - -/* - * compute the 
csum for a btree block, and either verify it or write it - * into the csum field of the block. - */ -static int csum_tree_block(struct btrfs_root *root, struct extent_buffer *buf, - int verify) -{ - u16 csum_size = btrfs_super_csum_size(root->fs_info->super_copy); - char *result = NULL; - unsigned long len; - unsigned long cur_len; - unsigned long offset = BTRFS_CSUM_SIZE; - char *kaddr; - unsigned long map_start; - unsigned long map_len; - int err; - u32 crc = ~(u32)0; - unsigned long inline_result; - - len = buf->len - offset; - while (len > 0) { - err = map_private_extent_buffer(buf, offset, 32, - &kaddr, &map_start, &map_len); - if (err) - return 1; - cur_len = min(len, map_len - (offset - map_start)); - crc = btrfs_csum_data(root, kaddr + offset - map_start, - crc, cur_len); - len -= cur_len; - offset += cur_len; - } - if (csum_size > sizeof(inline_result)) { - result = kzalloc(csum_size * sizeof(char), GFP_NOFS); - if (!result) - return 1; - } else { - result = (char *)&inline_result; - } - - btrfs_csum_final(crc, result); - - if (verify) { - if (memcmp_extent_buffer(buf, result, 0, csum_size)) { - u32 val; - u32 found = 0; - memcpy(&found, result, csum_size); - - read_extent_buffer(buf, &val, 0, csum_size); - printk_ratelimited(KERN_INFO "btrfs: %s checksum verify " - "failed on %llu wanted %X found %X " - "level %d\n", - root->fs_info->sb->s_id, - (unsigned long long)buf->start, val, found, - btrfs_header_level(buf)); - if (result != (char *)&inline_result) - kfree(result); - return 1; - } - } else { - write_extent_buffer(buf, result, 0, csum_size); - } - if (result != (char *)&inline_result) - kfree(result); - return 0; -} - -/* - * we can't consider a given block up to date unless the transid of the - * block matches the transid in the parent node's pointer. This is how we - * detect blocks that either didn't get written at all or got written - * in the wrong place. - */ -static int verify_parent_transid(struct extent_io_tree *io_tree, - struct extent_buffer *eb, u64 parent_transid, - int atomic) -{ - struct extent_state *cached_state = NULL; - int ret; - - if (!parent_transid || btrfs_header_generation(eb) == parent_transid) - return 0; - - if (atomic) - return -EAGAIN; - - lock_extent_bits(io_tree, eb->start, eb->start + eb->len - 1, - 0, &cached_state); - if (extent_buffer_uptodate(eb) && - btrfs_header_generation(eb) == parent_transid) { - ret = 0; - goto out; - } - printk_ratelimited("parent transid verify failed on %llu wanted %llu " - "found %llu\n", - (unsigned long long)eb->start, - (unsigned long long)parent_transid, - (unsigned long long)btrfs_header_generation(eb)); - ret = 1; - clear_extent_buffer_uptodate(eb); -out: - unlock_extent_cached(io_tree, eb->start, eb->start + eb->len - 1, - &cached_state, GFP_NOFS); - return ret; -} - -/* - * helper to read a given tree block, doing retries as required when - * the checksums don't match and we have alternate mirrors to try. 
- */ -static int btree_read_extent_buffer_pages(struct btrfs_root *root, - struct extent_buffer *eb, - u64 start, u64 parent_transid) -{ - struct extent_io_tree *io_tree; - int failed = 0; - int ret; - int num_copies = 0; - int mirror_num = 0; - int failed_mirror = 0; - - clear_bit(EXTENT_BUFFER_CORRUPT, &eb->bflags); - io_tree = &BTRFS_I(root->fs_info->btree_inode)->io_tree; - while (1) { - ret = read_extent_buffer_pages(io_tree, eb, start, - WAIT_COMPLETE, - btree_get_extent, mirror_num); - if (!ret && !verify_parent_transid(io_tree, eb, - parent_transid, 0)) - break; - - /* - * This buffer's crc is fine, but its contents are corrupted, so - * there is no reason to read the other copies, they won't be - * any less wrong. - */ - if (test_bit(EXTENT_BUFFER_CORRUPT, &eb->bflags)) - break; - - num_copies = btrfs_num_copies(&root->fs_info->mapping_tree, - eb->start, eb->len); - if (num_copies == 1) - break; - - if (!failed_mirror) { - failed = 1; - failed_mirror = eb->read_mirror; - } - - mirror_num++; - if (mirror_num == failed_mirror) - mirror_num++; - - if (mirror_num > num_copies) - break; - } - - if (failed && !ret) - repair_eb_io_failure(root, eb, failed_mirror); - - return ret; -} - -/* - * checksum a dirty tree block before IO. This has extra checks to make sure - * we only fill in the checksum field in the first page of a multi-page block - */ - -static int csum_dirty_buffer(struct btrfs_root *root, struct page *page) -{ - struct extent_io_tree *tree; - u64 start = (u64)page->index << PAGE_CACHE_SHIFT; - u64 found_start; - struct extent_buffer *eb; - - tree = &BTRFS_I(page->mapping->host)->io_tree; - - eb = (struct extent_buffer *)page->private; - if (page != eb->pages[0]) - return 0; - found_start = btrfs_header_bytenr(eb); - if (found_start != start) { - WARN_ON(1); - return 0; - } - if (eb->pages[0] != page) { - WARN_ON(1); - return 0; - } - if (!PageUptodate(page)) { - WARN_ON(1); - return 0; - } - csum_tree_block(root, eb, 0); - return 0; -} - -static int check_tree_block_fsid(struct btrfs_root *root, - struct extent_buffer *eb) -{ - struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices; - u8 fsid[BTRFS_UUID_SIZE]; - int ret = 1; - - read_extent_buffer(eb, fsid, (unsigned long)btrfs_header_fsid(eb), - BTRFS_FSID_SIZE); - while (fs_devices) { - if (!memcmp(fsid, fs_devices->fsid, BTRFS_FSID_SIZE)) { - ret = 0; - break; - } - fs_devices = fs_devices->seed; - } - return ret; -} - -#define CORRUPT(reason, eb, root, slot) \ - printk(KERN_CRIT "btrfs: corrupt leaf, %s: block=%llu," \ - "root=%llu, slot=%d\n", reason, \ - (unsigned long long)btrfs_header_bytenr(eb), \ - (unsigned long long)root->objectid, slot) - -static noinline int check_leaf(struct btrfs_root *root, - struct extent_buffer *leaf) -{ - struct btrfs_key key; - struct btrfs_key leaf_key; - u32 nritems = btrfs_header_nritems(leaf); - int slot; - - if (nritems == 0) - return 0; - - /* Check the 0 item */ - if (btrfs_item_offset_nr(leaf, 0) + btrfs_item_size_nr(leaf, 0) != - BTRFS_LEAF_DATA_SIZE(root)) { - CORRUPT("invalid item offset size pair", leaf, root, 0); - return -EIO; - } - - /* - * Check to make sure each items keys are in the correct order and their - * offsets make sense. We only have to loop through nritems-1 because - * we check the current slot against the next slot, which verifies the - * next slot's offset+size makes sense and that the current's slot - * offset is correct. 
- */ - for (slot = 0; slot < nritems - 1; slot++) { - btrfs_item_key_to_cpu(leaf, &leaf_key, slot); - btrfs_item_key_to_cpu(leaf, &key, slot + 1); - - /* Make sure the keys are in the right order */ - if (btrfs_comp_cpu_keys(&leaf_key, &key) >= 0) { - CORRUPT("bad key order", leaf, root, slot); - return -EIO; - } - - /* - * Make sure the offset and ends are right, remember that the - * item data starts at the end of the leaf and grows towards the - * front. - */ - if (btrfs_item_offset_nr(leaf, slot) != - btrfs_item_end_nr(leaf, slot + 1)) { - CORRUPT("slot offset bad", leaf, root, slot); - return -EIO; - } - - /* - * Check to make sure that we don't point outside of the leaf, - * just incase all the items are consistent to eachother, but - * all point outside of the leaf. - */ - if (btrfs_item_end_nr(leaf, slot) > - BTRFS_LEAF_DATA_SIZE(root)) { - CORRUPT("slot end outside of leaf", leaf, root, slot); - return -EIO; - } - } - - return 0; -} - -struct extent_buffer *find_eb_for_page(struct extent_io_tree *tree, - struct page *page, int max_walk) -{ - struct extent_buffer *eb; - u64 start = page_offset(page); - u64 target = start; - u64 min_start; - - if (start < max_walk) - min_start = 0; - else - min_start = start - max_walk; - - while (start >= min_start) { - eb = find_extent_buffer(tree, start, 0); - if (eb) { - /* - * we found an extent buffer and it contains our page - * horray! - */ - if (eb->start <= target && - eb->start + eb->len > target) - return eb; - - /* we found an extent buffer that wasn't for us */ - free_extent_buffer(eb); - return NULL; - } - if (start == 0) - break; - start -= PAGE_CACHE_SIZE; - } - return NULL; -} - -static int btree_readpage_end_io_hook(struct page *page, u64 start, u64 end, - struct extent_state *state, int mirror) -{ - struct extent_io_tree *tree; - u64 found_start; - int found_level; - struct extent_buffer *eb; - struct btrfs_root *root = BTRFS_I(page->mapping->host)->root; - int ret = 0; - int reads_done; - - if (!page->private) - goto out; - - tree = &BTRFS_I(page->mapping->host)->io_tree; - eb = (struct extent_buffer *)page->private; - - /* the pending IO might have been the only thing that kept this buffer - * in memory. Make sure we have a ref for all this other checks - */ - extent_buffer_get(eb); - - reads_done = atomic_dec_and_test(&eb->io_pages); - if (!reads_done) - goto err; - - eb->read_mirror = mirror; - if (test_bit(EXTENT_BUFFER_IOERR, &eb->bflags)) { - ret = -EIO; - goto err; - } - - found_start = btrfs_header_bytenr(eb); - if (found_start != eb->start) { - printk_ratelimited(KERN_INFO "btrfs bad tree block start " - "%llu %llu\n", - (unsigned long long)found_start, - (unsigned long long)eb->start); - ret = -EIO; - goto err; - } - if (check_tree_block_fsid(root, eb)) { - printk_ratelimited(KERN_INFO "btrfs bad fsid on block %llu\n", - (unsigned long long)eb->start); - ret = -EIO; - goto err; - } - found_level = btrfs_header_level(eb); - - btrfs_set_buffer_lockdep_class(btrfs_header_owner(eb), - eb, found_level); - - ret = csum_tree_block(root, eb, 1); - if (ret) { - ret = -EIO; - goto err; - } - - /* - * If this is a leaf block and it is corrupt, set the corrupt bit so - * that we don't try and read the other copies of this block, just - * return -EIO. 
- */ - if (found_level == 0 && check_leaf(root, eb)) { - set_bit(EXTENT_BUFFER_CORRUPT, &eb->bflags); - ret = -EIO; - } - - if (!ret) - set_extent_buffer_uptodate(eb); -err: - if (test_bit(EXTENT_BUFFER_READAHEAD, &eb->bflags)) { - clear_bit(EXTENT_BUFFER_READAHEAD, &eb->bflags); - btree_readahead_hook(root, eb, eb->start, ret); - } - - if (ret) - clear_extent_buffer_uptodate(eb); - free_extent_buffer(eb); -out: - return ret; -} - -static int btree_io_failed_hook(struct page *page, int failed_mirror) -{ - struct extent_buffer *eb; - struct btrfs_root *root = BTRFS_I(page->mapping->host)->root; - - eb = (struct extent_buffer *)page->private; - set_bit(EXTENT_BUFFER_IOERR, &eb->bflags); - eb->read_mirror = failed_mirror; - if (test_and_clear_bit(EXTENT_BUFFER_READAHEAD, &eb->bflags)) - btree_readahead_hook(root, eb, eb->start, -EIO); - return -EIO; /* we fixed nothing */ -} - -static void end_workqueue_bio(struct bio *bio, int err) -{ - struct end_io_wq *end_io_wq = bio->bi_private; - struct btrfs_fs_info *fs_info; - - fs_info = end_io_wq->info; - end_io_wq->error = err; - end_io_wq->work.func = end_workqueue_fn; - end_io_wq->work.flags = 0; - - if (bio->bi_rw & REQ_WRITE) { - if (end_io_wq->metadata == 1) - btrfs_queue_worker(&fs_info->endio_meta_write_workers, - &end_io_wq->work); - else if (end_io_wq->metadata == 2) - btrfs_queue_worker(&fs_info->endio_freespace_worker, - &end_io_wq->work); - else - btrfs_queue_worker(&fs_info->endio_write_workers, - &end_io_wq->work); - } else { - if (end_io_wq->metadata) - btrfs_queue_worker(&fs_info->endio_meta_workers, - &end_io_wq->work); - else - btrfs_queue_worker(&fs_info->endio_workers, - &end_io_wq->work); - } -} - -/* - * For the metadata arg you want - * - * 0 - if data - * 1 - if normal metadta - * 2 - if writing to the free space cache area - */ -int btrfs_bio_wq_end_io(struct btrfs_fs_info *info, struct bio *bio, - int metadata) -{ - struct end_io_wq *end_io_wq; - end_io_wq = kmalloc(sizeof(*end_io_wq), GFP_NOFS); - if (!end_io_wq) - return -ENOMEM; - - end_io_wq->private = bio->bi_private; - end_io_wq->end_io = bio->bi_end_io; - end_io_wq->info = info; - end_io_wq->error = 0; - end_io_wq->bio = bio; - end_io_wq->metadata = metadata; - - bio->bi_private = end_io_wq; - bio->bi_end_io = end_workqueue_bio; - return 0; -} - -unsigned long btrfs_async_submit_limit(struct btrfs_fs_info *info) -{ - unsigned long limit = min_t(unsigned long, - info->workers.max_workers, - info->fs_devices->open_devices); - return 256 * limit; -} - -static void run_one_async_start(struct btrfs_work *work) -{ - struct async_submit_bio *async; - int ret; - - async = container_of(work, struct async_submit_bio, work); - ret = async->submit_bio_start(async->inode, async->rw, async->bio, - async->mirror_num, async->bio_flags, - async->bio_offset); - if (ret) - async->error = ret; -} - -static void run_one_async_done(struct btrfs_work *work) -{ - struct btrfs_fs_info *fs_info; - struct async_submit_bio *async; - int limit; - - async = container_of(work, struct async_submit_bio, work); - fs_info = BTRFS_I(async->inode)->root->fs_info; - - limit = btrfs_async_submit_limit(fs_info); - limit = limit * 2 / 3; - - atomic_dec(&fs_info->nr_async_submits); - - if (atomic_read(&fs_info->nr_async_submits) < limit && - waitqueue_active(&fs_info->async_submit_wait)) - wake_up(&fs_info->async_submit_wait); - - /* If an error occured we just want to clean up the bio and move on */ - if (async->error) { - bio_endio(async->bio, async->error); - return; - } - - 
async->submit_bio_done(async->inode, async->rw, async->bio, - async->mirror_num, async->bio_flags, - async->bio_offset); -} - -static void run_one_async_free(struct btrfs_work *work) -{ - struct async_submit_bio *async; - - async = container_of(work, struct async_submit_bio, work); - kfree(async); -} - -int btrfs_wq_submit_bio(struct btrfs_fs_info *fs_info, struct inode *inode, - int rw, struct bio *bio, int mirror_num, - unsigned long bio_flags, - u64 bio_offset, - extent_submit_bio_hook_t *submit_bio_start, - extent_submit_bio_hook_t *submit_bio_done) -{ - struct async_submit_bio *async; - - async = kmalloc(sizeof(*async), GFP_NOFS); - if (!async) - return -ENOMEM; - - async->inode = inode; - async->rw = rw; - async->bio = bio; - async->mirror_num = mirror_num; - async->submit_bio_start = submit_bio_start; - async->submit_bio_done = submit_bio_done; - - async->work.func = run_one_async_start; - async->work.ordered_func = run_one_async_done; - async->work.ordered_free = run_one_async_free; - - async->work.flags = 0; - async->bio_flags = bio_flags; - async->bio_offset = bio_offset; - - async->error = 0; - - atomic_inc(&fs_info->nr_async_submits); - - if (rw & REQ_SYNC) - btrfs_set_work_high_prio(&async->work); - - btrfs_queue_worker(&fs_info->workers, &async->work); - - while (atomic_read(&fs_info->async_submit_draining) && - atomic_read(&fs_info->nr_async_submits)) { - wait_event(fs_info->async_submit_wait, - (atomic_read(&fs_info->nr_async_submits) == 0)); - } - - return 0; -} - -static int btree_csum_one_bio(struct bio *bio) -{ - struct bio_vec *bvec = bio->bi_io_vec; - int bio_index = 0; - struct btrfs_root *root; - int ret = 0; - - WARN_ON(bio->bi_vcnt <= 0); - while (bio_index < bio->bi_vcnt) { - root = BTRFS_I(bvec->bv_page->mapping->host)->root; - ret = csum_dirty_buffer(root, bvec->bv_page); - if (ret) - break; - bio_index++; - bvec++; - } - return ret; -} - -static int __btree_submit_bio_start(struct inode *inode, int rw, - struct bio *bio, int mirror_num, - unsigned long bio_flags, - u64 bio_offset) -{ - /* - * when we're called for a write, we're already in the async - * submission context. Just jump into btrfs_map_bio - */ - return btree_csum_one_bio(bio); -} - -static int __btree_submit_bio_done(struct inode *inode, int rw, struct bio *bio, - int mirror_num, unsigned long bio_flags, - u64 bio_offset) -{ - /* - * when we're called for a write, we're already in the async - * submission context. 
Just jump into btrfs_map_bio - */ - return btrfs_map_bio(BTRFS_I(inode)->root, rw, bio, mirror_num, 1); -} - -static int btree_submit_bio_hook(struct inode *inode, int rw, struct bio *bio, - int mirror_num, unsigned long bio_flags, - u64 bio_offset) -{ - int ret; - - if (!(rw & REQ_WRITE)) { - - /* - * called for a read, do the setup so that checksum validation - * can happen in the async kernel threads - */ - ret = btrfs_bio_wq_end_io(BTRFS_I(inode)->root->fs_info, - bio, 1); - if (ret) - return ret; - return btrfs_map_bio(BTRFS_I(inode)->root, rw, bio, - mirror_num, 0); - } - - /* - * kthread helpers are used to submit writes so that checksumming - * can happen in parallel across all CPUs - */ - return btrfs_wq_submit_bio(BTRFS_I(inode)->root->fs_info, - inode, rw, bio, mirror_num, 0, - bio_offset, - __btree_submit_bio_start, - __btree_submit_bio_done); -} - -#ifdef CONFIG_MIGRATION -static int btree_migratepage(struct address_space *mapping, - struct page *newpage, struct page *page, - enum migrate_mode mode) -{ - /* - * we can't safely write a btree page from here, - * we haven't done the locking hook - */ - if (PageDirty(page)) - return -EAGAIN; - /* - * Buffers may be managed in a filesystem specific way. - * We must have no buffers or drop them. - */ - if (page_has_private(page) && - !try_to_release_page(page, GFP_KERNEL)) - return -EAGAIN; - return migrate_page(mapping, newpage, page, mode); -} -#endif - - -static int btree_writepages(struct address_space *mapping, - struct writeback_control *wbc) -{ - struct extent_io_tree *tree; - tree = &BTRFS_I(mapping->host)->io_tree; - if (wbc->sync_mode == WB_SYNC_NONE) { - struct btrfs_root *root = BTRFS_I(mapping->host)->root; - u64 num_dirty; - unsigned long thresh = 32 * 1024 * 1024; - - if (wbc->for_kupdate) - return 0; - - /* this is a bit racy, but that's ok */ - num_dirty = root->fs_info->dirty_metadata_bytes; - if (num_dirty < thresh) - return 0; - } - return btree_write_cache_pages(mapping, wbc); -} - -static int btree_readpage(struct file *file, struct page *page) -{ - struct extent_io_tree *tree; - tree = &BTRFS_I(page->mapping->host)->io_tree; - return extent_read_full_page(tree, page, btree_get_extent, 0); -} - -static int btree_releasepage(struct page *page, gfp_t gfp_flags) -{ - if (PageWriteback(page) || PageDirty(page)) - return 0; - /* - * We need to mask out eg. __GFP_HIGHMEM and __GFP_DMA32 as we're doing - * slab allocation from alloc_extent_state down the callchain where - * it'd hit a BUG_ON as those flags are not allowed. 
- */ - gfp_flags &= ~GFP_SLAB_BUG_MASK; - - return try_release_extent_buffer(page, gfp_flags); -} - -static void btree_invalidatepage(struct page *page, unsigned long offset) -{ - struct extent_io_tree *tree; - tree = &BTRFS_I(page->mapping->host)->io_tree; - extent_invalidatepage(tree, page, offset); - btree_releasepage(page, GFP_NOFS); - if (PagePrivate(page)) { - printk(KERN_WARNING "btrfs warning page private not zero " - "on page %llu\n", (unsigned long long)page_offset(page)); - ClearPagePrivate(page); - set_page_private(page, 0); - page_cache_release(page); - } -} - -static int btree_set_page_dirty(struct page *page) -{ - struct extent_buffer *eb; - - BUG_ON(!PagePrivate(page)); - eb = (struct extent_buffer *)page->private; - BUG_ON(!eb); - BUG_ON(!test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags)); - BUG_ON(!atomic_read(&eb->refs)); - btrfs_assert_tree_locked(eb); - return __set_page_dirty_nobuffers(page); -} - -static const struct address_space_operations btree_aops = { - .readpage = btree_readpage, - .writepages = btree_writepages, - .releasepage = btree_releasepage, - .invalidatepage = btree_invalidatepage, -#ifdef CONFIG_MIGRATION - .migratepage = btree_migratepage, -#endif - .set_page_dirty = btree_set_page_dirty, -}; - -int readahead_tree_block(struct btrfs_root *root, u64 bytenr, u32 blocksize, - u64 parent_transid) -{ - struct extent_buffer *buf = NULL; - struct inode *btree_inode = root->fs_info->btree_inode; - int ret = 0; - - buf = btrfs_find_create_tree_block(root, bytenr, blocksize); - if (!buf) - return 0; - read_extent_buffer_pages(&BTRFS_I(btree_inode)->io_tree, - buf, 0, WAIT_NONE, btree_get_extent, 0); - free_extent_buffer(buf); - return ret; -} - -int reada_tree_block_flagged(struct btrfs_root *root, u64 bytenr, u32 blocksize, - int mirror_num, struct extent_buffer **eb) -{ - struct extent_buffer *buf = NULL; - struct inode *btree_inode = root->fs_info->btree_inode; - struct extent_io_tree *io_tree = &BTRFS_I(btree_inode)->io_tree; - int ret; - - buf = btrfs_find_create_tree_block(root, bytenr, blocksize); - if (!buf) - return 0; - - set_bit(EXTENT_BUFFER_READAHEAD, &buf->bflags); - - ret = read_extent_buffer_pages(io_tree, buf, 0, WAIT_PAGE_LOCK, - btree_get_extent, mirror_num); - if (ret) { - free_extent_buffer(buf); - return ret; - } - - if (test_bit(EXTENT_BUFFER_CORRUPT, &buf->bflags)) { - free_extent_buffer(buf); - return -EIO; - } else if (extent_buffer_uptodate(buf)) { - *eb = buf; - } else { - free_extent_buffer(buf); - } - return 0; -} - -struct extent_buffer *btrfs_find_tree_block(struct btrfs_root *root, - u64 bytenr, u32 blocksize) -{ - struct inode *btree_inode = root->fs_info->btree_inode; - struct extent_buffer *eb; - eb = find_extent_buffer(&BTRFS_I(btree_inode)->io_tree, - bytenr, blocksize); - return eb; -} - -struct extent_buffer *btrfs_find_create_tree_block(struct btrfs_root *root, - u64 bytenr, u32 blocksize) -{ - struct inode *btree_inode = root->fs_info->btree_inode; - struct extent_buffer *eb; - - eb = alloc_extent_buffer(&BTRFS_I(btree_inode)->io_tree, - bytenr, blocksize); - return eb; -} - - -int btrfs_write_tree_block(struct extent_buffer *buf) -{ - return filemap_fdatawrite_range(buf->pages[0]->mapping, buf->start, - buf->start + buf->len - 1); -} - -int btrfs_wait_tree_block_writeback(struct extent_buffer *buf) -{ - return filemap_fdatawait_range(buf->pages[0]->mapping, - buf->start, buf->start + buf->len - 1); -} - -struct extent_buffer *read_tree_block(struct btrfs_root *root, u64 bytenr, - u32 blocksize, u64 parent_transid) -{ - struct 
extent_buffer *buf = NULL; - int ret; - - buf = btrfs_find_create_tree_block(root, bytenr, blocksize); - if (!buf) - return NULL; - - ret = btree_read_extent_buffer_pages(root, buf, 0, parent_transid); - return buf; - -} - -void clean_tree_block(struct btrfs_trans_handle *trans, struct btrfs_root *root, - struct extent_buffer *buf) -{ - if (btrfs_header_generation(buf) == - root->fs_info->running_transaction->transid) { - btrfs_assert_tree_locked(buf); - - if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &buf->bflags)) { - spin_lock(&root->fs_info->delalloc_lock); - if (root->fs_info->dirty_metadata_bytes >= buf->len) - root->fs_info->dirty_metadata_bytes -= buf->len; - else { - spin_unlock(&root->fs_info->delalloc_lock); - btrfs_panic(root->fs_info, -EOVERFLOW, - "Can't clear %lu bytes from " - " dirty_mdatadata_bytes (%lu)", - buf->len, - root->fs_info->dirty_metadata_bytes); - } - spin_unlock(&root->fs_info->delalloc_lock); - } - - /* ugh, clear_extent_buffer_dirty needs to lock the page */ - btrfs_set_lock_blocking(buf); - clear_extent_buffer_dirty(buf); - } -} - -static void __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize, - u32 stripesize, struct btrfs_root *root, - struct btrfs_fs_info *fs_info, - u64 objectid) -{ - root->node = NULL; - root->commit_root = NULL; - root->sectorsize = sectorsize; - root->nodesize = nodesize; - root->leafsize = leafsize; - root->stripesize = stripesize; - root->ref_cows = 0; - root->track_dirty = 0; - root->in_radix = 0; - root->orphan_item_inserted = 0; - root->orphan_cleanup_state = 0; - - root->objectid = objectid; - root->last_trans = 0; - root->highest_objectid = 0; - root->name = NULL; - root->inode_tree = RB_ROOT; - INIT_RADIX_TREE(&root->delayed_nodes_tree, GFP_ATOMIC); - root->block_rsv = NULL; - root->orphan_block_rsv = NULL; - - INIT_LIST_HEAD(&root->dirty_list); - INIT_LIST_HEAD(&root->orphan_list); - INIT_LIST_HEAD(&root->root_list); - spin_lock_init(&root->orphan_lock); - spin_lock_init(&root->inode_lock); - spin_lock_init(&root->accounting_lock); - mutex_init(&root->objectid_mutex); - mutex_init(&root->log_mutex); - init_waitqueue_head(&root->log_writer_wait); - init_waitqueue_head(&root->log_commit_wait[0]); - init_waitqueue_head(&root->log_commit_wait[1]); - atomic_set(&root->log_commit[0], 0); - atomic_set(&root->log_commit[1], 0); - atomic_set(&root->log_writers, 0); - root->log_batch = 0; - root->log_transid = 0; - root->last_log_commit = 0; - extent_io_tree_init(&root->dirty_log_pages, - fs_info->btree_inode->i_mapping); - - memset(&root->root_key, 0, sizeof(root->root_key)); - memset(&root->root_item, 0, sizeof(root->root_item)); - memset(&root->defrag_progress, 0, sizeof(root->defrag_progress)); - memset(&root->root_kobj, 0, sizeof(root->root_kobj)); - root->defrag_trans_start = fs_info->generation; - init_completion(&root->kobj_unregister); - root->defrag_running = 0; - root->root_key.objectid = objectid; - root->anon_dev = 0; -} - -static int __must_check find_and_setup_root(struct btrfs_root *tree_root, - struct btrfs_fs_info *fs_info, - u64 objectid, - struct btrfs_root *root) -{ - int ret; - u32 blocksize; - u64 generation; - - __setup_root(tree_root->nodesize, tree_root->leafsize, - tree_root->sectorsize, tree_root->stripesize, - root, fs_info, objectid); - ret = btrfs_find_last_root(tree_root, objectid, - &root->root_item, &root->root_key); - if (ret > 0) - return -ENOENT; - else if (ret < 0) - return ret; - - generation = btrfs_root_generation(&root->root_item); - blocksize = btrfs_level_size(root, 
btrfs_root_level(&root->root_item)); - root->commit_root = NULL; - root->node = read_tree_block(root, btrfs_root_bytenr(&root->root_item), - blocksize, generation); - if (!root->node || !btrfs_buffer_uptodate(root->node, generation, 0)) { - free_extent_buffer(root->node); - root->node = NULL; - return -EIO; - } - root->commit_root = btrfs_root_node(root); - return 0; -} - -static struct btrfs_root *btrfs_alloc_root(struct btrfs_fs_info *fs_info) -{ - struct btrfs_root *root = kzalloc(sizeof(*root), GFP_NOFS); - if (root) - root->fs_info = fs_info; - return root; -} - -static struct btrfs_root *alloc_log_tree(struct btrfs_trans_handle *trans, - struct btrfs_fs_info *fs_info) -{ - struct btrfs_root *root; - struct btrfs_root *tree_root = fs_info->tree_root; - struct extent_buffer *leaf; - - root = btrfs_alloc_root(fs_info); - if (!root) - return ERR_PTR(-ENOMEM); - - __setup_root(tree_root->nodesize, tree_root->leafsize, - tree_root->sectorsize, tree_root->stripesize, - root, fs_info, BTRFS_TREE_LOG_OBJECTID); - - root->root_key.objectid = BTRFS_TREE_LOG_OBJECTID; - root->root_key.type = BTRFS_ROOT_ITEM_KEY; - root->root_key.offset = BTRFS_TREE_LOG_OBJECTID; - /* - * log trees do not get reference counted because they go away - * before a real commit is actually done. They do store pointers - * to file data extents, and those reference counts still get - * updated (along with back refs to the log tree). - */ - root->ref_cows = 0; - - leaf = btrfs_alloc_free_block(trans, root, root->leafsize, 0, - BTRFS_TREE_LOG_OBJECTID, NULL, - 0, 0, 0, 0); - if (IS_ERR(leaf)) { - kfree(root); - return ERR_CAST(leaf); - } - - memset_extent_buffer(leaf, 0, 0, sizeof(struct btrfs_header)); - btrfs_set_header_bytenr(leaf, leaf->start); - btrfs_set_header_generation(leaf, trans->transid); - btrfs_set_header_backref_rev(leaf, BTRFS_MIXED_BACKREF_REV); - btrfs_set_header_owner(leaf, BTRFS_TREE_LOG_OBJECTID); - root->node = leaf; - - write_extent_buffer(root->node, root->fs_info->fsid, - (unsigned long)btrfs_header_fsid(root->node), - BTRFS_FSID_SIZE); - btrfs_mark_buffer_dirty(root->node); - btrfs_tree_unlock(root->node); - return root; -} - -int btrfs_init_log_root_tree(struct btrfs_trans_handle *trans, - struct btrfs_fs_info *fs_info) -{ - struct btrfs_root *log_root; - - log_root = alloc_log_tree(trans, fs_info); - if (IS_ERR(log_root)) - return PTR_ERR(log_root); - WARN_ON(fs_info->log_root_tree); - fs_info->log_root_tree = log_root; - return 0; -} - -int btrfs_add_log_tree(struct btrfs_trans_handle *trans, - struct btrfs_root *root) -{ - struct btrfs_root *log_root; - struct btrfs_inode_item *inode_item; - - log_root = alloc_log_tree(trans, root->fs_info); - if (IS_ERR(log_root)) - return PTR_ERR(log_root); - - log_root->last_trans = trans->transid; - log_root->root_key.offset = root->root_key.objectid; - - inode_item = &log_root->root_item.inode; - inode_item->generation = cpu_to_le64(1); - inode_item->size = cpu_to_le64(3); - inode_item->nlink = cpu_to_le32(1); - inode_item->nbytes = cpu_to_le64(root->leafsize); - inode_item->mode = cpu_to_le32(S_IFDIR | 0755); - - btrfs_set_root_node(&log_root->root_item, log_root->node); - - WARN_ON(root->log_root); - root->log_root = log_root; - root->log_transid = 0; - root->last_log_commit = 0; - return 0; -} - -struct btrfs_root *btrfs_read_fs_root_no_radix(struct btrfs_root *tree_root, - struct btrfs_key *location) -{ - struct btrfs_root *root; - struct btrfs_fs_info *fs_info = tree_root->fs_info; - struct btrfs_path *path; - struct extent_buffer *l; - u64 
generation; - u32 blocksize; - int ret = 0; - - root = btrfs_alloc_root(fs_info); - if (!root) - return ERR_PTR(-ENOMEM); - if (location->offset == (u64)-1) { - ret = find_and_setup_root(tree_root, fs_info, - location->objectid, root); - if (ret) { - kfree(root); - return ERR_PTR(ret); - } - goto out; - } - - __setup_root(tree_root->nodesize, tree_root->leafsize, - tree_root->sectorsize, tree_root->stripesize, - root, fs_info, location->objectid); - - path = btrfs_alloc_path(); - if (!path) { - kfree(root); - return ERR_PTR(-ENOMEM); - } - ret = btrfs_search_slot(NULL, tree_root, location, path, 0, 0); - if (ret == 0) { - l = path->nodes[0]; - read_extent_buffer(l, &root->root_item, - btrfs_item_ptr_offset(l, path->slots[0]), - sizeof(root->root_item)); - memcpy(&root->root_key, location, sizeof(*location)); - } - btrfs_free_path(path); - if (ret) { - kfree(root); - if (ret > 0) - ret = -ENOENT; - return ERR_PTR(ret); - } - - generation = btrfs_root_generation(&root->root_item); - blocksize = btrfs_level_size(root, btrfs_root_level(&root->root_item)); - root->node = read_tree_block(root, btrfs_root_bytenr(&root->root_item), - blocksize, generation); - root->commit_root = btrfs_root_node(root); - BUG_ON(!root->node); /* -ENOMEM */ -out: - if (location->objectid != BTRFS_TREE_LOG_OBJECTID) { - root->ref_cows = 1; - btrfs_check_and_init_root_item(&root->root_item); - } - - return root; -} - -struct btrfs_root *btrfs_read_fs_root_no_name(struct btrfs_fs_info *fs_info, - struct btrfs_key *location) -{ - struct btrfs_root *root; - int ret; - - if (location->objectid == BTRFS_ROOT_TREE_OBJECTID) - return fs_info->tree_root; - if (location->objectid == BTRFS_EXTENT_TREE_OBJECTID) - return fs_info->extent_root; - if (location->objectid == BTRFS_CHUNK_TREE_OBJECTID) - return fs_info->chunk_root; - if (location->objectid == BTRFS_DEV_TREE_OBJECTID) - return fs_info->dev_root; - if (location->objectid == BTRFS_CSUM_TREE_OBJECTID) - return fs_info->csum_root; -again: - spin_lock(&fs_info->fs_roots_radix_lock); - root = radix_tree_lookup(&fs_info->fs_roots_radix, - (unsigned long)location->objectid); - spin_unlock(&fs_info->fs_roots_radix_lock); - if (root) - return root; - - root = btrfs_read_fs_root_no_radix(fs_info->tree_root, location); - if (IS_ERR(root)) - return root; - - root->free_ino_ctl = kzalloc(sizeof(*root->free_ino_ctl), GFP_NOFS); - root->free_ino_pinned = kzalloc(sizeof(*root->free_ino_pinned), - GFP_NOFS); - if (!root->free_ino_pinned || !root->free_ino_ctl) { - ret = -ENOMEM; - goto fail; - } - - btrfs_init_free_ino_ctl(root); - mutex_init(&root->fs_commit_mutex); - spin_lock_init(&root->cache_lock); - init_waitqueue_head(&root->cache_wait); - - ret = get_anon_bdev(&root->anon_dev); - if (ret) - goto fail; - - if (btrfs_root_refs(&root->root_item) == 0) { - ret = -ENOENT; - goto fail; - } - - ret = btrfs_find_orphan_item(fs_info->tree_root, location->objectid); - if (ret < 0) - goto fail; - if (ret == 0) - root->orphan_item_inserted = 1; - - ret = radix_tree_preload(GFP_NOFS & ~__GFP_HIGHMEM); - if (ret) - goto fail; - - spin_lock(&fs_info->fs_roots_radix_lock); - ret = radix_tree_insert(&fs_info->fs_roots_radix, - (unsigned long)root->root_key.objectid, - root); - if (ret == 0) - root->in_radix = 1; - - spin_unlock(&fs_info->fs_roots_radix_lock); - radix_tree_preload_end(); - if (ret) { - if (ret == -EEXIST) { - free_fs_root(root); - goto again; - } - goto fail; - } - - ret = btrfs_find_dead_roots(fs_info->tree_root, - root->root_key.objectid); - WARN_ON(ret); - return root; -fail: - 
free_fs_root(root); - return ERR_PTR(ret); -} - -static int btrfs_congested_fn(void *congested_data, int bdi_bits) -{ - struct btrfs_fs_info *info = (struct btrfs_fs_info *)congested_data; - int ret = 0; - struct btrfs_device *device; - struct backing_dev_info *bdi; - - rcu_read_lock(); - list_for_each_entry_rcu(device, &info->fs_devices->devices, dev_list) { - if (!device->bdev) - continue; - bdi = blk_get_backing_dev_info(device->bdev); - if (bdi && bdi_congested(bdi, bdi_bits)) { - ret = 1; - break; - } - } - rcu_read_unlock(); - return ret; -} - -/* - * If this fails, caller must call bdi_destroy() to get rid of the - * bdi again. - */ -static int setup_bdi(struct btrfs_fs_info *info, struct backing_dev_info *bdi) -{ - int err; - - bdi->capabilities = BDI_CAP_MAP_COPY; - err = bdi_setup_and_register(bdi, "btrfs", BDI_CAP_MAP_COPY); - if (err) - return err; - - bdi->ra_pages = default_backing_dev_info.ra_pages; - bdi->congested_fn = btrfs_congested_fn; - bdi->congested_data = info; - return 0; -} - -/* - * called by the kthread helper functions to finally call the bio end_io - * functions. This is where read checksum verification actually happens - */ -static void end_workqueue_fn(struct btrfs_work *work) -{ - struct bio *bio; - struct end_io_wq *end_io_wq; - struct btrfs_fs_info *fs_info; - int error; - - end_io_wq = container_of(work, struct end_io_wq, work); - bio = end_io_wq->bio; - fs_info = end_io_wq->info; - - error = end_io_wq->error; - bio->bi_private = end_io_wq->private; - bio->bi_end_io = end_io_wq->end_io; - kfree(end_io_wq); - bio_endio(bio, error); -} - -static int cleaner_kthread(void *arg) -{ - struct btrfs_root *root = arg; - - do { - vfs_check_frozen(root->fs_info->sb, SB_FREEZE_WRITE); - - if (!(root->fs_info->sb->s_flags & MS_RDONLY) && - mutex_trylock(&root->fs_info->cleaner_mutex)) { - btrfs_run_delayed_iputs(root); - btrfs_clean_old_snapshots(root); - mutex_unlock(&root->fs_info->cleaner_mutex); - btrfs_run_defrag_inodes(root->fs_info); - } - - if (!try_to_freeze()) { - set_current_state(TASK_INTERRUPTIBLE); - if (!kthread_should_stop()) - schedule(); - __set_current_state(TASK_RUNNING); - } - } while (!kthread_should_stop()); - return 0; -} - -static int transaction_kthread(void *arg) -{ - struct btrfs_root *root = arg; - struct btrfs_trans_handle *trans; - struct btrfs_transaction *cur; - u64 transid; - unsigned long now; - unsigned long delay; - bool cannot_commit; - - do { - cannot_commit = false; - delay = HZ * 30; - vfs_check_frozen(root->fs_info->sb, SB_FREEZE_WRITE); - mutex_lock(&root->fs_info->transaction_kthread_mutex); - - spin_lock(&root->fs_info->trans_lock); - cur = root->fs_info->running_transaction; - if (!cur) { - spin_unlock(&root->fs_info->trans_lock); - goto sleep; - } - - now = get_seconds(); - if (!cur->blocked && - (now < cur->start_time || now - cur->start_time < 30)) { - spin_unlock(&root->fs_info->trans_lock); - delay = HZ * 5; - goto sleep; - } - transid = cur->transid; - spin_unlock(&root->fs_info->trans_lock); - - /* If the file system is aborted, this will always fail. 
*/ - trans = btrfs_join_transaction(root); - if (IS_ERR(trans)) { - cannot_commit = true; - goto sleep; - } - if (transid == trans->transid) { - btrfs_commit_transaction(trans, root); - } else { - btrfs_end_transaction(trans, root); - } -sleep: - wake_up_process(root->fs_info->cleaner_kthread); - mutex_unlock(&root->fs_info->transaction_kthread_mutex); - - if (!try_to_freeze()) { - set_current_state(TASK_INTERRUPTIBLE); - if (!kthread_should_stop() && - (!btrfs_transaction_blocked(root->fs_info) || - cannot_commit)) - schedule_timeout(delay); - __set_current_state(TASK_RUNNING); - } - } while (!kthread_should_stop()); - return 0; -} - -/* - * this will find the highest generation in the array of - * root backups. The index of the highest array is returned, - * or -1 if we can't find anything. - * - * We check to make sure the array is valid by comparing the - * generation of the latest root in the array with the generation - * in the super block. If they don't match we pitch it. - */ -static int find_newest_super_backup(struct btrfs_fs_info *info, u64 newest_gen) -{ - u64 cur; - int newest_index = -1; - struct btrfs_root_backup *root_backup; - int i; - - for (i = 0; i < BTRFS_NUM_BACKUP_ROOTS; i++) { - root_backup = info->super_copy->super_roots + i; - cur = btrfs_backup_tree_root_gen(root_backup); - if (cur == newest_gen) - newest_index = i; - } - - /* check to see if we actually wrapped around */ - if (newest_index == BTRFS_NUM_BACKUP_ROOTS - 1) { - root_backup = info->super_copy->super_roots; - cur = btrfs_backup_tree_root_gen(root_backup); - if (cur == newest_gen) - newest_index = 0; - } - return newest_index; -} - - -/* - * find the oldest backup so we know where to store new entries - * in the backup array. This will set the backup_root_index - * field in the fs_info struct - */ -static void find_oldest_super_backup(struct btrfs_fs_info *info, - u64 newest_gen) -{ - int newest_index = -1; - - newest_index = find_newest_super_backup(info, newest_gen); - /* if there was garbage in there, just move along */ - if (newest_index == -1) { - info->backup_root_index = 0; - } else { - info->backup_root_index = (newest_index + 1) % BTRFS_NUM_BACKUP_ROOTS; - } -} - -/* - * copy all the root pointers into the super backup array. 
- * this will bump the backup pointer by one when it is - * done - */ -static void backup_super_roots(struct btrfs_fs_info *info) -{ - int next_backup; - struct btrfs_root_backup *root_backup; - int last_backup; - - next_backup = info->backup_root_index; - last_backup = (next_backup + BTRFS_NUM_BACKUP_ROOTS - 1) % - BTRFS_NUM_BACKUP_ROOTS; - - /* - * just overwrite the last backup if we're at the same generation - * this happens only at umount - */ - root_backup = info->super_for_commit->super_roots + last_backup; - if (btrfs_backup_tree_root_gen(root_backup) == - btrfs_header_generation(info->tree_root->node)) - next_backup = last_backup; - - root_backup = info->super_for_commit->super_roots + next_backup; - - /* - * make sure all of our padding and empty slots get zero filled - * regardless of which ones we use today - */ - memset(root_backup, 0, sizeof(*root_backup)); - - info->backup_root_index = (next_backup + 1) % BTRFS_NUM_BACKUP_ROOTS; - - btrfs_set_backup_tree_root(root_backup, info->tree_root->node->start); - btrfs_set_backup_tree_root_gen(root_backup, - btrfs_header_generation(info->tree_root->node)); - - btrfs_set_backup_tree_root_level(root_backup, - btrfs_header_level(info->tree_root->node)); - - btrfs_set_backup_chunk_root(root_backup, info->chunk_root->node->start); - btrfs_set_backup_chunk_root_gen(root_backup, - btrfs_header_generation(info->chunk_root->node)); - btrfs_set_backup_chunk_root_level(root_backup, - btrfs_header_level(info->chunk_root->node)); - - btrfs_set_backup_extent_root(root_backup, info->extent_root->node->start); - btrfs_set_backup_extent_root_gen(root_backup, - btrfs_header_generation(info->extent_root->node)); - btrfs_set_backup_extent_root_level(root_backup, - btrfs_header_level(info->extent_root->node)); - - /* - * we might commit during log recovery, which happens before we set - * the fs_root. Make sure it is valid before we fill it in. - */ - if (info->fs_root && info->fs_root->node) { - btrfs_set_backup_fs_root(root_backup, - info->fs_root->node->start); - btrfs_set_backup_fs_root_gen(root_backup, - btrfs_header_generation(info->fs_root->node)); - btrfs_set_backup_fs_root_level(root_backup, - btrfs_header_level(info->fs_root->node)); - } - - btrfs_set_backup_dev_root(root_backup, info->dev_root->node->start); - btrfs_set_backup_dev_root_gen(root_backup, - btrfs_header_generation(info->dev_root->node)); - btrfs_set_backup_dev_root_level(root_backup, - btrfs_header_level(info->dev_root->node)); - - btrfs_set_backup_csum_root(root_backup, info->csum_root->node->start); - btrfs_set_backup_csum_root_gen(root_backup, - btrfs_header_generation(info->csum_root->node)); - btrfs_set_backup_csum_root_level(root_backup, - btrfs_header_level(info->csum_root->node)); - - btrfs_set_backup_total_bytes(root_backup, - btrfs_super_total_bytes(info->super_copy)); - btrfs_set_backup_bytes_used(root_backup, - btrfs_super_bytes_used(info->super_copy)); - btrfs_set_backup_num_devices(root_backup, - btrfs_super_num_devices(info->super_copy)); - - /* - * if we don't copy this out to the super_copy, it won't get remembered - * for the next commit - */ - memcpy(&info->super_copy->super_roots, - &info->super_for_commit->super_roots, - sizeof(*root_backup) * BTRFS_NUM_BACKUP_ROOTS); -} - -/* - * this copies info out of the root backup array and back into - * the in-memory super block. It is meant to help iterate through - * the array, so you send it the number of backups you've already - * tried and the last backup index you used. 
- * - * this returns -1 when it has tried all the backups - */ -static noinline int next_root_backup(struct btrfs_fs_info *info, - struct btrfs_super_block *super, - int *num_backups_tried, int *backup_index) -{ - struct btrfs_root_backup *root_backup; - int newest = *backup_index; - - if (*num_backups_tried == 0) { - u64 gen = btrfs_super_generation(super); - - newest = find_newest_super_backup(info, gen); - if (newest == -1) - return -1; - - *backup_index = newest; - *num_backups_tried = 1; - } else if (*num_backups_tried == BTRFS_NUM_BACKUP_ROOTS) { - /* we've tried all the backups, all done */ - return -1; - } else { - /* jump to the next oldest backup */ - newest = (*backup_index + BTRFS_NUM_BACKUP_ROOTS - 1) % - BTRFS_NUM_BACKUP_ROOTS; - *backup_index = newest; - *num_backups_tried += 1; - } - root_backup = super->super_roots + newest; - - btrfs_set_super_generation(super, - btrfs_backup_tree_root_gen(root_backup)); - btrfs_set_super_root(super, btrfs_backup_tree_root(root_backup)); - btrfs_set_super_root_level(super, - btrfs_backup_tree_root_level(root_backup)); - btrfs_set_super_bytes_used(super, btrfs_backup_bytes_used(root_backup)); - - /* - * fixme: the total bytes and num_devices need to match or we should - * need a fsck - */ - btrfs_set_super_total_bytes(super, btrfs_backup_total_bytes(root_backup)); - btrfs_set_super_num_devices(super, btrfs_backup_num_devices(root_backup)); - return 0; -} - -/* helper to cleanup tree roots */ -static void free_root_pointers(struct btrfs_fs_info *info, int chunk_root) -{ - free_extent_buffer(info->tree_root->node); - free_extent_buffer(info->tree_root->commit_root); - free_extent_buffer(info->dev_root->node); - free_extent_buffer(info->dev_root->commit_root); - free_extent_buffer(info->extent_root->node); - free_extent_buffer(info->extent_root->commit_root); - free_extent_buffer(info->csum_root->node); - free_extent_buffer(info->csum_root->commit_root); - - info->tree_root->node = NULL; - info->tree_root->commit_root = NULL; - info->dev_root->node = NULL; - info->dev_root->commit_root = NULL; - info->extent_root->node = NULL; - info->extent_root->commit_root = NULL; - info->csum_root->node = NULL; - info->csum_root->commit_root = NULL; - - if (chunk_root) { - free_extent_buffer(info->chunk_root->node); - free_extent_buffer(info->chunk_root->commit_root); - info->chunk_root->node = NULL; - info->chunk_root->commit_root = NULL; - } -} - - -int open_ctree(struct super_block *sb, - struct btrfs_fs_devices *fs_devices, - char *options) -{ - u32 sectorsize; - u32 nodesize; - u32 leafsize; - u32 blocksize; - u32 stripesize; - u64 generation; - u64 features; - struct btrfs_key location; - struct buffer_head *bh; - struct btrfs_super_block *disk_super; - struct btrfs_fs_info *fs_info = btrfs_sb(sb); - struct btrfs_root *tree_root; - struct btrfs_root *extent_root; - struct btrfs_root *csum_root; - struct btrfs_root *chunk_root; - struct btrfs_root *dev_root; - struct btrfs_root *log_tree_root; - int ret; - int err = -EINVAL; - int num_backups_tried = 0; - int backup_index = 0; - - tree_root = fs_info->tree_root = btrfs_alloc_root(fs_info); - extent_root = fs_info->extent_root = btrfs_alloc_root(fs_info); - csum_root = fs_info->csum_root = btrfs_alloc_root(fs_info); - chunk_root = fs_info->chunk_root = btrfs_alloc_root(fs_info); - dev_root = fs_info->dev_root = btrfs_alloc_root(fs_info); - - if (!tree_root || !extent_root || !csum_root || - !chunk_root || !dev_root) { - err = -ENOMEM; - goto fail; - } - - ret = init_srcu_struct(&fs_info->subvol_srcu); 
- if (ret) { - err = ret; - goto fail; - } - - ret = setup_bdi(fs_info, &fs_info->bdi); - if (ret) { - err = ret; - goto fail_srcu; - } - - fs_info->btree_inode = new_inode(sb); - if (!fs_info->btree_inode) { - err = -ENOMEM; - goto fail_bdi; - } - - mapping_set_gfp_mask(fs_info->btree_inode->i_mapping, GFP_NOFS); - - INIT_RADIX_TREE(&fs_info->fs_roots_radix, GFP_ATOMIC); - INIT_LIST_HEAD(&fs_info->trans_list); - INIT_LIST_HEAD(&fs_info->dead_roots); - INIT_LIST_HEAD(&fs_info->delayed_iputs); - INIT_LIST_HEAD(&fs_info->hashers); - INIT_LIST_HEAD(&fs_info->delalloc_inodes); - INIT_LIST_HEAD(&fs_info->ordered_operations); - INIT_LIST_HEAD(&fs_info->caching_block_groups); - spin_lock_init(&fs_info->delalloc_lock); - spin_lock_init(&fs_info->trans_lock); - spin_lock_init(&fs_info->ref_cache_lock); - spin_lock_init(&fs_info->fs_roots_radix_lock); - spin_lock_init(&fs_info->delayed_iput_lock); - spin_lock_init(&fs_info->defrag_inodes_lock); - spin_lock_init(&fs_info->free_chunk_lock); - mutex_init(&fs_info->reloc_mutex); - - init_completion(&fs_info->kobj_unregister); - INIT_LIST_HEAD(&fs_info->dirty_cowonly_roots); - INIT_LIST_HEAD(&fs_info->space_info); - btrfs_mapping_init(&fs_info->mapping_tree); - btrfs_init_block_rsv(&fs_info->global_block_rsv); - btrfs_init_block_rsv(&fs_info->delalloc_block_rsv); - btrfs_init_block_rsv(&fs_info->trans_block_rsv); - btrfs_init_block_rsv(&fs_info->chunk_block_rsv); - btrfs_init_block_rsv(&fs_info->empty_block_rsv); - btrfs_init_block_rsv(&fs_info->delayed_block_rsv); - atomic_set(&fs_info->nr_async_submits, 0); - atomic_set(&fs_info->async_delalloc_pages, 0); - atomic_set(&fs_info->async_submit_draining, 0); - atomic_set(&fs_info->nr_async_bios, 0); - atomic_set(&fs_info->defrag_running, 0); - fs_info->sb = sb; - fs_info->max_inline = 8192 * 1024; - fs_info->metadata_ratio = 0; - fs_info->defrag_inodes = RB_ROOT; - fs_info->trans_no_join = 0; - fs_info->free_chunk_space = 0; - - /* readahead state */ - INIT_RADIX_TREE(&fs_info->reada_tree, GFP_NOFS & ~__GFP_WAIT); - spin_lock_init(&fs_info->reada_lock); - - fs_info->thread_pool_size = min_t(unsigned long, - num_online_cpus() + 2, 8); - - INIT_LIST_HEAD(&fs_info->ordered_extents); - spin_lock_init(&fs_info->ordered_extent_lock); - fs_info->delayed_root = kmalloc(sizeof(struct btrfs_delayed_root), - GFP_NOFS); - if (!fs_info->delayed_root) { - err = -ENOMEM; - goto fail_iput; - } - btrfs_init_delayed_root(fs_info->delayed_root); - - mutex_init(&fs_info->scrub_lock); - atomic_set(&fs_info->scrubs_running, 0); - atomic_set(&fs_info->scrub_pause_req, 0); - atomic_set(&fs_info->scrubs_paused, 0); - atomic_set(&fs_info->scrub_cancel_req, 0); - init_waitqueue_head(&fs_info->scrub_pause_wait); - init_rwsem(&fs_info->scrub_super_lock); - fs_info->scrub_workers_refcnt = 0; -#ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY - fs_info->check_integrity_print_mask = 0; -#endif - - spin_lock_init(&fs_info->balance_lock); - mutex_init(&fs_info->balance_mutex); - atomic_set(&fs_info->balance_running, 0); - atomic_set(&fs_info->balance_pause_req, 0); - atomic_set(&fs_info->balance_cancel_req, 0); - fs_info->balance_ctl = NULL; - init_waitqueue_head(&fs_info->balance_wait_q); - - sb->s_blocksize = 4096; - sb->s_blocksize_bits = blksize_bits(4096); - sb->s_bdi = &fs_info->bdi; - - fs_info->btree_inode->i_ino = BTRFS_BTREE_INODE_OBJECTID; - set_nlink(fs_info->btree_inode, 1); - /* - * we set the i_size on the btree inode to the max possible int. 
- * the real end of the address space is determined by all of - * the devices in the system - */ - fs_info->btree_inode->i_size = OFFSET_MAX; - fs_info->btree_inode->i_mapping->a_ops = &btree_aops; - fs_info->btree_inode->i_mapping->backing_dev_info = &fs_info->bdi; - - RB_CLEAR_NODE(&BTRFS_I(fs_info->btree_inode)->rb_node); - extent_io_tree_init(&BTRFS_I(fs_info->btree_inode)->io_tree, - fs_info->btree_inode->i_mapping); - BTRFS_I(fs_info->btree_inode)->io_tree.track_uptodate = 0; - extent_map_tree_init(&BTRFS_I(fs_info->btree_inode)->extent_tree); - - BTRFS_I(fs_info->btree_inode)->io_tree.ops = &btree_extent_io_ops; - - BTRFS_I(fs_info->btree_inode)->root = tree_root; - memset(&BTRFS_I(fs_info->btree_inode)->location, 0, - sizeof(struct btrfs_key)); - BTRFS_I(fs_info->btree_inode)->dummy_inode = 1; - insert_inode_hash(fs_info->btree_inode); - - spin_lock_init(&fs_info->block_group_cache_lock); - fs_info->block_group_cache_tree = RB_ROOT; - - extent_io_tree_init(&fs_info->freed_extents[0], - fs_info->btree_inode->i_mapping); - extent_io_tree_init(&fs_info->freed_extents[1], - fs_info->btree_inode->i_mapping); - fs_info->pinned_extents = &fs_info->freed_extents[0]; - fs_info->do_barriers = 1; - - - mutex_init(&fs_info->ordered_operations_mutex); - mutex_init(&fs_info->tree_log_mutex); - mutex_init(&fs_info->chunk_mutex); - mutex_init(&fs_info->transaction_kthread_mutex); - mutex_init(&fs_info->cleaner_mutex); - mutex_init(&fs_info->volume_mutex); - init_rwsem(&fs_info->extent_commit_sem); - init_rwsem(&fs_info->cleanup_work_sem); - init_rwsem(&fs_info->subvol_sem); - - btrfs_init_free_cluster(&fs_info->meta_alloc_cluster); - btrfs_init_free_cluster(&fs_info->data_alloc_cluster); - - init_waitqueue_head(&fs_info->transaction_throttle); - init_waitqueue_head(&fs_info->transaction_wait); - init_waitqueue_head(&fs_info->transaction_blocked_wait); - init_waitqueue_head(&fs_info->async_submit_wait); - - __setup_root(4096, 4096, 4096, 4096, tree_root, - fs_info, BTRFS_ROOT_TREE_OBJECTID); - - invalidate_bdev(fs_devices->latest_bdev); - bh = btrfs_read_dev_super(fs_devices->latest_bdev); - if (!bh) { - err = -EINVAL; - goto fail_alloc; - } - - memcpy(fs_info->super_copy, bh->b_data, sizeof(*fs_info->super_copy)); - memcpy(fs_info->super_for_commit, fs_info->super_copy, - sizeof(*fs_info->super_for_commit)); - brelse(bh); - - memcpy(fs_info->fsid, fs_info->super_copy->fsid, BTRFS_FSID_SIZE); - - disk_super = fs_info->super_copy; - if (!btrfs_super_root(disk_super)) - goto fail_alloc; - - /* check FS state, whether FS is broken. */ - fs_info->fs_state |= btrfs_super_flags(disk_super); - - ret = btrfs_check_super_valid(fs_info, sb->s_flags & MS_RDONLY); - if (ret) { - printk(KERN_ERR "btrfs: superblock contains fatal errors\n"); - err = ret; - goto fail_alloc; - } - - /* - * run through our array of backup supers and setup - * our ring pointer to the oldest one - */ - generation = btrfs_super_generation(disk_super); - find_oldest_super_backup(fs_info, generation); - - /* - * In the long term, we'll store the compression type in the super - * block, and it'll be used for per file compression control. 
- */ - fs_info->compress_type = BTRFS_COMPRESS_ZLIB; - - ret = btrfs_parse_options(tree_root, options); - if (ret) { - err = ret; - goto fail_alloc; - } - - features = btrfs_super_incompat_flags(disk_super) & - ~BTRFS_FEATURE_INCOMPAT_SUPP; - if (features) { - printk(KERN_ERR "BTRFS: couldn't mount because of " - "unsupported optional features (%Lx).\n", - (unsigned long long)features); - err = -EINVAL; - goto fail_alloc; - } - - if (btrfs_super_leafsize(disk_super) != - btrfs_super_nodesize(disk_super)) { - printk(KERN_ERR "BTRFS: couldn't mount because metadata " - "blocksizes don't match. node %d leaf %d\n", - btrfs_super_nodesize(disk_super), - btrfs_super_leafsize(disk_super)); - err = -EINVAL; - goto fail_alloc; - } - if (btrfs_super_leafsize(disk_super) > BTRFS_MAX_METADATA_BLOCKSIZE) { - printk(KERN_ERR "BTRFS: couldn't mount because metadata " - "blocksize (%d) was too large\n", - btrfs_super_leafsize(disk_super)); - err = -EINVAL; - goto fail_alloc; - } - - features = btrfs_super_incompat_flags(disk_super); - features |= BTRFS_FEATURE_INCOMPAT_MIXED_BACKREF; - if (tree_root->fs_info->compress_type & BTRFS_COMPRESS_LZO) - features |= BTRFS_FEATURE_INCOMPAT_COMPRESS_LZO; - - /* - * flag our filesystem as having big metadata blocks if - * they are bigger than the page size - */ - if (btrfs_super_leafsize(disk_super) > PAGE_CACHE_SIZE) { - if (!(features & BTRFS_FEATURE_INCOMPAT_BIG_METADATA)) - printk(KERN_INFO "btrfs flagging fs with big metadata feature\n"); - features |= BTRFS_FEATURE_INCOMPAT_BIG_METADATA; - } - - nodesize = btrfs_super_nodesize(disk_super); - leafsize = btrfs_super_leafsize(disk_super); - sectorsize = btrfs_super_sectorsize(disk_super); - stripesize = btrfs_super_stripesize(disk_super); - - /* - * mixed block groups end up with duplicate but slightly offset - * extent buffers for the same range. 
It leads to corruptions - */ - if ((features & BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS) && - (sectorsize != leafsize)) { - printk(KERN_WARNING "btrfs: unequal leaf/node/sector sizes " - "are not allowed for mixed block groups on %s\n", - sb->s_id); - goto fail_alloc; - } - - btrfs_set_super_incompat_flags(disk_super, features); - - features = btrfs_super_compat_ro_flags(disk_super) & - ~BTRFS_FEATURE_COMPAT_RO_SUPP; - if (!(sb->s_flags & MS_RDONLY) && features) { - printk(KERN_ERR "BTRFS: couldn't mount RDWR because of " - "unsupported option features (%Lx).\n", - (unsigned long long)features); - err = -EINVAL; - goto fail_alloc; - } - - btrfs_init_workers(&fs_info->generic_worker, - "genwork", 1, NULL); - - btrfs_init_workers(&fs_info->workers, "worker", - fs_info->thread_pool_size, - &fs_info->generic_worker); - - btrfs_init_workers(&fs_info->delalloc_workers, "delalloc", - fs_info->thread_pool_size, - &fs_info->generic_worker); - - btrfs_init_workers(&fs_info->submit_workers, "submit", - min_t(u64, fs_devices->num_devices, - fs_info->thread_pool_size), - &fs_info->generic_worker); - - btrfs_init_workers(&fs_info->caching_workers, "cache", - 2, &fs_info->generic_worker); - - /* a higher idle thresh on the submit workers makes it much more - * likely that bios will be send down in a sane order to the - * devices - */ - fs_info->submit_workers.idle_thresh = 64; - - fs_info->workers.idle_thresh = 16; - fs_info->workers.ordered = 1; - - fs_info->delalloc_workers.idle_thresh = 2; - fs_info->delalloc_workers.ordered = 1; - - btrfs_init_workers(&fs_info->fixup_workers, "fixup", 1, - &fs_info->generic_worker); - btrfs_init_workers(&fs_info->endio_workers, "endio", - fs_info->thread_pool_size, - &fs_info->generic_worker); - btrfs_init_workers(&fs_info->endio_meta_workers, "endio-meta", - fs_info->thread_pool_size, - &fs_info->generic_worker); - btrfs_init_workers(&fs_info->endio_meta_write_workers, - "endio-meta-write", fs_info->thread_pool_size, - &fs_info->generic_worker); - btrfs_init_workers(&fs_info->endio_write_workers, "endio-write", - fs_info->thread_pool_size, - &fs_info->generic_worker); - btrfs_init_workers(&fs_info->endio_freespace_worker, "freespace-write", - 1, &fs_info->generic_worker); - btrfs_init_workers(&fs_info->delayed_workers, "delayed-meta", - fs_info->thread_pool_size, - &fs_info->generic_worker); - btrfs_init_workers(&fs_info->readahead_workers, "readahead", - fs_info->thread_pool_size, - &fs_info->generic_worker); - - /* - * endios are largely parallel and should have a very - * low idle thresh - */ - fs_info->endio_workers.idle_thresh = 4; - fs_info->endio_meta_workers.idle_thresh = 4; - - fs_info->endio_write_workers.idle_thresh = 2; - fs_info->endio_meta_write_workers.idle_thresh = 2; - fs_info->readahead_workers.idle_thresh = 2; - - /* - * btrfs_start_workers can really only fail because of ENOMEM so just - * return -ENOMEM if any of these fail. 
- */ - ret = btrfs_start_workers(&fs_info->workers); - ret |= btrfs_start_workers(&fs_info->generic_worker); - ret |= btrfs_start_workers(&fs_info->submit_workers); - ret |= btrfs_start_workers(&fs_info->delalloc_workers); - ret |= btrfs_start_workers(&fs_info->fixup_workers); - ret |= btrfs_start_workers(&fs_info->endio_workers); - ret |= btrfs_start_workers(&fs_info->endio_meta_workers); - ret |= btrfs_start_workers(&fs_info->endio_meta_write_workers); - ret |= btrfs_start_workers(&fs_info->endio_write_workers); - ret |= btrfs_start_workers(&fs_info->endio_freespace_worker); - ret |= btrfs_start_workers(&fs_info->delayed_workers); - ret |= btrfs_start_workers(&fs_info->caching_workers); - ret |= btrfs_start_workers(&fs_info->readahead_workers); - if (ret) { - ret = -ENOMEM; - goto fail_sb_buffer; - } - - fs_info->bdi.ra_pages *= btrfs_super_num_devices(disk_super); - fs_info->bdi.ra_pages = max(fs_info->bdi.ra_pages, - 4 * 1024 * 1024 / PAGE_CACHE_SIZE); - - tree_root->nodesize = nodesize; - tree_root->leafsize = leafsize; - tree_root->sectorsize = sectorsize; - tree_root->stripesize = stripesize; - - sb->s_blocksize = sectorsize; - sb->s_blocksize_bits = blksize_bits(sectorsize); - - if (strncmp((char *)(&disk_super->magic), BTRFS_MAGIC, - sizeof(disk_super->magic))) { - printk(KERN_INFO "btrfs: valid FS not found on %s\n", sb->s_id); - goto fail_sb_buffer; - } - - if (sectorsize != PAGE_SIZE) { - printk(KERN_WARNING "btrfs: Incompatible sector size(%lu) " - "found on %s\n", (unsigned long)sectorsize, sb->s_id); - goto fail_sb_buffer; - } - - mutex_lock(&fs_info->chunk_mutex); - ret = btrfs_read_sys_array(tree_root); - mutex_unlock(&fs_info->chunk_mutex); - if (ret) { - printk(KERN_WARNING "btrfs: failed to read the system " - "array on %s\n", sb->s_id); - goto fail_sb_buffer; - } - - blocksize = btrfs_level_size(tree_root, - btrfs_super_chunk_root_level(disk_super)); - generation = btrfs_super_chunk_root_generation(disk_super); - - __setup_root(nodesize, leafsize, sectorsize, stripesize, - chunk_root, fs_info, BTRFS_CHUNK_TREE_OBJECTID); - - chunk_root->node = read_tree_block(chunk_root, - btrfs_super_chunk_root(disk_super), - blocksize, generation); - BUG_ON(!chunk_root->node); /* -ENOMEM */ - if (!test_bit(EXTENT_BUFFER_UPTODATE, &chunk_root->node->bflags)) { - printk(KERN_WARNING "btrfs: failed to read chunk root on %s\n", - sb->s_id); - goto fail_tree_roots; - } - btrfs_set_root_node(&chunk_root->root_item, chunk_root->node); - chunk_root->commit_root = btrfs_root_node(chunk_root); - - read_extent_buffer(chunk_root->node, fs_info->chunk_tree_uuid, - (unsigned long)btrfs_header_chunk_tree_uuid(chunk_root->node), - BTRFS_UUID_SIZE); - - ret = btrfs_read_chunk_tree(chunk_root); - if (ret) { - printk(KERN_WARNING "btrfs: failed to read chunk tree on %s\n", - sb->s_id); - goto fail_tree_roots; - } - - btrfs_close_extra_devices(fs_devices); - - if (!fs_devices->latest_bdev) { - printk(KERN_CRIT "btrfs: failed to read devices on %s\n", - sb->s_id); - goto fail_tree_roots; - } - -retry_root_backup: - blocksize = btrfs_level_size(tree_root, - btrfs_super_root_level(disk_super)); - generation = btrfs_super_generation(disk_super); - - tree_root->node = read_tree_block(tree_root, - btrfs_super_root(disk_super), - blocksize, generation); - if (!tree_root->node || - !test_bit(EXTENT_BUFFER_UPTODATE, &tree_root->node->bflags)) { - printk(KERN_WARNING "btrfs: failed to read tree root on %s\n", - sb->s_id); - - goto recovery_tree_root; - } - - btrfs_set_root_node(&tree_root->root_item, 
tree_root->node); - tree_root->commit_root = btrfs_root_node(tree_root); - - ret = find_and_setup_root(tree_root, fs_info, - BTRFS_EXTENT_TREE_OBJECTID, extent_root); - if (ret) - goto recovery_tree_root; - extent_root->track_dirty = 1; - - ret = find_and_setup_root(tree_root, fs_info, - BTRFS_DEV_TREE_OBJECTID, dev_root); - if (ret) - goto recovery_tree_root; - dev_root->track_dirty = 1; - - ret = find_and_setup_root(tree_root, fs_info, - BTRFS_CSUM_TREE_OBJECTID, csum_root); - if (ret) - goto recovery_tree_root; - - csum_root->track_dirty = 1; - - fs_info->generation = generation; - fs_info->last_trans_committed = generation; - - ret = btrfs_init_space_info(fs_info); - if (ret) { - printk(KERN_ERR "Failed to initial space info: %d\n", ret); - goto fail_block_groups; - } - - ret = btrfs_read_block_groups(extent_root); - if (ret) { - printk(KERN_ERR "Failed to read block groups: %d\n", ret); - goto fail_block_groups; - } - - fs_info->cleaner_kthread = kthread_run(cleaner_kthread, tree_root, - "btrfs-cleaner"); - if (IS_ERR(fs_info->cleaner_kthread)) - goto fail_block_groups; - - fs_info->transaction_kthread = kthread_run(transaction_kthread, - tree_root, - "btrfs-transaction"); - if (IS_ERR(fs_info->transaction_kthread)) - goto fail_cleaner; - - if (!btrfs_test_opt(tree_root, SSD) && - !btrfs_test_opt(tree_root, NOSSD) && - !fs_info->fs_devices->rotating) { - printk(KERN_INFO "Btrfs detected SSD devices, enabling SSD " - "mode\n"); - btrfs_set_opt(fs_info->mount_opt, SSD); - } - -#ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY - if (btrfs_test_opt(tree_root, CHECK_INTEGRITY)) { - ret = btrfsic_mount(tree_root, fs_devices, - btrfs_test_opt(tree_root, - CHECK_INTEGRITY_INCLUDING_EXTENT_DATA) ? - 1 : 0, - fs_info->check_integrity_print_mask); - if (ret) - printk(KERN_WARNING "btrfs: failed to initialize" - " integrity check module %s\n", sb->s_id); - } -#endif - - /* do not make disk changes in broken FS */ - if (btrfs_super_log_root(disk_super) != 0 && - !(fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR)) { - u64 bytenr = btrfs_super_log_root(disk_super); - - if (fs_devices->rw_devices == 0) { - printk(KERN_WARNING "Btrfs log replay required " - "on RO media\n"); - err = -EIO; - goto fail_trans_kthread; - } - blocksize = - btrfs_level_size(tree_root, - btrfs_super_log_root_level(disk_super)); - - log_tree_root = btrfs_alloc_root(fs_info); - if (!log_tree_root) { - err = -ENOMEM; - goto fail_trans_kthread; - } - - __setup_root(nodesize, leafsize, sectorsize, stripesize, - log_tree_root, fs_info, BTRFS_TREE_LOG_OBJECTID); - - log_tree_root->node = read_tree_block(tree_root, bytenr, - blocksize, - generation + 1); - /* returns with log_tree_root freed on success */ - ret = btrfs_recover_log_trees(log_tree_root); - if (ret) { - btrfs_error(tree_root->fs_info, ret, - "Failed to recover log tree"); - free_extent_buffer(log_tree_root->node); - kfree(log_tree_root); - goto fail_trans_kthread; - } - - if (sb->s_flags & MS_RDONLY) { - ret = btrfs_commit_super(tree_root); - if (ret) - goto fail_trans_kthread; - } - } - - ret = btrfs_find_orphan_roots(tree_root); - if (ret) - goto fail_trans_kthread; - - if (!(sb->s_flags & MS_RDONLY)) { - ret = btrfs_cleanup_fs_roots(fs_info); - if (ret) { - } - - ret = btrfs_recover_relocation(tree_root); - if (ret < 0) { - printk(KERN_WARNING - "btrfs: failed to recover relocation\n"); - err = -EINVAL; - goto fail_trans_kthread; - } - } - - location.objectid = BTRFS_FS_TREE_OBJECTID; - location.type = BTRFS_ROOT_ITEM_KEY; - location.offset = (u64)-1; - - fs_info->fs_root = 
btrfs_read_fs_root_no_name(fs_info, &location); - if (!fs_info->fs_root) - goto fail_trans_kthread; - if (IS_ERR(fs_info->fs_root)) { - err = PTR_ERR(fs_info->fs_root); - goto fail_trans_kthread; - } - - if (!(sb->s_flags & MS_RDONLY)) { - down_read(&fs_info->cleanup_work_sem); - err = btrfs_orphan_cleanup(fs_info->fs_root); - if (!err) - err = btrfs_orphan_cleanup(fs_info->tree_root); - up_read(&fs_info->cleanup_work_sem); - - if (!err) - err = btrfs_recover_balance(fs_info->tree_root); - - if (err) { - close_ctree(tree_root); - return err; - } - } - - return 0; - -fail_trans_kthread: - kthread_stop(fs_info->transaction_kthread); -fail_cleaner: - kthread_stop(fs_info->cleaner_kthread); - - /* - * make sure we're done with the btree inode before we stop our - * kthreads - */ - filemap_write_and_wait(fs_info->btree_inode->i_mapping); - invalidate_inode_pages2(fs_info->btree_inode->i_mapping); - -fail_block_groups: - btrfs_free_block_groups(fs_info); - -fail_tree_roots: - free_root_pointers(fs_info, 1); - -fail_sb_buffer: - btrfs_stop_workers(&fs_info->generic_worker); - btrfs_stop_workers(&fs_info->readahead_workers); - btrfs_stop_workers(&fs_info->fixup_workers); - btrfs_stop_workers(&fs_info->delalloc_workers); - btrfs_stop_workers(&fs_info->workers); - btrfs_stop_workers(&fs_info->endio_workers); - btrfs_stop_workers(&fs_info->endio_meta_workers); - btrfs_stop_workers(&fs_info->endio_meta_write_workers); - btrfs_stop_workers(&fs_info->endio_write_workers); - btrfs_stop_workers(&fs_info->endio_freespace_worker); - btrfs_stop_workers(&fs_info->submit_workers); - btrfs_stop_workers(&fs_info->delayed_workers); - btrfs_stop_workers(&fs_info->caching_workers); -fail_alloc: -fail_iput: - btrfs_mapping_tree_free(&fs_info->mapping_tree); - - invalidate_inode_pages2(fs_info->btree_inode->i_mapping); - iput(fs_info->btree_inode); -fail_bdi: - bdi_destroy(&fs_info->bdi); -fail_srcu: - cleanup_srcu_struct(&fs_info->subvol_srcu); -fail: - btrfs_close_devices(fs_info->fs_devices); - return err; - -recovery_tree_root: - if (!btrfs_test_opt(tree_root, RECOVERY)) - goto fail_tree_roots; - - free_root_pointers(fs_info, 0); - - /* don't use the log in recovery mode, it won't be valid */ - btrfs_set_super_log_root(disk_super, 0); - - /* we can't trust the free space cache either */ - btrfs_set_opt(fs_info->mount_opt, CLEAR_CACHE); - - ret = next_root_backup(fs_info, fs_info->super_copy, - &num_backups_tried, &backup_index); - if (ret == -1) - goto fail_block_groups; - goto retry_root_backup; -} - -static void btrfs_end_buffer_write_sync(struct buffer_head *bh, int uptodate) -{ - char b[BDEVNAME_SIZE]; - - if (uptodate) { - set_buffer_uptodate(bh); - } else { - printk_ratelimited(KERN_WARNING "lost page write due to " - "I/O error on %s\n", - bdevname(bh->b_bdev, b)); - /* note, we dont' set_buffer_write_io_error because we have - * our own ways of dealing with the IO errors - */ - clear_buffer_uptodate(bh); - } - unlock_buffer(bh); - put_bh(bh); -} - -struct buffer_head *btrfs_read_dev_super(struct block_device *bdev) -{ - struct buffer_head *bh; - struct buffer_head *latest = NULL; - struct btrfs_super_block *super; - int i; - u64 transid = 0; - u64 bytenr; - - /* we would like to check all the supers, but that would make - * a btrfs mount succeed after a mkfs from a different FS. 
- * So, we need to add a special mount option to scan for - * later supers, using BTRFS_SUPER_MIRROR_MAX instead - */ - for (i = 0; i < 1; i++) { - bytenr = btrfs_sb_offset(i); - if (bytenr + 4096 >= i_size_read(bdev->bd_inode)) - break; - bh = __bread(bdev, bytenr / 4096, 4096); - if (!bh) - continue; - - super = (struct btrfs_super_block *)bh->b_data; - if (btrfs_super_bytenr(super) != bytenr || - strncmp((char *)(&super->magic), BTRFS_MAGIC, - sizeof(super->magic))) { - brelse(bh); - continue; - } - - if (!latest || btrfs_super_generation(super) > transid) { - brelse(latest); - latest = bh; - transid = btrfs_super_generation(super); - } else { - brelse(bh); - } - } - return latest; -} - -/* - * this should be called twice, once with wait == 0 and - * once with wait == 1. When wait == 0 is done, all the buffer heads - * we write are pinned. - * - * They are released when wait == 1 is done. - * max_mirrors must be the same for both runs, and it indicates how - * many supers on this one device should be written. - * - * max_mirrors == 0 means to write them all. - */ -static int write_dev_supers(struct btrfs_device *device, - struct btrfs_super_block *sb, - int do_barriers, int wait, int max_mirrors) -{ - struct buffer_head *bh; - int i; - int ret; - int errors = 0; - u32 crc; - u64 bytenr; - - if (max_mirrors == 0) - max_mirrors = BTRFS_SUPER_MIRROR_MAX; - - for (i = 0; i < max_mirrors; i++) { - bytenr = btrfs_sb_offset(i); - if (bytenr + BTRFS_SUPER_INFO_SIZE >= device->total_bytes) - break; - - if (wait) { - bh = __find_get_block(device->bdev, bytenr / 4096, - BTRFS_SUPER_INFO_SIZE); - BUG_ON(!bh); - wait_on_buffer(bh); - if (!buffer_uptodate(bh)) - errors++; - - /* drop our reference */ - brelse(bh); - - /* drop the reference from the wait == 0 run */ - brelse(bh); - continue; - } else { - btrfs_set_super_bytenr(sb, bytenr); - - crc = ~(u32)0; - crc = btrfs_csum_data(NULL, (char *)sb + - BTRFS_CSUM_SIZE, crc, - BTRFS_SUPER_INFO_SIZE - - BTRFS_CSUM_SIZE); - btrfs_csum_final(crc, sb->csum); - - /* - * one reference for us, and we leave it for the - * caller - */ - bh = __getblk(device->bdev, bytenr / 4096, - BTRFS_SUPER_INFO_SIZE); - memcpy(bh->b_data, sb, BTRFS_SUPER_INFO_SIZE); - - /* one reference for submit_bh */ - get_bh(bh); - - set_buffer_uptodate(bh); - lock_buffer(bh); - bh->b_end_io = btrfs_end_buffer_write_sync; - } - - /* - * we fua the first super. The others we allow - * to go down lazy. - */ - ret = btrfsic_submit_bh(WRITE_FUA, bh); - if (ret) - errors++; - } - return errors < i ? 0 : -1; -} - -/* - * endio for the write_dev_flush, this will wake anyone waiting - * for the barrier when it is done - */ -static void btrfs_end_empty_barrier(struct bio *bio, int err) -{ - if (err) { - if (err == -EOPNOTSUPP) - set_bit(BIO_EOPNOTSUPP, &bio->bi_flags); - clear_bit(BIO_UPTODATE, &bio->bi_flags); - } - if (bio->bi_private) - complete(bio->bi_private); - bio_put(bio); -} - -/* - * trigger flushes for one the devices. If you pass wait == 0, the flushes are - * sent down. With wait == 1, it waits for the previous flush. 
- * - * any device where the flush fails with eopnotsupp are flagged as not-barrier - * capable - */ -static int write_dev_flush(struct btrfs_device *device, int wait) -{ - struct bio *bio; - int ret = 0; - - if (device->nobarriers) - return 0; - - if (wait) { - bio = device->flush_bio; - if (!bio) - return 0; - - wait_for_completion(&device->flush_wait); - - if (bio_flagged(bio, BIO_EOPNOTSUPP)) { - printk("btrfs: disabling barriers on dev %s\n", - device->name); - device->nobarriers = 1; - } - if (!bio_flagged(bio, BIO_UPTODATE)) { - ret = -EIO; - } - - /* drop the reference from the wait == 0 run */ - bio_put(bio); - device->flush_bio = NULL; - - return ret; - } - - /* - * one reference for us, and we leave it for the - * caller - */ - device->flush_bio = NULL;; - bio = bio_alloc(GFP_NOFS, 0); - if (!bio) - return -ENOMEM; - - bio->bi_end_io = btrfs_end_empty_barrier; - bio->bi_bdev = device->bdev; - init_completion(&device->flush_wait); - bio->bi_private = &device->flush_wait; - device->flush_bio = bio; - - bio_get(bio); - btrfsic_submit_bio(WRITE_FLUSH, bio); - - return 0; -} - -/* - * send an empty flush down to each device in parallel, - * then wait for them - */ -static int barrier_all_devices(struct btrfs_fs_info *info) -{ - struct list_head *head; - struct btrfs_device *dev; - int errors = 0; - int ret; - - /* send down all the barriers */ - head = &info->fs_devices->devices; - list_for_each_entry_rcu(dev, head, dev_list) { - if (!dev->bdev) { - errors++; - continue; - } - if (!dev->in_fs_metadata || !dev->writeable) - continue; - - ret = write_dev_flush(dev, 0); - if (ret) - errors++; - } - - /* wait for all the barriers */ - list_for_each_entry_rcu(dev, head, dev_list) { - if (!dev->bdev) { - errors++; - continue; - } - if (!dev->in_fs_metadata || !dev->writeable) - continue; - - ret = write_dev_flush(dev, 1); - if (ret) - errors++; - } - if (errors) - return -EIO; - return 0; -} - -int write_all_supers(struct btrfs_root *root, int max_mirrors) -{ - struct list_head *head; - struct btrfs_device *dev; - struct btrfs_super_block *sb; - struct btrfs_dev_item *dev_item; - int ret; - int do_barriers; - int max_errors; - int total_errors = 0; - u64 flags; - - max_errors = btrfs_super_num_devices(root->fs_info->super_copy) - 1; - do_barriers = !btrfs_test_opt(root, NOBARRIER); - backup_super_roots(root->fs_info); - - sb = root->fs_info->super_for_commit; - dev_item = &sb->dev_item; - - mutex_lock(&root->fs_info->fs_devices->device_list_mutex); - head = &root->fs_info->fs_devices->devices; - - if (do_barriers) - barrier_all_devices(root->fs_info); - - list_for_each_entry_rcu(dev, head, dev_list) { - if (!dev->bdev) { - total_errors++; - continue; - } - if (!dev->in_fs_metadata || !dev->writeable) - continue; - - btrfs_set_stack_device_generation(dev_item, 0); - btrfs_set_stack_device_type(dev_item, dev->type); - btrfs_set_stack_device_id(dev_item, dev->devid); - btrfs_set_stack_device_total_bytes(dev_item, dev->total_bytes); - btrfs_set_stack_device_bytes_used(dev_item, dev->bytes_used); - btrfs_set_stack_device_io_align(dev_item, dev->io_align); - btrfs_set_stack_device_io_width(dev_item, dev->io_width); - btrfs_set_stack_device_sector_size(dev_item, dev->sector_size); - memcpy(dev_item->uuid, dev->uuid, BTRFS_UUID_SIZE); - memcpy(dev_item->fsid, dev->fs_devices->fsid, BTRFS_UUID_SIZE); - - flags = btrfs_super_flags(sb); - btrfs_set_super_flags(sb, flags | BTRFS_HEADER_FLAG_WRITTEN); - - ret = write_dev_supers(dev, sb, do_barriers, 0, max_mirrors); - if (ret) - total_errors++; - } - 
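/*
 * At this point the wait == 0 pass of write_dev_supers() above has
 * submitted every super block copy on every writeable device.  The
 * mirror copies are placed by btrfs_sb_offset() (64KiB, 64MiB and
 * 256GiB, see disk-io.h further down) and any copy that would fall
 * past the end of the device is skipped.  The buffer heads stay pinned
 * until the wait == 1 pass below drops them.  total_errors counts
 * devices whose submission failed and is checked against max_errors,
 * which is the device count minus one.
 */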
if (total_errors > max_errors) { - printk(KERN_ERR "btrfs: %d errors while writing supers\n", - total_errors); - - /* This shouldn't happen. FUA is masked off if unsupported */ - BUG(); - } - - total_errors = 0; - list_for_each_entry_rcu(dev, head, dev_list) { - if (!dev->bdev) - continue; - if (!dev->in_fs_metadata || !dev->writeable) - continue; - - ret = write_dev_supers(dev, sb, do_barriers, 1, max_mirrors); - if (ret) - total_errors++; - } - mutex_unlock(&root->fs_info->fs_devices->device_list_mutex); - if (total_errors > max_errors) { - btrfs_error(root->fs_info, -EIO, - "%d errors while writing supers", total_errors); - return -EIO; - } - return 0; -} - -int write_ctree_super(struct btrfs_trans_handle *trans, - struct btrfs_root *root, int max_mirrors) -{ - int ret; - - ret = write_all_supers(root, max_mirrors); - return ret; -} - -/* Kill all outstanding I/O */ -void btrfs_abort_devices(struct btrfs_root *root) -{ - struct list_head *head; - struct btrfs_device *dev; - mutex_lock(&root->fs_info->fs_devices->device_list_mutex); - head = &root->fs_info->fs_devices->devices; - list_for_each_entry_rcu(dev, head, dev_list) { - blk_abort_queue(dev->bdev->bd_disk->queue); - } - mutex_unlock(&root->fs_info->fs_devices->device_list_mutex); -} - -void btrfs_free_fs_root(struct btrfs_fs_info *fs_info, struct btrfs_root *root) -{ - spin_lock(&fs_info->fs_roots_radix_lock); - radix_tree_delete(&fs_info->fs_roots_radix, - (unsigned long)root->root_key.objectid); - spin_unlock(&fs_info->fs_roots_radix_lock); - - if (btrfs_root_refs(&root->root_item) == 0) - synchronize_srcu(&fs_info->subvol_srcu); - - __btrfs_remove_free_space_cache(root->free_ino_pinned); - __btrfs_remove_free_space_cache(root->free_ino_ctl); - free_fs_root(root); -} - -static void free_fs_root(struct btrfs_root *root) -{ - iput(root->cache_inode); - WARN_ON(!RB_EMPTY_ROOT(&root->inode_tree)); - if (root->anon_dev) - free_anon_bdev(root->anon_dev); - free_extent_buffer(root->node); - free_extent_buffer(root->commit_root); - kfree(root->free_ino_ctl); - kfree(root->free_ino_pinned); - kfree(root->name); - kfree(root); -} - -static void del_fs_roots(struct btrfs_fs_info *fs_info) -{ - int ret; - struct btrfs_root *gang[8]; - int i; - - while (!list_empty(&fs_info->dead_roots)) { - gang[0] = list_entry(fs_info->dead_roots.next, - struct btrfs_root, root_list); - list_del(&gang[0]->root_list); - - if (gang[0]->in_radix) { - btrfs_free_fs_root(fs_info, gang[0]); - } else { - free_extent_buffer(gang[0]->node); - free_extent_buffer(gang[0]->commit_root); - kfree(gang[0]); - } - } - - while (1) { - ret = radix_tree_gang_lookup(&fs_info->fs_roots_radix, - (void **)gang, 0, - ARRAY_SIZE(gang)); - if (!ret) - break; - for (i = 0; i < ret; i++) - btrfs_free_fs_root(fs_info, gang[i]); - } -} - -int btrfs_cleanup_fs_roots(struct btrfs_fs_info *fs_info) -{ - u64 root_objectid = 0; - struct btrfs_root *gang[8]; - int i; - int ret; - - while (1) { - ret = radix_tree_gang_lookup(&fs_info->fs_roots_radix, - (void **)gang, root_objectid, - ARRAY_SIZE(gang)); - if (!ret) - break; - - root_objectid = gang[ret - 1]->root_key.objectid + 1; - for (i = 0; i < ret; i++) { - int err; - - root_objectid = gang[i]->root_key.objectid; - err = btrfs_orphan_cleanup(gang[i]); - if (err) - return err; - } - root_objectid++; - } - return 0; -} - -int btrfs_commit_super(struct btrfs_root *root) -{ - struct btrfs_trans_handle *trans; - int ret; - - mutex_lock(&root->fs_info->cleaner_mutex); - btrfs_run_delayed_iputs(root); - btrfs_clean_old_snapshots(root); - 
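/*
 * Delayed iputs and dead snapshots are flushed above while holding
 * cleaner_mutex, which serializes against the cleaner thread.  The
 * code below then waits for any cleanup work still in flight, commits
 * the running transaction twice (see the "run commit again" comment
 * below) and finally writes the super blocks out through
 * write_ctree_super().
 */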
mutex_unlock(&root->fs_info->cleaner_mutex); - - /* wait until ongoing cleanup work done */ - down_write(&root->fs_info->cleanup_work_sem); - up_write(&root->fs_info->cleanup_work_sem); - - trans = btrfs_join_transaction(root); - if (IS_ERR(trans)) - return PTR_ERR(trans); - ret = btrfs_commit_transaction(trans, root); - if (ret) - return ret; - /* run commit again to drop the original snapshot */ - trans = btrfs_join_transaction(root); - if (IS_ERR(trans)) - return PTR_ERR(trans); - ret = btrfs_commit_transaction(trans, root); - if (ret) - return ret; - ret = btrfs_write_and_wait_transaction(NULL, root); - if (ret) { - btrfs_error(root->fs_info, ret, - "Failed to sync btree inode to disk."); - return ret; - } - - ret = write_ctree_super(NULL, root, 0); - return ret; -} - -int close_ctree(struct btrfs_root *root) -{ - struct btrfs_fs_info *fs_info = root->fs_info; - int ret; - - fs_info->closing = 1; - smp_mb(); - - /* pause restriper - we want to resume on mount */ - btrfs_pause_balance(root->fs_info); - - btrfs_scrub_cancel(root); - - /* wait for any defraggers to finish */ - wait_event(fs_info->transaction_wait, - (atomic_read(&fs_info->defrag_running) == 0)); - - /* clear out the rbtree of defraggable inodes */ - btrfs_run_defrag_inodes(fs_info); - - /* - * Here come 2 situations when btrfs is broken to flip readonly: - * - * 1. when btrfs flips readonly somewhere else before - * btrfs_commit_super, sb->s_flags has MS_RDONLY flag, - * and btrfs will skip to write sb directly to keep - * ERROR state on disk. - * - * 2. when btrfs flips readonly just in btrfs_commit_super, - * and in such case, btrfs cannot write sb via btrfs_commit_super, - * and since fs_state has been set BTRFS_SUPER_FLAG_ERROR flag, - * btrfs will cleanup all FS resources first and write sb then. 
- */ - if (!(fs_info->sb->s_flags & MS_RDONLY)) { - ret = btrfs_commit_super(root); - if (ret) - printk(KERN_ERR "btrfs: commit super ret %d\n", ret); - } - - if (fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR) { - ret = btrfs_error_commit_super(root); - if (ret) - printk(KERN_ERR "btrfs: commit super ret %d\n", ret); - } - - btrfs_put_block_group_cache(fs_info); - - kthread_stop(fs_info->transaction_kthread); - kthread_stop(fs_info->cleaner_kthread); - - fs_info->closing = 2; - smp_mb(); - - if (fs_info->delalloc_bytes) { - printk(KERN_INFO "btrfs: at unmount delalloc count %llu\n", - (unsigned long long)fs_info->delalloc_bytes); - } - if (fs_info->total_ref_cache_size) { - printk(KERN_INFO "btrfs: at umount reference cache size %llu\n", - (unsigned long long)fs_info->total_ref_cache_size); - } - - free_extent_buffer(fs_info->extent_root->node); - free_extent_buffer(fs_info->extent_root->commit_root); - free_extent_buffer(fs_info->tree_root->node); - free_extent_buffer(fs_info->tree_root->commit_root); - free_extent_buffer(fs_info->chunk_root->node); - free_extent_buffer(fs_info->chunk_root->commit_root); - free_extent_buffer(fs_info->dev_root->node); - free_extent_buffer(fs_info->dev_root->commit_root); - free_extent_buffer(fs_info->csum_root->node); - free_extent_buffer(fs_info->csum_root->commit_root); - - btrfs_free_block_groups(fs_info); - - del_fs_roots(fs_info); - - iput(fs_info->btree_inode); - - btrfs_stop_workers(&fs_info->generic_worker); - btrfs_stop_workers(&fs_info->fixup_workers); - btrfs_stop_workers(&fs_info->delalloc_workers); - btrfs_stop_workers(&fs_info->workers); - btrfs_stop_workers(&fs_info->endio_workers); - btrfs_stop_workers(&fs_info->endio_meta_workers); - btrfs_stop_workers(&fs_info->endio_meta_write_workers); - btrfs_stop_workers(&fs_info->endio_write_workers); - btrfs_stop_workers(&fs_info->endio_freespace_worker); - btrfs_stop_workers(&fs_info->submit_workers); - btrfs_stop_workers(&fs_info->delayed_workers); - btrfs_stop_workers(&fs_info->caching_workers); - btrfs_stop_workers(&fs_info->readahead_workers); - -#ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY - if (btrfs_test_opt(root, CHECK_INTEGRITY)) - btrfsic_unmount(root, fs_info->fs_devices); -#endif - - btrfs_close_devices(fs_info->fs_devices); - btrfs_mapping_tree_free(&fs_info->mapping_tree); - - bdi_destroy(&fs_info->bdi); - cleanup_srcu_struct(&fs_info->subvol_srcu); - - return 0; -} - -int btrfs_buffer_uptodate(struct extent_buffer *buf, u64 parent_transid, - int atomic) -{ - int ret; - struct inode *btree_inode = buf->pages[0]->mapping->host; - - ret = extent_buffer_uptodate(buf); - if (!ret) - return ret; - - ret = verify_parent_transid(&BTRFS_I(btree_inode)->io_tree, buf, - parent_transid, atomic); - if (ret == -EAGAIN) - return ret; - return !ret; -} - -int btrfs_set_buffer_uptodate(struct extent_buffer *buf) -{ - return set_extent_buffer_uptodate(buf); -} - -void btrfs_mark_buffer_dirty(struct extent_buffer *buf) -{ - struct btrfs_root *root = BTRFS_I(buf->pages[0]->mapping->host)->root; - u64 transid = btrfs_header_generation(buf); - int was_dirty; - - btrfs_assert_tree_locked(buf); - if (transid != root->fs_info->generation) { - printk(KERN_CRIT "btrfs transid mismatch buffer %llu, " - "found %llu running %llu\n", - (unsigned long long)buf->start, - (unsigned long long)transid, - (unsigned long long)root->fs_info->generation); - WARN_ON(1); - } - was_dirty = set_extent_buffer_dirty(buf); - if (!was_dirty) { - spin_lock(&root->fs_info->delalloc_lock); - root->fs_info->dirty_metadata_bytes += buf->len; - 
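/*
 * Newly dirtied tree blocks are accounted here so that
 * btrfs_btree_balance_dirty() below can throttle callers once roughly
 * 32MB of dirty metadata has accumulated.
 */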
spin_unlock(&root->fs_info->delalloc_lock); - } -} - -void btrfs_btree_balance_dirty(struct btrfs_root *root, unsigned long nr) -{ - /* - * looks as though older kernels can get into trouble with - * this code, they end up stuck in balance_dirty_pages forever - */ - u64 num_dirty; - unsigned long thresh = 32 * 1024 * 1024; - - if (current->flags & PF_MEMALLOC) - return; - - btrfs_balance_delayed_items(root); - - num_dirty = root->fs_info->dirty_metadata_bytes; - - if (num_dirty > thresh) { - balance_dirty_pages_ratelimited_nr( - root->fs_info->btree_inode->i_mapping, 1); - } - return; -} - -void __btrfs_btree_balance_dirty(struct btrfs_root *root, unsigned long nr) -{ - /* - * looks as though older kernels can get into trouble with - * this code, they end up stuck in balance_dirty_pages forever - */ - u64 num_dirty; - unsigned long thresh = 32 * 1024 * 1024; - - if (current->flags & PF_MEMALLOC) - return; - - num_dirty = root->fs_info->dirty_metadata_bytes; - - if (num_dirty > thresh) { - balance_dirty_pages_ratelimited_nr( - root->fs_info->btree_inode->i_mapping, 1); - } - return; -} - -int btrfs_read_buffer(struct extent_buffer *buf, u64 parent_transid) -{ - struct btrfs_root *root = BTRFS_I(buf->pages[0]->mapping->host)->root; - return btree_read_extent_buffer_pages(root, buf, 0, parent_transid); -} - -static int btree_lock_page_hook(struct page *page, void *data, - void (*flush_fn)(void *)) -{ - struct inode *inode = page->mapping->host; - struct btrfs_root *root = BTRFS_I(inode)->root; - struct extent_buffer *eb; - - /* - * We culled this eb but the page is still hanging out on the mapping, - * carry on. - */ - if (!PagePrivate(page)) - goto out; - - eb = (struct extent_buffer *)page->private; - if (!eb) { - WARN_ON(1); - goto out; - } - if (page != eb->pages[0]) - goto out; - - if (!btrfs_try_tree_write_lock(eb)) { - flush_fn(data); - btrfs_tree_lock(eb); - } - btrfs_set_header_flag(eb, BTRFS_HEADER_FLAG_WRITTEN); - - if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &eb->bflags)) { - spin_lock(&root->fs_info->delalloc_lock); - if (root->fs_info->dirty_metadata_bytes >= eb->len) - root->fs_info->dirty_metadata_bytes -= eb->len; - else - WARN_ON(1); - spin_unlock(&root->fs_info->delalloc_lock); - } - - btrfs_tree_unlock(eb); -out: - if (!trylock_page(page)) { - flush_fn(data); - lock_page(page); - } - return 0; -} - -static int btrfs_check_super_valid(struct btrfs_fs_info *fs_info, - int read_only) -{ - if (btrfs_super_csum_type(fs_info->super_copy) >= ARRAY_SIZE(btrfs_csum_sizes)) { - printk(KERN_ERR "btrfs: unsupported checksum algorithm\n"); - return -EINVAL; - } - - if (read_only) - return 0; - - if (fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR) { - printk(KERN_WARNING "warning: mount fs with errors, " - "running btrfsck is recommended\n"); - } - - return 0; -} - -int btrfs_error_commit_super(struct btrfs_root *root) -{ - int ret; - - mutex_lock(&root->fs_info->cleaner_mutex); - btrfs_run_delayed_iputs(root); - mutex_unlock(&root->fs_info->cleaner_mutex); - - down_write(&root->fs_info->cleanup_work_sem); - up_write(&root->fs_info->cleanup_work_sem); - - /* cleanup FS via transaction */ - btrfs_cleanup_transaction(root); - - ret = write_ctree_super(NULL, root, 0); - - return ret; -} - -static void btrfs_destroy_ordered_operations(struct btrfs_root *root) -{ - struct btrfs_inode *btrfs_inode; - struct list_head splice; - - INIT_LIST_HEAD(&splice); - - mutex_lock(&root->fs_info->ordered_operations_mutex); - spin_lock(&root->fs_info->ordered_extent_lock); - - 
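/*
 * Both the ordered_operations mutex and the ordered extent spinlock
 * are held while the whole list is spliced onto the local head; the
 * entries are then unhooked and their roots invalidated one by one
 * below.
 */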
list_splice_init(&root->fs_info->ordered_operations, &splice); - while (!list_empty(&splice)) { - btrfs_inode = list_entry(splice.next, struct btrfs_inode, - ordered_operations); - - list_del_init(&btrfs_inode->ordered_operations); - - btrfs_invalidate_inodes(btrfs_inode->root); - } - - spin_unlock(&root->fs_info->ordered_extent_lock); - mutex_unlock(&root->fs_info->ordered_operations_mutex); -} - -static void btrfs_destroy_ordered_extents(struct btrfs_root *root) -{ - struct list_head splice; - struct btrfs_ordered_extent *ordered; - struct inode *inode; - - INIT_LIST_HEAD(&splice); - - spin_lock(&root->fs_info->ordered_extent_lock); - - list_splice_init(&root->fs_info->ordered_extents, &splice); - while (!list_empty(&splice)) { - ordered = list_entry(splice.next, struct btrfs_ordered_extent, - root_extent_list); - - list_del_init(&ordered->root_extent_list); - atomic_inc(&ordered->refs); - - /* the inode may be getting freed (in sys_unlink path). */ - inode = igrab(ordered->inode); - - spin_unlock(&root->fs_info->ordered_extent_lock); - if (inode) - iput(inode); - - atomic_set(&ordered->refs, 1); - btrfs_put_ordered_extent(ordered); - - spin_lock(&root->fs_info->ordered_extent_lock); - } - - spin_unlock(&root->fs_info->ordered_extent_lock); -} - -int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans, - struct btrfs_root *root) -{ - struct rb_node *node; - struct btrfs_delayed_ref_root *delayed_refs; - struct btrfs_delayed_ref_node *ref; - int ret = 0; - - delayed_refs = &trans->delayed_refs; - -again: - spin_lock(&delayed_refs->lock); - if (delayed_refs->num_entries == 0) { - spin_unlock(&delayed_refs->lock); - printk(KERN_INFO "delayed_refs has NO entry\n"); - return ret; - } - - node = rb_first(&delayed_refs->root); - while (node) { - ref = rb_entry(node, struct btrfs_delayed_ref_node, rb_node); - node = rb_next(node); - - ref->in_tree = 0; - rb_erase(&ref->rb_node, &delayed_refs->root); - delayed_refs->num_entries--; - - atomic_set(&ref->refs, 1); - if (btrfs_delayed_ref_is_head(ref)) { - struct btrfs_delayed_ref_head *head; - - head = btrfs_delayed_node_to_head(ref); - spin_unlock(&delayed_refs->lock); - mutex_lock(&head->mutex); - kfree(head->extent_op); - delayed_refs->num_heads--; - if (list_empty(&head->cluster)) - delayed_refs->num_heads_ready--; - list_del_init(&head->cluster); - mutex_unlock(&head->mutex); - btrfs_put_delayed_ref(ref); - goto again; - } - spin_unlock(&delayed_refs->lock); - btrfs_put_delayed_ref(ref); - - cond_resched(); - spin_lock(&delayed_refs->lock); - } - - spin_unlock(&delayed_refs->lock); - - return ret; -} - -static void btrfs_destroy_pending_snapshots(struct btrfs_transaction *t) -{ - struct btrfs_pending_snapshot *snapshot; - struct list_head splice; - - INIT_LIST_HEAD(&splice); - - list_splice_init(&t->pending_snapshots, &splice); - - while (!list_empty(&splice)) { - snapshot = list_entry(splice.next, - struct btrfs_pending_snapshot, - list); - - list_del_init(&snapshot->list); - - kfree(snapshot); - } -} - -static void btrfs_destroy_delalloc_inodes(struct btrfs_root *root) -{ - struct btrfs_inode *btrfs_inode; - struct list_head splice; - - INIT_LIST_HEAD(&splice); - - spin_lock(&root->fs_info->delalloc_lock); - list_splice_init(&root->fs_info->delalloc_inodes, &splice); - - while (!list_empty(&splice)) { - btrfs_inode = list_entry(splice.next, struct btrfs_inode, - delalloc_inodes); - - list_del_init(&btrfs_inode->delalloc_inodes); - - btrfs_invalidate_inodes(btrfs_inode->root); - } - - spin_unlock(&root->fs_info->delalloc_lock); -} - 
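The destroy helpers above and the ones that follow all share the same teardown shape: take the lock, splice the shared list onto a private head in one step, then drain the private list entry by entry. A minimal userspace sketch of that pattern follows; the types, names and the single pthread mutex are invented for illustration and are not the kernel list or locking API, and unlike the real helpers it never needs to re-take the lock for entries whose release can sleep (the way btrfs_destroy_ordered_extents drops the spinlock around iput()).

#include <stdio.h>
#include <stdlib.h>
#include <pthread.h>

/* hypothetical stand-ins for an fs_info list and the lock protecting it */
struct node {
	struct node *next;
	int payload;
};

static pthread_mutex_t list_lock = PTHREAD_MUTEX_INITIALIZER;
static struct node *shared_list;

static void destroy_all(void)
{
	struct node *splice;

	/* splice the shared list onto a private head in one shot */
	pthread_mutex_lock(&list_lock);
	splice = shared_list;
	shared_list = NULL;
	pthread_mutex_unlock(&list_lock);

	/* drain the private copy; the lock is no longer needed */
	while (splice) {
		struct node *n = splice;

		splice = n->next;
		printf("dropping entry %d\n", n->payload);
		free(n);
	}
}

int main(void)
{
	for (int i = 0; i < 3; i++) {
		struct node *n = malloc(sizeof(*n));

		n->payload = i;
		n->next = shared_list;
		shared_list = n;
	}
	destroy_all();
	return 0;
}

Splicing the whole list in one step keeps the lock hold time bounded even when the drain side does expensive work per entry.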
-static int btrfs_destroy_marked_extents(struct btrfs_root *root, - struct extent_io_tree *dirty_pages, - int mark) -{ - int ret; - struct page *page; - struct inode *btree_inode = root->fs_info->btree_inode; - struct extent_buffer *eb; - u64 start = 0; - u64 end; - u64 offset; - unsigned long index; - - while (1) { - ret = find_first_extent_bit(dirty_pages, start, &start, &end, - mark); - if (ret) - break; - - clear_extent_bits(dirty_pages, start, end, mark, GFP_NOFS); - while (start <= end) { - index = start >> PAGE_CACHE_SHIFT; - start = (u64)(index + 1) << PAGE_CACHE_SHIFT; - page = find_get_page(btree_inode->i_mapping, index); - if (!page) - continue; - offset = page_offset(page); - - spin_lock(&dirty_pages->buffer_lock); - eb = radix_tree_lookup( - &(&BTRFS_I(page->mapping->host)->io_tree)->buffer, - offset >> PAGE_CACHE_SHIFT); - spin_unlock(&dirty_pages->buffer_lock); - if (eb) { - ret = test_and_clear_bit(EXTENT_BUFFER_DIRTY, - &eb->bflags); - atomic_set(&eb->refs, 1); - } - if (PageWriteback(page)) - end_page_writeback(page); - - lock_page(page); - if (PageDirty(page)) { - clear_page_dirty_for_io(page); - spin_lock_irq(&page->mapping->tree_lock); - radix_tree_tag_clear(&page->mapping->page_tree, - page_index(page), - PAGECACHE_TAG_DIRTY); - spin_unlock_irq(&page->mapping->tree_lock); - } - - page->mapping->a_ops->invalidatepage(page, 0); - unlock_page(page); - } - } - - return ret; -} - -static int btrfs_destroy_pinned_extent(struct btrfs_root *root, - struct extent_io_tree *pinned_extents) -{ - struct extent_io_tree *unpin; - u64 start; - u64 end; - int ret; - - unpin = pinned_extents; - while (1) { - ret = find_first_extent_bit(unpin, 0, &start, &end, - EXTENT_DIRTY); - if (ret) - break; - - /* opt_discard */ - if (btrfs_test_opt(root, DISCARD)) - ret = btrfs_error_discard_extent(root, start, - end + 1 - start, - NULL); - - clear_extent_dirty(unpin, start, end, GFP_NOFS); - btrfs_error_unpin_extent_range(root, start, end); - cond_resched(); - } - - return 0; -} - -void btrfs_cleanup_one_transaction(struct btrfs_transaction *cur_trans, - struct btrfs_root *root) -{ - btrfs_destroy_delayed_refs(cur_trans, root); - btrfs_block_rsv_release(root, &root->fs_info->trans_block_rsv, - cur_trans->dirty_pages.dirty_bytes); - - /* FIXME: cleanup wait for commit */ - cur_trans->in_commit = 1; - cur_trans->blocked = 1; - if (waitqueue_active(&root->fs_info->transaction_blocked_wait)) - wake_up(&root->fs_info->transaction_blocked_wait); - - cur_trans->blocked = 0; - if (waitqueue_active(&root->fs_info->transaction_wait)) - wake_up(&root->fs_info->transaction_wait); - - cur_trans->commit_done = 1; - if (waitqueue_active(&cur_trans->commit_wait)) - wake_up(&cur_trans->commit_wait); - - btrfs_destroy_pending_snapshots(cur_trans); - - btrfs_destroy_marked_extents(root, &cur_trans->dirty_pages, - EXTENT_DIRTY); - - /* - memset(cur_trans, 0, sizeof(*cur_trans)); - kmem_cache_free(btrfs_transaction_cachep, cur_trans); - */ -} - -int btrfs_cleanup_transaction(struct btrfs_root *root) -{ - struct btrfs_transaction *t; - LIST_HEAD(list); - - mutex_lock(&root->fs_info->transaction_kthread_mutex); - - spin_lock(&root->fs_info->trans_lock); - list_splice_init(&root->fs_info->trans_list, &list); - root->fs_info->trans_no_join = 1; - spin_unlock(&root->fs_info->trans_lock); - - while (!list_empty(&list)) { - t = list_entry(list.next, struct btrfs_transaction, list); - if (!t) - break; - - btrfs_destroy_ordered_operations(root); - - btrfs_destroy_ordered_extents(root); - - btrfs_destroy_delayed_refs(t, 
root); - - btrfs_block_rsv_release(root, - &root->fs_info->trans_block_rsv, - t->dirty_pages.dirty_bytes); - - /* FIXME: cleanup wait for commit */ - t->in_commit = 1; - t->blocked = 1; - if (waitqueue_active(&root->fs_info->transaction_blocked_wait)) - wake_up(&root->fs_info->transaction_blocked_wait); - - t->blocked = 0; - if (waitqueue_active(&root->fs_info->transaction_wait)) - wake_up(&root->fs_info->transaction_wait); - - t->commit_done = 1; - if (waitqueue_active(&t->commit_wait)) - wake_up(&t->commit_wait); - - btrfs_destroy_pending_snapshots(t); - - btrfs_destroy_delalloc_inodes(root); - - spin_lock(&root->fs_info->trans_lock); - root->fs_info->running_transaction = NULL; - spin_unlock(&root->fs_info->trans_lock); - - btrfs_destroy_marked_extents(root, &t->dirty_pages, - EXTENT_DIRTY); - - btrfs_destroy_pinned_extent(root, - root->fs_info->pinned_extents); - - atomic_set(&t->use_count, 0); - list_del_init(&t->list); - memset(t, 0, sizeof(*t)); - kmem_cache_free(btrfs_transaction_cachep, t); - } - - spin_lock(&root->fs_info->trans_lock); - root->fs_info->trans_no_join = 0; - spin_unlock(&root->fs_info->trans_lock); - mutex_unlock(&root->fs_info->transaction_kthread_mutex); - - return 0; -} - -static int btree_writepage_io_failed_hook(struct bio *bio, struct page *page, - u64 start, u64 end, - struct extent_state *state) -{ - struct super_block *sb = page->mapping->host->i_sb; - struct btrfs_fs_info *fs_info = btrfs_sb(sb); - btrfs_error(fs_info, -EIO, - "Error occured while writing out btree at %llu", start); - return -EIO; -} - -static struct extent_io_ops btree_extent_io_ops = { - .write_cache_pages_lock_hook = btree_lock_page_hook, - .readpage_end_io_hook = btree_readpage_end_io_hook, - .readpage_io_failed_hook = btree_io_failed_hook, - .submit_bio_hook = btree_submit_bio_hook, - /* note we're sharing with inode.c for the merge bio hook */ - .merge_bio_hook = btrfs_merge_bio_hook, - .writepage_io_failed_hook = btree_writepage_io_failed_hook, -}; diff --git a/ANDROID_3.4.5/fs/btrfs/disk-io.h b/ANDROID_3.4.5/fs/btrfs/disk-io.h deleted file mode 100644 index ab1830aa..00000000 --- a/ANDROID_3.4.5/fs/btrfs/disk-io.h +++ /dev/null @@ -1,106 +0,0 @@ -/* - * Copyright (C) 2007 Oracle. All rights reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public - * License v2 as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public - * License along with this program; if not, write to the - * Free Software Foundation, Inc., 59 Temple Place - Suite 330, - * Boston, MA 021110-1307, USA. 
- */ - -#ifndef __DISKIO__ -#define __DISKIO__ - -#define BTRFS_SUPER_INFO_OFFSET (64 * 1024) -#define BTRFS_SUPER_INFO_SIZE 4096 - -#define BTRFS_SUPER_MIRROR_MAX 3 -#define BTRFS_SUPER_MIRROR_SHIFT 12 - -static inline u64 btrfs_sb_offset(int mirror) -{ - u64 start = 16 * 1024; - if (mirror) - return start << (BTRFS_SUPER_MIRROR_SHIFT * mirror); - return BTRFS_SUPER_INFO_OFFSET; -} - -struct btrfs_device; -struct btrfs_fs_devices; - -struct extent_buffer *read_tree_block(struct btrfs_root *root, u64 bytenr, - u32 blocksize, u64 parent_transid); -int readahead_tree_block(struct btrfs_root *root, u64 bytenr, u32 blocksize, - u64 parent_transid); -int reada_tree_block_flagged(struct btrfs_root *root, u64 bytenr, u32 blocksize, - int mirror_num, struct extent_buffer **eb); -struct extent_buffer *btrfs_find_create_tree_block(struct btrfs_root *root, - u64 bytenr, u32 blocksize); -void clean_tree_block(struct btrfs_trans_handle *trans, - struct btrfs_root *root, struct extent_buffer *buf); -int open_ctree(struct super_block *sb, - struct btrfs_fs_devices *fs_devices, - char *options); -int close_ctree(struct btrfs_root *root); -int write_ctree_super(struct btrfs_trans_handle *trans, - struct btrfs_root *root, int max_mirrors); -struct buffer_head *btrfs_read_dev_super(struct block_device *bdev); -int btrfs_commit_super(struct btrfs_root *root); -int btrfs_error_commit_super(struct btrfs_root *root); -struct extent_buffer *btrfs_find_tree_block(struct btrfs_root *root, - u64 bytenr, u32 blocksize); -struct btrfs_root *btrfs_read_fs_root_no_radix(struct btrfs_root *tree_root, - struct btrfs_key *location); -struct btrfs_root *btrfs_read_fs_root_no_name(struct btrfs_fs_info *fs_info, - struct btrfs_key *location); -int btrfs_cleanup_fs_roots(struct btrfs_fs_info *fs_info); -void btrfs_btree_balance_dirty(struct btrfs_root *root, unsigned long nr); -void __btrfs_btree_balance_dirty(struct btrfs_root *root, unsigned long nr); -void btrfs_free_fs_root(struct btrfs_fs_info *fs_info, struct btrfs_root *root); -void btrfs_mark_buffer_dirty(struct extent_buffer *buf); -int btrfs_buffer_uptodate(struct extent_buffer *buf, u64 parent_transid, - int atomic); -int btrfs_set_buffer_uptodate(struct extent_buffer *buf); -int btrfs_read_buffer(struct extent_buffer *buf, u64 parent_transid); -u32 btrfs_csum_data(struct btrfs_root *root, char *data, u32 seed, size_t len); -void btrfs_csum_final(u32 crc, char *result); -int btrfs_bio_wq_end_io(struct btrfs_fs_info *info, struct bio *bio, - int metadata); -int btrfs_wq_submit_bio(struct btrfs_fs_info *fs_info, struct inode *inode, - int rw, struct bio *bio, int mirror_num, - unsigned long bio_flags, u64 bio_offset, - extent_submit_bio_hook_t *submit_bio_start, - extent_submit_bio_hook_t *submit_bio_done); -unsigned long btrfs_async_submit_limit(struct btrfs_fs_info *info); -int btrfs_write_tree_block(struct extent_buffer *buf); -int btrfs_wait_tree_block_writeback(struct extent_buffer *buf); -int btrfs_init_log_root_tree(struct btrfs_trans_handle *trans, - struct btrfs_fs_info *fs_info); -int btrfs_add_log_tree(struct btrfs_trans_handle *trans, - struct btrfs_root *root); -int btrfs_cleanup_transaction(struct btrfs_root *root); -void btrfs_cleanup_one_transaction(struct btrfs_transaction *trans, - struct btrfs_root *root); -void btrfs_abort_devices(struct btrfs_root *root); - -#ifdef CONFIG_DEBUG_LOCK_ALLOC -void btrfs_init_lockdep(void); -void btrfs_set_buffer_lockdep_class(u64 objectid, - struct extent_buffer *eb, int level); -#else -static inline void 
btrfs_init_lockdep(void) -{ } -static inline void btrfs_set_buffer_lockdep_class(u64 objectid, - struct extent_buffer *eb, int level) -{ -} -#endif -#endif diff --git a/ANDROID_3.4.5/fs/btrfs/export.c b/ANDROID_3.4.5/fs/btrfs/export.c deleted file mode 100644 index e887ee62..00000000 --- a/ANDROID_3.4.5/fs/btrfs/export.c +++ /dev/null @@ -1,317 +0,0 @@ -#include <linux/fs.h> -#include <linux/types.h> -#include "ctree.h" -#include "disk-io.h" -#include "btrfs_inode.h" -#include "print-tree.h" -#include "export.h" -#include "compat.h" - -#define BTRFS_FID_SIZE_NON_CONNECTABLE (offsetof(struct btrfs_fid, \ - parent_objectid) / 4) -#define BTRFS_FID_SIZE_CONNECTABLE (offsetof(struct btrfs_fid, \ - parent_root_objectid) / 4) -#define BTRFS_FID_SIZE_CONNECTABLE_ROOT (sizeof(struct btrfs_fid) / 4) - -static int btrfs_encode_fh(struct dentry *dentry, u32 *fh, int *max_len, - int connectable) -{ - struct btrfs_fid *fid = (struct btrfs_fid *)fh; - struct inode *inode = dentry->d_inode; - int len = *max_len; - int type; - - if (connectable && (len < BTRFS_FID_SIZE_CONNECTABLE)) { - *max_len = BTRFS_FID_SIZE_CONNECTABLE; - return 255; - } else if (len < BTRFS_FID_SIZE_NON_CONNECTABLE) { - *max_len = BTRFS_FID_SIZE_NON_CONNECTABLE; - return 255; - } - - len = BTRFS_FID_SIZE_NON_CONNECTABLE; - type = FILEID_BTRFS_WITHOUT_PARENT; - - fid->objectid = btrfs_ino(inode); - fid->root_objectid = BTRFS_I(inode)->root->objectid; - fid->gen = inode->i_generation; - - if (connectable && !S_ISDIR(inode->i_mode)) { - struct inode *parent; - u64 parent_root_id; - - spin_lock(&dentry->d_lock); - - parent = dentry->d_parent->d_inode; - fid->parent_objectid = BTRFS_I(parent)->location.objectid; - fid->parent_gen = parent->i_generation; - parent_root_id = BTRFS_I(parent)->root->objectid; - - spin_unlock(&dentry->d_lock); - - if (parent_root_id != fid->root_objectid) { - fid->parent_root_objectid = parent_root_id; - len = BTRFS_FID_SIZE_CONNECTABLE_ROOT; - type = FILEID_BTRFS_WITH_PARENT_ROOT; - } else { - len = BTRFS_FID_SIZE_CONNECTABLE; - type = FILEID_BTRFS_WITH_PARENT; - } - } - - *max_len = len; - return type; -} - -static struct dentry *btrfs_get_dentry(struct super_block *sb, u64 objectid, - u64 root_objectid, u32 generation, - int check_generation) -{ - struct btrfs_fs_info *fs_info = btrfs_sb(sb); - struct btrfs_root *root; - struct inode *inode; - struct btrfs_key key; - int index; - int err = 0; - - if (objectid < BTRFS_FIRST_FREE_OBJECTID) - return ERR_PTR(-ESTALE); - - key.objectid = root_objectid; - btrfs_set_key_type(&key, BTRFS_ROOT_ITEM_KEY); - key.offset = (u64)-1; - - index = srcu_read_lock(&fs_info->subvol_srcu); - - root = btrfs_read_fs_root_no_name(fs_info, &key); - if (IS_ERR(root)) { - err = PTR_ERR(root); - goto fail; - } - - if (btrfs_root_refs(&root->root_item) == 0) { - err = -ENOENT; - goto fail; - } - - key.objectid = objectid; - btrfs_set_key_type(&key, BTRFS_INODE_ITEM_KEY); - key.offset = 0; - - inode = btrfs_iget(sb, &key, root, NULL); - if (IS_ERR(inode)) { - err = PTR_ERR(inode); - goto fail; - } - - srcu_read_unlock(&fs_info->subvol_srcu, index); - - if (check_generation && generation != inode->i_generation) { - iput(inode); - return ERR_PTR(-ESTALE); - } - - return d_obtain_alias(inode); -fail: - srcu_read_unlock(&fs_info->subvol_srcu, index); - return ERR_PTR(err); -} - -static struct dentry *btrfs_fh_to_parent(struct super_block *sb, struct fid *fh, - int fh_len, int fh_type) -{ - struct btrfs_fid *fid = (struct btrfs_fid *) fh; - u64 objectid, root_objectid; - u32 generation; - - 
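/*
 * fh_len here is counted in 32-bit words.  With the packed btrfs_fid
 * layout from export.h the three sizes work out to 5 words (objectid,
 * root_objectid, gen), 8 words (plus parent_objectid and parent_gen)
 * and 10 words (plus parent_root_objectid), which is what the
 * BTRFS_FID_SIZE_* checks below and in btrfs_fh_to_dentry() compare
 * against.
 */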
if (fh_type == FILEID_BTRFS_WITH_PARENT) { - if (fh_len != BTRFS_FID_SIZE_CONNECTABLE) - return NULL; - root_objectid = fid->root_objectid; - } else if (fh_type == FILEID_BTRFS_WITH_PARENT_ROOT) { - if (fh_len != BTRFS_FID_SIZE_CONNECTABLE_ROOT) - return NULL; - root_objectid = fid->parent_root_objectid; - } else - return NULL; - - objectid = fid->parent_objectid; - generation = fid->parent_gen; - - return btrfs_get_dentry(sb, objectid, root_objectid, generation, 1); -} - -static struct dentry *btrfs_fh_to_dentry(struct super_block *sb, struct fid *fh, - int fh_len, int fh_type) -{ - struct btrfs_fid *fid = (struct btrfs_fid *) fh; - u64 objectid, root_objectid; - u32 generation; - - if ((fh_type != FILEID_BTRFS_WITH_PARENT || - fh_len != BTRFS_FID_SIZE_CONNECTABLE) && - (fh_type != FILEID_BTRFS_WITH_PARENT_ROOT || - fh_len != BTRFS_FID_SIZE_CONNECTABLE_ROOT) && - (fh_type != FILEID_BTRFS_WITHOUT_PARENT || - fh_len != BTRFS_FID_SIZE_NON_CONNECTABLE)) - return NULL; - - objectid = fid->objectid; - root_objectid = fid->root_objectid; - generation = fid->gen; - - return btrfs_get_dentry(sb, objectid, root_objectid, generation, 1); -} - -static struct dentry *btrfs_get_parent(struct dentry *child) -{ - struct inode *dir = child->d_inode; - struct btrfs_root *root = BTRFS_I(dir)->root; - struct btrfs_path *path; - struct extent_buffer *leaf; - struct btrfs_root_ref *ref; - struct btrfs_key key; - struct btrfs_key found_key; - int ret; - - path = btrfs_alloc_path(); - if (!path) - return ERR_PTR(-ENOMEM); - - if (btrfs_ino(dir) == BTRFS_FIRST_FREE_OBJECTID) { - key.objectid = root->root_key.objectid; - key.type = BTRFS_ROOT_BACKREF_KEY; - key.offset = (u64)-1; - root = root->fs_info->tree_root; - } else { - key.objectid = btrfs_ino(dir); - key.type = BTRFS_INODE_REF_KEY; - key.offset = (u64)-1; - } - - ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); - if (ret < 0) - goto fail; - - BUG_ON(ret == 0); /* Key with offset of -1 found */ - if (path->slots[0] == 0) { - ret = -ENOENT; - goto fail; - } - - path->slots[0]--; - leaf = path->nodes[0]; - - btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); - if (found_key.objectid != key.objectid || found_key.type != key.type) { - ret = -ENOENT; - goto fail; - } - - if (found_key.type == BTRFS_ROOT_BACKREF_KEY) { - ref = btrfs_item_ptr(leaf, path->slots[0], - struct btrfs_root_ref); - key.objectid = btrfs_root_ref_dirid(leaf, ref); - } else { - key.objectid = found_key.offset; - } - btrfs_free_path(path); - - if (found_key.type == BTRFS_ROOT_BACKREF_KEY) { - return btrfs_get_dentry(root->fs_info->sb, key.objectid, - found_key.offset, 0, 0); - } - - key.type = BTRFS_INODE_ITEM_KEY; - key.offset = 0; - return d_obtain_alias(btrfs_iget(root->fs_info->sb, &key, root, NULL)); -fail: - btrfs_free_path(path); - return ERR_PTR(ret); -} - -static int btrfs_get_name(struct dentry *parent, char *name, - struct dentry *child) -{ - struct inode *inode = child->d_inode; - struct inode *dir = parent->d_inode; - struct btrfs_path *path; - struct btrfs_root *root = BTRFS_I(dir)->root; - struct btrfs_inode_ref *iref; - struct btrfs_root_ref *rref; - struct extent_buffer *leaf; - unsigned long name_ptr; - struct btrfs_key key; - int name_len; - int ret; - u64 ino; - - if (!dir || !inode) - return -EINVAL; - - if (!S_ISDIR(dir->i_mode)) - return -EINVAL; - - ino = btrfs_ino(inode); - - path = btrfs_alloc_path(); - if (!path) - return -ENOMEM; - path->leave_spinning = 1; - - if (ino == BTRFS_FIRST_FREE_OBJECTID) { - key.objectid = 
BTRFS_I(inode)->root->root_key.objectid; - key.type = BTRFS_ROOT_BACKREF_KEY; - key.offset = (u64)-1; - root = root->fs_info->tree_root; - } else { - key.objectid = ino; - key.offset = btrfs_ino(dir); - key.type = BTRFS_INODE_REF_KEY; - } - - ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); - if (ret < 0) { - btrfs_free_path(path); - return ret; - } else if (ret > 0) { - if (ino == BTRFS_FIRST_FREE_OBJECTID) { - path->slots[0]--; - } else { - btrfs_free_path(path); - return -ENOENT; - } - } - leaf = path->nodes[0]; - - if (ino == BTRFS_FIRST_FREE_OBJECTID) { - rref = btrfs_item_ptr(leaf, path->slots[0], - struct btrfs_root_ref); - name_ptr = (unsigned long)(rref + 1); - name_len = btrfs_root_ref_name_len(leaf, rref); - } else { - iref = btrfs_item_ptr(leaf, path->slots[0], - struct btrfs_inode_ref); - name_ptr = (unsigned long)(iref + 1); - name_len = btrfs_inode_ref_name_len(leaf, iref); - } - - read_extent_buffer(leaf, name, name_ptr, name_len); - btrfs_free_path(path); - - /* - * have to add the null termination to make sure that reconnect_path - * gets the right len for strlen - */ - name[name_len] = '\0'; - - return 0; -} - -const struct export_operations btrfs_export_ops = { - .encode_fh = btrfs_encode_fh, - .fh_to_dentry = btrfs_fh_to_dentry, - .fh_to_parent = btrfs_fh_to_parent, - .get_parent = btrfs_get_parent, - .get_name = btrfs_get_name, -}; diff --git a/ANDROID_3.4.5/fs/btrfs/export.h b/ANDROID_3.4.5/fs/btrfs/export.h deleted file mode 100644 index 074348a9..00000000 --- a/ANDROID_3.4.5/fs/btrfs/export.h +++ /dev/null @@ -1,19 +0,0 @@ -#ifndef BTRFS_EXPORT_H -#define BTRFS_EXPORT_H - -#include <linux/exportfs.h> - -extern const struct export_operations btrfs_export_ops; - -struct btrfs_fid { - u64 objectid; - u64 root_objectid; - u32 gen; - - u64 parent_objectid; - u32 parent_gen; - - u64 parent_root_objectid; -} __attribute__ ((packed)); - -#endif diff --git a/ANDROID_3.4.5/fs/btrfs/extent-tree.c b/ANDROID_3.4.5/fs/btrfs/extent-tree.c deleted file mode 100644 index 49fd7b66..00000000 --- a/ANDROID_3.4.5/fs/btrfs/extent-tree.c +++ /dev/null @@ -1,8025 +0,0 @@ -/* - * Copyright (C) 2007 Oracle. All rights reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public - * License v2 as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public - * License along with this program; if not, write to the - * Free Software Foundation, Inc., 59 Temple Place - Suite 330, - * Boston, MA 021110-1307, USA. - */ -#include <linux/sched.h> -#include <linux/pagemap.h> -#include <linux/writeback.h> -#include <linux/blkdev.h> -#include <linux/sort.h> -#include <linux/rcupdate.h> -#include <linux/kthread.h> -#include <linux/slab.h> -#include <linux/ratelimit.h> -#include "compat.h" -#include "hash.h" -#include "ctree.h" -#include "disk-io.h" -#include "print-tree.h" -#include "transaction.h" -#include "volumes.h" -#include "locking.h" -#include "free-space-cache.h" - -/* - * control flags for do_chunk_alloc's force field - * CHUNK_ALLOC_NO_FORCE means to only allocate a chunk - * if we really need one. - * - * CHUNK_ALLOC_LIMITED means to only try and allocate one - * if we have very few chunks already allocated. 
This is - * used as part of the clustering code to help make sure - * we have a good pool of storage to cluster in, without - * filling the FS with empty chunks - * - * CHUNK_ALLOC_FORCE means it must try to allocate one - * - */ -enum { - CHUNK_ALLOC_NO_FORCE = 0, - CHUNK_ALLOC_LIMITED = 1, - CHUNK_ALLOC_FORCE = 2, -}; - -/* - * Control how reservations are dealt with. - * - * RESERVE_FREE - freeing a reservation. - * RESERVE_ALLOC - allocating space and we need to update bytes_may_use for - * ENOSPC accounting - * RESERVE_ALLOC_NO_ACCOUNT - allocating space and we should not update - * bytes_may_use as the ENOSPC accounting is done elsewhere - */ -enum { - RESERVE_FREE = 0, - RESERVE_ALLOC = 1, - RESERVE_ALLOC_NO_ACCOUNT = 2, -}; - -static int update_block_group(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - u64 bytenr, u64 num_bytes, int alloc); -static int __btrfs_free_extent(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - u64 bytenr, u64 num_bytes, u64 parent, - u64 root_objectid, u64 owner_objectid, - u64 owner_offset, int refs_to_drop, - struct btrfs_delayed_extent_op *extra_op); -static void __run_delayed_extent_op(struct btrfs_delayed_extent_op *extent_op, - struct extent_buffer *leaf, - struct btrfs_extent_item *ei); -static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - u64 parent, u64 root_objectid, - u64 flags, u64 owner, u64 offset, - struct btrfs_key *ins, int ref_mod); -static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - u64 parent, u64 root_objectid, - u64 flags, struct btrfs_disk_key *key, - int level, struct btrfs_key *ins); -static int do_chunk_alloc(struct btrfs_trans_handle *trans, - struct btrfs_root *extent_root, u64 alloc_bytes, - u64 flags, int force); -static int find_next_key(struct btrfs_path *path, int level, - struct btrfs_key *key); -static void dump_space_info(struct btrfs_space_info *info, u64 bytes, - int dump_block_groups); -static int btrfs_update_reserved_bytes(struct btrfs_block_group_cache *cache, - u64 num_bytes, int reserve); - -static noinline int -block_group_cache_done(struct btrfs_block_group_cache *cache) -{ - smp_mb(); - return cache->cached == BTRFS_CACHE_FINISHED; -} - -static int block_group_bits(struct btrfs_block_group_cache *cache, u64 bits) -{ - return (cache->flags & bits) == bits; -} - -static void btrfs_get_block_group(struct btrfs_block_group_cache *cache) -{ - atomic_inc(&cache->count); -} - -void btrfs_put_block_group(struct btrfs_block_group_cache *cache) -{ - if (atomic_dec_and_test(&cache->count)) { - WARN_ON(cache->pinned > 0); - WARN_ON(cache->reserved > 0); - kfree(cache->free_space_ctl); - kfree(cache); - } -} - -/* - * this adds the block group to the fs_info rb tree for the block group - * cache - */ -static int btrfs_add_block_group_cache(struct btrfs_fs_info *info, - struct btrfs_block_group_cache *block_group) -{ - struct rb_node **p; - struct rb_node *parent = NULL; - struct btrfs_block_group_cache *cache; - - spin_lock(&info->block_group_cache_lock); - p = &info->block_group_cache_tree.rb_node; - - while (*p) { - parent = *p; - cache = rb_entry(parent, struct btrfs_block_group_cache, - cache_node); - if (block_group->key.objectid < cache->key.objectid) { - p = &(*p)->rb_left; - } else if (block_group->key.objectid > cache->key.objectid) { - p = &(*p)->rb_right; - } else { - spin_unlock(&info->block_group_cache_lock); - return -EEXIST; - } - } - - rb_link_node(&block_group->cache_node, parent, 
p); - rb_insert_color(&block_group->cache_node, - &info->block_group_cache_tree); - spin_unlock(&info->block_group_cache_lock); - - return 0; -} - -/* - * This will return the block group at or after bytenr if contains is 0, else - * it will return the block group that contains the bytenr - */ -static struct btrfs_block_group_cache * -block_group_cache_tree_search(struct btrfs_fs_info *info, u64 bytenr, - int contains) -{ - struct btrfs_block_group_cache *cache, *ret = NULL; - struct rb_node *n; - u64 end, start; - - spin_lock(&info->block_group_cache_lock); - n = info->block_group_cache_tree.rb_node; - - while (n) { - cache = rb_entry(n, struct btrfs_block_group_cache, - cache_node); - end = cache->key.objectid + cache->key.offset - 1; - start = cache->key.objectid; - - if (bytenr < start) { - if (!contains && (!ret || start < ret->key.objectid)) - ret = cache; - n = n->rb_left; - } else if (bytenr > start) { - if (contains && bytenr <= end) { - ret = cache; - break; - } - n = n->rb_right; - } else { - ret = cache; - break; - } - } - if (ret) - btrfs_get_block_group(ret); - spin_unlock(&info->block_group_cache_lock); - - return ret; -} - -static int add_excluded_extent(struct btrfs_root *root, - u64 start, u64 num_bytes) -{ - u64 end = start + num_bytes - 1; - set_extent_bits(&root->fs_info->freed_extents[0], - start, end, EXTENT_UPTODATE, GFP_NOFS); - set_extent_bits(&root->fs_info->freed_extents[1], - start, end, EXTENT_UPTODATE, GFP_NOFS); - return 0; -} - -static void free_excluded_extents(struct btrfs_root *root, - struct btrfs_block_group_cache *cache) -{ - u64 start, end; - - start = cache->key.objectid; - end = start + cache->key.offset - 1; - - clear_extent_bits(&root->fs_info->freed_extents[0], - start, end, EXTENT_UPTODATE, GFP_NOFS); - clear_extent_bits(&root->fs_info->freed_extents[1], - start, end, EXTENT_UPTODATE, GFP_NOFS); -} - -static int exclude_super_stripes(struct btrfs_root *root, - struct btrfs_block_group_cache *cache) -{ - u64 bytenr; - u64 *logical; - int stripe_len; - int i, nr, ret; - - if (cache->key.objectid < BTRFS_SUPER_INFO_OFFSET) { - stripe_len = BTRFS_SUPER_INFO_OFFSET - cache->key.objectid; - cache->bytes_super += stripe_len; - ret = add_excluded_extent(root, cache->key.objectid, - stripe_len); - BUG_ON(ret); /* -ENOMEM */ - } - - for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) { - bytenr = btrfs_sb_offset(i); - ret = btrfs_rmap_block(&root->fs_info->mapping_tree, - cache->key.objectid, bytenr, - 0, &logical, &nr, &stripe_len); - BUG_ON(ret); /* -ENOMEM */ - - while (nr--) { - cache->bytes_super += stripe_len; - ret = add_excluded_extent(root, logical[nr], - stripe_len); - BUG_ON(ret); /* -ENOMEM */ - } - - kfree(logical); - } - return 0; -} - -static struct btrfs_caching_control * -get_caching_control(struct btrfs_block_group_cache *cache) -{ - struct btrfs_caching_control *ctl; - - spin_lock(&cache->lock); - if (cache->cached != BTRFS_CACHE_STARTED) { - spin_unlock(&cache->lock); - return NULL; - } - - /* We're loading it the fast way, so we don't have a caching_ctl. 
*/ - if (!cache->caching_ctl) { - spin_unlock(&cache->lock); - return NULL; - } - - ctl = cache->caching_ctl; - atomic_inc(&ctl->count); - spin_unlock(&cache->lock); - return ctl; -} - -static void put_caching_control(struct btrfs_caching_control *ctl) -{ - if (atomic_dec_and_test(&ctl->count)) - kfree(ctl); -} - -/* - * this is only called by cache_block_group, since we could have freed extents - * we need to check the pinned_extents for any extents that can't be used yet - * since their free space will be released as soon as the transaction commits. - */ -static u64 add_new_free_space(struct btrfs_block_group_cache *block_group, - struct btrfs_fs_info *info, u64 start, u64 end) -{ - u64 extent_start, extent_end, size, total_added = 0; - int ret; - - while (start < end) { - ret = find_first_extent_bit(info->pinned_extents, start, - &extent_start, &extent_end, - EXTENT_DIRTY | EXTENT_UPTODATE); - if (ret) - break; - - if (extent_start <= start) { - start = extent_end + 1; - } else if (extent_start > start && extent_start < end) { - size = extent_start - start; - total_added += size; - ret = btrfs_add_free_space(block_group, start, - size); - BUG_ON(ret); /* -ENOMEM or logic error */ - start = extent_end + 1; - } else { - break; - } - } - - if (start < end) { - size = end - start; - total_added += size; - ret = btrfs_add_free_space(block_group, start, size); - BUG_ON(ret); /* -ENOMEM or logic error */ - } - - return total_added; -} - -static noinline void caching_thread(struct btrfs_work *work) -{ - struct btrfs_block_group_cache *block_group; - struct btrfs_fs_info *fs_info; - struct btrfs_caching_control *caching_ctl; - struct btrfs_root *extent_root; - struct btrfs_path *path; - struct extent_buffer *leaf; - struct btrfs_key key; - u64 total_found = 0; - u64 last = 0; - u32 nritems; - int ret = 0; - - caching_ctl = container_of(work, struct btrfs_caching_control, work); - block_group = caching_ctl->block_group; - fs_info = block_group->fs_info; - extent_root = fs_info->extent_root; - - path = btrfs_alloc_path(); - if (!path) - goto out; - - last = max_t(u64, block_group->key.objectid, BTRFS_SUPER_INFO_OFFSET); - - /* - * We don't want to deadlock with somebody trying to allocate a new - * extent for the extent root while also trying to search the extent - * root to add free space. 
So we skip locking and search the commit - * root, since its read-only - */ - path->skip_locking = 1; - path->search_commit_root = 1; - path->reada = 1; - - key.objectid = last; - key.offset = 0; - key.type = BTRFS_EXTENT_ITEM_KEY; -again: - mutex_lock(&caching_ctl->mutex); - /* need to make sure the commit_root doesn't disappear */ - down_read(&fs_info->extent_commit_sem); - - ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0); - if (ret < 0) - goto err; - - leaf = path->nodes[0]; - nritems = btrfs_header_nritems(leaf); - - while (1) { - if (btrfs_fs_closing(fs_info) > 1) { - last = (u64)-1; - break; - } - - if (path->slots[0] < nritems) { - btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); - } else { - ret = find_next_key(path, 0, &key); - if (ret) - break; - - if (need_resched() || - btrfs_next_leaf(extent_root, path)) { - caching_ctl->progress = last; - btrfs_release_path(path); - up_read(&fs_info->extent_commit_sem); - mutex_unlock(&caching_ctl->mutex); - cond_resched(); - goto again; - } - leaf = path->nodes[0]; - nritems = btrfs_header_nritems(leaf); - continue; - } - - if (key.objectid < block_group->key.objectid) { - path->slots[0]++; - continue; - } - - if (key.objectid >= block_group->key.objectid + - block_group->key.offset) - break; - - if (key.type == BTRFS_EXTENT_ITEM_KEY) { - total_found += add_new_free_space(block_group, - fs_info, last, - key.objectid); - last = key.objectid + key.offset; - - if (total_found > (1024 * 1024 * 2)) { - total_found = 0; - wake_up(&caching_ctl->wait); - } - } - path->slots[0]++; - } - ret = 0; - - total_found += add_new_free_space(block_group, fs_info, last, - block_group->key.objectid + - block_group->key.offset); - caching_ctl->progress = (u64)-1; - - spin_lock(&block_group->lock); - block_group->caching_ctl = NULL; - block_group->cached = BTRFS_CACHE_FINISHED; - spin_unlock(&block_group->lock); - -err: - btrfs_free_path(path); - up_read(&fs_info->extent_commit_sem); - - free_excluded_extents(extent_root, block_group); - - mutex_unlock(&caching_ctl->mutex); -out: - wake_up(&caching_ctl->wait); - - put_caching_control(caching_ctl); - btrfs_put_block_group(block_group); -} - -static int cache_block_group(struct btrfs_block_group_cache *cache, - struct btrfs_trans_handle *trans, - struct btrfs_root *root, - int load_cache_only) -{ - DEFINE_WAIT(wait); - struct btrfs_fs_info *fs_info = cache->fs_info; - struct btrfs_caching_control *caching_ctl; - int ret = 0; - - caching_ctl = kzalloc(sizeof(*caching_ctl), GFP_NOFS); - if (!caching_ctl) - return -ENOMEM; - - INIT_LIST_HEAD(&caching_ctl->list); - mutex_init(&caching_ctl->mutex); - init_waitqueue_head(&caching_ctl->wait); - caching_ctl->block_group = cache; - caching_ctl->progress = cache->key.objectid; - atomic_set(&caching_ctl->count, 1); - caching_ctl->work.func = caching_thread; - - spin_lock(&cache->lock); - /* - * This should be a rare occasion, but this could happen I think in the - * case where one thread starts to load the space cache info, and then - * some other thread starts a transaction commit which tries to do an - * allocation while the other thread is still loading the space cache - * info. The previous loop should have kept us from choosing this block - * group, but if we've moved to the state where we will wait on caching - * block groups we need to first check if we're doing a fast load here, - * so we can wait for it to finish, otherwise we could end up allocating - * from a block group who's cache gets evicted for one reason or - * another. 
- */ - while (cache->cached == BTRFS_CACHE_FAST) { - struct btrfs_caching_control *ctl; - - ctl = cache->caching_ctl; - atomic_inc(&ctl->count); - prepare_to_wait(&ctl->wait, &wait, TASK_UNINTERRUPTIBLE); - spin_unlock(&cache->lock); - - schedule(); - - finish_wait(&ctl->wait, &wait); - put_caching_control(ctl); - spin_lock(&cache->lock); - } - - if (cache->cached != BTRFS_CACHE_NO) { - spin_unlock(&cache->lock); - kfree(caching_ctl); - return 0; - } - WARN_ON(cache->caching_ctl); - cache->caching_ctl = caching_ctl; - cache->cached = BTRFS_CACHE_FAST; - spin_unlock(&cache->lock); - - /* - * We can't do the read from on-disk cache during a commit since we need - * to have the normal tree locking. Also if we are currently trying to - * allocate blocks for the tree root we can't do the fast caching since - * we likely hold important locks. - */ - if (fs_info->mount_opt & BTRFS_MOUNT_SPACE_CACHE) { - ret = load_free_space_cache(fs_info, cache); - - spin_lock(&cache->lock); - if (ret == 1) { - cache->caching_ctl = NULL; - cache->cached = BTRFS_CACHE_FINISHED; - cache->last_byte_to_unpin = (u64)-1; - } else { - if (load_cache_only) { - cache->caching_ctl = NULL; - cache->cached = BTRFS_CACHE_NO; - } else { - cache->cached = BTRFS_CACHE_STARTED; - } - } - spin_unlock(&cache->lock); - wake_up(&caching_ctl->wait); - if (ret == 1) { - put_caching_control(caching_ctl); - free_excluded_extents(fs_info->extent_root, cache); - return 0; - } - } else { - /* - * We are not going to do the fast caching, set cached to the - * appropriate value and wakeup any waiters. - */ - spin_lock(&cache->lock); - if (load_cache_only) { - cache->caching_ctl = NULL; - cache->cached = BTRFS_CACHE_NO; - } else { - cache->cached = BTRFS_CACHE_STARTED; - } - spin_unlock(&cache->lock); - wake_up(&caching_ctl->wait); - } - - if (load_cache_only) { - put_caching_control(caching_ctl); - return 0; - } - - down_write(&fs_info->extent_commit_sem); - atomic_inc(&caching_ctl->count); - list_add_tail(&caching_ctl->list, &fs_info->caching_block_groups); - up_write(&fs_info->extent_commit_sem); - - btrfs_get_block_group(cache); - - btrfs_queue_worker(&fs_info->caching_workers, &caching_ctl->work); - - return ret; -} - -/* - * return the block group that starts at or after bytenr - */ -static struct btrfs_block_group_cache * -btrfs_lookup_first_block_group(struct btrfs_fs_info *info, u64 bytenr) -{ - struct btrfs_block_group_cache *cache; - - cache = block_group_cache_tree_search(info, bytenr, 0); - - return cache; -} - -/* - * return the block group that contains the given bytenr - */ -struct btrfs_block_group_cache *btrfs_lookup_block_group( - struct btrfs_fs_info *info, - u64 bytenr) -{ - struct btrfs_block_group_cache *cache; - - cache = block_group_cache_tree_search(info, bytenr, 1); - - return cache; -} - -static struct btrfs_space_info *__find_space_info(struct btrfs_fs_info *info, - u64 flags) -{ - struct list_head *head = &info->space_info; - struct btrfs_space_info *found; - - flags &= BTRFS_BLOCK_GROUP_TYPE_MASK; - - rcu_read_lock(); - list_for_each_entry_rcu(found, head, list) { - if (found->flags & flags) { - rcu_read_unlock(); - return found; - } - } - rcu_read_unlock(); - return NULL; -} - -/* - * after adding space to the filesystem, we need to clear the full flags - * on all the space infos. 
- */ -void btrfs_clear_space_info_full(struct btrfs_fs_info *info) -{ - struct list_head *head = &info->space_info; - struct btrfs_space_info *found; - - rcu_read_lock(); - list_for_each_entry_rcu(found, head, list) - found->full = 0; - rcu_read_unlock(); -} - -static u64 div_factor(u64 num, int factor) -{ - if (factor == 10) - return num; - num *= factor; - do_div(num, 10); - return num; -} - -static u64 div_factor_fine(u64 num, int factor) -{ - if (factor == 100) - return num; - num *= factor; - do_div(num, 100); - return num; -} - -u64 btrfs_find_block_group(struct btrfs_root *root, - u64 search_start, u64 search_hint, int owner) -{ - struct btrfs_block_group_cache *cache; - u64 used; - u64 last = max(search_hint, search_start); - u64 group_start = 0; - int full_search = 0; - int factor = 9; - int wrapped = 0; -again: - while (1) { - cache = btrfs_lookup_first_block_group(root->fs_info, last); - if (!cache) - break; - - spin_lock(&cache->lock); - last = cache->key.objectid + cache->key.offset; - used = btrfs_block_group_used(&cache->item); - - if ((full_search || !cache->ro) && - block_group_bits(cache, BTRFS_BLOCK_GROUP_METADATA)) { - if (used + cache->pinned + cache->reserved < - div_factor(cache->key.offset, factor)) { - group_start = cache->key.objectid; - spin_unlock(&cache->lock); - btrfs_put_block_group(cache); - goto found; - } - } - spin_unlock(&cache->lock); - btrfs_put_block_group(cache); - cond_resched(); - } - if (!wrapped) { - last = search_start; - wrapped = 1; - goto again; - } - if (!full_search && factor < 10) { - last = search_start; - full_search = 1; - factor = 10; - goto again; - } -found: - return group_start; -} - -/* simple helper to search for an existing extent at a given offset */ -int btrfs_lookup_extent(struct btrfs_root *root, u64 start, u64 len) -{ - int ret; - struct btrfs_key key; - struct btrfs_path *path; - - path = btrfs_alloc_path(); - if (!path) - return -ENOMEM; - - key.objectid = start; - key.offset = len; - btrfs_set_key_type(&key, BTRFS_EXTENT_ITEM_KEY); - ret = btrfs_search_slot(NULL, root->fs_info->extent_root, &key, path, - 0, 0); - btrfs_free_path(path); - return ret; -} - -/* - * helper function to lookup reference count and flags of extent. - * - * the head node for delayed ref is used to store the sum of all the - * reference count modifications queued up in the rbtree. the head - * node may also store the extent flags to set. This way you can check - * to see what the reference count and extent flags would be if all of - * the delayed refs are not processed. 
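[editorial aside] The comment above describes how btrfs_lookup_extent_info() combines the on-disk reference count with the pending modifications aggregated on the delayed-ref head. A small illustrative model of that combination, with plain structs standing in for the extent item and the head; none of these names are the kernel types:

#include <stdio.h>
#include <stdint.h>

struct toy_extent_item { uint64_t refs; uint64_t flags; };
struct toy_ref_head    { int64_t ref_mod; uint64_t flags_to_set; int update_flags; };

/* What the refcount/flags would be once all queued delayed refs are run. */
static void effective_ref_state(const struct toy_extent_item *ei,
                                const struct toy_ref_head *head,
                                uint64_t *refs, uint64_t *flags)
{
    *refs = ei->refs;
    *flags = ei->flags;
    if (head) {
        *refs += head->ref_mod;             /* head aggregates all +1/-1 mods */
        if (head->update_flags)
            *flags |= head->flags_to_set;   /* flags queued but not yet written */
    }
}

int main(void)
{
    struct toy_extent_item ei = { .refs = 3, .flags = 0x1 };
    struct toy_ref_head head = { .ref_mod = -2, .flags_to_set = 0x4, .update_flags = 1 };
    uint64_t refs, flags;

    effective_ref_state(&ei, &head, &refs, &flags);
    printf("refs=%llu flags=0x%llx\n",
           (unsigned long long)refs, (unsigned long long)flags);
    return 0;
}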
- */ -int btrfs_lookup_extent_info(struct btrfs_trans_handle *trans, - struct btrfs_root *root, u64 bytenr, - u64 num_bytes, u64 *refs, u64 *flags) -{ - struct btrfs_delayed_ref_head *head; - struct btrfs_delayed_ref_root *delayed_refs; - struct btrfs_path *path; - struct btrfs_extent_item *ei; - struct extent_buffer *leaf; - struct btrfs_key key; - u32 item_size; - u64 num_refs; - u64 extent_flags; - int ret; - - path = btrfs_alloc_path(); - if (!path) - return -ENOMEM; - - key.objectid = bytenr; - key.type = BTRFS_EXTENT_ITEM_KEY; - key.offset = num_bytes; - if (!trans) { - path->skip_locking = 1; - path->search_commit_root = 1; - } -again: - ret = btrfs_search_slot(trans, root->fs_info->extent_root, - &key, path, 0, 0); - if (ret < 0) - goto out_free; - - if (ret == 0) { - leaf = path->nodes[0]; - item_size = btrfs_item_size_nr(leaf, path->slots[0]); - if (item_size >= sizeof(*ei)) { - ei = btrfs_item_ptr(leaf, path->slots[0], - struct btrfs_extent_item); - num_refs = btrfs_extent_refs(leaf, ei); - extent_flags = btrfs_extent_flags(leaf, ei); - } else { -#ifdef BTRFS_COMPAT_EXTENT_TREE_V0 - struct btrfs_extent_item_v0 *ei0; - BUG_ON(item_size != sizeof(*ei0)); - ei0 = btrfs_item_ptr(leaf, path->slots[0], - struct btrfs_extent_item_v0); - num_refs = btrfs_extent_refs_v0(leaf, ei0); - /* FIXME: this isn't correct for data */ - extent_flags = BTRFS_BLOCK_FLAG_FULL_BACKREF; -#else - BUG(); -#endif - } - BUG_ON(num_refs == 0); - } else { - num_refs = 0; - extent_flags = 0; - ret = 0; - } - - if (!trans) - goto out; - - delayed_refs = &trans->transaction->delayed_refs; - spin_lock(&delayed_refs->lock); - head = btrfs_find_delayed_ref_head(trans, bytenr); - if (head) { - if (!mutex_trylock(&head->mutex)) { - atomic_inc(&head->node.refs); - spin_unlock(&delayed_refs->lock); - - btrfs_release_path(path); - - /* - * Mutex was contended, block until it's released and try - * again - */ - mutex_lock(&head->mutex); - mutex_unlock(&head->mutex); - btrfs_put_delayed_ref(&head->node); - goto again; - } - if (head->extent_op && head->extent_op->update_flags) - extent_flags |= head->extent_op->flags_to_set; - else - BUG_ON(num_refs == 0); - - num_refs += head->node.ref_mod; - mutex_unlock(&head->mutex); - } - spin_unlock(&delayed_refs->lock); -out: - WARN_ON(num_refs == 0); - if (refs) - *refs = num_refs; - if (flags) - *flags = extent_flags; -out_free: - btrfs_free_path(path); - return ret; -} - -/* - * Back reference rules. Back refs have three main goals: - * - * 1) differentiate between all holders of references to an extent so that - * when a reference is dropped we can make sure it was a valid reference - * before freeing the extent. - * - * 2) Provide enough information to quickly find the holders of an extent - * if we notice a given block is corrupted or bad. - * - * 3) Make it easy to migrate blocks for FS shrinking or storage pool - * maintenance. This is actually the same as #2, but with a slightly - * different use case. - * - * There are two kinds of back refs. The implicit back refs is optimized - * for pointers in non-shared tree blocks. For a given pointer in a block, - * back refs of this kind provide information about the block's owner tree - * and the pointer's key. These information allow us to find the block by - * b-tree searching. The full back refs is for pointers in tree blocks not - * referenced by their owner trees. The location of tree block is recorded - * in the back refs. 
Actually the full back refs is generic, and can be - * used in all cases the implicit back refs is used. The major shortcoming - * of the full back refs is its overhead. Every time a tree block gets - * COWed, we have to update back refs entry for all pointers in it. - * - * For a newly allocated tree block, we use implicit back refs for - * pointers in it. This means most tree related operations only involve - * implicit back refs. For a tree block created in old transaction, the - * only way to drop a reference to it is COW it. So we can detect the - * event that tree block loses its owner tree's reference and do the - * back refs conversion. - * - * When a tree block is COW'd through a tree, there are four cases: - * - * The reference count of the block is one and the tree is the block's - * owner tree. Nothing to do in this case. - * - * The reference count of the block is one and the tree is not the - * block's owner tree. In this case, full back refs is used for pointers - * in the block. Remove these full back refs, add implicit back refs for - * every pointers in the new block. - * - * The reference count of the block is greater than one and the tree is - * the block's owner tree. In this case, implicit back refs is used for - * pointers in the block. Add full back refs for every pointers in the - * block, increase lower level extents' reference counts. The original - * implicit back refs are entailed to the new block. - * - * The reference count of the block is greater than one and the tree is - * not the block's owner tree. Add implicit back refs for every pointer in - * the new block, increase lower level extents' reference count. - * - * Back Reference Key composing: - * - * The key objectid corresponds to the first byte in the extent, - * The key type is used to differentiate between types of back refs. - * There are different meanings of the key offset for different types - * of back refs. - * - * File extents can be referenced by: - * - * - multiple snapshots, subvolumes, or different generations in one subvol - * - different files inside a single subvolume - * - different offsets inside a file (bookend extents in file.c) - * - * The extent ref structure for the implicit back refs has fields for: - * - * - Objectid of the subvolume root - * - objectid of the file holding the reference - * - original offset in the file - * - how many bookend extents - * - * The key offset for the implicit back refs is hash of the first - * three fields. - * - * The extent ref structure for the full back refs has field for: - * - * - number of pointers in the tree leaf - * - * The key offset for the implicit back refs is the first byte of - * the tree leaf - * - * When a file extent is allocated, The implicit back refs is used. - * the fields are filled in: - * - * (root_key.objectid, inode objectid, offset in file, 1) - * - * When a file extent is removed file truncation, we find the - * corresponding implicit back refs and check the following fields: - * - * (btrfs_header_owner(leaf), inode objectid, offset in file) - * - * Btree extents can be referenced by: - * - * - Different subvolumes - * - * Both the implicit back refs and the full back refs for tree blocks - * only consist of key. The key offset for the implicit back refs is - * objectid of block's owner tree. The key offset for the full back refs - * is the first byte of parent block. - * - * When implicit back refs is used, information about the lowest key and - * level of the tree block are required. 
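[editorial aside] The key layout described above can be condensed into a few lines. This user-space sketch shows how the back-ref key type and offset are chosen from (parent, owner, root); the enum, the threshold constant and the hash are illustrative stand-ins for the real key types, BTRFS_FIRST_FREE_OBJECTID and the crc32c-based hash_extent_data_ref():

#include <stdint.h>
#include <stdio.h>

enum toy_ref_type {
    TOY_TREE_BLOCK_REF,    /* implicit ref for a tree block  */
    TOY_SHARED_BLOCK_REF,  /* full ref for a tree block      */
    TOY_EXTENT_DATA_REF,   /* implicit ref for a file extent */
    TOY_SHARED_DATA_REF,   /* full ref for a file extent     */
};

#define TOY_FIRST_FREE_OBJECTID 256ULL   /* assumption: owners below this are trees */

struct toy_key { uint64_t objectid; enum toy_ref_type type; uint64_t offset; };

/* stand-in for the crc32c hash of (root, inode objectid, file offset) */
static uint64_t toy_hash(uint64_t root, uint64_t ino, uint64_t off)
{
    return (root * 1000003) ^ (ino * 8191) ^ off;
}

static struct toy_key compose_backref_key(uint64_t bytenr, uint64_t parent,
                                          uint64_t root, uint64_t owner,
                                          uint64_t offset)
{
    struct toy_key key = { .objectid = bytenr };   /* first byte of the extent */

    if (owner < TOY_FIRST_FREE_OBJECTID) {         /* tree block */
        key.type   = parent ? TOY_SHARED_BLOCK_REF : TOY_TREE_BLOCK_REF;
        key.offset = parent ? parent : root;       /* parent block or owner tree */
    } else {                                       /* file data */
        key.type   = parent ? TOY_SHARED_DATA_REF : TOY_EXTENT_DATA_REF;
        key.offset = parent ? parent : toy_hash(root, owner, offset);
    }
    return key;
}

int main(void)
{
    struct toy_key k = compose_backref_key(1 << 20, 0, 5, 260, 0);
    printf("type=%d offset=%llu\n", k.type, (unsigned long long)k.offset);
    return 0;
}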
These information are stored in - * tree block info structure. - */ - -#ifdef BTRFS_COMPAT_EXTENT_TREE_V0 -static int convert_extent_item_v0(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct btrfs_path *path, - u64 owner, u32 extra_size) -{ - struct btrfs_extent_item *item; - struct btrfs_extent_item_v0 *ei0; - struct btrfs_extent_ref_v0 *ref0; - struct btrfs_tree_block_info *bi; - struct extent_buffer *leaf; - struct btrfs_key key; - struct btrfs_key found_key; - u32 new_size = sizeof(*item); - u64 refs; - int ret; - - leaf = path->nodes[0]; - BUG_ON(btrfs_item_size_nr(leaf, path->slots[0]) != sizeof(*ei0)); - - btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); - ei0 = btrfs_item_ptr(leaf, path->slots[0], - struct btrfs_extent_item_v0); - refs = btrfs_extent_refs_v0(leaf, ei0); - - if (owner == (u64)-1) { - while (1) { - if (path->slots[0] >= btrfs_header_nritems(leaf)) { - ret = btrfs_next_leaf(root, path); - if (ret < 0) - return ret; - BUG_ON(ret > 0); /* Corruption */ - leaf = path->nodes[0]; - } - btrfs_item_key_to_cpu(leaf, &found_key, - path->slots[0]); - BUG_ON(key.objectid != found_key.objectid); - if (found_key.type != BTRFS_EXTENT_REF_V0_KEY) { - path->slots[0]++; - continue; - } - ref0 = btrfs_item_ptr(leaf, path->slots[0], - struct btrfs_extent_ref_v0); - owner = btrfs_ref_objectid_v0(leaf, ref0); - break; - } - } - btrfs_release_path(path); - - if (owner < BTRFS_FIRST_FREE_OBJECTID) - new_size += sizeof(*bi); - - new_size -= sizeof(*ei0); - ret = btrfs_search_slot(trans, root, &key, path, - new_size + extra_size, 1); - if (ret < 0) - return ret; - BUG_ON(ret); /* Corruption */ - - btrfs_extend_item(trans, root, path, new_size); - - leaf = path->nodes[0]; - item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item); - btrfs_set_extent_refs(leaf, item, refs); - /* FIXME: get real generation */ - btrfs_set_extent_generation(leaf, item, 0); - if (owner < BTRFS_FIRST_FREE_OBJECTID) { - btrfs_set_extent_flags(leaf, item, - BTRFS_EXTENT_FLAG_TREE_BLOCK | - BTRFS_BLOCK_FLAG_FULL_BACKREF); - bi = (struct btrfs_tree_block_info *)(item + 1); - /* FIXME: get first key of the block */ - memset_extent_buffer(leaf, 0, (unsigned long)bi, sizeof(*bi)); - btrfs_set_tree_block_level(leaf, bi, (int)owner); - } else { - btrfs_set_extent_flags(leaf, item, BTRFS_EXTENT_FLAG_DATA); - } - btrfs_mark_buffer_dirty(leaf); - return 0; -} -#endif - -static u64 hash_extent_data_ref(u64 root_objectid, u64 owner, u64 offset) -{ - u32 high_crc = ~(u32)0; - u32 low_crc = ~(u32)0; - __le64 lenum; - - lenum = cpu_to_le64(root_objectid); - high_crc = crc32c(high_crc, &lenum, sizeof(lenum)); - lenum = cpu_to_le64(owner); - low_crc = crc32c(low_crc, &lenum, sizeof(lenum)); - lenum = cpu_to_le64(offset); - low_crc = crc32c(low_crc, &lenum, sizeof(lenum)); - - return ((u64)high_crc << 31) ^ (u64)low_crc; -} - -static u64 hash_extent_data_ref_item(struct extent_buffer *leaf, - struct btrfs_extent_data_ref *ref) -{ - return hash_extent_data_ref(btrfs_extent_data_ref_root(leaf, ref), - btrfs_extent_data_ref_objectid(leaf, ref), - btrfs_extent_data_ref_offset(leaf, ref)); -} - -static int match_extent_data_ref(struct extent_buffer *leaf, - struct btrfs_extent_data_ref *ref, - u64 root_objectid, u64 owner, u64 offset) -{ - if (btrfs_extent_data_ref_root(leaf, ref) != root_objectid || - btrfs_extent_data_ref_objectid(leaf, ref) != owner || - btrfs_extent_data_ref_offset(leaf, ref) != offset) - return 0; - return 1; -} - -static noinline int lookup_extent_data_ref(struct btrfs_trans_handle 
*trans, - struct btrfs_root *root, - struct btrfs_path *path, - u64 bytenr, u64 parent, - u64 root_objectid, - u64 owner, u64 offset) -{ - struct btrfs_key key; - struct btrfs_extent_data_ref *ref; - struct extent_buffer *leaf; - u32 nritems; - int ret; - int recow; - int err = -ENOENT; - - key.objectid = bytenr; - if (parent) { - key.type = BTRFS_SHARED_DATA_REF_KEY; - key.offset = parent; - } else { - key.type = BTRFS_EXTENT_DATA_REF_KEY; - key.offset = hash_extent_data_ref(root_objectid, - owner, offset); - } -again: - recow = 0; - ret = btrfs_search_slot(trans, root, &key, path, -1, 1); - if (ret < 0) { - err = ret; - goto fail; - } - - if (parent) { - if (!ret) - return 0; -#ifdef BTRFS_COMPAT_EXTENT_TREE_V0 - key.type = BTRFS_EXTENT_REF_V0_KEY; - btrfs_release_path(path); - ret = btrfs_search_slot(trans, root, &key, path, -1, 1); - if (ret < 0) { - err = ret; - goto fail; - } - if (!ret) - return 0; -#endif - goto fail; - } - - leaf = path->nodes[0]; - nritems = btrfs_header_nritems(leaf); - while (1) { - if (path->slots[0] >= nritems) { - ret = btrfs_next_leaf(root, path); - if (ret < 0) - err = ret; - if (ret) - goto fail; - - leaf = path->nodes[0]; - nritems = btrfs_header_nritems(leaf); - recow = 1; - } - - btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); - if (key.objectid != bytenr || - key.type != BTRFS_EXTENT_DATA_REF_KEY) - goto fail; - - ref = btrfs_item_ptr(leaf, path->slots[0], - struct btrfs_extent_data_ref); - - if (match_extent_data_ref(leaf, ref, root_objectid, - owner, offset)) { - if (recow) { - btrfs_release_path(path); - goto again; - } - err = 0; - break; - } - path->slots[0]++; - } -fail: - return err; -} - -static noinline int insert_extent_data_ref(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct btrfs_path *path, - u64 bytenr, u64 parent, - u64 root_objectid, u64 owner, - u64 offset, int refs_to_add) -{ - struct btrfs_key key; - struct extent_buffer *leaf; - u32 size; - u32 num_refs; - int ret; - - key.objectid = bytenr; - if (parent) { - key.type = BTRFS_SHARED_DATA_REF_KEY; - key.offset = parent; - size = sizeof(struct btrfs_shared_data_ref); - } else { - key.type = BTRFS_EXTENT_DATA_REF_KEY; - key.offset = hash_extent_data_ref(root_objectid, - owner, offset); - size = sizeof(struct btrfs_extent_data_ref); - } - - ret = btrfs_insert_empty_item(trans, root, path, &key, size); - if (ret && ret != -EEXIST) - goto fail; - - leaf = path->nodes[0]; - if (parent) { - struct btrfs_shared_data_ref *ref; - ref = btrfs_item_ptr(leaf, path->slots[0], - struct btrfs_shared_data_ref); - if (ret == 0) { - btrfs_set_shared_data_ref_count(leaf, ref, refs_to_add); - } else { - num_refs = btrfs_shared_data_ref_count(leaf, ref); - num_refs += refs_to_add; - btrfs_set_shared_data_ref_count(leaf, ref, num_refs); - } - } else { - struct btrfs_extent_data_ref *ref; - while (ret == -EEXIST) { - ref = btrfs_item_ptr(leaf, path->slots[0], - struct btrfs_extent_data_ref); - if (match_extent_data_ref(leaf, ref, root_objectid, - owner, offset)) - break; - btrfs_release_path(path); - key.offset++; - ret = btrfs_insert_empty_item(trans, root, path, &key, - size); - if (ret && ret != -EEXIST) - goto fail; - - leaf = path->nodes[0]; - } - ref = btrfs_item_ptr(leaf, path->slots[0], - struct btrfs_extent_data_ref); - if (ret == 0) { - btrfs_set_extent_data_ref_root(leaf, ref, - root_objectid); - btrfs_set_extent_data_ref_objectid(leaf, ref, owner); - btrfs_set_extent_data_ref_offset(leaf, ref, offset); - btrfs_set_extent_data_ref_count(leaf, ref, refs_to_add); - } else 
{ - num_refs = btrfs_extent_data_ref_count(leaf, ref); - num_refs += refs_to_add; - btrfs_set_extent_data_ref_count(leaf, ref, num_refs); - } - } - btrfs_mark_buffer_dirty(leaf); - ret = 0; -fail: - btrfs_release_path(path); - return ret; -} - -static noinline int remove_extent_data_ref(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct btrfs_path *path, - int refs_to_drop) -{ - struct btrfs_key key; - struct btrfs_extent_data_ref *ref1 = NULL; - struct btrfs_shared_data_ref *ref2 = NULL; - struct extent_buffer *leaf; - u32 num_refs = 0; - int ret = 0; - - leaf = path->nodes[0]; - btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); - - if (key.type == BTRFS_EXTENT_DATA_REF_KEY) { - ref1 = btrfs_item_ptr(leaf, path->slots[0], - struct btrfs_extent_data_ref); - num_refs = btrfs_extent_data_ref_count(leaf, ref1); - } else if (key.type == BTRFS_SHARED_DATA_REF_KEY) { - ref2 = btrfs_item_ptr(leaf, path->slots[0], - struct btrfs_shared_data_ref); - num_refs = btrfs_shared_data_ref_count(leaf, ref2); -#ifdef BTRFS_COMPAT_EXTENT_TREE_V0 - } else if (key.type == BTRFS_EXTENT_REF_V0_KEY) { - struct btrfs_extent_ref_v0 *ref0; - ref0 = btrfs_item_ptr(leaf, path->slots[0], - struct btrfs_extent_ref_v0); - num_refs = btrfs_ref_count_v0(leaf, ref0); -#endif - } else { - BUG(); - } - - BUG_ON(num_refs < refs_to_drop); - num_refs -= refs_to_drop; - - if (num_refs == 0) { - ret = btrfs_del_item(trans, root, path); - } else { - if (key.type == BTRFS_EXTENT_DATA_REF_KEY) - btrfs_set_extent_data_ref_count(leaf, ref1, num_refs); - else if (key.type == BTRFS_SHARED_DATA_REF_KEY) - btrfs_set_shared_data_ref_count(leaf, ref2, num_refs); -#ifdef BTRFS_COMPAT_EXTENT_TREE_V0 - else { - struct btrfs_extent_ref_v0 *ref0; - ref0 = btrfs_item_ptr(leaf, path->slots[0], - struct btrfs_extent_ref_v0); - btrfs_set_ref_count_v0(leaf, ref0, num_refs); - } -#endif - btrfs_mark_buffer_dirty(leaf); - } - return ret; -} - -static noinline u32 extent_data_ref_count(struct btrfs_root *root, - struct btrfs_path *path, - struct btrfs_extent_inline_ref *iref) -{ - struct btrfs_key key; - struct extent_buffer *leaf; - struct btrfs_extent_data_ref *ref1; - struct btrfs_shared_data_ref *ref2; - u32 num_refs = 0; - - leaf = path->nodes[0]; - btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); - if (iref) { - if (btrfs_extent_inline_ref_type(leaf, iref) == - BTRFS_EXTENT_DATA_REF_KEY) { - ref1 = (struct btrfs_extent_data_ref *)(&iref->offset); - num_refs = btrfs_extent_data_ref_count(leaf, ref1); - } else { - ref2 = (struct btrfs_shared_data_ref *)(iref + 1); - num_refs = btrfs_shared_data_ref_count(leaf, ref2); - } - } else if (key.type == BTRFS_EXTENT_DATA_REF_KEY) { - ref1 = btrfs_item_ptr(leaf, path->slots[0], - struct btrfs_extent_data_ref); - num_refs = btrfs_extent_data_ref_count(leaf, ref1); - } else if (key.type == BTRFS_SHARED_DATA_REF_KEY) { - ref2 = btrfs_item_ptr(leaf, path->slots[0], - struct btrfs_shared_data_ref); - num_refs = btrfs_shared_data_ref_count(leaf, ref2); -#ifdef BTRFS_COMPAT_EXTENT_TREE_V0 - } else if (key.type == BTRFS_EXTENT_REF_V0_KEY) { - struct btrfs_extent_ref_v0 *ref0; - ref0 = btrfs_item_ptr(leaf, path->slots[0], - struct btrfs_extent_ref_v0); - num_refs = btrfs_ref_count_v0(leaf, ref0); -#endif - } else { - WARN_ON(1); - } - return num_refs; -} - -static noinline int lookup_tree_block_ref(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct btrfs_path *path, - u64 bytenr, u64 parent, - u64 root_objectid) -{ - struct btrfs_key key; - int ret; - - key.objectid = bytenr; - 
if (parent) { - key.type = BTRFS_SHARED_BLOCK_REF_KEY; - key.offset = parent; - } else { - key.type = BTRFS_TREE_BLOCK_REF_KEY; - key.offset = root_objectid; - } - - ret = btrfs_search_slot(trans, root, &key, path, -1, 1); - if (ret > 0) - ret = -ENOENT; -#ifdef BTRFS_COMPAT_EXTENT_TREE_V0 - if (ret == -ENOENT && parent) { - btrfs_release_path(path); - key.type = BTRFS_EXTENT_REF_V0_KEY; - ret = btrfs_search_slot(trans, root, &key, path, -1, 1); - if (ret > 0) - ret = -ENOENT; - } -#endif - return ret; -} - -static noinline int insert_tree_block_ref(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct btrfs_path *path, - u64 bytenr, u64 parent, - u64 root_objectid) -{ - struct btrfs_key key; - int ret; - - key.objectid = bytenr; - if (parent) { - key.type = BTRFS_SHARED_BLOCK_REF_KEY; - key.offset = parent; - } else { - key.type = BTRFS_TREE_BLOCK_REF_KEY; - key.offset = root_objectid; - } - - ret = btrfs_insert_empty_item(trans, root, path, &key, 0); - btrfs_release_path(path); - return ret; -} - -static inline int extent_ref_type(u64 parent, u64 owner) -{ - int type; - if (owner < BTRFS_FIRST_FREE_OBJECTID) { - if (parent > 0) - type = BTRFS_SHARED_BLOCK_REF_KEY; - else - type = BTRFS_TREE_BLOCK_REF_KEY; - } else { - if (parent > 0) - type = BTRFS_SHARED_DATA_REF_KEY; - else - type = BTRFS_EXTENT_DATA_REF_KEY; - } - return type; -} - -static int find_next_key(struct btrfs_path *path, int level, - struct btrfs_key *key) - -{ - for (; level < BTRFS_MAX_LEVEL; level++) { - if (!path->nodes[level]) - break; - if (path->slots[level] + 1 >= - btrfs_header_nritems(path->nodes[level])) - continue; - if (level == 0) - btrfs_item_key_to_cpu(path->nodes[level], key, - path->slots[level] + 1); - else - btrfs_node_key_to_cpu(path->nodes[level], key, - path->slots[level] + 1); - return 0; - } - return 1; -} - -/* - * look for inline back ref. if back ref is found, *ref_ret is set - * to the address of inline back ref, and 0 is returned. - * - * if back ref isn't found, *ref_ret is set to the address where it - * should be inserted, and -ENOENT is returned. - * - * if insert is true and there are too many inline back refs, the path - * points to the extent item, and -EAGAIN is returned. - * - * NOTE: inline back refs are ordered in the same way that back ref - * items in the tree are ordered. 
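[editorial aside] lookup_inline_extent_backref() below returns 0 when it finds a matching inline ref, -ENOENT with the insertion position when it does not, and -EAGAIN when a new inline ref would not fit. A stripped-down model of the ordered find-or-insert-position scan; entries here carry only a type, whereas the real refs are variable-sized and are also compared by offset or hash within a type:

#include <stdio.h>
#include <errno.h>

/*
 * Toy inline-ref area: an array of ref types kept sorted, scanned in order.
 * Returns 0 and *pos = index of the match, or -ENOENT and *pos = the index
 * where a ref of type 'want' should be inserted to keep the ordering.
 */
static int find_or_insert_pos(const int *types, int n, int want, int *pos)
{
    int i;

    for (i = 0; i < n; i++) {
        if (want < types[i])
            break;              /* passed the place where 'want' would live */
        if (want > types[i])
            continue;           /* keep scanning refs of smaller types      */
        *pos = i;               /* same type: the real code also matches    */
        return 0;               /* root/objectid/offset or parent here      */
    }
    *pos = i;
    return -ENOENT;
}

int main(void)
{
    int refs[] = { 1, 1, 3 };   /* two refs of type 1, one of type 3 */
    int pos;

    if (find_or_insert_pos(refs, 3, 2, &pos) == -ENOENT)
        printf("insert type 2 at index %d\n", pos);   /* prints index 2 */
    return 0;
}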
- */ -static noinline_for_stack -int lookup_inline_extent_backref(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct btrfs_path *path, - struct btrfs_extent_inline_ref **ref_ret, - u64 bytenr, u64 num_bytes, - u64 parent, u64 root_objectid, - u64 owner, u64 offset, int insert) -{ - struct btrfs_key key; - struct extent_buffer *leaf; - struct btrfs_extent_item *ei; - struct btrfs_extent_inline_ref *iref; - u64 flags; - u64 item_size; - unsigned long ptr; - unsigned long end; - int extra_size; - int type; - int want; - int ret; - int err = 0; - - key.objectid = bytenr; - key.type = BTRFS_EXTENT_ITEM_KEY; - key.offset = num_bytes; - - want = extent_ref_type(parent, owner); - if (insert) { - extra_size = btrfs_extent_inline_ref_size(want); - path->keep_locks = 1; - } else - extra_size = -1; - ret = btrfs_search_slot(trans, root, &key, path, extra_size, 1); - if (ret < 0) { - err = ret; - goto out; - } - if (ret && !insert) { - err = -ENOENT; - goto out; - } - BUG_ON(ret); /* Corruption */ - - leaf = path->nodes[0]; - item_size = btrfs_item_size_nr(leaf, path->slots[0]); -#ifdef BTRFS_COMPAT_EXTENT_TREE_V0 - if (item_size < sizeof(*ei)) { - if (!insert) { - err = -ENOENT; - goto out; - } - ret = convert_extent_item_v0(trans, root, path, owner, - extra_size); - if (ret < 0) { - err = ret; - goto out; - } - leaf = path->nodes[0]; - item_size = btrfs_item_size_nr(leaf, path->slots[0]); - } -#endif - BUG_ON(item_size < sizeof(*ei)); - - ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item); - flags = btrfs_extent_flags(leaf, ei); - - ptr = (unsigned long)(ei + 1); - end = (unsigned long)ei + item_size; - - if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) { - ptr += sizeof(struct btrfs_tree_block_info); - BUG_ON(ptr > end); - } else { - BUG_ON(!(flags & BTRFS_EXTENT_FLAG_DATA)); - } - - err = -ENOENT; - while (1) { - if (ptr >= end) { - WARN_ON(ptr > end); - break; - } - iref = (struct btrfs_extent_inline_ref *)ptr; - type = btrfs_extent_inline_ref_type(leaf, iref); - if (want < type) - break; - if (want > type) { - ptr += btrfs_extent_inline_ref_size(type); - continue; - } - - if (type == BTRFS_EXTENT_DATA_REF_KEY) { - struct btrfs_extent_data_ref *dref; - dref = (struct btrfs_extent_data_ref *)(&iref->offset); - if (match_extent_data_ref(leaf, dref, root_objectid, - owner, offset)) { - err = 0; - break; - } - if (hash_extent_data_ref_item(leaf, dref) < - hash_extent_data_ref(root_objectid, owner, offset)) - break; - } else { - u64 ref_offset; - ref_offset = btrfs_extent_inline_ref_offset(leaf, iref); - if (parent > 0) { - if (parent == ref_offset) { - err = 0; - break; - } - if (ref_offset < parent) - break; - } else { - if (root_objectid == ref_offset) { - err = 0; - break; - } - if (ref_offset < root_objectid) - break; - } - } - ptr += btrfs_extent_inline_ref_size(type); - } - if (err == -ENOENT && insert) { - if (item_size + extra_size >= - BTRFS_MAX_EXTENT_ITEM_SIZE(root)) { - err = -EAGAIN; - goto out; - } - /* - * To add new inline back ref, we have to make sure - * there is no corresponding back ref item. 
- * For simplicity, we just do not add new inline back - * ref if there is any kind of item for this block - */ - if (find_next_key(path, 0, &key) == 0 && - key.objectid == bytenr && - key.type < BTRFS_BLOCK_GROUP_ITEM_KEY) { - err = -EAGAIN; - goto out; - } - } - *ref_ret = (struct btrfs_extent_inline_ref *)ptr; -out: - if (insert) { - path->keep_locks = 0; - btrfs_unlock_up_safe(path, 1); - } - return err; -} - -/* - * helper to add new inline back ref - */ -static noinline_for_stack -void setup_inline_extent_backref(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct btrfs_path *path, - struct btrfs_extent_inline_ref *iref, - u64 parent, u64 root_objectid, - u64 owner, u64 offset, int refs_to_add, - struct btrfs_delayed_extent_op *extent_op) -{ - struct extent_buffer *leaf; - struct btrfs_extent_item *ei; - unsigned long ptr; - unsigned long end; - unsigned long item_offset; - u64 refs; - int size; - int type; - - leaf = path->nodes[0]; - ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item); - item_offset = (unsigned long)iref - (unsigned long)ei; - - type = extent_ref_type(parent, owner); - size = btrfs_extent_inline_ref_size(type); - - btrfs_extend_item(trans, root, path, size); - - ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item); - refs = btrfs_extent_refs(leaf, ei); - refs += refs_to_add; - btrfs_set_extent_refs(leaf, ei, refs); - if (extent_op) - __run_delayed_extent_op(extent_op, leaf, ei); - - ptr = (unsigned long)ei + item_offset; - end = (unsigned long)ei + btrfs_item_size_nr(leaf, path->slots[0]); - if (ptr < end - size) - memmove_extent_buffer(leaf, ptr + size, ptr, - end - size - ptr); - - iref = (struct btrfs_extent_inline_ref *)ptr; - btrfs_set_extent_inline_ref_type(leaf, iref, type); - if (type == BTRFS_EXTENT_DATA_REF_KEY) { - struct btrfs_extent_data_ref *dref; - dref = (struct btrfs_extent_data_ref *)(&iref->offset); - btrfs_set_extent_data_ref_root(leaf, dref, root_objectid); - btrfs_set_extent_data_ref_objectid(leaf, dref, owner); - btrfs_set_extent_data_ref_offset(leaf, dref, offset); - btrfs_set_extent_data_ref_count(leaf, dref, refs_to_add); - } else if (type == BTRFS_SHARED_DATA_REF_KEY) { - struct btrfs_shared_data_ref *sref; - sref = (struct btrfs_shared_data_ref *)(iref + 1); - btrfs_set_shared_data_ref_count(leaf, sref, refs_to_add); - btrfs_set_extent_inline_ref_offset(leaf, iref, parent); - } else if (type == BTRFS_SHARED_BLOCK_REF_KEY) { - btrfs_set_extent_inline_ref_offset(leaf, iref, parent); - } else { - btrfs_set_extent_inline_ref_offset(leaf, iref, root_objectid); - } - btrfs_mark_buffer_dirty(leaf); -} - -static int lookup_extent_backref(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct btrfs_path *path, - struct btrfs_extent_inline_ref **ref_ret, - u64 bytenr, u64 num_bytes, u64 parent, - u64 root_objectid, u64 owner, u64 offset) -{ - int ret; - - ret = lookup_inline_extent_backref(trans, root, path, ref_ret, - bytenr, num_bytes, parent, - root_objectid, owner, offset, 0); - if (ret != -ENOENT) - return ret; - - btrfs_release_path(path); - *ref_ret = NULL; - - if (owner < BTRFS_FIRST_FREE_OBJECTID) { - ret = lookup_tree_block_ref(trans, root, path, bytenr, parent, - root_objectid); - } else { - ret = lookup_extent_data_ref(trans, root, path, bytenr, parent, - root_objectid, owner, offset); - } - return ret; -} - -/* - * helper to update/remove inline back ref - */ -static noinline_for_stack -void update_inline_extent_backref(struct btrfs_trans_handle *trans, - struct 
btrfs_root *root, - struct btrfs_path *path, - struct btrfs_extent_inline_ref *iref, - int refs_to_mod, - struct btrfs_delayed_extent_op *extent_op) -{ - struct extent_buffer *leaf; - struct btrfs_extent_item *ei; - struct btrfs_extent_data_ref *dref = NULL; - struct btrfs_shared_data_ref *sref = NULL; - unsigned long ptr; - unsigned long end; - u32 item_size; - int size; - int type; - u64 refs; - - leaf = path->nodes[0]; - ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item); - refs = btrfs_extent_refs(leaf, ei); - WARN_ON(refs_to_mod < 0 && refs + refs_to_mod <= 0); - refs += refs_to_mod; - btrfs_set_extent_refs(leaf, ei, refs); - if (extent_op) - __run_delayed_extent_op(extent_op, leaf, ei); - - type = btrfs_extent_inline_ref_type(leaf, iref); - - if (type == BTRFS_EXTENT_DATA_REF_KEY) { - dref = (struct btrfs_extent_data_ref *)(&iref->offset); - refs = btrfs_extent_data_ref_count(leaf, dref); - } else if (type == BTRFS_SHARED_DATA_REF_KEY) { - sref = (struct btrfs_shared_data_ref *)(iref + 1); - refs = btrfs_shared_data_ref_count(leaf, sref); - } else { - refs = 1; - BUG_ON(refs_to_mod != -1); - } - - BUG_ON(refs_to_mod < 0 && refs < -refs_to_mod); - refs += refs_to_mod; - - if (refs > 0) { - if (type == BTRFS_EXTENT_DATA_REF_KEY) - btrfs_set_extent_data_ref_count(leaf, dref, refs); - else - btrfs_set_shared_data_ref_count(leaf, sref, refs); - } else { - size = btrfs_extent_inline_ref_size(type); - item_size = btrfs_item_size_nr(leaf, path->slots[0]); - ptr = (unsigned long)iref; - end = (unsigned long)ei + item_size; - if (ptr + size < end) - memmove_extent_buffer(leaf, ptr, ptr + size, - end - ptr - size); - item_size -= size; - btrfs_truncate_item(trans, root, path, item_size, 1); - } - btrfs_mark_buffer_dirty(leaf); -} - -static noinline_for_stack -int insert_inline_extent_backref(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct btrfs_path *path, - u64 bytenr, u64 num_bytes, u64 parent, - u64 root_objectid, u64 owner, - u64 offset, int refs_to_add, - struct btrfs_delayed_extent_op *extent_op) -{ - struct btrfs_extent_inline_ref *iref; - int ret; - - ret = lookup_inline_extent_backref(trans, root, path, &iref, - bytenr, num_bytes, parent, - root_objectid, owner, offset, 1); - if (ret == 0) { - BUG_ON(owner < BTRFS_FIRST_FREE_OBJECTID); - update_inline_extent_backref(trans, root, path, iref, - refs_to_add, extent_op); - } else if (ret == -ENOENT) { - setup_inline_extent_backref(trans, root, path, iref, parent, - root_objectid, owner, offset, - refs_to_add, extent_op); - ret = 0; - } - return ret; -} - -static int insert_extent_backref(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct btrfs_path *path, - u64 bytenr, u64 parent, u64 root_objectid, - u64 owner, u64 offset, int refs_to_add) -{ - int ret; - if (owner < BTRFS_FIRST_FREE_OBJECTID) { - BUG_ON(refs_to_add != 1); - ret = insert_tree_block_ref(trans, root, path, bytenr, - parent, root_objectid); - } else { - ret = insert_extent_data_ref(trans, root, path, bytenr, - parent, root_objectid, - owner, offset, refs_to_add); - } - return ret; -} - -static int remove_extent_backref(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct btrfs_path *path, - struct btrfs_extent_inline_ref *iref, - int refs_to_drop, int is_data) -{ - int ret = 0; - - BUG_ON(!is_data && refs_to_drop != 1); - if (iref) { - update_inline_extent_backref(trans, root, path, iref, - -refs_to_drop, NULL); - } else if (is_data) { - ret = remove_extent_data_ref(trans, root, path, refs_to_drop); - 
} else { - ret = btrfs_del_item(trans, root, path); - } - return ret; -} - -static int btrfs_issue_discard(struct block_device *bdev, - u64 start, u64 len) -{ - return blkdev_issue_discard(bdev, start >> 9, len >> 9, GFP_NOFS, 0); -} - -static int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr, - u64 num_bytes, u64 *actual_bytes) -{ - int ret; - u64 discarded_bytes = 0; - struct btrfs_bio *bbio = NULL; - - - /* Tell the block device(s) that the sectors can be discarded */ - ret = btrfs_map_block(&root->fs_info->mapping_tree, REQ_DISCARD, - bytenr, &num_bytes, &bbio, 0); - /* Error condition is -ENOMEM */ - if (!ret) { - struct btrfs_bio_stripe *stripe = bbio->stripes; - int i; - - - for (i = 0; i < bbio->num_stripes; i++, stripe++) { - if (!stripe->dev->can_discard) - continue; - - ret = btrfs_issue_discard(stripe->dev->bdev, - stripe->physical, - stripe->length); - if (!ret) - discarded_bytes += stripe->length; - else if (ret != -EOPNOTSUPP) - break; /* Logic errors or -ENOMEM, or -EIO but I don't know how that could happen JDM */ - - /* - * Just in case we get back EOPNOTSUPP for some reason, - * just ignore the return value so we don't screw up - * people calling discard_extent. - */ - ret = 0; - } - kfree(bbio); - } - - if (actual_bytes) - *actual_bytes = discarded_bytes; - - - return ret; -} - -/* Can return -ENOMEM */ -int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - u64 bytenr, u64 num_bytes, u64 parent, - u64 root_objectid, u64 owner, u64 offset, int for_cow) -{ - int ret; - struct btrfs_fs_info *fs_info = root->fs_info; - - BUG_ON(owner < BTRFS_FIRST_FREE_OBJECTID && - root_objectid == BTRFS_TREE_LOG_OBJECTID); - - if (owner < BTRFS_FIRST_FREE_OBJECTID) { - ret = btrfs_add_delayed_tree_ref(fs_info, trans, bytenr, - num_bytes, - parent, root_objectid, (int)owner, - BTRFS_ADD_DELAYED_REF, NULL, for_cow); - } else { - ret = btrfs_add_delayed_data_ref(fs_info, trans, bytenr, - num_bytes, - parent, root_objectid, owner, offset, - BTRFS_ADD_DELAYED_REF, NULL, for_cow); - } - return ret; -} - -static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - u64 bytenr, u64 num_bytes, - u64 parent, u64 root_objectid, - u64 owner, u64 offset, int refs_to_add, - struct btrfs_delayed_extent_op *extent_op) -{ - struct btrfs_path *path; - struct extent_buffer *leaf; - struct btrfs_extent_item *item; - u64 refs; - int ret; - int err = 0; - - path = btrfs_alloc_path(); - if (!path) - return -ENOMEM; - - path->reada = 1; - path->leave_spinning = 1; - /* this will setup the path even if it fails to insert the back ref */ - ret = insert_inline_extent_backref(trans, root->fs_info->extent_root, - path, bytenr, num_bytes, parent, - root_objectid, owner, offset, - refs_to_add, extent_op); - if (ret == 0) - goto out; - - if (ret != -EAGAIN) { - err = ret; - goto out; - } - - leaf = path->nodes[0]; - item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item); - refs = btrfs_extent_refs(leaf, item); - btrfs_set_extent_refs(leaf, item, refs + refs_to_add); - if (extent_op) - __run_delayed_extent_op(extent_op, leaf, item); - - btrfs_mark_buffer_dirty(leaf); - btrfs_release_path(path); - - path->reada = 1; - path->leave_spinning = 1; - - /* now insert the actual backref */ - ret = insert_extent_backref(trans, root->fs_info->extent_root, - path, bytenr, parent, root_objectid, - owner, offset, refs_to_add); - if (ret) - btrfs_abort_transaction(trans, root, ret); -out: - btrfs_free_path(path); - return err; -} - -static 
int run_delayed_data_ref(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct btrfs_delayed_ref_node *node, - struct btrfs_delayed_extent_op *extent_op, - int insert_reserved) -{ - int ret = 0; - struct btrfs_delayed_data_ref *ref; - struct btrfs_key ins; - u64 parent = 0; - u64 ref_root = 0; - u64 flags = 0; - - ins.objectid = node->bytenr; - ins.offset = node->num_bytes; - ins.type = BTRFS_EXTENT_ITEM_KEY; - - ref = btrfs_delayed_node_to_data_ref(node); - if (node->type == BTRFS_SHARED_DATA_REF_KEY) - parent = ref->parent; - else - ref_root = ref->root; - - if (node->action == BTRFS_ADD_DELAYED_REF && insert_reserved) { - if (extent_op) { - BUG_ON(extent_op->update_key); - flags |= extent_op->flags_to_set; - } - ret = alloc_reserved_file_extent(trans, root, - parent, ref_root, flags, - ref->objectid, ref->offset, - &ins, node->ref_mod); - } else if (node->action == BTRFS_ADD_DELAYED_REF) { - ret = __btrfs_inc_extent_ref(trans, root, node->bytenr, - node->num_bytes, parent, - ref_root, ref->objectid, - ref->offset, node->ref_mod, - extent_op); - } else if (node->action == BTRFS_DROP_DELAYED_REF) { - ret = __btrfs_free_extent(trans, root, node->bytenr, - node->num_bytes, parent, - ref_root, ref->objectid, - ref->offset, node->ref_mod, - extent_op); - } else { - BUG(); - } - return ret; -} - -static void __run_delayed_extent_op(struct btrfs_delayed_extent_op *extent_op, - struct extent_buffer *leaf, - struct btrfs_extent_item *ei) -{ - u64 flags = btrfs_extent_flags(leaf, ei); - if (extent_op->update_flags) { - flags |= extent_op->flags_to_set; - btrfs_set_extent_flags(leaf, ei, flags); - } - - if (extent_op->update_key) { - struct btrfs_tree_block_info *bi; - BUG_ON(!(flags & BTRFS_EXTENT_FLAG_TREE_BLOCK)); - bi = (struct btrfs_tree_block_info *)(ei + 1); - btrfs_set_tree_block_key(leaf, bi, &extent_op->key); - } -} - -static int run_delayed_extent_op(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct btrfs_delayed_ref_node *node, - struct btrfs_delayed_extent_op *extent_op) -{ - struct btrfs_key key; - struct btrfs_path *path; - struct btrfs_extent_item *ei; - struct extent_buffer *leaf; - u32 item_size; - int ret; - int err = 0; - - if (trans->aborted) - return 0; - - path = btrfs_alloc_path(); - if (!path) - return -ENOMEM; - - key.objectid = node->bytenr; - key.type = BTRFS_EXTENT_ITEM_KEY; - key.offset = node->num_bytes; - - path->reada = 1; - path->leave_spinning = 1; - ret = btrfs_search_slot(trans, root->fs_info->extent_root, &key, - path, 0, 1); - if (ret < 0) { - err = ret; - goto out; - } - if (ret > 0) { - err = -EIO; - goto out; - } - - leaf = path->nodes[0]; - item_size = btrfs_item_size_nr(leaf, path->slots[0]); -#ifdef BTRFS_COMPAT_EXTENT_TREE_V0 - if (item_size < sizeof(*ei)) { - ret = convert_extent_item_v0(trans, root->fs_info->extent_root, - path, (u64)-1, 0); - if (ret < 0) { - err = ret; - goto out; - } - leaf = path->nodes[0]; - item_size = btrfs_item_size_nr(leaf, path->slots[0]); - } -#endif - BUG_ON(item_size < sizeof(*ei)); - ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item); - __run_delayed_extent_op(extent_op, leaf, ei); - - btrfs_mark_buffer_dirty(leaf); -out: - btrfs_free_path(path); - return err; -} - -static int run_delayed_tree_ref(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct btrfs_delayed_ref_node *node, - struct btrfs_delayed_extent_op *extent_op, - int insert_reserved) -{ - int ret = 0; - struct btrfs_delayed_tree_ref *ref; - struct btrfs_key ins; - u64 parent = 0; - u64 
ref_root = 0; - - ins.objectid = node->bytenr; - ins.offset = node->num_bytes; - ins.type = BTRFS_EXTENT_ITEM_KEY; - - ref = btrfs_delayed_node_to_tree_ref(node); - if (node->type == BTRFS_SHARED_BLOCK_REF_KEY) - parent = ref->parent; - else - ref_root = ref->root; - - BUG_ON(node->ref_mod != 1); - if (node->action == BTRFS_ADD_DELAYED_REF && insert_reserved) { - BUG_ON(!extent_op || !extent_op->update_flags || - !extent_op->update_key); - ret = alloc_reserved_tree_block(trans, root, - parent, ref_root, - extent_op->flags_to_set, - &extent_op->key, - ref->level, &ins); - } else if (node->action == BTRFS_ADD_DELAYED_REF) { - ret = __btrfs_inc_extent_ref(trans, root, node->bytenr, - node->num_bytes, parent, ref_root, - ref->level, 0, 1, extent_op); - } else if (node->action == BTRFS_DROP_DELAYED_REF) { - ret = __btrfs_free_extent(trans, root, node->bytenr, - node->num_bytes, parent, ref_root, - ref->level, 0, 1, extent_op); - } else { - BUG(); - } - return ret; -} - -/* helper function to actually process a single delayed ref entry */ -static int run_one_delayed_ref(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct btrfs_delayed_ref_node *node, - struct btrfs_delayed_extent_op *extent_op, - int insert_reserved) -{ - int ret = 0; - - if (trans->aborted) - return 0; - - if (btrfs_delayed_ref_is_head(node)) { - struct btrfs_delayed_ref_head *head; - /* - * we've hit the end of the chain and we were supposed - * to insert this extent into the tree. But, it got - * deleted before we ever needed to insert it, so all - * we have to do is clean up the accounting - */ - BUG_ON(extent_op); - head = btrfs_delayed_node_to_head(node); - if (insert_reserved) { - btrfs_pin_extent(root, node->bytenr, - node->num_bytes, 1); - if (head->is_data) { - ret = btrfs_del_csums(trans, root, - node->bytenr, - node->num_bytes); - } - } - mutex_unlock(&head->mutex); - return ret; - } - - if (node->type == BTRFS_TREE_BLOCK_REF_KEY || - node->type == BTRFS_SHARED_BLOCK_REF_KEY) - ret = run_delayed_tree_ref(trans, root, node, extent_op, - insert_reserved); - else if (node->type == BTRFS_EXTENT_DATA_REF_KEY || - node->type == BTRFS_SHARED_DATA_REF_KEY) - ret = run_delayed_data_ref(trans, root, node, extent_op, - insert_reserved); - else - BUG(); - return ret; -} - -static noinline struct btrfs_delayed_ref_node * -select_delayed_ref(struct btrfs_delayed_ref_head *head) -{ - struct rb_node *node; - struct btrfs_delayed_ref_node *ref; - int action = BTRFS_ADD_DELAYED_REF; -again: - /* - * select delayed ref of type BTRFS_ADD_DELAYED_REF first. - * this prevents ref count from going down to zero when - * there still are pending delayed ref. - */ - node = rb_prev(&head->node.rb_node); - while (1) { - if (!node) - break; - ref = rb_entry(node, struct btrfs_delayed_ref_node, - rb_node); - if (ref->bytenr != head->node.bytenr) - break; - if (ref->action == action) - return ref; - node = rb_prev(node); - } - if (action == BTRFS_ADD_DELAYED_REF) { - action = BTRFS_DROP_DELAYED_REF; - goto again; - } - return NULL; -} - -/* - * Returns 0 on success or if called with an already aborted transaction. - * Returns -ENOMEM or -EIO on failure and will abort the transaction. 
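[editorial aside] select_delayed_ref() above deliberately picks BTRFS_ADD_DELAYED_REF entries before BTRFS_DROP_DELAYED_REF entries so the reference count never dips to zero while other modifications for the same extent are still pending. A toy demonstration of why that ordering matters, with signed +1/-1 integers standing in for the delayed ref nodes:

#include <stdio.h>

/* Apply all +1 mods first, then the -1 mods, tracking the running count. */
static int apply_adds_first(int refs, const int *mods, int n)
{
    for (int pass = 0; pass < 2; pass++) {          /* pass 0: adds, pass 1: drops */
        for (int i = 0; i < n; i++) {
            if ((pass == 0) != (mods[i] > 0))
                continue;
            refs += mods[i];
            printf("mod %+d -> refs=%d\n", mods[i], refs);
        }
    }
    return refs;
}

int main(void)
{
    /* One existing ref, a pending drop and a pending add: naive in-order
     * processing would hit zero after the drop; adds-first never does. */
    int mods[] = { -1, +1 };
    apply_adds_first(1, mods, 2);
    return 0;
}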
- */ -static noinline int run_clustered_refs(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct list_head *cluster) -{ - struct btrfs_delayed_ref_root *delayed_refs; - struct btrfs_delayed_ref_node *ref; - struct btrfs_delayed_ref_head *locked_ref = NULL; - struct btrfs_delayed_extent_op *extent_op; - int ret; - int count = 0; - int must_insert_reserved = 0; - - delayed_refs = &trans->transaction->delayed_refs; - while (1) { - if (!locked_ref) { - /* pick a new head ref from the cluster list */ - if (list_empty(cluster)) - break; - - locked_ref = list_entry(cluster->next, - struct btrfs_delayed_ref_head, cluster); - - /* grab the lock that says we are going to process - * all the refs for this head */ - ret = btrfs_delayed_ref_lock(trans, locked_ref); - - /* - * we may have dropped the spin lock to get the head - * mutex lock, and that might have given someone else - * time to free the head. If that's true, it has been - * removed from our list and we can move on. - */ - if (ret == -EAGAIN) { - locked_ref = NULL; - count++; - continue; - } - } - - /* - * locked_ref is the head node, so we have to go one - * node back for any delayed ref updates - */ - ref = select_delayed_ref(locked_ref); - - if (ref && ref->seq && - btrfs_check_delayed_seq(delayed_refs, ref->seq)) { - /* - * there are still refs with lower seq numbers in the - * process of being added. Don't run this ref yet. - */ - list_del_init(&locked_ref->cluster); - mutex_unlock(&locked_ref->mutex); - locked_ref = NULL; - delayed_refs->num_heads_ready++; - spin_unlock(&delayed_refs->lock); - cond_resched(); - spin_lock(&delayed_refs->lock); - continue; - } - - /* - * record the must insert reserved flag before we - * drop the spin lock. - */ - must_insert_reserved = locked_ref->must_insert_reserved; - locked_ref->must_insert_reserved = 0; - - extent_op = locked_ref->extent_op; - locked_ref->extent_op = NULL; - - if (!ref) { - /* All delayed refs have been processed, Go ahead - * and send the head node to run_one_delayed_ref, - * so that any accounting fixes can happen - */ - ref = &locked_ref->node; - - if (extent_op && must_insert_reserved) { - kfree(extent_op); - extent_op = NULL; - } - - if (extent_op) { - spin_unlock(&delayed_refs->lock); - - ret = run_delayed_extent_op(trans, root, - ref, extent_op); - kfree(extent_op); - - if (ret) { - printk(KERN_DEBUG "btrfs: run_delayed_extent_op returned %d\n", ret); - spin_lock(&delayed_refs->lock); - return ret; - } - - goto next; - } - - list_del_init(&locked_ref->cluster); - locked_ref = NULL; - } - - ref->in_tree = 0; - rb_erase(&ref->rb_node, &delayed_refs->root); - delayed_refs->num_entries--; - /* - * we modified num_entries, but as we're currently running - * delayed refs, skip - * wake_up(&delayed_refs->seq_wait); - * here. 
- */ - spin_unlock(&delayed_refs->lock); - - ret = run_one_delayed_ref(trans, root, ref, extent_op, - must_insert_reserved); - - btrfs_put_delayed_ref(ref); - kfree(extent_op); - count++; - - if (ret) { - printk(KERN_DEBUG "btrfs: run_one_delayed_ref returned %d\n", ret); - spin_lock(&delayed_refs->lock); - return ret; - } - -next: - do_chunk_alloc(trans, root->fs_info->extent_root, - 2 * 1024 * 1024, - btrfs_get_alloc_profile(root, 0), - CHUNK_ALLOC_NO_FORCE); - cond_resched(); - spin_lock(&delayed_refs->lock); - } - return count; -} - - -static void wait_for_more_refs(struct btrfs_delayed_ref_root *delayed_refs, - unsigned long num_refs) -{ - struct list_head *first_seq = delayed_refs->seq_head.next; - - spin_unlock(&delayed_refs->lock); - pr_debug("waiting for more refs (num %ld, first %p)\n", - num_refs, first_seq); - wait_event(delayed_refs->seq_wait, - num_refs != delayed_refs->num_entries || - delayed_refs->seq_head.next != first_seq); - pr_debug("done waiting for more refs (num %ld, first %p)\n", - delayed_refs->num_entries, delayed_refs->seq_head.next); - spin_lock(&delayed_refs->lock); -} - -/* - * this starts processing the delayed reference count updates and - * extent insertions we have queued up so far. count can be - * 0, which means to process everything in the tree at the start - * of the run (but not newly added entries), or it can be some target - * number you'd like to process. - * - * Returns 0 on success or if called with an aborted transaction - * Returns <0 on error and aborts the transaction - */ -int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans, - struct btrfs_root *root, unsigned long count) -{ - struct rb_node *node; - struct btrfs_delayed_ref_root *delayed_refs; - struct btrfs_delayed_ref_node *ref; - struct list_head cluster; - int ret; - u64 delayed_start; - int run_all = count == (unsigned long)-1; - int run_most = 0; - unsigned long num_refs = 0; - int consider_waiting; - - /* We'll clean this up in btrfs_cleanup_transaction */ - if (trans->aborted) - return 0; - - if (root == root->fs_info->extent_root) - root = root->fs_info->tree_root; - - do_chunk_alloc(trans, root->fs_info->extent_root, - 2 * 1024 * 1024, btrfs_get_alloc_profile(root, 0), - CHUNK_ALLOC_NO_FORCE); - - delayed_refs = &trans->transaction->delayed_refs; - INIT_LIST_HEAD(&cluster); -again: - consider_waiting = 0; - spin_lock(&delayed_refs->lock); - if (count == 0) { - count = delayed_refs->num_entries * 2; - run_most = 1; - } - while (1) { - if (!(run_all || run_most) && - delayed_refs->num_heads_ready < 64) - break; - - /* - * go find something we can process in the rbtree. We start at - * the beginning of the tree, and then build a cluster - * of refs to process starting at the first one we are able to - * lock - */ - delayed_start = delayed_refs->run_delayed_start; - ret = btrfs_find_ref_cluster(trans, &cluster, - delayed_refs->run_delayed_start); - if (ret) - break; - - if (delayed_start >= delayed_refs->run_delayed_start) { - if (consider_waiting == 0) { - /* - * btrfs_find_ref_cluster looped. let's do one - * more cycle. if we don't run any delayed ref - * during that cycle (because we can't because - * all of them are blocked) and if the number of - * refs doesn't change, we avoid busy waiting. - */ - consider_waiting = 1; - num_refs = delayed_refs->num_entries; - } else { - wait_for_more_refs(delayed_refs, num_refs); - /* - * after waiting, things have changed. we - * dropped the lock and someone else might have - * run some refs, built new clusters and so on. 
- * therefore, we restart staleness detection. - */ - consider_waiting = 0; - } - } - - ret = run_clustered_refs(trans, root, &cluster); - if (ret < 0) { - spin_unlock(&delayed_refs->lock); - btrfs_abort_transaction(trans, root, ret); - return ret; - } - - count -= min_t(unsigned long, ret, count); - - if (count == 0) - break; - - if (ret || delayed_refs->run_delayed_start == 0) { - /* refs were run, let's reset staleness detection */ - consider_waiting = 0; - } - } - - if (run_all) { - node = rb_first(&delayed_refs->root); - if (!node) - goto out; - count = (unsigned long)-1; - - while (node) { - ref = rb_entry(node, struct btrfs_delayed_ref_node, - rb_node); - if (btrfs_delayed_ref_is_head(ref)) { - struct btrfs_delayed_ref_head *head; - - head = btrfs_delayed_node_to_head(ref); - atomic_inc(&ref->refs); - - spin_unlock(&delayed_refs->lock); - /* - * Mutex was contended, block until it's - * released and try again - */ - mutex_lock(&head->mutex); - mutex_unlock(&head->mutex); - - btrfs_put_delayed_ref(ref); - cond_resched(); - goto again; - } - node = rb_next(node); - } - spin_unlock(&delayed_refs->lock); - schedule_timeout(1); - goto again; - } -out: - spin_unlock(&delayed_refs->lock); - return 0; -} - -int btrfs_set_disk_extent_flags(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - u64 bytenr, u64 num_bytes, u64 flags, - int is_data) -{ - struct btrfs_delayed_extent_op *extent_op; - int ret; - - extent_op = kmalloc(sizeof(*extent_op), GFP_NOFS); - if (!extent_op) - return -ENOMEM; - - extent_op->flags_to_set = flags; - extent_op->update_flags = 1; - extent_op->update_key = 0; - extent_op->is_data = is_data ? 1 : 0; - - ret = btrfs_add_delayed_extent_op(root->fs_info, trans, bytenr, - num_bytes, extent_op); - if (ret) - kfree(extent_op); - return ret; -} - -static noinline int check_delayed_ref(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct btrfs_path *path, - u64 objectid, u64 offset, u64 bytenr) -{ - struct btrfs_delayed_ref_head *head; - struct btrfs_delayed_ref_node *ref; - struct btrfs_delayed_data_ref *data_ref; - struct btrfs_delayed_ref_root *delayed_refs; - struct rb_node *node; - int ret = 0; - - ret = -ENOENT; - delayed_refs = &trans->transaction->delayed_refs; - spin_lock(&delayed_refs->lock); - head = btrfs_find_delayed_ref_head(trans, bytenr); - if (!head) - goto out; - - if (!mutex_trylock(&head->mutex)) { - atomic_inc(&head->node.refs); - spin_unlock(&delayed_refs->lock); - - btrfs_release_path(path); - - /* - * Mutex was contended, block until it's released and let - * caller try again - */ - mutex_lock(&head->mutex); - mutex_unlock(&head->mutex); - btrfs_put_delayed_ref(&head->node); - return -EAGAIN; - } - - node = rb_prev(&head->node.rb_node); - if (!node) - goto out_unlock; - - ref = rb_entry(node, struct btrfs_delayed_ref_node, rb_node); - - if (ref->bytenr != bytenr) - goto out_unlock; - - ret = 1; - if (ref->type != BTRFS_EXTENT_DATA_REF_KEY) - goto out_unlock; - - data_ref = btrfs_delayed_node_to_data_ref(ref); - - node = rb_prev(node); - if (node) { - ref = rb_entry(node, struct btrfs_delayed_ref_node, rb_node); - if (ref->bytenr == bytenr) - goto out_unlock; - } - - if (data_ref->root != root->root_key.objectid || - data_ref->objectid != objectid || data_ref->offset != offset) - goto out_unlock; - - ret = 0; -out_unlock: - mutex_unlock(&head->mutex); -out: - spin_unlock(&delayed_refs->lock); - return ret; -} - -static noinline int check_committed_ref(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct 
btrfs_path *path, - u64 objectid, u64 offset, u64 bytenr) -{ - struct btrfs_root *extent_root = root->fs_info->extent_root; - struct extent_buffer *leaf; - struct btrfs_extent_data_ref *ref; - struct btrfs_extent_inline_ref *iref; - struct btrfs_extent_item *ei; - struct btrfs_key key; - u32 item_size; - int ret; - - key.objectid = bytenr; - key.offset = (u64)-1; - key.type = BTRFS_EXTENT_ITEM_KEY; - - ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0); - if (ret < 0) - goto out; - BUG_ON(ret == 0); /* Corruption */ - - ret = -ENOENT; - if (path->slots[0] == 0) - goto out; - - path->slots[0]--; - leaf = path->nodes[0]; - btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); - - if (key.objectid != bytenr || key.type != BTRFS_EXTENT_ITEM_KEY) - goto out; - - ret = 1; - item_size = btrfs_item_size_nr(leaf, path->slots[0]); -#ifdef BTRFS_COMPAT_EXTENT_TREE_V0 - if (item_size < sizeof(*ei)) { - WARN_ON(item_size != sizeof(struct btrfs_extent_item_v0)); - goto out; - } -#endif - ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item); - - if (item_size != sizeof(*ei) + - btrfs_extent_inline_ref_size(BTRFS_EXTENT_DATA_REF_KEY)) - goto out; - - if (btrfs_extent_generation(leaf, ei) <= - btrfs_root_last_snapshot(&root->root_item)) - goto out; - - iref = (struct btrfs_extent_inline_ref *)(ei + 1); - if (btrfs_extent_inline_ref_type(leaf, iref) != - BTRFS_EXTENT_DATA_REF_KEY) - goto out; - - ref = (struct btrfs_extent_data_ref *)(&iref->offset); - if (btrfs_extent_refs(leaf, ei) != - btrfs_extent_data_ref_count(leaf, ref) || - btrfs_extent_data_ref_root(leaf, ref) != - root->root_key.objectid || - btrfs_extent_data_ref_objectid(leaf, ref) != objectid || - btrfs_extent_data_ref_offset(leaf, ref) != offset) - goto out; - - ret = 0; -out: - return ret; -} - -int btrfs_cross_ref_exist(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - u64 objectid, u64 offset, u64 bytenr) -{ - struct btrfs_path *path; - int ret; - int ret2; - - path = btrfs_alloc_path(); - if (!path) - return -ENOENT; - - do { - ret = check_committed_ref(trans, root, path, objectid, - offset, bytenr); - if (ret && ret != -ENOENT) - goto out; - - ret2 = check_delayed_ref(trans, root, path, objectid, - offset, bytenr); - } while (ret2 == -EAGAIN); - - if (ret2 && ret2 != -ENOENT) { - ret = ret2; - goto out; - } - - if (ret != -ENOENT || ret2 != -ENOENT) - ret = 0; -out: - btrfs_free_path(path); - if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID) - WARN_ON(ret > 0); - return ret; -} - -static int __btrfs_mod_ref(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct extent_buffer *buf, - int full_backref, int inc, int for_cow) -{ - u64 bytenr; - u64 num_bytes; - u64 parent; - u64 ref_root; - u32 nritems; - struct btrfs_key key; - struct btrfs_file_extent_item *fi; - int i; - int level; - int ret = 0; - int (*process_func)(struct btrfs_trans_handle *, struct btrfs_root *, - u64, u64, u64, u64, u64, u64, int); - - ref_root = btrfs_header_owner(buf); - nritems = btrfs_header_nritems(buf); - level = btrfs_header_level(buf); - - if (!root->ref_cows && level == 0) - return 0; - - if (inc) - process_func = btrfs_inc_extent_ref; - else - process_func = btrfs_free_extent; - - if (full_backref) - parent = buf->start; - else - parent = 0; - - for (i = 0; i < nritems; i++) { - if (level == 0) { - btrfs_item_key_to_cpu(buf, &key, i); - if (btrfs_key_type(&key) != BTRFS_EXTENT_DATA_KEY) - continue; - fi = btrfs_item_ptr(buf, i, - struct btrfs_file_extent_item); - if (btrfs_file_extent_type(buf, 
fi) == - BTRFS_FILE_EXTENT_INLINE) - continue; - bytenr = btrfs_file_extent_disk_bytenr(buf, fi); - if (bytenr == 0) - continue; - - num_bytes = btrfs_file_extent_disk_num_bytes(buf, fi); - key.offset -= btrfs_file_extent_offset(buf, fi); - ret = process_func(trans, root, bytenr, num_bytes, - parent, ref_root, key.objectid, - key.offset, for_cow); - if (ret) - goto fail; - } else { - bytenr = btrfs_node_blockptr(buf, i); - num_bytes = btrfs_level_size(root, level - 1); - ret = process_func(trans, root, bytenr, num_bytes, - parent, ref_root, level - 1, 0, - for_cow); - if (ret) - goto fail; - } - } - return 0; -fail: - return ret; -} - -int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, - struct extent_buffer *buf, int full_backref, int for_cow) -{ - return __btrfs_mod_ref(trans, root, buf, full_backref, 1, for_cow); -} - -int btrfs_dec_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, - struct extent_buffer *buf, int full_backref, int for_cow) -{ - return __btrfs_mod_ref(trans, root, buf, full_backref, 0, for_cow); -} - -static int write_one_cache_group(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct btrfs_path *path, - struct btrfs_block_group_cache *cache) -{ - int ret; - struct btrfs_root *extent_root = root->fs_info->extent_root; - unsigned long bi; - struct extent_buffer *leaf; - - ret = btrfs_search_slot(trans, extent_root, &cache->key, path, 0, 1); - if (ret < 0) - goto fail; - BUG_ON(ret); /* Corruption */ - - leaf = path->nodes[0]; - bi = btrfs_item_ptr_offset(leaf, path->slots[0]); - write_extent_buffer(leaf, &cache->item, bi, sizeof(cache->item)); - btrfs_mark_buffer_dirty(leaf); - btrfs_release_path(path); -fail: - if (ret) { - btrfs_abort_transaction(trans, root, ret); - return ret; - } - return 0; - -} - -static struct btrfs_block_group_cache * -next_block_group(struct btrfs_root *root, - struct btrfs_block_group_cache *cache) -{ - struct rb_node *node; - spin_lock(&root->fs_info->block_group_cache_lock); - node = rb_next(&cache->cache_node); - btrfs_put_block_group(cache); - if (node) { - cache = rb_entry(node, struct btrfs_block_group_cache, - cache_node); - btrfs_get_block_group(cache); - } else - cache = NULL; - spin_unlock(&root->fs_info->block_group_cache_lock); - return cache; -} - -static int cache_save_setup(struct btrfs_block_group_cache *block_group, - struct btrfs_trans_handle *trans, - struct btrfs_path *path) -{ - struct btrfs_root *root = block_group->fs_info->tree_root; - struct inode *inode = NULL; - u64 alloc_hint = 0; - int dcs = BTRFS_DC_ERROR; - int num_pages = 0; - int retries = 0; - int ret = 0; - - /* - * If this block group is smaller than 100 megs don't bother caching the - * block group. 
- */ - if (block_group->key.offset < (100 * 1024 * 1024)) { - spin_lock(&block_group->lock); - block_group->disk_cache_state = BTRFS_DC_WRITTEN; - spin_unlock(&block_group->lock); - return 0; - } - -again: - inode = lookup_free_space_inode(root, block_group, path); - if (IS_ERR(inode) && PTR_ERR(inode) != -ENOENT) { - ret = PTR_ERR(inode); - btrfs_release_path(path); - goto out; - } - - if (IS_ERR(inode)) { - BUG_ON(retries); - retries++; - - if (block_group->ro) - goto out_free; - - ret = create_free_space_inode(root, trans, block_group, path); - if (ret) - goto out_free; - goto again; - } - - /* We've already setup this transaction, go ahead and exit */ - if (block_group->cache_generation == trans->transid && - i_size_read(inode)) { - dcs = BTRFS_DC_SETUP; - goto out_put; - } - - /* - * We want to set the generation to 0, that way if anything goes wrong - * from here on out we know not to trust this cache when we load up next - * time. - */ - BTRFS_I(inode)->generation = 0; - ret = btrfs_update_inode(trans, root, inode); - WARN_ON(ret); - - if (i_size_read(inode) > 0) { - ret = btrfs_truncate_free_space_cache(root, trans, path, - inode); - if (ret) - goto out_put; - } - - spin_lock(&block_group->lock); - if (block_group->cached != BTRFS_CACHE_FINISHED) { - /* We're not cached, don't bother trying to write stuff out */ - dcs = BTRFS_DC_WRITTEN; - spin_unlock(&block_group->lock); - goto out_put; - } - spin_unlock(&block_group->lock); - - num_pages = (int)div64_u64(block_group->key.offset, 1024 * 1024 * 1024); - if (!num_pages) - num_pages = 1; - - /* - * Just to make absolutely sure we have enough space, we're going to - * preallocate 12 pages worth of space for each block group. In - * practice we ought to use at most 8, but we need extra space so we can - * add our header and have a terminator between the extents and the - * bitmaps. 
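[Editor's note] The preallocation below works out to 16 pages per gigabyte of block group. A standalone arithmetic sketch of that sizing, assuming a 4 KiB PAGE_CACHE_SIZE; cache_prealloc_bytes is an invented name, not a kernel function:

#include <stdio.h>
#include <stdint.h>

#define GiB (1024ULL * 1024 * 1024)
#define PAGE_SIZE_ASSUMED 4096ULL      /* illustrative PAGE_CACHE_SIZE */

static uint64_t cache_prealloc_bytes(uint64_t block_group_bytes)
{
        uint64_t num_pages = block_group_bytes / GiB;

        if (!num_pages)
                num_pages = 1;          /* at least one per-GiB unit */
        num_pages *= 16;                /* headroom for header + bitmaps */
        return num_pages * PAGE_SIZE_ASSUMED;
}

int main(void)
{
        /* a 1 GiB block group reserves 16 pages = 64 KiB of cache space */
        printf("%llu\n", (unsigned long long)cache_prealloc_bytes(GiB));
        return 0;
}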
- */ - num_pages *= 16; - num_pages *= PAGE_CACHE_SIZE; - - ret = btrfs_check_data_free_space(inode, num_pages); - if (ret) - goto out_put; - - ret = btrfs_prealloc_file_range_trans(inode, trans, 0, 0, num_pages, - num_pages, num_pages, - &alloc_hint); - if (!ret) - dcs = BTRFS_DC_SETUP; - btrfs_free_reserved_data_space(inode, num_pages); - -out_put: - iput(inode); -out_free: - btrfs_release_path(path); -out: - spin_lock(&block_group->lock); - if (!ret && dcs == BTRFS_DC_SETUP) - block_group->cache_generation = trans->transid; - block_group->disk_cache_state = dcs; - spin_unlock(&block_group->lock); - - return ret; -} - -int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans, - struct btrfs_root *root) -{ - struct btrfs_block_group_cache *cache; - int err = 0; - struct btrfs_path *path; - u64 last = 0; - - path = btrfs_alloc_path(); - if (!path) - return -ENOMEM; - -again: - while (1) { - cache = btrfs_lookup_first_block_group(root->fs_info, last); - while (cache) { - if (cache->disk_cache_state == BTRFS_DC_CLEAR) - break; - cache = next_block_group(root, cache); - } - if (!cache) { - if (last == 0) - break; - last = 0; - continue; - } - err = cache_save_setup(cache, trans, path); - last = cache->key.objectid + cache->key.offset; - btrfs_put_block_group(cache); - } - - while (1) { - if (last == 0) { - err = btrfs_run_delayed_refs(trans, root, - (unsigned long)-1); - if (err) /* File system offline */ - goto out; - } - - cache = btrfs_lookup_first_block_group(root->fs_info, last); - while (cache) { - if (cache->disk_cache_state == BTRFS_DC_CLEAR) { - btrfs_put_block_group(cache); - goto again; - } - - if (cache->dirty) - break; - cache = next_block_group(root, cache); - } - if (!cache) { - if (last == 0) - break; - last = 0; - continue; - } - - if (cache->disk_cache_state == BTRFS_DC_SETUP) - cache->disk_cache_state = BTRFS_DC_NEED_WRITE; - cache->dirty = 0; - last = cache->key.objectid + cache->key.offset; - - err = write_one_cache_group(trans, root, path, cache); - if (err) /* File system offline */ - goto out; - - btrfs_put_block_group(cache); - } - - while (1) { - /* - * I don't think this is needed since we're just marking our - * preallocated extent as written, but just in case it can't - * hurt. - */ - if (last == 0) { - err = btrfs_run_delayed_refs(trans, root, - (unsigned long)-1); - if (err) /* File system offline */ - goto out; - } - - cache = btrfs_lookup_first_block_group(root->fs_info, last); - while (cache) { - /* - * Really this shouldn't happen, but it could if we - * couldn't write the entire preallocated extent and - * splitting the extent resulted in a new block. - */ - if (cache->dirty) { - btrfs_put_block_group(cache); - goto again; - } - if (cache->disk_cache_state == BTRFS_DC_NEED_WRITE) - break; - cache = next_block_group(root, cache); - } - if (!cache) { - if (last == 0) - break; - last = 0; - continue; - } - - err = btrfs_write_out_cache(root, trans, cache, path); - - /* - * If we didn't have an error then the cache state is still - * NEED_WRITE, so we can set it to WRITTEN. 
- */ - if (!err && cache->disk_cache_state == BTRFS_DC_NEED_WRITE) - cache->disk_cache_state = BTRFS_DC_WRITTEN; - last = cache->key.objectid + cache->key.offset; - btrfs_put_block_group(cache); - } -out: - - btrfs_free_path(path); - return err; -} - -int btrfs_extent_readonly(struct btrfs_root *root, u64 bytenr) -{ - struct btrfs_block_group_cache *block_group; - int readonly = 0; - - block_group = btrfs_lookup_block_group(root->fs_info, bytenr); - if (!block_group || block_group->ro) - readonly = 1; - if (block_group) - btrfs_put_block_group(block_group); - return readonly; -} - -static int update_space_info(struct btrfs_fs_info *info, u64 flags, - u64 total_bytes, u64 bytes_used, - struct btrfs_space_info **space_info) -{ - struct btrfs_space_info *found; - int i; - int factor; - - if (flags & (BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID1 | - BTRFS_BLOCK_GROUP_RAID10)) - factor = 2; - else - factor = 1; - - found = __find_space_info(info, flags); - if (found) { - spin_lock(&found->lock); - found->total_bytes += total_bytes; - found->disk_total += total_bytes * factor; - found->bytes_used += bytes_used; - found->disk_used += bytes_used * factor; - found->full = 0; - spin_unlock(&found->lock); - *space_info = found; - return 0; - } - found = kzalloc(sizeof(*found), GFP_NOFS); - if (!found) - return -ENOMEM; - - for (i = 0; i < BTRFS_NR_RAID_TYPES; i++) - INIT_LIST_HEAD(&found->block_groups[i]); - init_rwsem(&found->groups_sem); - spin_lock_init(&found->lock); - found->flags = flags & BTRFS_BLOCK_GROUP_TYPE_MASK; - found->total_bytes = total_bytes; - found->disk_total = total_bytes * factor; - found->bytes_used = bytes_used; - found->disk_used = bytes_used * factor; - found->bytes_pinned = 0; - found->bytes_reserved = 0; - found->bytes_readonly = 0; - found->bytes_may_use = 0; - found->full = 0; - found->force_alloc = CHUNK_ALLOC_NO_FORCE; - found->chunk_alloc = 0; - found->flush = 0; - init_waitqueue_head(&found->wait); - *space_info = found; - list_add_rcu(&found->list, &info->space_info); - return 0; -} - -static void set_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags) -{ - u64 extra_flags = chunk_to_extended(flags) & - BTRFS_EXTENDED_PROFILE_MASK; - - if (flags & BTRFS_BLOCK_GROUP_DATA) - fs_info->avail_data_alloc_bits |= extra_flags; - if (flags & BTRFS_BLOCK_GROUP_METADATA) - fs_info->avail_metadata_alloc_bits |= extra_flags; - if (flags & BTRFS_BLOCK_GROUP_SYSTEM) - fs_info->avail_system_alloc_bits |= extra_flags; -} - -/* - * returns target flags in extended format or 0 if restripe for this - * chunk_type is not in progress - * - * should be called with either volume_mutex or balance_lock held - */ -static u64 get_restripe_target(struct btrfs_fs_info *fs_info, u64 flags) -{ - struct btrfs_balance_control *bctl = fs_info->balance_ctl; - u64 target = 0; - - if (!bctl) - return 0; - - if (flags & BTRFS_BLOCK_GROUP_DATA && - bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT) { - target = BTRFS_BLOCK_GROUP_DATA | bctl->data.target; - } else if (flags & BTRFS_BLOCK_GROUP_SYSTEM && - bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) { - target = BTRFS_BLOCK_GROUP_SYSTEM | bctl->sys.target; - } else if (flags & BTRFS_BLOCK_GROUP_METADATA && - bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) { - target = BTRFS_BLOCK_GROUP_METADATA | bctl->meta.target; - } - - return target; -} - -/* - * @flags: available profiles in extended format (see ctree.h) - * - * Returns reduced profile in chunk format. 
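[Editor's note] update_space_info() above keeps two totals per space_info: the logical bytes the allocator can hand out and the raw bytes consumed on disk, which differ by a factor of two for DUP/RAID1/RAID10. A self-contained sketch of that factor accounting; the sk_ names and flag values are invented for illustration:

#include <stdio.h>
#include <stdint.h>

#define SK_DUP    (1ULL << 0)
#define SK_RAID1  (1ULL << 1)
#define SK_RAID10 (1ULL << 2)

struct sk_space_info {
        uint64_t total_bytes;   /* logical bytes usable by the allocator */
        uint64_t disk_total;    /* raw bytes consumed on the devices */
};

static void sk_account(struct sk_space_info *s, uint64_t flags, uint64_t bytes)
{
        /* mirrored profiles store every byte twice, hence factor 2 */
        int factor = (flags & (SK_DUP | SK_RAID1 | SK_RAID10)) ? 2 : 1;

        s->total_bytes += bytes;
        s->disk_total  += bytes * factor;
}

int main(void)
{
        struct sk_space_info s = {0};

        sk_account(&s, SK_RAID1, 1ULL << 30);   /* 1 GiB RAID1 chunk */
        printf("logical=%llu disk=%llu\n",
               (unsigned long long)s.total_bytes,
               (unsigned long long)s.disk_total);
        return 0;
}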
If profile changing is in - * progress (either running or paused) picks the target profile (if it's - * already available), otherwise falls back to plain reducing. - */ -u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags) -{ - /* - * we add in the count of missing devices because we want - * to make sure that any RAID levels on a degraded FS - * continue to be honored. - */ - u64 num_devices = root->fs_info->fs_devices->rw_devices + - root->fs_info->fs_devices->missing_devices; - u64 target; - - /* - * see if restripe for this chunk_type is in progress, if so - * try to reduce to the target profile - */ - spin_lock(&root->fs_info->balance_lock); - target = get_restripe_target(root->fs_info, flags); - if (target) { - /* pick target profile only if it's already available */ - if ((flags & target) & BTRFS_EXTENDED_PROFILE_MASK) { - spin_unlock(&root->fs_info->balance_lock); - return extended_to_chunk(target); - } - } - spin_unlock(&root->fs_info->balance_lock); - - if (num_devices == 1) - flags &= ~(BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID0); - if (num_devices < 4) - flags &= ~BTRFS_BLOCK_GROUP_RAID10; - - if ((flags & BTRFS_BLOCK_GROUP_DUP) && - (flags & (BTRFS_BLOCK_GROUP_RAID1 | - BTRFS_BLOCK_GROUP_RAID10))) { - flags &= ~BTRFS_BLOCK_GROUP_DUP; - } - - if ((flags & BTRFS_BLOCK_GROUP_RAID1) && - (flags & BTRFS_BLOCK_GROUP_RAID10)) { - flags &= ~BTRFS_BLOCK_GROUP_RAID1; - } - - if ((flags & BTRFS_BLOCK_GROUP_RAID0) && - ((flags & BTRFS_BLOCK_GROUP_RAID1) | - (flags & BTRFS_BLOCK_GROUP_RAID10) | - (flags & BTRFS_BLOCK_GROUP_DUP))) { - flags &= ~BTRFS_BLOCK_GROUP_RAID0; - } - - return extended_to_chunk(flags); -} - -static u64 get_alloc_profile(struct btrfs_root *root, u64 flags) -{ - if (flags & BTRFS_BLOCK_GROUP_DATA) - flags |= root->fs_info->avail_data_alloc_bits; - else if (flags & BTRFS_BLOCK_GROUP_SYSTEM) - flags |= root->fs_info->avail_system_alloc_bits; - else if (flags & BTRFS_BLOCK_GROUP_METADATA) - flags |= root->fs_info->avail_metadata_alloc_bits; - - return btrfs_reduce_alloc_profile(root, flags); -} - -u64 btrfs_get_alloc_profile(struct btrfs_root *root, int data) -{ - u64 flags; - - if (data) - flags = BTRFS_BLOCK_GROUP_DATA; - else if (root == root->fs_info->chunk_root) - flags = BTRFS_BLOCK_GROUP_SYSTEM; - else - flags = BTRFS_BLOCK_GROUP_METADATA; - - return get_alloc_profile(root, flags); -} - -void btrfs_set_inode_space_info(struct btrfs_root *root, struct inode *inode) -{ - BTRFS_I(inode)->space_info = __find_space_info(root->fs_info, - BTRFS_BLOCK_GROUP_DATA); -} - -/* - * This will check the space that the inode allocates from to make sure we have - * enough space for bytes. 
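[Editor's note] btrfs_reduce_alloc_profile() above strips out profiles the current device count cannot satisfy and then keeps only the most capable remaining one. A user-space sketch of that reduction order; the sk_ names and bit values are made up, only the masking logic mirrors the code above:

#include <stdio.h>
#include <stdint.h>

#define SK_RAID0  (1ULL << 0)
#define SK_RAID1  (1ULL << 1)
#define SK_DUP    (1ULL << 2)
#define SK_RAID10 (1ULL << 3)

static uint64_t sk_reduce_profile(uint64_t flags, unsigned num_devices)
{
        /* drop profiles the device count can no longer back */
        if (num_devices == 1)
                flags &= ~(SK_RAID1 | SK_RAID0);
        if (num_devices < 4)
                flags &= ~SK_RAID10;

        /* then prefer RAID10 over RAID1, RAID1 over DUP, anything over RAID0 */
        if ((flags & SK_DUP) && (flags & (SK_RAID1 | SK_RAID10)))
                flags &= ~SK_DUP;
        if ((flags & SK_RAID1) && (flags & SK_RAID10))
                flags &= ~SK_RAID1;
        if ((flags & SK_RAID0) && (flags & (SK_RAID1 | SK_RAID10 | SK_DUP)))
                flags &= ~SK_RAID0;
        return flags;
}

int main(void)
{
        /* with only 2 devices, RAID10 is dropped and RAID1 wins over RAID0 */
        printf("%#llx\n", (unsigned long long)
               sk_reduce_profile(SK_RAID10 | SK_RAID1 | SK_RAID0, 2));
        return 0;
}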
- */ -int btrfs_check_data_free_space(struct inode *inode, u64 bytes) -{ - struct btrfs_space_info *data_sinfo; - struct btrfs_root *root = BTRFS_I(inode)->root; - u64 used; - int ret = 0, committed = 0, alloc_chunk = 1; - - /* make sure bytes are sectorsize aligned */ - bytes = (bytes + root->sectorsize - 1) & ~((u64)root->sectorsize - 1); - - if (root == root->fs_info->tree_root || - BTRFS_I(inode)->location.objectid == BTRFS_FREE_INO_OBJECTID) { - alloc_chunk = 0; - committed = 1; - } - - data_sinfo = BTRFS_I(inode)->space_info; - if (!data_sinfo) - goto alloc; - -again: - /* make sure we have enough space to handle the data first */ - spin_lock(&data_sinfo->lock); - used = data_sinfo->bytes_used + data_sinfo->bytes_reserved + - data_sinfo->bytes_pinned + data_sinfo->bytes_readonly + - data_sinfo->bytes_may_use; - - if (used + bytes > data_sinfo->total_bytes) { - struct btrfs_trans_handle *trans; - - /* - * if we don't have enough free bytes in this space then we need - * to alloc a new chunk. - */ - if (!data_sinfo->full && alloc_chunk) { - u64 alloc_target; - - data_sinfo->force_alloc = CHUNK_ALLOC_FORCE; - spin_unlock(&data_sinfo->lock); -alloc: - alloc_target = btrfs_get_alloc_profile(root, 1); - trans = btrfs_join_transaction(root); - if (IS_ERR(trans)) - return PTR_ERR(trans); - - ret = do_chunk_alloc(trans, root->fs_info->extent_root, - bytes + 2 * 1024 * 1024, - alloc_target, - CHUNK_ALLOC_NO_FORCE); - btrfs_end_transaction(trans, root); - if (ret < 0) { - if (ret != -ENOSPC) - return ret; - else - goto commit_trans; - } - - if (!data_sinfo) { - btrfs_set_inode_space_info(root, inode); - data_sinfo = BTRFS_I(inode)->space_info; - } - goto again; - } - - /* - * If we have less pinned bytes than we want to allocate then - * don't bother committing the transaction, it won't help us. - */ - if (data_sinfo->bytes_pinned < bytes) - committed = 1; - spin_unlock(&data_sinfo->lock); - - /* commit the current transaction and try again */ -commit_trans: - if (!committed && - !atomic_read(&root->fs_info->open_ioctl_trans)) { - committed = 1; - trans = btrfs_join_transaction(root); - if (IS_ERR(trans)) - return PTR_ERR(trans); - ret = btrfs_commit_transaction(trans, root); - if (ret) - return ret; - goto again; - } - - return -ENOSPC; - } - data_sinfo->bytes_may_use += bytes; - trace_btrfs_space_reservation(root->fs_info, "space_info", - data_sinfo->flags, bytes, 1); - spin_unlock(&data_sinfo->lock); - - return 0; -} - -/* - * Called if we need to clear a data reservation for this inode. 
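[Editor's note] btrfs_check_data_free_space() first rounds the request up to a sector boundary, then compares it against everything the space_info already considers unavailable. A minimal sketch of those two steps; the sk_ struct is a hypothetical stand-in for btrfs_space_info:

#include <stdio.h>
#include <stdint.h>

/* round up to the next sector boundary; sectorsize must be a power of two */
static uint64_t sk_align_to_sector(uint64_t bytes, uint64_t sectorsize)
{
        return (bytes + sectorsize - 1) & ~(sectorsize - 1);
}

struct sk_data_sinfo {
        uint64_t bytes_used, bytes_reserved, bytes_pinned,
                 bytes_readonly, bytes_may_use, total_bytes;
};

static int sk_has_room(const struct sk_data_sinfo *s, uint64_t bytes)
{
        uint64_t used = s->bytes_used + s->bytes_reserved + s->bytes_pinned +
                        s->bytes_readonly + s->bytes_may_use;
        return used + bytes <= s->total_bytes;
}

int main(void)
{
        struct sk_data_sinfo s = { .bytes_used = 900, .total_bytes = 4096 };

        /* a 1-byte write still reserves a whole 4 KiB sector */
        uint64_t need = sk_align_to_sector(1, 4096);
        printf("need=%llu fits=%d\n", (unsigned long long)need,
               sk_has_room(&s, need));
        return 0;
}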
- */ -void btrfs_free_reserved_data_space(struct inode *inode, u64 bytes) -{ - struct btrfs_root *root = BTRFS_I(inode)->root; - struct btrfs_space_info *data_sinfo; - - /* make sure bytes are sectorsize aligned */ - bytes = (bytes + root->sectorsize - 1) & ~((u64)root->sectorsize - 1); - - data_sinfo = BTRFS_I(inode)->space_info; - spin_lock(&data_sinfo->lock); - data_sinfo->bytes_may_use -= bytes; - trace_btrfs_space_reservation(root->fs_info, "space_info", - data_sinfo->flags, bytes, 0); - spin_unlock(&data_sinfo->lock); -} - -static void force_metadata_allocation(struct btrfs_fs_info *info) -{ - struct list_head *head = &info->space_info; - struct btrfs_space_info *found; - - rcu_read_lock(); - list_for_each_entry_rcu(found, head, list) { - if (found->flags & BTRFS_BLOCK_GROUP_METADATA) - found->force_alloc = CHUNK_ALLOC_FORCE; - } - rcu_read_unlock(); -} - -static int should_alloc_chunk(struct btrfs_root *root, - struct btrfs_space_info *sinfo, u64 alloc_bytes, - int force) -{ - struct btrfs_block_rsv *global_rsv = &root->fs_info->global_block_rsv; - u64 num_bytes = sinfo->total_bytes - sinfo->bytes_readonly; - u64 num_allocated = sinfo->bytes_used + sinfo->bytes_reserved; - u64 thresh; - - if (force == CHUNK_ALLOC_FORCE) - return 1; - - /* - * We need to take into account the global rsv because for all intents - * and purposes it's used space. Don't worry about locking the - * global_rsv, it doesn't change except when the transaction commits. - */ - num_allocated += global_rsv->size; - - /* - * in limited mode, we want to have some free space up to - * about 1% of the FS size. - */ - if (force == CHUNK_ALLOC_LIMITED) { - thresh = btrfs_super_total_bytes(root->fs_info->super_copy); - thresh = max_t(u64, 64 * 1024 * 1024, - div_factor_fine(thresh, 1)); - - if (num_bytes - num_allocated < thresh) - return 1; - } - thresh = btrfs_super_total_bytes(root->fs_info->super_copy); - - /* 256MB or 2% of the FS */ - thresh = max_t(u64, 256 * 1024 * 1024, div_factor_fine(thresh, 2)); - /* system chunks need a much small threshold */ - if (sinfo->flags & BTRFS_BLOCK_GROUP_SYSTEM) - thresh = 32 * 1024 * 1024; - - if (num_bytes > thresh && sinfo->bytes_used < div_factor(num_bytes, 8)) - return 0; - return 1; -} - -static u64 get_system_chunk_thresh(struct btrfs_root *root, u64 type) -{ - u64 num_dev; - - if (type & BTRFS_BLOCK_GROUP_RAID10 || - type & BTRFS_BLOCK_GROUP_RAID0) - num_dev = root->fs_info->fs_devices->rw_devices; - else if (type & BTRFS_BLOCK_GROUP_RAID1) - num_dev = 2; - else - num_dev = 1; /* DUP or single */ - - /* metadata for updaing devices and chunk tree */ - return btrfs_calc_trans_metadata_size(root, num_dev + 1); -} - -static void check_system_chunk(struct btrfs_trans_handle *trans, - struct btrfs_root *root, u64 type) -{ - struct btrfs_space_info *info; - u64 left; - u64 thresh; - - info = __find_space_info(root->fs_info, BTRFS_BLOCK_GROUP_SYSTEM); - spin_lock(&info->lock); - left = info->total_bytes - info->bytes_used - info->bytes_pinned - - info->bytes_reserved - info->bytes_readonly; - spin_unlock(&info->lock); - - thresh = get_system_chunk_thresh(root, type); - if (left < thresh && btrfs_test_opt(root, ENOSPC_DEBUG)) { - printk(KERN_INFO "left=%llu, need=%llu, flags=%llu\n", - left, thresh, type); - dump_space_info(info, 0, 0); - } - - if (left < thresh) { - u64 flags; - - flags = btrfs_get_alloc_profile(root->fs_info->chunk_root, 0); - btrfs_alloc_chunk(trans, root, flags); - } -} - -static int do_chunk_alloc(struct btrfs_trans_handle *trans, - struct btrfs_root 
*extent_root, u64 alloc_bytes, - u64 flags, int force) -{ - struct btrfs_space_info *space_info; - struct btrfs_fs_info *fs_info = extent_root->fs_info; - int wait_for_alloc = 0; - int ret = 0; - - space_info = __find_space_info(extent_root->fs_info, flags); - if (!space_info) { - ret = update_space_info(extent_root->fs_info, flags, - 0, 0, &space_info); - BUG_ON(ret); /* -ENOMEM */ - } - BUG_ON(!space_info); /* Logic error */ - -again: - spin_lock(&space_info->lock); - if (force < space_info->force_alloc) - force = space_info->force_alloc; - if (space_info->full) { - spin_unlock(&space_info->lock); - return 0; - } - - if (!should_alloc_chunk(extent_root, space_info, alloc_bytes, force)) { - spin_unlock(&space_info->lock); - return 0; - } else if (space_info->chunk_alloc) { - wait_for_alloc = 1; - } else { - space_info->chunk_alloc = 1; - } - - spin_unlock(&space_info->lock); - - mutex_lock(&fs_info->chunk_mutex); - - /* - * The chunk_mutex is held throughout the entirety of a chunk - * allocation, so once we've acquired the chunk_mutex we know that the - * other guy is done and we need to recheck and see if we should - * allocate. - */ - if (wait_for_alloc) { - mutex_unlock(&fs_info->chunk_mutex); - wait_for_alloc = 0; - goto again; - } - - /* - * If we have mixed data/metadata chunks we want to make sure we keep - * allocating mixed chunks instead of individual chunks. - */ - if (btrfs_mixed_space_info(space_info)) - flags |= (BTRFS_BLOCK_GROUP_DATA | BTRFS_BLOCK_GROUP_METADATA); - - /* - * if we're doing a data chunk, go ahead and make sure that - * we keep a reasonable number of metadata chunks allocated in the - * FS as well. - */ - if (flags & BTRFS_BLOCK_GROUP_DATA && fs_info->metadata_ratio) { - fs_info->data_chunk_allocations++; - if (!(fs_info->data_chunk_allocations % - fs_info->metadata_ratio)) - force_metadata_allocation(fs_info); - } - - /* - * Check if we have enough space in SYSTEM chunk because we may need - * to update devices. 
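[Editor's note] One detail of do_chunk_alloc() above worth calling out: when metadata_ratio is set, every Nth data chunk allocation also forces a metadata chunk allocation so the metadata pool keeps pace. A small sketch of that throttle; the names are invented:

#include <stdio.h>

struct sk_alloc_state {
        unsigned data_chunk_allocations;
        unsigned metadata_ratio;        /* 0 disables the mechanism */
};

static int sk_should_force_metadata(struct sk_alloc_state *st)
{
        if (!st->metadata_ratio)
                return 0;
        st->data_chunk_allocations++;
        /* every metadata_ratio-th data chunk also forces a metadata chunk */
        return (st->data_chunk_allocations % st->metadata_ratio) == 0;
}

int main(void)
{
        struct sk_alloc_state st = { .metadata_ratio = 4 };

        for (int i = 1; i <= 8; i++)
                if (sk_should_force_metadata(&st))
                        printf("data alloc %d also forces metadata\n", i);
        return 0;
}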
- */ - check_system_chunk(trans, extent_root, flags); - - ret = btrfs_alloc_chunk(trans, extent_root, flags); - if (ret < 0 && ret != -ENOSPC) - goto out; - - spin_lock(&space_info->lock); - if (ret) - space_info->full = 1; - else - ret = 1; - - space_info->force_alloc = CHUNK_ALLOC_NO_FORCE; - space_info->chunk_alloc = 0; - spin_unlock(&space_info->lock); -out: - mutex_unlock(&extent_root->fs_info->chunk_mutex); - return ret; -} - -/* - * shrink metadata reservation for delalloc - */ -static int shrink_delalloc(struct btrfs_root *root, u64 to_reclaim, - bool wait_ordered) -{ - struct btrfs_block_rsv *block_rsv; - struct btrfs_space_info *space_info; - struct btrfs_trans_handle *trans; - u64 reserved; - u64 max_reclaim; - u64 reclaimed = 0; - long time_left; - unsigned long nr_pages = (2 * 1024 * 1024) >> PAGE_CACHE_SHIFT; - int loops = 0; - unsigned long progress; - - trans = (struct btrfs_trans_handle *)current->journal_info; - block_rsv = &root->fs_info->delalloc_block_rsv; - space_info = block_rsv->space_info; - - smp_mb(); - reserved = space_info->bytes_may_use; - progress = space_info->reservation_progress; - - if (reserved == 0) - return 0; - - smp_mb(); - if (root->fs_info->delalloc_bytes == 0) { - if (trans) - return 0; - btrfs_wait_ordered_extents(root, 0, 0); - return 0; - } - - max_reclaim = min(reserved, to_reclaim); - nr_pages = max_t(unsigned long, nr_pages, - max_reclaim >> PAGE_CACHE_SHIFT); - while (loops < 1024) { - /* have the flusher threads jump in and do some IO */ - smp_mb(); - nr_pages = min_t(unsigned long, nr_pages, - root->fs_info->delalloc_bytes >> PAGE_CACHE_SHIFT); - writeback_inodes_sb_nr_if_idle(root->fs_info->sb, nr_pages, - WB_REASON_FS_FREE_SPACE); - - spin_lock(&space_info->lock); - if (reserved > space_info->bytes_may_use) - reclaimed += reserved - space_info->bytes_may_use; - reserved = space_info->bytes_may_use; - spin_unlock(&space_info->lock); - - loops++; - - if (reserved == 0 || reclaimed >= max_reclaim) - break; - - if (trans && trans->transaction->blocked) - return -EAGAIN; - - if (wait_ordered && !trans) { - btrfs_wait_ordered_extents(root, 0, 0); - } else { - time_left = schedule_timeout_interruptible(1); - - /* We were interrupted, exit */ - if (time_left) - break; - } - - /* we've kicked the IO a few times, if anything has been freed, - * exit. There is no sense in looping here for a long time - * when we really need to commit the transaction, or there are - * just too many writers without enough free space - */ - - if (loops > 3) { - smp_mb(); - if (progress != space_info->reservation_progress) - break; - } - - } - - return reclaimed >= to_reclaim; -} - -/** - * maybe_commit_transaction - possibly commit the transaction if its ok to - * @root - the root we're allocating for - * @bytes - the number of bytes we want to reserve - * @force - force the commit - * - * This will check to make sure that committing the transaction will actually - * get us somewhere and then commit the transaction if it does. Otherwise it - * will return -ENOSPC. 
- */ -static int may_commit_transaction(struct btrfs_root *root, - struct btrfs_space_info *space_info, - u64 bytes, int force) -{ - struct btrfs_block_rsv *delayed_rsv = &root->fs_info->delayed_block_rsv; - struct btrfs_trans_handle *trans; - - trans = (struct btrfs_trans_handle *)current->journal_info; - if (trans) - return -EAGAIN; - - if (force) - goto commit; - - /* See if there is enough pinned space to make this reservation */ - spin_lock(&space_info->lock); - if (space_info->bytes_pinned >= bytes) { - spin_unlock(&space_info->lock); - goto commit; - } - spin_unlock(&space_info->lock); - - /* - * See if there is some space in the delayed insertion reservation for - * this reservation. - */ - if (space_info != delayed_rsv->space_info) - return -ENOSPC; - - spin_lock(&space_info->lock); - spin_lock(&delayed_rsv->lock); - if (space_info->bytes_pinned + delayed_rsv->size < bytes) { - spin_unlock(&delayed_rsv->lock); - spin_unlock(&space_info->lock); - return -ENOSPC; - } - spin_unlock(&delayed_rsv->lock); - spin_unlock(&space_info->lock); - -commit: - trans = btrfs_join_transaction(root); - if (IS_ERR(trans)) - return -ENOSPC; - - return btrfs_commit_transaction(trans, root); -} - -/** - * reserve_metadata_bytes - try to reserve bytes from the block_rsv's space - * @root - the root we're allocating for - * @block_rsv - the block_rsv we're allocating for - * @orig_bytes - the number of bytes we want - * @flush - wether or not we can flush to make our reservation - * - * This will reserve orgi_bytes number of bytes from the space info associated - * with the block_rsv. If there is not enough space it will make an attempt to - * flush out space to make room. It will do this by flushing delalloc if - * possible or committing the transaction. If flush is 0 then no attempts to - * regain reservations will be made and this will fail if there is not enough - * space already. - */ -static int reserve_metadata_bytes(struct btrfs_root *root, - struct btrfs_block_rsv *block_rsv, - u64 orig_bytes, int flush) -{ - struct btrfs_space_info *space_info = block_rsv->space_info; - u64 used; - u64 num_bytes = orig_bytes; - int retries = 0; - int ret = 0; - bool committed = false; - bool flushing = false; - bool wait_ordered = false; - -again: - ret = 0; - spin_lock(&space_info->lock); - /* - * We only want to wait if somebody other than us is flushing and we are - * actually alloed to flush. - */ - while (flush && !flushing && space_info->flush) { - spin_unlock(&space_info->lock); - /* - * If we have a trans handle we can't wait because the flusher - * may have to commit the transaction, which would mean we would - * deadlock since we are waiting for the flusher to finish, but - * hold the current transaction open. - */ - if (current->journal_info) - return -EAGAIN; - ret = wait_event_killable(space_info->wait, !space_info->flush); - /* Must have been killed, return */ - if (ret) - return -EINTR; - - spin_lock(&space_info->lock); - } - - ret = -ENOSPC; - used = space_info->bytes_used + space_info->bytes_reserved + - space_info->bytes_pinned + space_info->bytes_readonly + - space_info->bytes_may_use; - - /* - * The idea here is that we've not already over-reserved the block group - * then we can go ahead and save our reservation first and then start - * flushing if we need to. Otherwise if we've already overcommitted - * lets start flushing stuff first and then come back and try to make - * our reservation. 
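[Editor's note] may_commit_transaction() below only commits when doing so can plausibly free enough space: either the pinned bytes alone cover the request, or pinned bytes plus the delayed-insertion reserve do. The real code additionally requires the delayed reserve to live in the same space_info, which this standalone sketch ignores:

#include <stdio.h>
#include <stdint.h>

static int sk_commit_would_help(uint64_t bytes_pinned,
                                uint64_t delayed_rsv_size,
                                uint64_t bytes_needed, int force)
{
        if (force)
                return 1;
        if (bytes_pinned >= bytes_needed)
                return 1;
        /* fall back to counting the delayed-insertion reserve as well */
        return bytes_pinned + delayed_rsv_size >= bytes_needed;
}

int main(void)
{
        /* 1 MiB pinned + 512 KiB delayed reserve cannot satisfy 2 MiB */
        printf("%d\n", sk_commit_would_help(1 << 20, 512 << 10, 2 << 20, 0));
        return 0;
}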
- */ - if (used <= space_info->total_bytes) { - if (used + orig_bytes <= space_info->total_bytes) { - space_info->bytes_may_use += orig_bytes; - trace_btrfs_space_reservation(root->fs_info, - "space_info", space_info->flags, orig_bytes, 1); - ret = 0; - } else { - /* - * Ok set num_bytes to orig_bytes since we aren't - * overocmmitted, this way we only try and reclaim what - * we need. - */ - num_bytes = orig_bytes; - } - } else { - /* - * Ok we're over committed, set num_bytes to the overcommitted - * amount plus the amount of bytes that we need for this - * reservation. - */ - wait_ordered = true; - num_bytes = used - space_info->total_bytes + - (orig_bytes * (retries + 1)); - } - - if (ret) { - u64 profile = btrfs_get_alloc_profile(root, 0); - u64 avail; - - /* - * If we have a lot of space that's pinned, don't bother doing - * the overcommit dance yet and just commit the transaction. - */ - avail = (space_info->total_bytes - space_info->bytes_used) * 8; - do_div(avail, 10); - if (space_info->bytes_pinned >= avail && flush && !committed) { - space_info->flush = 1; - flushing = true; - spin_unlock(&space_info->lock); - ret = may_commit_transaction(root, space_info, - orig_bytes, 1); - if (ret) - goto out; - committed = true; - goto again; - } - - spin_lock(&root->fs_info->free_chunk_lock); - avail = root->fs_info->free_chunk_space; - - /* - * If we have dup, raid1 or raid10 then only half of the free - * space is actually useable. - */ - if (profile & (BTRFS_BLOCK_GROUP_DUP | - BTRFS_BLOCK_GROUP_RAID1 | - BTRFS_BLOCK_GROUP_RAID10)) - avail >>= 1; - - /* - * If we aren't flushing don't let us overcommit too much, say - * 1/8th of the space. If we can flush, let it overcommit up to - * 1/2 of the space. - */ - if (flush) - avail >>= 3; - else - avail >>= 1; - spin_unlock(&root->fs_info->free_chunk_lock); - - if (used + num_bytes < space_info->total_bytes + avail) { - space_info->bytes_may_use += orig_bytes; - trace_btrfs_space_reservation(root->fs_info, - "space_info", space_info->flags, orig_bytes, 1); - ret = 0; - } else { - wait_ordered = true; - } - } - - /* - * Couldn't make our reservation, save our place so while we're trying - * to reclaim space we can actually use it instead of somebody else - * stealing it from us. - */ - if (ret && flush) { - flushing = true; - space_info->flush = 1; - } - - spin_unlock(&space_info->lock); - - if (!ret || !flush) - goto out; - - /* - * We do synchronous shrinking since we don't actually unreserve - * metadata until after the IO is completed. - */ - ret = shrink_delalloc(root, num_bytes, wait_ordered); - if (ret < 0) - goto out; - - ret = 0; - - /* - * So if we were overcommitted it's possible that somebody else flushed - * out enough space and we simply didn't have enough space to reclaim, - * so go back around and try again. 
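[Editor's note] The overcommit allowance computed above starts from the unallocated chunk space, halves it for mirrored profiles, and then shifts it again depending on whether flushing is still an option; the reservation is granted when used + num_bytes stays below total_bytes plus this allowance. A standalone sketch of the shifts used in the code above, with invented sk_ names:

#include <stdio.h>
#include <stdint.h>

static uint64_t sk_overcommit_allowance(uint64_t free_chunk_space,
                                        int mirrored, int can_flush)
{
        uint64_t avail = free_chunk_space;

        if (mirrored)
                avail >>= 1;    /* DUP/RAID1/RAID10: only half is usable */
        if (can_flush)
                avail >>= 3;    /* shift used above when flushing is allowed */
        else
                avail >>= 1;    /* shift used above when it is not */
        return avail;
}

int main(void)
{
        uint64_t avail = sk_overcommit_allowance(8ULL << 30, 1, 1);
        printf("allow %llu bytes of overcommit\n", (unsigned long long)avail);
        return 0;
}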
- */ - if (retries < 2) { - wait_ordered = true; - retries++; - goto again; - } - - ret = -ENOSPC; - if (committed) - goto out; - - ret = may_commit_transaction(root, space_info, orig_bytes, 0); - if (!ret) { - committed = true; - goto again; - } - -out: - if (flushing) { - spin_lock(&space_info->lock); - space_info->flush = 0; - wake_up_all(&space_info->wait); - spin_unlock(&space_info->lock); - } - return ret; -} - -static struct btrfs_block_rsv *get_block_rsv( - const struct btrfs_trans_handle *trans, - const struct btrfs_root *root) -{ - struct btrfs_block_rsv *block_rsv = NULL; - - if (root->ref_cows || root == root->fs_info->csum_root) - block_rsv = trans->block_rsv; - - if (!block_rsv) - block_rsv = root->block_rsv; - - if (!block_rsv) - block_rsv = &root->fs_info->empty_block_rsv; - - return block_rsv; -} - -static int block_rsv_use_bytes(struct btrfs_block_rsv *block_rsv, - u64 num_bytes) -{ - int ret = -ENOSPC; - spin_lock(&block_rsv->lock); - if (block_rsv->reserved >= num_bytes) { - block_rsv->reserved -= num_bytes; - if (block_rsv->reserved < block_rsv->size) - block_rsv->full = 0; - ret = 0; - } - spin_unlock(&block_rsv->lock); - return ret; -} - -static void block_rsv_add_bytes(struct btrfs_block_rsv *block_rsv, - u64 num_bytes, int update_size) -{ - spin_lock(&block_rsv->lock); - block_rsv->reserved += num_bytes; - if (update_size) - block_rsv->size += num_bytes; - else if (block_rsv->reserved >= block_rsv->size) - block_rsv->full = 1; - spin_unlock(&block_rsv->lock); -} - -static void block_rsv_release_bytes(struct btrfs_fs_info *fs_info, - struct btrfs_block_rsv *block_rsv, - struct btrfs_block_rsv *dest, u64 num_bytes) -{ - struct btrfs_space_info *space_info = block_rsv->space_info; - - spin_lock(&block_rsv->lock); - if (num_bytes == (u64)-1) - num_bytes = block_rsv->size; - block_rsv->size -= num_bytes; - if (block_rsv->reserved >= block_rsv->size) { - num_bytes = block_rsv->reserved - block_rsv->size; - block_rsv->reserved = block_rsv->size; - block_rsv->full = 1; - } else { - num_bytes = 0; - } - spin_unlock(&block_rsv->lock); - - if (num_bytes > 0) { - if (dest) { - spin_lock(&dest->lock); - if (!dest->full) { - u64 bytes_to_add; - - bytes_to_add = dest->size - dest->reserved; - bytes_to_add = min(num_bytes, bytes_to_add); - dest->reserved += bytes_to_add; - if (dest->reserved >= dest->size) - dest->full = 1; - num_bytes -= bytes_to_add; - } - spin_unlock(&dest->lock); - } - if (num_bytes) { - spin_lock(&space_info->lock); - space_info->bytes_may_use -= num_bytes; - trace_btrfs_space_reservation(fs_info, "space_info", - space_info->flags, num_bytes, 0); - space_info->reservation_progress++; - spin_unlock(&space_info->lock); - } - } -} - -static int block_rsv_migrate_bytes(struct btrfs_block_rsv *src, - struct btrfs_block_rsv *dst, u64 num_bytes) -{ - int ret; - - ret = block_rsv_use_bytes(src, num_bytes); - if (ret) - return ret; - - block_rsv_add_bytes(dst, num_bytes, 1); - return 0; -} - -void btrfs_init_block_rsv(struct btrfs_block_rsv *rsv) -{ - memset(rsv, 0, sizeof(*rsv)); - spin_lock_init(&rsv->lock); -} - -struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_root *root) -{ - struct btrfs_block_rsv *block_rsv; - struct btrfs_fs_info *fs_info = root->fs_info; - - block_rsv = kmalloc(sizeof(*block_rsv), GFP_NOFS); - if (!block_rsv) - return NULL; - - btrfs_init_block_rsv(block_rsv); - block_rsv->space_info = __find_space_info(fs_info, - BTRFS_BLOCK_GROUP_METADATA); - return block_rsv; -} - -void btrfs_free_block_rsv(struct btrfs_root *root, - struct 
btrfs_block_rsv *rsv) -{ - btrfs_block_rsv_release(root, rsv, (u64)-1); - kfree(rsv); -} - -static inline int __block_rsv_add(struct btrfs_root *root, - struct btrfs_block_rsv *block_rsv, - u64 num_bytes, int flush) -{ - int ret; - - if (num_bytes == 0) - return 0; - - ret = reserve_metadata_bytes(root, block_rsv, num_bytes, flush); - if (!ret) { - block_rsv_add_bytes(block_rsv, num_bytes, 1); - return 0; - } - - return ret; -} - -int btrfs_block_rsv_add(struct btrfs_root *root, - struct btrfs_block_rsv *block_rsv, - u64 num_bytes) -{ - return __block_rsv_add(root, block_rsv, num_bytes, 1); -} - -int btrfs_block_rsv_add_noflush(struct btrfs_root *root, - struct btrfs_block_rsv *block_rsv, - u64 num_bytes) -{ - return __block_rsv_add(root, block_rsv, num_bytes, 0); -} - -int btrfs_block_rsv_check(struct btrfs_root *root, - struct btrfs_block_rsv *block_rsv, int min_factor) -{ - u64 num_bytes = 0; - int ret = -ENOSPC; - - if (!block_rsv) - return 0; - - spin_lock(&block_rsv->lock); - num_bytes = div_factor(block_rsv->size, min_factor); - if (block_rsv->reserved >= num_bytes) - ret = 0; - spin_unlock(&block_rsv->lock); - - return ret; -} - -static inline int __btrfs_block_rsv_refill(struct btrfs_root *root, - struct btrfs_block_rsv *block_rsv, - u64 min_reserved, int flush) -{ - u64 num_bytes = 0; - int ret = -ENOSPC; - - if (!block_rsv) - return 0; - - spin_lock(&block_rsv->lock); - num_bytes = min_reserved; - if (block_rsv->reserved >= num_bytes) - ret = 0; - else - num_bytes -= block_rsv->reserved; - spin_unlock(&block_rsv->lock); - - if (!ret) - return 0; - - ret = reserve_metadata_bytes(root, block_rsv, num_bytes, flush); - if (!ret) { - block_rsv_add_bytes(block_rsv, num_bytes, 0); - return 0; - } - - return ret; -} - -int btrfs_block_rsv_refill(struct btrfs_root *root, - struct btrfs_block_rsv *block_rsv, - u64 min_reserved) -{ - return __btrfs_block_rsv_refill(root, block_rsv, min_reserved, 1); -} - -int btrfs_block_rsv_refill_noflush(struct btrfs_root *root, - struct btrfs_block_rsv *block_rsv, - u64 min_reserved) -{ - return __btrfs_block_rsv_refill(root, block_rsv, min_reserved, 0); -} - -int btrfs_block_rsv_migrate(struct btrfs_block_rsv *src_rsv, - struct btrfs_block_rsv *dst_rsv, - u64 num_bytes) -{ - return block_rsv_migrate_bytes(src_rsv, dst_rsv, num_bytes); -} - -void btrfs_block_rsv_release(struct btrfs_root *root, - struct btrfs_block_rsv *block_rsv, - u64 num_bytes) -{ - struct btrfs_block_rsv *global_rsv = &root->fs_info->global_block_rsv; - if (global_rsv->full || global_rsv == block_rsv || - block_rsv->space_info != global_rsv->space_info) - global_rsv = NULL; - block_rsv_release_bytes(root->fs_info, block_rsv, global_rsv, - num_bytes); -} - -/* - * helper to calculate size of global block reservation. 
- * the desired value is sum of space used by extent tree, - * checksum tree and root tree - */ -static u64 calc_global_metadata_size(struct btrfs_fs_info *fs_info) -{ - struct btrfs_space_info *sinfo; - u64 num_bytes; - u64 meta_used; - u64 data_used; - int csum_size = btrfs_super_csum_size(fs_info->super_copy); - - sinfo = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_DATA); - spin_lock(&sinfo->lock); - data_used = sinfo->bytes_used; - spin_unlock(&sinfo->lock); - - sinfo = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA); - spin_lock(&sinfo->lock); - if (sinfo->flags & BTRFS_BLOCK_GROUP_DATA) - data_used = 0; - meta_used = sinfo->bytes_used; - spin_unlock(&sinfo->lock); - - num_bytes = (data_used >> fs_info->sb->s_blocksize_bits) * - csum_size * 2; - num_bytes += div64_u64(data_used + meta_used, 50); - - if (num_bytes * 3 > meta_used) - num_bytes = div64_u64(meta_used, 3); - - return ALIGN(num_bytes, fs_info->extent_root->leafsize << 10); -} - -static void update_global_block_rsv(struct btrfs_fs_info *fs_info) -{ - struct btrfs_block_rsv *block_rsv = &fs_info->global_block_rsv; - struct btrfs_space_info *sinfo = block_rsv->space_info; - u64 num_bytes; - - num_bytes = calc_global_metadata_size(fs_info); - - spin_lock(&sinfo->lock); - spin_lock(&block_rsv->lock); - - block_rsv->size = num_bytes; - - num_bytes = sinfo->bytes_used + sinfo->bytes_pinned + - sinfo->bytes_reserved + sinfo->bytes_readonly + - sinfo->bytes_may_use; - - if (sinfo->total_bytes > num_bytes) { - num_bytes = sinfo->total_bytes - num_bytes; - block_rsv->reserved += num_bytes; - sinfo->bytes_may_use += num_bytes; - trace_btrfs_space_reservation(fs_info, "space_info", - sinfo->flags, num_bytes, 1); - } - - if (block_rsv->reserved >= block_rsv->size) { - num_bytes = block_rsv->reserved - block_rsv->size; - sinfo->bytes_may_use -= num_bytes; - trace_btrfs_space_reservation(fs_info, "space_info", - sinfo->flags, num_bytes, 0); - sinfo->reservation_progress++; - block_rsv->reserved = block_rsv->size; - block_rsv->full = 1; - } - - spin_unlock(&block_rsv->lock); - spin_unlock(&sinfo->lock); -} - -static void init_global_block_rsv(struct btrfs_fs_info *fs_info) -{ - struct btrfs_space_info *space_info; - - space_info = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_SYSTEM); - fs_info->chunk_block_rsv.space_info = space_info; - - space_info = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA); - fs_info->global_block_rsv.space_info = space_info; - fs_info->delalloc_block_rsv.space_info = space_info; - fs_info->trans_block_rsv.space_info = space_info; - fs_info->empty_block_rsv.space_info = space_info; - fs_info->delayed_block_rsv.space_info = space_info; - - fs_info->extent_root->block_rsv = &fs_info->global_block_rsv; - fs_info->csum_root->block_rsv = &fs_info->global_block_rsv; - fs_info->dev_root->block_rsv = &fs_info->global_block_rsv; - fs_info->tree_root->block_rsv = &fs_info->global_block_rsv; - fs_info->chunk_root->block_rsv = &fs_info->chunk_block_rsv; - - update_global_block_rsv(fs_info); -} - -static void release_global_block_rsv(struct btrfs_fs_info *fs_info) -{ - block_rsv_release_bytes(fs_info, &fs_info->global_block_rsv, NULL, - (u64)-1); - WARN_ON(fs_info->delalloc_block_rsv.size > 0); - WARN_ON(fs_info->delalloc_block_rsv.reserved > 0); - WARN_ON(fs_info->trans_block_rsv.size > 0); - WARN_ON(fs_info->trans_block_rsv.reserved > 0); - WARN_ON(fs_info->chunk_block_rsv.size > 0); - WARN_ON(fs_info->chunk_block_rsv.reserved > 0); - WARN_ON(fs_info->delayed_block_rsv.size > 0); - 
WARN_ON(fs_info->delayed_block_rsv.reserved > 0); -} - -void btrfs_trans_release_metadata(struct btrfs_trans_handle *trans, - struct btrfs_root *root) -{ - if (!trans->bytes_reserved) - return; - - trace_btrfs_space_reservation(root->fs_info, "transaction", - trans->transid, trans->bytes_reserved, 0); - btrfs_block_rsv_release(root, trans->block_rsv, trans->bytes_reserved); - trans->bytes_reserved = 0; -} - -/* Can only return 0 or -ENOSPC */ -int btrfs_orphan_reserve_metadata(struct btrfs_trans_handle *trans, - struct inode *inode) -{ - struct btrfs_root *root = BTRFS_I(inode)->root; - struct btrfs_block_rsv *src_rsv = get_block_rsv(trans, root); - struct btrfs_block_rsv *dst_rsv = root->orphan_block_rsv; - - /* - * We need to hold space in order to delete our orphan item once we've - * added it, so this takes the reservation so we can release it later - * when we are truly done with the orphan item. - */ - u64 num_bytes = btrfs_calc_trans_metadata_size(root, 1); - trace_btrfs_space_reservation(root->fs_info, "orphan", - btrfs_ino(inode), num_bytes, 1); - return block_rsv_migrate_bytes(src_rsv, dst_rsv, num_bytes); -} - -void btrfs_orphan_release_metadata(struct inode *inode) -{ - struct btrfs_root *root = BTRFS_I(inode)->root; - u64 num_bytes = btrfs_calc_trans_metadata_size(root, 1); - trace_btrfs_space_reservation(root->fs_info, "orphan", - btrfs_ino(inode), num_bytes, 0); - btrfs_block_rsv_release(root, root->orphan_block_rsv, num_bytes); -} - -int btrfs_snap_reserve_metadata(struct btrfs_trans_handle *trans, - struct btrfs_pending_snapshot *pending) -{ - struct btrfs_root *root = pending->root; - struct btrfs_block_rsv *src_rsv = get_block_rsv(trans, root); - struct btrfs_block_rsv *dst_rsv = &pending->block_rsv; - /* - * two for root back/forward refs, two for directory entries - * and one for root of the snapshot. - */ - u64 num_bytes = btrfs_calc_trans_metadata_size(root, 5); - dst_rsv->space_info = src_rsv->space_info; - return block_rsv_migrate_bytes(src_rsv, dst_rsv, num_bytes); -} - -/** - * drop_outstanding_extent - drop an outstanding extent - * @inode: the inode we're dropping the extent for - * - * This is called when we are freeing up an outstanding extent, either called - * after an error or after an extent is written. This will return the number of - * reserved extents that need to be freed. This must be called with - * BTRFS_I(inode)->lock held. - */ -static unsigned drop_outstanding_extent(struct inode *inode) -{ - unsigned drop_inode_space = 0; - unsigned dropped_extents = 0; - - BUG_ON(!BTRFS_I(inode)->outstanding_extents); - BTRFS_I(inode)->outstanding_extents--; - - if (BTRFS_I(inode)->outstanding_extents == 0 && - BTRFS_I(inode)->delalloc_meta_reserved) { - drop_inode_space = 1; - BTRFS_I(inode)->delalloc_meta_reserved = 0; - } - - /* - * If we have more or the same amount of outsanding extents than we have - * reserved then we need to leave the reserved extents count alone. - */ - if (BTRFS_I(inode)->outstanding_extents >= - BTRFS_I(inode)->reserved_extents) - return drop_inode_space; - - dropped_extents = BTRFS_I(inode)->reserved_extents - - BTRFS_I(inode)->outstanding_extents; - BTRFS_I(inode)->reserved_extents -= dropped_extents; - return dropped_extents + drop_inode_space; -} - -/** - * calc_csum_metadata_size - return the amount of metada space that must be - * reserved/free'd for the given bytes. 
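[Editor's note] drop_outstanding_extent() above is pure bookkeeping: one outstanding extent goes away, possibly together with the extra item held for the inode update, and anything reserved beyond what is still outstanding can be released. A user-space model of that bookkeeping; struct and names are invented:

#include <stdio.h>

struct sk_inode_rsv {
        unsigned outstanding_extents;   /* extents with delalloc in flight */
        unsigned reserved_extents;      /* extents we hold metadata for */
        int delalloc_meta_reserved;     /* one extra item for the inode */
};

/* returns how many reserved items can be released when one extent completes */
static unsigned sk_drop_outstanding(struct sk_inode_rsv *i)
{
        unsigned drop_inode_space = 0, dropped = 0;

        i->outstanding_extents--;
        if (i->outstanding_extents == 0 && i->delalloc_meta_reserved) {
                drop_inode_space = 1;
                i->delalloc_meta_reserved = 0;
        }
        if (i->outstanding_extents >= i->reserved_extents)
                return drop_inode_space;

        dropped = i->reserved_extents - i->outstanding_extents;
        i->reserved_extents -= dropped;
        return dropped + drop_inode_space;
}

int main(void)
{
        struct sk_inode_rsv i = { 1, 2, 1 };
        printf("release %u reservation(s)\n", sk_drop_outstanding(&i));
        return 0;
}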
- * @inode: the inode we're manipulating - * @num_bytes: the number of bytes in question - * @reserve: 1 if we are reserving space, 0 if we are freeing space - * - * This adjusts the number of csum_bytes in the inode and then returns the - * correct amount of metadata that must either be reserved or freed. We - * calculate how many checksums we can fit into one leaf and then divide the - * number of bytes that will need to be checksumed by this value to figure out - * how many checksums will be required. If we are adding bytes then the number - * may go up and we will return the number of additional bytes that must be - * reserved. If it is going down we will return the number of bytes that must - * be freed. - * - * This must be called with BTRFS_I(inode)->lock held. - */ -static u64 calc_csum_metadata_size(struct inode *inode, u64 num_bytes, - int reserve) -{ - struct btrfs_root *root = BTRFS_I(inode)->root; - u64 csum_size; - int num_csums_per_leaf; - int num_csums; - int old_csums; - - if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM && - BTRFS_I(inode)->csum_bytes == 0) - return 0; - - old_csums = (int)div64_u64(BTRFS_I(inode)->csum_bytes, root->sectorsize); - if (reserve) - BTRFS_I(inode)->csum_bytes += num_bytes; - else - BTRFS_I(inode)->csum_bytes -= num_bytes; - csum_size = BTRFS_LEAF_DATA_SIZE(root) - sizeof(struct btrfs_item); - num_csums_per_leaf = (int)div64_u64(csum_size, - sizeof(struct btrfs_csum_item) + - sizeof(struct btrfs_disk_key)); - num_csums = (int)div64_u64(BTRFS_I(inode)->csum_bytes, root->sectorsize); - num_csums = num_csums + num_csums_per_leaf - 1; - num_csums = num_csums / num_csums_per_leaf; - - old_csums = old_csums + num_csums_per_leaf - 1; - old_csums = old_csums / num_csums_per_leaf; - - /* No change, no need to reserve more */ - if (old_csums == num_csums) - return 0; - - if (reserve) - return btrfs_calc_trans_metadata_size(root, - num_csums - old_csums); - - return btrfs_calc_trans_metadata_size(root, old_csums - num_csums); -} - -int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes) -{ - struct btrfs_root *root = BTRFS_I(inode)->root; - struct btrfs_block_rsv *block_rsv = &root->fs_info->delalloc_block_rsv; - u64 to_reserve = 0; - u64 csum_bytes; - unsigned nr_extents = 0; - int extra_reserve = 0; - int flush = 1; - int ret; - - /* Need to be holding the i_mutex here if we aren't free space cache */ - if (btrfs_is_free_space_inode(root, inode)) - flush = 0; - - if (flush && btrfs_transaction_in_commit(root->fs_info)) - schedule_timeout(1); - - mutex_lock(&BTRFS_I(inode)->delalloc_mutex); - num_bytes = ALIGN(num_bytes, root->sectorsize); - - spin_lock(&BTRFS_I(inode)->lock); - BTRFS_I(inode)->outstanding_extents++; - - if (BTRFS_I(inode)->outstanding_extents > - BTRFS_I(inode)->reserved_extents) - nr_extents = BTRFS_I(inode)->outstanding_extents - - BTRFS_I(inode)->reserved_extents; - - /* - * Add an item to reserve for updating the inode when we complete the - * delalloc io. 
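[Editor's note] The csum accounting in calc_csum_metadata_size() boils down to one checksum per sector, a fixed number of checksums per leaf, and a reservation delta expressed in whole leaves before and after the change. A rough standalone model; the per-leaf capacity here is an assumed number, not derived from the real item sizes:

#include <stdio.h>
#include <stdint.h>

static uint64_t sk_csum_leaves(uint64_t data_bytes, uint64_t sectorsize,
                               uint64_t csums_per_leaf)
{
        uint64_t num_csums = data_bytes / sectorsize;   /* one per sector */

        /* round up to whole leaves, as the reservation is per leaf */
        return (num_csums + csums_per_leaf - 1) / csums_per_leaf;
}

int main(void)
{
        /* ~1 GiB of 4 KiB sectors, pretending a leaf holds 1000 csum items */
        printf("%llu leaves\n", (unsigned long long)
               sk_csum_leaves(1ULL << 30, 4096, 1000));
        return 0;
}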
- */ - if (!BTRFS_I(inode)->delalloc_meta_reserved) { - nr_extents++; - extra_reserve = 1; - } - - to_reserve = btrfs_calc_trans_metadata_size(root, nr_extents); - to_reserve += calc_csum_metadata_size(inode, num_bytes, 1); - csum_bytes = BTRFS_I(inode)->csum_bytes; - spin_unlock(&BTRFS_I(inode)->lock); - - ret = reserve_metadata_bytes(root, block_rsv, to_reserve, flush); - if (ret) { - u64 to_free = 0; - unsigned dropped; - - spin_lock(&BTRFS_I(inode)->lock); - dropped = drop_outstanding_extent(inode); - /* - * If the inodes csum_bytes is the same as the original - * csum_bytes then we know we haven't raced with any free()ers - * so we can just reduce our inodes csum bytes and carry on. - * Otherwise we have to do the normal free thing to account for - * the case that the free side didn't free up its reserve - * because of this outstanding reservation. - */ - if (BTRFS_I(inode)->csum_bytes == csum_bytes) - calc_csum_metadata_size(inode, num_bytes, 0); - else - to_free = calc_csum_metadata_size(inode, num_bytes, 0); - spin_unlock(&BTRFS_I(inode)->lock); - if (dropped) - to_free += btrfs_calc_trans_metadata_size(root, dropped); - - if (to_free) { - btrfs_block_rsv_release(root, block_rsv, to_free); - trace_btrfs_space_reservation(root->fs_info, - "delalloc", - btrfs_ino(inode), - to_free, 0); - } - mutex_unlock(&BTRFS_I(inode)->delalloc_mutex); - return ret; - } - - spin_lock(&BTRFS_I(inode)->lock); - if (extra_reserve) { - BTRFS_I(inode)->delalloc_meta_reserved = 1; - nr_extents--; - } - BTRFS_I(inode)->reserved_extents += nr_extents; - spin_unlock(&BTRFS_I(inode)->lock); - mutex_unlock(&BTRFS_I(inode)->delalloc_mutex); - - if (to_reserve) - trace_btrfs_space_reservation(root->fs_info,"delalloc", - btrfs_ino(inode), to_reserve, 1); - block_rsv_add_bytes(block_rsv, to_reserve, 1); - - return 0; -} - -/** - * btrfs_delalloc_release_metadata - release a metadata reservation for an inode - * @inode: the inode to release the reservation for - * @num_bytes: the number of bytes we're releasing - * - * This will release the metadata reservation for an inode. This can be called - * once we complete IO for a given set of bytes to release their metadata - * reservations. - */ -void btrfs_delalloc_release_metadata(struct inode *inode, u64 num_bytes) -{ - struct btrfs_root *root = BTRFS_I(inode)->root; - u64 to_free = 0; - unsigned dropped; - - num_bytes = ALIGN(num_bytes, root->sectorsize); - spin_lock(&BTRFS_I(inode)->lock); - dropped = drop_outstanding_extent(inode); - - to_free = calc_csum_metadata_size(inode, num_bytes, 0); - spin_unlock(&BTRFS_I(inode)->lock); - if (dropped > 0) - to_free += btrfs_calc_trans_metadata_size(root, dropped); - - trace_btrfs_space_reservation(root->fs_info, "delalloc", - btrfs_ino(inode), to_free, 0); - btrfs_block_rsv_release(root, &root->fs_info->delalloc_block_rsv, - to_free); -} - -/** - * btrfs_delalloc_reserve_space - reserve data and metadata space for delalloc - * @inode: inode we're writing to - * @num_bytes: the number of bytes we want to allocate - * - * This will do the following things - * - * o reserve space in the data space info for num_bytes - * o reserve space in the metadata space info based on number of outstanding - * extents and how much csums will be needed - * o add to the inodes ->delalloc_bytes - * o add it to the fs_info's delalloc inodes list. - * - * This will return 0 for success and -ENOSPC if there is no space left. 
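[Editor's note] btrfs_delalloc_reserve_space(), described above, is a simple two-step pairing: data space first, metadata second, with the data reservation undone if the metadata step fails. A skeletal illustration of that pattern with stub helpers; none of these are real btrfs calls:

#include <stdio.h>

/* stand-ins for the data/metadata reservation calls, control flow only */
static int sk_reserve_data(unsigned long long bytes)     { (void)bytes; return 0; }
static int sk_reserve_metadata(unsigned long long bytes) { (void)bytes; return -28; /* ENOSPC */ }
static void sk_release_data(unsigned long long bytes)    { (void)bytes; }

static int sk_delalloc_reserve(unsigned long long bytes)
{
        int ret = sk_reserve_data(bytes);
        if (ret)
                return ret;

        ret = sk_reserve_metadata(bytes);
        if (ret) {
                sk_release_data(bytes);  /* undo the first half on failure */
                return ret;
        }
        return 0;
}

int main(void)
{
        printf("reserve -> %d\n", sk_delalloc_reserve(1 << 20));
        return 0;
}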
- */ -int btrfs_delalloc_reserve_space(struct inode *inode, u64 num_bytes) -{ - int ret; - - ret = btrfs_check_data_free_space(inode, num_bytes); - if (ret) - return ret; - - ret = btrfs_delalloc_reserve_metadata(inode, num_bytes); - if (ret) { - btrfs_free_reserved_data_space(inode, num_bytes); - return ret; - } - - return 0; -} - -/** - * btrfs_delalloc_release_space - release data and metadata space for delalloc - * @inode: inode we're releasing space for - * @num_bytes: the number of bytes we want to free up - * - * This must be matched with a call to btrfs_delalloc_reserve_space. This is - * called in the case that we don't need the metadata AND data reservations - * anymore. So if there is an error or we insert an inline extent. - * - * This function will release the metadata space that was not used and will - * decrement ->delalloc_bytes and remove it from the fs_info delalloc_inodes - * list if there are no delalloc bytes left. - */ -void btrfs_delalloc_release_space(struct inode *inode, u64 num_bytes) -{ - btrfs_delalloc_release_metadata(inode, num_bytes); - btrfs_free_reserved_data_space(inode, num_bytes); -} - -static int update_block_group(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - u64 bytenr, u64 num_bytes, int alloc) -{ - struct btrfs_block_group_cache *cache = NULL; - struct btrfs_fs_info *info = root->fs_info; - u64 total = num_bytes; - u64 old_val; - u64 byte_in_group; - int factor; - - /* block accounting for super block */ - spin_lock(&info->delalloc_lock); - old_val = btrfs_super_bytes_used(info->super_copy); - if (alloc) - old_val += num_bytes; - else - old_val -= num_bytes; - btrfs_set_super_bytes_used(info->super_copy, old_val); - spin_unlock(&info->delalloc_lock); - - while (total) { - cache = btrfs_lookup_block_group(info, bytenr); - if (!cache) - return -ENOENT; - if (cache->flags & (BTRFS_BLOCK_GROUP_DUP | - BTRFS_BLOCK_GROUP_RAID1 | - BTRFS_BLOCK_GROUP_RAID10)) - factor = 2; - else - factor = 1; - /* - * If this block group has free space cache written out, we - * need to make sure to load it if we are removing space. This - * is because we need the unpinning stage to actually add the - * space back to the block group, otherwise we will leak space. 
- */ - if (!alloc && cache->cached == BTRFS_CACHE_NO) - cache_block_group(cache, trans, NULL, 1); - - byte_in_group = bytenr - cache->key.objectid; - WARN_ON(byte_in_group > cache->key.offset); - - spin_lock(&cache->space_info->lock); - spin_lock(&cache->lock); - - if (btrfs_test_opt(root, SPACE_CACHE) && - cache->disk_cache_state < BTRFS_DC_CLEAR) - cache->disk_cache_state = BTRFS_DC_CLEAR; - - cache->dirty = 1; - old_val = btrfs_block_group_used(&cache->item); - num_bytes = min(total, cache->key.offset - byte_in_group); - if (alloc) { - old_val += num_bytes; - btrfs_set_block_group_used(&cache->item, old_val); - cache->reserved -= num_bytes; - cache->space_info->bytes_reserved -= num_bytes; - cache->space_info->bytes_used += num_bytes; - cache->space_info->disk_used += num_bytes * factor; - spin_unlock(&cache->lock); - spin_unlock(&cache->space_info->lock); - } else { - old_val -= num_bytes; - btrfs_set_block_group_used(&cache->item, old_val); - cache->pinned += num_bytes; - cache->space_info->bytes_pinned += num_bytes; - cache->space_info->bytes_used -= num_bytes; - cache->space_info->disk_used -= num_bytes * factor; - spin_unlock(&cache->lock); - spin_unlock(&cache->space_info->lock); - - set_extent_dirty(info->pinned_extents, - bytenr, bytenr + num_bytes - 1, - GFP_NOFS | __GFP_NOFAIL); - } - btrfs_put_block_group(cache); - total -= num_bytes; - bytenr += num_bytes; - } - return 0; -} - -static u64 first_logical_byte(struct btrfs_root *root, u64 search_start) -{ - struct btrfs_block_group_cache *cache; - u64 bytenr; - - cache = btrfs_lookup_first_block_group(root->fs_info, search_start); - if (!cache) - return 0; - - bytenr = cache->key.objectid; - btrfs_put_block_group(cache); - - return bytenr; -} - -static int pin_down_extent(struct btrfs_root *root, - struct btrfs_block_group_cache *cache, - u64 bytenr, u64 num_bytes, int reserved) -{ - spin_lock(&cache->space_info->lock); - spin_lock(&cache->lock); - cache->pinned += num_bytes; - cache->space_info->bytes_pinned += num_bytes; - if (reserved) { - cache->reserved -= num_bytes; - cache->space_info->bytes_reserved -= num_bytes; - } - spin_unlock(&cache->lock); - spin_unlock(&cache->space_info->lock); - - set_extent_dirty(root->fs_info->pinned_extents, bytenr, - bytenr + num_bytes - 1, GFP_NOFS | __GFP_NOFAIL); - return 0; -} - -/* - * this function must be called within transaction - */ -int btrfs_pin_extent(struct btrfs_root *root, - u64 bytenr, u64 num_bytes, int reserved) -{ - struct btrfs_block_group_cache *cache; - - cache = btrfs_lookup_block_group(root->fs_info, bytenr); - BUG_ON(!cache); /* Logic error */ - - pin_down_extent(root, cache, bytenr, num_bytes, reserved); - - btrfs_put_block_group(cache); - return 0; -} - -/* - * this function must be called within transaction - */ -int btrfs_pin_extent_for_log_replay(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - u64 bytenr, u64 num_bytes) -{ - struct btrfs_block_group_cache *cache; - - cache = btrfs_lookup_block_group(root->fs_info, bytenr); - BUG_ON(!cache); /* Logic error */ - - /* - * pull in the free space cache (if any) so that our pin - * removes the free space from the cache. We have load_only set - * to one because the slow code to read in the free extents does check - * the pinned extents. 
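[Editor's note] update_block_group() above moves bytes from the reserved counter to used on allocation, and from used to pinned on free, scaling on-disk usage by the RAID factor; pin_down_extent() performs the reserved-to-pinned move directly. A compact standalone model of the update_block_group() transitions; struct and names are invented:

#include <stdio.h>
#include <stdint.h>

struct sk_group {
        uint64_t used, reserved, pinned, disk_used;
        int factor;     /* 2 for DUP/RAID1/RAID10, else 1 */
};

static void sk_update_group(struct sk_group *g, uint64_t n, int alloc)
{
        if (alloc) {
                g->used      += n;
                g->reserved  -= n;
                g->disk_used += n * g->factor;
        } else {
                g->used      -= n;
                g->pinned    += n;      /* freed space stays pinned for now */
                g->disk_used -= n * g->factor;
        }
}

int main(void)
{
        struct sk_group g = { .reserved = 4096, .factor = 2 };

        sk_update_group(&g, 4096, 1);   /* allocate one 4 KiB block */
        sk_update_group(&g, 4096, 0);   /* free it again */
        printf("used=%llu pinned=%llu\n",
               (unsigned long long)g.used, (unsigned long long)g.pinned);
        return 0;
}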
- */ - cache_block_group(cache, trans, root, 1); - - pin_down_extent(root, cache, bytenr, num_bytes, 0); - - /* remove us from the free space cache (if we're there at all) */ - btrfs_remove_free_space(cache, bytenr, num_bytes); - btrfs_put_block_group(cache); - return 0; -} - -/** - * btrfs_update_reserved_bytes - update the block_group and space info counters - * @cache: The cache we are manipulating - * @num_bytes: The number of bytes in question - * @reserve: One of the reservation enums - * - * This is called by the allocator when it reserves space, or by somebody who is - * freeing space that was never actually used on disk. For example if you - * reserve some space for a new leaf in transaction A and before transaction A - * commits you free that leaf, you call this with reserve set to 0 in order to - * clear the reservation. - * - * Metadata reservations should be called with RESERVE_ALLOC so we do the proper - * ENOSPC accounting. For data we handle the reservation through clearing the - * delalloc bits in the io_tree. We have to do this since we could end up - * allocating less disk space for the amount of data we have reserved in the - * case of compression. - * - * If this is a reservation and the block group has become read only we cannot - * make the reservation and return -EAGAIN, otherwise this function always - * succeeds. - */ -static int btrfs_update_reserved_bytes(struct btrfs_block_group_cache *cache, - u64 num_bytes, int reserve) -{ - struct btrfs_space_info *space_info = cache->space_info; - int ret = 0; - - spin_lock(&space_info->lock); - spin_lock(&cache->lock); - if (reserve != RESERVE_FREE) { - if (cache->ro) { - ret = -EAGAIN; - } else { - cache->reserved += num_bytes; - space_info->bytes_reserved += num_bytes; - if (reserve == RESERVE_ALLOC) { - trace_btrfs_space_reservation(cache->fs_info, - "space_info", space_info->flags, - num_bytes, 0); - space_info->bytes_may_use -= num_bytes; - } - } - } else { - if (cache->ro) - space_info->bytes_readonly += num_bytes; - cache->reserved -= num_bytes; - space_info->bytes_reserved -= num_bytes; - space_info->reservation_progress++; - } - spin_unlock(&cache->lock); - spin_unlock(&space_info->lock); - return ret; -} - -void btrfs_prepare_extent_commit(struct btrfs_trans_handle *trans, - struct btrfs_root *root) -{ - struct btrfs_fs_info *fs_info = root->fs_info; - struct btrfs_caching_control *next; - struct btrfs_caching_control *caching_ctl; - struct btrfs_block_group_cache *cache; - - down_write(&fs_info->extent_commit_sem); - - list_for_each_entry_safe(caching_ctl, next, - &fs_info->caching_block_groups, list) { - cache = caching_ctl->block_group; - if (block_group_cache_done(cache)) { - cache->last_byte_to_unpin = (u64)-1; - list_del_init(&caching_ctl->list); - put_caching_control(caching_ctl); - } else { - cache->last_byte_to_unpin = caching_ctl->progress; - } - } - - if (fs_info->pinned_extents == &fs_info->freed_extents[0]) - fs_info->pinned_extents = &fs_info->freed_extents[1]; - else - fs_info->pinned_extents = &fs_info->freed_extents[0]; - - up_write(&fs_info->extent_commit_sem); - - update_global_block_rsv(fs_info); -} - -static int unpin_extent_range(struct btrfs_root *root, u64 start, u64 end) -{ - struct btrfs_fs_info *fs_info = root->fs_info; - struct btrfs_block_group_cache *cache = NULL; - u64 len; - - while (start <= end) { - if (!cache || - start >= cache->key.objectid + cache->key.offset) { - if (cache) - btrfs_put_block_group(cache); - cache = btrfs_lookup_block_group(fs_info, start); - BUG_ON(!cache); 
/* Logic error */ - } - - len = cache->key.objectid + cache->key.offset - start; - len = min(len, end + 1 - start); - - if (start < cache->last_byte_to_unpin) { - len = min(len, cache->last_byte_to_unpin - start); - btrfs_add_free_space(cache, start, len); - } - - start += len; - - spin_lock(&cache->space_info->lock); - spin_lock(&cache->lock); - cache->pinned -= len; - cache->space_info->bytes_pinned -= len; - if (cache->ro) - cache->space_info->bytes_readonly += len; - spin_unlock(&cache->lock); - spin_unlock(&cache->space_info->lock); - } - - if (cache) - btrfs_put_block_group(cache); - return 0; -} - -int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans, - struct btrfs_root *root) -{ - struct btrfs_fs_info *fs_info = root->fs_info; - struct extent_io_tree *unpin; - u64 start; - u64 end; - int ret; - - if (trans->aborted) - return 0; - - if (fs_info->pinned_extents == &fs_info->freed_extents[0]) - unpin = &fs_info->freed_extents[1]; - else - unpin = &fs_info->freed_extents[0]; - - while (1) { - ret = find_first_extent_bit(unpin, 0, &start, &end, - EXTENT_DIRTY); - if (ret) - break; - - if (btrfs_test_opt(root, DISCARD)) - ret = btrfs_discard_extent(root, start, - end + 1 - start, NULL); - - clear_extent_dirty(unpin, start, end, GFP_NOFS); - unpin_extent_range(root, start, end); - cond_resched(); - } - - return 0; -} - -static int __btrfs_free_extent(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - u64 bytenr, u64 num_bytes, u64 parent, - u64 root_objectid, u64 owner_objectid, - u64 owner_offset, int refs_to_drop, - struct btrfs_delayed_extent_op *extent_op) -{ - struct btrfs_key key; - struct btrfs_path *path; - struct btrfs_fs_info *info = root->fs_info; - struct btrfs_root *extent_root = info->extent_root; - struct extent_buffer *leaf; - struct btrfs_extent_item *ei; - struct btrfs_extent_inline_ref *iref; - int ret; - int is_data; - int extent_slot = 0; - int found_extent = 0; - int num_to_del = 1; - u32 item_size; - u64 refs; - - path = btrfs_alloc_path(); - if (!path) - return -ENOMEM; - - path->reada = 1; - path->leave_spinning = 1; - - is_data = owner_objectid >= BTRFS_FIRST_FREE_OBJECTID; - BUG_ON(!is_data && refs_to_drop != 1); - - ret = lookup_extent_backref(trans, extent_root, path, &iref, - bytenr, num_bytes, parent, - root_objectid, owner_objectid, - owner_offset); - if (ret == 0) { - extent_slot = path->slots[0]; - while (extent_slot >= 0) { - btrfs_item_key_to_cpu(path->nodes[0], &key, - extent_slot); - if (key.objectid != bytenr) - break; - if (key.type == BTRFS_EXTENT_ITEM_KEY && - key.offset == num_bytes) { - found_extent = 1; - break; - } - if (path->slots[0] - extent_slot > 5) - break; - extent_slot--; - } -#ifdef BTRFS_COMPAT_EXTENT_TREE_V0 - item_size = btrfs_item_size_nr(path->nodes[0], extent_slot); - if (found_extent && item_size < sizeof(*ei)) - found_extent = 0; -#endif - if (!found_extent) { - BUG_ON(iref); - ret = remove_extent_backref(trans, extent_root, path, - NULL, refs_to_drop, - is_data); - if (ret) - goto abort; - btrfs_release_path(path); - path->leave_spinning = 1; - - key.objectid = bytenr; - key.type = BTRFS_EXTENT_ITEM_KEY; - key.offset = num_bytes; - - ret = btrfs_search_slot(trans, extent_root, - &key, path, -1, 1); - if (ret) { - printk(KERN_ERR "umm, got %d back from search" - ", was looking for %llu\n", ret, - (unsigned long long)bytenr); - if (ret > 0) - btrfs_print_leaf(extent_root, - path->nodes[0]); - } - if (ret < 0) - goto abort; - extent_slot = path->slots[0]; - } - } else if (ret == -ENOENT) { - 
btrfs_print_leaf(extent_root, path->nodes[0]); - WARN_ON(1); - printk(KERN_ERR "btrfs unable to find ref byte nr %llu " - "parent %llu root %llu owner %llu offset %llu\n", - (unsigned long long)bytenr, - (unsigned long long)parent, - (unsigned long long)root_objectid, - (unsigned long long)owner_objectid, - (unsigned long long)owner_offset); - } else { - goto abort; - } - - leaf = path->nodes[0]; - item_size = btrfs_item_size_nr(leaf, extent_slot); -#ifdef BTRFS_COMPAT_EXTENT_TREE_V0 - if (item_size < sizeof(*ei)) { - BUG_ON(found_extent || extent_slot != path->slots[0]); - ret = convert_extent_item_v0(trans, extent_root, path, - owner_objectid, 0); - if (ret < 0) - goto abort; - - btrfs_release_path(path); - path->leave_spinning = 1; - - key.objectid = bytenr; - key.type = BTRFS_EXTENT_ITEM_KEY; - key.offset = num_bytes; - - ret = btrfs_search_slot(trans, extent_root, &key, path, - -1, 1); - if (ret) { - printk(KERN_ERR "umm, got %d back from search" - ", was looking for %llu\n", ret, - (unsigned long long)bytenr); - btrfs_print_leaf(extent_root, path->nodes[0]); - } - if (ret < 0) - goto abort; - extent_slot = path->slots[0]; - leaf = path->nodes[0]; - item_size = btrfs_item_size_nr(leaf, extent_slot); - } -#endif - BUG_ON(item_size < sizeof(*ei)); - ei = btrfs_item_ptr(leaf, extent_slot, - struct btrfs_extent_item); - if (owner_objectid < BTRFS_FIRST_FREE_OBJECTID) { - struct btrfs_tree_block_info *bi; - BUG_ON(item_size < sizeof(*ei) + sizeof(*bi)); - bi = (struct btrfs_tree_block_info *)(ei + 1); - WARN_ON(owner_objectid != btrfs_tree_block_level(leaf, bi)); - } - - refs = btrfs_extent_refs(leaf, ei); - BUG_ON(refs < refs_to_drop); - refs -= refs_to_drop; - - if (refs > 0) { - if (extent_op) - __run_delayed_extent_op(extent_op, leaf, ei); - /* - * In the case of inline back ref, reference count will - * be updated by remove_extent_backref - */ - if (iref) { - BUG_ON(!found_extent); - } else { - btrfs_set_extent_refs(leaf, ei, refs); - btrfs_mark_buffer_dirty(leaf); - } - if (found_extent) { - ret = remove_extent_backref(trans, extent_root, path, - iref, refs_to_drop, - is_data); - if (ret) - goto abort; - } - } else { - if (found_extent) { - BUG_ON(is_data && refs_to_drop != - extent_data_ref_count(root, path, iref)); - if (iref) { - BUG_ON(path->slots[0] != extent_slot); - } else { - BUG_ON(path->slots[0] != extent_slot + 1); - path->slots[0] = extent_slot; - num_to_del = 2; - } - } - - ret = btrfs_del_items(trans, extent_root, path, path->slots[0], - num_to_del); - if (ret) - goto abort; - btrfs_release_path(path); - - if (is_data) { - ret = btrfs_del_csums(trans, root, bytenr, num_bytes); - if (ret) - goto abort; - } - - ret = update_block_group(trans, root, bytenr, num_bytes, 0); - if (ret) - goto abort; - } -out: - btrfs_free_path(path); - return ret; - -abort: - btrfs_abort_transaction(trans, extent_root, ret); - goto out; -} - -/* - * when we free an block, it is possible (and likely) that we free the last - * delayed ref for that extent as well. This searches the delayed ref tree for - * a given extent, and if there are no other delayed refs to be processed, it - * removes it from the tree. 
- */ -static noinline int check_ref_cleanup(struct btrfs_trans_handle *trans, - struct btrfs_root *root, u64 bytenr) -{ - struct btrfs_delayed_ref_head *head; - struct btrfs_delayed_ref_root *delayed_refs; - struct btrfs_delayed_ref_node *ref; - struct rb_node *node; - int ret = 0; - - delayed_refs = &trans->transaction->delayed_refs; - spin_lock(&delayed_refs->lock); - head = btrfs_find_delayed_ref_head(trans, bytenr); - if (!head) - goto out; - - node = rb_prev(&head->node.rb_node); - if (!node) - goto out; - - ref = rb_entry(node, struct btrfs_delayed_ref_node, rb_node); - - /* there are still entries for this ref, we can't drop it */ - if (ref->bytenr == bytenr) - goto out; - - if (head->extent_op) { - if (!head->must_insert_reserved) - goto out; - kfree(head->extent_op); - head->extent_op = NULL; - } - - /* - * waiting for the lock here would deadlock. If someone else has it - * locked they are already in the process of dropping it anyway - */ - if (!mutex_trylock(&head->mutex)) - goto out; - - /* - * at this point we have a head with no other entries. Go - * ahead and process it. - */ - head->node.in_tree = 0; - rb_erase(&head->node.rb_node, &delayed_refs->root); - - delayed_refs->num_entries--; - if (waitqueue_active(&delayed_refs->seq_wait)) - wake_up(&delayed_refs->seq_wait); - - /* - * we don't take a ref on the node because we're removing it from the - * tree, so we just steal the ref the tree was holding. - */ - delayed_refs->num_heads--; - if (list_empty(&head->cluster)) - delayed_refs->num_heads_ready--; - - list_del_init(&head->cluster); - spin_unlock(&delayed_refs->lock); - - BUG_ON(head->extent_op); - if (head->must_insert_reserved) - ret = 1; - - mutex_unlock(&head->mutex); - btrfs_put_delayed_ref(&head->node); - return ret; -out: - spin_unlock(&delayed_refs->lock); - return 0; -} - -void btrfs_free_tree_block(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct extent_buffer *buf, - u64 parent, int last_ref, int for_cow) -{ - struct btrfs_block_group_cache *cache = NULL; - int ret; - - if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) { - ret = btrfs_add_delayed_tree_ref(root->fs_info, trans, - buf->start, buf->len, - parent, root->root_key.objectid, - btrfs_header_level(buf), - BTRFS_DROP_DELAYED_REF, NULL, for_cow); - BUG_ON(ret); /* -ENOMEM */ - } - - if (!last_ref) - return; - - cache = btrfs_lookup_block_group(root->fs_info, buf->start); - - if (btrfs_header_generation(buf) == trans->transid) { - if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) { - ret = check_ref_cleanup(trans, root, buf->start); - if (!ret) - goto out; - } - - if (btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN)) { - pin_down_extent(root, cache, buf->start, buf->len, 1); - goto out; - } - - WARN_ON(test_bit(EXTENT_BUFFER_DIRTY, &buf->bflags)); - - btrfs_add_free_space(cache, buf->start, buf->len); - btrfs_update_reserved_bytes(cache, buf->len, RESERVE_FREE); - } -out: - /* - * Deleting the buffer, clear the corrupt flag since it doesn't matter - * anymore. - */ - clear_bit(EXTENT_BUFFER_CORRUPT, &buf->bflags); - btrfs_put_block_group(cache); -} - -/* Can return -ENOMEM */ -int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_root *root, - u64 bytenr, u64 num_bytes, u64 parent, u64 root_objectid, - u64 owner, u64 offset, int for_cow) -{ - int ret; - struct btrfs_fs_info *fs_info = root->fs_info; - - /* - * tree log blocks never actually go into the extent allocation - * tree, just update pinning info and exit early. 
- */ - if (root_objectid == BTRFS_TREE_LOG_OBJECTID) { - WARN_ON(owner >= BTRFS_FIRST_FREE_OBJECTID); - /* unlocks the pinned mutex */ - btrfs_pin_extent(root, bytenr, num_bytes, 1); - ret = 0; - } else if (owner < BTRFS_FIRST_FREE_OBJECTID) { - ret = btrfs_add_delayed_tree_ref(fs_info, trans, bytenr, - num_bytes, - parent, root_objectid, (int)owner, - BTRFS_DROP_DELAYED_REF, NULL, for_cow); - } else { - ret = btrfs_add_delayed_data_ref(fs_info, trans, bytenr, - num_bytes, - parent, root_objectid, owner, - offset, BTRFS_DROP_DELAYED_REF, - NULL, for_cow); - } - return ret; -} - -static u64 stripe_align(struct btrfs_root *root, u64 val) -{ - u64 mask = ((u64)root->stripesize - 1); - u64 ret = (val + mask) & ~mask; - return ret; -} - -/* - * when we wait for progress in the block group caching, its because - * our allocation attempt failed at least once. So, we must sleep - * and let some progress happen before we try again. - * - * This function will sleep at least once waiting for new free space to - * show up, and then it will check the block group free space numbers - * for our min num_bytes. Another option is to have it go ahead - * and look in the rbtree for a free extent of a given size, but this - * is a good start. - */ -static noinline int -wait_block_group_cache_progress(struct btrfs_block_group_cache *cache, - u64 num_bytes) -{ - struct btrfs_caching_control *caching_ctl; - DEFINE_WAIT(wait); - - caching_ctl = get_caching_control(cache); - if (!caching_ctl) - return 0; - - wait_event(caching_ctl->wait, block_group_cache_done(cache) || - (cache->free_space_ctl->free_space >= num_bytes)); - - put_caching_control(caching_ctl); - return 0; -} - -static noinline int -wait_block_group_cache_done(struct btrfs_block_group_cache *cache) -{ - struct btrfs_caching_control *caching_ctl; - DEFINE_WAIT(wait); - - caching_ctl = get_caching_control(cache); - if (!caching_ctl) - return 0; - - wait_event(caching_ctl->wait, block_group_cache_done(cache)); - - put_caching_control(caching_ctl); - return 0; -} - -static int __get_block_group_index(u64 flags) -{ - int index; - - if (flags & BTRFS_BLOCK_GROUP_RAID10) - index = 0; - else if (flags & BTRFS_BLOCK_GROUP_RAID1) - index = 1; - else if (flags & BTRFS_BLOCK_GROUP_DUP) - index = 2; - else if (flags & BTRFS_BLOCK_GROUP_RAID0) - index = 3; - else - index = 4; - - return index; -} - -static int get_block_group_index(struct btrfs_block_group_cache *cache) -{ - return __get_block_group_index(cache->flags); -} - -enum btrfs_loop_type { - LOOP_CACHING_NOWAIT = 0, - LOOP_CACHING_WAIT = 1, - LOOP_ALLOC_CHUNK = 2, - LOOP_NO_EMPTY_SIZE = 3, -}; - -/* - * walks the btree of allocated extents and find a hole of a given size. - * The key ins is changed to record the hole: - * ins->objectid == block start - * ins->flags = BTRFS_EXTENT_ITEM_KEY - * ins->offset == number of blocks - * Any available blocks before search_start are skipped. 
- */ -static noinline int find_free_extent(struct btrfs_trans_handle *trans, - struct btrfs_root *orig_root, - u64 num_bytes, u64 empty_size, - u64 hint_byte, struct btrfs_key *ins, - u64 data) -{ - int ret = 0; - struct btrfs_root *root = orig_root->fs_info->extent_root; - struct btrfs_free_cluster *last_ptr = NULL; - struct btrfs_block_group_cache *block_group = NULL; - struct btrfs_block_group_cache *used_block_group; - u64 search_start = 0; - int empty_cluster = 2 * 1024 * 1024; - int allowed_chunk_alloc = 0; - int done_chunk_alloc = 0; - struct btrfs_space_info *space_info; - int loop = 0; - int index = 0; - int alloc_type = (data & BTRFS_BLOCK_GROUP_DATA) ? - RESERVE_ALLOC_NO_ACCOUNT : RESERVE_ALLOC; - bool found_uncached_bg = false; - bool failed_cluster_refill = false; - bool failed_alloc = false; - bool use_cluster = true; - bool have_caching_bg = false; - - WARN_ON(num_bytes < root->sectorsize); - btrfs_set_key_type(ins, BTRFS_EXTENT_ITEM_KEY); - ins->objectid = 0; - ins->offset = 0; - - trace_find_free_extent(orig_root, num_bytes, empty_size, data); - - space_info = __find_space_info(root->fs_info, data); - if (!space_info) { - printk(KERN_ERR "No space info for %llu\n", data); - return -ENOSPC; - } - - /* - * If the space info is for both data and metadata it means we have a - * small filesystem and we can't use the clustering stuff. - */ - if (btrfs_mixed_space_info(space_info)) - use_cluster = false; - - if (orig_root->ref_cows || empty_size) - allowed_chunk_alloc = 1; - - if (data & BTRFS_BLOCK_GROUP_METADATA && use_cluster) { - last_ptr = &root->fs_info->meta_alloc_cluster; - if (!btrfs_test_opt(root, SSD)) - empty_cluster = 64 * 1024; - } - - if ((data & BTRFS_BLOCK_GROUP_DATA) && use_cluster && - btrfs_test_opt(root, SSD)) { - last_ptr = &root->fs_info->data_alloc_cluster; - } - - if (last_ptr) { - spin_lock(&last_ptr->lock); - if (last_ptr->block_group) - hint_byte = last_ptr->window_start; - spin_unlock(&last_ptr->lock); - } - - search_start = max(search_start, first_logical_byte(root, 0)); - search_start = max(search_start, hint_byte); - - if (!last_ptr) - empty_cluster = 0; - - if (search_start == hint_byte) { - block_group = btrfs_lookup_block_group(root->fs_info, - search_start); - used_block_group = block_group; - /* - * we don't want to use the block group if it doesn't match our - * allocation bits, or if its not cached. - * - * However if we are re-searching with an ideal block group - * picked out then we don't care that the block group is cached. 
- */ - if (block_group && block_group_bits(block_group, data) && - block_group->cached != BTRFS_CACHE_NO) { - down_read(&space_info->groups_sem); - if (list_empty(&block_group->list) || - block_group->ro) { - /* - * someone is removing this block group, - * we can't jump into the have_block_group - * target because our list pointers are not - * valid - */ - btrfs_put_block_group(block_group); - up_read(&space_info->groups_sem); - } else { - index = get_block_group_index(block_group); - goto have_block_group; - } - } else if (block_group) { - btrfs_put_block_group(block_group); - } - } -search: - have_caching_bg = false; - down_read(&space_info->groups_sem); - list_for_each_entry(block_group, &space_info->block_groups[index], - list) { - u64 offset; - int cached; - - used_block_group = block_group; - btrfs_get_block_group(block_group); - search_start = block_group->key.objectid; - - /* - * this can happen if we end up cycling through all the - * raid types, but we want to make sure we only allocate - * for the proper type. - */ - if (!block_group_bits(block_group, data)) { - u64 extra = BTRFS_BLOCK_GROUP_DUP | - BTRFS_BLOCK_GROUP_RAID1 | - BTRFS_BLOCK_GROUP_RAID10; - - /* - * if they asked for extra copies and this block group - * doesn't provide them, bail. This does allow us to - * fill raid0 from raid1. - */ - if ((data & extra) && !(block_group->flags & extra)) - goto loop; - } - -have_block_group: - cached = block_group_cache_done(block_group); - if (unlikely(!cached)) { - found_uncached_bg = true; - ret = cache_block_group(block_group, trans, - orig_root, 0); - BUG_ON(ret < 0); - ret = 0; - } - - if (unlikely(block_group->ro)) - goto loop; - - /* - * Ok we want to try and use the cluster allocator, so - * lets look there - */ - if (last_ptr) { - /* - * the refill lock keeps out other - * people trying to start a new cluster - */ - spin_lock(&last_ptr->refill_lock); - used_block_group = last_ptr->block_group; - if (used_block_group != block_group && - (!used_block_group || - used_block_group->ro || - !block_group_bits(used_block_group, data))) { - used_block_group = block_group; - goto refill_cluster; - } - - if (used_block_group != block_group) - btrfs_get_block_group(used_block_group); - - offset = btrfs_alloc_from_cluster(used_block_group, - last_ptr, num_bytes, used_block_group->key.objectid); - if (offset) { - /* we have a block, we're done */ - spin_unlock(&last_ptr->refill_lock); - trace_btrfs_reserve_extent_cluster(root, - block_group, search_start, num_bytes); - goto checks; - } - - WARN_ON(last_ptr->block_group != used_block_group); - if (used_block_group != block_group) { - btrfs_put_block_group(used_block_group); - used_block_group = block_group; - } -refill_cluster: - BUG_ON(used_block_group != block_group); - /* If we are on LOOP_NO_EMPTY_SIZE, we can't - * set up a new clusters, so lets just skip it - * and let the allocator find whatever block - * it can find. If we reach this point, we - * will have tried the cluster allocator - * plenty of times and not have found - * anything, so we are likely way too - * fragmented for the clustering stuff to find - * anything. - * - * However, if the cluster is taken from the - * current block group, release the cluster - * first, so that we stand a better chance of - * succeeding in the unclustered - * allocation. 
*/ - if (loop >= LOOP_NO_EMPTY_SIZE && - last_ptr->block_group != block_group) { - spin_unlock(&last_ptr->refill_lock); - goto unclustered_alloc; - } - - /* - * this cluster didn't work out, free it and - * start over - */ - btrfs_return_cluster_to_free_space(NULL, last_ptr); - - if (loop >= LOOP_NO_EMPTY_SIZE) { - spin_unlock(&last_ptr->refill_lock); - goto unclustered_alloc; - } - - /* allocate a cluster in this block group */ - ret = btrfs_find_space_cluster(trans, root, - block_group, last_ptr, - search_start, num_bytes, - empty_cluster + empty_size); - if (ret == 0) { - /* - * now pull our allocation out of this - * cluster - */ - offset = btrfs_alloc_from_cluster(block_group, - last_ptr, num_bytes, - search_start); - if (offset) { - /* we found one, proceed */ - spin_unlock(&last_ptr->refill_lock); - trace_btrfs_reserve_extent_cluster(root, - block_group, search_start, - num_bytes); - goto checks; - } - } else if (!cached && loop > LOOP_CACHING_NOWAIT - && !failed_cluster_refill) { - spin_unlock(&last_ptr->refill_lock); - - failed_cluster_refill = true; - wait_block_group_cache_progress(block_group, - num_bytes + empty_cluster + empty_size); - goto have_block_group; - } - - /* - * at this point we either didn't find a cluster - * or we weren't able to allocate a block from our - * cluster. Free the cluster we've been trying - * to use, and go to the next block group - */ - btrfs_return_cluster_to_free_space(NULL, last_ptr); - spin_unlock(&last_ptr->refill_lock); - goto loop; - } - -unclustered_alloc: - spin_lock(&block_group->free_space_ctl->tree_lock); - if (cached && - block_group->free_space_ctl->free_space < - num_bytes + empty_cluster + empty_size) { - spin_unlock(&block_group->free_space_ctl->tree_lock); - goto loop; - } - spin_unlock(&block_group->free_space_ctl->tree_lock); - - offset = btrfs_find_space_for_alloc(block_group, search_start, - num_bytes, empty_size); - /* - * If we didn't find a chunk, and we haven't failed on this - * block group before, and this block group is in the middle of - * caching and we are ok with waiting, then go ahead and wait - * for progress to be made, and set failed_alloc to true. - * - * If failed_alloc is true then we've already waited on this - * block group once and should move on to the next block group. 
- */ - if (!offset && !failed_alloc && !cached && - loop > LOOP_CACHING_NOWAIT) { - wait_block_group_cache_progress(block_group, - num_bytes + empty_size); - failed_alloc = true; - goto have_block_group; - } else if (!offset) { - if (!cached) - have_caching_bg = true; - goto loop; - } -checks: - search_start = stripe_align(root, offset); - - /* move on to the next group */ - if (search_start + num_bytes > - used_block_group->key.objectid + used_block_group->key.offset) { - btrfs_add_free_space(used_block_group, offset, num_bytes); - goto loop; - } - - if (offset < search_start) - btrfs_add_free_space(used_block_group, offset, - search_start - offset); - BUG_ON(offset > search_start); - - ret = btrfs_update_reserved_bytes(used_block_group, num_bytes, - alloc_type); - if (ret == -EAGAIN) { - btrfs_add_free_space(used_block_group, offset, num_bytes); - goto loop; - } - - /* we are all good, lets return */ - ins->objectid = search_start; - ins->offset = num_bytes; - - trace_btrfs_reserve_extent(orig_root, block_group, - search_start, num_bytes); - if (offset < search_start) - btrfs_add_free_space(used_block_group, offset, - search_start - offset); - BUG_ON(offset > search_start); - if (used_block_group != block_group) - btrfs_put_block_group(used_block_group); - btrfs_put_block_group(block_group); - break; -loop: - failed_cluster_refill = false; - failed_alloc = false; - BUG_ON(index != get_block_group_index(block_group)); - if (used_block_group != block_group) - btrfs_put_block_group(used_block_group); - btrfs_put_block_group(block_group); - } - up_read(&space_info->groups_sem); - - if (!ins->objectid && loop >= LOOP_CACHING_WAIT && have_caching_bg) - goto search; - - if (!ins->objectid && ++index < BTRFS_NR_RAID_TYPES) - goto search; - - /* - * LOOP_CACHING_NOWAIT, search partially cached block groups, kicking - * caching kthreads as we move along - * LOOP_CACHING_WAIT, search everything, and wait if our bg is caching - * LOOP_ALLOC_CHUNK, force a chunk allocation and try again - * LOOP_NO_EMPTY_SIZE, set empty_size and empty_cluster to 0 and try - * again - */ - if (!ins->objectid && loop < LOOP_NO_EMPTY_SIZE) { - index = 0; - loop++; - if (loop == LOOP_ALLOC_CHUNK) { - if (allowed_chunk_alloc) { - ret = do_chunk_alloc(trans, root, num_bytes + - 2 * 1024 * 1024, data, - CHUNK_ALLOC_LIMITED); - if (ret < 0) { - btrfs_abort_transaction(trans, - root, ret); - goto out; - } - allowed_chunk_alloc = 0; - if (ret == 1) - done_chunk_alloc = 1; - } else if (!done_chunk_alloc && - space_info->force_alloc == - CHUNK_ALLOC_NO_FORCE) { - space_info->force_alloc = CHUNK_ALLOC_LIMITED; - } - - /* - * We didn't allocate a chunk, go ahead and drop the - * empty size and loop again. - */ - if (!done_chunk_alloc) - loop = LOOP_NO_EMPTY_SIZE; - } - - if (loop == LOOP_NO_EMPTY_SIZE) { - empty_size = 0; - empty_cluster = 0; - } - - goto search; - } else if (!ins->objectid) { - ret = -ENOSPC; - } else if (ins->objectid) { - ret = 0; - } -out: - - return ret; -} - -static void dump_space_info(struct btrfs_space_info *info, u64 bytes, - int dump_block_groups) -{ - struct btrfs_block_group_cache *cache; - int index = 0; - - spin_lock(&info->lock); - printk(KERN_INFO "space_info %llu has %llu free, is %sfull\n", - (unsigned long long)info->flags, - (unsigned long long)(info->total_bytes - info->bytes_used - - info->bytes_pinned - info->bytes_reserved - - info->bytes_readonly), - (info->full) ? 
"" : "not "); - printk(KERN_INFO "space_info total=%llu, used=%llu, pinned=%llu, " - "reserved=%llu, may_use=%llu, readonly=%llu\n", - (unsigned long long)info->total_bytes, - (unsigned long long)info->bytes_used, - (unsigned long long)info->bytes_pinned, - (unsigned long long)info->bytes_reserved, - (unsigned long long)info->bytes_may_use, - (unsigned long long)info->bytes_readonly); - spin_unlock(&info->lock); - - if (!dump_block_groups) - return; - - down_read(&info->groups_sem); -again: - list_for_each_entry(cache, &info->block_groups[index], list) { - spin_lock(&cache->lock); - printk(KERN_INFO "block group %llu has %llu bytes, %llu used " - "%llu pinned %llu reserved\n", - (unsigned long long)cache->key.objectid, - (unsigned long long)cache->key.offset, - (unsigned long long)btrfs_block_group_used(&cache->item), - (unsigned long long)cache->pinned, - (unsigned long long)cache->reserved); - btrfs_dump_free_space(cache, bytes); - spin_unlock(&cache->lock); - } - if (++index < BTRFS_NR_RAID_TYPES) - goto again; - up_read(&info->groups_sem); -} - -int btrfs_reserve_extent(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - u64 num_bytes, u64 min_alloc_size, - u64 empty_size, u64 hint_byte, - struct btrfs_key *ins, u64 data) -{ - bool final_tried = false; - int ret; - - data = btrfs_get_alloc_profile(root, data); -again: - /* - * the only place that sets empty_size is btrfs_realloc_node, which - * is not called recursively on allocations - */ - if (empty_size || root->ref_cows) { - ret = do_chunk_alloc(trans, root->fs_info->extent_root, - num_bytes + 2 * 1024 * 1024, data, - CHUNK_ALLOC_NO_FORCE); - if (ret < 0 && ret != -ENOSPC) { - btrfs_abort_transaction(trans, root, ret); - return ret; - } - } - - WARN_ON(num_bytes < root->sectorsize); - ret = find_free_extent(trans, root, num_bytes, empty_size, - hint_byte, ins, data); - - if (ret == -ENOSPC) { - if (!final_tried) { - num_bytes = num_bytes >> 1; - num_bytes = num_bytes & ~(root->sectorsize - 1); - num_bytes = max(num_bytes, min_alloc_size); - ret = do_chunk_alloc(trans, root->fs_info->extent_root, - num_bytes, data, CHUNK_ALLOC_FORCE); - if (ret < 0 && ret != -ENOSPC) { - btrfs_abort_transaction(trans, root, ret); - return ret; - } - if (num_bytes == min_alloc_size) - final_tried = true; - goto again; - } else if (btrfs_test_opt(root, ENOSPC_DEBUG)) { - struct btrfs_space_info *sinfo; - - sinfo = __find_space_info(root->fs_info, data); - printk(KERN_ERR "btrfs allocation failed flags %llu, " - "wanted %llu\n", (unsigned long long)data, - (unsigned long long)num_bytes); - if (sinfo) - dump_space_info(sinfo, num_bytes, 1); - } - } - - trace_btrfs_reserved_extent_alloc(root, ins->objectid, ins->offset); - - return ret; -} - -static int __btrfs_free_reserved_extent(struct btrfs_root *root, - u64 start, u64 len, int pin) -{ - struct btrfs_block_group_cache *cache; - int ret = 0; - - cache = btrfs_lookup_block_group(root->fs_info, start); - if (!cache) { - printk(KERN_ERR "Unable to find block group for %llu\n", - (unsigned long long)start); - return -ENOSPC; - } - - if (btrfs_test_opt(root, DISCARD)) - ret = btrfs_discard_extent(root, start, len, NULL); - - if (pin) - pin_down_extent(root, cache, start, len, 1); - else { - btrfs_add_free_space(cache, start, len); - btrfs_update_reserved_bytes(cache, len, RESERVE_FREE); - } - btrfs_put_block_group(cache); - - trace_btrfs_reserved_extent_free(root, start, len); - - return ret; -} - -int btrfs_free_reserved_extent(struct btrfs_root *root, - u64 start, u64 len) -{ - return 
__btrfs_free_reserved_extent(root, start, len, 0); -} - -int btrfs_free_and_pin_reserved_extent(struct btrfs_root *root, - u64 start, u64 len) -{ - return __btrfs_free_reserved_extent(root, start, len, 1); -} - -static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - u64 parent, u64 root_objectid, - u64 flags, u64 owner, u64 offset, - struct btrfs_key *ins, int ref_mod) -{ - int ret; - struct btrfs_fs_info *fs_info = root->fs_info; - struct btrfs_extent_item *extent_item; - struct btrfs_extent_inline_ref *iref; - struct btrfs_path *path; - struct extent_buffer *leaf; - int type; - u32 size; - - if (parent > 0) - type = BTRFS_SHARED_DATA_REF_KEY; - else - type = BTRFS_EXTENT_DATA_REF_KEY; - - size = sizeof(*extent_item) + btrfs_extent_inline_ref_size(type); - - path = btrfs_alloc_path(); - if (!path) - return -ENOMEM; - - path->leave_spinning = 1; - ret = btrfs_insert_empty_item(trans, fs_info->extent_root, path, - ins, size); - if (ret) { - btrfs_free_path(path); - return ret; - } - - leaf = path->nodes[0]; - extent_item = btrfs_item_ptr(leaf, path->slots[0], - struct btrfs_extent_item); - btrfs_set_extent_refs(leaf, extent_item, ref_mod); - btrfs_set_extent_generation(leaf, extent_item, trans->transid); - btrfs_set_extent_flags(leaf, extent_item, - flags | BTRFS_EXTENT_FLAG_DATA); - - iref = (struct btrfs_extent_inline_ref *)(extent_item + 1); - btrfs_set_extent_inline_ref_type(leaf, iref, type); - if (parent > 0) { - struct btrfs_shared_data_ref *ref; - ref = (struct btrfs_shared_data_ref *)(iref + 1); - btrfs_set_extent_inline_ref_offset(leaf, iref, parent); - btrfs_set_shared_data_ref_count(leaf, ref, ref_mod); - } else { - struct btrfs_extent_data_ref *ref; - ref = (struct btrfs_extent_data_ref *)(&iref->offset); - btrfs_set_extent_data_ref_root(leaf, ref, root_objectid); - btrfs_set_extent_data_ref_objectid(leaf, ref, owner); - btrfs_set_extent_data_ref_offset(leaf, ref, offset); - btrfs_set_extent_data_ref_count(leaf, ref, ref_mod); - } - - btrfs_mark_buffer_dirty(path->nodes[0]); - btrfs_free_path(path); - - ret = update_block_group(trans, root, ins->objectid, ins->offset, 1); - if (ret) { /* -ENOENT, logic error */ - printk(KERN_ERR "btrfs update block group failed for %llu " - "%llu\n", (unsigned long long)ins->objectid, - (unsigned long long)ins->offset); - BUG(); - } - return ret; -} - -static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - u64 parent, u64 root_objectid, - u64 flags, struct btrfs_disk_key *key, - int level, struct btrfs_key *ins) -{ - int ret; - struct btrfs_fs_info *fs_info = root->fs_info; - struct btrfs_extent_item *extent_item; - struct btrfs_tree_block_info *block_info; - struct btrfs_extent_inline_ref *iref; - struct btrfs_path *path; - struct extent_buffer *leaf; - u32 size = sizeof(*extent_item) + sizeof(*block_info) + sizeof(*iref); - - path = btrfs_alloc_path(); - if (!path) - return -ENOMEM; - - path->leave_spinning = 1; - ret = btrfs_insert_empty_item(trans, fs_info->extent_root, path, - ins, size); - if (ret) { - btrfs_free_path(path); - return ret; - } - - leaf = path->nodes[0]; - extent_item = btrfs_item_ptr(leaf, path->slots[0], - struct btrfs_extent_item); - btrfs_set_extent_refs(leaf, extent_item, 1); - btrfs_set_extent_generation(leaf, extent_item, trans->transid); - btrfs_set_extent_flags(leaf, extent_item, - flags | BTRFS_EXTENT_FLAG_TREE_BLOCK); - block_info = (struct btrfs_tree_block_info *)(extent_item + 1); - - btrfs_set_tree_block_key(leaf, 
block_info, key); - btrfs_set_tree_block_level(leaf, block_info, level); - - iref = (struct btrfs_extent_inline_ref *)(block_info + 1); - if (parent > 0) { - BUG_ON(!(flags & BTRFS_BLOCK_FLAG_FULL_BACKREF)); - btrfs_set_extent_inline_ref_type(leaf, iref, - BTRFS_SHARED_BLOCK_REF_KEY); - btrfs_set_extent_inline_ref_offset(leaf, iref, parent); - } else { - btrfs_set_extent_inline_ref_type(leaf, iref, - BTRFS_TREE_BLOCK_REF_KEY); - btrfs_set_extent_inline_ref_offset(leaf, iref, root_objectid); - } - - btrfs_mark_buffer_dirty(leaf); - btrfs_free_path(path); - - ret = update_block_group(trans, root, ins->objectid, ins->offset, 1); - if (ret) { /* -ENOENT, logic error */ - printk(KERN_ERR "btrfs update block group failed for %llu " - "%llu\n", (unsigned long long)ins->objectid, - (unsigned long long)ins->offset); - BUG(); - } - return ret; -} - -int btrfs_alloc_reserved_file_extent(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - u64 root_objectid, u64 owner, - u64 offset, struct btrfs_key *ins) -{ - int ret; - - BUG_ON(root_objectid == BTRFS_TREE_LOG_OBJECTID); - - ret = btrfs_add_delayed_data_ref(root->fs_info, trans, ins->objectid, - ins->offset, 0, - root_objectid, owner, offset, - BTRFS_ADD_DELAYED_EXTENT, NULL, 0); - return ret; -} - -/* - * this is used by the tree logging recovery code. It records that - * an extent has been allocated and makes sure to clear the free - * space cache bits as well - */ -int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - u64 root_objectid, u64 owner, u64 offset, - struct btrfs_key *ins) -{ - int ret; - struct btrfs_block_group_cache *block_group; - struct btrfs_caching_control *caching_ctl; - u64 start = ins->objectid; - u64 num_bytes = ins->offset; - - block_group = btrfs_lookup_block_group(root->fs_info, ins->objectid); - cache_block_group(block_group, trans, NULL, 0); - caching_ctl = get_caching_control(block_group); - - if (!caching_ctl) { - BUG_ON(!block_group_cache_done(block_group)); - ret = btrfs_remove_free_space(block_group, start, num_bytes); - BUG_ON(ret); /* -ENOMEM */ - } else { - mutex_lock(&caching_ctl->mutex); - - if (start >= caching_ctl->progress) { - ret = add_excluded_extent(root, start, num_bytes); - BUG_ON(ret); /* -ENOMEM */ - } else if (start + num_bytes <= caching_ctl->progress) { - ret = btrfs_remove_free_space(block_group, - start, num_bytes); - BUG_ON(ret); /* -ENOMEM */ - } else { - num_bytes = caching_ctl->progress - start; - ret = btrfs_remove_free_space(block_group, - start, num_bytes); - BUG_ON(ret); /* -ENOMEM */ - - start = caching_ctl->progress; - num_bytes = ins->objectid + ins->offset - - caching_ctl->progress; - ret = add_excluded_extent(root, start, num_bytes); - BUG_ON(ret); /* -ENOMEM */ - } - - mutex_unlock(&caching_ctl->mutex); - put_caching_control(caching_ctl); - } - - ret = btrfs_update_reserved_bytes(block_group, ins->offset, - RESERVE_ALLOC_NO_ACCOUNT); - BUG_ON(ret); /* logic error */ - btrfs_put_block_group(block_group); - ret = alloc_reserved_file_extent(trans, root, 0, root_objectid, - 0, owner, offset, ins, 1); - return ret; -} - -struct extent_buffer *btrfs_init_new_buffer(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - u64 bytenr, u32 blocksize, - int level) -{ - struct extent_buffer *buf; - - buf = btrfs_find_create_tree_block(root, bytenr, blocksize); - if (!buf) - return ERR_PTR(-ENOMEM); - btrfs_set_header_generation(buf, trans->transid); - btrfs_set_buffer_lockdep_class(root->root_key.objectid, buf, level); - 
btrfs_tree_lock(buf);
-	clean_tree_block(trans, root, buf);
-	clear_bit(EXTENT_BUFFER_STALE, &buf->bflags);
-
-	btrfs_set_lock_blocking(buf);
-	btrfs_set_buffer_uptodate(buf);
-
-	if (root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID) {
-		/*
-		 * we allow two log transactions at a time, use different
-		 * EXTENT bits to differentiate dirty pages.
-		 */
-		if (root->log_transid % 2 == 0)
-			set_extent_dirty(&root->dirty_log_pages, buf->start,
-					buf->start + buf->len - 1, GFP_NOFS);
-		else
-			set_extent_new(&root->dirty_log_pages, buf->start,
-					buf->start + buf->len - 1, GFP_NOFS);
-	} else {
-		set_extent_dirty(&trans->transaction->dirty_pages, buf->start,
-				 buf->start + buf->len - 1, GFP_NOFS);
-	}
-	trans->blocks_used++;
-	/* this returns a buffer locked for blocking */
-	return buf;
-}
-
-static struct btrfs_block_rsv *
-use_block_rsv(struct btrfs_trans_handle *trans,
-	      struct btrfs_root *root, u32 blocksize)
-{
-	struct btrfs_block_rsv *block_rsv;
-	struct btrfs_block_rsv *global_rsv = &root->fs_info->global_block_rsv;
-	int ret;
-
-	block_rsv = get_block_rsv(trans, root);
-
-	if (block_rsv->size == 0) {
-		ret = reserve_metadata_bytes(root, block_rsv, blocksize, 0);
-		/*
-		 * If we couldn't reserve metadata bytes, try to use some from
-		 * the global reserve.
-		 */
-		if (ret && block_rsv != global_rsv) {
-			ret = block_rsv_use_bytes(global_rsv, blocksize);
-			if (!ret)
-				return global_rsv;
-			return ERR_PTR(ret);
-		} else if (ret) {
-			return ERR_PTR(ret);
-		}
-		return block_rsv;
-	}
-
-	ret = block_rsv_use_bytes(block_rsv, blocksize);
-	if (!ret)
-		return block_rsv;
-	if (ret) {
-		static DEFINE_RATELIMIT_STATE(_rs,
-				DEFAULT_RATELIMIT_INTERVAL,
-				/*DEFAULT_RATELIMIT_BURST*/ 2);
-		if (__ratelimit(&_rs)) {
-			printk(KERN_DEBUG "btrfs: block rsv returned %d\n", ret);
-			WARN_ON(1);
-		}
-		ret = reserve_metadata_bytes(root, block_rsv, blocksize, 0);
-		if (!ret) {
-			return block_rsv;
-		} else if (ret && block_rsv != global_rsv) {
-			ret = block_rsv_use_bytes(global_rsv, blocksize);
-			if (!ret)
-				return global_rsv;
-		}
-	}
-
-	return ERR_PTR(-ENOSPC);
-}
-
-static void unuse_block_rsv(struct btrfs_fs_info *fs_info,
-			    struct btrfs_block_rsv *block_rsv, u32 blocksize)
-{
-	block_rsv_add_bytes(block_rsv, blocksize, 0);
-	block_rsv_release_bytes(fs_info, block_rsv, NULL, 0);
-}
-
-/*
- * finds a free extent and does all the dirty work required for allocation;
- * returns the key for the extent through ins, and a tree buffer for
- * the first block of the extent through buf.
- *
- * returns the tree buffer or NULL.
- */ -struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans, - struct btrfs_root *root, u32 blocksize, - u64 parent, u64 root_objectid, - struct btrfs_disk_key *key, int level, - u64 hint, u64 empty_size, int for_cow) -{ - struct btrfs_key ins; - struct btrfs_block_rsv *block_rsv; - struct extent_buffer *buf; - u64 flags = 0; - int ret; - - - block_rsv = use_block_rsv(trans, root, blocksize); - if (IS_ERR(block_rsv)) - return ERR_CAST(block_rsv); - - ret = btrfs_reserve_extent(trans, root, blocksize, blocksize, - empty_size, hint, &ins, 0); - if (ret) { - unuse_block_rsv(root->fs_info, block_rsv, blocksize); - return ERR_PTR(ret); - } - - buf = btrfs_init_new_buffer(trans, root, ins.objectid, - blocksize, level); - BUG_ON(IS_ERR(buf)); /* -ENOMEM */ - - if (root_objectid == BTRFS_TREE_RELOC_OBJECTID) { - if (parent == 0) - parent = ins.objectid; - flags |= BTRFS_BLOCK_FLAG_FULL_BACKREF; - } else - BUG_ON(parent > 0); - - if (root_objectid != BTRFS_TREE_LOG_OBJECTID) { - struct btrfs_delayed_extent_op *extent_op; - extent_op = kmalloc(sizeof(*extent_op), GFP_NOFS); - BUG_ON(!extent_op); /* -ENOMEM */ - if (key) - memcpy(&extent_op->key, key, sizeof(extent_op->key)); - else - memset(&extent_op->key, 0, sizeof(extent_op->key)); - extent_op->flags_to_set = flags; - extent_op->update_key = 1; - extent_op->update_flags = 1; - extent_op->is_data = 0; - - ret = btrfs_add_delayed_tree_ref(root->fs_info, trans, - ins.objectid, - ins.offset, parent, root_objectid, - level, BTRFS_ADD_DELAYED_EXTENT, - extent_op, for_cow); - BUG_ON(ret); /* -ENOMEM */ - } - return buf; -} - -struct walk_control { - u64 refs[BTRFS_MAX_LEVEL]; - u64 flags[BTRFS_MAX_LEVEL]; - struct btrfs_key update_progress; - int stage; - int level; - int shared_level; - int update_ref; - int keep_locks; - int reada_slot; - int reada_count; - int for_reloc; -}; - -#define DROP_REFERENCE 1 -#define UPDATE_BACKREF 2 - -static noinline void reada_walk_down(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct walk_control *wc, - struct btrfs_path *path) -{ - u64 bytenr; - u64 generation; - u64 refs; - u64 flags; - u32 nritems; - u32 blocksize; - struct btrfs_key key; - struct extent_buffer *eb; - int ret; - int slot; - int nread = 0; - - if (path->slots[wc->level] < wc->reada_slot) { - wc->reada_count = wc->reada_count * 2 / 3; - wc->reada_count = max(wc->reada_count, 2); - } else { - wc->reada_count = wc->reada_count * 3 / 2; - wc->reada_count = min_t(int, wc->reada_count, - BTRFS_NODEPTRS_PER_BLOCK(root)); - } - - eb = path->nodes[wc->level]; - nritems = btrfs_header_nritems(eb); - blocksize = btrfs_level_size(root, wc->level - 1); - - for (slot = path->slots[wc->level]; slot < nritems; slot++) { - if (nread >= wc->reada_count) - break; - - cond_resched(); - bytenr = btrfs_node_blockptr(eb, slot); - generation = btrfs_node_ptr_generation(eb, slot); - - if (slot == path->slots[wc->level]) - goto reada; - - if (wc->stage == UPDATE_BACKREF && - generation <= root->root_key.offset) - continue; - - /* We don't lock the tree block, it's OK to be racy here */ - ret = btrfs_lookup_extent_info(trans, root, bytenr, blocksize, - &refs, &flags); - /* We don't care about errors in readahead. 
*/
-		if (ret < 0)
-			continue;
-		BUG_ON(refs == 0);
-
-		if (wc->stage == DROP_REFERENCE) {
-			if (refs == 1)
-				goto reada;
-
-			if (wc->level == 1 &&
-			    (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF))
-				continue;
-			if (!wc->update_ref ||
-			    generation <= root->root_key.offset)
-				continue;
-			btrfs_node_key_to_cpu(eb, &key, slot);
-			ret = btrfs_comp_cpu_keys(&key,
-						  &wc->update_progress);
-			if (ret < 0)
-				continue;
-		} else {
-			if (wc->level == 1 &&
-			    (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF))
-				continue;
-		}
-reada:
-		ret = readahead_tree_block(root, bytenr, blocksize,
-					   generation);
-		if (ret)
-			break;
-		nread++;
-	}
-	wc->reada_slot = slot;
-}
-
-/*
- * helper to process a tree block while walking down the tree.
- *
- * when wc->stage == UPDATE_BACKREF, this function updates
- * back refs for pointers in the block.
- *
- * NOTE: return value 1 means we should stop walking down.
- */
-static noinline int walk_down_proc(struct btrfs_trans_handle *trans,
-				   struct btrfs_root *root,
-				   struct btrfs_path *path,
-				   struct walk_control *wc, int lookup_info)
-{
-	int level = wc->level;
-	struct extent_buffer *eb = path->nodes[level];
-	u64 flag = BTRFS_BLOCK_FLAG_FULL_BACKREF;
-	int ret;
-
-	if (wc->stage == UPDATE_BACKREF &&
-	    btrfs_header_owner(eb) != root->root_key.objectid)
-		return 1;
-
-	/*
-	 * when the reference count of a tree block is 1, it won't increase
-	 * again. once the full backref flag is set, we never clear it.
-	 */
-	if (lookup_info &&
-	    ((wc->stage == DROP_REFERENCE && wc->refs[level] != 1) ||
-	     (wc->stage == UPDATE_BACKREF && !(wc->flags[level] & flag)))) {
-		BUG_ON(!path->locks[level]);
-		ret = btrfs_lookup_extent_info(trans, root,
-					       eb->start, eb->len,
-					       &wc->refs[level],
-					       &wc->flags[level]);
-		BUG_ON(ret == -ENOMEM);
-		if (ret)
-			return ret;
-		BUG_ON(wc->refs[level] == 0);
-	}
-
-	if (wc->stage == DROP_REFERENCE) {
-		if (wc->refs[level] > 1)
-			return 1;
-
-		if (path->locks[level] && !wc->keep_locks) {
-			btrfs_tree_unlock_rw(eb, path->locks[level]);
-			path->locks[level] = 0;
-		}
-		return 0;
-	}
-
-	/* wc->stage == UPDATE_BACKREF */
-	if (!(wc->flags[level] & flag)) {
-		BUG_ON(!path->locks[level]);
-		ret = btrfs_inc_ref(trans, root, eb, 1, wc->for_reloc);
-		BUG_ON(ret); /* -ENOMEM */
-		ret = btrfs_dec_ref(trans, root, eb, 0, wc->for_reloc);
-		BUG_ON(ret); /* -ENOMEM */
-		ret = btrfs_set_disk_extent_flags(trans, root, eb->start,
-						  eb->len, flag, 0);
-		BUG_ON(ret); /* -ENOMEM */
-		wc->flags[level] |= flag;
-	}
-
-	/*
-	 * the block is shared by multiple trees, so it's not good to
-	 * keep the tree lock
-	 */
-	if (path->locks[level] && level > 0) {
-		btrfs_tree_unlock_rw(eb, path->locks[level]);
-		path->locks[level] = 0;
-	}
-	return 0;
-}
-
-/*
- * helper to process a tree block pointer.
- *
- * when wc->stage == DROP_REFERENCE, this function checks the
- * reference count of the block pointed to. if the block
- * is shared and we need to update back refs for the subtree
- * rooted at the block, this function changes wc->stage to
- * UPDATE_BACKREF. if the block is shared and there is no
- * need to update backrefs, this function drops the reference
- * to the block.
- *
- * NOTE: return value 1 means we should stop walking down.
- */ -static noinline int do_walk_down(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct btrfs_path *path, - struct walk_control *wc, int *lookup_info) -{ - u64 bytenr; - u64 generation; - u64 parent; - u32 blocksize; - struct btrfs_key key; - struct extent_buffer *next; - int level = wc->level; - int reada = 0; - int ret = 0; - - generation = btrfs_node_ptr_generation(path->nodes[level], - path->slots[level]); - /* - * if the lower level block was created before the snapshot - * was created, we know there is no need to update back refs - * for the subtree - */ - if (wc->stage == UPDATE_BACKREF && - generation <= root->root_key.offset) { - *lookup_info = 1; - return 1; - } - - bytenr = btrfs_node_blockptr(path->nodes[level], path->slots[level]); - blocksize = btrfs_level_size(root, level - 1); - - next = btrfs_find_tree_block(root, bytenr, blocksize); - if (!next) { - next = btrfs_find_create_tree_block(root, bytenr, blocksize); - if (!next) - return -ENOMEM; - reada = 1; - } - btrfs_tree_lock(next); - btrfs_set_lock_blocking(next); - - ret = btrfs_lookup_extent_info(trans, root, bytenr, blocksize, - &wc->refs[level - 1], - &wc->flags[level - 1]); - if (ret < 0) { - btrfs_tree_unlock(next); - return ret; - } - - BUG_ON(wc->refs[level - 1] == 0); - *lookup_info = 0; - - if (wc->stage == DROP_REFERENCE) { - if (wc->refs[level - 1] > 1) { - if (level == 1 && - (wc->flags[0] & BTRFS_BLOCK_FLAG_FULL_BACKREF)) - goto skip; - - if (!wc->update_ref || - generation <= root->root_key.offset) - goto skip; - - btrfs_node_key_to_cpu(path->nodes[level], &key, - path->slots[level]); - ret = btrfs_comp_cpu_keys(&key, &wc->update_progress); - if (ret < 0) - goto skip; - - wc->stage = UPDATE_BACKREF; - wc->shared_level = level - 1; - } - } else { - if (level == 1 && - (wc->flags[0] & BTRFS_BLOCK_FLAG_FULL_BACKREF)) - goto skip; - } - - if (!btrfs_buffer_uptodate(next, generation, 0)) { - btrfs_tree_unlock(next); - free_extent_buffer(next); - next = NULL; - *lookup_info = 1; - } - - if (!next) { - if (reada && level == 1) - reada_walk_down(trans, root, wc, path); - next = read_tree_block(root, bytenr, blocksize, generation); - if (!next) - return -EIO; - btrfs_tree_lock(next); - btrfs_set_lock_blocking(next); - } - - level--; - BUG_ON(level != btrfs_header_level(next)); - path->nodes[level] = next; - path->slots[level] = 0; - path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING; - wc->level = level; - if (wc->level == 1) - wc->reada_slot = 0; - return 0; -skip: - wc->refs[level - 1] = 0; - wc->flags[level - 1] = 0; - if (wc->stage == DROP_REFERENCE) { - if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF) { - parent = path->nodes[level]->start; - } else { - BUG_ON(root->root_key.objectid != - btrfs_header_owner(path->nodes[level])); - parent = 0; - } - - ret = btrfs_free_extent(trans, root, bytenr, blocksize, parent, - root->root_key.objectid, level - 1, 0, 0); - BUG_ON(ret); /* -ENOMEM */ - } - btrfs_tree_unlock(next); - free_extent_buffer(next); - *lookup_info = 1; - return 1; -} - -/* - * hepler to process tree block while walking up the tree. - * - * when wc->stage == DROP_REFERENCE, this function drops - * reference count on the block. - * - * when wc->stage == UPDATE_BACKREF, this function changes - * wc->stage back to DROP_REFERENCE if we changed wc->stage - * to UPDATE_BACKREF previously while processing the block. - * - * NOTE: return value 1 means we should stop walking up. 
- */ -static noinline int walk_up_proc(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct btrfs_path *path, - struct walk_control *wc) -{ - int ret; - int level = wc->level; - struct extent_buffer *eb = path->nodes[level]; - u64 parent = 0; - - if (wc->stage == UPDATE_BACKREF) { - BUG_ON(wc->shared_level < level); - if (level < wc->shared_level) - goto out; - - ret = find_next_key(path, level + 1, &wc->update_progress); - if (ret > 0) - wc->update_ref = 0; - - wc->stage = DROP_REFERENCE; - wc->shared_level = -1; - path->slots[level] = 0; - - /* - * check reference count again if the block isn't locked. - * we should start walking down the tree again if reference - * count is one. - */ - if (!path->locks[level]) { - BUG_ON(level == 0); - btrfs_tree_lock(eb); - btrfs_set_lock_blocking(eb); - path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING; - - ret = btrfs_lookup_extent_info(trans, root, - eb->start, eb->len, - &wc->refs[level], - &wc->flags[level]); - if (ret < 0) { - btrfs_tree_unlock_rw(eb, path->locks[level]); - return ret; - } - BUG_ON(wc->refs[level] == 0); - if (wc->refs[level] == 1) { - btrfs_tree_unlock_rw(eb, path->locks[level]); - return 1; - } - } - } - - /* wc->stage == DROP_REFERENCE */ - BUG_ON(wc->refs[level] > 1 && !path->locks[level]); - - if (wc->refs[level] == 1) { - if (level == 0) { - if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF) - ret = btrfs_dec_ref(trans, root, eb, 1, - wc->for_reloc); - else - ret = btrfs_dec_ref(trans, root, eb, 0, - wc->for_reloc); - BUG_ON(ret); /* -ENOMEM */ - } - /* make block locked assertion in clean_tree_block happy */ - if (!path->locks[level] && - btrfs_header_generation(eb) == trans->transid) { - btrfs_tree_lock(eb); - btrfs_set_lock_blocking(eb); - path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING; - } - clean_tree_block(trans, root, eb); - } - - if (eb == root->node) { - if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF) - parent = eb->start; - else - BUG_ON(root->root_key.objectid != - btrfs_header_owner(eb)); - } else { - if (wc->flags[level + 1] & BTRFS_BLOCK_FLAG_FULL_BACKREF) - parent = path->nodes[level + 1]->start; - else - BUG_ON(root->root_key.objectid != - btrfs_header_owner(path->nodes[level + 1])); - } - - btrfs_free_tree_block(trans, root, eb, parent, wc->refs[level] == 1, 0); -out: - wc->refs[level] = 0; - wc->flags[level] = 0; - return 0; -} - -static noinline int walk_down_tree(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct btrfs_path *path, - struct walk_control *wc) -{ - int level = wc->level; - int lookup_info = 1; - int ret; - - while (level >= 0) { - ret = walk_down_proc(trans, root, path, wc, lookup_info); - if (ret > 0) - break; - - if (level == 0) - break; - - if (path->slots[level] >= - btrfs_header_nritems(path->nodes[level])) - break; - - ret = do_walk_down(trans, root, path, wc, &lookup_info); - if (ret > 0) { - path->slots[level]++; - continue; - } else if (ret < 0) - return ret; - level = wc->level; - } - return 0; -} - -static noinline int walk_up_tree(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct btrfs_path *path, - struct walk_control *wc, int max_level) -{ - int level = wc->level; - int ret; - - path->slots[level] = btrfs_header_nritems(path->nodes[level]); - while (level < max_level && path->nodes[level]) { - wc->level = level; - if (path->slots[level] + 1 < - btrfs_header_nritems(path->nodes[level])) { - path->slots[level]++; - return 0; - } else { - ret = walk_up_proc(trans, root, path, wc); - if (ret > 0) - return 0; - - if 
(path->locks[level]) { - btrfs_tree_unlock_rw(path->nodes[level], - path->locks[level]); - path->locks[level] = 0; - } - free_extent_buffer(path->nodes[level]); - path->nodes[level] = NULL; - level++; - } - } - return 1; -} - -/* - * drop a subvolume tree. - * - * this function traverses the tree freeing any blocks that only - * referenced by the tree. - * - * when a shared tree block is found. this function decreases its - * reference count by one. if update_ref is true, this function - * also make sure backrefs for the shared block and all lower level - * blocks are properly updated. - */ -int btrfs_drop_snapshot(struct btrfs_root *root, - struct btrfs_block_rsv *block_rsv, int update_ref, - int for_reloc) -{ - struct btrfs_path *path; - struct btrfs_trans_handle *trans; - struct btrfs_root *tree_root = root->fs_info->tree_root; - struct btrfs_root_item *root_item = &root->root_item; - struct walk_control *wc; - struct btrfs_key key; - int err = 0; - int ret; - int level; - - path = btrfs_alloc_path(); - if (!path) { - err = -ENOMEM; - goto out; - } - - wc = kzalloc(sizeof(*wc), GFP_NOFS); - if (!wc) { - btrfs_free_path(path); - err = -ENOMEM; - goto out; - } - - trans = btrfs_start_transaction(tree_root, 0); - if (IS_ERR(trans)) { - err = PTR_ERR(trans); - goto out_free; - } - - if (block_rsv) - trans->block_rsv = block_rsv; - - if (btrfs_disk_key_objectid(&root_item->drop_progress) == 0) { - level = btrfs_header_level(root->node); - path->nodes[level] = btrfs_lock_root_node(root); - btrfs_set_lock_blocking(path->nodes[level]); - path->slots[level] = 0; - path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING; - memset(&wc->update_progress, 0, - sizeof(wc->update_progress)); - } else { - btrfs_disk_key_to_cpu(&key, &root_item->drop_progress); - memcpy(&wc->update_progress, &key, - sizeof(wc->update_progress)); - - level = root_item->drop_level; - BUG_ON(level == 0); - path->lowest_level = level; - ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); - path->lowest_level = 0; - if (ret < 0) { - err = ret; - goto out_end_trans; - } - WARN_ON(ret > 0); - - /* - * unlock our path, this is safe because only this - * function is allowed to delete this snapshot - */ - btrfs_unlock_up_safe(path, 0); - - level = btrfs_header_level(root->node); - while (1) { - btrfs_tree_lock(path->nodes[level]); - btrfs_set_lock_blocking(path->nodes[level]); - - ret = btrfs_lookup_extent_info(trans, root, - path->nodes[level]->start, - path->nodes[level]->len, - &wc->refs[level], - &wc->flags[level]); - if (ret < 0) { - err = ret; - goto out_end_trans; - } - BUG_ON(wc->refs[level] == 0); - - if (level == root_item->drop_level) - break; - - btrfs_tree_unlock(path->nodes[level]); - WARN_ON(wc->refs[level] != 1); - level--; - } - } - - wc->level = level; - wc->shared_level = -1; - wc->stage = DROP_REFERENCE; - wc->update_ref = update_ref; - wc->keep_locks = 0; - wc->for_reloc = for_reloc; - wc->reada_count = BTRFS_NODEPTRS_PER_BLOCK(root); - - while (1) { - ret = walk_down_tree(trans, root, path, wc); - if (ret < 0) { - err = ret; - break; - } - - ret = walk_up_tree(trans, root, path, wc, BTRFS_MAX_LEVEL); - if (ret < 0) { - err = ret; - break; - } - - if (ret > 0) { - BUG_ON(wc->stage != DROP_REFERENCE); - break; - } - - if (wc->stage == DROP_REFERENCE) { - level = wc->level; - btrfs_node_key(path->nodes[level], - &root_item->drop_progress, - path->slots[level]); - root_item->drop_level = level; - } - - BUG_ON(wc->level == 0); - if (btrfs_should_end_transaction(trans, tree_root)) { - ret = btrfs_update_root(trans, 
tree_root, - &root->root_key, - root_item); - if (ret) { - btrfs_abort_transaction(trans, tree_root, ret); - err = ret; - goto out_end_trans; - } - - btrfs_end_transaction_throttle(trans, tree_root); - trans = btrfs_start_transaction(tree_root, 0); - if (IS_ERR(trans)) { - err = PTR_ERR(trans); - goto out_free; - } - if (block_rsv) - trans->block_rsv = block_rsv; - } - } - btrfs_release_path(path); - if (err) - goto out_end_trans; - - ret = btrfs_del_root(trans, tree_root, &root->root_key); - if (ret) { - btrfs_abort_transaction(trans, tree_root, ret); - goto out_end_trans; - } - - if (root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID) { - ret = btrfs_find_last_root(tree_root, root->root_key.objectid, - NULL, NULL); - if (ret < 0) { - btrfs_abort_transaction(trans, tree_root, ret); - err = ret; - goto out_end_trans; - } else if (ret > 0) { - /* if we fail to delete the orphan item this time - * around, it'll get picked up the next time. - * - * The most common failure here is just -ENOENT. - */ - btrfs_del_orphan_item(trans, tree_root, - root->root_key.objectid); - } - } - - if (root->in_radix) { - btrfs_free_fs_root(tree_root->fs_info, root); - } else { - free_extent_buffer(root->node); - free_extent_buffer(root->commit_root); - kfree(root); - } -out_end_trans: - btrfs_end_transaction_throttle(trans, tree_root); -out_free: - kfree(wc); - btrfs_free_path(path); -out: - if (err) - btrfs_std_error(root->fs_info, err); - return err; -} - -/* - * drop subtree rooted at tree block 'node'. - * - * NOTE: this function will unlock and release tree block 'node' - * only used by relocation code - */ -int btrfs_drop_subtree(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct extent_buffer *node, - struct extent_buffer *parent) -{ - struct btrfs_path *path; - struct walk_control *wc; - int level; - int parent_level; - int ret = 0; - int wret; - - BUG_ON(root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID); - - path = btrfs_alloc_path(); - if (!path) - return -ENOMEM; - - wc = kzalloc(sizeof(*wc), GFP_NOFS); - if (!wc) { - btrfs_free_path(path); - return -ENOMEM; - } - - btrfs_assert_tree_locked(parent); - parent_level = btrfs_header_level(parent); - extent_buffer_get(parent); - path->nodes[parent_level] = parent; - path->slots[parent_level] = btrfs_header_nritems(parent); - - btrfs_assert_tree_locked(node); - level = btrfs_header_level(node); - path->nodes[level] = node; - path->slots[level] = 0; - path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING; - - wc->refs[parent_level] = 1; - wc->flags[parent_level] = BTRFS_BLOCK_FLAG_FULL_BACKREF; - wc->level = level; - wc->shared_level = -1; - wc->stage = DROP_REFERENCE; - wc->update_ref = 0; - wc->keep_locks = 1; - wc->for_reloc = 1; - wc->reada_count = BTRFS_NODEPTRS_PER_BLOCK(root); - - while (1) { - wret = walk_down_tree(trans, root, path, wc); - if (wret < 0) { - ret = wret; - break; - } - - wret = walk_up_tree(trans, root, path, wc, parent_level); - if (wret < 0) - ret = wret; - if (wret != 0) - break; - } - - kfree(wc); - btrfs_free_path(path); - return ret; -} - -static u64 update_block_group_flags(struct btrfs_root *root, u64 flags) -{ - u64 num_devices; - u64 stripped; - - /* - * if restripe for this chunk_type is on pick target profile and - * return, otherwise do the usual balance - */ - stripped = get_restripe_target(root->fs_info, flags); - if (stripped) - return extended_to_chunk(stripped); - - /* - * we add in the count of missing devices because we want - * to make sure that any RAID levels on a degraded FS - * continue to 
be honored. - */ - num_devices = root->fs_info->fs_devices->rw_devices + - root->fs_info->fs_devices->missing_devices; - - stripped = BTRFS_BLOCK_GROUP_RAID0 | - BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID10; - - if (num_devices == 1) { - stripped |= BTRFS_BLOCK_GROUP_DUP; - stripped = flags & ~stripped; - - /* turn raid0 into single device chunks */ - if (flags & BTRFS_BLOCK_GROUP_RAID0) - return stripped; - - /* turn mirroring into duplication */ - if (flags & (BTRFS_BLOCK_GROUP_RAID1 | - BTRFS_BLOCK_GROUP_RAID10)) - return stripped | BTRFS_BLOCK_GROUP_DUP; - } else { - /* they already had raid on here, just return */ - if (flags & stripped) - return flags; - - stripped |= BTRFS_BLOCK_GROUP_DUP; - stripped = flags & ~stripped; - - /* switch duplicated blocks with raid1 */ - if (flags & BTRFS_BLOCK_GROUP_DUP) - return stripped | BTRFS_BLOCK_GROUP_RAID1; - - /* this is drive concat, leave it alone */ - } - - return flags; -} - -static int set_block_group_ro(struct btrfs_block_group_cache *cache, int force) -{ - struct btrfs_space_info *sinfo = cache->space_info; - u64 num_bytes; - u64 min_allocable_bytes; - int ret = -ENOSPC; - - - /* - * We need some metadata space and system metadata space for - * allocating chunks in some corner cases until we force to set - * it to be readonly. - */ - if ((sinfo->flags & - (BTRFS_BLOCK_GROUP_SYSTEM | BTRFS_BLOCK_GROUP_METADATA)) && - !force) - min_allocable_bytes = 1 * 1024 * 1024; - else - min_allocable_bytes = 0; - - spin_lock(&sinfo->lock); - spin_lock(&cache->lock); - - if (cache->ro) { - ret = 0; - goto out; - } - - num_bytes = cache->key.offset - cache->reserved - cache->pinned - - cache->bytes_super - btrfs_block_group_used(&cache->item); - - if (sinfo->bytes_used + sinfo->bytes_reserved + sinfo->bytes_pinned + - sinfo->bytes_may_use + sinfo->bytes_readonly + num_bytes + - min_allocable_bytes <= sinfo->total_bytes) { - sinfo->bytes_readonly += num_bytes; - cache->ro = 1; - ret = 0; - } -out: - spin_unlock(&cache->lock); - spin_unlock(&sinfo->lock); - return ret; -} - -int btrfs_set_block_group_ro(struct btrfs_root *root, - struct btrfs_block_group_cache *cache) - -{ - struct btrfs_trans_handle *trans; - u64 alloc_flags; - int ret; - - BUG_ON(cache->ro); - - trans = btrfs_join_transaction(root); - if (IS_ERR(trans)) - return PTR_ERR(trans); - - alloc_flags = update_block_group_flags(root, cache->flags); - if (alloc_flags != cache->flags) { - ret = do_chunk_alloc(trans, root, 2 * 1024 * 1024, alloc_flags, - CHUNK_ALLOC_FORCE); - if (ret < 0) - goto out; - } - - ret = set_block_group_ro(cache, 0); - if (!ret) - goto out; - alloc_flags = get_alloc_profile(root, cache->space_info->flags); - ret = do_chunk_alloc(trans, root, 2 * 1024 * 1024, alloc_flags, - CHUNK_ALLOC_FORCE); - if (ret < 0) - goto out; - ret = set_block_group_ro(cache, 0); -out: - btrfs_end_transaction(trans, root); - return ret; -} - -int btrfs_force_chunk_alloc(struct btrfs_trans_handle *trans, - struct btrfs_root *root, u64 type) -{ - u64 alloc_flags = get_alloc_profile(root, type); - return do_chunk_alloc(trans, root, 2 * 1024 * 1024, alloc_flags, - CHUNK_ALLOC_FORCE); -} - -/* - * helper to account the unused space of all the readonly block group in the - * list. takes mirrors into account. 
- */ -static u64 __btrfs_get_ro_block_group_free_space(struct list_head *groups_list) -{ - struct btrfs_block_group_cache *block_group; - u64 free_bytes = 0; - int factor; - - list_for_each_entry(block_group, groups_list, list) { - spin_lock(&block_group->lock); - - if (!block_group->ro) { - spin_unlock(&block_group->lock); - continue; - } - - if (block_group->flags & (BTRFS_BLOCK_GROUP_RAID1 | - BTRFS_BLOCK_GROUP_RAID10 | - BTRFS_BLOCK_GROUP_DUP)) - factor = 2; - else - factor = 1; - - free_bytes += (block_group->key.offset - - btrfs_block_group_used(&block_group->item)) * - factor; - - spin_unlock(&block_group->lock); - } - - return free_bytes; -} - -/* - * helper to account the unused space of all the readonly block group in the - * space_info. takes mirrors into account. - */ -u64 btrfs_account_ro_block_groups_free_space(struct btrfs_space_info *sinfo) -{ - int i; - u64 free_bytes = 0; - - spin_lock(&sinfo->lock); - - for(i = 0; i < BTRFS_NR_RAID_TYPES; i++) - if (!list_empty(&sinfo->block_groups[i])) - free_bytes += __btrfs_get_ro_block_group_free_space( - &sinfo->block_groups[i]); - - spin_unlock(&sinfo->lock); - - return free_bytes; -} - -void btrfs_set_block_group_rw(struct btrfs_root *root, - struct btrfs_block_group_cache *cache) -{ - struct btrfs_space_info *sinfo = cache->space_info; - u64 num_bytes; - - BUG_ON(!cache->ro); - - spin_lock(&sinfo->lock); - spin_lock(&cache->lock); - num_bytes = cache->key.offset - cache->reserved - cache->pinned - - cache->bytes_super - btrfs_block_group_used(&cache->item); - sinfo->bytes_readonly -= num_bytes; - cache->ro = 0; - spin_unlock(&cache->lock); - spin_unlock(&sinfo->lock); -} - -/* - * checks to see if its even possible to relocate this block group. - * - * @return - -1 if it's not a good idea to relocate this block group, 0 if its - * ok to go ahead and try. - */ -int btrfs_can_relocate(struct btrfs_root *root, u64 bytenr) -{ - struct btrfs_block_group_cache *block_group; - struct btrfs_space_info *space_info; - struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices; - struct btrfs_device *device; - u64 min_free; - u64 dev_min = 1; - u64 dev_nr = 0; - u64 target; - int index; - int full = 0; - int ret = 0; - - block_group = btrfs_lookup_block_group(root->fs_info, bytenr); - - /* odd, couldn't find the block group, leave it alone */ - if (!block_group) - return -1; - - min_free = btrfs_block_group_used(&block_group->item); - - /* no bytes used, we're good */ - if (!min_free) - goto out; - - space_info = block_group->space_info; - spin_lock(&space_info->lock); - - full = space_info->full; - - /* - * if this is the last block group we have in this space, we can't - * relocate it unless we're able to allocate a new chunk below. - * - * Otherwise, we need to make sure we have room in the space to handle - * all of the extents from this block group. If we can, we're good - */ - if ((space_info->total_bytes != block_group->key.offset) && - (space_info->bytes_used + space_info->bytes_reserved + - space_info->bytes_pinned + space_info->bytes_readonly + - min_free < space_info->total_bytes)) { - spin_unlock(&space_info->lock); - goto out; - } - spin_unlock(&space_info->lock); - - /* - * ok we don't have enough space, but maybe we have free space on our - * devices to allocate new chunks for relocation, so loop through our - * alloc devices and guess if we have enough space. if this block - * group is going to be restriped, run checks against the target - * profile instead of the current one. 
- */ - ret = -1; - - /* - * index: - * 0: raid10 - * 1: raid1 - * 2: dup - * 3: raid0 - * 4: single - */ - target = get_restripe_target(root->fs_info, block_group->flags); - if (target) { - index = __get_block_group_index(extended_to_chunk(target)); - } else { - /* - * this is just a balance, so if we were marked as full - * we know there is no space for a new chunk - */ - if (full) - goto out; - - index = get_block_group_index(block_group); - } - - if (index == 0) { - dev_min = 4; - /* Divide by 2 */ - min_free >>= 1; - } else if (index == 1) { - dev_min = 2; - } else if (index == 2) { - /* Multiply by 2 */ - min_free <<= 1; - } else if (index == 3) { - dev_min = fs_devices->rw_devices; - do_div(min_free, dev_min); - } - - mutex_lock(&root->fs_info->chunk_mutex); - list_for_each_entry(device, &fs_devices->alloc_list, dev_alloc_list) { - u64 dev_offset; - - /* - * check to make sure we can actually find a chunk with enough - * space to fit our block group in. - */ - if (device->total_bytes > device->bytes_used + min_free) { - ret = find_free_dev_extent(device, min_free, - &dev_offset, NULL); - if (!ret) - dev_nr++; - - if (dev_nr >= dev_min) - break; - - ret = -1; - } - } - mutex_unlock(&root->fs_info->chunk_mutex); -out: - btrfs_put_block_group(block_group); - return ret; -} - -static int find_first_block_group(struct btrfs_root *root, - struct btrfs_path *path, struct btrfs_key *key) -{ - int ret = 0; - struct btrfs_key found_key; - struct extent_buffer *leaf; - int slot; - - ret = btrfs_search_slot(NULL, root, key, path, 0, 0); - if (ret < 0) - goto out; - - while (1) { - slot = path->slots[0]; - leaf = path->nodes[0]; - if (slot >= btrfs_header_nritems(leaf)) { - ret = btrfs_next_leaf(root, path); - if (ret == 0) - continue; - if (ret < 0) - goto out; - break; - } - btrfs_item_key_to_cpu(leaf, &found_key, slot); - - if (found_key.objectid >= key->objectid && - found_key.type == BTRFS_BLOCK_GROUP_ITEM_KEY) { - ret = 0; - goto out; - } - path->slots[0]++; - } -out: - return ret; -} - -void btrfs_put_block_group_cache(struct btrfs_fs_info *info) -{ - struct btrfs_block_group_cache *block_group; - u64 last = 0; - - while (1) { - struct inode *inode; - - block_group = btrfs_lookup_first_block_group(info, last); - while (block_group) { - spin_lock(&block_group->lock); - if (block_group->iref) - break; - spin_unlock(&block_group->lock); - block_group = next_block_group(info->tree_root, - block_group); - } - if (!block_group) { - if (last == 0) - break; - last = 0; - continue; - } - - inode = block_group->inode; - block_group->iref = 0; - block_group->inode = NULL; - spin_unlock(&block_group->lock); - iput(inode); - last = block_group->key.objectid + block_group->key.offset; - btrfs_put_block_group(block_group); - } -} - -int btrfs_free_block_groups(struct btrfs_fs_info *info) -{ - struct btrfs_block_group_cache *block_group; - struct btrfs_space_info *space_info; - struct btrfs_caching_control *caching_ctl; - struct rb_node *n; - - down_write(&info->extent_commit_sem); - while (!list_empty(&info->caching_block_groups)) { - caching_ctl = list_entry(info->caching_block_groups.next, - struct btrfs_caching_control, list); - list_del(&caching_ctl->list); - put_caching_control(caching_ctl); - } - up_write(&info->extent_commit_sem); - - spin_lock(&info->block_group_cache_lock); - while ((n = rb_last(&info->block_group_cache_tree)) != NULL) { - block_group = rb_entry(n, struct btrfs_block_group_cache, - cache_node); - rb_erase(&block_group->cache_node, - &info->block_group_cache_tree); - 
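/*
 * Editorial note, not part of the original source: the per-profile checks
 * in btrfs_can_relocate() above reduce, roughly, to the following
 * requirements, where min_free starts as the number of bytes used in the
 * block group:
 *
 *	raid10: a hole of min_free/2 on at least 4 devices
 *	raid1:  a hole of min_free   on at least 2 devices
 *	dup:    a hole of min_free*2 on at least 1 device
 *	raid0:  a hole of min_free/rw_devices on each rw device
 *	single: a hole of min_free   on at least 1 device
 *
 * find_free_dev_extent() is asked for a hole of that size on each device
 * on the alloc list until dev_min devices have been found; otherwise the
 * block group is reported as not relocatable.
 */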
spin_unlock(&info->block_group_cache_lock); - - down_write(&block_group->space_info->groups_sem); - list_del(&block_group->list); - up_write(&block_group->space_info->groups_sem); - - if (block_group->cached == BTRFS_CACHE_STARTED) - wait_block_group_cache_done(block_group); - - /* - * We haven't cached this block group, which means we could - * possibly have excluded extents on this block group. - */ - if (block_group->cached == BTRFS_CACHE_NO) - free_excluded_extents(info->extent_root, block_group); - - btrfs_remove_free_space_cache(block_group); - btrfs_put_block_group(block_group); - - spin_lock(&info->block_group_cache_lock); - } - spin_unlock(&info->block_group_cache_lock); - - /* now that all the block groups are freed, go through and - * free all the space_info structs. This is only called during - * the final stages of unmount, and so we know nobody is - * using them. We call synchronize_rcu() once before we start, - * just to be on the safe side. - */ - synchronize_rcu(); - - release_global_block_rsv(info); - - while(!list_empty(&info->space_info)) { - space_info = list_entry(info->space_info.next, - struct btrfs_space_info, - list); - if (space_info->bytes_pinned > 0 || - space_info->bytes_reserved > 0 || - space_info->bytes_may_use > 0) { - WARN_ON(1); - dump_space_info(space_info, 0, 0); - } - list_del(&space_info->list); - kfree(space_info); - } - return 0; -} - -static void __link_block_group(struct btrfs_space_info *space_info, - struct btrfs_block_group_cache *cache) -{ - int index = get_block_group_index(cache); - - down_write(&space_info->groups_sem); - list_add_tail(&cache->list, &space_info->block_groups[index]); - up_write(&space_info->groups_sem); -} - -int btrfs_read_block_groups(struct btrfs_root *root) -{ - struct btrfs_path *path; - int ret; - struct btrfs_block_group_cache *cache; - struct btrfs_fs_info *info = root->fs_info; - struct btrfs_space_info *space_info; - struct btrfs_key key; - struct btrfs_key found_key; - struct extent_buffer *leaf; - int need_clear = 0; - u64 cache_gen; - - root = info->extent_root; - key.objectid = 0; - key.offset = 0; - btrfs_set_key_type(&key, BTRFS_BLOCK_GROUP_ITEM_KEY); - path = btrfs_alloc_path(); - if (!path) - return -ENOMEM; - path->reada = 1; - - cache_gen = btrfs_super_cache_generation(root->fs_info->super_copy); - if (btrfs_test_opt(root, SPACE_CACHE) && - btrfs_super_generation(root->fs_info->super_copy) != cache_gen) - need_clear = 1; - if (btrfs_test_opt(root, CLEAR_CACHE)) - need_clear = 1; - - while (1) { - ret = find_first_block_group(root, path, &key); - if (ret > 0) - break; - if (ret != 0) - goto error; - leaf = path->nodes[0]; - btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); - cache = kzalloc(sizeof(*cache), GFP_NOFS); - if (!cache) { - ret = -ENOMEM; - goto error; - } - cache->free_space_ctl = kzalloc(sizeof(*cache->free_space_ctl), - GFP_NOFS); - if (!cache->free_space_ctl) { - kfree(cache); - ret = -ENOMEM; - goto error; - } - - atomic_set(&cache->count, 1); - spin_lock_init(&cache->lock); - cache->fs_info = info; - INIT_LIST_HEAD(&cache->list); - INIT_LIST_HEAD(&cache->cluster_list); - - if (need_clear) - cache->disk_cache_state = BTRFS_DC_CLEAR; - - read_extent_buffer(leaf, &cache->item, - btrfs_item_ptr_offset(leaf, path->slots[0]), - sizeof(cache->item)); - memcpy(&cache->key, &found_key, sizeof(found_key)); - - key.objectid = found_key.objectid + found_key.offset; - btrfs_release_path(path); - cache->flags = btrfs_block_group_flags(&cache->item); - cache->sectorsize = root->sectorsize; - - 
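/*
 * Editorial sketch, not part of the original source: the need_clear flag
 * computed above amounts to, roughly:
 *
 *	need_clear = btrfs_test_opt(root, CLEAR_CACHE) ||
 *		     (btrfs_test_opt(root, SPACE_CACHE) &&
 *		      btrfs_super_generation(super_copy) !=
 *		      btrfs_super_cache_generation(super_copy));
 *
 * i.e. the on-disk free-space cache is marked stale (BTRFS_DC_CLEAR)
 * either because the user asked for it to be rebuilt or because the cache
 * generation no longer matches the superblock generation.
 */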
btrfs_init_free_space_ctl(cache); - - /* - * We need to exclude the super stripes now so that the space - * info has super bytes accounted for, otherwise we'll think - * we have more space than we actually do. - */ - exclude_super_stripes(root, cache); - - /* - * check for two cases, either we are full, and therefore - * don't need to bother with the caching work since we won't - * find any space, or we are empty, and we can just add all - * the space in and be done with it. This saves us _alot_ of - * time, particularly in the full case. - */ - if (found_key.offset == btrfs_block_group_used(&cache->item)) { - cache->last_byte_to_unpin = (u64)-1; - cache->cached = BTRFS_CACHE_FINISHED; - free_excluded_extents(root, cache); - } else if (btrfs_block_group_used(&cache->item) == 0) { - cache->last_byte_to_unpin = (u64)-1; - cache->cached = BTRFS_CACHE_FINISHED; - add_new_free_space(cache, root->fs_info, - found_key.objectid, - found_key.objectid + - found_key.offset); - free_excluded_extents(root, cache); - } - - ret = update_space_info(info, cache->flags, found_key.offset, - btrfs_block_group_used(&cache->item), - &space_info); - BUG_ON(ret); /* -ENOMEM */ - cache->space_info = space_info; - spin_lock(&cache->space_info->lock); - cache->space_info->bytes_readonly += cache->bytes_super; - spin_unlock(&cache->space_info->lock); - - __link_block_group(space_info, cache); - - ret = btrfs_add_block_group_cache(root->fs_info, cache); - BUG_ON(ret); /* Logic error */ - - set_avail_alloc_bits(root->fs_info, cache->flags); - if (btrfs_chunk_readonly(root, cache->key.objectid)) - set_block_group_ro(cache, 1); - } - - list_for_each_entry_rcu(space_info, &root->fs_info->space_info, list) { - if (!(get_alloc_profile(root, space_info->flags) & - (BTRFS_BLOCK_GROUP_RAID10 | - BTRFS_BLOCK_GROUP_RAID1 | - BTRFS_BLOCK_GROUP_DUP))) - continue; - /* - * avoid allocating from un-mirrored block group if there are - * mirrored block groups. 
- */ - list_for_each_entry(cache, &space_info->block_groups[3], list) - set_block_group_ro(cache, 1); - list_for_each_entry(cache, &space_info->block_groups[4], list) - set_block_group_ro(cache, 1); - } - - init_global_block_rsv(info); - ret = 0; -error: - btrfs_free_path(path); - return ret; -} - -int btrfs_make_block_group(struct btrfs_trans_handle *trans, - struct btrfs_root *root, u64 bytes_used, - u64 type, u64 chunk_objectid, u64 chunk_offset, - u64 size) -{ - int ret; - struct btrfs_root *extent_root; - struct btrfs_block_group_cache *cache; - - extent_root = root->fs_info->extent_root; - - root->fs_info->last_trans_log_full_commit = trans->transid; - - cache = kzalloc(sizeof(*cache), GFP_NOFS); - if (!cache) - return -ENOMEM; - cache->free_space_ctl = kzalloc(sizeof(*cache->free_space_ctl), - GFP_NOFS); - if (!cache->free_space_ctl) { - kfree(cache); - return -ENOMEM; - } - - cache->key.objectid = chunk_offset; - cache->key.offset = size; - cache->key.type = BTRFS_BLOCK_GROUP_ITEM_KEY; - cache->sectorsize = root->sectorsize; - cache->fs_info = root->fs_info; - - atomic_set(&cache->count, 1); - spin_lock_init(&cache->lock); - INIT_LIST_HEAD(&cache->list); - INIT_LIST_HEAD(&cache->cluster_list); - - btrfs_init_free_space_ctl(cache); - - btrfs_set_block_group_used(&cache->item, bytes_used); - btrfs_set_block_group_chunk_objectid(&cache->item, chunk_objectid); - cache->flags = type; - btrfs_set_block_group_flags(&cache->item, type); - - cache->last_byte_to_unpin = (u64)-1; - cache->cached = BTRFS_CACHE_FINISHED; - exclude_super_stripes(root, cache); - - add_new_free_space(cache, root->fs_info, chunk_offset, - chunk_offset + size); - - free_excluded_extents(root, cache); - - ret = update_space_info(root->fs_info, cache->flags, size, bytes_used, - &cache->space_info); - BUG_ON(ret); /* -ENOMEM */ - update_global_block_rsv(root->fs_info); - - spin_lock(&cache->space_info->lock); - cache->space_info->bytes_readonly += cache->bytes_super; - spin_unlock(&cache->space_info->lock); - - __link_block_group(cache->space_info, cache); - - ret = btrfs_add_block_group_cache(root->fs_info, cache); - BUG_ON(ret); /* Logic error */ - - ret = btrfs_insert_item(trans, extent_root, &cache->key, &cache->item, - sizeof(cache->item)); - if (ret) { - btrfs_abort_transaction(trans, extent_root, ret); - return ret; - } - - set_avail_alloc_bits(extent_root->fs_info, type); - - return 0; -} - -static void clear_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags) -{ - u64 extra_flags = chunk_to_extended(flags) & - BTRFS_EXTENDED_PROFILE_MASK; - - if (flags & BTRFS_BLOCK_GROUP_DATA) - fs_info->avail_data_alloc_bits &= ~extra_flags; - if (flags & BTRFS_BLOCK_GROUP_METADATA) - fs_info->avail_metadata_alloc_bits &= ~extra_flags; - if (flags & BTRFS_BLOCK_GROUP_SYSTEM) - fs_info->avail_system_alloc_bits &= ~extra_flags; -} - -int btrfs_remove_block_group(struct btrfs_trans_handle *trans, - struct btrfs_root *root, u64 group_start) -{ - struct btrfs_path *path; - struct btrfs_block_group_cache *block_group; - struct btrfs_free_cluster *cluster; - struct btrfs_root *tree_root = root->fs_info->tree_root; - struct btrfs_key key; - struct inode *inode; - int ret; - int index; - int factor; - - root = root->fs_info->extent_root; - - block_group = btrfs_lookup_block_group(root->fs_info, group_start); - BUG_ON(!block_group); - BUG_ON(!block_group->ro); - - /* - * Free the reserved super bytes from this block group before - * remove it. 
- */ - free_excluded_extents(root, block_group); - - memcpy(&key, &block_group->key, sizeof(key)); - index = get_block_group_index(block_group); - if (block_group->flags & (BTRFS_BLOCK_GROUP_DUP | - BTRFS_BLOCK_GROUP_RAID1 | - BTRFS_BLOCK_GROUP_RAID10)) - factor = 2; - else - factor = 1; - - /* make sure this block group isn't part of an allocation cluster */ - cluster = &root->fs_info->data_alloc_cluster; - spin_lock(&cluster->refill_lock); - btrfs_return_cluster_to_free_space(block_group, cluster); - spin_unlock(&cluster->refill_lock); - - /* - * make sure this block group isn't part of a metadata - * allocation cluster - */ - cluster = &root->fs_info->meta_alloc_cluster; - spin_lock(&cluster->refill_lock); - btrfs_return_cluster_to_free_space(block_group, cluster); - spin_unlock(&cluster->refill_lock); - - path = btrfs_alloc_path(); - if (!path) { - ret = -ENOMEM; - goto out; - } - - inode = lookup_free_space_inode(tree_root, block_group, path); - if (!IS_ERR(inode)) { - ret = btrfs_orphan_add(trans, inode); - if (ret) { - btrfs_add_delayed_iput(inode); - goto out; - } - clear_nlink(inode); - /* One for the block groups ref */ - spin_lock(&block_group->lock); - if (block_group->iref) { - block_group->iref = 0; - block_group->inode = NULL; - spin_unlock(&block_group->lock); - iput(inode); - } else { - spin_unlock(&block_group->lock); - } - /* One for our lookup ref */ - btrfs_add_delayed_iput(inode); - } - - key.objectid = BTRFS_FREE_SPACE_OBJECTID; - key.offset = block_group->key.objectid; - key.type = 0; - - ret = btrfs_search_slot(trans, tree_root, &key, path, -1, 1); - if (ret < 0) - goto out; - if (ret > 0) - btrfs_release_path(path); - if (ret == 0) { - ret = btrfs_del_item(trans, tree_root, path); - if (ret) - goto out; - btrfs_release_path(path); - } - - spin_lock(&root->fs_info->block_group_cache_lock); - rb_erase(&block_group->cache_node, - &root->fs_info->block_group_cache_tree); - spin_unlock(&root->fs_info->block_group_cache_lock); - - down_write(&block_group->space_info->groups_sem); - /* - * we must use list_del_init so people can check to see if they - * are still on the list after taking the semaphore - */ - list_del_init(&block_group->list); - if (list_empty(&block_group->space_info->block_groups[index])) - clear_avail_alloc_bits(root->fs_info, block_group->flags); - up_write(&block_group->space_info->groups_sem); - - if (block_group->cached == BTRFS_CACHE_STARTED) - wait_block_group_cache_done(block_group); - - btrfs_remove_free_space_cache(block_group); - - spin_lock(&block_group->space_info->lock); - block_group->space_info->total_bytes -= block_group->key.offset; - block_group->space_info->bytes_readonly -= block_group->key.offset; - block_group->space_info->disk_total -= block_group->key.offset * factor; - spin_unlock(&block_group->space_info->lock); - - memcpy(&key, &block_group->key, sizeof(key)); - - btrfs_clear_space_info_full(root->fs_info); - - btrfs_put_block_group(block_group); - btrfs_put_block_group(block_group); - - ret = btrfs_search_slot(trans, root, &key, path, -1, 1); - if (ret > 0) - ret = -EIO; - if (ret < 0) - goto out; - - ret = btrfs_del_item(trans, root, path); -out: - btrfs_free_path(path); - return ret; -} - -int btrfs_init_space_info(struct btrfs_fs_info *fs_info) -{ - struct btrfs_space_info *space_info; - struct btrfs_super_block *disk_super; - u64 features; - u64 flags; - int mixed = 0; - int ret; - - disk_super = fs_info->super_copy; - if (!btrfs_super_root(disk_super)) - return 1; - - features = btrfs_super_incompat_flags(disk_super); - 
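/*
 * Editorial note, not part of the original source: the incompat flags read
 * in btrfs_init_space_info() below decide how many space_infos are
 * pre-created.  Roughly:
 *
 *	SYSTEM always gets its own space_info;
 *	with BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS set, one combined
 *	METADATA | DATA space_info is created;
 *	otherwise METADATA and DATA each get a separate space_info.
 */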
if (features & BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS) - mixed = 1; - - flags = BTRFS_BLOCK_GROUP_SYSTEM; - ret = update_space_info(fs_info, flags, 0, 0, &space_info); - if (ret) - goto out; - - if (mixed) { - flags = BTRFS_BLOCK_GROUP_METADATA | BTRFS_BLOCK_GROUP_DATA; - ret = update_space_info(fs_info, flags, 0, 0, &space_info); - } else { - flags = BTRFS_BLOCK_GROUP_METADATA; - ret = update_space_info(fs_info, flags, 0, 0, &space_info); - if (ret) - goto out; - - flags = BTRFS_BLOCK_GROUP_DATA; - ret = update_space_info(fs_info, flags, 0, 0, &space_info); - } -out: - return ret; -} - -int btrfs_error_unpin_extent_range(struct btrfs_root *root, u64 start, u64 end) -{ - return unpin_extent_range(root, start, end); -} - -int btrfs_error_discard_extent(struct btrfs_root *root, u64 bytenr, - u64 num_bytes, u64 *actual_bytes) -{ - return btrfs_discard_extent(root, bytenr, num_bytes, actual_bytes); -} - -int btrfs_trim_fs(struct btrfs_root *root, struct fstrim_range *range) -{ - struct btrfs_fs_info *fs_info = root->fs_info; - struct btrfs_block_group_cache *cache = NULL; - u64 group_trimmed; - u64 start; - u64 end; - u64 trimmed = 0; - u64 total_bytes = btrfs_super_total_bytes(fs_info->super_copy); - int ret = 0; - - /* - * try to trim all FS space, our block group may start from non-zero. - */ - if (range->len == total_bytes) - cache = btrfs_lookup_first_block_group(fs_info, range->start); - else - cache = btrfs_lookup_block_group(fs_info, range->start); - - while (cache) { - if (cache->key.objectid >= (range->start + range->len)) { - btrfs_put_block_group(cache); - break; - } - - start = max(range->start, cache->key.objectid); - end = min(range->start + range->len, - cache->key.objectid + cache->key.offset); - - if (end - start >= range->minlen) { - if (!block_group_cache_done(cache)) { - ret = cache_block_group(cache, NULL, root, 0); - if (!ret) - wait_block_group_cache_done(cache); - } - ret = btrfs_trim_block_group(cache, - &group_trimmed, - start, - end, - range->minlen); - - trimmed += group_trimmed; - if (ret) { - btrfs_put_block_group(cache); - break; - } - } - - cache = next_block_group(fs_info->tree_root, cache); - } - - range->len = trimmed; - return ret; -} diff --git a/ANDROID_3.4.5/fs/btrfs/extent_io.c b/ANDROID_3.4.5/fs/btrfs/extent_io.c deleted file mode 100644 index c9018a05..00000000 --- a/ANDROID_3.4.5/fs/btrfs/extent_io.c +++ /dev/null @@ -1,4891 +0,0 @@ -#include <linux/bitops.h> -#include <linux/slab.h> -#include <linux/bio.h> -#include <linux/mm.h> -#include <linux/pagemap.h> -#include <linux/page-flags.h> -#include <linux/module.h> -#include <linux/spinlock.h> -#include <linux/blkdev.h> -#include <linux/swap.h> -#include <linux/writeback.h> -#include <linux/pagevec.h> -#include <linux/prefetch.h> -#include <linux/cleancache.h> -#include "extent_io.h" -#include "extent_map.h" -#include "compat.h" -#include "ctree.h" -#include "btrfs_inode.h" -#include "volumes.h" -#include "check-integrity.h" -#include "locking.h" - -static struct kmem_cache *extent_state_cache; -static struct kmem_cache *extent_buffer_cache; - -static LIST_HEAD(buffers); -static LIST_HEAD(states); - -#define LEAK_DEBUG 0 -#if LEAK_DEBUG -static DEFINE_SPINLOCK(leak_lock); -#endif - -#define BUFFER_LRU_MAX 64 - -struct tree_entry { - u64 start; - u64 end; - struct rb_node rb_node; -}; - -struct extent_page_data { - struct bio *bio; - struct extent_io_tree *tree; - get_extent_t *get_extent; - - /* tells writepage not to lock the state bits for this range - * it still does the unlocking - */ - unsigned int 
extent_locked:1; - - /* tells the submit_bio code to use a WRITE_SYNC */ - unsigned int sync_io:1; -}; - -static noinline void flush_write_bio(void *data); -static inline struct btrfs_fs_info * -tree_fs_info(struct extent_io_tree *tree) -{ - return btrfs_sb(tree->mapping->host->i_sb); -} - -int __init extent_io_init(void) -{ - extent_state_cache = kmem_cache_create("extent_state", - sizeof(struct extent_state), 0, - SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL); - if (!extent_state_cache) - return -ENOMEM; - - extent_buffer_cache = kmem_cache_create("extent_buffers", - sizeof(struct extent_buffer), 0, - SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL); - if (!extent_buffer_cache) - goto free_state_cache; - return 0; - -free_state_cache: - kmem_cache_destroy(extent_state_cache); - return -ENOMEM; -} - -void extent_io_exit(void) -{ - struct extent_state *state; - struct extent_buffer *eb; - - while (!list_empty(&states)) { - state = list_entry(states.next, struct extent_state, leak_list); - printk(KERN_ERR "btrfs state leak: start %llu end %llu " - "state %lu in tree %p refs %d\n", - (unsigned long long)state->start, - (unsigned long long)state->end, - state->state, state->tree, atomic_read(&state->refs)); - list_del(&state->leak_list); - kmem_cache_free(extent_state_cache, state); - - } - - while (!list_empty(&buffers)) { - eb = list_entry(buffers.next, struct extent_buffer, leak_list); - printk(KERN_ERR "btrfs buffer leak start %llu len %lu " - "refs %d\n", (unsigned long long)eb->start, - eb->len, atomic_read(&eb->refs)); - list_del(&eb->leak_list); - kmem_cache_free(extent_buffer_cache, eb); - } - if (extent_state_cache) - kmem_cache_destroy(extent_state_cache); - if (extent_buffer_cache) - kmem_cache_destroy(extent_buffer_cache); -} - -void extent_io_tree_init(struct extent_io_tree *tree, - struct address_space *mapping) -{ - tree->state = RB_ROOT; - INIT_RADIX_TREE(&tree->buffer, GFP_ATOMIC); - tree->ops = NULL; - tree->dirty_bytes = 0; - spin_lock_init(&tree->lock); - spin_lock_init(&tree->buffer_lock); - tree->mapping = mapping; -} - -static struct extent_state *alloc_extent_state(gfp_t mask) -{ - struct extent_state *state; -#if LEAK_DEBUG - unsigned long flags; -#endif - - state = kmem_cache_alloc(extent_state_cache, mask); - if (!state) - return state; - state->state = 0; - state->private = 0; - state->tree = NULL; -#if LEAK_DEBUG - spin_lock_irqsave(&leak_lock, flags); - list_add(&state->leak_list, &states); - spin_unlock_irqrestore(&leak_lock, flags); -#endif - atomic_set(&state->refs, 1); - init_waitqueue_head(&state->wq); - trace_alloc_extent_state(state, mask, _RET_IP_); - return state; -} - -void free_extent_state(struct extent_state *state) -{ - if (!state) - return; - if (atomic_dec_and_test(&state->refs)) { -#if LEAK_DEBUG - unsigned long flags; -#endif - WARN_ON(state->tree); -#if LEAK_DEBUG - spin_lock_irqsave(&leak_lock, flags); - list_del(&state->leak_list); - spin_unlock_irqrestore(&leak_lock, flags); -#endif - trace_free_extent_state(state, _RET_IP_); - kmem_cache_free(extent_state_cache, state); - } -} - -static struct rb_node *tree_insert(struct rb_root *root, u64 offset, - struct rb_node *node) -{ - struct rb_node **p = &root->rb_node; - struct rb_node *parent = NULL; - struct tree_entry *entry; - - while (*p) { - parent = *p; - entry = rb_entry(parent, struct tree_entry, rb_node); - - if (offset < entry->start) - p = &(*p)->rb_left; - else if (offset > entry->end) - p = &(*p)->rb_right; - else - return parent; - } - - entry = rb_entry(node, struct tree_entry, 
rb_node); - rb_link_node(node, parent, p); - rb_insert_color(node, root); - return NULL; -} - -static struct rb_node *__etree_search(struct extent_io_tree *tree, u64 offset, - struct rb_node **prev_ret, - struct rb_node **next_ret) -{ - struct rb_root *root = &tree->state; - struct rb_node *n = root->rb_node; - struct rb_node *prev = NULL; - struct rb_node *orig_prev = NULL; - struct tree_entry *entry; - struct tree_entry *prev_entry = NULL; - - while (n) { - entry = rb_entry(n, struct tree_entry, rb_node); - prev = n; - prev_entry = entry; - - if (offset < entry->start) - n = n->rb_left; - else if (offset > entry->end) - n = n->rb_right; - else - return n; - } - - if (prev_ret) { - orig_prev = prev; - while (prev && offset > prev_entry->end) { - prev = rb_next(prev); - prev_entry = rb_entry(prev, struct tree_entry, rb_node); - } - *prev_ret = prev; - prev = orig_prev; - } - - if (next_ret) { - prev_entry = rb_entry(prev, struct tree_entry, rb_node); - while (prev && offset < prev_entry->start) { - prev = rb_prev(prev); - prev_entry = rb_entry(prev, struct tree_entry, rb_node); - } - *next_ret = prev; - } - return NULL; -} - -static inline struct rb_node *tree_search(struct extent_io_tree *tree, - u64 offset) -{ - struct rb_node *prev = NULL; - struct rb_node *ret; - - ret = __etree_search(tree, offset, &prev, NULL); - if (!ret) - return prev; - return ret; -} - -static void merge_cb(struct extent_io_tree *tree, struct extent_state *new, - struct extent_state *other) -{ - if (tree->ops && tree->ops->merge_extent_hook) - tree->ops->merge_extent_hook(tree->mapping->host, new, - other); -} - -/* - * utility function to look for merge candidates inside a given range. - * Any extents with matching state are merged together into a single - * extent in the tree. Extents with EXTENT_IO in their state field - * are not merged because the end_io handlers need to be able to do - * operations on them without sleeping (or doing allocations/splits). - * - * This should be called with the tree lock held. - */ -static void merge_state(struct extent_io_tree *tree, - struct extent_state *state) -{ - struct extent_state *other; - struct rb_node *other_node; - - if (state->state & (EXTENT_IOBITS | EXTENT_BOUNDARY)) - return; - - other_node = rb_prev(&state->rb_node); - if (other_node) { - other = rb_entry(other_node, struct extent_state, rb_node); - if (other->end == state->start - 1 && - other->state == state->state) { - merge_cb(tree, state, other); - state->start = other->start; - other->tree = NULL; - rb_erase(&other->rb_node, &tree->state); - free_extent_state(other); - } - } - other_node = rb_next(&state->rb_node); - if (other_node) { - other = rb_entry(other_node, struct extent_state, rb_node); - if (other->start == state->end + 1 && - other->state == state->state) { - merge_cb(tree, state, other); - state->end = other->end; - other->tree = NULL; - rb_erase(&other->rb_node, &tree->state); - free_extent_state(other); - } - } -} - -static void set_state_cb(struct extent_io_tree *tree, - struct extent_state *state, int *bits) -{ - if (tree->ops && tree->ops->set_bit_hook) - tree->ops->set_bit_hook(tree->mapping->host, state, bits); -} - -static void clear_state_cb(struct extent_io_tree *tree, - struct extent_state *state, int *bits) -{ - if (tree->ops && tree->ops->clear_bit_hook) - tree->ops->clear_bit_hook(tree->mapping->host, state, bits); -} - -static void set_state_bits(struct extent_io_tree *tree, - struct extent_state *state, int *bits); - -/* - * insert an extent_state struct into the tree. 
'bits' are set on the - * struct before it is inserted. - * - * This may return -EEXIST if the extent is already there, in which case the - * state struct is freed. - * - * The tree lock is not taken internally. This is a utility function and - * probably isn't what you want to call (see set/clear_extent_bit). - */ -static int insert_state(struct extent_io_tree *tree, - struct extent_state *state, u64 start, u64 end, - int *bits) -{ - struct rb_node *node; - - if (end < start) { - printk(KERN_ERR "btrfs end < start %llu %llu\n", - (unsigned long long)end, - (unsigned long long)start); - WARN_ON(1); - } - state->start = start; - state->end = end; - - set_state_bits(tree, state, bits); - - node = tree_insert(&tree->state, end, &state->rb_node); - if (node) { - struct extent_state *found; - found = rb_entry(node, struct extent_state, rb_node); - printk(KERN_ERR "btrfs found node %llu %llu on insert of " - "%llu %llu\n", (unsigned long long)found->start, - (unsigned long long)found->end, - (unsigned long long)start, (unsigned long long)end); - return -EEXIST; - } - state->tree = tree; - merge_state(tree, state); - return 0; -} - -static void split_cb(struct extent_io_tree *tree, struct extent_state *orig, - u64 split) -{ - if (tree->ops && tree->ops->split_extent_hook) - tree->ops->split_extent_hook(tree->mapping->host, orig, split); -} - -/* - * split a given extent state struct in two, inserting the preallocated - * struct 'prealloc' as the newly created second half. 'split' indicates an - * offset inside 'orig' where it should be split. - * - * Before calling, - * the tree has 'orig' at [orig->start, orig->end]. After calling, there - * are two extent state structs in the tree: - * prealloc: [orig->start, split - 1] - * orig: [ split, orig->end ] - * - * The tree locks are not taken by this function. They need to be held - * by the caller. - */ -static int split_state(struct extent_io_tree *tree, struct extent_state *orig, - struct extent_state *prealloc, u64 split) -{ - struct rb_node *node; - - split_cb(tree, orig, split); - - prealloc->start = orig->start; - prealloc->end = split - 1; - prealloc->state = orig->state; - orig->start = split; - - node = tree_insert(&tree->state, prealloc->end, &prealloc->rb_node); - if (node) { - free_extent_state(prealloc); - return -EEXIST; - } - prealloc->tree = tree; - return 0; -} - -static struct extent_state *next_state(struct extent_state *state) -{ - struct rb_node *next = rb_next(&state->rb_node); - if (next) - return rb_entry(next, struct extent_state, rb_node); - else - return NULL; -} - -/* - * utility function to clear some bits in an extent state struct. 
- * it will optionally wake up any one waiting on this state (wake == 1) - * - * If no bits are set on the state struct after clearing things, the - * struct is freed and removed from the tree - */ -static struct extent_state *clear_state_bit(struct extent_io_tree *tree, - struct extent_state *state, - int *bits, int wake) -{ - struct extent_state *next; - int bits_to_clear = *bits & ~EXTENT_CTLBITS; - - if ((bits_to_clear & EXTENT_DIRTY) && (state->state & EXTENT_DIRTY)) { - u64 range = state->end - state->start + 1; - WARN_ON(range > tree->dirty_bytes); - tree->dirty_bytes -= range; - } - clear_state_cb(tree, state, bits); - state->state &= ~bits_to_clear; - if (wake) - wake_up(&state->wq); - if (state->state == 0) { - next = next_state(state); - if (state->tree) { - rb_erase(&state->rb_node, &tree->state); - state->tree = NULL; - free_extent_state(state); - } else { - WARN_ON(1); - } - } else { - merge_state(tree, state); - next = next_state(state); - } - return next; -} - -static struct extent_state * -alloc_extent_state_atomic(struct extent_state *prealloc) -{ - if (!prealloc) - prealloc = alloc_extent_state(GFP_ATOMIC); - - return prealloc; -} - -void extent_io_tree_panic(struct extent_io_tree *tree, int err) -{ - btrfs_panic(tree_fs_info(tree), err, "Locking error: " - "Extent tree was modified by another " - "thread while locked."); -} - -/* - * clear some bits on a range in the tree. This may require splitting - * or inserting elements in the tree, so the gfp mask is used to - * indicate which allocations or sleeping are allowed. - * - * pass 'wake' == 1 to kick any sleepers, and 'delete' == 1 to remove - * the given range from the tree regardless of state (ie for truncate). - * - * the range [start, end] is inclusive. - * - * This takes the tree lock, and returns 0 on success and < 0 on error. - */ -int clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, - int bits, int wake, int delete, - struct extent_state **cached_state, - gfp_t mask) -{ - struct extent_state *state; - struct extent_state *cached; - struct extent_state *prealloc = NULL; - struct rb_node *node; - u64 last_end; - int err; - int clear = 0; - - if (delete) - bits |= ~EXTENT_CTLBITS; - bits |= EXTENT_FIRST_DELALLOC; - - if (bits & (EXTENT_IOBITS | EXTENT_BOUNDARY)) - clear = 1; -again: - if (!prealloc && (mask & __GFP_WAIT)) { - prealloc = alloc_extent_state(mask); - if (!prealloc) - return -ENOMEM; - } - - spin_lock(&tree->lock); - if (cached_state) { - cached = *cached_state; - - if (clear) { - *cached_state = NULL; - cached_state = NULL; - } - - if (cached && cached->tree && cached->start <= start && - cached->end > start) { - if (clear) - atomic_dec(&cached->refs); - state = cached; - goto hit_next; - } - if (clear) - free_extent_state(cached); - } - /* - * this search will find the extents that end after - * our range starts - */ - node = tree_search(tree, start); - if (!node) - goto out; - state = rb_entry(node, struct extent_state, rb_node); -hit_next: - if (state->start > end) - goto out; - WARN_ON(state->end < start); - last_end = state->end; - - /* the state doesn't have the wanted bits, go ahead */ - if (!(state->state & bits)) { - state = next_state(state); - goto next; - } - - /* - * | ---- desired range ---- | - * | state | or - * | ------------- state -------------- | - * - * We need to split the extent we found, and may flip - * bits on second half. - * - * If the extent we found extends past our range, we - * just split and search again. 
It'll get split again - * the next time though. - * - * If the extent we found is inside our range, we clear - * the desired bit on it. - */ - - if (state->start < start) { - prealloc = alloc_extent_state_atomic(prealloc); - BUG_ON(!prealloc); - err = split_state(tree, state, prealloc, start); - if (err) - extent_io_tree_panic(tree, err); - - prealloc = NULL; - if (err) - goto out; - if (state->end <= end) { - clear_state_bit(tree, state, &bits, wake); - if (last_end == (u64)-1) - goto out; - start = last_end + 1; - } - goto search_again; - } - /* - * | ---- desired range ---- | - * | state | - * We need to split the extent, and clear the bit - * on the first half - */ - if (state->start <= end && state->end > end) { - prealloc = alloc_extent_state_atomic(prealloc); - BUG_ON(!prealloc); - err = split_state(tree, state, prealloc, end + 1); - if (err) - extent_io_tree_panic(tree, err); - - if (wake) - wake_up(&state->wq); - - clear_state_bit(tree, prealloc, &bits, wake); - - prealloc = NULL; - goto out; - } - - state = clear_state_bit(tree, state, &bits, wake); -next: - if (last_end == (u64)-1) - goto out; - start = last_end + 1; - if (start <= end && state && !need_resched()) - goto hit_next; - goto search_again; - -out: - spin_unlock(&tree->lock); - if (prealloc) - free_extent_state(prealloc); - - return 0; - -search_again: - if (start > end) - goto out; - spin_unlock(&tree->lock); - if (mask & __GFP_WAIT) - cond_resched(); - goto again; -} - -static void wait_on_state(struct extent_io_tree *tree, - struct extent_state *state) - __releases(tree->lock) - __acquires(tree->lock) -{ - DEFINE_WAIT(wait); - prepare_to_wait(&state->wq, &wait, TASK_UNINTERRUPTIBLE); - spin_unlock(&tree->lock); - schedule(); - spin_lock(&tree->lock); - finish_wait(&state->wq, &wait); -} - -/* - * waits for one or more bits to clear on a range in the state tree. - * The range [start, end] is inclusive. 
- * The tree lock is taken by this function - */ -void wait_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, int bits) -{ - struct extent_state *state; - struct rb_node *node; - - spin_lock(&tree->lock); -again: - while (1) { - /* - * this search will find all the extents that end after - * our range starts - */ - node = tree_search(tree, start); - if (!node) - break; - - state = rb_entry(node, struct extent_state, rb_node); - - if (state->start > end) - goto out; - - if (state->state & bits) { - start = state->start; - atomic_inc(&state->refs); - wait_on_state(tree, state); - free_extent_state(state); - goto again; - } - start = state->end + 1; - - if (start > end) - break; - - cond_resched_lock(&tree->lock); - } -out: - spin_unlock(&tree->lock); -} - -static void set_state_bits(struct extent_io_tree *tree, - struct extent_state *state, - int *bits) -{ - int bits_to_set = *bits & ~EXTENT_CTLBITS; - - set_state_cb(tree, state, bits); - if ((bits_to_set & EXTENT_DIRTY) && !(state->state & EXTENT_DIRTY)) { - u64 range = state->end - state->start + 1; - tree->dirty_bytes += range; - } - state->state |= bits_to_set; -} - -static void cache_state(struct extent_state *state, - struct extent_state **cached_ptr) -{ - if (cached_ptr && !(*cached_ptr)) { - if (state->state & (EXTENT_IOBITS | EXTENT_BOUNDARY)) { - *cached_ptr = state; - atomic_inc(&state->refs); - } - } -} - -static void uncache_state(struct extent_state **cached_ptr) -{ - if (cached_ptr && (*cached_ptr)) { - struct extent_state *state = *cached_ptr; - *cached_ptr = NULL; - free_extent_state(state); - } -} - -/* - * set some bits on a range in the tree. This may require allocations or - * sleeping, so the gfp mask is used to indicate what is allowed. - * - * If any of the exclusive bits are set, this will fail with -EEXIST if some - * part of the range already has the desired bits set. The start of the - * existing range is returned in failed_start in this case. - * - * [start, end] is inclusive This takes the tree lock. - */ - -static int __must_check -__set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, - int bits, int exclusive_bits, u64 *failed_start, - struct extent_state **cached_state, gfp_t mask) -{ - struct extent_state *state; - struct extent_state *prealloc = NULL; - struct rb_node *node; - int err = 0; - u64 last_start; - u64 last_end; - - bits |= EXTENT_FIRST_DELALLOC; -again: - if (!prealloc && (mask & __GFP_WAIT)) { - prealloc = alloc_extent_state(mask); - BUG_ON(!prealloc); - } - - spin_lock(&tree->lock); - if (cached_state && *cached_state) { - state = *cached_state; - if (state->start <= start && state->end > start && - state->tree) { - node = &state->rb_node; - goto hit_next; - } - } - /* - * this search will find all the extents that end after - * our range starts. 
- */ - node = tree_search(tree, start); - if (!node) { - prealloc = alloc_extent_state_atomic(prealloc); - BUG_ON(!prealloc); - err = insert_state(tree, prealloc, start, end, &bits); - if (err) - extent_io_tree_panic(tree, err); - - prealloc = NULL; - goto out; - } - state = rb_entry(node, struct extent_state, rb_node); -hit_next: - last_start = state->start; - last_end = state->end; - - /* - * | ---- desired range ---- | - * | state | - * - * Just lock what we found and keep going - */ - if (state->start == start && state->end <= end) { - struct rb_node *next_node; - if (state->state & exclusive_bits) { - *failed_start = state->start; - err = -EEXIST; - goto out; - } - - set_state_bits(tree, state, &bits); - - cache_state(state, cached_state); - merge_state(tree, state); - if (last_end == (u64)-1) - goto out; - - start = last_end + 1; - next_node = rb_next(&state->rb_node); - if (next_node && start < end && prealloc && !need_resched()) { - state = rb_entry(next_node, struct extent_state, - rb_node); - if (state->start == start) - goto hit_next; - } - goto search_again; - } - - /* - * | ---- desired range ---- | - * | state | - * or - * | ------------- state -------------- | - * - * We need to split the extent we found, and may flip bits on - * second half. - * - * If the extent we found extends past our - * range, we just split and search again. It'll get split - * again the next time though. - * - * If the extent we found is inside our range, we set the - * desired bit on it. - */ - if (state->start < start) { - if (state->state & exclusive_bits) { - *failed_start = start; - err = -EEXIST; - goto out; - } - - prealloc = alloc_extent_state_atomic(prealloc); - BUG_ON(!prealloc); - err = split_state(tree, state, prealloc, start); - if (err) - extent_io_tree_panic(tree, err); - - prealloc = NULL; - if (err) - goto out; - if (state->end <= end) { - set_state_bits(tree, state, &bits); - cache_state(state, cached_state); - merge_state(tree, state); - if (last_end == (u64)-1) - goto out; - start = last_end + 1; - } - goto search_again; - } - /* - * | ---- desired range ---- | - * | state | or | state | - * - * There's a hole, we need to insert something in it and - * ignore the extent we found. - */ - if (state->start > start) { - u64 this_end; - if (end < last_start) - this_end = end; - else - this_end = last_start - 1; - - prealloc = alloc_extent_state_atomic(prealloc); - BUG_ON(!prealloc); - - /* - * Avoid to free 'prealloc' if it can be merged with - * the later extent. 
- */ - err = insert_state(tree, prealloc, start, this_end, - &bits); - if (err) - extent_io_tree_panic(tree, err); - - cache_state(prealloc, cached_state); - prealloc = NULL; - start = this_end + 1; - goto search_again; - } - /* - * | ---- desired range ---- | - * | state | - * We need to split the extent, and set the bit - * on the first half - */ - if (state->start <= end && state->end > end) { - if (state->state & exclusive_bits) { - *failed_start = start; - err = -EEXIST; - goto out; - } - - prealloc = alloc_extent_state_atomic(prealloc); - BUG_ON(!prealloc); - err = split_state(tree, state, prealloc, end + 1); - if (err) - extent_io_tree_panic(tree, err); - - set_state_bits(tree, prealloc, &bits); - cache_state(prealloc, cached_state); - merge_state(tree, prealloc); - prealloc = NULL; - goto out; - } - - goto search_again; - -out: - spin_unlock(&tree->lock); - if (prealloc) - free_extent_state(prealloc); - - return err; - -search_again: - if (start > end) - goto out; - spin_unlock(&tree->lock); - if (mask & __GFP_WAIT) - cond_resched(); - goto again; -} - -int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, int bits, - u64 *failed_start, struct extent_state **cached_state, - gfp_t mask) -{ - return __set_extent_bit(tree, start, end, bits, 0, failed_start, - cached_state, mask); -} - - -/** - * convert_extent - convert all bits in a given range from one bit to another - * @tree: the io tree to search - * @start: the start offset in bytes - * @end: the end offset in bytes (inclusive) - * @bits: the bits to set in this range - * @clear_bits: the bits to clear in this range - * @mask: the allocation mask - * - * This will go through and set bits for the given range. If any states exist - * already in this range they are set with the given bit and cleared of the - * clear_bits. This is only meant to be used by things that are mergeable, ie - * converting from say DELALLOC to DIRTY. This is not meant to be used with - * boundary bits like LOCK. - */ -int convert_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, - int bits, int clear_bits, gfp_t mask) -{ - struct extent_state *state; - struct extent_state *prealloc = NULL; - struct rb_node *node; - int err = 0; - u64 last_start; - u64 last_end; - -again: - if (!prealloc && (mask & __GFP_WAIT)) { - prealloc = alloc_extent_state(mask); - if (!prealloc) - return -ENOMEM; - } - - spin_lock(&tree->lock); - /* - * this search will find all the extents that end after - * our range starts. 
- */ - node = tree_search(tree, start); - if (!node) { - prealloc = alloc_extent_state_atomic(prealloc); - if (!prealloc) { - err = -ENOMEM; - goto out; - } - err = insert_state(tree, prealloc, start, end, &bits); - prealloc = NULL; - if (err) - extent_io_tree_panic(tree, err); - goto out; - } - state = rb_entry(node, struct extent_state, rb_node); -hit_next: - last_start = state->start; - last_end = state->end; - - /* - * | ---- desired range ---- | - * | state | - * - * Just lock what we found and keep going - */ - if (state->start == start && state->end <= end) { - struct rb_node *next_node; - - set_state_bits(tree, state, &bits); - clear_state_bit(tree, state, &clear_bits, 0); - if (last_end == (u64)-1) - goto out; - - start = last_end + 1; - next_node = rb_next(&state->rb_node); - if (next_node && start < end && prealloc && !need_resched()) { - state = rb_entry(next_node, struct extent_state, - rb_node); - if (state->start == start) - goto hit_next; - } - goto search_again; - } - - /* - * | ---- desired range ---- | - * | state | - * or - * | ------------- state -------------- | - * - * We need to split the extent we found, and may flip bits on - * second half. - * - * If the extent we found extends past our - * range, we just split and search again. It'll get split - * again the next time though. - * - * If the extent we found is inside our range, we set the - * desired bit on it. - */ - if (state->start < start) { - prealloc = alloc_extent_state_atomic(prealloc); - if (!prealloc) { - err = -ENOMEM; - goto out; - } - err = split_state(tree, state, prealloc, start); - if (err) - extent_io_tree_panic(tree, err); - prealloc = NULL; - if (err) - goto out; - if (state->end <= end) { - set_state_bits(tree, state, &bits); - clear_state_bit(tree, state, &clear_bits, 0); - if (last_end == (u64)-1) - goto out; - start = last_end + 1; - } - goto search_again; - } - /* - * | ---- desired range ---- | - * | state | or | state | - * - * There's a hole, we need to insert something in it and - * ignore the extent we found. - */ - if (state->start > start) { - u64 this_end; - if (end < last_start) - this_end = end; - else - this_end = last_start - 1; - - prealloc = alloc_extent_state_atomic(prealloc); - if (!prealloc) { - err = -ENOMEM; - goto out; - } - - /* - * Avoid to free 'prealloc' if it can be merged with - * the later extent. 
- */ - err = insert_state(tree, prealloc, start, this_end, - &bits); - if (err) - extent_io_tree_panic(tree, err); - prealloc = NULL; - start = this_end + 1; - goto search_again; - } - /* - * | ---- desired range ---- | - * | state | - * We need to split the extent, and set the bit - * on the first half - */ - if (state->start <= end && state->end > end) { - prealloc = alloc_extent_state_atomic(prealloc); - if (!prealloc) { - err = -ENOMEM; - goto out; - } - - err = split_state(tree, state, prealloc, end + 1); - if (err) - extent_io_tree_panic(tree, err); - - set_state_bits(tree, prealloc, &bits); - clear_state_bit(tree, prealloc, &clear_bits, 0); - prealloc = NULL; - goto out; - } - - goto search_again; - -out: - spin_unlock(&tree->lock); - if (prealloc) - free_extent_state(prealloc); - - return err; - -search_again: - if (start > end) - goto out; - spin_unlock(&tree->lock); - if (mask & __GFP_WAIT) - cond_resched(); - goto again; -} - -/* wrappers around set/clear extent bit */ -int set_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end, - gfp_t mask) -{ - return set_extent_bit(tree, start, end, EXTENT_DIRTY, NULL, - NULL, mask); -} - -int set_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, - int bits, gfp_t mask) -{ - return set_extent_bit(tree, start, end, bits, NULL, - NULL, mask); -} - -int clear_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, - int bits, gfp_t mask) -{ - return clear_extent_bit(tree, start, end, bits, 0, 0, NULL, mask); -} - -int set_extent_delalloc(struct extent_io_tree *tree, u64 start, u64 end, - struct extent_state **cached_state, gfp_t mask) -{ - return set_extent_bit(tree, start, end, - EXTENT_DELALLOC | EXTENT_UPTODATE, - NULL, cached_state, mask); -} - -int clear_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end, - gfp_t mask) -{ - return clear_extent_bit(tree, start, end, - EXTENT_DIRTY | EXTENT_DELALLOC | - EXTENT_DO_ACCOUNTING, 0, 0, NULL, mask); -} - -int set_extent_new(struct extent_io_tree *tree, u64 start, u64 end, - gfp_t mask) -{ - return set_extent_bit(tree, start, end, EXTENT_NEW, NULL, - NULL, mask); -} - -int set_extent_uptodate(struct extent_io_tree *tree, u64 start, u64 end, - struct extent_state **cached_state, gfp_t mask) -{ - return set_extent_bit(tree, start, end, EXTENT_UPTODATE, 0, - cached_state, mask); -} - -static int clear_extent_uptodate(struct extent_io_tree *tree, u64 start, - u64 end, struct extent_state **cached_state, - gfp_t mask) -{ - return clear_extent_bit(tree, start, end, EXTENT_UPTODATE, 0, 0, - cached_state, mask); -} - -/* - * either insert or lock state struct between start and end use mask to tell - * us if waiting is desired. 
- */ -int lock_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, - int bits, struct extent_state **cached_state) -{ - int err; - u64 failed_start; - while (1) { - err = __set_extent_bit(tree, start, end, EXTENT_LOCKED | bits, - EXTENT_LOCKED, &failed_start, - cached_state, GFP_NOFS); - if (err == -EEXIST) { - wait_extent_bit(tree, failed_start, end, EXTENT_LOCKED); - start = failed_start; - } else - break; - WARN_ON(start > end); - } - return err; -} - -int lock_extent(struct extent_io_tree *tree, u64 start, u64 end) -{ - return lock_extent_bits(tree, start, end, 0, NULL); -} - -int try_lock_extent(struct extent_io_tree *tree, u64 start, u64 end) -{ - int err; - u64 failed_start; - - err = __set_extent_bit(tree, start, end, EXTENT_LOCKED, EXTENT_LOCKED, - &failed_start, NULL, GFP_NOFS); - if (err == -EEXIST) { - if (failed_start > start) - clear_extent_bit(tree, start, failed_start - 1, - EXTENT_LOCKED, 1, 0, NULL, GFP_NOFS); - return 0; - } - return 1; -} - -int unlock_extent_cached(struct extent_io_tree *tree, u64 start, u64 end, - struct extent_state **cached, gfp_t mask) -{ - return clear_extent_bit(tree, start, end, EXTENT_LOCKED, 1, 0, cached, - mask); -} - -int unlock_extent(struct extent_io_tree *tree, u64 start, u64 end) -{ - return clear_extent_bit(tree, start, end, EXTENT_LOCKED, 1, 0, NULL, - GFP_NOFS); -} - -/* - * helper function to set both pages and extents in the tree writeback - */ -static int set_range_writeback(struct extent_io_tree *tree, u64 start, u64 end) -{ - unsigned long index = start >> PAGE_CACHE_SHIFT; - unsigned long end_index = end >> PAGE_CACHE_SHIFT; - struct page *page; - - while (index <= end_index) { - page = find_get_page(tree->mapping, index); - BUG_ON(!page); /* Pages should be in the extent_io_tree */ - set_page_writeback(page); - page_cache_release(page); - index++; - } - return 0; -} - -/* find the first state struct with 'bits' set after 'start', and - * return it. tree->lock must be held. NULL will returned if - * nothing was found after 'start' - */ -struct extent_state *find_first_extent_bit_state(struct extent_io_tree *tree, - u64 start, int bits) -{ - struct rb_node *node; - struct extent_state *state; - - /* - * this search will find all the extents that end after - * our range starts. - */ - node = tree_search(tree, start); - if (!node) - goto out; - - while (1) { - state = rb_entry(node, struct extent_state, rb_node); - if (state->end >= start && (state->state & bits)) - return state; - - node = rb_next(node); - if (!node) - break; - } -out: - return NULL; -} - -/* - * find the first offset in the io tree with 'bits' set. zero is - * returned if we find something, and *start_ret and *end_ret are - * set to reflect the state struct that was found. - * - * If nothing was found, 1 is returned, < 0 on error - */ -int find_first_extent_bit(struct extent_io_tree *tree, u64 start, - u64 *start_ret, u64 *end_ret, int bits) -{ - struct extent_state *state; - int ret = 1; - - spin_lock(&tree->lock); - state = find_first_extent_bit_state(tree, start, bits); - if (state) { - *start_ret = state->start; - *end_ret = state->end; - ret = 0; - } - spin_unlock(&tree->lock); - return ret; -} - -/* - * find a contiguous range of bytes in the file marked as delalloc, not - * more than 'max_bytes'. 
start and end are used to return the range, - * - * 1 is returned if we find something, 0 if nothing was in the tree - */ -static noinline u64 find_delalloc_range(struct extent_io_tree *tree, - u64 *start, u64 *end, u64 max_bytes, - struct extent_state **cached_state) -{ - struct rb_node *node; - struct extent_state *state; - u64 cur_start = *start; - u64 found = 0; - u64 total_bytes = 0; - - spin_lock(&tree->lock); - - /* - * this search will find all the extents that end after - * our range starts. - */ - node = tree_search(tree, cur_start); - if (!node) { - if (!found) - *end = (u64)-1; - goto out; - } - - while (1) { - state = rb_entry(node, struct extent_state, rb_node); - if (found && (state->start != cur_start || - (state->state & EXTENT_BOUNDARY))) { - goto out; - } - if (!(state->state & EXTENT_DELALLOC)) { - if (!found) - *end = state->end; - goto out; - } - if (!found) { - *start = state->start; - *cached_state = state; - atomic_inc(&state->refs); - } - found++; - *end = state->end; - cur_start = state->end + 1; - node = rb_next(node); - if (!node) - break; - total_bytes += state->end - state->start + 1; - if (total_bytes >= max_bytes) - break; - } -out: - spin_unlock(&tree->lock); - return found; -} - -static noinline void __unlock_for_delalloc(struct inode *inode, - struct page *locked_page, - u64 start, u64 end) -{ - int ret; - struct page *pages[16]; - unsigned long index = start >> PAGE_CACHE_SHIFT; - unsigned long end_index = end >> PAGE_CACHE_SHIFT; - unsigned long nr_pages = end_index - index + 1; - int i; - - if (index == locked_page->index && end_index == index) - return; - - while (nr_pages > 0) { - ret = find_get_pages_contig(inode->i_mapping, index, - min_t(unsigned long, nr_pages, - ARRAY_SIZE(pages)), pages); - for (i = 0; i < ret; i++) { - if (pages[i] != locked_page) - unlock_page(pages[i]); - page_cache_release(pages[i]); - } - nr_pages -= ret; - index += ret; - cond_resched(); - } -} - -static noinline int lock_delalloc_pages(struct inode *inode, - struct page *locked_page, - u64 delalloc_start, - u64 delalloc_end) -{ - unsigned long index = delalloc_start >> PAGE_CACHE_SHIFT; - unsigned long start_index = index; - unsigned long end_index = delalloc_end >> PAGE_CACHE_SHIFT; - unsigned long pages_locked = 0; - struct page *pages[16]; - unsigned long nrpages; - int ret; - int i; - - /* the caller is responsible for locking the start index */ - if (index == locked_page->index && index == end_index) - return 0; - - /* skip the page at the start index */ - nrpages = end_index - index + 1; - while (nrpages > 0) { - ret = find_get_pages_contig(inode->i_mapping, index, - min_t(unsigned long, - nrpages, ARRAY_SIZE(pages)), pages); - if (ret == 0) { - ret = -EAGAIN; - goto done; - } - /* now we have an array of pages, lock them all */ - for (i = 0; i < ret; i++) { - /* - * the caller is taking responsibility for - * locked_page - */ - if (pages[i] != locked_page) { - lock_page(pages[i]); - if (!PageDirty(pages[i]) || - pages[i]->mapping != inode->i_mapping) { - ret = -EAGAIN; - unlock_page(pages[i]); - page_cache_release(pages[i]); - goto done; - } - } - page_cache_release(pages[i]); - pages_locked++; - } - nrpages -= ret; - index += ret; - cond_resched(); - } - ret = 0; -done: - if (ret && pages_locked) { - __unlock_for_delalloc(inode, locked_page, - delalloc_start, - ((u64)(start_index + pages_locked - 1)) << - PAGE_CACHE_SHIFT); - } - return ret; -} - -/* - * find a contiguous range of bytes in the file marked as delalloc, not - * more than 'max_bytes'. 
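A hedged sketch (not from the original file) of the in/out convention used by find_delalloc_range() above: the caller seeds *start with where to begin looking, and both *start and *end come back describing the range that was actually found. All names below are hypothetical:

	static void hypothetical_delalloc_probe(struct extent_io_tree *tree,
						u64 search_from)
	{
		u64 dstart = search_from;
		u64 dend = 0;
		struct extent_state *cached = NULL;
		u64 nr_found;

		/* look for up to roughly 64K of delalloc starting at search_from */
		nr_found = find_delalloc_range(tree, &dstart, &dend,
					       64 * 1024, &cached);
		if (nr_found) {
			/* [dstart, dend] is the contiguous delalloc range found */
		}
		free_extent_state(cached);
	}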
start and end are used to return the range, - * - * 1 is returned if we find something, 0 if nothing was in the tree - */ -static noinline u64 find_lock_delalloc_range(struct inode *inode, - struct extent_io_tree *tree, - struct page *locked_page, - u64 *start, u64 *end, - u64 max_bytes) -{ - u64 delalloc_start; - u64 delalloc_end; - u64 found; - struct extent_state *cached_state = NULL; - int ret; - int loops = 0; - -again: - /* step one, find a bunch of delalloc bytes starting at start */ - delalloc_start = *start; - delalloc_end = 0; - found = find_delalloc_range(tree, &delalloc_start, &delalloc_end, - max_bytes, &cached_state); - if (!found || delalloc_end <= *start) { - *start = delalloc_start; - *end = delalloc_end; - free_extent_state(cached_state); - return found; - } - - /* - * start comes from the offset of locked_page. We have to lock - * pages in order, so we can't process delalloc bytes before - * locked_page - */ - if (delalloc_start < *start) - delalloc_start = *start; - - /* - * make sure to limit the number of pages we try to lock down - * if we're looping. - */ - if (delalloc_end + 1 - delalloc_start > max_bytes && loops) - delalloc_end = delalloc_start + PAGE_CACHE_SIZE - 1; - - /* step two, lock all the pages after the page that has start */ - ret = lock_delalloc_pages(inode, locked_page, - delalloc_start, delalloc_end); - if (ret == -EAGAIN) { - /* some of the pages are gone, lets avoid looping by - * shortening the size of the delalloc range we're searching - */ - free_extent_state(cached_state); - if (!loops) { - unsigned long offset = (*start) & (PAGE_CACHE_SIZE - 1); - max_bytes = PAGE_CACHE_SIZE - offset; - loops = 1; - goto again; - } else { - found = 0; - goto out_failed; - } - } - BUG_ON(ret); /* Only valid values are 0 and -EAGAIN */ - - /* step three, lock the state bits for the whole range */ - lock_extent_bits(tree, delalloc_start, delalloc_end, 0, &cached_state); - - /* then test to make sure it is all still delalloc */ - ret = test_range_bit(tree, delalloc_start, delalloc_end, - EXTENT_DELALLOC, 1, cached_state); - if (!ret) { - unlock_extent_cached(tree, delalloc_start, delalloc_end, - &cached_state, GFP_NOFS); - __unlock_for_delalloc(inode, locked_page, - delalloc_start, delalloc_end); - cond_resched(); - goto again; - } - free_extent_state(cached_state); - *start = delalloc_start; - *end = delalloc_end; -out_failed: - return found; -} - -int extent_clear_unlock_delalloc(struct inode *inode, - struct extent_io_tree *tree, - u64 start, u64 end, struct page *locked_page, - unsigned long op) -{ - int ret; - struct page *pages[16]; - unsigned long index = start >> PAGE_CACHE_SHIFT; - unsigned long end_index = end >> PAGE_CACHE_SHIFT; - unsigned long nr_pages = end_index - index + 1; - int i; - int clear_bits = 0; - - if (op & EXTENT_CLEAR_UNLOCK) - clear_bits |= EXTENT_LOCKED; - if (op & EXTENT_CLEAR_DIRTY) - clear_bits |= EXTENT_DIRTY; - - if (op & EXTENT_CLEAR_DELALLOC) - clear_bits |= EXTENT_DELALLOC; - - clear_extent_bit(tree, start, end, clear_bits, 1, 0, NULL, GFP_NOFS); - if (!(op & (EXTENT_CLEAR_UNLOCK_PAGE | EXTENT_CLEAR_DIRTY | - EXTENT_SET_WRITEBACK | EXTENT_END_WRITEBACK | - EXTENT_SET_PRIVATE2))) - return 0; - - while (nr_pages > 0) { - ret = find_get_pages_contig(inode->i_mapping, index, - min_t(unsigned long, - nr_pages, ARRAY_SIZE(pages)), pages); - for (i = 0; i < ret; i++) { - - if (op & EXTENT_SET_PRIVATE2) - SetPagePrivate2(pages[i]); - - if (pages[i] == locked_page) { - page_cache_release(pages[i]); - continue; - } - if (op & 
EXTENT_CLEAR_DIRTY) - clear_page_dirty_for_io(pages[i]); - if (op & EXTENT_SET_WRITEBACK) - set_page_writeback(pages[i]); - if (op & EXTENT_END_WRITEBACK) - end_page_writeback(pages[i]); - if (op & EXTENT_CLEAR_UNLOCK_PAGE) - unlock_page(pages[i]); - page_cache_release(pages[i]); - } - nr_pages -= ret; - index += ret; - cond_resched(); - } - return 0; -} - -/* - * count the number of bytes in the tree that have a given bit(s) - * set. This can be fairly slow, except for EXTENT_DIRTY which is - * cached. The total number found is returned. - */ -u64 count_range_bits(struct extent_io_tree *tree, - u64 *start, u64 search_end, u64 max_bytes, - unsigned long bits, int contig) -{ - struct rb_node *node; - struct extent_state *state; - u64 cur_start = *start; - u64 total_bytes = 0; - u64 last = 0; - int found = 0; - - if (search_end <= cur_start) { - WARN_ON(1); - return 0; - } - - spin_lock(&tree->lock); - if (cur_start == 0 && bits == EXTENT_DIRTY) { - total_bytes = tree->dirty_bytes; - goto out; - } - /* - * this search will find all the extents that end after - * our range starts. - */ - node = tree_search(tree, cur_start); - if (!node) - goto out; - - while (1) { - state = rb_entry(node, struct extent_state, rb_node); - if (state->start > search_end) - break; - if (contig && found && state->start > last + 1) - break; - if (state->end >= cur_start && (state->state & bits) == bits) { - total_bytes += min(search_end, state->end) + 1 - - max(cur_start, state->start); - if (total_bytes >= max_bytes) - break; - if (!found) { - *start = max(cur_start, state->start); - found = 1; - } - last = state->end; - } else if (contig && found) { - break; - } - node = rb_next(node); - if (!node) - break; - } -out: - spin_unlock(&tree->lock); - return total_bytes; -} - -/* - * set the private field for a given byte offset in the tree. If there isn't - * an extent_state there already, this does nothing. - */ -int set_state_private(struct extent_io_tree *tree, u64 start, u64 private) -{ - struct rb_node *node; - struct extent_state *state; - int ret = 0; - - spin_lock(&tree->lock); - /* - * this search will find all the extents that end after - * our range starts. - */ - node = tree_search(tree, start); - if (!node) { - ret = -ENOENT; - goto out; - } - state = rb_entry(node, struct extent_state, rb_node); - if (state->start != start) { - ret = -ENOENT; - goto out; - } - state->private = private; -out: - spin_unlock(&tree->lock); - return ret; -} - -int get_state_private(struct extent_io_tree *tree, u64 start, u64 *private) -{ - struct rb_node *node; - struct extent_state *state; - int ret = 0; - - spin_lock(&tree->lock); - /* - * this search will find all the extents that end after - * our range starts. - */ - node = tree_search(tree, start); - if (!node) { - ret = -ENOENT; - goto out; - } - state = rb_entry(node, struct extent_state, rb_node); - if (state->start != start) { - ret = -ENOENT; - goto out; - } - *private = state->private; -out: - spin_unlock(&tree->lock); - return ret; -} - -/* - * searches a range in the state tree for a given mask. - * If 'filled' == 1, this returns 1 only if every extent in the tree - * has the bits set. Otherwise, 1 is returned if any bit in the - * range is found set. 
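A small sketch (not part of the original file) of the two 'filled' modes, mirroring the page helpers defined a little further below; 'tree' and 'page' are hypothetical:

	static void hypothetical_page_checks(struct extent_io_tree *tree,
					     struct page *page)
	{
		u64 start = (u64)page->index << PAGE_CACHE_SHIFT;
		u64 end = start + PAGE_CACHE_SIZE - 1;

		/* filled == 1: every byte of [start, end] must carry the bit */
		if (test_range_bit(tree, start, end, EXTENT_UPTODATE, 1, NULL))
			SetPageUptodate(page);

		/* filled == 0: true if any byte in [start, end] carries the bit */
		if (!test_range_bit(tree, start, end, EXTENT_LOCKED, 0, NULL))
			unlock_page(page);
	}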
- */ -int test_range_bit(struct extent_io_tree *tree, u64 start, u64 end, - int bits, int filled, struct extent_state *cached) -{ - struct extent_state *state = NULL; - struct rb_node *node; - int bitset = 0; - - spin_lock(&tree->lock); - if (cached && cached->tree && cached->start <= start && - cached->end > start) - node = &cached->rb_node; - else - node = tree_search(tree, start); - while (node && start <= end) { - state = rb_entry(node, struct extent_state, rb_node); - - if (filled && state->start > start) { - bitset = 0; - break; - } - - if (state->start > end) - break; - - if (state->state & bits) { - bitset = 1; - if (!filled) - break; - } else if (filled) { - bitset = 0; - break; - } - - if (state->end == (u64)-1) - break; - - start = state->end + 1; - if (start > end) - break; - node = rb_next(node); - if (!node) { - if (filled) - bitset = 0; - break; - } - } - spin_unlock(&tree->lock); - return bitset; -} - -/* - * helper function to set a given page up to date if all the - * extents in the tree for that page are up to date - */ -static void check_page_uptodate(struct extent_io_tree *tree, struct page *page) -{ - u64 start = (u64)page->index << PAGE_CACHE_SHIFT; - u64 end = start + PAGE_CACHE_SIZE - 1; - if (test_range_bit(tree, start, end, EXTENT_UPTODATE, 1, NULL)) - SetPageUptodate(page); -} - -/* - * helper function to unlock a page if all the extents in the tree - * for that page are unlocked - */ -static void check_page_locked(struct extent_io_tree *tree, struct page *page) -{ - u64 start = (u64)page->index << PAGE_CACHE_SHIFT; - u64 end = start + PAGE_CACHE_SIZE - 1; - if (!test_range_bit(tree, start, end, EXTENT_LOCKED, 0, NULL)) - unlock_page(page); -} - -/* - * helper function to end page writeback if all the extents - * in the tree for that page are done with writeback - */ -static void check_page_writeback(struct extent_io_tree *tree, - struct page *page) -{ - end_page_writeback(page); -} - -/* - * When IO fails, either with EIO or csum verification fails, we - * try other mirrors that might have a good copy of the data. This - * io_failure_record is used to record state as we go through all the - * mirrors. If another mirror has good data, the page is set up to date - * and things continue. If a good mirror can't be found, the original - * bio end_io callback is called to indicate things have failed. - */ -struct io_failure_record { - struct page *page; - u64 start; - u64 len; - u64 logical; - unsigned long bio_flags; - int this_mirror; - int failed_mirror; - int in_validation; -}; - -static int free_io_failure(struct inode *inode, struct io_failure_record *rec, - int did_repair) -{ - int ret; - int err = 0; - struct extent_io_tree *failure_tree = &BTRFS_I(inode)->io_failure_tree; - - set_state_private(failure_tree, rec->start, 0); - ret = clear_extent_bits(failure_tree, rec->start, - rec->start + rec->len - 1, - EXTENT_LOCKED | EXTENT_DIRTY, GFP_NOFS); - if (ret) - err = ret; - - if (did_repair) { - ret = clear_extent_bits(&BTRFS_I(inode)->io_tree, rec->start, - rec->start + rec->len - 1, - EXTENT_DAMAGED, GFP_NOFS); - if (ret && !err) - err = ret; - } - - kfree(rec); - return err; -} - -static void repair_io_failure_callback(struct bio *bio, int err) -{ - complete(bio->bi_private); -} - -/* - * this bypasses the standard btrfs submit functions deliberately, as - * the standard behavior is to write all copies in a raid setup. here we only - * want to write the one bad copy. so we do the mapping for ourselves and issue - * submit_bio directly. 
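The failure bookkeeping above stashes its io_failure_record in the failure tree as a pointer cast to u64, keyed by file offset. An illustrative sketch of that round trip (not from the original file; error handling trimmed, names hypothetical):

	static void hypothetical_failrec_stash(struct inode *inode, u64 start)
	{
		struct extent_io_tree *failure_tree =
				&BTRFS_I(inode)->io_failure_tree;
		struct io_failure_record *rec;
		u64 private;

		rec = kzalloc(sizeof(*rec), GFP_NOFS);
		if (!rec)
			return;
		rec->start = start;
		rec->len = PAGE_CACHE_SIZE;

		/* make sure a state record exists at this offset, then stash */
		set_extent_bits(failure_tree, start, start + rec->len - 1,
				EXTENT_LOCKED | EXTENT_DIRTY, GFP_NOFS);
		set_state_private(failure_tree, rec->start,
				  (u64)(unsigned long)rec);

		/* later: recover the same pointer for that offset */
		if (!get_state_private(failure_tree, start, &private))
			rec = (struct io_failure_record *)(unsigned long)private;
	}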
- * to avoid any synchonization issues, wait for the data after writing, which - * actually prevents the read that triggered the error from finishing. - * currently, there can be no more than two copies of every data bit. thus, - * exactly one rewrite is required. - */ -int repair_io_failure(struct btrfs_mapping_tree *map_tree, u64 start, - u64 length, u64 logical, struct page *page, - int mirror_num) -{ - struct bio *bio; - struct btrfs_device *dev; - DECLARE_COMPLETION_ONSTACK(compl); - u64 map_length = 0; - u64 sector; - struct btrfs_bio *bbio = NULL; - int ret; - - BUG_ON(!mirror_num); - - bio = bio_alloc(GFP_NOFS, 1); - if (!bio) - return -EIO; - bio->bi_private = &compl; - bio->bi_end_io = repair_io_failure_callback; - bio->bi_size = 0; - map_length = length; - - ret = btrfs_map_block(map_tree, WRITE, logical, - &map_length, &bbio, mirror_num); - if (ret) { - bio_put(bio); - return -EIO; - } - BUG_ON(mirror_num != bbio->mirror_num); - sector = bbio->stripes[mirror_num-1].physical >> 9; - bio->bi_sector = sector; - dev = bbio->stripes[mirror_num-1].dev; - kfree(bbio); - if (!dev || !dev->bdev || !dev->writeable) { - bio_put(bio); - return -EIO; - } - bio->bi_bdev = dev->bdev; - bio_add_page(bio, page, length, start-page_offset(page)); - btrfsic_submit_bio(WRITE_SYNC, bio); - wait_for_completion(&compl); - - if (!test_bit(BIO_UPTODATE, &bio->bi_flags)) { - /* try to remap that extent elsewhere? */ - bio_put(bio); - return -EIO; - } - - printk(KERN_INFO "btrfs read error corrected: ino %lu off %llu (dev %s " - "sector %llu)\n", page->mapping->host->i_ino, start, - dev->name, sector); - - bio_put(bio); - return 0; -} - -int repair_eb_io_failure(struct btrfs_root *root, struct extent_buffer *eb, - int mirror_num) -{ - struct btrfs_mapping_tree *map_tree = &root->fs_info->mapping_tree; - u64 start = eb->start; - unsigned long i, num_pages = num_extent_pages(eb->start, eb->len); - int ret = 0; - - for (i = 0; i < num_pages; i++) { - struct page *p = extent_buffer_page(eb, i); - ret = repair_io_failure(map_tree, start, PAGE_CACHE_SIZE, - start, p, mirror_num); - if (ret) - break; - start += PAGE_CACHE_SIZE; - } - - return ret; -} - -/* - * each time an IO finishes, we do a fast check in the IO failure tree - * to see if we need to process or clean up an io_failure_record - */ -static int clean_io_failure(u64 start, struct page *page) -{ - u64 private; - u64 private_failure; - struct io_failure_record *failrec; - struct btrfs_mapping_tree *map_tree; - struct extent_state *state; - int num_copies; - int did_repair = 0; - int ret; - struct inode *inode = page->mapping->host; - - private = 0; - ret = count_range_bits(&BTRFS_I(inode)->io_failure_tree, &private, - (u64)-1, 1, EXTENT_DIRTY, 0); - if (!ret) - return 0; - - ret = get_state_private(&BTRFS_I(inode)->io_failure_tree, start, - &private_failure); - if (ret) - return 0; - - failrec = (struct io_failure_record *)(unsigned long) private_failure; - BUG_ON(!failrec->this_mirror); - - if (failrec->in_validation) { - /* there was no real error, just free the record */ - pr_debug("clean_io_failure: freeing dummy error at %llu\n", - failrec->start); - did_repair = 1; - goto out; - } - - spin_lock(&BTRFS_I(inode)->io_tree.lock); - state = find_first_extent_bit_state(&BTRFS_I(inode)->io_tree, - failrec->start, - EXTENT_LOCKED); - spin_unlock(&BTRFS_I(inode)->io_tree.lock); - - if (state && state->start == failrec->start) { - map_tree = &BTRFS_I(inode)->root->fs_info->mapping_tree; - num_copies = btrfs_num_copies(map_tree, failrec->logical, - 
failrec->len); - if (num_copies > 1) { - ret = repair_io_failure(map_tree, start, failrec->len, - failrec->logical, page, - failrec->failed_mirror); - did_repair = !ret; - } - } - -out: - if (!ret) - ret = free_io_failure(inode, failrec, did_repair); - - return ret; -} - -/* - * this is a generic handler for readpage errors (default - * readpage_io_failed_hook). if other copies exist, read those and write back - * good data to the failed position. does not investigate in remapping the - * failed extent elsewhere, hoping the device will be smart enough to do this as - * needed - */ - -static int bio_readpage_error(struct bio *failed_bio, struct page *page, - u64 start, u64 end, int failed_mirror, - struct extent_state *state) -{ - struct io_failure_record *failrec = NULL; - u64 private; - struct extent_map *em; - struct inode *inode = page->mapping->host; - struct extent_io_tree *failure_tree = &BTRFS_I(inode)->io_failure_tree; - struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree; - struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree; - struct bio *bio; - int num_copies; - int ret; - int read_mode; - u64 logical; - - BUG_ON(failed_bio->bi_rw & REQ_WRITE); - - ret = get_state_private(failure_tree, start, &private); - if (ret) { - failrec = kzalloc(sizeof(*failrec), GFP_NOFS); - if (!failrec) - return -ENOMEM; - failrec->start = start; - failrec->len = end - start + 1; - failrec->this_mirror = 0; - failrec->bio_flags = 0; - failrec->in_validation = 0; - - read_lock(&em_tree->lock); - em = lookup_extent_mapping(em_tree, start, failrec->len); - if (!em) { - read_unlock(&em_tree->lock); - kfree(failrec); - return -EIO; - } - - if (em->start > start || em->start + em->len < start) { - free_extent_map(em); - em = NULL; - } - read_unlock(&em_tree->lock); - - if (!em || IS_ERR(em)) { - kfree(failrec); - return -EIO; - } - logical = start - em->start; - logical = em->block_start + logical; - if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) { - logical = em->block_start; - failrec->bio_flags = EXTENT_BIO_COMPRESSED; - extent_set_compress_type(&failrec->bio_flags, - em->compress_type); - } - pr_debug("bio_readpage_error: (new) logical=%llu, start=%llu, " - "len=%llu\n", logical, start, failrec->len); - failrec->logical = logical; - free_extent_map(em); - - /* set the bits in the private failure tree */ - ret = set_extent_bits(failure_tree, start, end, - EXTENT_LOCKED | EXTENT_DIRTY, GFP_NOFS); - if (ret >= 0) - ret = set_state_private(failure_tree, start, - (u64)(unsigned long)failrec); - /* set the bits in the inode's tree */ - if (ret >= 0) - ret = set_extent_bits(tree, start, end, EXTENT_DAMAGED, - GFP_NOFS); - if (ret < 0) { - kfree(failrec); - return ret; - } - } else { - failrec = (struct io_failure_record *)(unsigned long)private; - pr_debug("bio_readpage_error: (found) logical=%llu, " - "start=%llu, len=%llu, validation=%d\n", - failrec->logical, failrec->start, failrec->len, - failrec->in_validation); - /* - * when data can be on disk more than twice, add to failrec here - * (e.g. with a list for failed_mirror) to make - * clean_io_failure() clean all those errors at once. - */ - } - num_copies = btrfs_num_copies( - &BTRFS_I(inode)->root->fs_info->mapping_tree, - failrec->logical, failrec->len); - if (num_copies == 1) { - /* - * we only have a single copy of the data, so don't bother with - * all the retry and error correction code that follows. no - * matter what the error is, it is very likely to persist. - */ - pr_debug("bio_readpage_error: cannot repair, num_copies == 1. 
" - "state=%p, num_copies=%d, next_mirror %d, " - "failed_mirror %d\n", state, num_copies, - failrec->this_mirror, failed_mirror); - free_io_failure(inode, failrec, 0); - return -EIO; - } - - if (!state) { - spin_lock(&tree->lock); - state = find_first_extent_bit_state(tree, failrec->start, - EXTENT_LOCKED); - if (state && state->start != failrec->start) - state = NULL; - spin_unlock(&tree->lock); - } - - /* - * there are two premises: - * a) deliver good data to the caller - * b) correct the bad sectors on disk - */ - if (failed_bio->bi_vcnt > 1) { - /* - * to fulfill b), we need to know the exact failing sectors, as - * we don't want to rewrite any more than the failed ones. thus, - * we need separate read requests for the failed bio - * - * if the following BUG_ON triggers, our validation request got - * merged. we need separate requests for our algorithm to work. - */ - BUG_ON(failrec->in_validation); - failrec->in_validation = 1; - failrec->this_mirror = failed_mirror; - read_mode = READ_SYNC | REQ_FAILFAST_DEV; - } else { - /* - * we're ready to fulfill a) and b) alongside. get a good copy - * of the failed sector and if we succeed, we have setup - * everything for repair_io_failure to do the rest for us. - */ - if (failrec->in_validation) { - BUG_ON(failrec->this_mirror != failed_mirror); - failrec->in_validation = 0; - failrec->this_mirror = 0; - } - failrec->failed_mirror = failed_mirror; - failrec->this_mirror++; - if (failrec->this_mirror == failed_mirror) - failrec->this_mirror++; - read_mode = READ_SYNC; - } - - if (!state || failrec->this_mirror > num_copies) { - pr_debug("bio_readpage_error: (fail) state=%p, num_copies=%d, " - "next_mirror %d, failed_mirror %d\n", state, - num_copies, failrec->this_mirror, failed_mirror); - free_io_failure(inode, failrec, 0); - return -EIO; - } - - bio = bio_alloc(GFP_NOFS, 1); - if (!bio) { - free_io_failure(inode, failrec, 0); - return -EIO; - } - bio->bi_private = state; - bio->bi_end_io = failed_bio->bi_end_io; - bio->bi_sector = failrec->logical >> 9; - bio->bi_bdev = BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev; - bio->bi_size = 0; - - bio_add_page(bio, page, failrec->len, start - page_offset(page)); - - pr_debug("bio_readpage_error: submitting new read[%#x] to " - "this_mirror=%d, num_copies=%d, in_validation=%d\n", read_mode, - failrec->this_mirror, num_copies, failrec->in_validation); - - ret = tree->ops->submit_bio_hook(inode, read_mode, bio, - failrec->this_mirror, - failrec->bio_flags, 0); - return ret; -} - -/* lots and lots of room for performance fixes in the end_bio funcs */ - -int end_extent_writepage(struct page *page, int err, u64 start, u64 end) -{ - int uptodate = (err == 0); - struct extent_io_tree *tree; - int ret; - - tree = &BTRFS_I(page->mapping->host)->io_tree; - - if (tree->ops && tree->ops->writepage_end_io_hook) { - ret = tree->ops->writepage_end_io_hook(page, start, - end, NULL, uptodate); - if (ret) - uptodate = 0; - } - - if (!uptodate && tree->ops && - tree->ops->writepage_io_failed_hook) { - ret = tree->ops->writepage_io_failed_hook(NULL, page, - start, end, NULL); - /* Writeback already completed */ - if (ret == 0) - return 1; - } - - if (!uptodate) { - clear_extent_uptodate(tree, start, end, NULL, GFP_NOFS); - ClearPageUptodate(page); - SetPageError(page); - } - return 0; -} - -/* - * after a writepage IO is done, we need to: - * clear the uptodate bits on error - * clear the writeback bits in the extent tree for this IO - * end_page_writeback if the page has no more pending IO - * - * 
Scheduling is not allowed, so the extent state tree is expected - * to have one and only one object corresponding to this IO. - */ -static void end_bio_extent_writepage(struct bio *bio, int err) -{ - struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1; - struct extent_io_tree *tree; - u64 start; - u64 end; - int whole_page; - - do { - struct page *page = bvec->bv_page; - tree = &BTRFS_I(page->mapping->host)->io_tree; - - start = ((u64)page->index << PAGE_CACHE_SHIFT) + - bvec->bv_offset; - end = start + bvec->bv_len - 1; - - if (bvec->bv_offset == 0 && bvec->bv_len == PAGE_CACHE_SIZE) - whole_page = 1; - else - whole_page = 0; - - if (--bvec >= bio->bi_io_vec) - prefetchw(&bvec->bv_page->flags); - - if (end_extent_writepage(page, err, start, end)) - continue; - - if (whole_page) - end_page_writeback(page); - else - check_page_writeback(tree, page); - } while (bvec >= bio->bi_io_vec); - - bio_put(bio); -} - -/* - * after a readpage IO is done, we need to: - * clear the uptodate bits on error - * set the uptodate bits if things worked - * set the page up to date if all extents in the tree are uptodate - * clear the lock bit in the extent tree - * unlock the page if there are no other extents locked for it - * - * Scheduling is not allowed, so the extent state tree is expected - * to have one and only one object corresponding to this IO. - */ -static void end_bio_extent_readpage(struct bio *bio, int err) -{ - int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); - struct bio_vec *bvec_end = bio->bi_io_vec + bio->bi_vcnt - 1; - struct bio_vec *bvec = bio->bi_io_vec; - struct extent_io_tree *tree; - u64 start; - u64 end; - int whole_page; - int mirror; - int ret; - - if (err) - uptodate = 0; - - do { - struct page *page = bvec->bv_page; - struct extent_state *cached = NULL; - struct extent_state *state; - - pr_debug("end_bio_extent_readpage: bi_vcnt=%d, idx=%d, err=%d, " - "mirror=%ld\n", bio->bi_vcnt, bio->bi_idx, err, - (long int)bio->bi_bdev); - tree = &BTRFS_I(page->mapping->host)->io_tree; - - start = ((u64)page->index << PAGE_CACHE_SHIFT) + - bvec->bv_offset; - end = start + bvec->bv_len - 1; - - if (bvec->bv_offset == 0 && bvec->bv_len == PAGE_CACHE_SIZE) - whole_page = 1; - else - whole_page = 0; - - if (++bvec <= bvec_end) - prefetchw(&bvec->bv_page->flags); - - spin_lock(&tree->lock); - state = find_first_extent_bit_state(tree, start, EXTENT_LOCKED); - if (state && state->start == start) { - /* - * take a reference on the state, unlock will drop - * the ref - */ - cache_state(state, &cached); - } - spin_unlock(&tree->lock); - - mirror = (int)(unsigned long)bio->bi_bdev; - if (uptodate && tree->ops && tree->ops->readpage_end_io_hook) { - ret = tree->ops->readpage_end_io_hook(page, start, end, - state, mirror); - if (ret) - uptodate = 0; - else - clean_io_failure(start, page); - } - - if (!uptodate && tree->ops && tree->ops->readpage_io_failed_hook) { - ret = tree->ops->readpage_io_failed_hook(page, mirror); - if (!ret && !err && - test_bit(BIO_UPTODATE, &bio->bi_flags)) - uptodate = 1; - } else if (!uptodate) { - /* - * The generic bio_readpage_error handles errors the - * following way: If possible, new read requests are - * created and submitted and will end up in - * end_bio_extent_readpage as well (if we're lucky, not - * in the !uptodate case). In that case it returns 0 and - * we just go on with the next page in our bio. If it - * can't handle the error it will return -EIO and we - * remain responsible for that page. 
- */ - ret = bio_readpage_error(bio, page, start, end, mirror, NULL); - if (ret == 0) { - uptodate = - test_bit(BIO_UPTODATE, &bio->bi_flags); - if (err) - uptodate = 0; - uncache_state(&cached); - continue; - } - } - - if (uptodate && tree->track_uptodate) { - set_extent_uptodate(tree, start, end, &cached, - GFP_ATOMIC); - } - unlock_extent_cached(tree, start, end, &cached, GFP_ATOMIC); - - if (whole_page) { - if (uptodate) { - SetPageUptodate(page); - } else { - ClearPageUptodate(page); - SetPageError(page); - } - unlock_page(page); - } else { - if (uptodate) { - check_page_uptodate(tree, page); - } else { - ClearPageUptodate(page); - SetPageError(page); - } - check_page_locked(tree, page); - } - } while (bvec <= bvec_end); - - bio_put(bio); -} - -struct bio * -btrfs_bio_alloc(struct block_device *bdev, u64 first_sector, int nr_vecs, - gfp_t gfp_flags) -{ - struct bio *bio; - - bio = bio_alloc(gfp_flags, nr_vecs); - - if (bio == NULL && (current->flags & PF_MEMALLOC)) { - while (!bio && (nr_vecs /= 2)) - bio = bio_alloc(gfp_flags, nr_vecs); - } - - if (bio) { - bio->bi_size = 0; - bio->bi_bdev = bdev; - bio->bi_sector = first_sector; - } - return bio; -} - -/* - * Since writes are async, they will only return -ENOMEM. - * Reads can return the full range of I/O error conditions. - */ -static int __must_check submit_one_bio(int rw, struct bio *bio, - int mirror_num, unsigned long bio_flags) -{ - int ret = 0; - struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1; - struct page *page = bvec->bv_page; - struct extent_io_tree *tree = bio->bi_private; - u64 start; - - start = ((u64)page->index << PAGE_CACHE_SHIFT) + bvec->bv_offset; - - bio->bi_private = NULL; - - bio_get(bio); - - if (tree->ops && tree->ops->submit_bio_hook) - ret = tree->ops->submit_bio_hook(page->mapping->host, rw, bio, - mirror_num, bio_flags, start); - else - btrfsic_submit_bio(rw, bio); - - if (bio_flagged(bio, BIO_EOPNOTSUPP)) - ret = -EOPNOTSUPP; - bio_put(bio); - return ret; -} - -static int merge_bio(struct extent_io_tree *tree, struct page *page, - unsigned long offset, size_t size, struct bio *bio, - unsigned long bio_flags) -{ - int ret = 0; - if (tree->ops && tree->ops->merge_bio_hook) - ret = tree->ops->merge_bio_hook(page, offset, size, bio, - bio_flags); - BUG_ON(ret < 0); - return ret; - -} - -static int submit_extent_page(int rw, struct extent_io_tree *tree, - struct page *page, sector_t sector, - size_t size, unsigned long offset, - struct block_device *bdev, - struct bio **bio_ret, - unsigned long max_pages, - bio_end_io_t end_io_func, - int mirror_num, - unsigned long prev_bio_flags, - unsigned long bio_flags) -{ - int ret = 0; - struct bio *bio; - int nr; - int contig = 0; - int this_compressed = bio_flags & EXTENT_BIO_COMPRESSED; - int old_compressed = prev_bio_flags & EXTENT_BIO_COMPRESSED; - size_t page_size = min_t(size_t, size, PAGE_CACHE_SIZE); - - if (bio_ret && *bio_ret) { - bio = *bio_ret; - if (old_compressed) - contig = bio->bi_sector == sector; - else - contig = bio->bi_sector + (bio->bi_size >> 9) == - sector; - - if (prev_bio_flags != bio_flags || !contig || - merge_bio(tree, page, offset, page_size, bio, bio_flags) || - bio_add_page(bio, page, page_size, offset) < page_size) { - ret = submit_one_bio(rw, bio, mirror_num, - prev_bio_flags); - if (ret < 0) - return ret; - bio = NULL; - } else { - return 0; - } - } - if (this_compressed) - nr = BIO_MAX_PAGES; - else - nr = bio_get_nr_vecs(bdev); - - bio = btrfs_bio_alloc(bdev, sector, nr, GFP_NOFS | __GFP_HIGH); - if (!bio) - return 
-ENOMEM; - - bio_add_page(bio, page, page_size, offset); - bio->bi_end_io = end_io_func; - bio->bi_private = tree; - - if (bio_ret) - *bio_ret = bio; - else - ret = submit_one_bio(rw, bio, mirror_num, bio_flags); - - return ret; -} - -void attach_extent_buffer_page(struct extent_buffer *eb, struct page *page) -{ - if (!PagePrivate(page)) { - SetPagePrivate(page); - page_cache_get(page); - set_page_private(page, (unsigned long)eb); - } else { - WARN_ON(page->private != (unsigned long)eb); - } -} - -void set_page_extent_mapped(struct page *page) -{ - if (!PagePrivate(page)) { - SetPagePrivate(page); - page_cache_get(page); - set_page_private(page, EXTENT_PAGE_PRIVATE); - } -} - -/* - * basic readpage implementation. Locked extent state structs are inserted - * into the tree that are removed when the IO is done (by the end_io - * handlers) - * XXX JDM: This needs looking at to ensure proper page locking - */ -static int __extent_read_full_page(struct extent_io_tree *tree, - struct page *page, - get_extent_t *get_extent, - struct bio **bio, int mirror_num, - unsigned long *bio_flags) -{ - struct inode *inode = page->mapping->host; - u64 start = (u64)page->index << PAGE_CACHE_SHIFT; - u64 page_end = start + PAGE_CACHE_SIZE - 1; - u64 end; - u64 cur = start; - u64 extent_offset; - u64 last_byte = i_size_read(inode); - u64 block_start; - u64 cur_end; - sector_t sector; - struct extent_map *em; - struct block_device *bdev; - struct btrfs_ordered_extent *ordered; - int ret; - int nr = 0; - size_t pg_offset = 0; - size_t iosize; - size_t disk_io_size; - size_t blocksize = inode->i_sb->s_blocksize; - unsigned long this_bio_flag = 0; - - set_page_extent_mapped(page); - - if (!PageUptodate(page)) { - if (cleancache_get_page(page) == 0) { - BUG_ON(blocksize != PAGE_SIZE); - goto out; - } - } - - end = page_end; - while (1) { - lock_extent(tree, start, end); - ordered = btrfs_lookup_ordered_extent(inode, start); - if (!ordered) - break; - unlock_extent(tree, start, end); - btrfs_start_ordered_extent(inode, ordered, 1); - btrfs_put_ordered_extent(ordered); - } - - if (page->index == last_byte >> PAGE_CACHE_SHIFT) { - char *userpage; - size_t zero_offset = last_byte & (PAGE_CACHE_SIZE - 1); - - if (zero_offset) { - iosize = PAGE_CACHE_SIZE - zero_offset; - userpage = kmap_atomic(page); - memset(userpage + zero_offset, 0, iosize); - flush_dcache_page(page); - kunmap_atomic(userpage); - } - } - while (cur <= end) { - if (cur >= last_byte) { - char *userpage; - struct extent_state *cached = NULL; - - iosize = PAGE_CACHE_SIZE - pg_offset; - userpage = kmap_atomic(page); - memset(userpage + pg_offset, 0, iosize); - flush_dcache_page(page); - kunmap_atomic(userpage); - set_extent_uptodate(tree, cur, cur + iosize - 1, - &cached, GFP_NOFS); - unlock_extent_cached(tree, cur, cur + iosize - 1, - &cached, GFP_NOFS); - break; - } - em = get_extent(inode, page, pg_offset, cur, - end - cur + 1, 0); - if (IS_ERR_OR_NULL(em)) { - SetPageError(page); - unlock_extent(tree, cur, end); - break; - } - extent_offset = cur - em->start; - BUG_ON(extent_map_end(em) <= cur); - BUG_ON(end < cur); - - if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) { - this_bio_flag = EXTENT_BIO_COMPRESSED; - extent_set_compress_type(&this_bio_flag, - em->compress_type); - } - - iosize = min(extent_map_end(em) - cur, end - cur + 1); - cur_end = min(extent_map_end(em) - 1, end); - iosize = (iosize + blocksize - 1) & ~((u64)blocksize - 1); - if (this_bio_flag & EXTENT_BIO_COMPRESSED) { - disk_io_size = em->block_len; - sector = em->block_start >> 9; 
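			/*
			 * block_start is a byte address on disk; >> 9 converts
			 * it to a 512-byte sector number for the bio.  A
			 * compressed extent is read from the start of its
			 * compressed data for its full on-disk length
			 * (block_len).
			 */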
- } else { - sector = (em->block_start + extent_offset) >> 9; - disk_io_size = iosize; - } - bdev = em->bdev; - block_start = em->block_start; - if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) - block_start = EXTENT_MAP_HOLE; - free_extent_map(em); - em = NULL; - - /* we've found a hole, just zero and go on */ - if (block_start == EXTENT_MAP_HOLE) { - char *userpage; - struct extent_state *cached = NULL; - - userpage = kmap_atomic(page); - memset(userpage + pg_offset, 0, iosize); - flush_dcache_page(page); - kunmap_atomic(userpage); - - set_extent_uptodate(tree, cur, cur + iosize - 1, - &cached, GFP_NOFS); - unlock_extent_cached(tree, cur, cur + iosize - 1, - &cached, GFP_NOFS); - cur = cur + iosize; - pg_offset += iosize; - continue; - } - /* the get_extent function already copied into the page */ - if (test_range_bit(tree, cur, cur_end, - EXTENT_UPTODATE, 1, NULL)) { - check_page_uptodate(tree, page); - unlock_extent(tree, cur, cur + iosize - 1); - cur = cur + iosize; - pg_offset += iosize; - continue; - } - /* we have an inline extent but it didn't get marked up - * to date. Error out - */ - if (block_start == EXTENT_MAP_INLINE) { - SetPageError(page); - unlock_extent(tree, cur, cur + iosize - 1); - cur = cur + iosize; - pg_offset += iosize; - continue; - } - - ret = 0; - if (tree->ops && tree->ops->readpage_io_hook) { - ret = tree->ops->readpage_io_hook(page, cur, - cur + iosize - 1); - } - if (!ret) { - unsigned long pnr = (last_byte >> PAGE_CACHE_SHIFT) + 1; - pnr -= page->index; - ret = submit_extent_page(READ, tree, page, - sector, disk_io_size, pg_offset, - bdev, bio, pnr, - end_bio_extent_readpage, mirror_num, - *bio_flags, - this_bio_flag); - BUG_ON(ret == -ENOMEM); - nr++; - *bio_flags = this_bio_flag; - } - if (ret) - SetPageError(page); - cur = cur + iosize; - pg_offset += iosize; - } -out: - if (!nr) { - if (!PageError(page)) - SetPageUptodate(page); - unlock_page(page); - } - return 0; -} - -int extent_read_full_page(struct extent_io_tree *tree, struct page *page, - get_extent_t *get_extent, int mirror_num) -{ - struct bio *bio = NULL; - unsigned long bio_flags = 0; - int ret; - - ret = __extent_read_full_page(tree, page, get_extent, &bio, mirror_num, - &bio_flags); - if (bio) - ret = submit_one_bio(READ, bio, mirror_num, bio_flags); - return ret; -} - -static noinline void update_nr_written(struct page *page, - struct writeback_control *wbc, - unsigned long nr_written) -{ - wbc->nr_to_write -= nr_written; - if (wbc->range_cyclic || (wbc->nr_to_write > 0 && - wbc->range_start == 0 && wbc->range_end == LLONG_MAX)) - page->mapping->writeback_index = page->index + nr_written; -} - -/* - * the writepage semantics are similar to regular writepage. extent - * records are inserted to lock ranges in the tree, and as dirty areas - * are found, they are marked writeback. 
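For orientation, a hypothetical sketch (not part of the original file) of how an address_space ->writepage could be wired up to this machinery via the extent_write_full_page() helper defined later in this file; hypothetical_get_extent stands for whatever get_extent_t callback a filesystem supplies:

	static int hypothetical_writepage(struct page *page,
					  struct writeback_control *wbc)
	{
		struct extent_io_tree *tree =
				&BTRFS_I(page->mapping->host)->io_tree;

		return extent_write_full_page(tree, page,
					      hypothetical_get_extent, wbc);
	}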
Then the lock bits are removed - * and the end_io handler clears the writeback ranges - */ -static int __extent_writepage(struct page *page, struct writeback_control *wbc, - void *data) -{ - struct inode *inode = page->mapping->host; - struct extent_page_data *epd = data; - struct extent_io_tree *tree = epd->tree; - u64 start = (u64)page->index << PAGE_CACHE_SHIFT; - u64 delalloc_start; - u64 page_end = start + PAGE_CACHE_SIZE - 1; - u64 end; - u64 cur = start; - u64 extent_offset; - u64 last_byte = i_size_read(inode); - u64 block_start; - u64 iosize; - sector_t sector; - struct extent_state *cached_state = NULL; - struct extent_map *em; - struct block_device *bdev; - int ret; - int nr = 0; - size_t pg_offset = 0; - size_t blocksize; - loff_t i_size = i_size_read(inode); - unsigned long end_index = i_size >> PAGE_CACHE_SHIFT; - u64 nr_delalloc; - u64 delalloc_end; - int page_started; - int compressed; - int write_flags; - unsigned long nr_written = 0; - bool fill_delalloc = true; - - if (wbc->sync_mode == WB_SYNC_ALL) - write_flags = WRITE_SYNC; - else - write_flags = WRITE; - - trace___extent_writepage(page, inode, wbc); - - WARN_ON(!PageLocked(page)); - - ClearPageError(page); - - pg_offset = i_size & (PAGE_CACHE_SIZE - 1); - if (page->index > end_index || - (page->index == end_index && !pg_offset)) { - page->mapping->a_ops->invalidatepage(page, 0); - unlock_page(page); - return 0; - } - - if (page->index == end_index) { - char *userpage; - - userpage = kmap_atomic(page); - memset(userpage + pg_offset, 0, - PAGE_CACHE_SIZE - pg_offset); - kunmap_atomic(userpage); - flush_dcache_page(page); - } - pg_offset = 0; - - set_page_extent_mapped(page); - - if (!tree->ops || !tree->ops->fill_delalloc) - fill_delalloc = false; - - delalloc_start = start; - delalloc_end = 0; - page_started = 0; - if (!epd->extent_locked && fill_delalloc) { - u64 delalloc_to_write = 0; - /* - * make sure the wbc mapping index is at least updated - * to this page. - */ - update_nr_written(page, wbc, 0); - - while (delalloc_end < page_end) { - nr_delalloc = find_lock_delalloc_range(inode, tree, - page, - &delalloc_start, - &delalloc_end, - 128 * 1024 * 1024); - if (nr_delalloc == 0) { - delalloc_start = delalloc_end + 1; - continue; - } - ret = tree->ops->fill_delalloc(inode, page, - delalloc_start, - delalloc_end, - &page_started, - &nr_written); - /* File system has been set read-only */ - if (ret) { - SetPageError(page); - goto done; - } - /* - * delalloc_end is already one less than the total - * length, so we don't subtract one from - * PAGE_CACHE_SIZE - */ - delalloc_to_write += (delalloc_end - delalloc_start + - PAGE_CACHE_SIZE) >> - PAGE_CACHE_SHIFT; - delalloc_start = delalloc_end + 1; - } - if (wbc->nr_to_write < delalloc_to_write) { - int thresh = 8192; - - if (delalloc_to_write < thresh * 2) - thresh = delalloc_to_write; - wbc->nr_to_write = min_t(u64, delalloc_to_write, - thresh); - } - - /* did the fill delalloc function already unlock and start - * the IO? - */ - if (page_started) { - ret = 0; - /* - * we've unlocked the page, so we can't update - * the mapping's writeback index, just update - * nr_to_write. 
- */ - wbc->nr_to_write -= nr_written; - goto done_unlocked; - } - } - if (tree->ops && tree->ops->writepage_start_hook) { - ret = tree->ops->writepage_start_hook(page, start, - page_end); - if (ret) { - /* Fixup worker will requeue */ - if (ret == -EBUSY) - wbc->pages_skipped++; - else - redirty_page_for_writepage(wbc, page); - update_nr_written(page, wbc, nr_written); - unlock_page(page); - ret = 0; - goto done_unlocked; - } - } - - /* - * we don't want to touch the inode after unlocking the page, - * so we update the mapping writeback index now - */ - update_nr_written(page, wbc, nr_written + 1); - - end = page_end; - if (last_byte <= start) { - if (tree->ops && tree->ops->writepage_end_io_hook) - tree->ops->writepage_end_io_hook(page, start, - page_end, NULL, 1); - goto done; - } - - blocksize = inode->i_sb->s_blocksize; - - while (cur <= end) { - if (cur >= last_byte) { - if (tree->ops && tree->ops->writepage_end_io_hook) - tree->ops->writepage_end_io_hook(page, cur, - page_end, NULL, 1); - break; - } - em = epd->get_extent(inode, page, pg_offset, cur, - end - cur + 1, 1); - if (IS_ERR_OR_NULL(em)) { - SetPageError(page); - break; - } - - extent_offset = cur - em->start; - BUG_ON(extent_map_end(em) <= cur); - BUG_ON(end < cur); - iosize = min(extent_map_end(em) - cur, end - cur + 1); - iosize = (iosize + blocksize - 1) & ~((u64)blocksize - 1); - sector = (em->block_start + extent_offset) >> 9; - bdev = em->bdev; - block_start = em->block_start; - compressed = test_bit(EXTENT_FLAG_COMPRESSED, &em->flags); - free_extent_map(em); - em = NULL; - - /* - * compressed and inline extents are written through other - * paths in the FS - */ - if (compressed || block_start == EXTENT_MAP_HOLE || - block_start == EXTENT_MAP_INLINE) { - /* - * end_io notification does not happen here for - * compressed extents - */ - if (!compressed && tree->ops && - tree->ops->writepage_end_io_hook) - tree->ops->writepage_end_io_hook(page, cur, - cur + iosize - 1, - NULL, 1); - else if (compressed) { - /* we don't want to end_page_writeback on - * a compressed extent. 
this happens - * elsewhere - */ - nr++; - } - - cur += iosize; - pg_offset += iosize; - continue; - } - /* leave this out until we have a page_mkwrite call */ - if (0 && !test_range_bit(tree, cur, cur + iosize - 1, - EXTENT_DIRTY, 0, NULL)) { - cur = cur + iosize; - pg_offset += iosize; - continue; - } - - if (tree->ops && tree->ops->writepage_io_hook) { - ret = tree->ops->writepage_io_hook(page, cur, - cur + iosize - 1); - } else { - ret = 0; - } - if (ret) { - SetPageError(page); - } else { - unsigned long max_nr = end_index + 1; - - set_range_writeback(tree, cur, cur + iosize - 1); - if (!PageWriteback(page)) { - printk(KERN_ERR "btrfs warning page %lu not " - "writeback, cur %llu end %llu\n", - page->index, (unsigned long long)cur, - (unsigned long long)end); - } - - ret = submit_extent_page(write_flags, tree, page, - sector, iosize, pg_offset, - bdev, &epd->bio, max_nr, - end_bio_extent_writepage, - 0, 0, 0); - if (ret) - SetPageError(page); - } - cur = cur + iosize; - pg_offset += iosize; - nr++; - } -done: - if (nr == 0) { - /* make sure the mapping tag for page dirty gets cleared */ - set_page_writeback(page); - end_page_writeback(page); - } - unlock_page(page); - -done_unlocked: - - /* drop our reference on any cached states */ - free_extent_state(cached_state); - return 0; -} - -static int eb_wait(void *word) -{ - io_schedule(); - return 0; -} - -static void wait_on_extent_buffer_writeback(struct extent_buffer *eb) -{ - wait_on_bit(&eb->bflags, EXTENT_BUFFER_WRITEBACK, eb_wait, - TASK_UNINTERRUPTIBLE); -} - -static int lock_extent_buffer_for_io(struct extent_buffer *eb, - struct btrfs_fs_info *fs_info, - struct extent_page_data *epd) -{ - unsigned long i, num_pages; - int flush = 0; - int ret = 0; - - if (!btrfs_try_tree_write_lock(eb)) { - flush = 1; - flush_write_bio(epd); - btrfs_tree_lock(eb); - } - - if (test_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags)) { - btrfs_tree_unlock(eb); - if (!epd->sync_io) - return 0; - if (!flush) { - flush_write_bio(epd); - flush = 1; - } - while (1) { - wait_on_extent_buffer_writeback(eb); - btrfs_tree_lock(eb); - if (!test_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags)) - break; - btrfs_tree_unlock(eb); - } - } - - if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &eb->bflags)) { - set_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags); - btrfs_set_header_flag(eb, BTRFS_HEADER_FLAG_WRITTEN); - spin_lock(&fs_info->delalloc_lock); - if (fs_info->dirty_metadata_bytes >= eb->len) - fs_info->dirty_metadata_bytes -= eb->len; - else - WARN_ON(1); - spin_unlock(&fs_info->delalloc_lock); - ret = 1; - } - - btrfs_tree_unlock(eb); - - if (!ret) - return ret; - - num_pages = num_extent_pages(eb->start, eb->len); - for (i = 0; i < num_pages; i++) { - struct page *p = extent_buffer_page(eb, i); - - if (!trylock_page(p)) { - if (!flush) { - flush_write_bio(epd); - flush = 1; - } - lock_page(p); - } - } - - return ret; -} - -static void end_extent_buffer_writeback(struct extent_buffer *eb) -{ - clear_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags); - smp_mb__after_clear_bit(); - wake_up_bit(&eb->bflags, EXTENT_BUFFER_WRITEBACK); -} - -static void end_bio_extent_buffer_writepage(struct bio *bio, int err) -{ - int uptodate = err == 0; - struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1; - struct extent_buffer *eb; - int done; - - do { - struct page *page = bvec->bv_page; - - bvec--; - eb = (struct extent_buffer *)page->private; - BUG_ON(!eb); - done = atomic_dec_and_test(&eb->io_pages); - - if (!uptodate || test_bit(EXTENT_BUFFER_IOERR, &eb->bflags)) { - 
set_bit(EXTENT_BUFFER_IOERR, &eb->bflags); - ClearPageUptodate(page); - SetPageError(page); - } - - end_page_writeback(page); - - if (!done) - continue; - - end_extent_buffer_writeback(eb); - } while (bvec >= bio->bi_io_vec); - - bio_put(bio); - -} - -static int write_one_eb(struct extent_buffer *eb, - struct btrfs_fs_info *fs_info, - struct writeback_control *wbc, - struct extent_page_data *epd) -{ - struct block_device *bdev = fs_info->fs_devices->latest_bdev; - u64 offset = eb->start; - unsigned long i, num_pages; - int rw = (epd->sync_io ? WRITE_SYNC : WRITE); - int ret; - - clear_bit(EXTENT_BUFFER_IOERR, &eb->bflags); - num_pages = num_extent_pages(eb->start, eb->len); - atomic_set(&eb->io_pages, num_pages); - for (i = 0; i < num_pages; i++) { - struct page *p = extent_buffer_page(eb, i); - - clear_page_dirty_for_io(p); - set_page_writeback(p); - ret = submit_extent_page(rw, eb->tree, p, offset >> 9, - PAGE_CACHE_SIZE, 0, bdev, &epd->bio, - -1, end_bio_extent_buffer_writepage, - 0, 0, 0); - if (ret) { - set_bit(EXTENT_BUFFER_IOERR, &eb->bflags); - SetPageError(p); - if (atomic_sub_and_test(num_pages - i, &eb->io_pages)) - end_extent_buffer_writeback(eb); - ret = -EIO; - break; - } - offset += PAGE_CACHE_SIZE; - update_nr_written(p, wbc, 1); - unlock_page(p); - } - - if (unlikely(ret)) { - for (; i < num_pages; i++) { - struct page *p = extent_buffer_page(eb, i); - unlock_page(p); - } - } - - return ret; -} - -int btree_write_cache_pages(struct address_space *mapping, - struct writeback_control *wbc) -{ - struct extent_io_tree *tree = &BTRFS_I(mapping->host)->io_tree; - struct btrfs_fs_info *fs_info = BTRFS_I(mapping->host)->root->fs_info; - struct extent_buffer *eb, *prev_eb = NULL; - struct extent_page_data epd = { - .bio = NULL, - .tree = tree, - .extent_locked = 0, - .sync_io = wbc->sync_mode == WB_SYNC_ALL, - }; - int ret = 0; - int done = 0; - int nr_to_write_done = 0; - struct pagevec pvec; - int nr_pages; - pgoff_t index; - pgoff_t end; /* Inclusive */ - int scanned = 0; - int tag; - - pagevec_init(&pvec, 0); - if (wbc->range_cyclic) { - index = mapping->writeback_index; /* Start from prev offset */ - end = -1; - } else { - index = wbc->range_start >> PAGE_CACHE_SHIFT; - end = wbc->range_end >> PAGE_CACHE_SHIFT; - scanned = 1; - } - if (wbc->sync_mode == WB_SYNC_ALL) - tag = PAGECACHE_TAG_TOWRITE; - else - tag = PAGECACHE_TAG_DIRTY; -retry: - if (wbc->sync_mode == WB_SYNC_ALL) - tag_pages_for_writeback(mapping, index, end); - while (!done && !nr_to_write_done && (index <= end) && - (nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, tag, - min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1))) { - unsigned i; - - scanned = 1; - for (i = 0; i < nr_pages; i++) { - struct page *page = pvec.pages[i]; - - if (!PagePrivate(page)) - continue; - - if (!wbc->range_cyclic && page->index > end) { - done = 1; - break; - } - - eb = (struct extent_buffer *)page->private; - if (!eb) { - WARN_ON(1); - continue; - } - - if (eb == prev_eb) - continue; - - if (!atomic_inc_not_zero(&eb->refs)) { - WARN_ON(1); - continue; - } - - prev_eb = eb; - ret = lock_extent_buffer_for_io(eb, fs_info, &epd); - if (!ret) { - free_extent_buffer(eb); - continue; - } - - ret = write_one_eb(eb, fs_info, wbc, &epd); - if (ret) { - done = 1; - free_extent_buffer(eb); - break; - } - free_extent_buffer(eb); - - /* - * the filesystem may choose to bump up nr_to_write. 
- * We have to make sure to honor the new nr_to_write - * at any time - */ - nr_to_write_done = wbc->nr_to_write <= 0; - } - pagevec_release(&pvec); - cond_resched(); - } - if (!scanned && !done) { - /* - * We hit the last page and there is more work to be done: wrap - * back to the start of the file - */ - scanned = 1; - index = 0; - goto retry; - } - flush_write_bio(&epd); - return ret; -} - -/** - * write_cache_pages - walk the list of dirty pages of the given address space and write all of them. - * @mapping: address space structure to write - * @wbc: subtract the number of written pages from *@wbc->nr_to_write - * @writepage: function called for each page - * @data: data passed to writepage function - * - * If a page is already under I/O, write_cache_pages() skips it, even - * if it's dirty. This is desirable behaviour for memory-cleaning writeback, - * but it is INCORRECT for data-integrity system calls such as fsync(). fsync() - * and msync() need to guarantee that all the data which was dirty at the time - * the call was made get new I/O started against them. If wbc->sync_mode is - * WB_SYNC_ALL then we were called for data integrity and we must wait for - * existing IO to complete. - */ -static int extent_write_cache_pages(struct extent_io_tree *tree, - struct address_space *mapping, - struct writeback_control *wbc, - writepage_t writepage, void *data, - void (*flush_fn)(void *)) -{ - int ret = 0; - int done = 0; - int nr_to_write_done = 0; - struct pagevec pvec; - int nr_pages; - pgoff_t index; - pgoff_t end; /* Inclusive */ - int scanned = 0; - int tag; - - pagevec_init(&pvec, 0); - if (wbc->range_cyclic) { - index = mapping->writeback_index; /* Start from prev offset */ - end = -1; - } else { - index = wbc->range_start >> PAGE_CACHE_SHIFT; - end = wbc->range_end >> PAGE_CACHE_SHIFT; - scanned = 1; - } - if (wbc->sync_mode == WB_SYNC_ALL) - tag = PAGECACHE_TAG_TOWRITE; - else - tag = PAGECACHE_TAG_DIRTY; -retry: - if (wbc->sync_mode == WB_SYNC_ALL) - tag_pages_for_writeback(mapping, index, end); - while (!done && !nr_to_write_done && (index <= end) && - (nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, tag, - min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1))) { - unsigned i; - - scanned = 1; - for (i = 0; i < nr_pages; i++) { - struct page *page = pvec.pages[i]; - - /* - * At this point we hold neither mapping->tree_lock nor - * lock on the page itself: the page may be truncated or - * invalidated (changing page->mapping to NULL), or even - * swizzled back from swapper_space to tmpfs file - * mapping - */ - if (tree->ops && - tree->ops->write_cache_pages_lock_hook) { - tree->ops->write_cache_pages_lock_hook(page, - data, flush_fn); - } else { - if (!trylock_page(page)) { - flush_fn(data); - lock_page(page); - } - } - - if (unlikely(page->mapping != mapping)) { - unlock_page(page); - continue; - } - - if (!wbc->range_cyclic && page->index > end) { - done = 1; - unlock_page(page); - continue; - } - - if (wbc->sync_mode != WB_SYNC_NONE) { - if (PageWriteback(page)) - flush_fn(data); - wait_on_page_writeback(page); - } - - if (PageWriteback(page) || - !clear_page_dirty_for_io(page)) { - unlock_page(page); - continue; - } - - ret = (*writepage)(page, wbc, data); - - if (unlikely(ret == AOP_WRITEPAGE_ACTIVATE)) { - unlock_page(page); - ret = 0; - } - if (ret) - done = 1; - - /* - * the filesystem may choose to bump up nr_to_write. 
- * We have to make sure to honor the new nr_to_write - * at any time - */ - nr_to_write_done = wbc->nr_to_write <= 0; - } - pagevec_release(&pvec); - cond_resched(); - } - if (!scanned && !done) { - /* - * We hit the last page and there is more work to be done: wrap - * back to the start of the file - */ - scanned = 1; - index = 0; - goto retry; - } - return ret; -} - -static void flush_epd_write_bio(struct extent_page_data *epd) -{ - if (epd->bio) { - int rw = WRITE; - int ret; - - if (epd->sync_io) - rw = WRITE_SYNC; - - ret = submit_one_bio(rw, epd->bio, 0, 0); - BUG_ON(ret < 0); /* -ENOMEM */ - epd->bio = NULL; - } -} - -static noinline void flush_write_bio(void *data) -{ - struct extent_page_data *epd = data; - flush_epd_write_bio(epd); -} - -int extent_write_full_page(struct extent_io_tree *tree, struct page *page, - get_extent_t *get_extent, - struct writeback_control *wbc) -{ - int ret; - struct extent_page_data epd = { - .bio = NULL, - .tree = tree, - .get_extent = get_extent, - .extent_locked = 0, - .sync_io = wbc->sync_mode == WB_SYNC_ALL, - }; - - ret = __extent_writepage(page, wbc, &epd); - - flush_epd_write_bio(&epd); - return ret; -} - -int extent_write_locked_range(struct extent_io_tree *tree, struct inode *inode, - u64 start, u64 end, get_extent_t *get_extent, - int mode) -{ - int ret = 0; - struct address_space *mapping = inode->i_mapping; - struct page *page; - unsigned long nr_pages = (end - start + PAGE_CACHE_SIZE) >> - PAGE_CACHE_SHIFT; - - struct extent_page_data epd = { - .bio = NULL, - .tree = tree, - .get_extent = get_extent, - .extent_locked = 1, - .sync_io = mode == WB_SYNC_ALL, - }; - struct writeback_control wbc_writepages = { - .sync_mode = mode, - .nr_to_write = nr_pages * 2, - .range_start = start, - .range_end = end + 1, - }; - - while (start <= end) { - page = find_get_page(mapping, start >> PAGE_CACHE_SHIFT); - if (clear_page_dirty_for_io(page)) - ret = __extent_writepage(page, &wbc_writepages, &epd); - else { - if (tree->ops && tree->ops->writepage_end_io_hook) - tree->ops->writepage_end_io_hook(page, start, - start + PAGE_CACHE_SIZE - 1, - NULL, 1); - unlock_page(page); - } - page_cache_release(page); - start += PAGE_CACHE_SIZE; - } - - flush_epd_write_bio(&epd); - return ret; -} - -int extent_writepages(struct extent_io_tree *tree, - struct address_space *mapping, - get_extent_t *get_extent, - struct writeback_control *wbc) -{ - int ret = 0; - struct extent_page_data epd = { - .bio = NULL, - .tree = tree, - .get_extent = get_extent, - .extent_locked = 0, - .sync_io = wbc->sync_mode == WB_SYNC_ALL, - }; - - ret = extent_write_cache_pages(tree, mapping, wbc, - __extent_writepage, &epd, - flush_write_bio); - flush_epd_write_bio(&epd); - return ret; -} - -int extent_readpages(struct extent_io_tree *tree, - struct address_space *mapping, - struct list_head *pages, unsigned nr_pages, - get_extent_t get_extent) -{ - struct bio *bio = NULL; - unsigned page_idx; - unsigned long bio_flags = 0; - - for (page_idx = 0; page_idx < nr_pages; page_idx++) { - struct page *page = list_entry(pages->prev, struct page, lru); - - prefetchw(&page->flags); - list_del(&page->lru); - if (!add_to_page_cache_lru(page, mapping, - page->index, GFP_NOFS)) { - __extent_read_full_page(tree, page, get_extent, - &bio, 0, &bio_flags); - } - page_cache_release(page); - } - BUG_ON(!list_empty(pages)); - if (bio) - return submit_one_bio(READ, bio, 0, bio_flags); - return 0; -} - -/* - * basic invalidatepage code, this waits on any locked or writeback - * ranges corresponding to the page, 
and then deletes any extent state - * records from the tree - */ -int extent_invalidatepage(struct extent_io_tree *tree, - struct page *page, unsigned long offset) -{ - struct extent_state *cached_state = NULL; - u64 start = ((u64)page->index << PAGE_CACHE_SHIFT); - u64 end = start + PAGE_CACHE_SIZE - 1; - size_t blocksize = page->mapping->host->i_sb->s_blocksize; - - start += (offset + blocksize - 1) & ~(blocksize - 1); - if (start > end) - return 0; - - lock_extent_bits(tree, start, end, 0, &cached_state); - wait_on_page_writeback(page); - clear_extent_bit(tree, start, end, - EXTENT_LOCKED | EXTENT_DIRTY | EXTENT_DELALLOC | - EXTENT_DO_ACCOUNTING, - 1, 1, &cached_state, GFP_NOFS); - return 0; -} - -/* - * a helper for releasepage, this tests for areas of the page that - * are locked or under IO and drops the related state bits if it is safe - * to drop the page. - */ -int try_release_extent_state(struct extent_map_tree *map, - struct extent_io_tree *tree, struct page *page, - gfp_t mask) -{ - u64 start = (u64)page->index << PAGE_CACHE_SHIFT; - u64 end = start + PAGE_CACHE_SIZE - 1; - int ret = 1; - - if (test_range_bit(tree, start, end, - EXTENT_IOBITS, 0, NULL)) - ret = 0; - else { - if ((mask & GFP_NOFS) == GFP_NOFS) - mask = GFP_NOFS; - /* - * at this point we can safely clear everything except the - * locked bit and the nodatasum bit - */ - ret = clear_extent_bit(tree, start, end, - ~(EXTENT_LOCKED | EXTENT_NODATASUM), - 0, 0, NULL, mask); - - /* if clear_extent_bit failed for enomem reasons, - * we can't allow the release to continue. - */ - if (ret < 0) - ret = 0; - else - ret = 1; - } - return ret; -} - -/* - * a helper for releasepage. As long as there are no locked extents - * in the range corresponding to the page, both state records and extent - * map records are removed - */ -int try_release_extent_mapping(struct extent_map_tree *map, - struct extent_io_tree *tree, struct page *page, - gfp_t mask) -{ - struct extent_map *em; - u64 start = (u64)page->index << PAGE_CACHE_SHIFT; - u64 end = start + PAGE_CACHE_SIZE - 1; - - if ((mask & __GFP_WAIT) && - page->mapping->host->i_size > 16 * 1024 * 1024) { - u64 len; - while (start <= end) { - len = end - start + 1; - write_lock(&map->lock); - em = lookup_extent_mapping(map, start, len); - if (!em) { - write_unlock(&map->lock); - break; - } - if (test_bit(EXTENT_FLAG_PINNED, &em->flags) || - em->start != start) { - write_unlock(&map->lock); - free_extent_map(em); - break; - } - if (!test_range_bit(tree, em->start, - extent_map_end(em) - 1, - EXTENT_LOCKED | EXTENT_WRITEBACK, - 0, NULL)) { - remove_extent_mapping(map, em); - /* once for the rb tree */ - free_extent_map(em); - } - start = extent_map_end(em); - write_unlock(&map->lock); - - /* once for us */ - free_extent_map(em); - } - } - return try_release_extent_state(map, tree, page, mask); -} - -/* - * helper function for fiemap, which doesn't want to see any holes. 
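 * (A "hole" here is an extent_map that has EXTENT_FLAG_VACANCY set or whose
 * block_start is EXTENT_MAP_HOLE, i.e. a range with no on-disk data behind it.)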
- * This maps until we find something past 'last' - */ -static struct extent_map *get_extent_skip_holes(struct inode *inode, - u64 offset, - u64 last, - get_extent_t *get_extent) -{ - u64 sectorsize = BTRFS_I(inode)->root->sectorsize; - struct extent_map *em; - u64 len; - - if (offset >= last) - return NULL; - - while(1) { - len = last - offset; - if (len == 0) - break; - len = (len + sectorsize - 1) & ~(sectorsize - 1); - em = get_extent(inode, NULL, 0, offset, len, 0); - if (IS_ERR_OR_NULL(em)) - return em; - - /* if this isn't a hole return it */ - if (!test_bit(EXTENT_FLAG_VACANCY, &em->flags) && - em->block_start != EXTENT_MAP_HOLE) { - return em; - } - - /* this is a hole, advance to the next extent */ - offset = extent_map_end(em); - free_extent_map(em); - if (offset >= last) - break; - } - return NULL; -} - -int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, - __u64 start, __u64 len, get_extent_t *get_extent) -{ - int ret = 0; - u64 off = start; - u64 max = start + len; - u32 flags = 0; - u32 found_type; - u64 last; - u64 last_for_get_extent = 0; - u64 disko = 0; - u64 isize = i_size_read(inode); - struct btrfs_key found_key; - struct extent_map *em = NULL; - struct extent_state *cached_state = NULL; - struct btrfs_path *path; - struct btrfs_file_extent_item *item; - int end = 0; - u64 em_start = 0; - u64 em_len = 0; - u64 em_end = 0; - unsigned long emflags; - - if (len == 0) - return -EINVAL; - - path = btrfs_alloc_path(); - if (!path) - return -ENOMEM; - path->leave_spinning = 1; - - start = ALIGN(start, BTRFS_I(inode)->root->sectorsize); - len = ALIGN(len, BTRFS_I(inode)->root->sectorsize); - - /* - * lookup the last file extent. We're not using i_size here - * because there might be preallocation past i_size - */ - ret = btrfs_lookup_file_extent(NULL, BTRFS_I(inode)->root, - path, btrfs_ino(inode), -1, 0); - if (ret < 0) { - btrfs_free_path(path); - return ret; - } - WARN_ON(!ret); - path->slots[0]--; - item = btrfs_item_ptr(path->nodes[0], path->slots[0], - struct btrfs_file_extent_item); - btrfs_item_key_to_cpu(path->nodes[0], &found_key, path->slots[0]); - found_type = btrfs_key_type(&found_key); - - /* No extents, but there might be delalloc bits */ - if (found_key.objectid != btrfs_ino(inode) || - found_type != BTRFS_EXTENT_DATA_KEY) { - /* have to trust i_size as the end */ - last = (u64)-1; - last_for_get_extent = isize; - } else { - /* - * remember the start of the last extent. There are a - * bunch of different factors that go into the length of the - * extent, so its much less complex to remember where it started - */ - last = found_key.offset; - last_for_get_extent = last + 1; - } - btrfs_free_path(path); - - /* - * we might have some extents allocated but more delalloc past those - * extents. so, we trust isize unless the start of the last extent is - * beyond isize - */ - if (last < isize) { - last = (u64)-1; - last_for_get_extent = isize; - } - - lock_extent_bits(&BTRFS_I(inode)->io_tree, start, start + len, 0, - &cached_state); - - em = get_extent_skip_holes(inode, start, last_for_get_extent, - get_extent); - if (!em) - goto out; - if (IS_ERR(em)) { - ret = PTR_ERR(em); - goto out; - } - - while (!end) { - u64 offset_in_extent; - - /* break if the extent we found is outside the range */ - if (em->start >= max || extent_map_end(em) < off) - break; - - /* - * get_extent may return an extent that starts before our - * requested range. 
We have to make sure the ranges - * we return to fiemap always move forward and don't - * overlap, so adjust the offsets here - */ - em_start = max(em->start, off); - - /* - * record the offset from the start of the extent - * for adjusting the disk offset below - */ - offset_in_extent = em_start - em->start; - em_end = extent_map_end(em); - em_len = em_end - em_start; - emflags = em->flags; - disko = 0; - flags = 0; - - /* - * bump off for our next call to get_extent - */ - off = extent_map_end(em); - if (off >= max) - end = 1; - - if (em->block_start == EXTENT_MAP_LAST_BYTE) { - end = 1; - flags |= FIEMAP_EXTENT_LAST; - } else if (em->block_start == EXTENT_MAP_INLINE) { - flags |= (FIEMAP_EXTENT_DATA_INLINE | - FIEMAP_EXTENT_NOT_ALIGNED); - } else if (em->block_start == EXTENT_MAP_DELALLOC) { - flags |= (FIEMAP_EXTENT_DELALLOC | - FIEMAP_EXTENT_UNKNOWN); - } else { - disko = em->block_start + offset_in_extent; - } - if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) - flags |= FIEMAP_EXTENT_ENCODED; - - free_extent_map(em); - em = NULL; - if ((em_start >= last) || em_len == (u64)-1 || - (last == (u64)-1 && isize <= em_end)) { - flags |= FIEMAP_EXTENT_LAST; - end = 1; - } - - /* now scan forward to see if this is really the last extent. */ - em = get_extent_skip_holes(inode, off, last_for_get_extent, - get_extent); - if (IS_ERR(em)) { - ret = PTR_ERR(em); - goto out; - } - if (!em) { - flags |= FIEMAP_EXTENT_LAST; - end = 1; - } - ret = fiemap_fill_next_extent(fieinfo, em_start, disko, - em_len, flags); - if (ret) - goto out_free; - } -out_free: - free_extent_map(em); -out: - unlock_extent_cached(&BTRFS_I(inode)->io_tree, start, start + len, - &cached_state, GFP_NOFS); - return ret; -} - -inline struct page *extent_buffer_page(struct extent_buffer *eb, - unsigned long i) -{ - return eb->pages[i]; -} - -inline unsigned long num_extent_pages(u64 start, u64 len) -{ - return ((start + len + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT) - - (start >> PAGE_CACHE_SHIFT); -} - -static void __free_extent_buffer(struct extent_buffer *eb) -{ -#if LEAK_DEBUG - unsigned long flags; - spin_lock_irqsave(&leak_lock, flags); - list_del(&eb->leak_list); - spin_unlock_irqrestore(&leak_lock, flags); -#endif - if (eb->pages && eb->pages != eb->inline_pages) - kfree(eb->pages); - kmem_cache_free(extent_buffer_cache, eb); -} - -static struct extent_buffer *__alloc_extent_buffer(struct extent_io_tree *tree, - u64 start, - unsigned long len, - gfp_t mask) -{ - struct extent_buffer *eb = NULL; -#if LEAK_DEBUG - unsigned long flags; -#endif - - eb = kmem_cache_zalloc(extent_buffer_cache, mask); - if (eb == NULL) - return NULL; - eb->start = start; - eb->len = len; - eb->tree = tree; - rwlock_init(&eb->lock); - atomic_set(&eb->write_locks, 0); - atomic_set(&eb->read_locks, 0); - atomic_set(&eb->blocking_readers, 0); - atomic_set(&eb->blocking_writers, 0); - atomic_set(&eb->spinning_readers, 0); - atomic_set(&eb->spinning_writers, 0); - eb->lock_nested = 0; - init_waitqueue_head(&eb->write_lock_wq); - init_waitqueue_head(&eb->read_lock_wq); - -#if LEAK_DEBUG - spin_lock_irqsave(&leak_lock, flags); - list_add(&eb->leak_list, &buffers); - spin_unlock_irqrestore(&leak_lock, flags); -#endif - spin_lock_init(&eb->refs_lock); - atomic_set(&eb->refs, 1); - atomic_set(&eb->io_pages, 0); - - if (len > MAX_INLINE_EXTENT_BUFFER_SIZE) { - struct page **pages; - int num_pages = (len + PAGE_CACHE_SIZE - 1) >> - PAGE_CACHE_SHIFT; - pages = kzalloc(num_pages, mask); - if (!pages) { - __free_extent_buffer(eb); - return NULL; - } - 
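/*
 * Buffers larger than MAX_INLINE_EXTENT_BUFFER_SIZE (16 pages) get a
 * separately allocated array of page pointers; smaller buffers reuse the
 * inline_pages array embedded in the extent_buffer below.  Note that, as
 * written, the kzalloc above requests num_pages bytes rather than
 * num_pages * sizeof(struct page *); kcalloc(num_pages, sizeof(struct page *),
 * mask) looks like the intended call, although metadata blocks that fit in
 * 16 pages never reach this branch.
 */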
eb->pages = pages; - } else { - eb->pages = eb->inline_pages; - } - - return eb; -} - -static int extent_buffer_under_io(struct extent_buffer *eb) -{ - return (atomic_read(&eb->io_pages) || - test_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags) || - test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags)); -} - -/* - * Helper for releasing extent buffer page. - */ -static void btrfs_release_extent_buffer_page(struct extent_buffer *eb, - unsigned long start_idx) -{ - unsigned long index; - struct page *page; - - BUG_ON(extent_buffer_under_io(eb)); - - index = num_extent_pages(eb->start, eb->len); - if (start_idx >= index) - return; - - do { - index--; - page = extent_buffer_page(eb, index); - if (page) { - spin_lock(&page->mapping->private_lock); - /* - * We do this since we'll remove the pages after we've - * removed the eb from the radix tree, so we could race - * and have this page now attached to the new eb. So - * only clear page_private if it's still connected to - * this eb. - */ - if (PagePrivate(page) && - page->private == (unsigned long)eb) { - BUG_ON(test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags)); - BUG_ON(PageDirty(page)); - BUG_ON(PageWriteback(page)); - /* - * We need to make sure we haven't be attached - * to a new eb. - */ - ClearPagePrivate(page); - set_page_private(page, 0); - /* One for the page private */ - page_cache_release(page); - } - spin_unlock(&page->mapping->private_lock); - - /* One for when we alloced the page */ - page_cache_release(page); - } - } while (index != start_idx); -} - -/* - * Helper for releasing the extent buffer. - */ -static inline void btrfs_release_extent_buffer(struct extent_buffer *eb) -{ - btrfs_release_extent_buffer_page(eb, 0); - __free_extent_buffer(eb); -} - -static void check_buffer_tree_ref(struct extent_buffer *eb) -{ - /* the ref bit is tricky. We have to make sure it is set - * if we have the buffer dirty. Otherwise the - * code to free a buffer can end up dropping a dirty - * page - * - * Once the ref bit is set, it won't go away while the - * buffer is dirty or in writeback, and it also won't - * go away while we have the reference count on the - * eb bumped. - * - * We can't just set the ref bit without bumping the - * ref on the eb because free_extent_buffer might - * see the ref bit and try to clear it. If this happens - * free_extent_buffer might end up dropping our original - * ref by mistake and freeing the page before we are able - * to add one more ref. - * - * So bump the ref count first, then set the bit. If someone - * beat us to it, drop the ref we added. 
- */ - if (!test_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags)) { - atomic_inc(&eb->refs); - if (test_and_set_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags)) - atomic_dec(&eb->refs); - } -} - -static void mark_extent_buffer_accessed(struct extent_buffer *eb) -{ - unsigned long num_pages, i; - - check_buffer_tree_ref(eb); - - num_pages = num_extent_pages(eb->start, eb->len); - for (i = 0; i < num_pages; i++) { - struct page *p = extent_buffer_page(eb, i); - mark_page_accessed(p); - } -} - -struct extent_buffer *alloc_extent_buffer(struct extent_io_tree *tree, - u64 start, unsigned long len) -{ - unsigned long num_pages = num_extent_pages(start, len); - unsigned long i; - unsigned long index = start >> PAGE_CACHE_SHIFT; - struct extent_buffer *eb; - struct extent_buffer *exists = NULL; - struct page *p; - struct address_space *mapping = tree->mapping; - int uptodate = 1; - int ret; - - rcu_read_lock(); - eb = radix_tree_lookup(&tree->buffer, start >> PAGE_CACHE_SHIFT); - if (eb && atomic_inc_not_zero(&eb->refs)) { - rcu_read_unlock(); - mark_extent_buffer_accessed(eb); - return eb; - } - rcu_read_unlock(); - - eb = __alloc_extent_buffer(tree, start, len, GFP_NOFS); - if (!eb) - return NULL; - - for (i = 0; i < num_pages; i++, index++) { - p = find_or_create_page(mapping, index, GFP_NOFS); - if (!p) { - WARN_ON(1); - goto free_eb; - } - - spin_lock(&mapping->private_lock); - if (PagePrivate(p)) { - /* - * We could have already allocated an eb for this page - * and attached one so lets see if we can get a ref on - * the existing eb, and if we can we know it's good and - * we can just return that one, else we know we can just - * overwrite page->private. - */ - exists = (struct extent_buffer *)p->private; - if (atomic_inc_not_zero(&exists->refs)) { - spin_unlock(&mapping->private_lock); - unlock_page(p); - page_cache_release(p); - mark_extent_buffer_accessed(exists); - goto free_eb; - } - - /* - * Do this so attach doesn't complain and we need to - * drop the ref the old guy had. - */ - ClearPagePrivate(p); - WARN_ON(PageDirty(p)); - page_cache_release(p); - } - attach_extent_buffer_page(eb, p); - spin_unlock(&mapping->private_lock); - WARN_ON(PageDirty(p)); - mark_page_accessed(p); - eb->pages[i] = p; - if (!PageUptodate(p)) - uptodate = 0; - - /* - * see below about how we avoid a nasty race with release page - * and why we unlock later - */ - } - if (uptodate) - set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags); -again: - ret = radix_tree_preload(GFP_NOFS & ~__GFP_HIGHMEM); - if (ret) - goto free_eb; - - spin_lock(&tree->buffer_lock); - ret = radix_tree_insert(&tree->buffer, start >> PAGE_CACHE_SHIFT, eb); - if (ret == -EEXIST) { - exists = radix_tree_lookup(&tree->buffer, - start >> PAGE_CACHE_SHIFT); - if (!atomic_inc_not_zero(&exists->refs)) { - spin_unlock(&tree->buffer_lock); - radix_tree_preload_end(); - exists = NULL; - goto again; - } - spin_unlock(&tree->buffer_lock); - radix_tree_preload_end(); - mark_extent_buffer_accessed(exists); - goto free_eb; - } - /* add one reference for the tree */ - spin_lock(&eb->refs_lock); - check_buffer_tree_ref(eb); - spin_unlock(&eb->refs_lock); - spin_unlock(&tree->buffer_lock); - radix_tree_preload_end(); - - /* - * there is a race where release page may have - * tried to find this extent buffer in the radix - * but failed. It will tell the VM it is safe to - * reclaim the, and it will clear the page private bit. 
- * We must make sure to set the page private bit properly - * after the extent buffer is in the radix tree so - * it doesn't get lost - */ - SetPageChecked(eb->pages[0]); - for (i = 1; i < num_pages; i++) { - p = extent_buffer_page(eb, i); - ClearPageChecked(p); - unlock_page(p); - } - unlock_page(eb->pages[0]); - return eb; - -free_eb: - for (i = 0; i < num_pages; i++) { - if (eb->pages[i]) - unlock_page(eb->pages[i]); - } - - WARN_ON(!atomic_dec_and_test(&eb->refs)); - btrfs_release_extent_buffer(eb); - return exists; -} - -struct extent_buffer *find_extent_buffer(struct extent_io_tree *tree, - u64 start, unsigned long len) -{ - struct extent_buffer *eb; - - rcu_read_lock(); - eb = radix_tree_lookup(&tree->buffer, start >> PAGE_CACHE_SHIFT); - if (eb && atomic_inc_not_zero(&eb->refs)) { - rcu_read_unlock(); - mark_extent_buffer_accessed(eb); - return eb; - } - rcu_read_unlock(); - - return NULL; -} - -static inline void btrfs_release_extent_buffer_rcu(struct rcu_head *head) -{ - struct extent_buffer *eb = - container_of(head, struct extent_buffer, rcu_head); - - __free_extent_buffer(eb); -} - -/* Expects to have eb->eb_lock already held */ -static void release_extent_buffer(struct extent_buffer *eb, gfp_t mask) -{ - WARN_ON(atomic_read(&eb->refs) == 0); - if (atomic_dec_and_test(&eb->refs)) { - struct extent_io_tree *tree = eb->tree; - - spin_unlock(&eb->refs_lock); - - spin_lock(&tree->buffer_lock); - radix_tree_delete(&tree->buffer, - eb->start >> PAGE_CACHE_SHIFT); - spin_unlock(&tree->buffer_lock); - - /* Should be safe to release our pages at this point */ - btrfs_release_extent_buffer_page(eb, 0); - - call_rcu(&eb->rcu_head, btrfs_release_extent_buffer_rcu); - return; - } - spin_unlock(&eb->refs_lock); -} - -void free_extent_buffer(struct extent_buffer *eb) -{ - if (!eb) - return; - - spin_lock(&eb->refs_lock); - if (atomic_read(&eb->refs) == 2 && - test_bit(EXTENT_BUFFER_STALE, &eb->bflags) && - !extent_buffer_under_io(eb) && - test_and_clear_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags)) - atomic_dec(&eb->refs); - - /* - * I know this is terrible, but it's temporary until we stop tracking - * the uptodate bits and such for the extent buffers. 
- */ - release_extent_buffer(eb, GFP_ATOMIC); -} - -void free_extent_buffer_stale(struct extent_buffer *eb) -{ - if (!eb) - return; - - spin_lock(&eb->refs_lock); - set_bit(EXTENT_BUFFER_STALE, &eb->bflags); - - if (atomic_read(&eb->refs) == 2 && !extent_buffer_under_io(eb) && - test_and_clear_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags)) - atomic_dec(&eb->refs); - release_extent_buffer(eb, GFP_NOFS); -} - -void clear_extent_buffer_dirty(struct extent_buffer *eb) -{ - unsigned long i; - unsigned long num_pages; - struct page *page; - - num_pages = num_extent_pages(eb->start, eb->len); - - for (i = 0; i < num_pages; i++) { - page = extent_buffer_page(eb, i); - if (!PageDirty(page)) - continue; - - lock_page(page); - WARN_ON(!PagePrivate(page)); - - clear_page_dirty_for_io(page); - spin_lock_irq(&page->mapping->tree_lock); - if (!PageDirty(page)) { - radix_tree_tag_clear(&page->mapping->page_tree, - page_index(page), - PAGECACHE_TAG_DIRTY); - } - spin_unlock_irq(&page->mapping->tree_lock); - ClearPageError(page); - unlock_page(page); - } - WARN_ON(atomic_read(&eb->refs) == 0); -} - -int set_extent_buffer_dirty(struct extent_buffer *eb) -{ - unsigned long i; - unsigned long num_pages; - int was_dirty = 0; - - check_buffer_tree_ref(eb); - - was_dirty = test_and_set_bit(EXTENT_BUFFER_DIRTY, &eb->bflags); - - num_pages = num_extent_pages(eb->start, eb->len); - WARN_ON(atomic_read(&eb->refs) == 0); - WARN_ON(!test_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags)); - - for (i = 0; i < num_pages; i++) - set_page_dirty(extent_buffer_page(eb, i)); - return was_dirty; -} - -static int range_straddles_pages(u64 start, u64 len) -{ - if (len < PAGE_CACHE_SIZE) - return 1; - if (start & (PAGE_CACHE_SIZE - 1)) - return 1; - if ((start + len) & (PAGE_CACHE_SIZE - 1)) - return 1; - return 0; -} - -int clear_extent_buffer_uptodate(struct extent_buffer *eb) -{ - unsigned long i; - struct page *page; - unsigned long num_pages; - - clear_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags); - num_pages = num_extent_pages(eb->start, eb->len); - for (i = 0; i < num_pages; i++) { - page = extent_buffer_page(eb, i); - if (page) - ClearPageUptodate(page); - } - return 0; -} - -int set_extent_buffer_uptodate(struct extent_buffer *eb) -{ - unsigned long i; - struct page *page; - unsigned long num_pages; - - set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags); - num_pages = num_extent_pages(eb->start, eb->len); - for (i = 0; i < num_pages; i++) { - page = extent_buffer_page(eb, i); - SetPageUptodate(page); - } - return 0; -} - -int extent_range_uptodate(struct extent_io_tree *tree, - u64 start, u64 end) -{ - struct page *page; - int ret; - int pg_uptodate = 1; - int uptodate; - unsigned long index; - - if (range_straddles_pages(start, end - start + 1)) { - ret = test_range_bit(tree, start, end, - EXTENT_UPTODATE, 1, NULL); - if (ret) - return 1; - } - while (start <= end) { - index = start >> PAGE_CACHE_SHIFT; - page = find_get_page(tree->mapping, index); - if (!page) - return 1; - uptodate = PageUptodate(page); - page_cache_release(page); - if (!uptodate) { - pg_uptodate = 0; - break; - } - start += PAGE_CACHE_SIZE; - } - return pg_uptodate; -} - -int extent_buffer_uptodate(struct extent_buffer *eb) -{ - return test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags); -} - -int read_extent_buffer_pages(struct extent_io_tree *tree, - struct extent_buffer *eb, u64 start, int wait, - get_extent_t *get_extent, int mirror_num) -{ - unsigned long i; - unsigned long start_i; - struct page *page; - int err; - int ret = 0; - int locked_pages = 0; - int all_uptodate = 
1; - unsigned long num_pages; - unsigned long num_reads = 0; - struct bio *bio = NULL; - unsigned long bio_flags = 0; - - if (test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags)) - return 0; - - if (start) { - WARN_ON(start < eb->start); - start_i = (start >> PAGE_CACHE_SHIFT) - - (eb->start >> PAGE_CACHE_SHIFT); - } else { - start_i = 0; - } - - num_pages = num_extent_pages(eb->start, eb->len); - for (i = start_i; i < num_pages; i++) { - page = extent_buffer_page(eb, i); - if (wait == WAIT_NONE) { - if (!trylock_page(page)) - goto unlock_exit; - } else { - lock_page(page); - } - locked_pages++; - if (!PageUptodate(page)) { - num_reads++; - all_uptodate = 0; - } - } - if (all_uptodate) { - if (start_i == 0) - set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags); - goto unlock_exit; - } - - clear_bit(EXTENT_BUFFER_IOERR, &eb->bflags); - eb->read_mirror = 0; - atomic_set(&eb->io_pages, num_reads); - for (i = start_i; i < num_pages; i++) { - page = extent_buffer_page(eb, i); - if (!PageUptodate(page)) { - ClearPageError(page); - err = __extent_read_full_page(tree, page, - get_extent, &bio, - mirror_num, &bio_flags); - if (err) - ret = err; - } else { - unlock_page(page); - } - } - - if (bio) { - err = submit_one_bio(READ, bio, mirror_num, bio_flags); - if (err) - return err; - } - - if (ret || wait != WAIT_COMPLETE) - return ret; - - for (i = start_i; i < num_pages; i++) { - page = extent_buffer_page(eb, i); - wait_on_page_locked(page); - if (!PageUptodate(page)) - ret = -EIO; - } - - return ret; - -unlock_exit: - i = start_i; - while (locked_pages > 0) { - page = extent_buffer_page(eb, i); - i++; - unlock_page(page); - locked_pages--; - } - return ret; -} - -void read_extent_buffer(struct extent_buffer *eb, void *dstv, - unsigned long start, - unsigned long len) -{ - size_t cur; - size_t offset; - struct page *page; - char *kaddr; - char *dst = (char *)dstv; - size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1); - unsigned long i = (start_offset + start) >> PAGE_CACHE_SHIFT; - - WARN_ON(start > eb->len); - WARN_ON(start + len > eb->start + eb->len); - - offset = (start_offset + start) & ((unsigned long)PAGE_CACHE_SIZE - 1); - - while (len > 0) { - page = extent_buffer_page(eb, i); - - cur = min(len, (PAGE_CACHE_SIZE - offset)); - kaddr = page_address(page); - memcpy(dst, kaddr + offset, cur); - - dst += cur; - len -= cur; - offset = 0; - i++; - } -} - -int map_private_extent_buffer(struct extent_buffer *eb, unsigned long start, - unsigned long min_len, char **map, - unsigned long *map_start, - unsigned long *map_len) -{ - size_t offset = start & (PAGE_CACHE_SIZE - 1); - char *kaddr; - struct page *p; - size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1); - unsigned long i = (start_offset + start) >> PAGE_CACHE_SHIFT; - unsigned long end_i = (start_offset + start + min_len - 1) >> - PAGE_CACHE_SHIFT; - - if (i != end_i) - return -EINVAL; - - if (i == 0) { - offset = start_offset; - *map_start = 0; - } else { - offset = 0; - *map_start = ((u64)i << PAGE_CACHE_SHIFT) - start_offset; - } - - if (start + min_len > eb->len) { - printk(KERN_ERR "btrfs bad mapping eb start %llu len %lu, " - "wanted %lu %lu\n", (unsigned long long)eb->start, - eb->len, start, min_len); - WARN_ON(1); - return -EINVAL; - } - - p = extent_buffer_page(eb, i); - kaddr = page_address(p); - *map = kaddr + offset; - *map_len = PAGE_CACHE_SIZE - offset; - return 0; -} - -int memcmp_extent_buffer(struct extent_buffer *eb, const void *ptrv, - unsigned long start, - unsigned long len) -{ - size_t cur; - size_t offset; - 
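/*
 * Like read_extent_buffer() above, this walks the buffer one page at a
 * time: start_offset is eb->start's offset within its first page, i is the
 * page index that holds byte 'start', and each pass compares at most
 * PAGE_CACHE_SIZE - offset bytes before stepping to the next page.
 */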
struct page *page; - char *kaddr; - char *ptr = (char *)ptrv; - size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1); - unsigned long i = (start_offset + start) >> PAGE_CACHE_SHIFT; - int ret = 0; - - WARN_ON(start > eb->len); - WARN_ON(start + len > eb->start + eb->len); - - offset = (start_offset + start) & ((unsigned long)PAGE_CACHE_SIZE - 1); - - while (len > 0) { - page = extent_buffer_page(eb, i); - - cur = min(len, (PAGE_CACHE_SIZE - offset)); - - kaddr = page_address(page); - ret = memcmp(ptr, kaddr + offset, cur); - if (ret) - break; - - ptr += cur; - len -= cur; - offset = 0; - i++; - } - return ret; -} - -void write_extent_buffer(struct extent_buffer *eb, const void *srcv, - unsigned long start, unsigned long len) -{ - size_t cur; - size_t offset; - struct page *page; - char *kaddr; - char *src = (char *)srcv; - size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1); - unsigned long i = (start_offset + start) >> PAGE_CACHE_SHIFT; - - WARN_ON(start > eb->len); - WARN_ON(start + len > eb->start + eb->len); - - offset = (start_offset + start) & ((unsigned long)PAGE_CACHE_SIZE - 1); - - while (len > 0) { - page = extent_buffer_page(eb, i); - WARN_ON(!PageUptodate(page)); - - cur = min(len, PAGE_CACHE_SIZE - offset); - kaddr = page_address(page); - memcpy(kaddr + offset, src, cur); - - src += cur; - len -= cur; - offset = 0; - i++; - } -} - -void memset_extent_buffer(struct extent_buffer *eb, char c, - unsigned long start, unsigned long len) -{ - size_t cur; - size_t offset; - struct page *page; - char *kaddr; - size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1); - unsigned long i = (start_offset + start) >> PAGE_CACHE_SHIFT; - - WARN_ON(start > eb->len); - WARN_ON(start + len > eb->start + eb->len); - - offset = (start_offset + start) & ((unsigned long)PAGE_CACHE_SIZE - 1); - - while (len > 0) { - page = extent_buffer_page(eb, i); - WARN_ON(!PageUptodate(page)); - - cur = min(len, PAGE_CACHE_SIZE - offset); - kaddr = page_address(page); - memset(kaddr + offset, c, cur); - - len -= cur; - offset = 0; - i++; - } -} - -void copy_extent_buffer(struct extent_buffer *dst, struct extent_buffer *src, - unsigned long dst_offset, unsigned long src_offset, - unsigned long len) -{ - u64 dst_len = dst->len; - size_t cur; - size_t offset; - struct page *page; - char *kaddr; - size_t start_offset = dst->start & ((u64)PAGE_CACHE_SIZE - 1); - unsigned long i = (start_offset + dst_offset) >> PAGE_CACHE_SHIFT; - - WARN_ON(src->len != dst_len); - - offset = (start_offset + dst_offset) & - ((unsigned long)PAGE_CACHE_SIZE - 1); - - while (len > 0) { - page = extent_buffer_page(dst, i); - WARN_ON(!PageUptodate(page)); - - cur = min(len, (unsigned long)(PAGE_CACHE_SIZE - offset)); - - kaddr = page_address(page); - read_extent_buffer(src, kaddr + offset, src_offset, cur); - - src_offset += cur; - len -= cur; - offset = 0; - i++; - } -} - -static void move_pages(struct page *dst_page, struct page *src_page, - unsigned long dst_off, unsigned long src_off, - unsigned long len) -{ - char *dst_kaddr = page_address(dst_page); - if (dst_page == src_page) { - memmove(dst_kaddr + dst_off, dst_kaddr + src_off, len); - } else { - char *src_kaddr = page_address(src_page); - char *p = dst_kaddr + dst_off + len; - char *s = src_kaddr + src_off + len; - - while (len--) - *--p = *--s; - } -} - -static inline bool areas_overlap(unsigned long src, unsigned long dst, unsigned long len) -{ - unsigned long distance = (src > dst) ? 
src - dst : dst - src; - return distance < len; -} - -static void copy_pages(struct page *dst_page, struct page *src_page, - unsigned long dst_off, unsigned long src_off, - unsigned long len) -{ - char *dst_kaddr = page_address(dst_page); - char *src_kaddr; - int must_memmove = 0; - - if (dst_page != src_page) { - src_kaddr = page_address(src_page); - } else { - src_kaddr = dst_kaddr; - if (areas_overlap(src_off, dst_off, len)) - must_memmove = 1; - } - - if (must_memmove) - memmove(dst_kaddr + dst_off, src_kaddr + src_off, len); - else - memcpy(dst_kaddr + dst_off, src_kaddr + src_off, len); -} - -void memcpy_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset, - unsigned long src_offset, unsigned long len) -{ - size_t cur; - size_t dst_off_in_page; - size_t src_off_in_page; - size_t start_offset = dst->start & ((u64)PAGE_CACHE_SIZE - 1); - unsigned long dst_i; - unsigned long src_i; - - if (src_offset + len > dst->len) { - printk(KERN_ERR "btrfs memmove bogus src_offset %lu move " - "len %lu dst len %lu\n", src_offset, len, dst->len); - BUG_ON(1); - } - if (dst_offset + len > dst->len) { - printk(KERN_ERR "btrfs memmove bogus dst_offset %lu move " - "len %lu dst len %lu\n", dst_offset, len, dst->len); - BUG_ON(1); - } - - while (len > 0) { - dst_off_in_page = (start_offset + dst_offset) & - ((unsigned long)PAGE_CACHE_SIZE - 1); - src_off_in_page = (start_offset + src_offset) & - ((unsigned long)PAGE_CACHE_SIZE - 1); - - dst_i = (start_offset + dst_offset) >> PAGE_CACHE_SHIFT; - src_i = (start_offset + src_offset) >> PAGE_CACHE_SHIFT; - - cur = min(len, (unsigned long)(PAGE_CACHE_SIZE - - src_off_in_page)); - cur = min_t(unsigned long, cur, - (unsigned long)(PAGE_CACHE_SIZE - dst_off_in_page)); - - copy_pages(extent_buffer_page(dst, dst_i), - extent_buffer_page(dst, src_i), - dst_off_in_page, src_off_in_page, cur); - - src_offset += cur; - dst_offset += cur; - len -= cur; - } -} - -void memmove_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset, - unsigned long src_offset, unsigned long len) -{ - size_t cur; - size_t dst_off_in_page; - size_t src_off_in_page; - unsigned long dst_end = dst_offset + len - 1; - unsigned long src_end = src_offset + len - 1; - size_t start_offset = dst->start & ((u64)PAGE_CACHE_SIZE - 1); - unsigned long dst_i; - unsigned long src_i; - - if (src_offset + len > dst->len) { - printk(KERN_ERR "btrfs memmove bogus src_offset %lu move " - "len %lu len %lu\n", src_offset, len, dst->len); - BUG_ON(1); - } - if (dst_offset + len > dst->len) { - printk(KERN_ERR "btrfs memmove bogus dst_offset %lu move " - "len %lu len %lu\n", dst_offset, len, dst->len); - BUG_ON(1); - } - if (dst_offset < src_offset) { - memcpy_extent_buffer(dst, dst_offset, src_offset, len); - return; - } - while (len > 0) { - dst_i = (start_offset + dst_end) >> PAGE_CACHE_SHIFT; - src_i = (start_offset + src_end) >> PAGE_CACHE_SHIFT; - - dst_off_in_page = (start_offset + dst_end) & - ((unsigned long)PAGE_CACHE_SIZE - 1); - src_off_in_page = (start_offset + src_end) & - ((unsigned long)PAGE_CACHE_SIZE - 1); - - cur = min_t(unsigned long, len, src_off_in_page + 1); - cur = min(cur, dst_off_in_page + 1); - move_pages(extent_buffer_page(dst, dst_i), - extent_buffer_page(dst, src_i), - dst_off_in_page - cur + 1, - src_off_in_page - cur + 1, cur); - - dst_end -= cur; - src_end -= cur; - len -= cur; - } -} - -int try_release_extent_buffer(struct page *page, gfp_t mask) -{ - struct extent_buffer *eb; - - /* - * We need to make sure noboody is attaching this page to an eb right - 
* now. - */ - spin_lock(&page->mapping->private_lock); - if (!PagePrivate(page)) { - spin_unlock(&page->mapping->private_lock); - return 1; - } - - eb = (struct extent_buffer *)page->private; - BUG_ON(!eb); - - /* - * This is a little awful but should be ok, we need to make sure that - * the eb doesn't disappear out from under us while we're looking at - * this page. - */ - spin_lock(&eb->refs_lock); - if (atomic_read(&eb->refs) != 1 || extent_buffer_under_io(eb)) { - spin_unlock(&eb->refs_lock); - spin_unlock(&page->mapping->private_lock); - return 0; - } - spin_unlock(&page->mapping->private_lock); - - if ((mask & GFP_NOFS) == GFP_NOFS) - mask = GFP_NOFS; - - /* - * If tree ref isn't set then we know the ref on this eb is a real ref, - * so just return, this page will likely be freed soon anyway. - */ - if (!test_and_clear_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags)) { - spin_unlock(&eb->refs_lock); - return 0; - } - release_extent_buffer(eb, mask); - - return 1; -} diff --git a/ANDROID_3.4.5/fs/btrfs/extent_io.h b/ANDROID_3.4.5/fs/btrfs/extent_io.h deleted file mode 100644 index b516c3b8..00000000 --- a/ANDROID_3.4.5/fs/btrfs/extent_io.h +++ /dev/null @@ -1,331 +0,0 @@ -#ifndef __EXTENTIO__ -#define __EXTENTIO__ - -#include <linux/rbtree.h> - -/* bits for the extent state */ -#define EXTENT_DIRTY 1 -#define EXTENT_WRITEBACK (1 << 1) -#define EXTENT_UPTODATE (1 << 2) -#define EXTENT_LOCKED (1 << 3) -#define EXTENT_NEW (1 << 4) -#define EXTENT_DELALLOC (1 << 5) -#define EXTENT_DEFRAG (1 << 6) -#define EXTENT_DEFRAG_DONE (1 << 7) -#define EXTENT_BUFFER_FILLED (1 << 8) -#define EXTENT_BOUNDARY (1 << 9) -#define EXTENT_NODATASUM (1 << 10) -#define EXTENT_DO_ACCOUNTING (1 << 11) -#define EXTENT_FIRST_DELALLOC (1 << 12) -#define EXTENT_NEED_WAIT (1 << 13) -#define EXTENT_DAMAGED (1 << 14) -#define EXTENT_IOBITS (EXTENT_LOCKED | EXTENT_WRITEBACK) -#define EXTENT_CTLBITS (EXTENT_DO_ACCOUNTING | EXTENT_FIRST_DELALLOC) - -/* - * flags for bio submission. The high bits indicate the compression - * type for this bio - */ -#define EXTENT_BIO_COMPRESSED 1 -#define EXTENT_BIO_FLAG_SHIFT 16 - -/* these are bit numbers for test/set bit */ -#define EXTENT_BUFFER_UPTODATE 0 -#define EXTENT_BUFFER_BLOCKING 1 -#define EXTENT_BUFFER_DIRTY 2 -#define EXTENT_BUFFER_CORRUPT 3 -#define EXTENT_BUFFER_READAHEAD 4 /* this got triggered by readahead */ -#define EXTENT_BUFFER_TREE_REF 5 -#define EXTENT_BUFFER_STALE 6 -#define EXTENT_BUFFER_WRITEBACK 7 -#define EXTENT_BUFFER_IOERR 8 - -/* these are flags for extent_clear_unlock_delalloc */ -#define EXTENT_CLEAR_UNLOCK_PAGE 0x1 -#define EXTENT_CLEAR_UNLOCK 0x2 -#define EXTENT_CLEAR_DELALLOC 0x4 -#define EXTENT_CLEAR_DIRTY 0x8 -#define EXTENT_SET_WRITEBACK 0x10 -#define EXTENT_END_WRITEBACK 0x20 -#define EXTENT_SET_PRIVATE2 0x40 -#define EXTENT_CLEAR_ACCOUNTING 0x80 - -/* - * page->private values. Every page that is controlled by the extent - * map has page->private set to one. 
- */ -#define EXTENT_PAGE_PRIVATE 1 -#define EXTENT_PAGE_PRIVATE_FIRST_PAGE 3 - -struct extent_state; -struct btrfs_root; - -typedef int (extent_submit_bio_hook_t)(struct inode *inode, int rw, - struct bio *bio, int mirror_num, - unsigned long bio_flags, u64 bio_offset); -struct extent_io_ops { - int (*fill_delalloc)(struct inode *inode, struct page *locked_page, - u64 start, u64 end, int *page_started, - unsigned long *nr_written); - int (*writepage_start_hook)(struct page *page, u64 start, u64 end); - int (*writepage_io_hook)(struct page *page, u64 start, u64 end); - extent_submit_bio_hook_t *submit_bio_hook; - int (*merge_bio_hook)(struct page *page, unsigned long offset, - size_t size, struct bio *bio, - unsigned long bio_flags); - int (*readpage_io_hook)(struct page *page, u64 start, u64 end); - int (*readpage_io_failed_hook)(struct page *page, int failed_mirror); - int (*writepage_io_failed_hook)(struct bio *bio, struct page *page, - u64 start, u64 end, - struct extent_state *state); - int (*readpage_end_io_hook)(struct page *page, u64 start, u64 end, - struct extent_state *state, int mirror); - int (*writepage_end_io_hook)(struct page *page, u64 start, u64 end, - struct extent_state *state, int uptodate); - void (*set_bit_hook)(struct inode *inode, struct extent_state *state, - int *bits); - void (*clear_bit_hook)(struct inode *inode, struct extent_state *state, - int *bits); - void (*merge_extent_hook)(struct inode *inode, - struct extent_state *new, - struct extent_state *other); - void (*split_extent_hook)(struct inode *inode, - struct extent_state *orig, u64 split); - int (*write_cache_pages_lock_hook)(struct page *page, void *data, - void (*flush_fn)(void *)); -}; - -struct extent_io_tree { - struct rb_root state; - struct radix_tree_root buffer; - struct address_space *mapping; - u64 dirty_bytes; - int track_uptodate; - spinlock_t lock; - spinlock_t buffer_lock; - struct extent_io_ops *ops; -}; - -struct extent_state { - u64 start; - u64 end; /* inclusive */ - struct rb_node rb_node; - - /* ADD NEW ELEMENTS AFTER THIS */ - struct extent_io_tree *tree; - wait_queue_head_t wq; - atomic_t refs; - unsigned long state; - - /* for use by the FS */ - u64 private; - - struct list_head leak_list; -}; - -#define INLINE_EXTENT_BUFFER_PAGES 16 -#define MAX_INLINE_EXTENT_BUFFER_SIZE (INLINE_EXTENT_BUFFER_PAGES * PAGE_CACHE_SIZE) -struct extent_buffer { - u64 start; - unsigned long len; - unsigned long map_start; - unsigned long map_len; - unsigned long bflags; - struct extent_io_tree *tree; - spinlock_t refs_lock; - atomic_t refs; - atomic_t io_pages; - int read_mirror; - struct list_head leak_list; - struct rcu_head rcu_head; - pid_t lock_owner; - - /* count of read lock holders on the extent buffer */ - atomic_t write_locks; - atomic_t read_locks; - atomic_t blocking_writers; - atomic_t blocking_readers; - atomic_t spinning_readers; - atomic_t spinning_writers; - int lock_nested; - - /* protects write locks */ - rwlock_t lock; - - /* readers use lock_wq while they wait for the write - * lock holders to unlock - */ - wait_queue_head_t write_lock_wq; - - /* writers use read_lock_wq while they wait for readers - * to unlock - */ - wait_queue_head_t read_lock_wq; - wait_queue_head_t lock_wq; - struct page *inline_pages[INLINE_EXTENT_BUFFER_PAGES]; - struct page **pages; -}; - -static inline void extent_set_compress_type(unsigned long *bio_flags, - int compress_type) -{ - *bio_flags |= compress_type << EXTENT_BIO_FLAG_SHIFT; -} - -static inline int extent_compress_type(unsigned long 
bio_flags) -{ - return bio_flags >> EXTENT_BIO_FLAG_SHIFT; -} - -struct extent_map_tree; - -typedef struct extent_map *(get_extent_t)(struct inode *inode, - struct page *page, - size_t pg_offset, - u64 start, u64 len, - int create); - -void extent_io_tree_init(struct extent_io_tree *tree, - struct address_space *mapping); -int try_release_extent_mapping(struct extent_map_tree *map, - struct extent_io_tree *tree, struct page *page, - gfp_t mask); -int try_release_extent_buffer(struct page *page, gfp_t mask); -int try_release_extent_state(struct extent_map_tree *map, - struct extent_io_tree *tree, struct page *page, - gfp_t mask); -int lock_extent(struct extent_io_tree *tree, u64 start, u64 end); -int lock_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, - int bits, struct extent_state **cached); -int unlock_extent(struct extent_io_tree *tree, u64 start, u64 end); -int unlock_extent_cached(struct extent_io_tree *tree, u64 start, u64 end, - struct extent_state **cached, gfp_t mask); -int try_lock_extent(struct extent_io_tree *tree, u64 start, u64 end); -int extent_read_full_page(struct extent_io_tree *tree, struct page *page, - get_extent_t *get_extent, int mirror_num); -int __init extent_io_init(void); -void extent_io_exit(void); - -u64 count_range_bits(struct extent_io_tree *tree, - u64 *start, u64 search_end, - u64 max_bytes, unsigned long bits, int contig); - -void free_extent_state(struct extent_state *state); -int test_range_bit(struct extent_io_tree *tree, u64 start, u64 end, - int bits, int filled, struct extent_state *cached_state); -int clear_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, - int bits, gfp_t mask); -int clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, - int bits, int wake, int delete, struct extent_state **cached, - gfp_t mask); -int set_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, - int bits, gfp_t mask); -int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, - int bits, u64 *failed_start, - struct extent_state **cached_state, gfp_t mask); -int set_extent_uptodate(struct extent_io_tree *tree, u64 start, u64 end, - struct extent_state **cached_state, gfp_t mask); -int set_extent_new(struct extent_io_tree *tree, u64 start, u64 end, - gfp_t mask); -int set_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end, - gfp_t mask); -int clear_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end, - gfp_t mask); -int convert_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, - int bits, int clear_bits, gfp_t mask); -int set_extent_delalloc(struct extent_io_tree *tree, u64 start, u64 end, - struct extent_state **cached_state, gfp_t mask); -int find_first_extent_bit(struct extent_io_tree *tree, u64 start, - u64 *start_ret, u64 *end_ret, int bits); -struct extent_state *find_first_extent_bit_state(struct extent_io_tree *tree, - u64 start, int bits); -int extent_invalidatepage(struct extent_io_tree *tree, - struct page *page, unsigned long offset); -int extent_write_full_page(struct extent_io_tree *tree, struct page *page, - get_extent_t *get_extent, - struct writeback_control *wbc); -int extent_write_locked_range(struct extent_io_tree *tree, struct inode *inode, - u64 start, u64 end, get_extent_t *get_extent, - int mode); -int extent_writepages(struct extent_io_tree *tree, - struct address_space *mapping, - get_extent_t *get_extent, - struct writeback_control *wbc); -int btree_write_cache_pages(struct address_space *mapping, - struct writeback_control *wbc); -int extent_readpages(struct 
extent_io_tree *tree, - struct address_space *mapping, - struct list_head *pages, unsigned nr_pages, - get_extent_t get_extent); -int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, - __u64 start, __u64 len, get_extent_t *get_extent); -int set_state_private(struct extent_io_tree *tree, u64 start, u64 private); -int get_state_private(struct extent_io_tree *tree, u64 start, u64 *private); -void set_page_extent_mapped(struct page *page); - -struct extent_buffer *alloc_extent_buffer(struct extent_io_tree *tree, - u64 start, unsigned long len); -struct extent_buffer *find_extent_buffer(struct extent_io_tree *tree, - u64 start, unsigned long len); -void free_extent_buffer(struct extent_buffer *eb); -void free_extent_buffer_stale(struct extent_buffer *eb); -#define WAIT_NONE 0 -#define WAIT_COMPLETE 1 -#define WAIT_PAGE_LOCK 2 -int read_extent_buffer_pages(struct extent_io_tree *tree, - struct extent_buffer *eb, u64 start, int wait, - get_extent_t *get_extent, int mirror_num); -unsigned long num_extent_pages(u64 start, u64 len); -struct page *extent_buffer_page(struct extent_buffer *eb, unsigned long i); - -static inline void extent_buffer_get(struct extent_buffer *eb) -{ - atomic_inc(&eb->refs); -} - -int memcmp_extent_buffer(struct extent_buffer *eb, const void *ptrv, - unsigned long start, - unsigned long len); -void read_extent_buffer(struct extent_buffer *eb, void *dst, - unsigned long start, - unsigned long len); -void write_extent_buffer(struct extent_buffer *eb, const void *src, - unsigned long start, unsigned long len); -void copy_extent_buffer(struct extent_buffer *dst, struct extent_buffer *src, - unsigned long dst_offset, unsigned long src_offset, - unsigned long len); -void memcpy_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset, - unsigned long src_offset, unsigned long len); -void memmove_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset, - unsigned long src_offset, unsigned long len); -void memset_extent_buffer(struct extent_buffer *eb, char c, - unsigned long start, unsigned long len); -void wait_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, int bits); -void clear_extent_buffer_dirty(struct extent_buffer *eb); -int set_extent_buffer_dirty(struct extent_buffer *eb); -int set_extent_buffer_uptodate(struct extent_buffer *eb); -int clear_extent_buffer_uptodate(struct extent_buffer *eb); -int extent_buffer_uptodate(struct extent_buffer *eb); -int map_private_extent_buffer(struct extent_buffer *eb, unsigned long offset, - unsigned long min_len, char **map, - unsigned long *map_start, - unsigned long *map_len); -int extent_range_uptodate(struct extent_io_tree *tree, - u64 start, u64 end); -int extent_clear_unlock_delalloc(struct inode *inode, - struct extent_io_tree *tree, - u64 start, u64 end, struct page *locked_page, - unsigned long op); -struct bio * -btrfs_bio_alloc(struct block_device *bdev, u64 first_sector, int nr_vecs, - gfp_t gfp_flags); - -struct btrfs_mapping_tree; - -int repair_io_failure(struct btrfs_mapping_tree *map_tree, u64 start, - u64 length, u64 logical, struct page *page, - int mirror_num); -int end_extent_writepage(struct page *page, int err, u64 start, u64 end); -int repair_eb_io_failure(struct btrfs_root *root, struct extent_buffer *eb, - int mirror_num); -#endif diff --git a/ANDROID_3.4.5/fs/btrfs/extent_map.c b/ANDROID_3.4.5/fs/btrfs/extent_map.c deleted file mode 100644 index 7c97b330..00000000 --- a/ANDROID_3.4.5/fs/btrfs/extent_map.c +++ /dev/null @@ -1,363 +0,0 @@ -#include <linux/err.h> 
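/*
 * extent_map.c caches logical-to-disk extent translations for an inode (or
 * any other user of extent_map_tree): an rb-tree of refcounted extent_map
 * structs keyed by logical start offset, protected by tree->lock (a rwlock),
 * with adjacent compatible mappings merged on insert and on unpin.
 *
 * A typical caller, roughly (sketch, not taken from this file):
 *
 *	em = alloc_extent_map();                (returned with refs == 1)
 *	em->start = start; em->len = len; em->block_start = disk_bytenr;
 *	write_lock(&tree->lock);
 *	ret = add_extent_mapping(tree, em);     (the tree takes its own ref)
 *	write_unlock(&tree->lock);
 *	free_extent_map(em);                    (drop the caller's ref)
 */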
-#include <linux/slab.h> -#include <linux/module.h> -#include <linux/spinlock.h> -#include <linux/hardirq.h> -#include "ctree.h" -#include "extent_map.h" - - -static struct kmem_cache *extent_map_cache; - -int __init extent_map_init(void) -{ - extent_map_cache = kmem_cache_create("extent_map", - sizeof(struct extent_map), 0, - SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL); - if (!extent_map_cache) - return -ENOMEM; - return 0; -} - -void extent_map_exit(void) -{ - if (extent_map_cache) - kmem_cache_destroy(extent_map_cache); -} - -/** - * extent_map_tree_init - initialize extent map tree - * @tree: tree to initialize - * - * Initialize the extent tree @tree. Should be called for each new inode - * or other user of the extent_map interface. - */ -void extent_map_tree_init(struct extent_map_tree *tree) -{ - tree->map = RB_ROOT; - rwlock_init(&tree->lock); -} - -/** - * alloc_extent_map - allocate new extent map structure - * - * Allocate a new extent_map structure. The new structure is - * returned with a reference count of one and needs to be - * freed using free_extent_map() - */ -struct extent_map *alloc_extent_map(void) -{ - struct extent_map *em; - em = kmem_cache_alloc(extent_map_cache, GFP_NOFS); - if (!em) - return NULL; - em->in_tree = 0; - em->flags = 0; - em->compress_type = BTRFS_COMPRESS_NONE; - atomic_set(&em->refs, 1); - return em; -} - -/** - * free_extent_map - drop reference count of an extent_map - * @em: extent map beeing releasead - * - * Drops the reference out on @em by one and free the structure - * if the reference count hits zero. - */ -void free_extent_map(struct extent_map *em) -{ - if (!em) - return; - WARN_ON(atomic_read(&em->refs) == 0); - if (atomic_dec_and_test(&em->refs)) { - WARN_ON(em->in_tree); - kmem_cache_free(extent_map_cache, em); - } -} - -static struct rb_node *tree_insert(struct rb_root *root, u64 offset, - struct rb_node *node) -{ - struct rb_node **p = &root->rb_node; - struct rb_node *parent = NULL; - struct extent_map *entry; - - while (*p) { - parent = *p; - entry = rb_entry(parent, struct extent_map, rb_node); - - WARN_ON(!entry->in_tree); - - if (offset < entry->start) - p = &(*p)->rb_left; - else if (offset >= extent_map_end(entry)) - p = &(*p)->rb_right; - else - return parent; - } - - entry = rb_entry(node, struct extent_map, rb_node); - entry->in_tree = 1; - rb_link_node(node, parent, p); - rb_insert_color(node, root); - return NULL; -} - -/* - * search through the tree for an extent_map with a given offset. 
If - * it can't be found, try to find some neighboring extents - */ -static struct rb_node *__tree_search(struct rb_root *root, u64 offset, - struct rb_node **prev_ret, - struct rb_node **next_ret) -{ - struct rb_node *n = root->rb_node; - struct rb_node *prev = NULL; - struct rb_node *orig_prev = NULL; - struct extent_map *entry; - struct extent_map *prev_entry = NULL; - - while (n) { - entry = rb_entry(n, struct extent_map, rb_node); - prev = n; - prev_entry = entry; - - WARN_ON(!entry->in_tree); - - if (offset < entry->start) - n = n->rb_left; - else if (offset >= extent_map_end(entry)) - n = n->rb_right; - else - return n; - } - - if (prev_ret) { - orig_prev = prev; - while (prev && offset >= extent_map_end(prev_entry)) { - prev = rb_next(prev); - prev_entry = rb_entry(prev, struct extent_map, rb_node); - } - *prev_ret = prev; - prev = orig_prev; - } - - if (next_ret) { - prev_entry = rb_entry(prev, struct extent_map, rb_node); - while (prev && offset < prev_entry->start) { - prev = rb_prev(prev); - prev_entry = rb_entry(prev, struct extent_map, rb_node); - } - *next_ret = prev; - } - return NULL; -} - -/* check to see if two extent_map structs are adjacent and safe to merge */ -static int mergable_maps(struct extent_map *prev, struct extent_map *next) -{ - if (test_bit(EXTENT_FLAG_PINNED, &prev->flags)) - return 0; - - /* - * don't merge compressed extents, we need to know their - * actual size - */ - if (test_bit(EXTENT_FLAG_COMPRESSED, &prev->flags)) - return 0; - - if (extent_map_end(prev) == next->start && - prev->flags == next->flags && - prev->bdev == next->bdev && - ((next->block_start == EXTENT_MAP_HOLE && - prev->block_start == EXTENT_MAP_HOLE) || - (next->block_start == EXTENT_MAP_INLINE && - prev->block_start == EXTENT_MAP_INLINE) || - (next->block_start == EXTENT_MAP_DELALLOC && - prev->block_start == EXTENT_MAP_DELALLOC) || - (next->block_start < EXTENT_MAP_LAST_BYTE - 1 && - next->block_start == extent_map_block_end(prev)))) { - return 1; - } - return 0; -} - -static void try_merge_map(struct extent_map_tree *tree, struct extent_map *em) -{ - struct extent_map *merge = NULL; - struct rb_node *rb; - - if (em->start != 0) { - rb = rb_prev(&em->rb_node); - if (rb) - merge = rb_entry(rb, struct extent_map, rb_node); - if (rb && mergable_maps(merge, em)) { - em->start = merge->start; - em->len += merge->len; - em->block_len += merge->block_len; - em->block_start = merge->block_start; - merge->in_tree = 0; - rb_erase(&merge->rb_node, &tree->map); - free_extent_map(merge); - } - } - - rb = rb_next(&em->rb_node); - if (rb) - merge = rb_entry(rb, struct extent_map, rb_node); - if (rb && mergable_maps(em, merge)) { - em->len += merge->len; - em->block_len += merge->len; - rb_erase(&merge->rb_node, &tree->map); - merge->in_tree = 0; - free_extent_map(merge); - } -} - -int unpin_extent_cache(struct extent_map_tree *tree, u64 start, u64 len) -{ - int ret = 0; - struct extent_map *em; - - write_lock(&tree->lock); - em = lookup_extent_mapping(tree, start, len); - - WARN_ON(!em || em->start != start); - - if (!em) - goto out; - - clear_bit(EXTENT_FLAG_PINNED, &em->flags); - - try_merge_map(tree, em); - - free_extent_map(em); -out: - write_unlock(&tree->lock); - return ret; - -} - -/** - * add_extent_mapping - add new extent map to the extent tree - * @tree: tree to insert new map in - * @em: map to insert - * - * Insert @em into @tree or perform a simple forward/backward merge with - * existing mappings. 
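 * If a mapping already overlaps [em->start, em->start + em->len) the call
 * fails with -EEXIST and nothing is inserted.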
The extent_map struct passed in will be inserted - * into the tree directly, with an additional reference taken, or a - * reference dropped if the merge attempt was successful. - */ -int add_extent_mapping(struct extent_map_tree *tree, - struct extent_map *em) -{ - int ret = 0; - struct rb_node *rb; - struct extent_map *exist; - - exist = lookup_extent_mapping(tree, em->start, em->len); - if (exist) { - free_extent_map(exist); - ret = -EEXIST; - goto out; - } - rb = tree_insert(&tree->map, em->start, &em->rb_node); - if (rb) { - ret = -EEXIST; - goto out; - } - atomic_inc(&em->refs); - - try_merge_map(tree, em); -out: - return ret; -} - -/* simple helper to do math around the end of an extent, handling wrap */ -static u64 range_end(u64 start, u64 len) -{ - if (start + len < start) - return (u64)-1; - return start + len; -} - -struct extent_map *__lookup_extent_mapping(struct extent_map_tree *tree, - u64 start, u64 len, int strict) -{ - struct extent_map *em; - struct rb_node *rb_node; - struct rb_node *prev = NULL; - struct rb_node *next = NULL; - u64 end = range_end(start, len); - - rb_node = __tree_search(&tree->map, start, &prev, &next); - if (!rb_node) { - if (prev) - rb_node = prev; - else if (next) - rb_node = next; - else - return NULL; - } - - em = rb_entry(rb_node, struct extent_map, rb_node); - - if (strict && !(end > em->start && start < extent_map_end(em))) - return NULL; - - atomic_inc(&em->refs); - return em; -} - -/** - * lookup_extent_mapping - lookup extent_map - * @tree: tree to lookup in - * @start: byte offset to start the search - * @len: length of the lookup range - * - * Find and return the first extent_map struct in @tree that intersects the - * [start, len] range. There may be additional objects in the tree that - * intersect, so check the object returned carefully to make sure that no - * additional lookups are needed. - */ -struct extent_map *lookup_extent_mapping(struct extent_map_tree *tree, - u64 start, u64 len) -{ - return __lookup_extent_mapping(tree, start, len, 1); -} - -/** - * search_extent_mapping - find a nearby extent map - * @tree: tree to lookup in - * @start: byte offset to start the search - * @len: length of the lookup range - * - * Find and return the first extent_map struct in @tree that intersects the - * [start, len] range. - * - * If one can't be found, any nearby extent may be returned - */ -struct extent_map *search_extent_mapping(struct extent_map_tree *tree, - u64 start, u64 len) -{ - return __lookup_extent_mapping(tree, start, len, 0); -} - -/** - * remove_extent_mapping - removes an extent_map from the extent tree - * @tree: extent tree to remove from - * @em: extent map beeing removed - * - * Removes @em from @tree. 
No reference counts are dropped, and no checks - * are done to see if the range is in use - */ -int remove_extent_mapping(struct extent_map_tree *tree, struct extent_map *em) -{ - int ret = 0; - - WARN_ON(test_bit(EXTENT_FLAG_PINNED, &em->flags)); - rb_erase(&em->rb_node, &tree->map); - em->in_tree = 0; - return ret; -} diff --git a/ANDROID_3.4.5/fs/btrfs/extent_map.h b/ANDROID_3.4.5/fs/btrfs/extent_map.h deleted file mode 100644 index 1195f097..00000000 --- a/ANDROID_3.4.5/fs/btrfs/extent_map.h +++ /dev/null @@ -1,66 +0,0 @@ -#ifndef __EXTENTMAP__ -#define __EXTENTMAP__ - -#include <linux/rbtree.h> - -#define EXTENT_MAP_LAST_BYTE (u64)-4 -#define EXTENT_MAP_HOLE (u64)-3 -#define EXTENT_MAP_INLINE (u64)-2 -#define EXTENT_MAP_DELALLOC (u64)-1 - -/* bits for the flags field */ -#define EXTENT_FLAG_PINNED 0 /* this entry not yet on disk, don't free it */ -#define EXTENT_FLAG_COMPRESSED 1 -#define EXTENT_FLAG_VACANCY 2 /* no file extent item found */ -#define EXTENT_FLAG_PREALLOC 3 /* pre-allocated extent */ - -struct extent_map { - struct rb_node rb_node; - - /* all of these are in bytes */ - u64 start; - u64 len; - u64 orig_start; - u64 block_start; - u64 block_len; - unsigned long flags; - struct block_device *bdev; - atomic_t refs; - unsigned int in_tree; - unsigned int compress_type; -}; - -struct extent_map_tree { - struct rb_root map; - rwlock_t lock; -}; - -static inline u64 extent_map_end(struct extent_map *em) -{ - if (em->start + em->len < em->start) - return (u64)-1; - return em->start + em->len; -} - -static inline u64 extent_map_block_end(struct extent_map *em) -{ - if (em->block_start + em->block_len < em->block_start) - return (u64)-1; - return em->block_start + em->block_len; -} - -void extent_map_tree_init(struct extent_map_tree *tree); -struct extent_map *lookup_extent_mapping(struct extent_map_tree *tree, - u64 start, u64 len); -int add_extent_mapping(struct extent_map_tree *tree, - struct extent_map *em); -int remove_extent_mapping(struct extent_map_tree *tree, struct extent_map *em); - -struct extent_map *alloc_extent_map(void); -void free_extent_map(struct extent_map *em); -int __init extent_map_init(void); -void extent_map_exit(void); -int unpin_extent_cache(struct extent_map_tree *tree, u64 start, u64 len); -struct extent_map *search_extent_mapping(struct extent_map_tree *tree, - u64 start, u64 len); -#endif diff --git a/ANDROID_3.4.5/fs/btrfs/file-item.c b/ANDROID_3.4.5/fs/btrfs/file-item.c deleted file mode 100644 index 5d158d32..00000000 --- a/ANDROID_3.4.5/fs/btrfs/file-item.c +++ /dev/null @@ -1,861 +0,0 @@ -/* - * Copyright (C) 2007 Oracle. All rights reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public - * License v2 as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public - * License along with this program; if not, write to the - * Free Software Foundation, Inc., 59 Temple Place - Suite 330, - * Boston, MA 021110-1307, USA. 
- */ - -#include <linux/bio.h> -#include <linux/slab.h> -#include <linux/pagemap.h> -#include <linux/highmem.h> -#include "ctree.h" -#include "disk-io.h" -#include "transaction.h" -#include "print-tree.h" - -#define __MAX_CSUM_ITEMS(r, size) ((((BTRFS_LEAF_DATA_SIZE(r) - \ - sizeof(struct btrfs_item) * 2) / \ - size) - 1)) - -#define MAX_CSUM_ITEMS(r, size) (min(__MAX_CSUM_ITEMS(r, size), PAGE_CACHE_SIZE)) - -#define MAX_ORDERED_SUM_BYTES(r) ((PAGE_SIZE - \ - sizeof(struct btrfs_ordered_sum)) / \ - sizeof(struct btrfs_sector_sum) * \ - (r)->sectorsize - (r)->sectorsize) - -int btrfs_insert_file_extent(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - u64 objectid, u64 pos, - u64 disk_offset, u64 disk_num_bytes, - u64 num_bytes, u64 offset, u64 ram_bytes, - u8 compression, u8 encryption, u16 other_encoding) -{ - int ret = 0; - struct btrfs_file_extent_item *item; - struct btrfs_key file_key; - struct btrfs_path *path; - struct extent_buffer *leaf; - - path = btrfs_alloc_path(); - if (!path) - return -ENOMEM; - file_key.objectid = objectid; - file_key.offset = pos; - btrfs_set_key_type(&file_key, BTRFS_EXTENT_DATA_KEY); - - path->leave_spinning = 1; - ret = btrfs_insert_empty_item(trans, root, path, &file_key, - sizeof(*item)); - if (ret < 0) - goto out; - BUG_ON(ret); /* Can't happen */ - leaf = path->nodes[0]; - item = btrfs_item_ptr(leaf, path->slots[0], - struct btrfs_file_extent_item); - btrfs_set_file_extent_disk_bytenr(leaf, item, disk_offset); - btrfs_set_file_extent_disk_num_bytes(leaf, item, disk_num_bytes); - btrfs_set_file_extent_offset(leaf, item, offset); - btrfs_set_file_extent_num_bytes(leaf, item, num_bytes); - btrfs_set_file_extent_ram_bytes(leaf, item, ram_bytes); - btrfs_set_file_extent_generation(leaf, item, trans->transid); - btrfs_set_file_extent_type(leaf, item, BTRFS_FILE_EXTENT_REG); - btrfs_set_file_extent_compression(leaf, item, compression); - btrfs_set_file_extent_encryption(leaf, item, encryption); - btrfs_set_file_extent_other_encoding(leaf, item, other_encoding); - - btrfs_mark_buffer_dirty(leaf); -out: - btrfs_free_path(path); - return ret; -} - -struct btrfs_csum_item *btrfs_lookup_csum(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct btrfs_path *path, - u64 bytenr, int cow) -{ - int ret; - struct btrfs_key file_key; - struct btrfs_key found_key; - struct btrfs_csum_item *item; - struct extent_buffer *leaf; - u64 csum_offset = 0; - u16 csum_size = btrfs_super_csum_size(root->fs_info->super_copy); - int csums_in_item; - - file_key.objectid = BTRFS_EXTENT_CSUM_OBJECTID; - file_key.offset = bytenr; - btrfs_set_key_type(&file_key, BTRFS_EXTENT_CSUM_KEY); - ret = btrfs_search_slot(trans, root, &file_key, path, 0, cow); - if (ret < 0) - goto fail; - leaf = path->nodes[0]; - if (ret > 0) { - ret = 1; - if (path->slots[0] == 0) - goto fail; - path->slots[0]--; - btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); - if (btrfs_key_type(&found_key) != BTRFS_EXTENT_CSUM_KEY) - goto fail; - - csum_offset = (bytenr - found_key.offset) >> - root->fs_info->sb->s_blocksize_bits; - csums_in_item = btrfs_item_size_nr(leaf, path->slots[0]); - csums_in_item /= csum_size; - - if (csum_offset >= csums_in_item) { - ret = -EFBIG; - goto fail; - } - } - item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_csum_item); - item = (struct btrfs_csum_item *)((unsigned char *)item + - csum_offset * csum_size); - return item; -fail: - if (ret > 0) - ret = -ENOENT; - return ERR_PTR(ret); -} - - -int btrfs_lookup_file_extent(struct btrfs_trans_handle 
*trans, - struct btrfs_root *root, - struct btrfs_path *path, u64 objectid, - u64 offset, int mod) -{ - int ret; - struct btrfs_key file_key; - int ins_len = mod < 0 ? -1 : 0; - int cow = mod != 0; - - file_key.objectid = objectid; - file_key.offset = offset; - btrfs_set_key_type(&file_key, BTRFS_EXTENT_DATA_KEY); - ret = btrfs_search_slot(trans, root, &file_key, path, ins_len, cow); - return ret; -} - - -static int __btrfs_lookup_bio_sums(struct btrfs_root *root, - struct inode *inode, struct bio *bio, - u64 logical_offset, u32 *dst, int dio) -{ - u32 sum; - struct bio_vec *bvec = bio->bi_io_vec; - int bio_index = 0; - u64 offset = 0; - u64 item_start_offset = 0; - u64 item_last_offset = 0; - u64 disk_bytenr; - u32 diff; - u16 csum_size = btrfs_super_csum_size(root->fs_info->super_copy); - int ret; - struct btrfs_path *path; - struct btrfs_csum_item *item = NULL; - struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; - - path = btrfs_alloc_path(); - if (!path) - return -ENOMEM; - if (bio->bi_size > PAGE_CACHE_SIZE * 8) - path->reada = 2; - - WARN_ON(bio->bi_vcnt <= 0); - - /* - * the free space stuff is only read when it hasn't been - * updated in the current transaction. So, we can safely - * read from the commit root and sidestep a nasty deadlock - * between reading the free space cache and updating the csum tree. - */ - if (btrfs_is_free_space_inode(root, inode)) { - path->search_commit_root = 1; - path->skip_locking = 1; - } - - disk_bytenr = (u64)bio->bi_sector << 9; - if (dio) - offset = logical_offset; - while (bio_index < bio->bi_vcnt) { - if (!dio) - offset = page_offset(bvec->bv_page) + bvec->bv_offset; - ret = btrfs_find_ordered_sum(inode, offset, disk_bytenr, &sum); - if (ret == 0) - goto found; - - if (!item || disk_bytenr < item_start_offset || - disk_bytenr >= item_last_offset) { - struct btrfs_key found_key; - u32 item_size; - - if (item) - btrfs_release_path(path); - item = btrfs_lookup_csum(NULL, root->fs_info->csum_root, - path, disk_bytenr, 0); - if (IS_ERR(item)) { - ret = PTR_ERR(item); - if (ret == -ENOENT || ret == -EFBIG) - ret = 0; - sum = 0; - if (BTRFS_I(inode)->root->root_key.objectid == - BTRFS_DATA_RELOC_TREE_OBJECTID) { - set_extent_bits(io_tree, offset, - offset + bvec->bv_len - 1, - EXTENT_NODATASUM, GFP_NOFS); - } else { - printk(KERN_INFO "btrfs no csum found " - "for inode %llu start %llu\n", - (unsigned long long) - btrfs_ino(inode), - (unsigned long long)offset); - } - item = NULL; - btrfs_release_path(path); - goto found; - } - btrfs_item_key_to_cpu(path->nodes[0], &found_key, - path->slots[0]); - - item_start_offset = found_key.offset; - item_size = btrfs_item_size_nr(path->nodes[0], - path->slots[0]); - item_last_offset = item_start_offset + - (item_size / csum_size) * - root->sectorsize; - item = btrfs_item_ptr(path->nodes[0], path->slots[0], - struct btrfs_csum_item); - } - /* - * this byte range must be able to fit inside - * a single leaf so it will also fit inside a u32 - */ - diff = disk_bytenr - item_start_offset; - diff = diff / root->sectorsize; - diff = diff * csum_size; - - read_extent_buffer(path->nodes[0], &sum, - ((unsigned long)item) + diff, - csum_size); -found: - if (dst) - *dst++ = sum; - else - set_state_private(io_tree, offset, sum); - disk_bytenr += bvec->bv_len; - offset += bvec->bv_len; - bio_index++; - bvec++; - } - btrfs_free_path(path); - return 0; -} - -int btrfs_lookup_bio_sums(struct btrfs_root *root, struct inode *inode, - struct bio *bio, u32 *dst) -{ - return __btrfs_lookup_bio_sums(root, inode, bio, 0, dst, 
0); -} - -int btrfs_lookup_bio_sums_dio(struct btrfs_root *root, struct inode *inode, - struct bio *bio, u64 offset, u32 *dst) -{ - return __btrfs_lookup_bio_sums(root, inode, bio, offset, dst, 1); -} - -int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end, - struct list_head *list, int search_commit) -{ - struct btrfs_key key; - struct btrfs_path *path; - struct extent_buffer *leaf; - struct btrfs_ordered_sum *sums; - struct btrfs_sector_sum *sector_sum; - struct btrfs_csum_item *item; - LIST_HEAD(tmplist); - unsigned long offset; - int ret; - size_t size; - u64 csum_end; - u16 csum_size = btrfs_super_csum_size(root->fs_info->super_copy); - - path = btrfs_alloc_path(); - if (!path) - return -ENOMEM; - - if (search_commit) { - path->skip_locking = 1; - path->reada = 2; - path->search_commit_root = 1; - } - - key.objectid = BTRFS_EXTENT_CSUM_OBJECTID; - key.offset = start; - key.type = BTRFS_EXTENT_CSUM_KEY; - - ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); - if (ret < 0) - goto fail; - if (ret > 0 && path->slots[0] > 0) { - leaf = path->nodes[0]; - btrfs_item_key_to_cpu(leaf, &key, path->slots[0] - 1); - if (key.objectid == BTRFS_EXTENT_CSUM_OBJECTID && - key.type == BTRFS_EXTENT_CSUM_KEY) { - offset = (start - key.offset) >> - root->fs_info->sb->s_blocksize_bits; - if (offset * csum_size < - btrfs_item_size_nr(leaf, path->slots[0] - 1)) - path->slots[0]--; - } - } - - while (start <= end) { - leaf = path->nodes[0]; - if (path->slots[0] >= btrfs_header_nritems(leaf)) { - ret = btrfs_next_leaf(root, path); - if (ret < 0) - goto fail; - if (ret > 0) - break; - leaf = path->nodes[0]; - } - - btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); - if (key.objectid != BTRFS_EXTENT_CSUM_OBJECTID || - key.type != BTRFS_EXTENT_CSUM_KEY) - break; - - btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); - if (key.offset > end) - break; - - if (key.offset > start) - start = key.offset; - - size = btrfs_item_size_nr(leaf, path->slots[0]); - csum_end = key.offset + (size / csum_size) * root->sectorsize; - if (csum_end <= start) { - path->slots[0]++; - continue; - } - - csum_end = min(csum_end, end + 1); - item = btrfs_item_ptr(path->nodes[0], path->slots[0], - struct btrfs_csum_item); - while (start < csum_end) { - size = min_t(size_t, csum_end - start, - MAX_ORDERED_SUM_BYTES(root)); - sums = kzalloc(btrfs_ordered_sum_size(root, size), - GFP_NOFS); - if (!sums) { - ret = -ENOMEM; - goto fail; - } - - sector_sum = sums->sums; - sums->bytenr = start; - sums->len = size; - - offset = (start - key.offset) >> - root->fs_info->sb->s_blocksize_bits; - offset *= csum_size; - - while (size > 0) { - read_extent_buffer(path->nodes[0], - &sector_sum->sum, - ((unsigned long)item) + - offset, csum_size); - sector_sum->bytenr = start; - - size -= root->sectorsize; - start += root->sectorsize; - offset += csum_size; - sector_sum++; - } - list_add_tail(&sums->list, &tmplist); - } - path->slots[0]++; - } - ret = 0; -fail: - while (ret < 0 && !list_empty(&tmplist)) { - sums = list_entry(&tmplist, struct btrfs_ordered_sum, list); - list_del(&sums->list); - kfree(sums); - } - list_splice_tail(&tmplist, list); - - btrfs_free_path(path); - return ret; -} - -int btrfs_csum_one_bio(struct btrfs_root *root, struct inode *inode, - struct bio *bio, u64 file_start, int contig) -{ - struct btrfs_ordered_sum *sums; - struct btrfs_sector_sum *sector_sum; - struct btrfs_ordered_extent *ordered; - char *data; - struct bio_vec *bvec = bio->bi_io_vec; - int bio_index = 0; - unsigned long total_bytes = 0; - unsigned
long this_sum_bytes = 0; - u64 offset; - u64 disk_bytenr; - - WARN_ON(bio->bi_vcnt <= 0); - sums = kzalloc(btrfs_ordered_sum_size(root, bio->bi_size), GFP_NOFS); - if (!sums) - return -ENOMEM; - - sector_sum = sums->sums; - disk_bytenr = (u64)bio->bi_sector << 9; - sums->len = bio->bi_size; - INIT_LIST_HEAD(&sums->list); - - if (contig) - offset = file_start; - else - offset = page_offset(bvec->bv_page) + bvec->bv_offset; - - ordered = btrfs_lookup_ordered_extent(inode, offset); - BUG_ON(!ordered); /* Logic error */ - sums->bytenr = ordered->start; - - while (bio_index < bio->bi_vcnt) { - if (!contig) - offset = page_offset(bvec->bv_page) + bvec->bv_offset; - - if (!contig && (offset >= ordered->file_offset + ordered->len || - offset < ordered->file_offset)) { - unsigned long bytes_left; - sums->len = this_sum_bytes; - this_sum_bytes = 0; - btrfs_add_ordered_sum(inode, ordered, sums); - btrfs_put_ordered_extent(ordered); - - bytes_left = bio->bi_size - total_bytes; - - sums = kzalloc(btrfs_ordered_sum_size(root, bytes_left), - GFP_NOFS); - BUG_ON(!sums); /* -ENOMEM */ - sector_sum = sums->sums; - sums->len = bytes_left; - ordered = btrfs_lookup_ordered_extent(inode, offset); - BUG_ON(!ordered); /* Logic error */ - sums->bytenr = ordered->start; - } - - data = kmap_atomic(bvec->bv_page); - sector_sum->sum = ~(u32)0; - sector_sum->sum = btrfs_csum_data(root, - data + bvec->bv_offset, - sector_sum->sum, - bvec->bv_len); - kunmap_atomic(data); - btrfs_csum_final(sector_sum->sum, - (char *)&sector_sum->sum); - sector_sum->bytenr = disk_bytenr; - - sector_sum++; - bio_index++; - total_bytes += bvec->bv_len; - this_sum_bytes += bvec->bv_len; - disk_bytenr += bvec->bv_len; - offset += bvec->bv_len; - bvec++; - } - this_sum_bytes = 0; - btrfs_add_ordered_sum(inode, ordered, sums); - btrfs_put_ordered_extent(ordered); - return 0; -} - -/* - * helper function for csum removal, this expects the - * key to describe the csum pointed to by the path, and it expects - * the csum to overlap the range [bytenr, len] - * - * The csum should not be entirely contained in the range and the - * range should not be entirely contained in the csum. - * - * This calls btrfs_truncate_item with the correct args based on the - * overlap, and fixes up the key as required.
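Two concrete instances of the cases handled below (assuming 4K blocks and 4-byte crc32c checksums, i.e. csum_size == 4):

	/*
	 * Item at key.offset == 64K covering [64K, 128K): 16 sums, 64-byte item.
	 *
	 * Tail overlap, removing csums for [112K, 160K):
	 *	new_size = ((112K - 64K) >> 12) * 4 = 48 bytes; the end is cut off.
	 *
	 * Head overlap, removing csums for [32K, 80K):
	 *	new_size = ((128K - 80K) >> 12) * 4 = 48 bytes; the front is cut
	 *	off and key.offset moves up to 80K.
	 */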
- */ -static noinline void truncate_one_csum(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct btrfs_path *path, - struct btrfs_key *key, - u64 bytenr, u64 len) -{ - struct extent_buffer *leaf; - u16 csum_size = btrfs_super_csum_size(root->fs_info->super_copy); - u64 csum_end; - u64 end_byte = bytenr + len; - u32 blocksize_bits = root->fs_info->sb->s_blocksize_bits; - - leaf = path->nodes[0]; - csum_end = btrfs_item_size_nr(leaf, path->slots[0]) / csum_size; - csum_end <<= root->fs_info->sb->s_blocksize_bits; - csum_end += key->offset; - - if (key->offset < bytenr && csum_end <= end_byte) { - /* - * [ bytenr - len ] - * [ ] - * [csum ] - * A simple truncate off the end of the item - */ - u32 new_size = (bytenr - key->offset) >> blocksize_bits; - new_size *= csum_size; - btrfs_truncate_item(trans, root, path, new_size, 1); - } else if (key->offset >= bytenr && csum_end > end_byte && - end_byte > key->offset) { - /* - * [ bytenr - len ] - * [ ] - * [csum ] - * we need to truncate from the beginning of the csum - */ - u32 new_size = (csum_end - end_byte) >> blocksize_bits; - new_size *= csum_size; - - btrfs_truncate_item(trans, root, path, new_size, 0); - - key->offset = end_byte; - btrfs_set_item_key_safe(trans, root, path, key); - } else { - BUG(); - } -} - -/* - * deletes the csum items from the csum tree for a given - * range of bytes. - */ -int btrfs_del_csums(struct btrfs_trans_handle *trans, - struct btrfs_root *root, u64 bytenr, u64 len) -{ - struct btrfs_path *path; - struct btrfs_key key; - u64 end_byte = bytenr + len; - u64 csum_end; - struct extent_buffer *leaf; - int ret; - u16 csum_size = btrfs_super_csum_size(root->fs_info->super_copy); - int blocksize_bits = root->fs_info->sb->s_blocksize_bits; - - root = root->fs_info->csum_root; - - path = btrfs_alloc_path(); - if (!path) - return -ENOMEM; - - while (1) { - key.objectid = BTRFS_EXTENT_CSUM_OBJECTID; - key.offset = end_byte - 1; - key.type = BTRFS_EXTENT_CSUM_KEY; - - path->leave_spinning = 1; - ret = btrfs_search_slot(trans, root, &key, path, -1, 1); - if (ret > 0) { - if (path->slots[0] == 0) - break; - path->slots[0]--; - } else if (ret < 0) { - break; - } - - leaf = path->nodes[0]; - btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); - - if (key.objectid != BTRFS_EXTENT_CSUM_OBJECTID || - key.type != BTRFS_EXTENT_CSUM_KEY) { - break; - } - - if (key.offset >= end_byte) - break; - - csum_end = btrfs_item_size_nr(leaf, path->slots[0]) / csum_size; - csum_end <<= blocksize_bits; - csum_end += key.offset; - - /* this csum ends before we start, we're done */ - if (csum_end <= bytenr) - break; - - /* delete the entire item, it is inside our range */ - if (key.offset >= bytenr && csum_end <= end_byte) { - ret = btrfs_del_item(trans, root, path); - if (ret) - goto out; - if (key.offset == bytenr) - break; - } else if (key.offset < bytenr && csum_end > end_byte) { - unsigned long offset; - unsigned long shift_len; - unsigned long item_offset; - /* - * [ bytenr - len ] - * [csum ] - * - * Our bytes are in the middle of the csum, - * we need to split this item and insert a new one. - * - * But we can't drop the path because the - * csum could change, get removed, extended etc. - * - * The trick here is the max size of a csum item leaves - * enough room in the tree block for a single - * item header. So, we split the item in place, - * adding a new header pointing to the existing - * bytes. Then we loop around again and we have - * a nicely formed csum item that we can neatly - * truncate. 
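A worked example of that split (same 4K block / 4-byte csum_size assumptions as above):

	/*
	 * An item at key.offset == 1M covering 256K of data holds 64 sums
	 * (256 bytes).  Deleting the csums for [1M+64K, 1M+128K) zeroes item
	 * bytes 64..127, splits the item at byte offset 64, and leaves a
	 * second item keyed at 1M+64K whose zeroed head is trimmed away by
	 * truncate_one_csum() on the next pass through the loop.
	 */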
- */ - offset = (bytenr - key.offset) >> blocksize_bits; - offset *= csum_size; - - shift_len = (len >> blocksize_bits) * csum_size; - - item_offset = btrfs_item_ptr_offset(leaf, - path->slots[0]); - - memset_extent_buffer(leaf, 0, item_offset + offset, - shift_len); - key.offset = bytenr; - - /* - * btrfs_split_item returns -EAGAIN when the - * item changed size or key - */ - ret = btrfs_split_item(trans, root, path, &key, offset); - if (ret && ret != -EAGAIN) { - btrfs_abort_transaction(trans, root, ret); - goto out; - } - - key.offset = end_byte - 1; - } else { - truncate_one_csum(trans, root, path, &key, bytenr, len); - if (key.offset < bytenr) - break; - } - btrfs_release_path(path); - } - ret = 0; -out: - btrfs_free_path(path); - return ret; -} - -int btrfs_csum_file_blocks(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct btrfs_ordered_sum *sums) -{ - u64 bytenr; - int ret; - struct btrfs_key file_key; - struct btrfs_key found_key; - u64 next_offset; - u64 total_bytes = 0; - int found_next; - struct btrfs_path *path; - struct btrfs_csum_item *item; - struct btrfs_csum_item *item_end; - struct extent_buffer *leaf = NULL; - u64 csum_offset; - struct btrfs_sector_sum *sector_sum; - u32 nritems; - u32 ins_size; - u16 csum_size = btrfs_super_csum_size(root->fs_info->super_copy); - - path = btrfs_alloc_path(); - if (!path) - return -ENOMEM; - - sector_sum = sums->sums; -again: - next_offset = (u64)-1; - found_next = 0; - file_key.objectid = BTRFS_EXTENT_CSUM_OBJECTID; - file_key.offset = sector_sum->bytenr; - bytenr = sector_sum->bytenr; - btrfs_set_key_type(&file_key, BTRFS_EXTENT_CSUM_KEY); - - item = btrfs_lookup_csum(trans, root, path, sector_sum->bytenr, 1); - if (!IS_ERR(item)) { - leaf = path->nodes[0]; - ret = 0; - goto found; - } - ret = PTR_ERR(item); - if (ret != -EFBIG && ret != -ENOENT) - goto fail_unlock; - - if (ret == -EFBIG) { - u32 item_size; - /* we found one, but it isn't big enough yet */ - leaf = path->nodes[0]; - item_size = btrfs_item_size_nr(leaf, path->slots[0]); - if ((item_size / csum_size) >= - MAX_CSUM_ITEMS(root, csum_size)) { - /* already at max size, make a new one */ - goto insert; - } - } else { - int slot = path->slots[0] + 1; - /* we didn't find a csum item, insert one */ - nritems = btrfs_header_nritems(path->nodes[0]); - if (path->slots[0] >= nritems - 1) { - ret = btrfs_next_leaf(root, path); - if (ret == 1) - found_next = 1; - if (ret != 0) - goto insert; - slot = 0; - } - btrfs_item_key_to_cpu(path->nodes[0], &found_key, slot); - if (found_key.objectid != BTRFS_EXTENT_CSUM_OBJECTID || - found_key.type != BTRFS_EXTENT_CSUM_KEY) { - found_next = 1; - goto insert; - } - next_offset = found_key.offset; - found_next = 1; - goto insert; - } - - /* - * at this point, we know the tree has an item, but it isn't big - * enough yet to put our csum in. 
Grow it - */ - btrfs_release_path(path); - ret = btrfs_search_slot(trans, root, &file_key, path, - csum_size, 1); - if (ret < 0) - goto fail_unlock; - - if (ret > 0) { - if (path->slots[0] == 0) - goto insert; - path->slots[0]--; - } - - leaf = path->nodes[0]; - btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); - csum_offset = (bytenr - found_key.offset) >> - root->fs_info->sb->s_blocksize_bits; - - if (btrfs_key_type(&found_key) != BTRFS_EXTENT_CSUM_KEY || - found_key.objectid != BTRFS_EXTENT_CSUM_OBJECTID || - csum_offset >= MAX_CSUM_ITEMS(root, csum_size)) { - goto insert; - } - - if (csum_offset >= btrfs_item_size_nr(leaf, path->slots[0]) / - csum_size) { - u32 diff = (csum_offset + 1) * csum_size; - - /* - * is the item big enough already? we dropped our lock - * before and need to recheck - */ - if (diff < btrfs_item_size_nr(leaf, path->slots[0])) - goto csum; - - diff = diff - btrfs_item_size_nr(leaf, path->slots[0]); - if (diff != csum_size) - goto insert; - - btrfs_extend_item(trans, root, path, diff); - goto csum; - } - -insert: - btrfs_release_path(path); - csum_offset = 0; - if (found_next) { - u64 tmp = total_bytes + root->sectorsize; - u64 next_sector = sector_sum->bytenr; - struct btrfs_sector_sum *next = sector_sum + 1; - - while (tmp < sums->len) { - if (next_sector + root->sectorsize != next->bytenr) - break; - tmp += root->sectorsize; - next_sector = next->bytenr; - next++; - } - tmp = min(tmp, next_offset - file_key.offset); - tmp >>= root->fs_info->sb->s_blocksize_bits; - tmp = max((u64)1, tmp); - tmp = min(tmp, (u64)MAX_CSUM_ITEMS(root, csum_size)); - ins_size = csum_size * tmp; - } else { - ins_size = csum_size; - } - path->leave_spinning = 1; - ret = btrfs_insert_empty_item(trans, root, path, &file_key, - ins_size); - path->leave_spinning = 0; - if (ret < 0) - goto fail_unlock; - if (ret != 0) { - WARN_ON(1); - goto fail_unlock; - } -csum: - leaf = path->nodes[0]; - item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_csum_item); - ret = 0; - item = (struct btrfs_csum_item *)((unsigned char *)item + - csum_offset * csum_size); -found: - item_end = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_csum_item); - item_end = (struct btrfs_csum_item *)((unsigned char *)item_end + - btrfs_item_size_nr(leaf, path->slots[0])); -next_sector: - - write_extent_buffer(leaf, &sector_sum->sum, (unsigned long)item, csum_size); - - total_bytes += root->sectorsize; - sector_sum++; - if (total_bytes < sums->len) { - item = (struct btrfs_csum_item *)((char *)item + - csum_size); - if (item < item_end && bytenr + PAGE_CACHE_SIZE == - sector_sum->bytenr) { - bytenr = sector_sum->bytenr; - goto next_sector; - } - } - - btrfs_mark_buffer_dirty(path->nodes[0]); - if (total_bytes < sums->len) { - btrfs_release_path(path); - cond_resched(); - goto again; - } -out: - btrfs_free_path(path); - return ret; - -fail_unlock: - goto out; -} diff --git a/ANDROID_3.4.5/fs/btrfs/file.c b/ANDROID_3.4.5/fs/btrfs/file.c deleted file mode 100644 index 53bf2d76..00000000 --- a/ANDROID_3.4.5/fs/btrfs/file.c +++ /dev/null @@ -1,1908 +0,0 @@ -/* - * Copyright (C) 2007 Oracle. All rights reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public - * License v2 as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public - * License along with this program; if not, write to the - * Free Software Foundation, Inc., 59 Temple Place - Suite 330, - * Boston, MA 021110-1307, USA. - */ - -#include <linux/fs.h> -#include <linux/pagemap.h> -#include <linux/highmem.h> -#include <linux/time.h> -#include <linux/init.h> -#include <linux/string.h> -#include <linux/backing-dev.h> -#include <linux/mpage.h> -#include <linux/falloc.h> -#include <linux/swap.h> -#include <linux/writeback.h> -#include <linux/statfs.h> -#include <linux/compat.h> -#include <linux/slab.h> -#include "ctree.h" -#include "disk-io.h" -#include "transaction.h" -#include "btrfs_inode.h" -#include "ioctl.h" -#include "print-tree.h" -#include "tree-log.h" -#include "locking.h" -#include "compat.h" - -/* - * when auto defrag is enabled we - * queue up these defrag structs to remember which - * inodes need defragging passes - */ -struct inode_defrag { - struct rb_node rb_node; - /* objectid */ - u64 ino; - /* - * transid where the defrag was added, we search for - * extents newer than this - */ - u64 transid; - - /* root objectid */ - u64 root; - - /* last offset we were able to defrag */ - u64 last_offset; - - /* if we've wrapped around back to zero once already */ - int cycled; -}; - -/* pop a record for an inode into the defrag tree. The lock - * must be held already - * - * If you're inserting a record for an older transid than an - * existing record, the transid already in the tree is lowered - * - * If an existing record is found the defrag item you - * pass in is freed - */ -static void __btrfs_add_inode_defrag(struct inode *inode, - struct inode_defrag *defrag) -{ - struct btrfs_root *root = BTRFS_I(inode)->root; - struct inode_defrag *entry; - struct rb_node **p; - struct rb_node *parent = NULL; - - p = &root->fs_info->defrag_inodes.rb_node; - while (*p) { - parent = *p; - entry = rb_entry(parent, struct inode_defrag, rb_node); - - if (defrag->ino < entry->ino) - p = &parent->rb_left; - else if (defrag->ino > entry->ino) - p = &parent->rb_right; - else { - /* if we're reinserting an entry for - * an old defrag run, make sure to - * lower the transid of our existing record - */ - if (defrag->transid < entry->transid) - entry->transid = defrag->transid; - if (defrag->last_offset > entry->last_offset) - entry->last_offset = defrag->last_offset; - goto exists; - } - } - BTRFS_I(inode)->in_defrag = 1; - rb_link_node(&defrag->rb_node, parent, p); - rb_insert_color(&defrag->rb_node, &root->fs_info->defrag_inodes); - return; - -exists: - kfree(defrag); - return; - -} - -/* - * insert a defrag record for this inode if auto defrag is - * enabled - */ -int btrfs_add_inode_defrag(struct btrfs_trans_handle *trans, - struct inode *inode) -{ - struct btrfs_root *root = BTRFS_I(inode)->root; - struct inode_defrag *defrag; - u64 transid; - - if (!btrfs_test_opt(root, AUTO_DEFRAG)) - return 0; - - if (btrfs_fs_closing(root->fs_info)) - return 0; - - if (BTRFS_I(inode)->in_defrag) - return 0; - - if (trans) - transid = trans->transid; - else - transid = BTRFS_I(inode)->root->last_trans; - - defrag = kzalloc(sizeof(*defrag), GFP_NOFS); - if (!defrag) - return -ENOMEM; - - defrag->ino = btrfs_ino(inode); - defrag->transid = transid; - defrag->root = root->root_key.objectid; - - spin_lock(&root->fs_info->defrag_inodes_lock); - if (!BTRFS_I(inode)->in_defrag) - __btrfs_add_inode_defrag(inode, defrag); - else - kfree(defrag); - 
spin_unlock(&root->fs_info->defrag_inodes_lock); - return 0; -} - -/* - * must be called with the defrag_inodes lock held - */ -struct inode_defrag *btrfs_find_defrag_inode(struct btrfs_fs_info *info, u64 ino, - struct rb_node **next) -{ - struct inode_defrag *entry = NULL; - struct rb_node *p; - struct rb_node *parent = NULL; - - p = info->defrag_inodes.rb_node; - while (p) { - parent = p; - entry = rb_entry(parent, struct inode_defrag, rb_node); - - if (ino < entry->ino) - p = parent->rb_left; - else if (ino > entry->ino) - p = parent->rb_right; - else - return entry; - } - - if (next) { - while (parent && ino > entry->ino) { - parent = rb_next(parent); - entry = rb_entry(parent, struct inode_defrag, rb_node); - } - *next = parent; - } - return NULL; -} - -/* - * run through the list of inodes in the FS that need - * defragging - */ -int btrfs_run_defrag_inodes(struct btrfs_fs_info *fs_info) -{ - struct inode_defrag *defrag; - struct btrfs_root *inode_root; - struct inode *inode; - struct rb_node *n; - struct btrfs_key key; - struct btrfs_ioctl_defrag_range_args range; - u64 first_ino = 0; - int num_defrag; - int defrag_batch = 1024; - - memset(&range, 0, sizeof(range)); - range.len = (u64)-1; - - atomic_inc(&fs_info->defrag_running); - spin_lock(&fs_info->defrag_inodes_lock); - while(1) { - n = NULL; - - /* find an inode to defrag */ - defrag = btrfs_find_defrag_inode(fs_info, first_ino, &n); - if (!defrag) { - if (n) - defrag = rb_entry(n, struct inode_defrag, rb_node); - else if (first_ino) { - first_ino = 0; - continue; - } else { - break; - } - } - - /* remove it from the rbtree */ - first_ino = defrag->ino + 1; - rb_erase(&defrag->rb_node, &fs_info->defrag_inodes); - - if (btrfs_fs_closing(fs_info)) - goto next_free; - - spin_unlock(&fs_info->defrag_inodes_lock); - - /* get the inode */ - key.objectid = defrag->root; - btrfs_set_key_type(&key, BTRFS_ROOT_ITEM_KEY); - key.offset = (u64)-1; - inode_root = btrfs_read_fs_root_no_name(fs_info, &key); - if (IS_ERR(inode_root)) - goto next; - - key.objectid = defrag->ino; - btrfs_set_key_type(&key, BTRFS_INODE_ITEM_KEY); - key.offset = 0; - - inode = btrfs_iget(fs_info->sb, &key, inode_root, NULL); - if (IS_ERR(inode)) - goto next; - - /* do a chunk of defrag */ - BTRFS_I(inode)->in_defrag = 0; - range.start = defrag->last_offset; - num_defrag = btrfs_defrag_file(inode, NULL, &range, defrag->transid, - defrag_batch); - /* - * if we filled the whole defrag batch, there - * must be more work to do. Queue this defrag - * again - */ - if (num_defrag == defrag_batch) { - defrag->last_offset = range.start; - __btrfs_add_inode_defrag(inode, defrag); - /* - * we don't want to kfree defrag, we added it back to - * the rbtree - */ - defrag = NULL; - } else if (defrag->last_offset && !defrag->cycled) { - /* - * we didn't fill our defrag batch, but - * we didn't start at zero. Make sure we loop - * around to the start of the file. - */ - defrag->last_offset = 0; - defrag->cycled = 1; - __btrfs_add_inode_defrag(inode, defrag); - defrag = NULL; - } - - iput(inode); -next: - spin_lock(&fs_info->defrag_inodes_lock); -next_free: - kfree(defrag); - } - spin_unlock(&fs_info->defrag_inodes_lock); - - atomic_dec(&fs_info->defrag_running); - - /* - * during unmount, we use the transaction_wait queue to - * wait for the defragger to stop - */ - wake_up(&fs_info->transaction_wait); - return 0; -} - -/* simple helper to fault in pages and copy. This should go away - * and be replaced with calls into generic code. 
- */ -static noinline int btrfs_copy_from_user(loff_t pos, int num_pages, - size_t write_bytes, - struct page **prepared_pages, - struct iov_iter *i) -{ - size_t copied = 0; - size_t total_copied = 0; - int pg = 0; - int offset = pos & (PAGE_CACHE_SIZE - 1); - - while (write_bytes > 0) { - size_t count = min_t(size_t, - PAGE_CACHE_SIZE - offset, write_bytes); - struct page *page = prepared_pages[pg]; - /* - * Copy data from userspace to the current page - * - * Disable pagefault to avoid recursive lock since - * the pages are already locked - */ - pagefault_disable(); - copied = iov_iter_copy_from_user_atomic(page, i, offset, count); - pagefault_enable(); - - /* Flush processor's dcache for this page */ - flush_dcache_page(page); - - /* - * if we get a partial write, we can end up with - * partially up to date pages. These add - * a lot of complexity, so make sure they don't - * happen by forcing this copy to be retried. - * - * The rest of the btrfs_file_write code will fall - * back to page at a time copies after we return 0. - */ - if (!PageUptodate(page) && copied < count) - copied = 0; - - iov_iter_advance(i, copied); - write_bytes -= copied; - total_copied += copied; - - /* Return to btrfs_file_aio_write to fault page */ - if (unlikely(copied == 0)) - break; - - if (unlikely(copied < PAGE_CACHE_SIZE - offset)) { - offset += copied; - } else { - pg++; - offset = 0; - } - } - return total_copied; -} - -/* - * unlocks pages after btrfs_file_write is done with them - */ -void btrfs_drop_pages(struct page **pages, size_t num_pages) -{ - size_t i; - for (i = 0; i < num_pages; i++) { - /* page checked is some magic around finding pages that - * have been modified without going through btrfs_set_page_dirty - * clear it here - */ - ClearPageChecked(pages[i]); - unlock_page(pages[i]); - mark_page_accessed(pages[i]); - page_cache_release(pages[i]); - } -} - -/* - * after copy_from_user, pages need to be dirtied and we need to make - * sure holes are created between the current EOF and the start of - * any next extents (if required). - * - * this also makes the decision about creating an inline extent vs - * doing real data extents, marking pages dirty and delalloc as required. - */ -int btrfs_dirty_pages(struct btrfs_root *root, struct inode *inode, - struct page **pages, size_t num_pages, - loff_t pos, size_t write_bytes, - struct extent_state **cached) -{ - int err = 0; - int i; - u64 num_bytes; - u64 start_pos; - u64 end_of_last_block; - u64 end_pos = pos + write_bytes; - loff_t isize = i_size_read(inode); - - start_pos = pos & ~((u64)root->sectorsize - 1); - num_bytes = (write_bytes + pos - start_pos + - root->sectorsize - 1) & ~((u64)root->sectorsize - 1); - - end_of_last_block = start_pos + num_bytes - 1; - err = btrfs_set_extent_delalloc(inode, start_pos, end_of_last_block, - cached); - if (err) - return err; - - for (i = 0; i < num_pages; i++) { - struct page *p = pages[i]; - SetPageUptodate(p); - ClearPageChecked(p); - set_page_dirty(p); - } - - /* - * we've only changed i_size in ram, and we haven't updated - * the disk i_size. There is no need to log the inode - * at this time. - */ - if (end_pos > isize) - i_size_write(inode, end_pos); - return 0; -} - -/* - * this drops all the extents in the cache that intersect the range - * [start, end]. Existing extents are split as required. 
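A worked example of the splitting described above:

	/*
	 * A cached mapping for [0, 128K) with [32K, 96K) dropped leaves two
	 * mappings, [0, 32K) and [96K, 128K).  For an uncompressed extent the
	 * second piece's block_start is advanced by the same 96K so it still
	 * points at the matching disk bytes; a compressed extent keeps the
	 * original block_start/block_len in both pieces, since the compressed
	 * data is only addressable as a whole.
	 */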
- */ -int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end, - int skip_pinned) -{ - struct extent_map *em; - struct extent_map *split = NULL; - struct extent_map *split2 = NULL; - struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree; - u64 len = end - start + 1; - int ret; - int testend = 1; - unsigned long flags; - int compressed = 0; - - WARN_ON(end < start); - if (end == (u64)-1) { - len = (u64)-1; - testend = 0; - } - while (1) { - if (!split) - split = alloc_extent_map(); - if (!split2) - split2 = alloc_extent_map(); - BUG_ON(!split || !split2); /* -ENOMEM */ - - write_lock(&em_tree->lock); - em = lookup_extent_mapping(em_tree, start, len); - if (!em) { - write_unlock(&em_tree->lock); - break; - } - flags = em->flags; - if (skip_pinned && test_bit(EXTENT_FLAG_PINNED, &em->flags)) { - if (testend && em->start + em->len >= start + len) { - free_extent_map(em); - write_unlock(&em_tree->lock); - break; - } - start = em->start + em->len; - if (testend) - len = start + len - (em->start + em->len); - free_extent_map(em); - write_unlock(&em_tree->lock); - continue; - } - compressed = test_bit(EXTENT_FLAG_COMPRESSED, &em->flags); - clear_bit(EXTENT_FLAG_PINNED, &em->flags); - remove_extent_mapping(em_tree, em); - - if (em->block_start < EXTENT_MAP_LAST_BYTE && - em->start < start) { - split->start = em->start; - split->len = start - em->start; - split->orig_start = em->orig_start; - split->block_start = em->block_start; - - if (compressed) - split->block_len = em->block_len; - else - split->block_len = split->len; - - split->bdev = em->bdev; - split->flags = flags; - split->compress_type = em->compress_type; - ret = add_extent_mapping(em_tree, split); - BUG_ON(ret); /* Logic error */ - free_extent_map(split); - split = split2; - split2 = NULL; - } - if (em->block_start < EXTENT_MAP_LAST_BYTE && - testend && em->start + em->len > start + len) { - u64 diff = start + len - em->start; - - split->start = start + len; - split->len = em->start + em->len - (start + len); - split->bdev = em->bdev; - split->flags = flags; - split->compress_type = em->compress_type; - - if (compressed) { - split->block_len = em->block_len; - split->block_start = em->block_start; - split->orig_start = em->orig_start; - } else { - split->block_len = split->len; - split->block_start = em->block_start + diff; - split->orig_start = split->start; - } - - ret = add_extent_mapping(em_tree, split); - BUG_ON(ret); /* Logic error */ - free_extent_map(split); - split = NULL; - } - write_unlock(&em_tree->lock); - - /* once for us */ - free_extent_map(em); - /* once for the tree*/ - free_extent_map(em); - } - if (split) - free_extent_map(split); - if (split2) - free_extent_map(split2); - return 0; -} - -/* - * this is very complex, but the basic idea is to drop all extents - * in the range start - end. hint_block is filled in with a block number - * that would be a good hint to the block allocator for this file. - * - * If an extent intersects the range but is not entirely inside the range - * it is either truncated or split. Anything entirely inside the range - * is deleted from the tree. 
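In outline, the loop below distinguishes four overlap cases (each branch also carries ASCII art):

	/*
	 * 1) drop range strictly inside the extent: duplicate the item, trim
	 *    the first copy to end at the drop start, then let a later pass
	 *    trim the second copy;
	 * 2) drop range covers the front: move the key to the drop end,
	 *    advance the extent's data offset and shrink its length;
	 * 3) drop range covers the back: shrink num_bytes so the extent ends
	 *    at the drop start;
	 * 4) extent entirely inside the drop range: delete the item and drop
	 *    its disk reference.
	 */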
- */ -int btrfs_drop_extents(struct btrfs_trans_handle *trans, struct inode *inode, - u64 start, u64 end, u64 *hint_byte, int drop_cache) -{ - struct btrfs_root *root = BTRFS_I(inode)->root; - struct extent_buffer *leaf; - struct btrfs_file_extent_item *fi; - struct btrfs_path *path; - struct btrfs_key key; - struct btrfs_key new_key; - u64 ino = btrfs_ino(inode); - u64 search_start = start; - u64 disk_bytenr = 0; - u64 num_bytes = 0; - u64 extent_offset = 0; - u64 extent_end = 0; - int del_nr = 0; - int del_slot = 0; - int extent_type; - int recow; - int ret; - int modify_tree = -1; - - if (drop_cache) - btrfs_drop_extent_cache(inode, start, end - 1, 0); - - path = btrfs_alloc_path(); - if (!path) - return -ENOMEM; - - if (start >= BTRFS_I(inode)->disk_i_size) - modify_tree = 0; - - while (1) { - recow = 0; - ret = btrfs_lookup_file_extent(trans, root, path, ino, - search_start, modify_tree); - if (ret < 0) - break; - if (ret > 0 && path->slots[0] > 0 && search_start == start) { - leaf = path->nodes[0]; - btrfs_item_key_to_cpu(leaf, &key, path->slots[0] - 1); - if (key.objectid == ino && - key.type == BTRFS_EXTENT_DATA_KEY) - path->slots[0]--; - } - ret = 0; -next_slot: - leaf = path->nodes[0]; - if (path->slots[0] >= btrfs_header_nritems(leaf)) { - BUG_ON(del_nr > 0); - ret = btrfs_next_leaf(root, path); - if (ret < 0) - break; - if (ret > 0) { - ret = 0; - break; - } - leaf = path->nodes[0]; - recow = 1; - } - - btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); - if (key.objectid > ino || - key.type > BTRFS_EXTENT_DATA_KEY || key.offset >= end) - break; - - fi = btrfs_item_ptr(leaf, path->slots[0], - struct btrfs_file_extent_item); - extent_type = btrfs_file_extent_type(leaf, fi); - - if (extent_type == BTRFS_FILE_EXTENT_REG || - extent_type == BTRFS_FILE_EXTENT_PREALLOC) { - disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi); - num_bytes = btrfs_file_extent_disk_num_bytes(leaf, fi); - extent_offset = btrfs_file_extent_offset(leaf, fi); - extent_end = key.offset + - btrfs_file_extent_num_bytes(leaf, fi); - } else if (extent_type == BTRFS_FILE_EXTENT_INLINE) { - extent_end = key.offset + - btrfs_file_extent_inline_len(leaf, fi); - } else { - WARN_ON(1); - extent_end = search_start; - } - - if (extent_end <= search_start) { - path->slots[0]++; - goto next_slot; - } - - search_start = max(key.offset, start); - if (recow || !modify_tree) { - modify_tree = -1; - btrfs_release_path(path); - continue; - } - - /* - * | - range to drop - | - * | -------- extent -------- | - */ - if (start > key.offset && end < extent_end) { - BUG_ON(del_nr > 0); - BUG_ON(extent_type == BTRFS_FILE_EXTENT_INLINE); - - memcpy(&new_key, &key, sizeof(new_key)); - new_key.offset = start; - ret = btrfs_duplicate_item(trans, root, path, - &new_key); - if (ret == -EAGAIN) { - btrfs_release_path(path); - continue; - } - if (ret < 0) - break; - - leaf = path->nodes[0]; - fi = btrfs_item_ptr(leaf, path->slots[0] - 1, - struct btrfs_file_extent_item); - btrfs_set_file_extent_num_bytes(leaf, fi, - start - key.offset); - - fi = btrfs_item_ptr(leaf, path->slots[0], - struct btrfs_file_extent_item); - - extent_offset += start - key.offset; - btrfs_set_file_extent_offset(leaf, fi, extent_offset); - btrfs_set_file_extent_num_bytes(leaf, fi, - extent_end - start); - btrfs_mark_buffer_dirty(leaf); - - if (disk_bytenr > 0) { - ret = btrfs_inc_extent_ref(trans, root, - disk_bytenr, num_bytes, 0, - root->root_key.objectid, - new_key.objectid, - start - extent_offset, 0); - BUG_ON(ret); /* -ENOMEM */ - *hint_byte = disk_bytenr; - } 
- key.offset = start; - } - /* - * | ---- range to drop ----- | - * | -------- extent -------- | - */ - if (start <= key.offset && end < extent_end) { - BUG_ON(extent_type == BTRFS_FILE_EXTENT_INLINE); - - memcpy(&new_key, &key, sizeof(new_key)); - new_key.offset = end; - btrfs_set_item_key_safe(trans, root, path, &new_key); - - extent_offset += end - key.offset; - btrfs_set_file_extent_offset(leaf, fi, extent_offset); - btrfs_set_file_extent_num_bytes(leaf, fi, - extent_end - end); - btrfs_mark_buffer_dirty(leaf); - if (disk_bytenr > 0) { - inode_sub_bytes(inode, end - key.offset); - *hint_byte = disk_bytenr; - } - break; - } - - search_start = extent_end; - /* - * | ---- range to drop ----- | - * | -------- extent -------- | - */ - if (start > key.offset && end >= extent_end) { - BUG_ON(del_nr > 0); - BUG_ON(extent_type == BTRFS_FILE_EXTENT_INLINE); - - btrfs_set_file_extent_num_bytes(leaf, fi, - start - key.offset); - btrfs_mark_buffer_dirty(leaf); - if (disk_bytenr > 0) { - inode_sub_bytes(inode, extent_end - start); - *hint_byte = disk_bytenr; - } - if (end == extent_end) - break; - - path->slots[0]++; - goto next_slot; - } - - /* - * | ---- range to drop ----- | - * | ------ extent ------ | - */ - if (start <= key.offset && end >= extent_end) { - if (del_nr == 0) { - del_slot = path->slots[0]; - del_nr = 1; - } else { - BUG_ON(del_slot + del_nr != path->slots[0]); - del_nr++; - } - - if (extent_type == BTRFS_FILE_EXTENT_INLINE) { - inode_sub_bytes(inode, - extent_end - key.offset); - extent_end = ALIGN(extent_end, - root->sectorsize); - } else if (disk_bytenr > 0) { - ret = btrfs_free_extent(trans, root, - disk_bytenr, num_bytes, 0, - root->root_key.objectid, - key.objectid, key.offset - - extent_offset, 0); - BUG_ON(ret); /* -ENOMEM */ - inode_sub_bytes(inode, - extent_end - key.offset); - *hint_byte = disk_bytenr; - } - - if (end == extent_end) - break; - - if (path->slots[0] + 1 < btrfs_header_nritems(leaf)) { - path->slots[0]++; - goto next_slot; - } - - ret = btrfs_del_items(trans, root, path, del_slot, - del_nr); - if (ret) { - btrfs_abort_transaction(trans, root, ret); - goto out; - } - - del_nr = 0; - del_slot = 0; - - btrfs_release_path(path); - continue; - } - - BUG_ON(1); - } - - if (!ret && del_nr > 0) { - ret = btrfs_del_items(trans, root, path, del_slot, del_nr); - if (ret) - btrfs_abort_transaction(trans, root, ret); - } - -out: - btrfs_free_path(path); - return ret; -} - -static int extent_mergeable(struct extent_buffer *leaf, int slot, - u64 objectid, u64 bytenr, u64 orig_offset, - u64 *start, u64 *end) -{ - struct btrfs_file_extent_item *fi; - struct btrfs_key key; - u64 extent_end; - - if (slot < 0 || slot >= btrfs_header_nritems(leaf)) - return 0; - - btrfs_item_key_to_cpu(leaf, &key, slot); - if (key.objectid != objectid || key.type != BTRFS_EXTENT_DATA_KEY) - return 0; - - fi = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item); - if (btrfs_file_extent_type(leaf, fi) != BTRFS_FILE_EXTENT_REG || - btrfs_file_extent_disk_bytenr(leaf, fi) != bytenr || - btrfs_file_extent_offset(leaf, fi) != key.offset - orig_offset || - btrfs_file_extent_compression(leaf, fi) || - btrfs_file_extent_encryption(leaf, fi) || - btrfs_file_extent_other_encoding(leaf, fi)) - return 0; - - extent_end = key.offset + btrfs_file_extent_num_bytes(leaf, fi); - if ((*start && *start != key.offset) || (*end && *end != extent_end)) - return 0; - - *start = key.offset; - *end = extent_end; - return 1; -} - -/* - * Mark extent in the range start - end as written. 
- * - * This changes extent type from 'pre-allocated' to 'regular'. If only - * part of extent is marked as written, the extent will be split into - * two or three. - */ -int btrfs_mark_extent_written(struct btrfs_trans_handle *trans, - struct inode *inode, u64 start, u64 end) -{ - struct btrfs_root *root = BTRFS_I(inode)->root; - struct extent_buffer *leaf; - struct btrfs_path *path; - struct btrfs_file_extent_item *fi; - struct btrfs_key key; - struct btrfs_key new_key; - u64 bytenr; - u64 num_bytes; - u64 extent_end; - u64 orig_offset; - u64 other_start; - u64 other_end; - u64 split; - int del_nr = 0; - int del_slot = 0; - int recow; - int ret; - u64 ino = btrfs_ino(inode); - - btrfs_drop_extent_cache(inode, start, end - 1, 0); - - path = btrfs_alloc_path(); - if (!path) - return -ENOMEM; -again: - recow = 0; - split = start; - key.objectid = ino; - key.type = BTRFS_EXTENT_DATA_KEY; - key.offset = split; - - ret = btrfs_search_slot(trans, root, &key, path, -1, 1); - if (ret < 0) - goto out; - if (ret > 0 && path->slots[0] > 0) - path->slots[0]--; - - leaf = path->nodes[0]; - btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); - BUG_ON(key.objectid != ino || key.type != BTRFS_EXTENT_DATA_KEY); - fi = btrfs_item_ptr(leaf, path->slots[0], - struct btrfs_file_extent_item); - BUG_ON(btrfs_file_extent_type(leaf, fi) != - BTRFS_FILE_EXTENT_PREALLOC); - extent_end = key.offset + btrfs_file_extent_num_bytes(leaf, fi); - BUG_ON(key.offset > start || extent_end < end); - - bytenr = btrfs_file_extent_disk_bytenr(leaf, fi); - num_bytes = btrfs_file_extent_disk_num_bytes(leaf, fi); - orig_offset = key.offset - btrfs_file_extent_offset(leaf, fi); - memcpy(&new_key, &key, sizeof(new_key)); - - if (start == key.offset && end < extent_end) { - other_start = 0; - other_end = start; - if (extent_mergeable(leaf, path->slots[0] - 1, - ino, bytenr, orig_offset, - &other_start, &other_end)) { - new_key.offset = end; - btrfs_set_item_key_safe(trans, root, path, &new_key); - fi = btrfs_item_ptr(leaf, path->slots[0], - struct btrfs_file_extent_item); - btrfs_set_file_extent_num_bytes(leaf, fi, - extent_end - end); - btrfs_set_file_extent_offset(leaf, fi, - end - orig_offset); - fi = btrfs_item_ptr(leaf, path->slots[0] - 1, - struct btrfs_file_extent_item); - btrfs_set_file_extent_num_bytes(leaf, fi, - end - other_start); - btrfs_mark_buffer_dirty(leaf); - goto out; - } - } - - if (start > key.offset && end == extent_end) { - other_start = end; - other_end = 0; - if (extent_mergeable(leaf, path->slots[0] + 1, - ino, bytenr, orig_offset, - &other_start, &other_end)) { - fi = btrfs_item_ptr(leaf, path->slots[0], - struct btrfs_file_extent_item); - btrfs_set_file_extent_num_bytes(leaf, fi, - start - key.offset); - path->slots[0]++; - new_key.offset = start; - btrfs_set_item_key_safe(trans, root, path, &new_key); - - fi = btrfs_item_ptr(leaf, path->slots[0], - struct btrfs_file_extent_item); - btrfs_set_file_extent_num_bytes(leaf, fi, - other_end - start); - btrfs_set_file_extent_offset(leaf, fi, - start - orig_offset); - btrfs_mark_buffer_dirty(leaf); - goto out; - } - } - - while (start > key.offset || end < extent_end) { - if (key.offset == start) - split = end; - - new_key.offset = split; - ret = btrfs_duplicate_item(trans, root, path, &new_key); - if (ret == -EAGAIN) { - btrfs_release_path(path); - goto again; - } - if (ret < 0) { - btrfs_abort_transaction(trans, root, ret); - goto out; - } - - leaf = path->nodes[0]; - fi = btrfs_item_ptr(leaf, path->slots[0] - 1, - struct btrfs_file_extent_item); - 
btrfs_set_file_extent_num_bytes(leaf, fi, - split - key.offset); - - fi = btrfs_item_ptr(leaf, path->slots[0], - struct btrfs_file_extent_item); - - btrfs_set_file_extent_offset(leaf, fi, split - orig_offset); - btrfs_set_file_extent_num_bytes(leaf, fi, - extent_end - split); - btrfs_mark_buffer_dirty(leaf); - - ret = btrfs_inc_extent_ref(trans, root, bytenr, num_bytes, 0, - root->root_key.objectid, - ino, orig_offset, 0); - BUG_ON(ret); /* -ENOMEM */ - - if (split == start) { - key.offset = start; - } else { - BUG_ON(start != key.offset); - path->slots[0]--; - extent_end = end; - } - recow = 1; - } - - other_start = end; - other_end = 0; - if (extent_mergeable(leaf, path->slots[0] + 1, - ino, bytenr, orig_offset, - &other_start, &other_end)) { - if (recow) { - btrfs_release_path(path); - goto again; - } - extent_end = other_end; - del_slot = path->slots[0] + 1; - del_nr++; - ret = btrfs_free_extent(trans, root, bytenr, num_bytes, - 0, root->root_key.objectid, - ino, orig_offset, 0); - BUG_ON(ret); /* -ENOMEM */ - } - other_start = 0; - other_end = start; - if (extent_mergeable(leaf, path->slots[0] - 1, - ino, bytenr, orig_offset, - &other_start, &other_end)) { - if (recow) { - btrfs_release_path(path); - goto again; - } - key.offset = other_start; - del_slot = path->slots[0]; - del_nr++; - ret = btrfs_free_extent(trans, root, bytenr, num_bytes, - 0, root->root_key.objectid, - ino, orig_offset, 0); - BUG_ON(ret); /* -ENOMEM */ - } - if (del_nr == 0) { - fi = btrfs_item_ptr(leaf, path->slots[0], - struct btrfs_file_extent_item); - btrfs_set_file_extent_type(leaf, fi, - BTRFS_FILE_EXTENT_REG); - btrfs_mark_buffer_dirty(leaf); - } else { - fi = btrfs_item_ptr(leaf, del_slot - 1, - struct btrfs_file_extent_item); - btrfs_set_file_extent_type(leaf, fi, - BTRFS_FILE_EXTENT_REG); - btrfs_set_file_extent_num_bytes(leaf, fi, - extent_end - key.offset); - btrfs_mark_buffer_dirty(leaf); - - ret = btrfs_del_items(trans, root, path, del_slot, del_nr); - if (ret < 0) { - btrfs_abort_transaction(trans, root, ret); - goto out; - } - } -out: - btrfs_free_path(path); - return 0; -} - -/* - * on error we return an unlocked page and the error value - * on success we return a locked page and 0 - */ -static int prepare_uptodate_page(struct page *page, u64 pos, - bool force_uptodate) -{ - int ret = 0; - - if (((pos & (PAGE_CACHE_SIZE - 1)) || force_uptodate) && - !PageUptodate(page)) { - ret = btrfs_readpage(NULL, page); - if (ret) - return ret; - lock_page(page); - if (!PageUptodate(page)) { - unlock_page(page); - return -EIO; - } - } - return 0; -} - -/* - * this gets pages into the page cache and locks them down, it also properly - * waits for data=ordered extents to finish before allowing the pages to be - * modified. 
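One detail the comment above leaves implicit:

	/*
	 * Waiting for an ordered extent cannot safely be done while the range
	 * is locked in the io_tree (completing the ordered extent needs to
	 * lock that same range), so on overlap the code unlocks, releases the
	 * pages, waits, and jumps back to "again" to rebuild and re-check
	 * everything from scratch.
	 */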
- */ -static noinline int prepare_pages(struct btrfs_root *root, struct file *file, - struct page **pages, size_t num_pages, - loff_t pos, unsigned long first_index, - size_t write_bytes, bool force_uptodate) -{ - struct extent_state *cached_state = NULL; - int i; - unsigned long index = pos >> PAGE_CACHE_SHIFT; - struct inode *inode = fdentry(file)->d_inode; - gfp_t mask = btrfs_alloc_write_mask(inode->i_mapping); - int err = 0; - int faili = 0; - u64 start_pos; - u64 last_pos; - - start_pos = pos & ~((u64)root->sectorsize - 1); - last_pos = ((u64)index + num_pages) << PAGE_CACHE_SHIFT; - -again: - for (i = 0; i < num_pages; i++) { - pages[i] = find_or_create_page(inode->i_mapping, index + i, - mask | __GFP_WRITE); - if (!pages[i]) { - faili = i - 1; - err = -ENOMEM; - goto fail; - } - - if (i == 0) - err = prepare_uptodate_page(pages[i], pos, - force_uptodate); - if (i == num_pages - 1) - err = prepare_uptodate_page(pages[i], - pos + write_bytes, false); - if (err) { - page_cache_release(pages[i]); - faili = i - 1; - goto fail; - } - wait_on_page_writeback(pages[i]); - } - err = 0; - if (start_pos < inode->i_size) { - struct btrfs_ordered_extent *ordered; - lock_extent_bits(&BTRFS_I(inode)->io_tree, - start_pos, last_pos - 1, 0, &cached_state); - ordered = btrfs_lookup_first_ordered_extent(inode, - last_pos - 1); - if (ordered && - ordered->file_offset + ordered->len > start_pos && - ordered->file_offset < last_pos) { - btrfs_put_ordered_extent(ordered); - unlock_extent_cached(&BTRFS_I(inode)->io_tree, - start_pos, last_pos - 1, - &cached_state, GFP_NOFS); - for (i = 0; i < num_pages; i++) { - unlock_page(pages[i]); - page_cache_release(pages[i]); - } - btrfs_wait_ordered_range(inode, start_pos, - last_pos - start_pos); - goto again; - } - if (ordered) - btrfs_put_ordered_extent(ordered); - - clear_extent_bit(&BTRFS_I(inode)->io_tree, start_pos, - last_pos - 1, EXTENT_DIRTY | EXTENT_DELALLOC | - EXTENT_DO_ACCOUNTING, 0, 0, &cached_state, - GFP_NOFS); - unlock_extent_cached(&BTRFS_I(inode)->io_tree, - start_pos, last_pos - 1, &cached_state, - GFP_NOFS); - } - for (i = 0; i < num_pages; i++) { - if (clear_page_dirty_for_io(pages[i])) - account_page_redirty(pages[i]); - set_page_extent_mapped(pages[i]); - WARN_ON(!PageLocked(pages[i])); - } - return 0; -fail: - while (faili >= 0) { - unlock_page(pages[faili]); - page_cache_release(pages[faili]); - faili--; - } - return err; - -} - -static noinline ssize_t __btrfs_buffered_write(struct file *file, - struct iov_iter *i, - loff_t pos) -{ - struct inode *inode = fdentry(file)->d_inode; - struct btrfs_root *root = BTRFS_I(inode)->root; - struct page **pages = NULL; - unsigned long first_index; - size_t num_written = 0; - int nrptrs; - int ret = 0; - bool force_page_uptodate = false; - - nrptrs = min((iov_iter_count(i) + PAGE_CACHE_SIZE - 1) / - PAGE_CACHE_SIZE, PAGE_CACHE_SIZE / - (sizeof(struct page *))); - nrptrs = min(nrptrs, current->nr_dirtied_pause - current->nr_dirtied); - nrptrs = max(nrptrs, 8); - pages = kmalloc(nrptrs * sizeof(struct page *), GFP_KERNEL); - if (!pages) - return -ENOMEM; - - first_index = pos >> PAGE_CACHE_SHIFT; - - while (iov_iter_count(i) > 0) { - size_t offset = pos & (PAGE_CACHE_SIZE - 1); - size_t write_bytes = min(iov_iter_count(i), - nrptrs * (size_t)PAGE_CACHE_SIZE - - offset); - size_t num_pages = (write_bytes + offset + - PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; - size_t dirty_pages; - size_t copied; - - WARN_ON(num_pages > nrptrs); - - /* - * Fault pages before locking them in prepare_pages - * to avoid 
recursive lock - */ - if (unlikely(iov_iter_fault_in_readable(i, write_bytes))) { - ret = -EFAULT; - break; - } - - ret = btrfs_delalloc_reserve_space(inode, - num_pages << PAGE_CACHE_SHIFT); - if (ret) - break; - - /* - * This is going to setup the pages array with the number of - * pages we want, so we don't really need to worry about the - * contents of pages from loop to loop - */ - ret = prepare_pages(root, file, pages, num_pages, - pos, first_index, write_bytes, - force_page_uptodate); - if (ret) { - btrfs_delalloc_release_space(inode, - num_pages << PAGE_CACHE_SHIFT); - break; - } - - copied = btrfs_copy_from_user(pos, num_pages, - write_bytes, pages, i); - - /* - * if we have trouble faulting in the pages, fall - * back to one page at a time - */ - if (copied < write_bytes) - nrptrs = 1; - - if (copied == 0) { - force_page_uptodate = true; - dirty_pages = 0; - } else { - force_page_uptodate = false; - dirty_pages = (copied + offset + - PAGE_CACHE_SIZE - 1) >> - PAGE_CACHE_SHIFT; - } - - /* - * If we had a short copy we need to release the excess delaloc - * bytes we reserved. We need to increment outstanding_extents - * because btrfs_delalloc_release_space will decrement it, but - * we still have an outstanding extent for the chunk we actually - * managed to copy. - */ - if (num_pages > dirty_pages) { - if (copied > 0) { - spin_lock(&BTRFS_I(inode)->lock); - BTRFS_I(inode)->outstanding_extents++; - spin_unlock(&BTRFS_I(inode)->lock); - } - btrfs_delalloc_release_space(inode, - (num_pages - dirty_pages) << - PAGE_CACHE_SHIFT); - } - - if (copied > 0) { - ret = btrfs_dirty_pages(root, inode, pages, - dirty_pages, pos, copied, - NULL); - if (ret) { - btrfs_delalloc_release_space(inode, - dirty_pages << PAGE_CACHE_SHIFT); - btrfs_drop_pages(pages, num_pages); - break; - } - } - - btrfs_drop_pages(pages, num_pages); - - cond_resched(); - - balance_dirty_pages_ratelimited_nr(inode->i_mapping, - dirty_pages); - if (dirty_pages < (root->leafsize >> PAGE_CACHE_SHIFT) + 1) - btrfs_btree_balance_dirty(root, 1); - - pos += copied; - num_written += copied; - } - - kfree(pages); - - return num_written ? num_written : ret; -} - -static ssize_t __btrfs_direct_write(struct kiocb *iocb, - const struct iovec *iov, - unsigned long nr_segs, loff_t pos, - loff_t *ppos, size_t count, size_t ocount) -{ - struct file *file = iocb->ki_filp; - struct inode *inode = fdentry(file)->d_inode; - struct iov_iter i; - ssize_t written; - ssize_t written_buffered; - loff_t endbyte; - int err; - - written = generic_file_direct_write(iocb, iov, &nr_segs, pos, ppos, - count, ocount); - - /* - * the generic O_DIRECT will update in-memory i_size after the - * DIOs are done. But our endio handlers that update the on - * disk i_size never update past the in memory i_size. 
So we - * need one more update here to catch any additions to the - * file - */ - if (inode->i_size != BTRFS_I(inode)->disk_i_size) { - btrfs_ordered_update_i_size(inode, inode->i_size, NULL); - mark_inode_dirty(inode); - } - - if (written < 0 || written == count) - return written; - - pos += written; - count -= written; - iov_iter_init(&i, iov, nr_segs, count, written); - written_buffered = __btrfs_buffered_write(file, &i, pos); - if (written_buffered < 0) { - err = written_buffered; - goto out; - } - endbyte = pos + written_buffered - 1; - err = filemap_write_and_wait_range(file->f_mapping, pos, endbyte); - if (err) - goto out; - written += written_buffered; - *ppos = pos + written_buffered; - invalidate_mapping_pages(file->f_mapping, pos >> PAGE_CACHE_SHIFT, - endbyte >> PAGE_CACHE_SHIFT); -out: - return written ? written : err; -} - -static ssize_t btrfs_file_aio_write(struct kiocb *iocb, - const struct iovec *iov, - unsigned long nr_segs, loff_t pos) -{ - struct file *file = iocb->ki_filp; - struct inode *inode = fdentry(file)->d_inode; - struct btrfs_root *root = BTRFS_I(inode)->root; - loff_t *ppos = &iocb->ki_pos; - u64 start_pos; - ssize_t num_written = 0; - ssize_t err = 0; - size_t count, ocount; - - vfs_check_frozen(inode->i_sb, SB_FREEZE_WRITE); - - mutex_lock(&inode->i_mutex); - - err = generic_segment_checks(iov, &nr_segs, &ocount, VERIFY_READ); - if (err) { - mutex_unlock(&inode->i_mutex); - goto out; - } - count = ocount; - - current->backing_dev_info = inode->i_mapping->backing_dev_info; - err = generic_write_checks(file, &pos, &count, S_ISBLK(inode->i_mode)); - if (err) { - mutex_unlock(&inode->i_mutex); - goto out; - } - - if (count == 0) { - mutex_unlock(&inode->i_mutex); - goto out; - } - - err = file_remove_suid(file); - if (err) { - mutex_unlock(&inode->i_mutex); - goto out; - } - - /* - * If BTRFS flips readonly due to some impossible error - * (fs_info->fs_state now has BTRFS_SUPER_FLAG_ERROR), - * although we have opened a file as writable, we have - * to stop this write operation to ensure FS consistency. - */ - if (root->fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR) { - mutex_unlock(&inode->i_mutex); - err = -EROFS; - goto out; - } - - err = btrfs_update_time(file); - if (err) { - mutex_unlock(&inode->i_mutex); - goto out; - } - BTRFS_I(inode)->sequence++; - - start_pos = round_down(pos, root->sectorsize); - if (start_pos > i_size_read(inode)) { - err = btrfs_cont_expand(inode, i_size_read(inode), start_pos); - if (err) { - mutex_unlock(&inode->i_mutex); - goto out; - } - } - - if (unlikely(file->f_flags & O_DIRECT)) { - num_written = __btrfs_direct_write(iocb, iov, nr_segs, - pos, ppos, count, ocount); - } else { - struct iov_iter i; - - iov_iter_init(&i, iov, nr_segs, count, num_written); - - num_written = __btrfs_buffered_write(file, &i, pos); - if (num_written > 0) - *ppos = pos + num_written; - } - - mutex_unlock(&inode->i_mutex); - - /* - * we want to make sure fsync finds this change - * but we haven't joined a transaction running right now. - * - * Later on, someone is sure to update the inode and get the - * real transid recorded. - * - * We set last_trans now to the fs_info generation + 1, - * this will either be one more than the running transaction - * or the generation used for the next transaction if there isn't - * one running right now. 
- */ - BTRFS_I(inode)->last_trans = root->fs_info->generation + 1; - if (num_written > 0 || num_written == -EIOCBQUEUED) { - err = generic_write_sync(file, pos, num_written); - if (err < 0 && num_written > 0) - num_written = err; - } -out: - current->backing_dev_info = NULL; - return num_written ? num_written : err; -} - -int btrfs_release_file(struct inode *inode, struct file *filp) -{ - /* - * ordered_data_close is set by settattr when we are about to truncate - * a file from a non-zero size to a zero size. This tries to - * flush down new bytes that may have been written if the - * application were using truncate to replace a file in place. - */ - if (BTRFS_I(inode)->ordered_data_close) { - BTRFS_I(inode)->ordered_data_close = 0; - btrfs_add_ordered_operation(NULL, BTRFS_I(inode)->root, inode); - if (inode->i_size > BTRFS_ORDERED_OPERATIONS_FLUSH_LIMIT) - filemap_flush(inode->i_mapping); - } - if (filp->private_data) - btrfs_ioctl_trans_end(filp); - return 0; -} - -/* - * fsync call for both files and directories. This logs the inode into - * the tree log instead of forcing full commits whenever possible. - * - * It needs to call filemap_fdatawait so that all ordered extent updates are - * in the metadata btree are up to date for copying to the log. - * - * It drops the inode mutex before doing the tree log commit. This is an - * important optimization for directories because holding the mutex prevents - * new operations on the dir while we write to disk. - */ -int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) -{ - struct dentry *dentry = file->f_path.dentry; - struct inode *inode = dentry->d_inode; - struct btrfs_root *root = BTRFS_I(inode)->root; - int ret = 0; - struct btrfs_trans_handle *trans; - - trace_btrfs_sync_file(file, datasync); - - ret = filemap_write_and_wait_range(inode->i_mapping, start, end); - if (ret) - return ret; - mutex_lock(&inode->i_mutex); - - /* we wait first, since the writeback may change the inode */ - root->log_batch++; - btrfs_wait_ordered_range(inode, 0, (u64)-1); - root->log_batch++; - - /* - * check the transaction that last modified this inode - * and see if its already been committed - */ - if (!BTRFS_I(inode)->last_trans) { - mutex_unlock(&inode->i_mutex); - goto out; - } - - /* - * if the last transaction that changed this file was before - * the current transaction, we can bail out now without any - * syncing - */ - smp_mb(); - if (BTRFS_I(inode)->last_trans <= - root->fs_info->last_trans_committed) { - BTRFS_I(inode)->last_trans = 0; - mutex_unlock(&inode->i_mutex); - goto out; - } - - /* - * ok we haven't committed the transaction yet, lets do a commit - */ - if (file->private_data) - btrfs_ioctl_trans_end(file); - - trans = btrfs_start_transaction(root, 0); - if (IS_ERR(trans)) { - ret = PTR_ERR(trans); - mutex_unlock(&inode->i_mutex); - goto out; - } - - ret = btrfs_log_dentry_safe(trans, root, dentry); - if (ret < 0) { - mutex_unlock(&inode->i_mutex); - goto out; - } - - /* we've logged all the items and now have a consistent - * version of the file in the log. It is possible that - * someone will come in and modify the file, but that's - * fine because the log is consistent on disk, and we - * have references to all of the file's extents - * - * It is possible that someone will come in and log the - * file again, but that will end up using the synchronization - * inside btrfs_sync_log to keep things safe. 
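The last_trans bookkeeping above reduces to a simple comparison, modelled here as standalone C (without the kernel's locking and memory barriers): an fsync can return early when the inode was never modified in this mount, or when its last modification belongs to an already committed transaction.

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* True when the fsync has nothing to log: the inode was never modified in
 * this mount, or its last change is in an already committed transaction. */
static bool fsync_can_skip(uint64_t last_trans, uint64_t last_trans_committed)
{
        return last_trans == 0 || last_trans <= last_trans_committed;
}

int main(void)
{
        printf("%d\n", fsync_can_skip(0, 10));  /* 1: never modified     */
        printf("%d\n", fsync_can_skip(9, 10));  /* 1: already committed  */
        printf("%d\n", fsync_can_skip(11, 10)); /* 0: must log or commit */
        return 0;
}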
- */ - mutex_unlock(&inode->i_mutex); - - if (ret != BTRFS_NO_LOG_SYNC) { - if (ret > 0) { - ret = btrfs_commit_transaction(trans, root); - } else { - ret = btrfs_sync_log(trans, root); - if (ret == 0) - ret = btrfs_end_transaction(trans, root); - else - ret = btrfs_commit_transaction(trans, root); - } - } else { - ret = btrfs_end_transaction(trans, root); - } -out: - return ret > 0 ? -EIO : ret; -} - -static const struct vm_operations_struct btrfs_file_vm_ops = { - .fault = filemap_fault, - .page_mkwrite = btrfs_page_mkwrite, -}; - -static int btrfs_file_mmap(struct file *filp, struct vm_area_struct *vma) -{ - struct address_space *mapping = filp->f_mapping; - - if (!mapping->a_ops->readpage) - return -ENOEXEC; - - file_accessed(filp); - vma->vm_ops = &btrfs_file_vm_ops; - vma->vm_flags |= VM_CAN_NONLINEAR; - - return 0; -} - -static long btrfs_fallocate(struct file *file, int mode, - loff_t offset, loff_t len) -{ - struct inode *inode = file->f_path.dentry->d_inode; - struct extent_state *cached_state = NULL; - u64 cur_offset; - u64 last_byte; - u64 alloc_start; - u64 alloc_end; - u64 alloc_hint = 0; - u64 locked_end; - u64 mask = BTRFS_I(inode)->root->sectorsize - 1; - struct extent_map *em; - int ret; - - alloc_start = offset & ~mask; - alloc_end = (offset + len + mask) & ~mask; - - /* We only support the FALLOC_FL_KEEP_SIZE mode */ - if (mode & ~FALLOC_FL_KEEP_SIZE) - return -EOPNOTSUPP; - - /* - * Make sure we have enough space before we do the - * allocation. - */ - ret = btrfs_check_data_free_space(inode, len); - if (ret) - return ret; - - /* - * wait for ordered IO before we have any locks. We'll loop again - * below with the locks held. - */ - btrfs_wait_ordered_range(inode, alloc_start, alloc_end - alloc_start); - - mutex_lock(&inode->i_mutex); - ret = inode_newsize_ok(inode, alloc_end); - if (ret) - goto out; - - if (alloc_start > inode->i_size) { - ret = btrfs_cont_expand(inode, i_size_read(inode), - alloc_start); - if (ret) - goto out; - } - - locked_end = alloc_end - 1; - while (1) { - struct btrfs_ordered_extent *ordered; - - /* the extent lock is ordered inside the running - * transaction - */ - lock_extent_bits(&BTRFS_I(inode)->io_tree, alloc_start, - locked_end, 0, &cached_state); - ordered = btrfs_lookup_first_ordered_extent(inode, - alloc_end - 1); - if (ordered && - ordered->file_offset + ordered->len > alloc_start && - ordered->file_offset < alloc_end) { - btrfs_put_ordered_extent(ordered); - unlock_extent_cached(&BTRFS_I(inode)->io_tree, - alloc_start, locked_end, - &cached_state, GFP_NOFS); - /* - * we can't wait on the range with the transaction - * running or with the extent lock held - */ - btrfs_wait_ordered_range(inode, alloc_start, - alloc_end - alloc_start); - } else { - if (ordered) - btrfs_put_ordered_extent(ordered); - break; - } - } - - cur_offset = alloc_start; - while (1) { - u64 actual_end; - - em = btrfs_get_extent(inode, NULL, 0, cur_offset, - alloc_end - cur_offset, 0); - if (IS_ERR_OR_NULL(em)) { - if (!em) - ret = -ENOMEM; - else - ret = PTR_ERR(em); - break; - } - last_byte = min(extent_map_end(em), alloc_end); - actual_end = min_t(u64, extent_map_end(em), offset + len); - last_byte = (last_byte + mask) & ~mask; - - if (em->block_start == EXTENT_MAP_HOLE || - (cur_offset >= inode->i_size && - !test_bit(EXTENT_FLAG_PREALLOC, &em->flags))) { - ret = btrfs_prealloc_file_range(inode, mode, cur_offset, - last_byte - cur_offset, - 1 << inode->i_blkbits, - offset + len, - &alloc_hint); - - if (ret < 0) { - free_extent_map(em); - break; - } - } else if 
(actual_end > inode->i_size && - !(mode & FALLOC_FL_KEEP_SIZE)) { - /* - * We didn't need to allocate any more space, but we - * still extended the size of the file so we need to - * update i_size. - */ - inode->i_ctime = CURRENT_TIME; - i_size_write(inode, actual_end); - btrfs_ordered_update_i_size(inode, actual_end, NULL); - } - free_extent_map(em); - - cur_offset = last_byte; - if (cur_offset >= alloc_end) { - ret = 0; - break; - } - } - unlock_extent_cached(&BTRFS_I(inode)->io_tree, alloc_start, locked_end, - &cached_state, GFP_NOFS); -out: - mutex_unlock(&inode->i_mutex); - /* Let go of our reservation. */ - btrfs_free_reserved_data_space(inode, len); - return ret; -} - -static int find_desired_extent(struct inode *inode, loff_t *offset, int origin) -{ - struct btrfs_root *root = BTRFS_I(inode)->root; - struct extent_map *em; - struct extent_state *cached_state = NULL; - u64 lockstart = *offset; - u64 lockend = i_size_read(inode); - u64 start = *offset; - u64 orig_start = *offset; - u64 len = i_size_read(inode); - u64 last_end = 0; - int ret = 0; - - lockend = max_t(u64, root->sectorsize, lockend); - if (lockend <= lockstart) - lockend = lockstart + root->sectorsize; - - len = lockend - lockstart + 1; - - len = max_t(u64, len, root->sectorsize); - if (inode->i_size == 0) - return -ENXIO; - - lock_extent_bits(&BTRFS_I(inode)->io_tree, lockstart, lockend, 0, - &cached_state); - - /* - * Delalloc is such a pain. If we have a hole and we have pending - * delalloc for a portion of the hole we will get back a hole that - * exists for the entire range since it hasn't been actually written - * yet. So to take care of this case we need to look for an extent just - * before the position we want in case there is outstanding delalloc - * going on here. - */ - if (origin == SEEK_HOLE && start != 0) { - if (start <= root->sectorsize) - em = btrfs_get_extent_fiemap(inode, NULL, 0, 0, - root->sectorsize, 0); - else - em = btrfs_get_extent_fiemap(inode, NULL, 0, - start - root->sectorsize, - root->sectorsize, 0); - if (IS_ERR(em)) { - ret = PTR_ERR(em); - goto out; - } - last_end = em->start + em->len; - if (em->block_start == EXTENT_MAP_DELALLOC) - last_end = min_t(u64, last_end, inode->i_size); - free_extent_map(em); - } - - while (1) { - em = btrfs_get_extent_fiemap(inode, NULL, 0, start, len, 0); - if (IS_ERR(em)) { - ret = PTR_ERR(em); - break; - } - - if (em->block_start == EXTENT_MAP_HOLE) { - if (test_bit(EXTENT_FLAG_VACANCY, &em->flags)) { - if (last_end <= orig_start) { - free_extent_map(em); - ret = -ENXIO; - break; - } - } - - if (origin == SEEK_HOLE) { - *offset = start; - free_extent_map(em); - break; - } - } else { - if (origin == SEEK_DATA) { - if (em->block_start == EXTENT_MAP_DELALLOC) { - if (start >= inode->i_size) { - free_extent_map(em); - ret = -ENXIO; - break; - } - } - - *offset = start; - free_extent_map(em); - break; - } - } - - start = em->start + em->len; - last_end = em->start + em->len; - - if (em->block_start == EXTENT_MAP_DELALLOC) - last_end = min_t(u64, last_end, inode->i_size); - - if (test_bit(EXTENT_FLAG_VACANCY, &em->flags)) { - free_extent_map(em); - ret = -ENXIO; - break; - } - free_extent_map(em); - cond_resched(); - } - if (!ret) - *offset = min(*offset, inode->i_size); -out: - unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart, lockend, - &cached_state, GFP_NOFS); - return ret; -} - -static loff_t btrfs_file_llseek(struct file *file, loff_t offset, int origin) -{ - struct inode *inode = file->f_mapping->host; - int ret; - - mutex_lock(&inode->i_mutex); 
- switch (origin) { - case SEEK_END: - case SEEK_CUR: - offset = generic_file_llseek(file, offset, origin); - goto out; - case SEEK_DATA: - case SEEK_HOLE: - if (offset >= i_size_read(inode)) { - mutex_unlock(&inode->i_mutex); - return -ENXIO; - } - - ret = find_desired_extent(inode, &offset, origin); - if (ret) { - mutex_unlock(&inode->i_mutex); - return ret; - } - } - - if (offset < 0 && !(file->f_mode & FMODE_UNSIGNED_OFFSET)) { - offset = -EINVAL; - goto out; - } - if (offset > inode->i_sb->s_maxbytes) { - offset = -EINVAL; - goto out; - } - - /* Special lock needed here? */ - if (offset != file->f_pos) { - file->f_pos = offset; - file->f_version = 0; - } -out: - mutex_unlock(&inode->i_mutex); - return offset; -} - -const struct file_operations btrfs_file_operations = { - .llseek = btrfs_file_llseek, - .read = do_sync_read, - .write = do_sync_write, - .aio_read = generic_file_aio_read, - .splice_read = generic_file_splice_read, - .aio_write = btrfs_file_aio_write, - .mmap = btrfs_file_mmap, - .open = generic_file_open, - .release = btrfs_release_file, - .fsync = btrfs_sync_file, - .fallocate = btrfs_fallocate, - .unlocked_ioctl = btrfs_ioctl, -#ifdef CONFIG_COMPAT - .compat_ioctl = btrfs_ioctl, -#endif -}; diff --git a/ANDROID_3.4.5/fs/btrfs/free-space-cache.c b/ANDROID_3.4.5/fs/btrfs/free-space-cache.c deleted file mode 100644 index 202008ec..00000000 --- a/ANDROID_3.4.5/fs/btrfs/free-space-cache.c +++ /dev/null @@ -1,2943 +0,0 @@ -/* - * Copyright (C) 2008 Red Hat. All rights reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public - * License v2 as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public - * License along with this program; if not, write to the - * Free Software Foundation, Inc., 59 Temple Place - Suite 330, - * Boston, MA 021110-1307, USA. 
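From user space, the SEEK_DATA/SEEK_HOLE support implemented by btrfs_file_llseek() and find_desired_extent() above is driven through plain lseek(); a usage sketch, where "sparse.bin" is just an assumed test file:

#define _GNU_SOURCE                     /* SEEK_DATA/SEEK_HOLE on glibc */
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
        int fd = open("sparse.bin", O_RDONLY);
        if (fd < 0)
                return 1;

        off_t data = lseek(fd, 0, SEEK_DATA);  /* first non-hole byte */
        off_t hole = lseek(fd, 0, SEEK_HOLE);  /* first hole (or EOF) */

        printf("first data at %lld, first hole at %lld\n",
               (long long)data, (long long)hole);
        close(fd);
        return 0;
}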
- */ - -#include <linux/pagemap.h> -#include <linux/sched.h> -#include <linux/slab.h> -#include <linux/math64.h> -#include <linux/ratelimit.h> -#include "ctree.h" -#include "free-space-cache.h" -#include "transaction.h" -#include "disk-io.h" -#include "extent_io.h" -#include "inode-map.h" - -#define BITS_PER_BITMAP (PAGE_CACHE_SIZE * 8) -#define MAX_CACHE_BYTES_PER_GIG (32 * 1024) - -static int link_free_space(struct btrfs_free_space_ctl *ctl, - struct btrfs_free_space *info); - -static struct inode *__lookup_free_space_inode(struct btrfs_root *root, - struct btrfs_path *path, - u64 offset) -{ - struct btrfs_key key; - struct btrfs_key location; - struct btrfs_disk_key disk_key; - struct btrfs_free_space_header *header; - struct extent_buffer *leaf; - struct inode *inode = NULL; - int ret; - - key.objectid = BTRFS_FREE_SPACE_OBJECTID; - key.offset = offset; - key.type = 0; - - ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); - if (ret < 0) - return ERR_PTR(ret); - if (ret > 0) { - btrfs_release_path(path); - return ERR_PTR(-ENOENT); - } - - leaf = path->nodes[0]; - header = btrfs_item_ptr(leaf, path->slots[0], - struct btrfs_free_space_header); - btrfs_free_space_key(leaf, header, &disk_key); - btrfs_disk_key_to_cpu(&location, &disk_key); - btrfs_release_path(path); - - inode = btrfs_iget(root->fs_info->sb, &location, root, NULL); - if (!inode) - return ERR_PTR(-ENOENT); - if (IS_ERR(inode)) - return inode; - if (is_bad_inode(inode)) { - iput(inode); - return ERR_PTR(-ENOENT); - } - - inode->i_mapping->flags &= ~__GFP_FS; - - return inode; -} - -struct inode *lookup_free_space_inode(struct btrfs_root *root, - struct btrfs_block_group_cache - *block_group, struct btrfs_path *path) -{ - struct inode *inode = NULL; - u32 flags = BTRFS_INODE_NODATASUM | BTRFS_INODE_NODATACOW; - - spin_lock(&block_group->lock); - if (block_group->inode) - inode = igrab(block_group->inode); - spin_unlock(&block_group->lock); - if (inode) - return inode; - - inode = __lookup_free_space_inode(root, path, - block_group->key.objectid); - if (IS_ERR(inode)) - return inode; - - spin_lock(&block_group->lock); - if (!((BTRFS_I(inode)->flags & flags) == flags)) { - printk(KERN_INFO "Old style space inode found, converting.\n"); - BTRFS_I(inode)->flags |= BTRFS_INODE_NODATASUM | - BTRFS_INODE_NODATACOW; - block_group->disk_cache_state = BTRFS_DC_CLEAR; - } - - if (!block_group->iref) { - block_group->inode = igrab(inode); - block_group->iref = 1; - } - spin_unlock(&block_group->lock); - - return inode; -} - -int __create_free_space_inode(struct btrfs_root *root, - struct btrfs_trans_handle *trans, - struct btrfs_path *path, u64 ino, u64 offset) -{ - struct btrfs_key key; - struct btrfs_disk_key disk_key; - struct btrfs_free_space_header *header; - struct btrfs_inode_item *inode_item; - struct extent_buffer *leaf; - u64 flags = BTRFS_INODE_NOCOMPRESS | BTRFS_INODE_PREALLOC; - int ret; - - ret = btrfs_insert_empty_inode(trans, root, path, ino); - if (ret) - return ret; - - /* We inline crc's for the free disk space cache */ - if (ino != BTRFS_FREE_INO_OBJECTID) - flags |= BTRFS_INODE_NODATASUM | BTRFS_INODE_NODATACOW; - - leaf = path->nodes[0]; - inode_item = btrfs_item_ptr(leaf, path->slots[0], - struct btrfs_inode_item); - btrfs_item_key(leaf, &disk_key, path->slots[0]); - memset_extent_buffer(leaf, 0, (unsigned long)inode_item, - sizeof(*inode_item)); - btrfs_set_inode_generation(leaf, inode_item, trans->transid); - btrfs_set_inode_size(leaf, inode_item, 0); - btrfs_set_inode_nbytes(leaf, inode_item, 0); - 
btrfs_set_inode_uid(leaf, inode_item, 0); - btrfs_set_inode_gid(leaf, inode_item, 0); - btrfs_set_inode_mode(leaf, inode_item, S_IFREG | 0600); - btrfs_set_inode_flags(leaf, inode_item, flags); - btrfs_set_inode_nlink(leaf, inode_item, 1); - btrfs_set_inode_transid(leaf, inode_item, trans->transid); - btrfs_set_inode_block_group(leaf, inode_item, offset); - btrfs_mark_buffer_dirty(leaf); - btrfs_release_path(path); - - key.objectid = BTRFS_FREE_SPACE_OBJECTID; - key.offset = offset; - key.type = 0; - - ret = btrfs_insert_empty_item(trans, root, path, &key, - sizeof(struct btrfs_free_space_header)); - if (ret < 0) { - btrfs_release_path(path); - return ret; - } - leaf = path->nodes[0]; - header = btrfs_item_ptr(leaf, path->slots[0], - struct btrfs_free_space_header); - memset_extent_buffer(leaf, 0, (unsigned long)header, sizeof(*header)); - btrfs_set_free_space_key(leaf, header, &disk_key); - btrfs_mark_buffer_dirty(leaf); - btrfs_release_path(path); - - return 0; -} - -int create_free_space_inode(struct btrfs_root *root, - struct btrfs_trans_handle *trans, - struct btrfs_block_group_cache *block_group, - struct btrfs_path *path) -{ - int ret; - u64 ino; - - ret = btrfs_find_free_objectid(root, &ino); - if (ret < 0) - return ret; - - return __create_free_space_inode(root, trans, path, ino, - block_group->key.objectid); -} - -int btrfs_truncate_free_space_cache(struct btrfs_root *root, - struct btrfs_trans_handle *trans, - struct btrfs_path *path, - struct inode *inode) -{ - struct btrfs_block_rsv *rsv; - u64 needed_bytes; - loff_t oldsize; - int ret = 0; - - rsv = trans->block_rsv; - trans->block_rsv = &root->fs_info->global_block_rsv; - - /* 1 for slack space, 1 for updating the inode */ - needed_bytes = btrfs_calc_trunc_metadata_size(root, 1) + - btrfs_calc_trans_metadata_size(root, 1); - - spin_lock(&trans->block_rsv->lock); - if (trans->block_rsv->reserved < needed_bytes) { - spin_unlock(&trans->block_rsv->lock); - trans->block_rsv = rsv; - return -ENOSPC; - } - spin_unlock(&trans->block_rsv->lock); - - oldsize = i_size_read(inode); - btrfs_i_size_write(inode, 0); - truncate_pagecache(inode, oldsize, 0); - - /* - * We don't need an orphan item because truncating the free space cache - * will never be split across transactions. 
- */ - ret = btrfs_truncate_inode_items(trans, root, inode, - 0, BTRFS_EXTENT_DATA_KEY); - - if (ret) { - trans->block_rsv = rsv; - btrfs_abort_transaction(trans, root, ret); - return ret; - } - - ret = btrfs_update_inode(trans, root, inode); - if (ret) - btrfs_abort_transaction(trans, root, ret); - trans->block_rsv = rsv; - - return ret; -} - -static int readahead_cache(struct inode *inode) -{ - struct file_ra_state *ra; - unsigned long last_index; - - ra = kzalloc(sizeof(*ra), GFP_NOFS); - if (!ra) - return -ENOMEM; - - file_ra_state_init(ra, inode->i_mapping); - last_index = (i_size_read(inode) - 1) >> PAGE_CACHE_SHIFT; - - page_cache_sync_readahead(inode->i_mapping, ra, NULL, 0, last_index); - - kfree(ra); - - return 0; -} - -struct io_ctl { - void *cur, *orig; - struct page *page; - struct page **pages; - struct btrfs_root *root; - unsigned long size; - int index; - int num_pages; - unsigned check_crcs:1; -}; - -static int io_ctl_init(struct io_ctl *io_ctl, struct inode *inode, - struct btrfs_root *root) -{ - memset(io_ctl, 0, sizeof(struct io_ctl)); - io_ctl->num_pages = (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >> - PAGE_CACHE_SHIFT; - io_ctl->pages = kzalloc(sizeof(struct page *) * io_ctl->num_pages, - GFP_NOFS); - if (!io_ctl->pages) - return -ENOMEM; - io_ctl->root = root; - if (btrfs_ino(inode) != BTRFS_FREE_INO_OBJECTID) - io_ctl->check_crcs = 1; - return 0; -} - -static void io_ctl_free(struct io_ctl *io_ctl) -{ - kfree(io_ctl->pages); -} - -static void io_ctl_unmap_page(struct io_ctl *io_ctl) -{ - if (io_ctl->cur) { - kunmap(io_ctl->page); - io_ctl->cur = NULL; - io_ctl->orig = NULL; - } -} - -static void io_ctl_map_page(struct io_ctl *io_ctl, int clear) -{ - WARN_ON(io_ctl->cur); - BUG_ON(io_ctl->index >= io_ctl->num_pages); - io_ctl->page = io_ctl->pages[io_ctl->index++]; - io_ctl->cur = kmap(io_ctl->page); - io_ctl->orig = io_ctl->cur; - io_ctl->size = PAGE_CACHE_SIZE; - if (clear) - memset(io_ctl->cur, 0, PAGE_CACHE_SIZE); -} - -static void io_ctl_drop_pages(struct io_ctl *io_ctl) -{ - int i; - - io_ctl_unmap_page(io_ctl); - - for (i = 0; i < io_ctl->num_pages; i++) { - if (io_ctl->pages[i]) { - ClearPageChecked(io_ctl->pages[i]); - unlock_page(io_ctl->pages[i]); - page_cache_release(io_ctl->pages[i]); - } - } -} - -static int io_ctl_prepare_pages(struct io_ctl *io_ctl, struct inode *inode, - int uptodate) -{ - struct page *page; - gfp_t mask = btrfs_alloc_write_mask(inode->i_mapping); - int i; - - for (i = 0; i < io_ctl->num_pages; i++) { - page = find_or_create_page(inode->i_mapping, i, mask); - if (!page) { - io_ctl_drop_pages(io_ctl); - return -ENOMEM; - } - io_ctl->pages[i] = page; - if (uptodate && !PageUptodate(page)) { - btrfs_readpage(NULL, page); - lock_page(page); - if (!PageUptodate(page)) { - printk(KERN_ERR "btrfs: error reading free " - "space cache\n"); - io_ctl_drop_pages(io_ctl); - return -EIO; - } - } - } - - for (i = 0; i < io_ctl->num_pages; i++) { - clear_page_dirty_for_io(io_ctl->pages[i]); - set_page_extent_mapped(io_ctl->pages[i]); - } - - return 0; -} - -static void io_ctl_set_generation(struct io_ctl *io_ctl, u64 generation) -{ - u64 *val; - - io_ctl_map_page(io_ctl, 1); - - /* - * Skip the csum areas. If we don't check crcs then we just have a - * 64bit chunk at the front of the first page. 
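The first-page layout that io_ctl_set_generation() describes above can be summarised with a little arithmetic; a standalone sketch assuming 4 KiB pages and a five-page cache file:

#include <stdint.h>
#include <stdio.h>

#define PAGE_CACHE_SIZE 4096UL                  /* assumed 4 KiB pages */

/* Bytes reserved at the front of the first cache page: one u32 crc slot per
 * page plus the u64 generation when crcs are used, two u64s otherwise. */
static unsigned long first_page_header_bytes(int check_crcs, int num_pages)
{
        if (check_crcs)
                return sizeof(uint32_t) * num_pages + sizeof(uint64_t);
        return 2 * sizeof(uint64_t);
}

int main(void)
{
        int num_pages = 5;                      /* assumed cache file size */

        printf("with crcs: %lu header bytes, %lu left for entries\n",
               first_page_header_bytes(1, num_pages),
               PAGE_CACHE_SIZE - first_page_header_bytes(1, num_pages));
        printf("without crcs: %lu header bytes\n",
               first_page_header_bytes(0, num_pages));
        return 0;
}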
- */ - if (io_ctl->check_crcs) { - io_ctl->cur += (sizeof(u32) * io_ctl->num_pages); - io_ctl->size -= sizeof(u64) + (sizeof(u32) * io_ctl->num_pages); - } else { - io_ctl->cur += sizeof(u64); - io_ctl->size -= sizeof(u64) * 2; - } - - val = io_ctl->cur; - *val = cpu_to_le64(generation); - io_ctl->cur += sizeof(u64); -} - -static int io_ctl_check_generation(struct io_ctl *io_ctl, u64 generation) -{ - u64 *gen; - - /* - * Skip the crc area. If we don't check crcs then we just have a 64bit - * chunk at the front of the first page. - */ - if (io_ctl->check_crcs) { - io_ctl->cur += sizeof(u32) * io_ctl->num_pages; - io_ctl->size -= sizeof(u64) + - (sizeof(u32) * io_ctl->num_pages); - } else { - io_ctl->cur += sizeof(u64); - io_ctl->size -= sizeof(u64) * 2; - } - - gen = io_ctl->cur; - if (le64_to_cpu(*gen) != generation) { - printk_ratelimited(KERN_ERR "btrfs: space cache generation " - "(%Lu) does not match inode (%Lu)\n", *gen, - generation); - io_ctl_unmap_page(io_ctl); - return -EIO; - } - io_ctl->cur += sizeof(u64); - return 0; -} - -static void io_ctl_set_crc(struct io_ctl *io_ctl, int index) -{ - u32 *tmp; - u32 crc = ~(u32)0; - unsigned offset = 0; - - if (!io_ctl->check_crcs) { - io_ctl_unmap_page(io_ctl); - return; - } - - if (index == 0) - offset = sizeof(u32) * io_ctl->num_pages; - - crc = btrfs_csum_data(io_ctl->root, io_ctl->orig + offset, crc, - PAGE_CACHE_SIZE - offset); - btrfs_csum_final(crc, (char *)&crc); - io_ctl_unmap_page(io_ctl); - tmp = kmap(io_ctl->pages[0]); - tmp += index; - *tmp = crc; - kunmap(io_ctl->pages[0]); -} - -static int io_ctl_check_crc(struct io_ctl *io_ctl, int index) -{ - u32 *tmp, val; - u32 crc = ~(u32)0; - unsigned offset = 0; - - if (!io_ctl->check_crcs) { - io_ctl_map_page(io_ctl, 0); - return 0; - } - - if (index == 0) - offset = sizeof(u32) * io_ctl->num_pages; - - tmp = kmap(io_ctl->pages[0]); - tmp += index; - val = *tmp; - kunmap(io_ctl->pages[0]); - - io_ctl_map_page(io_ctl, 0); - crc = btrfs_csum_data(io_ctl->root, io_ctl->orig + offset, crc, - PAGE_CACHE_SIZE - offset); - btrfs_csum_final(crc, (char *)&crc); - if (val != crc) { - printk_ratelimited(KERN_ERR "btrfs: csum mismatch on free " - "space cache\n"); - io_ctl_unmap_page(io_ctl); - return -EIO; - } - - return 0; -} - -static int io_ctl_add_entry(struct io_ctl *io_ctl, u64 offset, u64 bytes, - void *bitmap) -{ - struct btrfs_free_space_entry *entry; - - if (!io_ctl->cur) - return -ENOSPC; - - entry = io_ctl->cur; - entry->offset = cpu_to_le64(offset); - entry->bytes = cpu_to_le64(bytes); - entry->type = (bitmap) ? BTRFS_FREE_SPACE_BITMAP : - BTRFS_FREE_SPACE_EXTENT; - io_ctl->cur += sizeof(struct btrfs_free_space_entry); - io_ctl->size -= sizeof(struct btrfs_free_space_entry); - - if (io_ctl->size >= sizeof(struct btrfs_free_space_entry)) - return 0; - - io_ctl_set_crc(io_ctl, io_ctl->index - 1); - - /* No more pages to map */ - if (io_ctl->index >= io_ctl->num_pages) - return 0; - - /* map the next page */ - io_ctl_map_page(io_ctl, 1); - return 0; -} - -static int io_ctl_add_bitmap(struct io_ctl *io_ctl, void *bitmap) -{ - if (!io_ctl->cur) - return -ENOSPC; - - /* - * If we aren't at the start of the current page, unmap this one and - * map the next one if there is any left. 
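Rough capacity math for the cache file produced by the io_ctl helpers above; the packed entry struct here mirrors the on-disk free space entry (offset, bytes, type), but treat the exact figures as illustrative rather than authoritative:

#include <stdint.h>
#include <stdio.h>

#define PAGE_CACHE_SIZE 4096UL                  /* assumed 4 KiB pages */

/* Mirrors the packed on-disk free space entry (offset, bytes, type). */
struct free_space_entry {
        uint64_t offset;
        uint64_t bytes;
        uint8_t  type;                          /* extent or bitmap marker */
} __attribute__((packed));

int main(void)
{
        int num_pages = 5;                      /* assumed cache file size */
        unsigned long header = sizeof(uint32_t) * num_pages + sizeof(uint64_t);
        unsigned long first = (PAGE_CACHE_SIZE - header) /
                              sizeof(struct free_space_entry);
        unsigned long later = PAGE_CACHE_SIZE / sizeof(struct free_space_entry);

        /* every bitmap entry additionally consumes one full page of bits */
        printf("%lu entries fit on page 0, %lu on each later page\n",
               first, later);
        return 0;
}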
- */ - if (io_ctl->cur != io_ctl->orig) { - io_ctl_set_crc(io_ctl, io_ctl->index - 1); - if (io_ctl->index >= io_ctl->num_pages) - return -ENOSPC; - io_ctl_map_page(io_ctl, 0); - } - - memcpy(io_ctl->cur, bitmap, PAGE_CACHE_SIZE); - io_ctl_set_crc(io_ctl, io_ctl->index - 1); - if (io_ctl->index < io_ctl->num_pages) - io_ctl_map_page(io_ctl, 0); - return 0; -} - -static void io_ctl_zero_remaining_pages(struct io_ctl *io_ctl) -{ - /* - * If we're not on the boundary we know we've modified the page and we - * need to crc the page. - */ - if (io_ctl->cur != io_ctl->orig) - io_ctl_set_crc(io_ctl, io_ctl->index - 1); - else - io_ctl_unmap_page(io_ctl); - - while (io_ctl->index < io_ctl->num_pages) { - io_ctl_map_page(io_ctl, 1); - io_ctl_set_crc(io_ctl, io_ctl->index - 1); - } -} - -static int io_ctl_read_entry(struct io_ctl *io_ctl, - struct btrfs_free_space *entry, u8 *type) -{ - struct btrfs_free_space_entry *e; - int ret; - - if (!io_ctl->cur) { - ret = io_ctl_check_crc(io_ctl, io_ctl->index); - if (ret) - return ret; - } - - e = io_ctl->cur; - entry->offset = le64_to_cpu(e->offset); - entry->bytes = le64_to_cpu(e->bytes); - *type = e->type; - io_ctl->cur += sizeof(struct btrfs_free_space_entry); - io_ctl->size -= sizeof(struct btrfs_free_space_entry); - - if (io_ctl->size >= sizeof(struct btrfs_free_space_entry)) - return 0; - - io_ctl_unmap_page(io_ctl); - - return 0; -} - -static int io_ctl_read_bitmap(struct io_ctl *io_ctl, - struct btrfs_free_space *entry) -{ - int ret; - - ret = io_ctl_check_crc(io_ctl, io_ctl->index); - if (ret) - return ret; - - memcpy(entry->bitmap, io_ctl->cur, PAGE_CACHE_SIZE); - io_ctl_unmap_page(io_ctl); - - return 0; -} - -int __load_free_space_cache(struct btrfs_root *root, struct inode *inode, - struct btrfs_free_space_ctl *ctl, - struct btrfs_path *path, u64 offset) -{ - struct btrfs_free_space_header *header; - struct extent_buffer *leaf; - struct io_ctl io_ctl; - struct btrfs_key key; - struct btrfs_free_space *e, *n; - struct list_head bitmaps; - u64 num_entries; - u64 num_bitmaps; - u64 generation; - u8 type; - int ret = 0; - - INIT_LIST_HEAD(&bitmaps); - - /* Nothing in the space cache, goodbye */ - if (!i_size_read(inode)) - return 0; - - key.objectid = BTRFS_FREE_SPACE_OBJECTID; - key.offset = offset; - key.type = 0; - - ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); - if (ret < 0) - return 0; - else if (ret > 0) { - btrfs_release_path(path); - return 0; - } - - ret = -1; - - leaf = path->nodes[0]; - header = btrfs_item_ptr(leaf, path->slots[0], - struct btrfs_free_space_header); - num_entries = btrfs_free_space_entries(leaf, header); - num_bitmaps = btrfs_free_space_bitmaps(leaf, header); - generation = btrfs_free_space_generation(leaf, header); - btrfs_release_path(path); - - if (BTRFS_I(inode)->generation != generation) { - printk(KERN_ERR "btrfs: free space inode generation (%llu) did" - " not match free space cache generation (%llu)\n", - (unsigned long long)BTRFS_I(inode)->generation, - (unsigned long long)generation); - return 0; - } - - if (!num_entries) - return 0; - - ret = io_ctl_init(&io_ctl, inode, root); - if (ret) - return ret; - - ret = readahead_cache(inode); - if (ret) - goto out; - - ret = io_ctl_prepare_pages(&io_ctl, inode, 1); - if (ret) - goto out; - - ret = io_ctl_check_crc(&io_ctl, 0); - if (ret) - goto free_cache; - - ret = io_ctl_check_generation(&io_ctl, generation); - if (ret) - goto free_cache; - - while (num_entries) { - e = kmem_cache_zalloc(btrfs_free_space_cachep, - GFP_NOFS); - if (!e) - goto free_cache; - - 
ret = io_ctl_read_entry(&io_ctl, e, &type); - if (ret) { - kmem_cache_free(btrfs_free_space_cachep, e); - goto free_cache; - } - - if (!e->bytes) { - kmem_cache_free(btrfs_free_space_cachep, e); - goto free_cache; - } - - if (type == BTRFS_FREE_SPACE_EXTENT) { - spin_lock(&ctl->tree_lock); - ret = link_free_space(ctl, e); - spin_unlock(&ctl->tree_lock); - if (ret) { - printk(KERN_ERR "Duplicate entries in " - "free space cache, dumping\n"); - kmem_cache_free(btrfs_free_space_cachep, e); - goto free_cache; - } - } else { - BUG_ON(!num_bitmaps); - num_bitmaps--; - e->bitmap = kzalloc(PAGE_CACHE_SIZE, GFP_NOFS); - if (!e->bitmap) { - kmem_cache_free( - btrfs_free_space_cachep, e); - goto free_cache; - } - spin_lock(&ctl->tree_lock); - ret = link_free_space(ctl, e); - ctl->total_bitmaps++; - ctl->op->recalc_thresholds(ctl); - spin_unlock(&ctl->tree_lock); - if (ret) { - printk(KERN_ERR "Duplicate entries in " - "free space cache, dumping\n"); - kmem_cache_free(btrfs_free_space_cachep, e); - goto free_cache; - } - list_add_tail(&e->list, &bitmaps); - } - - num_entries--; - } - - io_ctl_unmap_page(&io_ctl); - - /* - * We add the bitmaps at the end of the entries in order that - * the bitmap entries are added to the cache. - */ - list_for_each_entry_safe(e, n, &bitmaps, list) { - list_del_init(&e->list); - ret = io_ctl_read_bitmap(&io_ctl, e); - if (ret) - goto free_cache; - } - - io_ctl_drop_pages(&io_ctl); - ret = 1; -out: - io_ctl_free(&io_ctl); - return ret; -free_cache: - io_ctl_drop_pages(&io_ctl); - __btrfs_remove_free_space_cache(ctl); - goto out; -} - -int load_free_space_cache(struct btrfs_fs_info *fs_info, - struct btrfs_block_group_cache *block_group) -{ - struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; - struct btrfs_root *root = fs_info->tree_root; - struct inode *inode; - struct btrfs_path *path; - int ret = 0; - bool matched; - u64 used = btrfs_block_group_used(&block_group->item); - - /* - * If this block group has been marked to be cleared for one reason or - * another then we can't trust the on disk cache, so just return. - */ - spin_lock(&block_group->lock); - if (block_group->disk_cache_state != BTRFS_DC_WRITTEN) { - spin_unlock(&block_group->lock); - return 0; - } - spin_unlock(&block_group->lock); - - path = btrfs_alloc_path(); - if (!path) - return 0; - path->search_commit_root = 1; - path->skip_locking = 1; - - inode = lookup_free_space_inode(root, block_group, path); - if (IS_ERR(inode)) { - btrfs_free_path(path); - return 0; - } - - /* We may have converted the inode and made the cache invalid. 
*/ - spin_lock(&block_group->lock); - if (block_group->disk_cache_state != BTRFS_DC_WRITTEN) { - spin_unlock(&block_group->lock); - btrfs_free_path(path); - goto out; - } - spin_unlock(&block_group->lock); - - ret = __load_free_space_cache(fs_info->tree_root, inode, ctl, - path, block_group->key.objectid); - btrfs_free_path(path); - if (ret <= 0) - goto out; - - spin_lock(&ctl->tree_lock); - matched = (ctl->free_space == (block_group->key.offset - used - - block_group->bytes_super)); - spin_unlock(&ctl->tree_lock); - - if (!matched) { - __btrfs_remove_free_space_cache(ctl); - printk(KERN_ERR "block group %llu has an wrong amount of free " - "space\n", block_group->key.objectid); - ret = -1; - } -out: - if (ret < 0) { - /* This cache is bogus, make sure it gets cleared */ - spin_lock(&block_group->lock); - block_group->disk_cache_state = BTRFS_DC_CLEAR; - spin_unlock(&block_group->lock); - ret = 0; - - printk(KERN_ERR "btrfs: failed to load free space cache " - "for block group %llu\n", block_group->key.objectid); - } - - iput(inode); - return ret; -} - -/** - * __btrfs_write_out_cache - write out cached info to an inode - * @root - the root the inode belongs to - * @ctl - the free space cache we are going to write out - * @block_group - the block_group for this cache if it belongs to a block_group - * @trans - the trans handle - * @path - the path to use - * @offset - the offset for the key we'll insert - * - * This function writes out a free space cache struct to disk for quick recovery - * on mount. This will return 0 if it was successfull in writing the cache out, - * and -1 if it was not. - */ -int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode, - struct btrfs_free_space_ctl *ctl, - struct btrfs_block_group_cache *block_group, - struct btrfs_trans_handle *trans, - struct btrfs_path *path, u64 offset) -{ - struct btrfs_free_space_header *header; - struct extent_buffer *leaf; - struct rb_node *node; - struct list_head *pos, *n; - struct extent_state *cached_state = NULL; - struct btrfs_free_cluster *cluster = NULL; - struct extent_io_tree *unpin = NULL; - struct io_ctl io_ctl; - struct list_head bitmap_list; - struct btrfs_key key; - u64 start, extent_start, extent_end, len; - int entries = 0; - int bitmaps = 0; - int ret; - int err = -1; - - INIT_LIST_HEAD(&bitmap_list); - - if (!i_size_read(inode)) - return -1; - - ret = io_ctl_init(&io_ctl, inode, root); - if (ret) - return -1; - - /* Get the cluster for this block_group if it exists */ - if (block_group && !list_empty(&block_group->cluster_list)) - cluster = list_entry(block_group->cluster_list.next, - struct btrfs_free_cluster, - block_group_list); - - /* Lock all pages first so we can lock the extent safely. 
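The consistency check load_free_space_cache() applies above is plain arithmetic; a sketch with invented numbers: the cached free space must equal the block group size minus the allocated bytes and the superblock mirror reservation, otherwise the cache is discarded and rebuilt.

#include <stdint.h>
#include <stdio.h>

int main(void)
{
        uint64_t bg_size     = 1024ULL << 20;   /* 1 GiB block group (assumed) */
        uint64_t used        =  700ULL << 20;   /* btrfs_block_group_used()    */
        uint64_t bytes_super =    2ULL << 20;   /* super mirror reservation    */
        uint64_t cached_free =  322ULL << 20;   /* what the cache file claims  */

        if (cached_free != bg_size - used - bytes_super)
                printf("free space cache is stale, clear and rebuild it\n");
        else
                printf("cache matches: %llu bytes free\n",
                       (unsigned long long)cached_free);
        return 0;
}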
*/ - io_ctl_prepare_pages(&io_ctl, inode, 0); - - lock_extent_bits(&BTRFS_I(inode)->io_tree, 0, i_size_read(inode) - 1, - 0, &cached_state); - - node = rb_first(&ctl->free_space_offset); - if (!node && cluster) { - node = rb_first(&cluster->root); - cluster = NULL; - } - - /* Make sure we can fit our crcs into the first page */ - if (io_ctl.check_crcs && - (io_ctl.num_pages * sizeof(u32)) >= PAGE_CACHE_SIZE) { - WARN_ON(1); - goto out_nospc; - } - - io_ctl_set_generation(&io_ctl, trans->transid); - - /* Write out the extent entries */ - while (node) { - struct btrfs_free_space *e; - - e = rb_entry(node, struct btrfs_free_space, offset_index); - entries++; - - ret = io_ctl_add_entry(&io_ctl, e->offset, e->bytes, - e->bitmap); - if (ret) - goto out_nospc; - - if (e->bitmap) { - list_add_tail(&e->list, &bitmap_list); - bitmaps++; - } - node = rb_next(node); - if (!node && cluster) { - node = rb_first(&cluster->root); - cluster = NULL; - } - } - - /* - * We want to add any pinned extents to our free space cache - * so we don't leak the space - */ - - /* - * We shouldn't have switched the pinned extents yet so this is the - * right one - */ - unpin = root->fs_info->pinned_extents; - - if (block_group) - start = block_group->key.objectid; - - while (block_group && (start < block_group->key.objectid + - block_group->key.offset)) { - ret = find_first_extent_bit(unpin, start, - &extent_start, &extent_end, - EXTENT_DIRTY); - if (ret) { - ret = 0; - break; - } - - /* This pinned extent is out of our range */ - if (extent_start >= block_group->key.objectid + - block_group->key.offset) - break; - - extent_start = max(extent_start, start); - extent_end = min(block_group->key.objectid + - block_group->key.offset, extent_end + 1); - len = extent_end - extent_start; - - entries++; - ret = io_ctl_add_entry(&io_ctl, extent_start, len, NULL); - if (ret) - goto out_nospc; - - start = extent_end; - } - - /* Write out the bitmaps */ - list_for_each_safe(pos, n, &bitmap_list) { - struct btrfs_free_space *entry = - list_entry(pos, struct btrfs_free_space, list); - - ret = io_ctl_add_bitmap(&io_ctl, entry->bitmap); - if (ret) - goto out_nospc; - list_del_init(&entry->list); - } - - /* Zero out the rest of the pages just to make sure */ - io_ctl_zero_remaining_pages(&io_ctl); - - ret = btrfs_dirty_pages(root, inode, io_ctl.pages, io_ctl.num_pages, - 0, i_size_read(inode), &cached_state); - io_ctl_drop_pages(&io_ctl); - unlock_extent_cached(&BTRFS_I(inode)->io_tree, 0, - i_size_read(inode) - 1, &cached_state, GFP_NOFS); - - if (ret) - goto out; - - - ret = filemap_write_and_wait(inode->i_mapping); - if (ret) - goto out; - - key.objectid = BTRFS_FREE_SPACE_OBJECTID; - key.offset = offset; - key.type = 0; - - ret = btrfs_search_slot(trans, root, &key, path, 0, 1); - if (ret < 0) { - clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, inode->i_size - 1, - EXTENT_DIRTY | EXTENT_DELALLOC, 0, 0, NULL, - GFP_NOFS); - goto out; - } - leaf = path->nodes[0]; - if (ret > 0) { - struct btrfs_key found_key; - BUG_ON(!path->slots[0]); - path->slots[0]--; - btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); - if (found_key.objectid != BTRFS_FREE_SPACE_OBJECTID || - found_key.offset != offset) { - clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, - inode->i_size - 1, - EXTENT_DIRTY | EXTENT_DELALLOC, 0, 0, - NULL, GFP_NOFS); - btrfs_release_path(path); - goto out; - } - } - - BTRFS_I(inode)->generation = trans->transid; - header = btrfs_item_ptr(leaf, path->slots[0], - struct btrfs_free_space_header); - 
btrfs_set_free_space_entries(leaf, header, entries); - btrfs_set_free_space_bitmaps(leaf, header, bitmaps); - btrfs_set_free_space_generation(leaf, header, trans->transid); - btrfs_mark_buffer_dirty(leaf); - btrfs_release_path(path); - - err = 0; -out: - io_ctl_free(&io_ctl); - if (err) { - invalidate_inode_pages2(inode->i_mapping); - BTRFS_I(inode)->generation = 0; - } - btrfs_update_inode(trans, root, inode); - return err; - -out_nospc: - list_for_each_safe(pos, n, &bitmap_list) { - struct btrfs_free_space *entry = - list_entry(pos, struct btrfs_free_space, list); - list_del_init(&entry->list); - } - io_ctl_drop_pages(&io_ctl); - unlock_extent_cached(&BTRFS_I(inode)->io_tree, 0, - i_size_read(inode) - 1, &cached_state, GFP_NOFS); - goto out; -} - -int btrfs_write_out_cache(struct btrfs_root *root, - struct btrfs_trans_handle *trans, - struct btrfs_block_group_cache *block_group, - struct btrfs_path *path) -{ - struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; - struct inode *inode; - int ret = 0; - - root = root->fs_info->tree_root; - - spin_lock(&block_group->lock); - if (block_group->disk_cache_state < BTRFS_DC_SETUP) { - spin_unlock(&block_group->lock); - return 0; - } - spin_unlock(&block_group->lock); - - inode = lookup_free_space_inode(root, block_group, path); - if (IS_ERR(inode)) - return 0; - - ret = __btrfs_write_out_cache(root, inode, ctl, block_group, trans, - path, block_group->key.objectid); - if (ret) { - spin_lock(&block_group->lock); - block_group->disk_cache_state = BTRFS_DC_ERROR; - spin_unlock(&block_group->lock); - ret = 0; -#ifdef DEBUG - printk(KERN_ERR "btrfs: failed to write free space cache " - "for block group %llu\n", block_group->key.objectid); -#endif - } - - iput(inode); - return ret; -} - -static inline unsigned long offset_to_bit(u64 bitmap_start, u32 unit, - u64 offset) -{ - BUG_ON(offset < bitmap_start); - offset -= bitmap_start; - return (unsigned long)(div_u64(offset, unit)); -} - -static inline unsigned long bytes_to_bits(u64 bytes, u32 unit) -{ - return (unsigned long)(div_u64(bytes, unit)); -} - -static inline u64 offset_to_bitmap(struct btrfs_free_space_ctl *ctl, - u64 offset) -{ - u64 bitmap_start; - u64 bytes_per_bitmap; - - bytes_per_bitmap = BITS_PER_BITMAP * ctl->unit; - bitmap_start = offset - ctl->start; - bitmap_start = div64_u64(bitmap_start, bytes_per_bitmap); - bitmap_start *= bytes_per_bitmap; - bitmap_start += ctl->start; - - return bitmap_start; -} - -static int tree_insert_offset(struct rb_root *root, u64 offset, - struct rb_node *node, int bitmap) -{ - struct rb_node **p = &root->rb_node; - struct rb_node *parent = NULL; - struct btrfs_free_space *info; - - while (*p) { - parent = *p; - info = rb_entry(parent, struct btrfs_free_space, offset_index); - - if (offset < info->offset) { - p = &(*p)->rb_left; - } else if (offset > info->offset) { - p = &(*p)->rb_right; - } else { - /* - * we could have a bitmap entry and an extent entry - * share the same offset. If this is the case, we want - * the extent entry to always be found first if we do a - * linear search through the tree, since we want to have - * the quickest allocation time, and allocating from an - * extent is faster than allocating from a bitmap. So - * if we're inserting a bitmap and we find an entry at - * this offset, we want to go right, or after this entry - * logically. If we are inserting an extent and we've - * found a bitmap, we want to go left, or before - * logically. 
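The ordering rule spelled out in the comment above can be modelled as a comparator: entries sort by offset, and when an extent entry and a bitmap entry share an offset the extent sorts first so a linear search finds it before the bitmap. A standalone sketch (not the kernel rbtree API):

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

struct entry {
        uint64_t offset;
        bool bitmap;
};

/* <0: a sorts before b, >0: after, 0: duplicate (rejected with -EEXIST) */
static int entry_cmp(const struct entry *a, const struct entry *b)
{
        if (a->offset != b->offset)
                return a->offset < b->offset ? -1 : 1;
        if (a->bitmap == b->bitmap)
                return 0;
        return a->bitmap ? 1 : -1;              /* extent first, bitmap second */
}

int main(void)
{
        struct entry extent = { 4096, false };
        struct entry bitmap = { 4096, true };

        printf("%d\n", entry_cmp(&extent, &bitmap));    /* -1: extent first */
        return 0;
}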
- */ - if (bitmap) { - if (info->bitmap) { - WARN_ON_ONCE(1); - return -EEXIST; - } - p = &(*p)->rb_right; - } else { - if (!info->bitmap) { - WARN_ON_ONCE(1); - return -EEXIST; - } - p = &(*p)->rb_left; - } - } - } - - rb_link_node(node, parent, p); - rb_insert_color(node, root); - - return 0; -} - -/* - * searches the tree for the given offset. - * - * fuzzy - If this is set, then we are trying to make an allocation, and we just - * want a section that has at least bytes size and comes at or after the given - * offset. - */ -static struct btrfs_free_space * -tree_search_offset(struct btrfs_free_space_ctl *ctl, - u64 offset, int bitmap_only, int fuzzy) -{ - struct rb_node *n = ctl->free_space_offset.rb_node; - struct btrfs_free_space *entry, *prev = NULL; - - /* find entry that is closest to the 'offset' */ - while (1) { - if (!n) { - entry = NULL; - break; - } - - entry = rb_entry(n, struct btrfs_free_space, offset_index); - prev = entry; - - if (offset < entry->offset) - n = n->rb_left; - else if (offset > entry->offset) - n = n->rb_right; - else - break; - } - - if (bitmap_only) { - if (!entry) - return NULL; - if (entry->bitmap) - return entry; - - /* - * bitmap entry and extent entry may share same offset, - * in that case, bitmap entry comes after extent entry. - */ - n = rb_next(n); - if (!n) - return NULL; - entry = rb_entry(n, struct btrfs_free_space, offset_index); - if (entry->offset != offset) - return NULL; - - WARN_ON(!entry->bitmap); - return entry; - } else if (entry) { - if (entry->bitmap) { - /* - * if previous extent entry covers the offset, - * we should return it instead of the bitmap entry - */ - n = &entry->offset_index; - while (1) { - n = rb_prev(n); - if (!n) - break; - prev = rb_entry(n, struct btrfs_free_space, - offset_index); - if (!prev->bitmap) { - if (prev->offset + prev->bytes > offset) - entry = prev; - break; - } - } - } - return entry; - } - - if (!prev) - return NULL; - - /* find last entry before the 'offset' */ - entry = prev; - if (entry->offset > offset) { - n = rb_prev(&entry->offset_index); - if (n) { - entry = rb_entry(n, struct btrfs_free_space, - offset_index); - BUG_ON(entry->offset > offset); - } else { - if (fuzzy) - return entry; - else - return NULL; - } - } - - if (entry->bitmap) { - n = &entry->offset_index; - while (1) { - n = rb_prev(n); - if (!n) - break; - prev = rb_entry(n, struct btrfs_free_space, - offset_index); - if (!prev->bitmap) { - if (prev->offset + prev->bytes > offset) - return prev; - break; - } - } - if (entry->offset + BITS_PER_BITMAP * ctl->unit > offset) - return entry; - } else if (entry->offset + entry->bytes > offset) - return entry; - - if (!fuzzy) - return NULL; - - while (1) { - if (entry->bitmap) { - if (entry->offset + BITS_PER_BITMAP * - ctl->unit > offset) - break; - } else { - if (entry->offset + entry->bytes > offset) - break; - } - - n = rb_next(&entry->offset_index); - if (!n) - return NULL; - entry = rb_entry(n, struct btrfs_free_space, offset_index); - } - return entry; -} - -static inline void -__unlink_free_space(struct btrfs_free_space_ctl *ctl, - struct btrfs_free_space *info) -{ - rb_erase(&info->offset_index, &ctl->free_space_offset); - ctl->free_extents--; -} - -static void unlink_free_space(struct btrfs_free_space_ctl *ctl, - struct btrfs_free_space *info) -{ - __unlink_free_space(ctl, info); - ctl->free_space -= info->bytes; -} - -static int link_free_space(struct btrfs_free_space_ctl *ctl, - struct btrfs_free_space *info) -{ - int ret = 0; - - BUG_ON(!info->bitmap && !info->bytes); - ret = 
tree_insert_offset(&ctl->free_space_offset, info->offset, - &info->offset_index, (info->bitmap != NULL)); - if (ret) - return ret; - - ctl->free_space += info->bytes; - ctl->free_extents++; - return ret; -} - -static void recalculate_thresholds(struct btrfs_free_space_ctl *ctl) -{ - struct btrfs_block_group_cache *block_group = ctl->private; - u64 max_bytes; - u64 bitmap_bytes; - u64 extent_bytes; - u64 size = block_group->key.offset; - u64 bytes_per_bg = BITS_PER_BITMAP * block_group->sectorsize; - int max_bitmaps = div64_u64(size + bytes_per_bg - 1, bytes_per_bg); - - BUG_ON(ctl->total_bitmaps > max_bitmaps); - - /* - * The goal is to keep the total amount of memory used per 1gb of space - * at or below 32k, so we need to adjust how much memory we allow to be - * used by extent based free space tracking - */ - if (size < 1024 * 1024 * 1024) - max_bytes = MAX_CACHE_BYTES_PER_GIG; - else - max_bytes = MAX_CACHE_BYTES_PER_GIG * - div64_u64(size, 1024 * 1024 * 1024); - - /* - * we want to account for 1 more bitmap than what we have so we can make - * sure we don't go over our overall goal of MAX_CACHE_BYTES_PER_GIG as - * we add more bitmaps. - */ - bitmap_bytes = (ctl->total_bitmaps + 1) * PAGE_CACHE_SIZE; - - if (bitmap_bytes >= max_bytes) { - ctl->extents_thresh = 0; - return; - } - - /* - * we want the extent entry threshold to always be at most 1/2 the maxw - * bytes we can have, or whatever is less than that. - */ - extent_bytes = max_bytes - bitmap_bytes; - extent_bytes = min_t(u64, extent_bytes, div64_u64(max_bytes, 2)); - - ctl->extents_thresh = - div64_u64(extent_bytes, (sizeof(struct btrfs_free_space))); -} - -static inline void __bitmap_clear_bits(struct btrfs_free_space_ctl *ctl, - struct btrfs_free_space *info, - u64 offset, u64 bytes) -{ - unsigned long start, count; - - start = offset_to_bit(info->offset, ctl->unit, offset); - count = bytes_to_bits(bytes, ctl->unit); - BUG_ON(start + count > BITS_PER_BITMAP); - - bitmap_clear(info->bitmap, start, count); - - info->bytes -= bytes; -} - -static void bitmap_clear_bits(struct btrfs_free_space_ctl *ctl, - struct btrfs_free_space *info, u64 offset, - u64 bytes) -{ - __bitmap_clear_bits(ctl, info, offset, bytes); - ctl->free_space -= bytes; -} - -static void bitmap_set_bits(struct btrfs_free_space_ctl *ctl, - struct btrfs_free_space *info, u64 offset, - u64 bytes) -{ - unsigned long start, count; - - start = offset_to_bit(info->offset, ctl->unit, offset); - count = bytes_to_bits(bytes, ctl->unit); - BUG_ON(start + count > BITS_PER_BITMAP); - - bitmap_set(info->bitmap, start, count); - - info->bytes += bytes; - ctl->free_space += bytes; -} - -static int search_bitmap(struct btrfs_free_space_ctl *ctl, - struct btrfs_free_space *bitmap_info, u64 *offset, - u64 *bytes) -{ - unsigned long found_bits = 0; - unsigned long bits, i; - unsigned long next_zero; - - i = offset_to_bit(bitmap_info->offset, ctl->unit, - max_t(u64, *offset, bitmap_info->offset)); - bits = bytes_to_bits(*bytes, ctl->unit); - - for (i = find_next_bit(bitmap_info->bitmap, BITS_PER_BITMAP, i); - i < BITS_PER_BITMAP; - i = find_next_bit(bitmap_info->bitmap, BITS_PER_BITMAP, i + 1)) { - next_zero = find_next_zero_bit(bitmap_info->bitmap, - BITS_PER_BITMAP, i); - if ((next_zero - i) >= bits) { - found_bits = next_zero - i; - break; - } - i = next_zero; - } - - if (found_bits) { - *offset = (u64)(i * ctl->unit) + bitmap_info->offset; - *bytes = (u64)(found_bits) * ctl->unit; - return 0; - } - - return -1; -} - -static struct btrfs_free_space * -find_free_space(struct 
btrfs_free_space_ctl *ctl, u64 *offset, u64 *bytes) -{ - struct btrfs_free_space *entry; - struct rb_node *node; - int ret; - - if (!ctl->free_space_offset.rb_node) - return NULL; - - entry = tree_search_offset(ctl, offset_to_bitmap(ctl, *offset), 0, 1); - if (!entry) - return NULL; - - for (node = &entry->offset_index; node; node = rb_next(node)) { - entry = rb_entry(node, struct btrfs_free_space, offset_index); - if (entry->bytes < *bytes) - continue; - - if (entry->bitmap) { - ret = search_bitmap(ctl, entry, offset, bytes); - if (!ret) - return entry; - continue; - } - - *offset = entry->offset; - *bytes = entry->bytes; - return entry; - } - - return NULL; -} - -static void add_new_bitmap(struct btrfs_free_space_ctl *ctl, - struct btrfs_free_space *info, u64 offset) -{ - info->offset = offset_to_bitmap(ctl, offset); - info->bytes = 0; - INIT_LIST_HEAD(&info->list); - link_free_space(ctl, info); - ctl->total_bitmaps++; - - ctl->op->recalc_thresholds(ctl); -} - -static void free_bitmap(struct btrfs_free_space_ctl *ctl, - struct btrfs_free_space *bitmap_info) -{ - unlink_free_space(ctl, bitmap_info); - kfree(bitmap_info->bitmap); - kmem_cache_free(btrfs_free_space_cachep, bitmap_info); - ctl->total_bitmaps--; - ctl->op->recalc_thresholds(ctl); -} - -static noinline int remove_from_bitmap(struct btrfs_free_space_ctl *ctl, - struct btrfs_free_space *bitmap_info, - u64 *offset, u64 *bytes) -{ - u64 end; - u64 search_start, search_bytes; - int ret; - -again: - end = bitmap_info->offset + (u64)(BITS_PER_BITMAP * ctl->unit) - 1; - - /* - * XXX - this can go away after a few releases. - * - * since the only user of btrfs_remove_free_space is the tree logging - * stuff, and the only way to test that is under crash conditions, we - * want to have this debug stuff here just in case somethings not - * working. Search the bitmap for the space we are trying to use to - * make sure its actually there. If its not there then we need to stop - * because something has gone wrong. - */ - search_start = *offset; - search_bytes = *bytes; - search_bytes = min(search_bytes, end - search_start + 1); - ret = search_bitmap(ctl, bitmap_info, &search_start, &search_bytes); - BUG_ON(ret < 0 || search_start != *offset); - - if (*offset > bitmap_info->offset && *offset + *bytes > end) { - bitmap_clear_bits(ctl, bitmap_info, *offset, end - *offset + 1); - *bytes -= end - *offset + 1; - *offset = end + 1; - } else if (*offset >= bitmap_info->offset && *offset + *bytes <= end) { - bitmap_clear_bits(ctl, bitmap_info, *offset, *bytes); - *bytes = 0; - } - - if (*bytes) { - struct rb_node *next = rb_next(&bitmap_info->offset_index); - if (!bitmap_info->bytes) - free_bitmap(ctl, bitmap_info); - - /* - * no entry after this bitmap, but we still have bytes to - * remove, so something has gone wrong. - */ - if (!next) - return -EINVAL; - - bitmap_info = rb_entry(next, struct btrfs_free_space, - offset_index); - - /* - * if the next entry isn't a bitmap we need to return to let the - * extent stuff do its work. - */ - if (!bitmap_info->bitmap) - return -EAGAIN; - - /* - * Ok the next item is a bitmap, but it may not actually hold - * the information for the rest of this free space stuff, so - * look for it, and if we don't find it return so we can try - * everything over again. 
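Some worked numbers for the thresholds recalculate_thresholds() derives above, assuming 4 KiB pages and sectors and a block group of at most 1 GiB; the in-memory entry size used here is an assumption for illustration only:

#include <stdio.h>

#define PAGE_CACHE_SIZE         4096UL          /* assumed 4 KiB pages   */
#define SECTORSIZE              4096UL          /* assumed 4 KiB sectors */
#define BITS_PER_BITMAP         (PAGE_CACHE_SIZE * 8)
#define MAX_CACHE_BYTES_PER_GIG (32UL * 1024)

int main(void)
{
        unsigned long entry_size = 64;     /* assumed in-memory entry size   */
        unsigned long total_bitmaps = 1;   /* bitmaps already in use         */
        unsigned long per_bitmap = BITS_PER_BITMAP * SECTORSIZE;
        unsigned long max_bytes = MAX_CACHE_BYTES_PER_GIG; /* group <= 1 GiB */
        unsigned long bitmap_bytes = (total_bitmaps + 1) * PAGE_CACHE_SIZE;
        unsigned long extent_bytes = max_bytes - bitmap_bytes;

        if (extent_bytes > max_bytes / 2)
                extent_bytes = max_bytes / 2;

        printf("one bitmap covers %lu MiB; extent entry threshold ~%lu\n",
               per_bitmap >> 20, extent_bytes / entry_size);
        return 0;
}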
- */ - search_start = *offset; - search_bytes = *bytes; - ret = search_bitmap(ctl, bitmap_info, &search_start, - &search_bytes); - if (ret < 0 || search_start != *offset) - return -EAGAIN; - - goto again; - } else if (!bitmap_info->bytes) - free_bitmap(ctl, bitmap_info); - - return 0; -} - -static u64 add_bytes_to_bitmap(struct btrfs_free_space_ctl *ctl, - struct btrfs_free_space *info, u64 offset, - u64 bytes) -{ - u64 bytes_to_set = 0; - u64 end; - - end = info->offset + (u64)(BITS_PER_BITMAP * ctl->unit); - - bytes_to_set = min(end - offset, bytes); - - bitmap_set_bits(ctl, info, offset, bytes_to_set); - - return bytes_to_set; - -} - -static bool use_bitmap(struct btrfs_free_space_ctl *ctl, - struct btrfs_free_space *info) -{ - struct btrfs_block_group_cache *block_group = ctl->private; - - /* - * If we are below the extents threshold then we can add this as an - * extent, and don't have to deal with the bitmap - */ - if (ctl->free_extents < ctl->extents_thresh) { - /* - * If this block group has some small extents we don't want to - * use up all of our free slots in the cache with them, we want - * to reserve them to larger extents, however if we have plent - * of cache left then go ahead an dadd them, no sense in adding - * the overhead of a bitmap if we don't have to. - */ - if (info->bytes <= block_group->sectorsize * 4) { - if (ctl->free_extents * 2 <= ctl->extents_thresh) - return false; - } else { - return false; - } - } - - /* - * some block groups are so tiny they can't be enveloped by a bitmap, so - * don't even bother to create a bitmap for this - */ - if (BITS_PER_BITMAP * block_group->sectorsize > - block_group->key.offset) - return false; - - return true; -} - -static struct btrfs_free_space_op free_space_op = { - .recalc_thresholds = recalculate_thresholds, - .use_bitmap = use_bitmap, -}; - -static int insert_into_bitmap(struct btrfs_free_space_ctl *ctl, - struct btrfs_free_space *info) -{ - struct btrfs_free_space *bitmap_info; - struct btrfs_block_group_cache *block_group = NULL; - int added = 0; - u64 bytes, offset, bytes_added; - int ret; - - bytes = info->bytes; - offset = info->offset; - - if (!ctl->op->use_bitmap(ctl, info)) - return 0; - - if (ctl->op == &free_space_op) - block_group = ctl->private; -again: - /* - * Since we link bitmaps right into the cluster we need to see if we - * have a cluster here, and if so and it has our bitmap we need to add - * the free space to that bitmap. 
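The use_bitmap() policy above reduces to a couple of comparisons; a standalone sketch with an assumed 4 KiB sector size: small ranges stay as plain extents while extent slots are plentiful, larger ranges always stay as extents while under the threshold, and block groups smaller than one bitmap's coverage never use bitmaps.

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define SECTORSIZE      4096ULL                 /* assumed sector size */
#define BITS_PER_BITMAP (4096ULL * 8)

static bool should_use_bitmap(uint64_t free_extents, uint64_t extents_thresh,
                              uint64_t info_bytes, uint64_t bg_bytes)
{
        if (free_extents < extents_thresh) {
                /* plenty of extent slots left: keep small ranges as extents */
                if (info_bytes <= SECTORSIZE * 4 &&
                    free_extents * 2 <= extents_thresh)
                        return false;
                /* larger ranges stay as extents while under the threshold */
                if (info_bytes > SECTORSIZE * 4)
                        return false;
        }
        /* a group smaller than one bitmap's coverage never uses bitmaps */
        if (BITS_PER_BITMAP * SECTORSIZE > bg_bytes)
                return false;
        return true;
}

int main(void)
{
        printf("%d\n", should_use_bitmap(10, 100, 8192, 1ULL << 30));   /* 0 */
        printf("%d\n", should_use_bitmap(90, 100, 8192, 1ULL << 30));   /* 1 */
        return 0;
}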
- */ - if (block_group && !list_empty(&block_group->cluster_list)) { - struct btrfs_free_cluster *cluster; - struct rb_node *node; - struct btrfs_free_space *entry; - - cluster = list_entry(block_group->cluster_list.next, - struct btrfs_free_cluster, - block_group_list); - spin_lock(&cluster->lock); - node = rb_first(&cluster->root); - if (!node) { - spin_unlock(&cluster->lock); - goto no_cluster_bitmap; - } - - entry = rb_entry(node, struct btrfs_free_space, offset_index); - if (!entry->bitmap) { - spin_unlock(&cluster->lock); - goto no_cluster_bitmap; - } - - if (entry->offset == offset_to_bitmap(ctl, offset)) { - bytes_added = add_bytes_to_bitmap(ctl, entry, - offset, bytes); - bytes -= bytes_added; - offset += bytes_added; - } - spin_unlock(&cluster->lock); - if (!bytes) { - ret = 1; - goto out; - } - } - -no_cluster_bitmap: - bitmap_info = tree_search_offset(ctl, offset_to_bitmap(ctl, offset), - 1, 0); - if (!bitmap_info) { - BUG_ON(added); - goto new_bitmap; - } - - bytes_added = add_bytes_to_bitmap(ctl, bitmap_info, offset, bytes); - bytes -= bytes_added; - offset += bytes_added; - added = 0; - - if (!bytes) { - ret = 1; - goto out; - } else - goto again; - -new_bitmap: - if (info && info->bitmap) { - add_new_bitmap(ctl, info, offset); - added = 1; - info = NULL; - goto again; - } else { - spin_unlock(&ctl->tree_lock); - - /* no pre-allocated info, allocate a new one */ - if (!info) { - info = kmem_cache_zalloc(btrfs_free_space_cachep, - GFP_NOFS); - if (!info) { - spin_lock(&ctl->tree_lock); - ret = -ENOMEM; - goto out; - } - } - - /* allocate the bitmap */ - info->bitmap = kzalloc(PAGE_CACHE_SIZE, GFP_NOFS); - spin_lock(&ctl->tree_lock); - if (!info->bitmap) { - ret = -ENOMEM; - goto out; - } - goto again; - } - -out: - if (info) { - if (info->bitmap) - kfree(info->bitmap); - kmem_cache_free(btrfs_free_space_cachep, info); - } - - return ret; -} - -static bool try_merge_free_space(struct btrfs_free_space_ctl *ctl, - struct btrfs_free_space *info, bool update_stat) -{ - struct btrfs_free_space *left_info; - struct btrfs_free_space *right_info; - bool merged = false; - u64 offset = info->offset; - u64 bytes = info->bytes; - - /* - * first we want to see if there is free space adjacent to the range we - * are adding, if there is remove that struct and add a new one to - * cover the entire range - */ - right_info = tree_search_offset(ctl, offset + bytes, 0, 0); - if (right_info && rb_prev(&right_info->offset_index)) - left_info = rb_entry(rb_prev(&right_info->offset_index), - struct btrfs_free_space, offset_index); - else - left_info = tree_search_offset(ctl, offset - 1, 0, 0); - - if (right_info && !right_info->bitmap) { - if (update_stat) - unlink_free_space(ctl, right_info); - else - __unlink_free_space(ctl, right_info); - info->bytes += right_info->bytes; - kmem_cache_free(btrfs_free_space_cachep, right_info); - merged = true; - } - - if (left_info && !left_info->bitmap && - left_info->offset + left_info->bytes == offset) { - if (update_stat) - unlink_free_space(ctl, left_info); - else - __unlink_free_space(ctl, left_info); - info->offset = left_info->offset; - info->bytes += left_info->bytes; - kmem_cache_free(btrfs_free_space_cachep, left_info); - merged = true; - } - - return merged; -} - -int __btrfs_add_free_space(struct btrfs_free_space_ctl *ctl, - u64 offset, u64 bytes) -{ - struct btrfs_free_space *info; - int ret = 0; - - info = kmem_cache_zalloc(btrfs_free_space_cachep, GFP_NOFS); - if (!info) - return -ENOMEM; - - info->offset = offset; - info->bytes = bytes; - - 
spin_lock(&ctl->tree_lock); - - if (try_merge_free_space(ctl, info, true)) - goto link; - - /* - * There was no extent directly to the left or right of this new - * extent then we know we're going to have to allocate a new extent, so - * before we do that see if we need to drop this into a bitmap - */ - ret = insert_into_bitmap(ctl, info); - if (ret < 0) { - goto out; - } else if (ret) { - ret = 0; - goto out; - } -link: - ret = link_free_space(ctl, info); - if (ret) - kmem_cache_free(btrfs_free_space_cachep, info); -out: - spin_unlock(&ctl->tree_lock); - - if (ret) { - printk(KERN_CRIT "btrfs: unable to add free space :%d\n", ret); - BUG_ON(ret == -EEXIST); - } - - return ret; -} - -int btrfs_remove_free_space(struct btrfs_block_group_cache *block_group, - u64 offset, u64 bytes) -{ - struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; - struct btrfs_free_space *info; - struct btrfs_free_space *next_info = NULL; - int ret = 0; - - spin_lock(&ctl->tree_lock); - -again: - info = tree_search_offset(ctl, offset, 0, 0); - if (!info) { - /* - * oops didn't find an extent that matched the space we wanted - * to remove, look for a bitmap instead - */ - info = tree_search_offset(ctl, offset_to_bitmap(ctl, offset), - 1, 0); - if (!info) { - /* the tree logging code might be calling us before we - * have fully loaded the free space rbtree for this - * block group. So it is possible the entry won't - * be in the rbtree yet at all. The caching code - * will make sure not to put it in the rbtree if - * the logging code has pinned it. - */ - goto out_lock; - } - } - - if (info->bytes < bytes && rb_next(&info->offset_index)) { - u64 end; - next_info = rb_entry(rb_next(&info->offset_index), - struct btrfs_free_space, - offset_index); - - if (next_info->bitmap) - end = next_info->offset + - BITS_PER_BITMAP * ctl->unit - 1; - else - end = next_info->offset + next_info->bytes; - - if (next_info->bytes < bytes || - next_info->offset > offset || offset > end) { - printk(KERN_CRIT "Found free space at %llu, size %llu," - " trying to use %llu\n", - (unsigned long long)info->offset, - (unsigned long long)info->bytes, - (unsigned long long)bytes); - WARN_ON(1); - ret = -EINVAL; - goto out_lock; - } - - info = next_info; - } - - if (info->bytes == bytes) { - unlink_free_space(ctl, info); - if (info->bitmap) { - kfree(info->bitmap); - ctl->total_bitmaps--; - } - kmem_cache_free(btrfs_free_space_cachep, info); - ret = 0; - goto out_lock; - } - - if (!info->bitmap && info->offset == offset) { - unlink_free_space(ctl, info); - info->offset += bytes; - info->bytes -= bytes; - ret = link_free_space(ctl, info); - WARN_ON(ret); - goto out_lock; - } - - if (!info->bitmap && info->offset <= offset && - info->offset + info->bytes >= offset + bytes) { - u64 old_start = info->offset; - /* - * we're freeing space in the middle of the info, - * this can happen during tree log replay - * - * first unlink the old info and then - * insert it again after the hole we're creating - */ - unlink_free_space(ctl, info); - if (offset + bytes < info->offset + info->bytes) { - u64 old_end = info->offset + info->bytes; - - info->offset = offset + bytes; - info->bytes = old_end - info->offset; - ret = link_free_space(ctl, info); - WARN_ON(ret); - if (ret) - goto out_lock; - } else { - /* the hole we're creating ends at the end - * of the info struct, just free the info - */ - kmem_cache_free(btrfs_free_space_cachep, info); - } - spin_unlock(&ctl->tree_lock); - - /* step two, insert a new info struct to cover - * anything before the 
hole - */ - ret = btrfs_add_free_space(block_group, old_start, - offset - old_start); - WARN_ON(ret); /* -ENOMEM */ - goto out; - } - - ret = remove_from_bitmap(ctl, info, &offset, &bytes); - if (ret == -EAGAIN) - goto again; - BUG_ON(ret); /* logic error */ -out_lock: - spin_unlock(&ctl->tree_lock); -out: - return ret; -} - -void btrfs_dump_free_space(struct btrfs_block_group_cache *block_group, - u64 bytes) -{ - struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; - struct btrfs_free_space *info; - struct rb_node *n; - int count = 0; - - for (n = rb_first(&ctl->free_space_offset); n; n = rb_next(n)) { - info = rb_entry(n, struct btrfs_free_space, offset_index); - if (info->bytes >= bytes) - count++; - printk(KERN_CRIT "entry offset %llu, bytes %llu, bitmap %s\n", - (unsigned long long)info->offset, - (unsigned long long)info->bytes, - (info->bitmap) ? "yes" : "no"); - } - printk(KERN_INFO "block group has cluster?: %s\n", - list_empty(&block_group->cluster_list) ? "no" : "yes"); - printk(KERN_INFO "%d blocks of free space at or bigger than bytes is" - "\n", count); -} - -void btrfs_init_free_space_ctl(struct btrfs_block_group_cache *block_group) -{ - struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; - - spin_lock_init(&ctl->tree_lock); - ctl->unit = block_group->sectorsize; - ctl->start = block_group->key.objectid; - ctl->private = block_group; - ctl->op = &free_space_op; - - /* - * we only want to have 32k of ram per block group for keeping - * track of free space, and if we pass 1/2 of that we want to - * start converting things over to using bitmaps - */ - ctl->extents_thresh = ((1024 * 32) / 2) / - sizeof(struct btrfs_free_space); -} - -/* - * for a given cluster, put all of its extents back into the free - * space cache. If the block group passed doesn't match the block group - * pointed to by the cluster, someone else raced in and freed the - * cluster already. 
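btrfs_init_free_space_ctl() above budgets roughly 32 KiB of extent entries per block group and starts converting to bitmaps once half of that is used. The standalone check below reproduces that arithmetic with a mirror of struct btrfs_free_space (defined later in this diff, in free-space-cache.h); the member stand-ins are a 64-bit size estimate, so the resulting count is illustrative rather than exact.

#include <stddef.h>
#include <stdio.h>

/* Rough mirror of struct btrfs_free_space: plain pointers stand in for the
 * kernel rb_node and list_head members, which gives a comparable size on a
 * typical 64-bit build, but this is only an estimate. */
struct free_space_entry {
    void *rb_left, *rb_right, *rb_parent;     /* stand-in for rb_node   */
    unsigned long long offset;
    unsigned long long bytes;
    unsigned long *bitmap;
    void *list_next, *list_prev;              /* stand-in for list_head */
};

int main(void)
{
    size_t thresh = ((1024 * 32) / 2) / sizeof(struct free_space_entry);

    printf("entry size: %zu bytes, extents_thresh: %zu entries\n",
           sizeof(struct free_space_entry), thresh);
    return 0;
}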
In that case, we just return without changing anything - */ -static int -__btrfs_return_cluster_to_free_space( - struct btrfs_block_group_cache *block_group, - struct btrfs_free_cluster *cluster) -{ - struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; - struct btrfs_free_space *entry; - struct rb_node *node; - - spin_lock(&cluster->lock); - if (cluster->block_group != block_group) - goto out; - - cluster->block_group = NULL; - cluster->window_start = 0; - list_del_init(&cluster->block_group_list); - - node = rb_first(&cluster->root); - while (node) { - bool bitmap; - - entry = rb_entry(node, struct btrfs_free_space, offset_index); - node = rb_next(&entry->offset_index); - rb_erase(&entry->offset_index, &cluster->root); - - bitmap = (entry->bitmap != NULL); - if (!bitmap) - try_merge_free_space(ctl, entry, false); - tree_insert_offset(&ctl->free_space_offset, - entry->offset, &entry->offset_index, bitmap); - } - cluster->root = RB_ROOT; - -out: - spin_unlock(&cluster->lock); - btrfs_put_block_group(block_group); - return 0; -} - -void __btrfs_remove_free_space_cache_locked(struct btrfs_free_space_ctl *ctl) -{ - struct btrfs_free_space *info; - struct rb_node *node; - - while ((node = rb_last(&ctl->free_space_offset)) != NULL) { - info = rb_entry(node, struct btrfs_free_space, offset_index); - if (!info->bitmap) { - unlink_free_space(ctl, info); - kmem_cache_free(btrfs_free_space_cachep, info); - } else { - free_bitmap(ctl, info); - } - if (need_resched()) { - spin_unlock(&ctl->tree_lock); - cond_resched(); - spin_lock(&ctl->tree_lock); - } - } -} - -void __btrfs_remove_free_space_cache(struct btrfs_free_space_ctl *ctl) -{ - spin_lock(&ctl->tree_lock); - __btrfs_remove_free_space_cache_locked(ctl); - spin_unlock(&ctl->tree_lock); -} - -void btrfs_remove_free_space_cache(struct btrfs_block_group_cache *block_group) -{ - struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; - struct btrfs_free_cluster *cluster; - struct list_head *head; - - spin_lock(&ctl->tree_lock); - while ((head = block_group->cluster_list.next) != - &block_group->cluster_list) { - cluster = list_entry(head, struct btrfs_free_cluster, - block_group_list); - - WARN_ON(cluster->block_group != block_group); - __btrfs_return_cluster_to_free_space(block_group, cluster); - if (need_resched()) { - spin_unlock(&ctl->tree_lock); - cond_resched(); - spin_lock(&ctl->tree_lock); - } - } - __btrfs_remove_free_space_cache_locked(ctl); - spin_unlock(&ctl->tree_lock); - -} - -u64 btrfs_find_space_for_alloc(struct btrfs_block_group_cache *block_group, - u64 offset, u64 bytes, u64 empty_size) -{ - struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; - struct btrfs_free_space *entry = NULL; - u64 bytes_search = bytes + empty_size; - u64 ret = 0; - - spin_lock(&ctl->tree_lock); - entry = find_free_space(ctl, &offset, &bytes_search); - if (!entry) - goto out; - - ret = offset; - if (entry->bitmap) { - bitmap_clear_bits(ctl, entry, offset, bytes); - if (!entry->bytes) - free_bitmap(ctl, entry); - } else { - unlink_free_space(ctl, entry); - entry->offset += bytes; - entry->bytes -= bytes; - if (!entry->bytes) - kmem_cache_free(btrfs_free_space_cachep, entry); - else - link_free_space(ctl, entry); - } - -out: - spin_unlock(&ctl->tree_lock); - - return ret; -} - -/* - * given a cluster, put all of its extents back into the free space - * cache. If a block group is passed, this function will only free - * a cluster that belongs to the passed block group. 
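btrfs_find_space_for_alloc() above consumes a free-space entry in one of two ways: a plain extent is trimmed from the front (offset advances, bytes shrink), while a bitmap has the matching bits cleared. The sketch below models both paths in userspace; the 4 KiB unit, the toy structures, and the example values are assumptions, not the kernel types.

#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define UNIT 4096ULL   /* assumed sector size: one bitmap bit per 4 KiB */

struct toy_extent { uint64_t offset, bytes; };

/* Extent case: the allocation is carved off the front of the entry. */
static uint64_t alloc_from_extent(struct toy_extent *e, uint64_t bytes)
{
    uint64_t ret = e->offset;
    e->offset += bytes;
    e->bytes  -= bytes;
    return ret;
}

/* Bitmap case: the bits covering [offset, offset + bytes) are cleared. */
static void clear_bits(unsigned char *bitmap, uint64_t base,
                       uint64_t offset, uint64_t bytes)
{
    uint64_t start = (offset - base) / UNIT;
    uint64_t count = bytes / UNIT;

    for (uint64_t i = start; i < start + count; i++)
        bitmap[i / 8] &= ~(1u << (i % 8));
}

int main(void)
{
    struct toy_extent e = { .offset = 1 << 20, .bytes = 1 << 16 };
    uint64_t got = alloc_from_extent(&e, 8192);

    printf("allocated at %llu, %llu bytes left in entry\n",
           (unsigned long long)got, (unsigned long long)e.bytes);

    unsigned char bitmap[16];
    memset(bitmap, 0xff, sizeof(bitmap));           /* all units free   */
    clear_bits(bitmap, 0, 2 * UNIT, 3 * UNIT);      /* take three units */
    printf("first bitmap byte after clear: 0x%02x\n", bitmap[0]);
    return 0;
}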
- * - * Otherwise, it'll get a reference on the block group pointed to by the - * cluster and remove the cluster from it. - */ -int btrfs_return_cluster_to_free_space( - struct btrfs_block_group_cache *block_group, - struct btrfs_free_cluster *cluster) -{ - struct btrfs_free_space_ctl *ctl; - int ret; - - /* first, get a safe pointer to the block group */ - spin_lock(&cluster->lock); - if (!block_group) { - block_group = cluster->block_group; - if (!block_group) { - spin_unlock(&cluster->lock); - return 0; - } - } else if (cluster->block_group != block_group) { - /* someone else has already freed it don't redo their work */ - spin_unlock(&cluster->lock); - return 0; - } - atomic_inc(&block_group->count); - spin_unlock(&cluster->lock); - - ctl = block_group->free_space_ctl; - - /* now return any extents the cluster had on it */ - spin_lock(&ctl->tree_lock); - ret = __btrfs_return_cluster_to_free_space(block_group, cluster); - spin_unlock(&ctl->tree_lock); - - /* finally drop our ref */ - btrfs_put_block_group(block_group); - return ret; -} - -static u64 btrfs_alloc_from_bitmap(struct btrfs_block_group_cache *block_group, - struct btrfs_free_cluster *cluster, - struct btrfs_free_space *entry, - u64 bytes, u64 min_start) -{ - struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; - int err; - u64 search_start = cluster->window_start; - u64 search_bytes = bytes; - u64 ret = 0; - - search_start = min_start; - search_bytes = bytes; - - err = search_bitmap(ctl, entry, &search_start, &search_bytes); - if (err) - return 0; - - ret = search_start; - __bitmap_clear_bits(ctl, entry, ret, bytes); - - return ret; -} - -/* - * given a cluster, try to allocate 'bytes' from it, returns 0 - * if it couldn't find anything suitably large, or a logical disk offset - * if things worked out - */ -u64 btrfs_alloc_from_cluster(struct btrfs_block_group_cache *block_group, - struct btrfs_free_cluster *cluster, u64 bytes, - u64 min_start) -{ - struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; - struct btrfs_free_space *entry = NULL; - struct rb_node *node; - u64 ret = 0; - - spin_lock(&cluster->lock); - if (bytes > cluster->max_size) - goto out; - - if (cluster->block_group != block_group) - goto out; - - node = rb_first(&cluster->root); - if (!node) - goto out; - - entry = rb_entry(node, struct btrfs_free_space, offset_index); - while(1) { - if (entry->bytes < bytes || - (!entry->bitmap && entry->offset < min_start)) { - node = rb_next(&entry->offset_index); - if (!node) - break; - entry = rb_entry(node, struct btrfs_free_space, - offset_index); - continue; - } - - if (entry->bitmap) { - ret = btrfs_alloc_from_bitmap(block_group, - cluster, entry, bytes, - cluster->window_start); - if (ret == 0) { - node = rb_next(&entry->offset_index); - if (!node) - break; - entry = rb_entry(node, struct btrfs_free_space, - offset_index); - continue; - } - cluster->window_start += bytes; - } else { - ret = entry->offset; - - entry->offset += bytes; - entry->bytes -= bytes; - } - - if (entry->bytes == 0) - rb_erase(&entry->offset_index, &cluster->root); - break; - } -out: - spin_unlock(&cluster->lock); - - if (!ret) - return 0; - - spin_lock(&ctl->tree_lock); - - ctl->free_space -= bytes; - if (entry->bytes == 0) { - ctl->free_extents--; - if (entry->bitmap) { - kfree(entry->bitmap); - ctl->total_bitmaps--; - ctl->op->recalc_thresholds(ctl); - } - kmem_cache_free(btrfs_free_space_cachep, entry); - } - - spin_unlock(&ctl->tree_lock); - - return ret; -} - -static int btrfs_bitmap_cluster(struct 
btrfs_block_group_cache *block_group, - struct btrfs_free_space *entry, - struct btrfs_free_cluster *cluster, - u64 offset, u64 bytes, - u64 cont1_bytes, u64 min_bytes) -{ - struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; - unsigned long next_zero; - unsigned long i; - unsigned long want_bits; - unsigned long min_bits; - unsigned long found_bits; - unsigned long start = 0; - unsigned long total_found = 0; - int ret; - - i = offset_to_bit(entry->offset, block_group->sectorsize, - max_t(u64, offset, entry->offset)); - want_bits = bytes_to_bits(bytes, block_group->sectorsize); - min_bits = bytes_to_bits(min_bytes, block_group->sectorsize); - -again: - found_bits = 0; - for (i = find_next_bit(entry->bitmap, BITS_PER_BITMAP, i); - i < BITS_PER_BITMAP; - i = find_next_bit(entry->bitmap, BITS_PER_BITMAP, i + 1)) { - next_zero = find_next_zero_bit(entry->bitmap, - BITS_PER_BITMAP, i); - if (next_zero - i >= min_bits) { - found_bits = next_zero - i; - break; - } - i = next_zero; - } - - if (!found_bits) - return -ENOSPC; - - if (!total_found) { - start = i; - cluster->max_size = 0; - } - - total_found += found_bits; - - if (cluster->max_size < found_bits * block_group->sectorsize) - cluster->max_size = found_bits * block_group->sectorsize; - - if (total_found < want_bits || cluster->max_size < cont1_bytes) { - i = next_zero + 1; - goto again; - } - - cluster->window_start = start * block_group->sectorsize + - entry->offset; - rb_erase(&entry->offset_index, &ctl->free_space_offset); - ret = tree_insert_offset(&cluster->root, entry->offset, - &entry->offset_index, 1); - BUG_ON(ret); /* -EEXIST; Logic error */ - - trace_btrfs_setup_cluster(block_group, cluster, - total_found * block_group->sectorsize, 1); - return 0; -} - -/* - * This searches the block group for just extents to fill the cluster with. - * Try to find a cluster with at least bytes total bytes, at least one - * extent of cont1_bytes, and other clusters of at least min_bytes. - */ -static noinline int -setup_cluster_no_bitmap(struct btrfs_block_group_cache *block_group, - struct btrfs_free_cluster *cluster, - struct list_head *bitmaps, u64 offset, u64 bytes, - u64 cont1_bytes, u64 min_bytes) -{ - struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; - struct btrfs_free_space *first = NULL; - struct btrfs_free_space *entry = NULL; - struct btrfs_free_space *last; - struct rb_node *node; - u64 window_start; - u64 window_free; - u64 max_extent; - u64 total_size = 0; - - entry = tree_search_offset(ctl, offset, 0, 1); - if (!entry) - return -ENOSPC; - - /* - * We don't want bitmaps, so just move along until we find a normal - * extent entry. 
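btrfs_bitmap_cluster() above scans the bitmap for runs of set bits at least min_bits long, remembering the longest run it has seen. The self-contained version below mirrors that find_next_bit/find_next_zero_bit loop over an ordinary byte array; the array size and the example run are assumptions for illustration.

#include <stdio.h>

#define NBITS 64

static int test_bit(const unsigned char *map, int i)
{
    return (map[i / 8] >> (i % 8)) & 1;
}

/* Find the first run of at least min_bits consecutive set bits, mirroring
 * the scan loop in btrfs_bitmap_cluster(). Returns the start bit or -1. */
static int find_run(const unsigned char *map, int nbits, int min_bits,
                    int *run_len)
{
    int i = 0;

    while (i < nbits) {
        while (i < nbits && !test_bit(map, i))
            i++;                                   /* find_next_bit      */
        int next_zero = i;
        while (next_zero < nbits && test_bit(map, next_zero))
            next_zero++;                           /* find_next_zero_bit */
        if (next_zero - i >= min_bits) {
            *run_len = next_zero - i;
            return i;
        }
        i = next_zero;
    }
    return -1;                                     /* -ENOSPC equivalent */
}

int main(void)
{
    unsigned char map[NBITS / 8] = { 0 };

    for (int i = 10; i < 25; i++)                  /* 15 free units at bit 10 */
        map[i / 8] |= 1u << (i % 8);

    int len = 0;
    int start = find_run(map, NBITS, 8, &len);
    printf("run of %d bits starting at bit %d\n", len, start);
    return 0;
}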
- */ - while (entry->bitmap || entry->bytes < min_bytes) { - if (entry->bitmap && list_empty(&entry->list)) - list_add_tail(&entry->list, bitmaps); - node = rb_next(&entry->offset_index); - if (!node) - return -ENOSPC; - entry = rb_entry(node, struct btrfs_free_space, offset_index); - } - - window_start = entry->offset; - window_free = entry->bytes; - max_extent = entry->bytes; - first = entry; - last = entry; - - for (node = rb_next(&entry->offset_index); node; - node = rb_next(&entry->offset_index)) { - entry = rb_entry(node, struct btrfs_free_space, offset_index); - - if (entry->bitmap) { - if (list_empty(&entry->list)) - list_add_tail(&entry->list, bitmaps); - continue; - } - - if (entry->bytes < min_bytes) - continue; - - last = entry; - window_free += entry->bytes; - if (entry->bytes > max_extent) - max_extent = entry->bytes; - } - - if (window_free < bytes || max_extent < cont1_bytes) - return -ENOSPC; - - cluster->window_start = first->offset; - - node = &first->offset_index; - - /* - * now we've found our entries, pull them out of the free space - * cache and put them into the cluster rbtree - */ - do { - int ret; - - entry = rb_entry(node, struct btrfs_free_space, offset_index); - node = rb_next(&entry->offset_index); - if (entry->bitmap || entry->bytes < min_bytes) - continue; - - rb_erase(&entry->offset_index, &ctl->free_space_offset); - ret = tree_insert_offset(&cluster->root, entry->offset, - &entry->offset_index, 0); - total_size += entry->bytes; - BUG_ON(ret); /* -EEXIST; Logic error */ - } while (node && entry != last); - - cluster->max_size = max_extent; - trace_btrfs_setup_cluster(block_group, cluster, total_size, 0); - return 0; -} - -/* - * This specifically looks for bitmaps that may work in the cluster, we assume - * that we have already failed to find extents that will work. - */ -static noinline int -setup_cluster_bitmap(struct btrfs_block_group_cache *block_group, - struct btrfs_free_cluster *cluster, - struct list_head *bitmaps, u64 offset, u64 bytes, - u64 cont1_bytes, u64 min_bytes) -{ - struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; - struct btrfs_free_space *entry; - int ret = -ENOSPC; - u64 bitmap_offset = offset_to_bitmap(ctl, offset); - - if (ctl->total_bitmaps == 0) - return -ENOSPC; - - /* - * The bitmap that covers offset won't be in the list unless offset - * is just its start offset. - */ - entry = list_first_entry(bitmaps, struct btrfs_free_space, list); - if (entry->offset != bitmap_offset) { - entry = tree_search_offset(ctl, bitmap_offset, 1, 0); - if (entry && list_empty(&entry->list)) - list_add(&entry->list, bitmaps); - } - - list_for_each_entry(entry, bitmaps, list) { - if (entry->bytes < bytes) - continue; - ret = btrfs_bitmap_cluster(block_group, entry, cluster, offset, - bytes, cont1_bytes, min_bytes); - if (!ret) - return 0; - } - - /* - * The bitmaps list has all the bitmaps that record free space - * starting after offset, so no more search is required. - */ - return -ENOSPC; -} - -/* - * here we try to find a cluster of blocks in a block group. The goal - * is to find at least bytes+empty_size. - * We might not find them all in one contiguous area. 
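setup_cluster_no_bitmap() above accumulates the usable extent entries into a window (window_free) while tracking the largest single extent (max_extent); the cluster succeeds only if the window covers the requested bytes and one extent reaches cont1_bytes. A small model of that accumulation over a list of extent sizes follows; the function name and the sample sizes are illustrative.

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* Model of the window scan in setup_cluster_no_bitmap(): entries smaller
 * than min_bytes are skipped, the rest add to window_free, and the largest
 * one must reach cont1_bytes for the cluster to be usable. */
static bool window_ok(const uint64_t *sizes, int n, uint64_t min_bytes,
                      uint64_t want_bytes, uint64_t cont1_bytes)
{
    uint64_t window_free = 0, max_extent = 0;

    for (int i = 0; i < n; i++) {
        if (sizes[i] < min_bytes)
            continue;
        window_free += sizes[i];
        if (sizes[i] > max_extent)
            max_extent = sizes[i];
    }
    return window_free >= want_bytes && max_extent >= cont1_bytes;
}

int main(void)
{
    uint64_t sizes[] = { 4096, 65536, 8192, 262144, 16384 };

    printf("cluster possible: %d\n",
           window_ok(sizes, 5, 4096, 256 * 1024, 128 * 1024));
    return 0;
}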
- * - * returns zero and sets up cluster if things worked out, otherwise - * it returns -enospc - */ -int btrfs_find_space_cluster(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct btrfs_block_group_cache *block_group, - struct btrfs_free_cluster *cluster, - u64 offset, u64 bytes, u64 empty_size) -{ - struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; - struct btrfs_free_space *entry, *tmp; - LIST_HEAD(bitmaps); - u64 min_bytes; - u64 cont1_bytes; - int ret; - - /* - * Choose the minimum extent size we'll require for this - * cluster. For SSD_SPREAD, don't allow any fragmentation. - * For metadata, allow allocates with smaller extents. For - * data, keep it dense. - */ - if (btrfs_test_opt(root, SSD_SPREAD)) { - cont1_bytes = min_bytes = bytes + empty_size; - } else if (block_group->flags & BTRFS_BLOCK_GROUP_METADATA) { - cont1_bytes = bytes; - min_bytes = block_group->sectorsize; - } else { - cont1_bytes = max(bytes, (bytes + empty_size) >> 2); - min_bytes = block_group->sectorsize; - } - - spin_lock(&ctl->tree_lock); - - /* - * If we know we don't have enough space to make a cluster don't even - * bother doing all the work to try and find one. - */ - if (ctl->free_space < bytes) { - spin_unlock(&ctl->tree_lock); - return -ENOSPC; - } - - spin_lock(&cluster->lock); - - /* someone already found a cluster, hooray */ - if (cluster->block_group) { - ret = 0; - goto out; - } - - trace_btrfs_find_cluster(block_group, offset, bytes, empty_size, - min_bytes); - - INIT_LIST_HEAD(&bitmaps); - ret = setup_cluster_no_bitmap(block_group, cluster, &bitmaps, offset, - bytes + empty_size, - cont1_bytes, min_bytes); - if (ret) - ret = setup_cluster_bitmap(block_group, cluster, &bitmaps, - offset, bytes + empty_size, - cont1_bytes, min_bytes); - - /* Clear our temporary list */ - list_for_each_entry_safe(entry, tmp, &bitmaps, list) - list_del_init(&entry->list); - - if (!ret) { - atomic_inc(&block_group->count); - list_add_tail(&cluster->block_group_list, - &block_group->cluster_list); - cluster->block_group = block_group; - } else { - trace_btrfs_failed_cluster_setup(block_group); - } -out: - spin_unlock(&cluster->lock); - spin_unlock(&ctl->tree_lock); - - return ret; -} - -/* - * simple code to zero out a cluster - */ -void btrfs_init_free_cluster(struct btrfs_free_cluster *cluster) -{ - spin_lock_init(&cluster->lock); - spin_lock_init(&cluster->refill_lock); - cluster->root = RB_ROOT; - cluster->max_size = 0; - INIT_LIST_HEAD(&cluster->block_group_list); - cluster->block_group = NULL; -} - -static int do_trimming(struct btrfs_block_group_cache *block_group, - u64 *total_trimmed, u64 start, u64 bytes, - u64 reserved_start, u64 reserved_bytes) -{ - struct btrfs_space_info *space_info = block_group->space_info; - struct btrfs_fs_info *fs_info = block_group->fs_info; - int ret; - int update = 0; - u64 trimmed = 0; - - spin_lock(&space_info->lock); - spin_lock(&block_group->lock); - if (!block_group->ro) { - block_group->reserved += reserved_bytes; - space_info->bytes_reserved += reserved_bytes; - update = 1; - } - spin_unlock(&block_group->lock); - spin_unlock(&space_info->lock); - - ret = btrfs_error_discard_extent(fs_info->extent_root, - start, bytes, &trimmed); - if (!ret) - *total_trimmed += trimmed; - - btrfs_add_free_space(block_group, reserved_start, reserved_bytes); - - if (update) { - spin_lock(&space_info->lock); - spin_lock(&block_group->lock); - if (block_group->ro) - space_info->bytes_readonly += reserved_bytes; - block_group->reserved -= reserved_bytes; - 
space_info->bytes_reserved -= reserved_bytes; - spin_unlock(&space_info->lock); - spin_unlock(&block_group->lock); - } - - return ret; -} - -static int trim_no_bitmap(struct btrfs_block_group_cache *block_group, - u64 *total_trimmed, u64 start, u64 end, u64 minlen) -{ - struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; - struct btrfs_free_space *entry; - struct rb_node *node; - int ret = 0; - u64 extent_start; - u64 extent_bytes; - u64 bytes; - - while (start < end) { - spin_lock(&ctl->tree_lock); - - if (ctl->free_space < minlen) { - spin_unlock(&ctl->tree_lock); - break; - } - - entry = tree_search_offset(ctl, start, 0, 1); - if (!entry) { - spin_unlock(&ctl->tree_lock); - break; - } - - /* skip bitmaps */ - while (entry->bitmap) { - node = rb_next(&entry->offset_index); - if (!node) { - spin_unlock(&ctl->tree_lock); - goto out; - } - entry = rb_entry(node, struct btrfs_free_space, - offset_index); - } - - if (entry->offset >= end) { - spin_unlock(&ctl->tree_lock); - break; - } - - extent_start = entry->offset; - extent_bytes = entry->bytes; - start = max(start, extent_start); - bytes = min(extent_start + extent_bytes, end) - start; - if (bytes < minlen) { - spin_unlock(&ctl->tree_lock); - goto next; - } - - unlink_free_space(ctl, entry); - kmem_cache_free(btrfs_free_space_cachep, entry); - - spin_unlock(&ctl->tree_lock); - - ret = do_trimming(block_group, total_trimmed, start, bytes, - extent_start, extent_bytes); - if (ret) - break; -next: - start += bytes; - - if (fatal_signal_pending(current)) { - ret = -ERESTARTSYS; - break; - } - - cond_resched(); - } -out: - return ret; -} - -static int trim_bitmaps(struct btrfs_block_group_cache *block_group, - u64 *total_trimmed, u64 start, u64 end, u64 minlen) -{ - struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; - struct btrfs_free_space *entry; - int ret = 0; - int ret2; - u64 bytes; - u64 offset = offset_to_bitmap(ctl, start); - - while (offset < end) { - bool next_bitmap = false; - - spin_lock(&ctl->tree_lock); - - if (ctl->free_space < minlen) { - spin_unlock(&ctl->tree_lock); - break; - } - - entry = tree_search_offset(ctl, offset, 1, 0); - if (!entry) { - spin_unlock(&ctl->tree_lock); - next_bitmap = true; - goto next; - } - - bytes = minlen; - ret2 = search_bitmap(ctl, entry, &start, &bytes); - if (ret2 || start >= end) { - spin_unlock(&ctl->tree_lock); - next_bitmap = true; - goto next; - } - - bytes = min(bytes, end - start); - if (bytes < minlen) { - spin_unlock(&ctl->tree_lock); - goto next; - } - - bitmap_clear_bits(ctl, entry, start, bytes); - if (entry->bytes == 0) - free_bitmap(ctl, entry); - - spin_unlock(&ctl->tree_lock); - - ret = do_trimming(block_group, total_trimmed, start, bytes, - start, bytes); - if (ret) - break; -next: - if (next_bitmap) { - offset += BITS_PER_BITMAP * ctl->unit; - } else { - start += bytes; - if (start >= offset + BITS_PER_BITMAP * ctl->unit) - offset += BITS_PER_BITMAP * ctl->unit; - } - - if (fatal_signal_pending(current)) { - ret = -ERESTARTSYS; - break; - } - - cond_resched(); - } - - return ret; -} - -int btrfs_trim_block_group(struct btrfs_block_group_cache *block_group, - u64 *trimmed, u64 start, u64 end, u64 minlen) -{ - int ret; - - *trimmed = 0; - - ret = trim_no_bitmap(block_group, trimmed, start, end, minlen); - if (ret) - return ret; - - ret = trim_bitmaps(block_group, trimmed, start, end, minlen); - - return ret; -} - -/* - * Find the left-most item in the cache tree, and then return the - * smallest inode number in the item. 
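trim_no_bitmap() above clamps the requested trim range against each free extent and skips pieces smaller than minlen before handing them to do_trimming(). The sketch below isolates just that clamping step; the function name and example geometry are made up for illustration, and the discard itself is left out.

#include <stdint.h>
#include <stdio.h>

/* Intersect the trim request [start, end) with a free extent and apply the
 * minlen filter, as trim_no_bitmap() does before calling do_trimming().
 * Returns the byte count that would be discarded, or 0 if it is skipped. */
static uint64_t trim_piece(uint64_t extent_start, uint64_t extent_bytes,
                           uint64_t start, uint64_t end, uint64_t minlen,
                           uint64_t *trim_start)
{
    uint64_t s = start > extent_start ? start : extent_start;
    uint64_t extent_end = extent_start + extent_bytes;
    uint64_t e = end < extent_end ? end : extent_end;

    if (e <= s || e - s < minlen)
        return 0;
    *trim_start = s;
    return e - s;
}

int main(void)
{
    uint64_t where = 0;
    uint64_t len = trim_piece(1 << 20, 1 << 20,   /* free: 1 MiB at 1 MiB   */
                              0, 3 << 20,         /* trim request: 0..3 MiB */
                              4096, &where);

    printf("would trim %llu bytes at %llu\n",
           (unsigned long long)len, (unsigned long long)where);
    return 0;
}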
- * - * Note: the returned inode number may not be the smallest one in - * the tree, if the left-most item is a bitmap. - */ -u64 btrfs_find_ino_for_alloc(struct btrfs_root *fs_root) -{ - struct btrfs_free_space_ctl *ctl = fs_root->free_ino_ctl; - struct btrfs_free_space *entry = NULL; - u64 ino = 0; - - spin_lock(&ctl->tree_lock); - - if (RB_EMPTY_ROOT(&ctl->free_space_offset)) - goto out; - - entry = rb_entry(rb_first(&ctl->free_space_offset), - struct btrfs_free_space, offset_index); - - if (!entry->bitmap) { - ino = entry->offset; - - unlink_free_space(ctl, entry); - entry->offset++; - entry->bytes--; - if (!entry->bytes) - kmem_cache_free(btrfs_free_space_cachep, entry); - else - link_free_space(ctl, entry); - } else { - u64 offset = 0; - u64 count = 1; - int ret; - - ret = search_bitmap(ctl, entry, &offset, &count); - /* Logic error; Should be empty if it can't find anything */ - BUG_ON(ret); - - ino = offset; - bitmap_clear_bits(ctl, entry, offset, 1); - if (entry->bytes == 0) - free_bitmap(ctl, entry); - } -out: - spin_unlock(&ctl->tree_lock); - - return ino; -} - -struct inode *lookup_free_ino_inode(struct btrfs_root *root, - struct btrfs_path *path) -{ - struct inode *inode = NULL; - - spin_lock(&root->cache_lock); - if (root->cache_inode) - inode = igrab(root->cache_inode); - spin_unlock(&root->cache_lock); - if (inode) - return inode; - - inode = __lookup_free_space_inode(root, path, 0); - if (IS_ERR(inode)) - return inode; - - spin_lock(&root->cache_lock); - if (!btrfs_fs_closing(root->fs_info)) - root->cache_inode = igrab(inode); - spin_unlock(&root->cache_lock); - - return inode; -} - -int create_free_ino_inode(struct btrfs_root *root, - struct btrfs_trans_handle *trans, - struct btrfs_path *path) -{ - return __create_free_space_inode(root, trans, path, - BTRFS_FREE_INO_OBJECTID, 0); -} - -int load_free_ino_cache(struct btrfs_fs_info *fs_info, struct btrfs_root *root) -{ - struct btrfs_free_space_ctl *ctl = root->free_ino_ctl; - struct btrfs_path *path; - struct inode *inode; - int ret = 0; - u64 root_gen = btrfs_root_generation(&root->root_item); - - if (!btrfs_test_opt(root, INODE_MAP_CACHE)) - return 0; - - /* - * If we're unmounting then just return, since this does a search on the - * normal root and not the commit root and we could deadlock. 
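btrfs_find_ino_for_alloc() above hands out the lowest cached inode number: when the left-most entry is a plain extent it takes entry->offset and shrinks the run by one, and when it is a bitmap it searches for the first set bit and clears it (the bit handling looks like the earlier bitmap sketches). Below is a tiny model of the extent branch only; the struct and values are illustrative.

#include <stdint.h>
#include <stdio.h>

struct ino_extent { uint64_t offset, count; };   /* run of free inode numbers */

/* Extent branch of btrfs_find_ino_for_alloc(): hand out the first number in
 * the run and shrink the run by one; an emptied run would then be freed. */
static uint64_t take_ino(struct ino_extent *e)
{
    uint64_t ino = e->offset;
    e->offset++;
    e->count--;
    return ino;
}

int main(void)
{
    struct ino_extent e = { .offset = 257, .count = 1000 };
    uint64_t ino = take_ino(&e);

    printf("allocated ino %llu, %llu numbers left starting at %llu\n",
           (unsigned long long)ino,
           (unsigned long long)e.count,
           (unsigned long long)e.offset);
    return 0;
}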
- */ - if (btrfs_fs_closing(fs_info)) - return 0; - - path = btrfs_alloc_path(); - if (!path) - return 0; - - inode = lookup_free_ino_inode(root, path); - if (IS_ERR(inode)) - goto out; - - if (root_gen != BTRFS_I(inode)->generation) - goto out_put; - - ret = __load_free_space_cache(root, inode, ctl, path, 0); - - if (ret < 0) - printk(KERN_ERR "btrfs: failed to load free ino cache for " - "root %llu\n", root->root_key.objectid); -out_put: - iput(inode); -out: - btrfs_free_path(path); - return ret; -} - -int btrfs_write_out_ino_cache(struct btrfs_root *root, - struct btrfs_trans_handle *trans, - struct btrfs_path *path) -{ - struct btrfs_free_space_ctl *ctl = root->free_ino_ctl; - struct inode *inode; - int ret; - - if (!btrfs_test_opt(root, INODE_MAP_CACHE)) - return 0; - - inode = lookup_free_ino_inode(root, path); - if (IS_ERR(inode)) - return 0; - - ret = __btrfs_write_out_cache(root, inode, ctl, NULL, trans, path, 0); - if (ret) { - btrfs_delalloc_release_metadata(inode, inode->i_size); -#ifdef DEBUG - printk(KERN_ERR "btrfs: failed to write free ino cache " - "for root %llu\n", root->root_key.objectid); -#endif - } - - iput(inode); - return ret; -} diff --git a/ANDROID_3.4.5/fs/btrfs/free-space-cache.h b/ANDROID_3.4.5/fs/btrfs/free-space-cache.h deleted file mode 100644 index 8f2613f7..00000000 --- a/ANDROID_3.4.5/fs/btrfs/free-space-cache.h +++ /dev/null @@ -1,113 +0,0 @@ -/* - * Copyright (C) 2009 Oracle. All rights reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public - * License v2 as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public - * License along with this program; if not, write to the - * Free Software Foundation, Inc., 59 Temple Place - Suite 330, - * Boston, MA 021110-1307, USA. 
- */ - -#ifndef __BTRFS_FREE_SPACE_CACHE -#define __BTRFS_FREE_SPACE_CACHE - -struct btrfs_free_space { - struct rb_node offset_index; - u64 offset; - u64 bytes; - unsigned long *bitmap; - struct list_head list; -}; - -struct btrfs_free_space_ctl { - spinlock_t tree_lock; - struct rb_root free_space_offset; - u64 free_space; - int extents_thresh; - int free_extents; - int total_bitmaps; - int unit; - u64 start; - struct btrfs_free_space_op *op; - void *private; -}; - -struct btrfs_free_space_op { - void (*recalc_thresholds)(struct btrfs_free_space_ctl *ctl); - bool (*use_bitmap)(struct btrfs_free_space_ctl *ctl, - struct btrfs_free_space *info); -}; - -struct inode *lookup_free_space_inode(struct btrfs_root *root, - struct btrfs_block_group_cache - *block_group, struct btrfs_path *path); -int create_free_space_inode(struct btrfs_root *root, - struct btrfs_trans_handle *trans, - struct btrfs_block_group_cache *block_group, - struct btrfs_path *path); - -int btrfs_truncate_free_space_cache(struct btrfs_root *root, - struct btrfs_trans_handle *trans, - struct btrfs_path *path, - struct inode *inode); -int load_free_space_cache(struct btrfs_fs_info *fs_info, - struct btrfs_block_group_cache *block_group); -int btrfs_write_out_cache(struct btrfs_root *root, - struct btrfs_trans_handle *trans, - struct btrfs_block_group_cache *block_group, - struct btrfs_path *path); - -struct inode *lookup_free_ino_inode(struct btrfs_root *root, - struct btrfs_path *path); -int create_free_ino_inode(struct btrfs_root *root, - struct btrfs_trans_handle *trans, - struct btrfs_path *path); -int load_free_ino_cache(struct btrfs_fs_info *fs_info, - struct btrfs_root *root); -int btrfs_write_out_ino_cache(struct btrfs_root *root, - struct btrfs_trans_handle *trans, - struct btrfs_path *path); - -void btrfs_init_free_space_ctl(struct btrfs_block_group_cache *block_group); -int __btrfs_add_free_space(struct btrfs_free_space_ctl *ctl, - u64 bytenr, u64 size); -static inline int -btrfs_add_free_space(struct btrfs_block_group_cache *block_group, - u64 bytenr, u64 size) -{ - return __btrfs_add_free_space(block_group->free_space_ctl, - bytenr, size); -} -int btrfs_remove_free_space(struct btrfs_block_group_cache *block_group, - u64 bytenr, u64 size); -void __btrfs_remove_free_space_cache(struct btrfs_free_space_ctl *ctl); -void btrfs_remove_free_space_cache(struct btrfs_block_group_cache - *block_group); -u64 btrfs_find_space_for_alloc(struct btrfs_block_group_cache *block_group, - u64 offset, u64 bytes, u64 empty_size); -u64 btrfs_find_ino_for_alloc(struct btrfs_root *fs_root); -void btrfs_dump_free_space(struct btrfs_block_group_cache *block_group, - u64 bytes); -int btrfs_find_space_cluster(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct btrfs_block_group_cache *block_group, - struct btrfs_free_cluster *cluster, - u64 offset, u64 bytes, u64 empty_size); -void btrfs_init_free_cluster(struct btrfs_free_cluster *cluster); -u64 btrfs_alloc_from_cluster(struct btrfs_block_group_cache *block_group, - struct btrfs_free_cluster *cluster, u64 bytes, - u64 min_start); -int btrfs_return_cluster_to_free_space( - struct btrfs_block_group_cache *block_group, - struct btrfs_free_cluster *cluster); -int btrfs_trim_block_group(struct btrfs_block_group_cache *block_group, - u64 *trimmed, u64 start, u64 end, u64 minlen); -#endif diff --git a/ANDROID_3.4.5/fs/btrfs/hash.h b/ANDROID_3.4.5/fs/btrfs/hash.h deleted file mode 100644 index db2ff977..00000000 --- a/ANDROID_3.4.5/fs/btrfs/hash.h +++ /dev/null @@ -1,27 +0,0 @@ 
-/* - * Copyright (C) 2007 Oracle. All rights reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public - * License v2 as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public - * License along with this program; if not, write to the - * Free Software Foundation, Inc., 59 Temple Place - Suite 330, - * Boston, MA 021110-1307, USA. - */ - -#ifndef __HASH__ -#define __HASH__ - -#include <linux/crc32c.h> -static inline u64 btrfs_name_hash(const char *name, int len) -{ - return crc32c((u32)~1, name, len); -} -#endif diff --git a/ANDROID_3.4.5/fs/btrfs/inode-item.c b/ANDROID_3.4.5/fs/btrfs/inode-item.c deleted file mode 100644 index a13cf1a9..00000000 --- a/ANDROID_3.4.5/fs/btrfs/inode-item.c +++ /dev/null @@ -1,236 +0,0 @@ -/* - * Copyright (C) 2007 Oracle. All rights reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public - * License v2 as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public - * License along with this program; if not, write to the - * Free Software Foundation, Inc., 59 Temple Place - Suite 330, - * Boston, MA 021110-1307, USA. - */ - -#include "ctree.h" -#include "disk-io.h" -#include "transaction.h" -#include "print-tree.h" - -static int find_name_in_backref(struct btrfs_path *path, const char *name, - int name_len, struct btrfs_inode_ref **ref_ret) -{ - struct extent_buffer *leaf; - struct btrfs_inode_ref *ref; - unsigned long ptr; - unsigned long name_ptr; - u32 item_size; - u32 cur_offset = 0; - int len; - - leaf = path->nodes[0]; - item_size = btrfs_item_size_nr(leaf, path->slots[0]); - ptr = btrfs_item_ptr_offset(leaf, path->slots[0]); - while (cur_offset < item_size) { - ref = (struct btrfs_inode_ref *)(ptr + cur_offset); - len = btrfs_inode_ref_name_len(leaf, ref); - name_ptr = (unsigned long)(ref + 1); - cur_offset += len + sizeof(*ref); - if (len != name_len) - continue; - if (memcmp_extent_buffer(leaf, name, name_ptr, name_len) == 0) { - *ref_ret = ref; - return 1; - } - } - return 0; -} - -struct btrfs_inode_ref * -btrfs_lookup_inode_ref(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct btrfs_path *path, - const char *name, int name_len, - u64 inode_objectid, u64 ref_objectid, int mod) -{ - struct btrfs_key key; - struct btrfs_inode_ref *ref; - int ins_len = mod < 0 ? 
-1 : 0; - int cow = mod != 0; - int ret; - - key.objectid = inode_objectid; - key.type = BTRFS_INODE_REF_KEY; - key.offset = ref_objectid; - - ret = btrfs_search_slot(trans, root, &key, path, ins_len, cow); - if (ret < 0) - return ERR_PTR(ret); - if (ret > 0) - return NULL; - if (!find_name_in_backref(path, name, name_len, &ref)) - return NULL; - return ref; -} - -int btrfs_del_inode_ref(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - const char *name, int name_len, - u64 inode_objectid, u64 ref_objectid, u64 *index) -{ - struct btrfs_path *path; - struct btrfs_key key; - struct btrfs_inode_ref *ref; - struct extent_buffer *leaf; - unsigned long ptr; - unsigned long item_start; - u32 item_size; - u32 sub_item_len; - int ret; - int del_len = name_len + sizeof(*ref); - - key.objectid = inode_objectid; - key.offset = ref_objectid; - btrfs_set_key_type(&key, BTRFS_INODE_REF_KEY); - - path = btrfs_alloc_path(); - if (!path) - return -ENOMEM; - - path->leave_spinning = 1; - - ret = btrfs_search_slot(trans, root, &key, path, -1, 1); - if (ret > 0) { - ret = -ENOENT; - goto out; - } else if (ret < 0) { - goto out; - } - if (!find_name_in_backref(path, name, name_len, &ref)) { - ret = -ENOENT; - goto out; - } - leaf = path->nodes[0]; - item_size = btrfs_item_size_nr(leaf, path->slots[0]); - - if (index) - *index = btrfs_inode_ref_index(leaf, ref); - - if (del_len == item_size) { - ret = btrfs_del_item(trans, root, path); - goto out; - } - ptr = (unsigned long)ref; - sub_item_len = name_len + sizeof(*ref); - item_start = btrfs_item_ptr_offset(leaf, path->slots[0]); - memmove_extent_buffer(leaf, ptr, ptr + sub_item_len, - item_size - (ptr + sub_item_len - item_start)); - btrfs_truncate_item(trans, root, path, - item_size - sub_item_len, 1); -out: - btrfs_free_path(path); - return ret; -} - -/* Will return 0, -ENOMEM, -EMLINK, or -EEXIST or anything from the CoW path */ -int btrfs_insert_inode_ref(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - const char *name, int name_len, - u64 inode_objectid, u64 ref_objectid, u64 index) -{ - struct btrfs_path *path; - struct btrfs_key key; - struct btrfs_inode_ref *ref; - unsigned long ptr; - int ret; - int ins_len = name_len + sizeof(*ref); - - key.objectid = inode_objectid; - key.offset = ref_objectid; - btrfs_set_key_type(&key, BTRFS_INODE_REF_KEY); - - path = btrfs_alloc_path(); - if (!path) - return -ENOMEM; - - path->leave_spinning = 1; - ret = btrfs_insert_empty_item(trans, root, path, &key, - ins_len); - if (ret == -EEXIST) { - u32 old_size; - - if (find_name_in_backref(path, name, name_len, &ref)) - goto out; - - old_size = btrfs_item_size_nr(path->nodes[0], path->slots[0]); - btrfs_extend_item(trans, root, path, ins_len); - ref = btrfs_item_ptr(path->nodes[0], path->slots[0], - struct btrfs_inode_ref); - ref = (struct btrfs_inode_ref *)((unsigned long)ref + old_size); - btrfs_set_inode_ref_name_len(path->nodes[0], ref, name_len); - btrfs_set_inode_ref_index(path->nodes[0], ref, index); - ptr = (unsigned long)(ref + 1); - ret = 0; - } else if (ret < 0) { - if (ret == -EOVERFLOW) - ret = -EMLINK; - goto out; - } else { - ref = btrfs_item_ptr(path->nodes[0], path->slots[0], - struct btrfs_inode_ref); - btrfs_set_inode_ref_name_len(path->nodes[0], ref, name_len); - btrfs_set_inode_ref_index(path->nodes[0], ref, index); - ptr = (unsigned long)(ref + 1); - } - write_extent_buffer(path->nodes[0], name, ptr, name_len); - btrfs_mark_buffer_dirty(path->nodes[0]); - -out: - btrfs_free_path(path); - return ret; -} - -int 
btrfs_insert_empty_inode(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct btrfs_path *path, u64 objectid) -{ - struct btrfs_key key; - int ret; - key.objectid = objectid; - btrfs_set_key_type(&key, BTRFS_INODE_ITEM_KEY); - key.offset = 0; - - ret = btrfs_insert_empty_item(trans, root, path, &key, - sizeof(struct btrfs_inode_item)); - return ret; -} - -int btrfs_lookup_inode(struct btrfs_trans_handle *trans, struct btrfs_root - *root, struct btrfs_path *path, - struct btrfs_key *location, int mod) -{ - int ins_len = mod < 0 ? -1 : 0; - int cow = mod != 0; - int ret; - int slot; - struct extent_buffer *leaf; - struct btrfs_key found_key; - - ret = btrfs_search_slot(trans, root, location, path, ins_len, cow); - if (ret > 0 && btrfs_key_type(location) == BTRFS_ROOT_ITEM_KEY && - location->offset == (u64)-1 && path->slots[0] != 0) { - slot = path->slots[0] - 1; - leaf = path->nodes[0]; - btrfs_item_key_to_cpu(leaf, &found_key, slot); - if (found_key.objectid == location->objectid && - btrfs_key_type(&found_key) == btrfs_key_type(location)) { - path->slots[0]--; - return 0; - } - } - return ret; -} diff --git a/ANDROID_3.4.5/fs/btrfs/inode-map.c b/ANDROID_3.4.5/fs/btrfs/inode-map.c deleted file mode 100644 index b1a1c929..00000000 --- a/ANDROID_3.4.5/fs/btrfs/inode-map.c +++ /dev/null @@ -1,576 +0,0 @@ -/* - * Copyright (C) 2007 Oracle. All rights reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public - * License v2 as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public - * License along with this program; if not, write to the - * Free Software Foundation, Inc., 59 Temple Place - Suite 330, - * Boston, MA 021110-1307, USA. - */ - -#include <linux/delay.h> -#include <linux/kthread.h> -#include <linux/pagemap.h> - -#include "ctree.h" -#include "disk-io.h" -#include "free-space-cache.h" -#include "inode-map.h" -#include "transaction.h" - -static int caching_kthread(void *data) -{ - struct btrfs_root *root = data; - struct btrfs_fs_info *fs_info = root->fs_info; - struct btrfs_free_space_ctl *ctl = root->free_ino_ctl; - struct btrfs_key key; - struct btrfs_path *path; - struct extent_buffer *leaf; - u64 last = (u64)-1; - int slot; - int ret; - - if (!btrfs_test_opt(root, INODE_MAP_CACHE)) - return 0; - - path = btrfs_alloc_path(); - if (!path) - return -ENOMEM; - - /* Since the commit root is read-only, we can safely skip locking. 
*/ - path->skip_locking = 1; - path->search_commit_root = 1; - path->reada = 2; - - key.objectid = BTRFS_FIRST_FREE_OBJECTID; - key.offset = 0; - key.type = BTRFS_INODE_ITEM_KEY; -again: - /* need to make sure the commit_root doesn't disappear */ - mutex_lock(&root->fs_commit_mutex); - - ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); - if (ret < 0) - goto out; - - while (1) { - if (btrfs_fs_closing(fs_info)) - goto out; - - leaf = path->nodes[0]; - slot = path->slots[0]; - if (slot >= btrfs_header_nritems(leaf)) { - ret = btrfs_next_leaf(root, path); - if (ret < 0) - goto out; - else if (ret > 0) - break; - - if (need_resched() || - btrfs_transaction_in_commit(fs_info)) { - leaf = path->nodes[0]; - - if (btrfs_header_nritems(leaf) == 0) { - WARN_ON(1); - break; - } - - /* - * Save the key so we can advances forward - * in the next search. - */ - btrfs_item_key_to_cpu(leaf, &key, 0); - btrfs_release_path(path); - root->cache_progress = last; - mutex_unlock(&root->fs_commit_mutex); - schedule_timeout(1); - goto again; - } else - continue; - } - - btrfs_item_key_to_cpu(leaf, &key, slot); - - if (key.type != BTRFS_INODE_ITEM_KEY) - goto next; - - if (key.objectid >= root->highest_objectid) - break; - - if (last != (u64)-1 && last + 1 != key.objectid) { - __btrfs_add_free_space(ctl, last + 1, - key.objectid - last - 1); - wake_up(&root->cache_wait); - } - - last = key.objectid; -next: - path->slots[0]++; - } - - if (last < root->highest_objectid - 1) { - __btrfs_add_free_space(ctl, last + 1, - root->highest_objectid - last - 1); - } - - spin_lock(&root->cache_lock); - root->cached = BTRFS_CACHE_FINISHED; - spin_unlock(&root->cache_lock); - - root->cache_progress = (u64)-1; - btrfs_unpin_free_ino(root); -out: - wake_up(&root->cache_wait); - mutex_unlock(&root->fs_commit_mutex); - - btrfs_free_path(path); - - return ret; -} - -static void start_caching(struct btrfs_root *root) -{ - struct btrfs_free_space_ctl *ctl = root->free_ino_ctl; - struct task_struct *tsk; - int ret; - u64 objectid; - - if (!btrfs_test_opt(root, INODE_MAP_CACHE)) - return; - - spin_lock(&root->cache_lock); - if (root->cached != BTRFS_CACHE_NO) { - spin_unlock(&root->cache_lock); - return; - } - - root->cached = BTRFS_CACHE_STARTED; - spin_unlock(&root->cache_lock); - - ret = load_free_ino_cache(root->fs_info, root); - if (ret == 1) { - spin_lock(&root->cache_lock); - root->cached = BTRFS_CACHE_FINISHED; - spin_unlock(&root->cache_lock); - return; - } - - /* - * It can be quite time-consuming to fill the cache by searching - * through the extent tree, and this can keep ino allocation path - * waiting. Therefore at start we quickly find out the highest - * inode number and we know we can use inode numbers which fall in - * [highest_ino + 1, BTRFS_LAST_FREE_OBJECTID]. 
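The caching thread above walks the inode items in key order and records every gap between consecutive object ids as free inode numbers (last + 1 up to key.objectid - 1), plus the tail below highest_objectid. The userspace model below performs the same gap detection over a sorted list of in-use inode numbers; the sample numbers are illustrative.

#include <stdint.h>
#include <stdio.h>

/* Report the free ranges between consecutive used inode numbers, the way
 * caching_kthread() feeds __btrfs_add_free_space() while scanning keys. */
static void find_gaps(const uint64_t *used, int n, uint64_t highest)
{
    uint64_t last = (uint64_t)-1;

    for (int i = 0; i < n; i++) {
        if (last != (uint64_t)-1 && last + 1 != used[i])
            printf("free: %llu .. %llu\n",
                   (unsigned long long)(last + 1),
                   (unsigned long long)(used[i] - 1));
        last = used[i];
    }
    if (last < highest - 1)                      /* tail up to highest - 1 */
        printf("free: %llu .. %llu\n",
               (unsigned long long)(last + 1),
               (unsigned long long)(highest - 1));
}

int main(void)
{
    uint64_t used[] = { 256, 257, 260, 300 };

    find_gaps(used, 4, 310);   /* 310 plays the role of highest_objectid */
    return 0;
}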
- */ - ret = btrfs_find_free_objectid(root, &objectid); - if (!ret && objectid <= BTRFS_LAST_FREE_OBJECTID) { - __btrfs_add_free_space(ctl, objectid, - BTRFS_LAST_FREE_OBJECTID - objectid + 1); - } - - tsk = kthread_run(caching_kthread, root, "btrfs-ino-cache-%llu\n", - root->root_key.objectid); - BUG_ON(IS_ERR(tsk)); /* -ENOMEM */ -} - -int btrfs_find_free_ino(struct btrfs_root *root, u64 *objectid) -{ - if (!btrfs_test_opt(root, INODE_MAP_CACHE)) - return btrfs_find_free_objectid(root, objectid); - -again: - *objectid = btrfs_find_ino_for_alloc(root); - - if (*objectid != 0) - return 0; - - start_caching(root); - - wait_event(root->cache_wait, - root->cached == BTRFS_CACHE_FINISHED || - root->free_ino_ctl->free_space > 0); - - if (root->cached == BTRFS_CACHE_FINISHED && - root->free_ino_ctl->free_space == 0) - return -ENOSPC; - else - goto again; -} - -void btrfs_return_ino(struct btrfs_root *root, u64 objectid) -{ - struct btrfs_free_space_ctl *ctl = root->free_ino_ctl; - struct btrfs_free_space_ctl *pinned = root->free_ino_pinned; - - if (!btrfs_test_opt(root, INODE_MAP_CACHE)) - return; - -again: - if (root->cached == BTRFS_CACHE_FINISHED) { - __btrfs_add_free_space(ctl, objectid, 1); - } else { - /* - * If we are in the process of caching free ino chunks, - * to avoid adding the same inode number to the free_ino - * tree twice due to cross transaction, we'll leave it - * in the pinned tree until a transaction is committed - * or the caching work is done. - */ - - mutex_lock(&root->fs_commit_mutex); - spin_lock(&root->cache_lock); - if (root->cached == BTRFS_CACHE_FINISHED) { - spin_unlock(&root->cache_lock); - mutex_unlock(&root->fs_commit_mutex); - goto again; - } - spin_unlock(&root->cache_lock); - - start_caching(root); - - if (objectid <= root->cache_progress || - objectid > root->highest_objectid) - __btrfs_add_free_space(ctl, objectid, 1); - else - __btrfs_add_free_space(pinned, objectid, 1); - - mutex_unlock(&root->fs_commit_mutex); - } -} - -/* - * When a transaction is committed, we'll move those inode numbers which - * are smaller than root->cache_progress from pinned tree to free_ino tree, - * and others will just be dropped, because the commit root we were - * searching has changed. - * - * Must be called with root->fs_commit_mutex held - */ -void btrfs_unpin_free_ino(struct btrfs_root *root) -{ - struct btrfs_free_space_ctl *ctl = root->free_ino_ctl; - struct rb_root *rbroot = &root->free_ino_pinned->free_space_offset; - struct btrfs_free_space *info; - struct rb_node *n; - u64 count; - - if (!btrfs_test_opt(root, INODE_MAP_CACHE)) - return; - - while (1) { - n = rb_first(rbroot); - if (!n) - break; - - info = rb_entry(n, struct btrfs_free_space, offset_index); - BUG_ON(info->bitmap); /* Logic error */ - - if (info->offset > root->cache_progress) - goto free; - else if (info->offset + info->bytes > root->cache_progress) - count = root->cache_progress - info->offset + 1; - else - count = info->bytes; - - __btrfs_add_free_space(ctl, info->offset, count); -free: - rb_erase(&info->offset_index, rbroot); - kfree(info); - } -} - -#define INIT_THRESHOLD (((1024 * 32) / 2) / sizeof(struct btrfs_free_space)) -#define INODES_PER_BITMAP (PAGE_CACHE_SIZE * 8) - -/* - * The goal is to keep the memory used by the free_ino tree won't - * exceed the memory if we use bitmaps only. 
- */ -static void recalculate_thresholds(struct btrfs_free_space_ctl *ctl) -{ - struct btrfs_free_space *info; - struct rb_node *n; - int max_ino; - int max_bitmaps; - - n = rb_last(&ctl->free_space_offset); - if (!n) { - ctl->extents_thresh = INIT_THRESHOLD; - return; - } - info = rb_entry(n, struct btrfs_free_space, offset_index); - - /* - * Find the maximum inode number in the filesystem. Note we - * ignore the fact that this can be a bitmap, because we are - * not doing precise calculation. - */ - max_ino = info->bytes - 1; - - max_bitmaps = ALIGN(max_ino, INODES_PER_BITMAP) / INODES_PER_BITMAP; - if (max_bitmaps <= ctl->total_bitmaps) { - ctl->extents_thresh = 0; - return; - } - - ctl->extents_thresh = (max_bitmaps - ctl->total_bitmaps) * - PAGE_CACHE_SIZE / sizeof(*info); -} - -/* - * We don't fall back to bitmap, if we are below the extents threshold - * or this chunk of inode numbers is a big one. - */ -static bool use_bitmap(struct btrfs_free_space_ctl *ctl, - struct btrfs_free_space *info) -{ - if (ctl->free_extents < ctl->extents_thresh || - info->bytes > INODES_PER_BITMAP / 10) - return false; - - return true; -} - -static struct btrfs_free_space_op free_ino_op = { - .recalc_thresholds = recalculate_thresholds, - .use_bitmap = use_bitmap, -}; - -static void pinned_recalc_thresholds(struct btrfs_free_space_ctl *ctl) -{ -} - -static bool pinned_use_bitmap(struct btrfs_free_space_ctl *ctl, - struct btrfs_free_space *info) -{ - /* - * We always use extents for two reasons: - * - * - The pinned tree is only used during the process of caching - * work. - * - Make code simpler. See btrfs_unpin_free_ino(). - */ - return false; -} - -static struct btrfs_free_space_op pinned_free_ino_op = { - .recalc_thresholds = pinned_recalc_thresholds, - .use_bitmap = pinned_use_bitmap, -}; - -void btrfs_init_free_ino_ctl(struct btrfs_root *root) -{ - struct btrfs_free_space_ctl *ctl = root->free_ino_ctl; - struct btrfs_free_space_ctl *pinned = root->free_ino_pinned; - - spin_lock_init(&ctl->tree_lock); - ctl->unit = 1; - ctl->start = 0; - ctl->private = NULL; - ctl->op = &free_ino_op; - - /* - * Initially we allow to use 16K of ram to cache chunks of - * inode numbers before we resort to bitmaps. This is somewhat - * arbitrary, but it will be adjusted in runtime. 
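The inode-map cache starts from the same 16 KiB extent budget (INIT_THRESHOLD) and measures bitmap coverage in whole pages (INODES_PER_BITMAP). The quick check below evaluates those two macros under an assumed 4 KiB page size and the same rough 64-byte entry-size estimate used earlier, so the threshold figure is indicative rather than exact.

#include <stdio.h>

#define PAGE_SIZE_ASSUMED   4096UL
#define ENTRY_SIZE_ESTIMATE 64UL   /* rough sizeof(struct btrfs_free_space), 64-bit */

int main(void)
{
    unsigned long init_thresh = ((1024 * 32) / 2) / ENTRY_SIZE_ESTIMATE;
    unsigned long inodes_per_bitmap = PAGE_SIZE_ASSUMED * 8;

    printf("INIT_THRESHOLD    ~ %lu extent entries\n", init_thresh);
    printf("INODES_PER_BITMAP = %lu inode numbers per bitmap page\n",
           inodes_per_bitmap);
    return 0;
}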
- */ - ctl->extents_thresh = INIT_THRESHOLD; - - spin_lock_init(&pinned->tree_lock); - pinned->unit = 1; - pinned->start = 0; - pinned->private = NULL; - pinned->extents_thresh = 0; - pinned->op = &pinned_free_ino_op; -} - -int btrfs_save_ino_cache(struct btrfs_root *root, - struct btrfs_trans_handle *trans) -{ - struct btrfs_free_space_ctl *ctl = root->free_ino_ctl; - struct btrfs_path *path; - struct inode *inode; - struct btrfs_block_rsv *rsv; - u64 num_bytes; - u64 alloc_hint = 0; - int ret; - int prealloc; - bool retry = false; - - /* only fs tree and subvol/snap needs ino cache */ - if (root->root_key.objectid != BTRFS_FS_TREE_OBJECTID && - (root->root_key.objectid < BTRFS_FIRST_FREE_OBJECTID || - root->root_key.objectid > BTRFS_LAST_FREE_OBJECTID)) - return 0; - - /* Don't save inode cache if we are deleting this root */ - if (btrfs_root_refs(&root->root_item) == 0 && - root != root->fs_info->tree_root) - return 0; - - if (!btrfs_test_opt(root, INODE_MAP_CACHE)) - return 0; - - path = btrfs_alloc_path(); - if (!path) - return -ENOMEM; - - rsv = trans->block_rsv; - trans->block_rsv = &root->fs_info->trans_block_rsv; - - num_bytes = trans->bytes_reserved; - /* - * 1 item for inode item insertion if need - * 3 items for inode item update (in the worst case) - * 1 item for free space object - * 3 items for pre-allocation - */ - trans->bytes_reserved = btrfs_calc_trans_metadata_size(root, 8); - ret = btrfs_block_rsv_add_noflush(root, trans->block_rsv, - trans->bytes_reserved); - if (ret) - goto out; - trace_btrfs_space_reservation(root->fs_info, "ino_cache", - trans->transid, trans->bytes_reserved, 1); -again: - inode = lookup_free_ino_inode(root, path); - if (IS_ERR(inode) && (PTR_ERR(inode) != -ENOENT || retry)) { - ret = PTR_ERR(inode); - goto out_release; - } - - if (IS_ERR(inode)) { - BUG_ON(retry); /* Logic error */ - retry = true; - - ret = create_free_ino_inode(root, trans, path); - if (ret) - goto out_release; - goto again; - } - - BTRFS_I(inode)->generation = 0; - ret = btrfs_update_inode(trans, root, inode); - if (ret) { - btrfs_abort_transaction(trans, root, ret); - goto out_put; - } - - if (i_size_read(inode) > 0) { - ret = btrfs_truncate_free_space_cache(root, trans, path, inode); - if (ret) { - btrfs_abort_transaction(trans, root, ret); - goto out_put; - } - } - - spin_lock(&root->cache_lock); - if (root->cached != BTRFS_CACHE_FINISHED) { - ret = -1; - spin_unlock(&root->cache_lock); - goto out_put; - } - spin_unlock(&root->cache_lock); - - spin_lock(&ctl->tree_lock); - prealloc = sizeof(struct btrfs_free_space) * ctl->free_extents; - prealloc = ALIGN(prealloc, PAGE_CACHE_SIZE); - prealloc += ctl->total_bitmaps * PAGE_CACHE_SIZE; - spin_unlock(&ctl->tree_lock); - - /* Just to make sure we have enough space */ - prealloc += 8 * PAGE_CACHE_SIZE; - - ret = btrfs_delalloc_reserve_space(inode, prealloc); - if (ret) - goto out_put; - - ret = btrfs_prealloc_file_range_trans(inode, trans, 0, 0, prealloc, - prealloc, prealloc, &alloc_hint); - if (ret) { - btrfs_delalloc_release_space(inode, prealloc); - goto out_put; - } - btrfs_free_reserved_data_space(inode, prealloc); - - ret = btrfs_write_out_ino_cache(root, trans, path); -out_put: - iput(inode); -out_release: - trace_btrfs_space_reservation(root->fs_info, "ino_cache", - trans->transid, trans->bytes_reserved, 0); - btrfs_block_rsv_release(root, trans->block_rsv, trans->bytes_reserved); -out: - trans->block_rsv = rsv; - trans->bytes_reserved = num_bytes; - - btrfs_free_path(path); - return ret; -} - -static int 
btrfs_find_highest_objectid(struct btrfs_root *root, u64 *objectid) -{ - struct btrfs_path *path; - int ret; - struct extent_buffer *l; - struct btrfs_key search_key; - struct btrfs_key found_key; - int slot; - - path = btrfs_alloc_path(); - if (!path) - return -ENOMEM; - - search_key.objectid = BTRFS_LAST_FREE_OBJECTID; - search_key.type = -1; - search_key.offset = (u64)-1; - ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0); - if (ret < 0) - goto error; - BUG_ON(ret == 0); /* Corruption */ - if (path->slots[0] > 0) { - slot = path->slots[0] - 1; - l = path->nodes[0]; - btrfs_item_key_to_cpu(l, &found_key, slot); - *objectid = max_t(u64, found_key.objectid, - BTRFS_FIRST_FREE_OBJECTID - 1); - } else { - *objectid = BTRFS_FIRST_FREE_OBJECTID - 1; - } - ret = 0; -error: - btrfs_free_path(path); - return ret; -} - -int btrfs_find_free_objectid(struct btrfs_root *root, u64 *objectid) -{ - int ret; - mutex_lock(&root->objectid_mutex); - - if (unlikely(root->highest_objectid < BTRFS_FIRST_FREE_OBJECTID)) { - ret = btrfs_find_highest_objectid(root, - &root->highest_objectid); - if (ret) - goto out; - } - - if (unlikely(root->highest_objectid >= BTRFS_LAST_FREE_OBJECTID)) { - ret = -ENOSPC; - goto out; - } - - *objectid = ++root->highest_objectid; - ret = 0; -out: - mutex_unlock(&root->objectid_mutex); - return ret; -} diff --git a/ANDROID_3.4.5/fs/btrfs/inode-map.h b/ANDROID_3.4.5/fs/btrfs/inode-map.h deleted file mode 100644 index ddb347bf..00000000 --- a/ANDROID_3.4.5/fs/btrfs/inode-map.h +++ /dev/null @@ -1,13 +0,0 @@ -#ifndef __BTRFS_INODE_MAP -#define __BTRFS_INODE_MAP - -void btrfs_init_free_ino_ctl(struct btrfs_root *root); -void btrfs_unpin_free_ino(struct btrfs_root *root); -void btrfs_return_ino(struct btrfs_root *root, u64 objectid); -int btrfs_find_free_ino(struct btrfs_root *root, u64 *objectid); -int btrfs_save_ino_cache(struct btrfs_root *root, - struct btrfs_trans_handle *trans); - -int btrfs_find_free_objectid(struct btrfs_root *root, u64 *objectid); - -#endif diff --git a/ANDROID_3.4.5/fs/btrfs/inode.c b/ANDROID_3.4.5/fs/btrfs/inode.c deleted file mode 100644 index 0df0d1fd..00000000 --- a/ANDROID_3.4.5/fs/btrfs/inode.c +++ /dev/null @@ -1,7681 +0,0 @@ -/* - * Copyright (C) 2007 Oracle. All rights reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public - * License v2 as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public - * License along with this program; if not, write to the - * Free Software Foundation, Inc., 59 Temple Place - Suite 330, - * Boston, MA 021110-1307, USA. 
- */ - -#include <linux/kernel.h> -#include <linux/bio.h> -#include <linux/buffer_head.h> -#include <linux/file.h> -#include <linux/fs.h> -#include <linux/pagemap.h> -#include <linux/highmem.h> -#include <linux/time.h> -#include <linux/init.h> -#include <linux/string.h> -#include <linux/backing-dev.h> -#include <linux/mpage.h> -#include <linux/swap.h> -#include <linux/writeback.h> -#include <linux/statfs.h> -#include <linux/compat.h> -#include <linux/bit_spinlock.h> -#include <linux/xattr.h> -#include <linux/posix_acl.h> -#include <linux/falloc.h> -#include <linux/slab.h> -#include <linux/ratelimit.h> -#include <linux/mount.h> -#include "compat.h" -#include "ctree.h" -#include "disk-io.h" -#include "transaction.h" -#include "btrfs_inode.h" -#include "ioctl.h" -#include "print-tree.h" -#include "ordered-data.h" -#include "xattr.h" -#include "tree-log.h" -#include "volumes.h" -#include "compression.h" -#include "locking.h" -#include "free-space-cache.h" -#include "inode-map.h" - -struct btrfs_iget_args { - u64 ino; - struct btrfs_root *root; -}; - -static const struct inode_operations btrfs_dir_inode_operations; -static const struct inode_operations btrfs_symlink_inode_operations; -static const struct inode_operations btrfs_dir_ro_inode_operations; -static const struct inode_operations btrfs_special_inode_operations; -static const struct inode_operations btrfs_file_inode_operations; -static const struct address_space_operations btrfs_aops; -static const struct address_space_operations btrfs_symlink_aops; -static const struct file_operations btrfs_dir_file_operations; -static struct extent_io_ops btrfs_extent_io_ops; - -static struct kmem_cache *btrfs_inode_cachep; -struct kmem_cache *btrfs_trans_handle_cachep; -struct kmem_cache *btrfs_transaction_cachep; -struct kmem_cache *btrfs_path_cachep; -struct kmem_cache *btrfs_free_space_cachep; - -#define S_SHIFT 12 -static unsigned char btrfs_type_by_mode[S_IFMT >> S_SHIFT] = { - [S_IFREG >> S_SHIFT] = BTRFS_FT_REG_FILE, - [S_IFDIR >> S_SHIFT] = BTRFS_FT_DIR, - [S_IFCHR >> S_SHIFT] = BTRFS_FT_CHRDEV, - [S_IFBLK >> S_SHIFT] = BTRFS_FT_BLKDEV, - [S_IFIFO >> S_SHIFT] = BTRFS_FT_FIFO, - [S_IFSOCK >> S_SHIFT] = BTRFS_FT_SOCK, - [S_IFLNK >> S_SHIFT] = BTRFS_FT_SYMLINK, -}; - -static int btrfs_setsize(struct inode *inode, loff_t newsize); -static int btrfs_truncate(struct inode *inode); -static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end); -static noinline int cow_file_range(struct inode *inode, - struct page *locked_page, - u64 start, u64 end, int *page_started, - unsigned long *nr_written, int unlock); -static noinline int btrfs_update_inode_fallback(struct btrfs_trans_handle *trans, - struct btrfs_root *root, struct inode *inode); - -static int btrfs_init_inode_security(struct btrfs_trans_handle *trans, - struct inode *inode, struct inode *dir, - const struct qstr *qstr) -{ - int err; - - err = btrfs_init_acl(trans, inode, dir); - if (!err) - err = btrfs_xattr_security_init(trans, inode, dir, qstr); - return err; -} - -/* - * this does all the hard work for inserting an inline extent into - * the btree. 
The caller should have done a btrfs_drop_extents so that - * no overlapping inline items exist in the btree - */ -static noinline int insert_inline_extent(struct btrfs_trans_handle *trans, - struct btrfs_root *root, struct inode *inode, - u64 start, size_t size, size_t compressed_size, - int compress_type, - struct page **compressed_pages) -{ - struct btrfs_key key; - struct btrfs_path *path; - struct extent_buffer *leaf; - struct page *page = NULL; - char *kaddr; - unsigned long ptr; - struct btrfs_file_extent_item *ei; - int err = 0; - int ret; - size_t cur_size = size; - size_t datasize; - unsigned long offset; - - if (compressed_size && compressed_pages) - cur_size = compressed_size; - - path = btrfs_alloc_path(); - if (!path) - return -ENOMEM; - - path->leave_spinning = 1; - - key.objectid = btrfs_ino(inode); - key.offset = start; - btrfs_set_key_type(&key, BTRFS_EXTENT_DATA_KEY); - datasize = btrfs_file_extent_calc_inline_size(cur_size); - - inode_add_bytes(inode, size); - ret = btrfs_insert_empty_item(trans, root, path, &key, - datasize); - if (ret) { - err = ret; - goto fail; - } - leaf = path->nodes[0]; - ei = btrfs_item_ptr(leaf, path->slots[0], - struct btrfs_file_extent_item); - btrfs_set_file_extent_generation(leaf, ei, trans->transid); - btrfs_set_file_extent_type(leaf, ei, BTRFS_FILE_EXTENT_INLINE); - btrfs_set_file_extent_encryption(leaf, ei, 0); - btrfs_set_file_extent_other_encoding(leaf, ei, 0); - btrfs_set_file_extent_ram_bytes(leaf, ei, size); - ptr = btrfs_file_extent_inline_start(ei); - - if (compress_type != BTRFS_COMPRESS_NONE) { - struct page *cpage; - int i = 0; - while (compressed_size > 0) { - cpage = compressed_pages[i]; - cur_size = min_t(unsigned long, compressed_size, - PAGE_CACHE_SIZE); - - kaddr = kmap_atomic(cpage); - write_extent_buffer(leaf, kaddr, ptr, cur_size); - kunmap_atomic(kaddr); - - i++; - ptr += cur_size; - compressed_size -= cur_size; - } - btrfs_set_file_extent_compression(leaf, ei, - compress_type); - } else { - page = find_get_page(inode->i_mapping, - start >> PAGE_CACHE_SHIFT); - btrfs_set_file_extent_compression(leaf, ei, 0); - kaddr = kmap_atomic(page); - offset = start & (PAGE_CACHE_SIZE - 1); - write_extent_buffer(leaf, kaddr + offset, ptr, size); - kunmap_atomic(kaddr); - page_cache_release(page); - } - btrfs_mark_buffer_dirty(leaf); - btrfs_free_path(path); - - /* - * we're an inline extent, so nobody can - * extend the file past i_size without locking - * a page we already have locked. - * - * We must do any isize and inode updates - * before we unlock the pages. Otherwise we - * could end up racing with unlink. - */ - BTRFS_I(inode)->disk_i_size = inode->i_size; - ret = btrfs_update_inode(trans, root, inode); - - return ret; -fail: - btrfs_free_path(path); - return err; -} - - -/* - * conditionally insert an inline extent into the file. This - * does the checks required to make sure the data is small enough - * to fit as an inline extent. 
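[Editor's note] The eligibility test the following function performs is easier to follow pulled out on its own. A minimal standalone sketch in plain C, with the sector size, page size and inline limits passed in as ordinary parameters; all names here are illustrative, not the kernel's:

#include <stdbool.h>
#include <stdint.h>

/* Sketch of the "can this range become an inline extent?" test.  The limits
 * come in as parameters; in the kernel they are root->sectorsize,
 * PAGE_CACHE_SIZE, BTRFS_MAX_INLINE_DATA_SIZE() and fs_info->max_inline.
 * sectorsize and page_size are assumed to be powers of two. */
static bool can_inline(uint64_t start, uint64_t data_len, uint64_t actual_end,
                       uint64_t isize, uint64_t end, bool compressed,
                       uint64_t sectorsize, uint64_t page_size,
                       uint64_t max_item_size, uint64_t max_inline_opt)
{
        if (start > 0)                          /* only the first extent of a file */
                return false;
        if (actual_end >= page_size)            /* data must fit in one page */
                return false;
        if (data_len >= max_item_size)          /* and in a single leaf item */
                return false;
        if (!compressed && (actual_end & (sectorsize - 1)) == 0)
                return false;                   /* already block aligned, not worth it */
        if (end + 1 < isize)                    /* must cover the tail of the file */
                return false;
        if (data_len > max_inline_opt)          /* mount option max_inline */
                return false;
        return true;
}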
- */ -static noinline int cow_file_range_inline(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct inode *inode, u64 start, u64 end, - size_t compressed_size, int compress_type, - struct page **compressed_pages) -{ - u64 isize = i_size_read(inode); - u64 actual_end = min(end + 1, isize); - u64 inline_len = actual_end - start; - u64 aligned_end = (end + root->sectorsize - 1) & - ~((u64)root->sectorsize - 1); - u64 hint_byte; - u64 data_len = inline_len; - int ret; - - if (compressed_size) - data_len = compressed_size; - - if (start > 0 || - actual_end >= PAGE_CACHE_SIZE || - data_len >= BTRFS_MAX_INLINE_DATA_SIZE(root) || - (!compressed_size && - (actual_end & (root->sectorsize - 1)) == 0) || - end + 1 < isize || - data_len > root->fs_info->max_inline) { - return 1; - } - - ret = btrfs_drop_extents(trans, inode, start, aligned_end, - &hint_byte, 1); - if (ret) - return ret; - - if (isize > actual_end) - inline_len = min_t(u64, isize, actual_end); - ret = insert_inline_extent(trans, root, inode, start, - inline_len, compressed_size, - compress_type, compressed_pages); - if (ret && ret != -ENOSPC) { - btrfs_abort_transaction(trans, root, ret); - return ret; - } else if (ret == -ENOSPC) { - return 1; - } - - btrfs_delalloc_release_metadata(inode, end + 1 - start); - btrfs_drop_extent_cache(inode, start, aligned_end - 1, 0); - return 0; -} - -struct async_extent { - u64 start; - u64 ram_size; - u64 compressed_size; - struct page **pages; - unsigned long nr_pages; - int compress_type; - struct list_head list; -}; - -struct async_cow { - struct inode *inode; - struct btrfs_root *root; - struct page *locked_page; - u64 start; - u64 end; - struct list_head extents; - struct btrfs_work work; -}; - -static noinline int add_async_extent(struct async_cow *cow, - u64 start, u64 ram_size, - u64 compressed_size, - struct page **pages, - unsigned long nr_pages, - int compress_type) -{ - struct async_extent *async_extent; - - async_extent = kmalloc(sizeof(*async_extent), GFP_NOFS); - BUG_ON(!async_extent); /* -ENOMEM */ - async_extent->start = start; - async_extent->ram_size = ram_size; - async_extent->compressed_size = compressed_size; - async_extent->pages = pages; - async_extent->nr_pages = nr_pages; - async_extent->compress_type = compress_type; - list_add_tail(&async_extent->list, &cow->extents); - return 0; -} - -/* - * we create compressed extents in two phases. The first - * phase compresses a range of pages that have already been - * locked (both pages and state bits are locked). - * - * This is done inside an ordered work queue, and the compression - * is spread across many cpus. The actual IO submission is step - * two, and the ordered work queue takes care of making sure that - * happens in the same order things were put onto the queue by - * writepages and friends. - * - * If this code finds it can't get good compression, it puts an - * entry onto the work queue to write the uncompressed bytes. This - * makes sure that both compressed inodes and uncompressed inodes - * are written in the same order that pdflush sent them down. 
- */ -static noinline int compress_file_range(struct inode *inode, - struct page *locked_page, - u64 start, u64 end, - struct async_cow *async_cow, - int *num_added) -{ - struct btrfs_root *root = BTRFS_I(inode)->root; - struct btrfs_trans_handle *trans; - u64 num_bytes; - u64 blocksize = root->sectorsize; - u64 actual_end; - u64 isize = i_size_read(inode); - int ret = 0; - struct page **pages = NULL; - unsigned long nr_pages; - unsigned long nr_pages_ret = 0; - unsigned long total_compressed = 0; - unsigned long total_in = 0; - unsigned long max_compressed = 128 * 1024; - unsigned long max_uncompressed = 128 * 1024; - int i; - int will_compress; - int compress_type = root->fs_info->compress_type; - - /* if this is a small write inside eof, kick off a defrag */ - if ((end - start + 1) < 16 * 1024 && - (start > 0 || end + 1 < BTRFS_I(inode)->disk_i_size)) - btrfs_add_inode_defrag(NULL, inode); - - actual_end = min_t(u64, isize, end + 1); -again: - will_compress = 0; - nr_pages = (end >> PAGE_CACHE_SHIFT) - (start >> PAGE_CACHE_SHIFT) + 1; - nr_pages = min(nr_pages, (128 * 1024UL) / PAGE_CACHE_SIZE); - - /* - * we don't want to send crud past the end of i_size through - * compression, that's just a waste of CPU time. So, if the - * end of the file is before the start of our current - * requested range of bytes, we bail out to the uncompressed - * cleanup code that can deal with all of this. - * - * It isn't really the fastest way to fix things, but this is a - * very uncommon corner. - */ - if (actual_end <= start) - goto cleanup_and_bail_uncompressed; - - total_compressed = actual_end - start; - - /* we want to make sure that amount of ram required to uncompress - * an extent is reasonable, so we limit the total size in ram - * of a compressed extent to 128k. This is a crucial number - * because it also controls how easily we can spread reads across - * cpus for decompression. - * - * We also want to make sure the amount of IO required to do - * a random read is reasonably small, so we limit the size of - * a compressed extent to 128k. - */ - total_compressed = min(total_compressed, max_uncompressed); - num_bytes = (end - start + blocksize) & ~(blocksize - 1); - num_bytes = max(blocksize, num_bytes); - total_in = 0; - ret = 0; - - /* - * we do compression for mount -o compress and when the - * inode has not been flagged as nocompress. This flag can - * change at any time if we discover bad compression ratios. 
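[Editor's note] Condensed into a standalone predicate, the decision that the condition below encodes looks roughly like this; a sketch with plain booleans standing in for the inode flags and mount options, not the kernel's types:

#include <stdbool.h>

/* Should we even try to compress this range?  Mirrors the condition below:
 * never for an inode flagged NOCOMPRESS, otherwise when the mount asked for
 * compression, the user forced it for this inode, or the inode carries the
 * per-file COMPRESS flag. */
static bool should_try_compress(bool inode_nocompress, bool mount_compress,
                                bool force_compress, bool inode_compress_flag)
{
        if (inode_nocompress)
                return false;
        return mount_compress || force_compress || inode_compress_flag;
}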
- */ - if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NOCOMPRESS) && - (btrfs_test_opt(root, COMPRESS) || - (BTRFS_I(inode)->force_compress) || - (BTRFS_I(inode)->flags & BTRFS_INODE_COMPRESS))) { - WARN_ON(pages); - pages = kzalloc(sizeof(struct page *) * nr_pages, GFP_NOFS); - if (!pages) { - /* just bail out to the uncompressed code */ - goto cont; - } - - if (BTRFS_I(inode)->force_compress) - compress_type = BTRFS_I(inode)->force_compress; - - ret = btrfs_compress_pages(compress_type, - inode->i_mapping, start, - total_compressed, pages, - nr_pages, &nr_pages_ret, - &total_in, - &total_compressed, - max_compressed); - - if (!ret) { - unsigned long offset = total_compressed & - (PAGE_CACHE_SIZE - 1); - struct page *page = pages[nr_pages_ret - 1]; - char *kaddr; - - /* zero the tail end of the last page, we might be - * sending it down to disk - */ - if (offset) { - kaddr = kmap_atomic(page); - memset(kaddr + offset, 0, - PAGE_CACHE_SIZE - offset); - kunmap_atomic(kaddr); - } - will_compress = 1; - } - } -cont: - if (start == 0) { - trans = btrfs_join_transaction(root); - if (IS_ERR(trans)) { - ret = PTR_ERR(trans); - trans = NULL; - goto cleanup_and_out; - } - trans->block_rsv = &root->fs_info->delalloc_block_rsv; - - /* lets try to make an inline extent */ - if (ret || total_in < (actual_end - start)) { - /* we didn't compress the entire range, try - * to make an uncompressed inline extent. - */ - ret = cow_file_range_inline(trans, root, inode, - start, end, 0, 0, NULL); - } else { - /* try making a compressed inline extent */ - ret = cow_file_range_inline(trans, root, inode, - start, end, - total_compressed, - compress_type, pages); - } - if (ret <= 0) { - /* - * inline extent creation worked or returned error, - * we don't need to create any more async work items. - * Unlock and free up our temp pages. - */ - extent_clear_unlock_delalloc(inode, - &BTRFS_I(inode)->io_tree, - start, end, NULL, - EXTENT_CLEAR_UNLOCK_PAGE | EXTENT_CLEAR_DIRTY | - EXTENT_CLEAR_DELALLOC | - EXTENT_SET_WRITEBACK | EXTENT_END_WRITEBACK); - - btrfs_end_transaction(trans, root); - goto free_pages_out; - } - btrfs_end_transaction(trans, root); - } - - if (will_compress) { - /* - * we aren't doing an inline extent round the compressed size - * up to a block size boundary so the allocator does sane - * things - */ - total_compressed = (total_compressed + blocksize - 1) & - ~(blocksize - 1); - - /* - * one last check to make sure the compression is really a - * win, compare the page count read with the blocks on disk - */ - total_in = (total_in + PAGE_CACHE_SIZE - 1) & - ~(PAGE_CACHE_SIZE - 1); - if (total_compressed >= total_in) { - will_compress = 0; - } else { - num_bytes = total_in; - } - } - if (!will_compress && pages) { - /* - * the compression code ran but failed to make things smaller, - * free any pages it allocated and our page pointer array - */ - for (i = 0; i < nr_pages_ret; i++) { - WARN_ON(pages[i]->mapping); - page_cache_release(pages[i]); - } - kfree(pages); - pages = NULL; - total_compressed = 0; - nr_pages_ret = 0; - - /* flag the file so we don't compress in the future */ - if (!btrfs_test_opt(root, FORCE_COMPRESS) && - !(BTRFS_I(inode)->force_compress)) { - BTRFS_I(inode)->flags |= BTRFS_INODE_NOCOMPRESS; - } - } - if (will_compress) { - *num_added += 1; - - /* the async work queues will take care of doing actual - * allocation on disk for these compressed pages, - * and will submit them to the elevator. 
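[Editor's note] The "did compression actually win" comparison a few lines above rounds both sizes before comparing them; a small standalone sketch of that arithmetic, assuming power-of-two alignments, with illustrative names:

#include <stdbool.h>
#include <stdint.h>

static uint64_t round_up_pow2(uint64_t n, uint64_t align)
{
        return (n + align - 1) & ~(align - 1);
}

/* Keep the compressed copy only if, after rounding the compressed size up to
 * a block boundary and the input size up to a page boundary, the compressed
 * copy is still strictly smaller. */
static bool compression_won(uint64_t total_compressed, uint64_t total_in,
                            uint64_t blocksize, uint64_t page_size)
{
        return round_up_pow2(total_compressed, blocksize) <
               round_up_pow2(total_in, page_size);
}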
- */ - add_async_extent(async_cow, start, num_bytes, - total_compressed, pages, nr_pages_ret, - compress_type); - - if (start + num_bytes < end) { - start += num_bytes; - pages = NULL; - cond_resched(); - goto again; - } - } else { -cleanup_and_bail_uncompressed: - /* - * No compression, but we still need to write the pages in - * the file we've been given so far. redirty the locked - * page if it corresponds to our extent and set things up - * for the async work queue to run cow_file_range to do - * the normal delalloc dance - */ - if (page_offset(locked_page) >= start && - page_offset(locked_page) <= end) { - __set_page_dirty_nobuffers(locked_page); - /* unlocked later on in the async handlers */ - } - add_async_extent(async_cow, start, end - start + 1, - 0, NULL, 0, BTRFS_COMPRESS_NONE); - *num_added += 1; - } - -out: - return ret; - -free_pages_out: - for (i = 0; i < nr_pages_ret; i++) { - WARN_ON(pages[i]->mapping); - page_cache_release(pages[i]); - } - kfree(pages); - - goto out; - -cleanup_and_out: - extent_clear_unlock_delalloc(inode, &BTRFS_I(inode)->io_tree, - start, end, NULL, - EXTENT_CLEAR_UNLOCK_PAGE | - EXTENT_CLEAR_DIRTY | - EXTENT_CLEAR_DELALLOC | - EXTENT_SET_WRITEBACK | - EXTENT_END_WRITEBACK); - if (!trans || IS_ERR(trans)) - btrfs_error(root->fs_info, ret, "Failed to join transaction"); - else - btrfs_abort_transaction(trans, root, ret); - goto free_pages_out; -} - -/* - * phase two of compressed writeback. This is the ordered portion - * of the code, which only gets called in the order the work was - * queued. We walk all the async extents created by compress_file_range - * and send them down to the disk. - */ -static noinline int submit_compressed_extents(struct inode *inode, - struct async_cow *async_cow) -{ - struct async_extent *async_extent; - u64 alloc_hint = 0; - struct btrfs_trans_handle *trans; - struct btrfs_key ins; - struct extent_map *em; - struct btrfs_root *root = BTRFS_I(inode)->root; - struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree; - struct extent_io_tree *io_tree; - int ret = 0; - - if (list_empty(&async_cow->extents)) - return 0; - - - while (!list_empty(&async_cow->extents)) { - async_extent = list_entry(async_cow->extents.next, - struct async_extent, list); - list_del(&async_extent->list); - - io_tree = &BTRFS_I(inode)->io_tree; - -retry: - /* did the compression code fall back to uncompressed IO? */ - if (!async_extent->pages) { - int page_started = 0; - unsigned long nr_written = 0; - - lock_extent(io_tree, async_extent->start, - async_extent->start + - async_extent->ram_size - 1); - - /* allocate blocks */ - ret = cow_file_range(inode, async_cow->locked_page, - async_extent->start, - async_extent->start + - async_extent->ram_size - 1, - &page_started, &nr_written, 0); - - /* JDM XXX */ - - /* - * if page_started, cow_file_range inserted an - * inline extent and took care of all the unlocking - * and IO for us. Otherwise, we need to submit - * all those pages down to the drive. 
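[Editor's note] For the fallback path described just above, the condition deciding whether this code writes the range out directly can be summarized as a tiny predicate; booleans stand in for the real state, and cow_failed is shorthand for cow_file_range having returned an error:

#include <stdbool.h>

static bool needs_direct_writeout(bool has_compressed_pages, bool page_started,
                                  bool cow_failed)
{
        /* only the uncompressed fallback writes the locked range here; an
         * inline extent (page_started) or a failed allocation means there is
         * nothing for this path to submit */
        return !has_compressed_pages && !page_started && !cow_failed;
}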
- */ - if (!page_started && !ret) - extent_write_locked_range(io_tree, - inode, async_extent->start, - async_extent->start + - async_extent->ram_size - 1, - btrfs_get_extent, - WB_SYNC_ALL); - kfree(async_extent); - cond_resched(); - continue; - } - - lock_extent(io_tree, async_extent->start, - async_extent->start + async_extent->ram_size - 1); - - trans = btrfs_join_transaction(root); - if (IS_ERR(trans)) { - ret = PTR_ERR(trans); - } else { - trans->block_rsv = &root->fs_info->delalloc_block_rsv; - ret = btrfs_reserve_extent(trans, root, - async_extent->compressed_size, - async_extent->compressed_size, - 0, alloc_hint, &ins, 1); - if (ret) - btrfs_abort_transaction(trans, root, ret); - btrfs_end_transaction(trans, root); - } - - if (ret) { - int i; - for (i = 0; i < async_extent->nr_pages; i++) { - WARN_ON(async_extent->pages[i]->mapping); - page_cache_release(async_extent->pages[i]); - } - kfree(async_extent->pages); - async_extent->nr_pages = 0; - async_extent->pages = NULL; - unlock_extent(io_tree, async_extent->start, - async_extent->start + - async_extent->ram_size - 1); - if (ret == -ENOSPC) - goto retry; - goto out_free; /* JDM: Requeue? */ - } - - /* - * here we're doing allocation and writeback of the - * compressed pages - */ - btrfs_drop_extent_cache(inode, async_extent->start, - async_extent->start + - async_extent->ram_size - 1, 0); - - em = alloc_extent_map(); - BUG_ON(!em); /* -ENOMEM */ - em->start = async_extent->start; - em->len = async_extent->ram_size; - em->orig_start = em->start; - - em->block_start = ins.objectid; - em->block_len = ins.offset; - em->bdev = root->fs_info->fs_devices->latest_bdev; - em->compress_type = async_extent->compress_type; - set_bit(EXTENT_FLAG_PINNED, &em->flags); - set_bit(EXTENT_FLAG_COMPRESSED, &em->flags); - - while (1) { - write_lock(&em_tree->lock); - ret = add_extent_mapping(em_tree, em); - write_unlock(&em_tree->lock); - if (ret != -EEXIST) { - free_extent_map(em); - break; - } - btrfs_drop_extent_cache(inode, async_extent->start, - async_extent->start + - async_extent->ram_size - 1, 0); - } - - ret = btrfs_add_ordered_extent_compress(inode, - async_extent->start, - ins.objectid, - async_extent->ram_size, - ins.offset, - BTRFS_ORDERED_COMPRESSED, - async_extent->compress_type); - BUG_ON(ret); /* -ENOMEM */ - - /* - * clear dirty, set writeback and unlock the pages. - */ - extent_clear_unlock_delalloc(inode, - &BTRFS_I(inode)->io_tree, - async_extent->start, - async_extent->start + - async_extent->ram_size - 1, - NULL, EXTENT_CLEAR_UNLOCK_PAGE | - EXTENT_CLEAR_UNLOCK | - EXTENT_CLEAR_DELALLOC | - EXTENT_CLEAR_DIRTY | EXTENT_SET_WRITEBACK); - - ret = btrfs_submit_compressed_write(inode, - async_extent->start, - async_extent->ram_size, - ins.objectid, - ins.offset, async_extent->pages, - async_extent->nr_pages); - - BUG_ON(ret); /* -ENOMEM */ - alloc_hint = ins.objectid + ins.offset; - kfree(async_extent); - cond_resched(); - } - ret = 0; -out: - return ret; -out_free: - kfree(async_extent); - goto out; -} - -static u64 get_extent_allocation_hint(struct inode *inode, u64 start, - u64 num_bytes) -{ - struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree; - struct extent_map *em; - u64 alloc_hint = 0; - - read_lock(&em_tree->lock); - em = search_extent_mapping(em_tree, start, num_bytes); - if (em) { - /* - * if block start isn't an actual block number then find the - * first block in this inode and use that as a hint. If that - * block is also bogus then just don't worry about it. 
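[Editor's note] The hint selection the function below performs reads as a small lookup with two fallbacks; a sketch using stand-in types (struct mapping and is_real_block are illustrative, not kernel structures):

#include <stdint.h>
#include <stddef.h>

struct mapping {
        uint64_t block_start;   /* on-disk start of the mapping */
        int is_real_block;      /* 0 for holes, inline or delalloc mappings */
};

/* Prefer the on-disk start of a mapping that overlaps the range; if that
 * mapping has no real block number, fall back to the first real mapping in
 * the file; otherwise return no hint at all. */
static uint64_t pick_alloc_hint(const struct mapping *overlap,
                                const struct mapping *first_in_file)
{
        if (overlap && overlap->is_real_block)
                return overlap->block_start;
        if (first_in_file && first_in_file->is_real_block)
                return first_in_file->block_start;
        return 0;
}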
- */ - if (em->block_start >= EXTENT_MAP_LAST_BYTE) { - free_extent_map(em); - em = search_extent_mapping(em_tree, 0, 0); - if (em && em->block_start < EXTENT_MAP_LAST_BYTE) - alloc_hint = em->block_start; - if (em) - free_extent_map(em); - } else { - alloc_hint = em->block_start; - free_extent_map(em); - } - } - read_unlock(&em_tree->lock); - - return alloc_hint; -} - -/* - * when extent_io.c finds a delayed allocation range in the file, - * the call backs end up in this code. The basic idea is to - * allocate extents on disk for the range, and create ordered data structs - * in ram to track those extents. - * - * locked_page is the page that writepage had locked already. We use - * it to make sure we don't do extra locks or unlocks. - * - * *page_started is set to one if we unlock locked_page and do everything - * required to start IO on it. It may be clean and already done with - * IO when we return. - */ -static noinline int cow_file_range(struct inode *inode, - struct page *locked_page, - u64 start, u64 end, int *page_started, - unsigned long *nr_written, - int unlock) -{ - struct btrfs_root *root = BTRFS_I(inode)->root; - struct btrfs_trans_handle *trans; - u64 alloc_hint = 0; - u64 num_bytes; - unsigned long ram_size; - u64 disk_num_bytes; - u64 cur_alloc_size; - u64 blocksize = root->sectorsize; - struct btrfs_key ins; - struct extent_map *em; - struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree; - int ret = 0; - - BUG_ON(btrfs_is_free_space_inode(root, inode)); - trans = btrfs_join_transaction(root); - if (IS_ERR(trans)) { - extent_clear_unlock_delalloc(inode, - &BTRFS_I(inode)->io_tree, - start, end, NULL, - EXTENT_CLEAR_UNLOCK_PAGE | - EXTENT_CLEAR_UNLOCK | - EXTENT_CLEAR_DELALLOC | - EXTENT_CLEAR_DIRTY | - EXTENT_SET_WRITEBACK | - EXTENT_END_WRITEBACK); - return PTR_ERR(trans); - } - trans->block_rsv = &root->fs_info->delalloc_block_rsv; - - num_bytes = (end - start + blocksize) & ~(blocksize - 1); - num_bytes = max(blocksize, num_bytes); - disk_num_bytes = num_bytes; - ret = 0; - - /* if this is a small write inside eof, kick off defrag */ - if (num_bytes < 64 * 1024 && - (start > 0 || end + 1 < BTRFS_I(inode)->disk_i_size)) - btrfs_add_inode_defrag(trans, inode); - - if (start == 0) { - /* lets try to make an inline extent */ - ret = cow_file_range_inline(trans, root, inode, - start, end, 0, 0, NULL); - if (ret == 0) { - extent_clear_unlock_delalloc(inode, - &BTRFS_I(inode)->io_tree, - start, end, NULL, - EXTENT_CLEAR_UNLOCK_PAGE | - EXTENT_CLEAR_UNLOCK | - EXTENT_CLEAR_DELALLOC | - EXTENT_CLEAR_DIRTY | - EXTENT_SET_WRITEBACK | - EXTENT_END_WRITEBACK); - - *nr_written = *nr_written + - (end - start + PAGE_CACHE_SIZE) / PAGE_CACHE_SIZE; - *page_started = 1; - goto out; - } else if (ret < 0) { - btrfs_abort_transaction(trans, root, ret); - goto out_unlock; - } - } - - BUG_ON(disk_num_bytes > - btrfs_super_total_bytes(root->fs_info->super_copy)); - - alloc_hint = get_extent_allocation_hint(inode, start, num_bytes); - btrfs_drop_extent_cache(inode, start, start + num_bytes - 1, 0); - - while (disk_num_bytes > 0) { - unsigned long op; - - cur_alloc_size = disk_num_bytes; - ret = btrfs_reserve_extent(trans, root, cur_alloc_size, - root->sectorsize, 0, alloc_hint, - &ins, 1); - if (ret < 0) { - btrfs_abort_transaction(trans, root, ret); - goto out_unlock; - } - - em = alloc_extent_map(); - BUG_ON(!em); /* -ENOMEM */ - em->start = start; - em->orig_start = em->start; - ram_size = ins.offset; - em->len = ins.offset; - - em->block_start = ins.objectid; - em->block_len = 
ins.offset; - em->bdev = root->fs_info->fs_devices->latest_bdev; - set_bit(EXTENT_FLAG_PINNED, &em->flags); - - while (1) { - write_lock(&em_tree->lock); - ret = add_extent_mapping(em_tree, em); - write_unlock(&em_tree->lock); - if (ret != -EEXIST) { - free_extent_map(em); - break; - } - btrfs_drop_extent_cache(inode, start, - start + ram_size - 1, 0); - } - - cur_alloc_size = ins.offset; - ret = btrfs_add_ordered_extent(inode, start, ins.objectid, - ram_size, cur_alloc_size, 0); - BUG_ON(ret); /* -ENOMEM */ - - if (root->root_key.objectid == - BTRFS_DATA_RELOC_TREE_OBJECTID) { - ret = btrfs_reloc_clone_csums(inode, start, - cur_alloc_size); - if (ret) { - btrfs_abort_transaction(trans, root, ret); - goto out_unlock; - } - } - - if (disk_num_bytes < cur_alloc_size) - break; - - /* we're not doing compressed IO, don't unlock the first - * page (which the caller expects to stay locked), don't - * clear any dirty bits and don't set any writeback bits - * - * Do set the Private2 bit so we know this page was properly - * setup for writepage - */ - op = unlock ? EXTENT_CLEAR_UNLOCK_PAGE : 0; - op |= EXTENT_CLEAR_UNLOCK | EXTENT_CLEAR_DELALLOC | - EXTENT_SET_PRIVATE2; - - extent_clear_unlock_delalloc(inode, &BTRFS_I(inode)->io_tree, - start, start + ram_size - 1, - locked_page, op); - disk_num_bytes -= cur_alloc_size; - num_bytes -= cur_alloc_size; - alloc_hint = ins.objectid + ins.offset; - start += cur_alloc_size; - } - ret = 0; -out: - btrfs_end_transaction(trans, root); - - return ret; -out_unlock: - extent_clear_unlock_delalloc(inode, - &BTRFS_I(inode)->io_tree, - start, end, NULL, - EXTENT_CLEAR_UNLOCK_PAGE | - EXTENT_CLEAR_UNLOCK | - EXTENT_CLEAR_DELALLOC | - EXTENT_CLEAR_DIRTY | - EXTENT_SET_WRITEBACK | - EXTENT_END_WRITEBACK); - - goto out; -} - -/* - * work queue call back to started compression on a file and pages - */ -static noinline void async_cow_start(struct btrfs_work *work) -{ - struct async_cow *async_cow; - int num_added = 0; - async_cow = container_of(work, struct async_cow, work); - - compress_file_range(async_cow->inode, async_cow->locked_page, - async_cow->start, async_cow->end, async_cow, - &num_added); - if (num_added == 0) - async_cow->inode = NULL; -} - -/* - * work queue call back to submit previously compressed pages - */ -static noinline void async_cow_submit(struct btrfs_work *work) -{ - struct async_cow *async_cow; - struct btrfs_root *root; - unsigned long nr_pages; - - async_cow = container_of(work, struct async_cow, work); - - root = async_cow->root; - nr_pages = (async_cow->end - async_cow->start + PAGE_CACHE_SIZE) >> - PAGE_CACHE_SHIFT; - - atomic_sub(nr_pages, &root->fs_info->async_delalloc_pages); - - if (atomic_read(&root->fs_info->async_delalloc_pages) < - 5 * 1042 * 1024 && - waitqueue_active(&root->fs_info->async_submit_wait)) - wake_up(&root->fs_info->async_submit_wait); - - if (async_cow->inode) - submit_compressed_extents(async_cow->inode, async_cow); -} - -static noinline void async_cow_free(struct btrfs_work *work) -{ - struct async_cow *async_cow; - async_cow = container_of(work, struct async_cow, work); - kfree(async_cow); -} - -static int cow_file_range_async(struct inode *inode, struct page *locked_page, - u64 start, u64 end, int *page_started, - unsigned long *nr_written) -{ - struct async_cow *async_cow; - struct btrfs_root *root = BTRFS_I(inode)->root; - unsigned long nr_pages; - u64 cur_end; - int limit = 10 * 1024 * 1042; - - clear_extent_bit(&BTRFS_I(inode)->io_tree, start, end, EXTENT_LOCKED, - 1, 0, NULL, GFP_NOFS); - while (start < 
end) { - async_cow = kmalloc(sizeof(*async_cow), GFP_NOFS); - BUG_ON(!async_cow); /* -ENOMEM */ - async_cow->inode = inode; - async_cow->root = root; - async_cow->locked_page = locked_page; - async_cow->start = start; - - if (BTRFS_I(inode)->flags & BTRFS_INODE_NOCOMPRESS) - cur_end = end; - else - cur_end = min(end, start + 512 * 1024 - 1); - - async_cow->end = cur_end; - INIT_LIST_HEAD(&async_cow->extents); - - async_cow->work.func = async_cow_start; - async_cow->work.ordered_func = async_cow_submit; - async_cow->work.ordered_free = async_cow_free; - async_cow->work.flags = 0; - - nr_pages = (cur_end - start + PAGE_CACHE_SIZE) >> - PAGE_CACHE_SHIFT; - atomic_add(nr_pages, &root->fs_info->async_delalloc_pages); - - btrfs_queue_worker(&root->fs_info->delalloc_workers, - &async_cow->work); - - if (atomic_read(&root->fs_info->async_delalloc_pages) > limit) { - wait_event(root->fs_info->async_submit_wait, - (atomic_read(&root->fs_info->async_delalloc_pages) < - limit)); - } - - while (atomic_read(&root->fs_info->async_submit_draining) && - atomic_read(&root->fs_info->async_delalloc_pages)) { - wait_event(root->fs_info->async_submit_wait, - (atomic_read(&root->fs_info->async_delalloc_pages) == - 0)); - } - - *nr_written += nr_pages; - start = cur_end + 1; - } - *page_started = 1; - return 0; -} - -static noinline int csum_exist_in_range(struct btrfs_root *root, - u64 bytenr, u64 num_bytes) -{ - int ret; - struct btrfs_ordered_sum *sums; - LIST_HEAD(list); - - ret = btrfs_lookup_csums_range(root->fs_info->csum_root, bytenr, - bytenr + num_bytes - 1, &list, 0); - if (ret == 0 && list_empty(&list)) - return 0; - - while (!list_empty(&list)) { - sums = list_entry(list.next, struct btrfs_ordered_sum, list); - list_del(&sums->list); - kfree(sums); - } - return 1; -} - -/* - * when nowcow writeback call back. This checks for snapshots or COW copies - * of the extents that exist in the file, and COWs the file as required. 
- * - * If no cow copies or snapshots exist, we write directly to the existing - * blocks on disk - */ -static noinline int run_delalloc_nocow(struct inode *inode, - struct page *locked_page, - u64 start, u64 end, int *page_started, int force, - unsigned long *nr_written) -{ - struct btrfs_root *root = BTRFS_I(inode)->root; - struct btrfs_trans_handle *trans; - struct extent_buffer *leaf; - struct btrfs_path *path; - struct btrfs_file_extent_item *fi; - struct btrfs_key found_key; - u64 cow_start; - u64 cur_offset; - u64 extent_end; - u64 extent_offset; - u64 disk_bytenr; - u64 num_bytes; - int extent_type; - int ret, err; - int type; - int nocow; - int check_prev = 1; - bool nolock; - u64 ino = btrfs_ino(inode); - - path = btrfs_alloc_path(); - if (!path) - return -ENOMEM; - - nolock = btrfs_is_free_space_inode(root, inode); - - if (nolock) - trans = btrfs_join_transaction_nolock(root); - else - trans = btrfs_join_transaction(root); - - if (IS_ERR(trans)) { - btrfs_free_path(path); - return PTR_ERR(trans); - } - - trans->block_rsv = &root->fs_info->delalloc_block_rsv; - - cow_start = (u64)-1; - cur_offset = start; - while (1) { - ret = btrfs_lookup_file_extent(trans, root, path, ino, - cur_offset, 0); - if (ret < 0) { - btrfs_abort_transaction(trans, root, ret); - goto error; - } - if (ret > 0 && path->slots[0] > 0 && check_prev) { - leaf = path->nodes[0]; - btrfs_item_key_to_cpu(leaf, &found_key, - path->slots[0] - 1); - if (found_key.objectid == ino && - found_key.type == BTRFS_EXTENT_DATA_KEY) - path->slots[0]--; - } - check_prev = 0; -next_slot: - leaf = path->nodes[0]; - if (path->slots[0] >= btrfs_header_nritems(leaf)) { - ret = btrfs_next_leaf(root, path); - if (ret < 0) { - btrfs_abort_transaction(trans, root, ret); - goto error; - } - if (ret > 0) - break; - leaf = path->nodes[0]; - } - - nocow = 0; - disk_bytenr = 0; - num_bytes = 0; - btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); - - if (found_key.objectid > ino || - found_key.type > BTRFS_EXTENT_DATA_KEY || - found_key.offset > end) - break; - - if (found_key.offset > cur_offset) { - extent_end = found_key.offset; - extent_type = 0; - goto out_check; - } - - fi = btrfs_item_ptr(leaf, path->slots[0], - struct btrfs_file_extent_item); - extent_type = btrfs_file_extent_type(leaf, fi); - - if (extent_type == BTRFS_FILE_EXTENT_REG || - extent_type == BTRFS_FILE_EXTENT_PREALLOC) { - disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi); - extent_offset = btrfs_file_extent_offset(leaf, fi); - extent_end = found_key.offset + - btrfs_file_extent_num_bytes(leaf, fi); - if (extent_end <= start) { - path->slots[0]++; - goto next_slot; - } - if (disk_bytenr == 0) - goto out_check; - if (btrfs_file_extent_compression(leaf, fi) || - btrfs_file_extent_encryption(leaf, fi) || - btrfs_file_extent_other_encoding(leaf, fi)) - goto out_check; - if (extent_type == BTRFS_FILE_EXTENT_REG && !force) - goto out_check; - if (btrfs_extent_readonly(root, disk_bytenr)) - goto out_check; - if (btrfs_cross_ref_exist(trans, root, ino, - found_key.offset - - extent_offset, disk_bytenr)) - goto out_check; - disk_bytenr += extent_offset; - disk_bytenr += cur_offset - found_key.offset; - num_bytes = min(end + 1, extent_end) - cur_offset; - /* - * force cow if csum exists in the range. - * this ensure that csum for a given extent are - * either valid or do not exist. 
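[Editor's note] Taken together, the per-extent checks walked through above amount to the following checklist; a standalone sketch with plain booleans standing in for the individual tests, not the kernel's data structures:

#include <stdbool.h>

/* Can this extent be written in place (nocow), or must it be COWed?  Any of
 * the conditions below forces the ordinary COW path. */
static bool can_write_nocow(bool is_reg_or_prealloc, bool has_disk_bytenr,
                            bool compressed, bool encrypted, bool other_encoding,
                            bool reg_without_force, bool block_group_readonly,
                            bool cross_ref_exists, bool csum_in_range)
{
        if (!is_reg_or_prealloc || !has_disk_bytenr)
                return false;
        if (compressed || encrypted || other_encoding)
                return false;
        if (reg_without_force)          /* plain REG extents only under nodatacow */
                return false;
        if (block_group_readonly || cross_ref_exists)
                return false;
        if (csum_in_range)              /* keep csums either fully valid or absent */
                return false;
        return true;
}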
- */ - if (csum_exist_in_range(root, disk_bytenr, num_bytes)) - goto out_check; - nocow = 1; - } else if (extent_type == BTRFS_FILE_EXTENT_INLINE) { - extent_end = found_key.offset + - btrfs_file_extent_inline_len(leaf, fi); - extent_end = ALIGN(extent_end, root->sectorsize); - } else { - BUG_ON(1); - } -out_check: - if (extent_end <= start) { - path->slots[0]++; - goto next_slot; - } - if (!nocow) { - if (cow_start == (u64)-1) - cow_start = cur_offset; - cur_offset = extent_end; - if (cur_offset > end) - break; - path->slots[0]++; - goto next_slot; - } - - btrfs_release_path(path); - if (cow_start != (u64)-1) { - ret = cow_file_range(inode, locked_page, cow_start, - found_key.offset - 1, page_started, - nr_written, 1); - if (ret) { - btrfs_abort_transaction(trans, root, ret); - goto error; - } - cow_start = (u64)-1; - } - - if (extent_type == BTRFS_FILE_EXTENT_PREALLOC) { - struct extent_map *em; - struct extent_map_tree *em_tree; - em_tree = &BTRFS_I(inode)->extent_tree; - em = alloc_extent_map(); - BUG_ON(!em); /* -ENOMEM */ - em->start = cur_offset; - em->orig_start = em->start; - em->len = num_bytes; - em->block_len = num_bytes; - em->block_start = disk_bytenr; - em->bdev = root->fs_info->fs_devices->latest_bdev; - set_bit(EXTENT_FLAG_PINNED, &em->flags); - while (1) { - write_lock(&em_tree->lock); - ret = add_extent_mapping(em_tree, em); - write_unlock(&em_tree->lock); - if (ret != -EEXIST) { - free_extent_map(em); - break; - } - btrfs_drop_extent_cache(inode, em->start, - em->start + em->len - 1, 0); - } - type = BTRFS_ORDERED_PREALLOC; - } else { - type = BTRFS_ORDERED_NOCOW; - } - - ret = btrfs_add_ordered_extent(inode, cur_offset, disk_bytenr, - num_bytes, num_bytes, type); - BUG_ON(ret); /* -ENOMEM */ - - if (root->root_key.objectid == - BTRFS_DATA_RELOC_TREE_OBJECTID) { - ret = btrfs_reloc_clone_csums(inode, cur_offset, - num_bytes); - if (ret) { - btrfs_abort_transaction(trans, root, ret); - goto error; - } - } - - extent_clear_unlock_delalloc(inode, &BTRFS_I(inode)->io_tree, - cur_offset, cur_offset + num_bytes - 1, - locked_page, EXTENT_CLEAR_UNLOCK_PAGE | - EXTENT_CLEAR_UNLOCK | EXTENT_CLEAR_DELALLOC | - EXTENT_SET_PRIVATE2); - cur_offset = extent_end; - if (cur_offset > end) - break; - } - btrfs_release_path(path); - - if (cur_offset <= end && cow_start == (u64)-1) - cow_start = cur_offset; - if (cow_start != (u64)-1) { - ret = cow_file_range(inode, locked_page, cow_start, end, - page_started, nr_written, 1); - if (ret) { - btrfs_abort_transaction(trans, root, ret); - goto error; - } - } - -error: - if (nolock) { - err = btrfs_end_transaction_nolock(trans, root); - } else { - err = btrfs_end_transaction(trans, root); - } - if (!ret) - ret = err; - - btrfs_free_path(path); - return ret; -} - -/* - * extent_io.c call back to do delayed allocation processing - */ -static int run_delalloc_range(struct inode *inode, struct page *locked_page, - u64 start, u64 end, int *page_started, - unsigned long *nr_written) -{ - int ret; - struct btrfs_root *root = BTRFS_I(inode)->root; - - if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW) - ret = run_delalloc_nocow(inode, locked_page, start, end, - page_started, 1, nr_written); - else if (BTRFS_I(inode)->flags & BTRFS_INODE_PREALLOC) - ret = run_delalloc_nocow(inode, locked_page, start, end, - page_started, 0, nr_written); - else if (!btrfs_test_opt(root, COMPRESS) && - !(BTRFS_I(inode)->force_compress) && - !(BTRFS_I(inode)->flags & BTRFS_INODE_COMPRESS)) - ret = cow_file_range(inode, locked_page, start, end, - page_started, nr_written, 
1); - else - ret = cow_file_range_async(inode, locked_page, start, end, - page_started, nr_written); - return ret; -} - -static void btrfs_split_extent_hook(struct inode *inode, - struct extent_state *orig, u64 split) -{ - /* not delalloc, ignore it */ - if (!(orig->state & EXTENT_DELALLOC)) - return; - - spin_lock(&BTRFS_I(inode)->lock); - BTRFS_I(inode)->outstanding_extents++; - spin_unlock(&BTRFS_I(inode)->lock); -} - -/* - * extent_io.c merge_extent_hook, used to track merged delayed allocation - * extents so we can keep track of new extents that are just merged onto old - * extents, such as when we are doing sequential writes, so we can properly - * account for the metadata space we'll need. - */ -static void btrfs_merge_extent_hook(struct inode *inode, - struct extent_state *new, - struct extent_state *other) -{ - /* not delalloc, ignore it */ - if (!(other->state & EXTENT_DELALLOC)) - return; - - spin_lock(&BTRFS_I(inode)->lock); - BTRFS_I(inode)->outstanding_extents--; - spin_unlock(&BTRFS_I(inode)->lock); -} - -/* - * extent_io.c set_bit_hook, used to track delayed allocation - * bytes in this file, and to maintain the list of inodes that - * have pending delalloc work to be done. - */ -static void btrfs_set_bit_hook(struct inode *inode, - struct extent_state *state, int *bits) -{ - - /* - * set_bit and clear bit hooks normally require _irqsave/restore - * but in this case, we are only testing for the DELALLOC - * bit, which is only set or cleared with irqs on - */ - if (!(state->state & EXTENT_DELALLOC) && (*bits & EXTENT_DELALLOC)) { - struct btrfs_root *root = BTRFS_I(inode)->root; - u64 len = state->end + 1 - state->start; - bool do_list = !btrfs_is_free_space_inode(root, inode); - - if (*bits & EXTENT_FIRST_DELALLOC) { - *bits &= ~EXTENT_FIRST_DELALLOC; - } else { - spin_lock(&BTRFS_I(inode)->lock); - BTRFS_I(inode)->outstanding_extents++; - spin_unlock(&BTRFS_I(inode)->lock); - } - - spin_lock(&root->fs_info->delalloc_lock); - BTRFS_I(inode)->delalloc_bytes += len; - root->fs_info->delalloc_bytes += len; - if (do_list && list_empty(&BTRFS_I(inode)->delalloc_inodes)) { - list_add_tail(&BTRFS_I(inode)->delalloc_inodes, - &root->fs_info->delalloc_inodes); - } - spin_unlock(&root->fs_info->delalloc_lock); - } -} - -/* - * extent_io.c clear_bit_hook, see set_bit_hook for why - */ -static void btrfs_clear_bit_hook(struct inode *inode, - struct extent_state *state, int *bits) -{ - /* - * set_bit and clear bit hooks normally require _irqsave/restore - * but in this case, we are only testing for the DELALLOC - * bit, which is only set or cleared with irqs on - */ - if ((state->state & EXTENT_DELALLOC) && (*bits & EXTENT_DELALLOC)) { - struct btrfs_root *root = BTRFS_I(inode)->root; - u64 len = state->end + 1 - state->start; - bool do_list = !btrfs_is_free_space_inode(root, inode); - - if (*bits & EXTENT_FIRST_DELALLOC) { - *bits &= ~EXTENT_FIRST_DELALLOC; - } else if (!(*bits & EXTENT_DO_ACCOUNTING)) { - spin_lock(&BTRFS_I(inode)->lock); - BTRFS_I(inode)->outstanding_extents--; - spin_unlock(&BTRFS_I(inode)->lock); - } - - if (*bits & EXTENT_DO_ACCOUNTING) - btrfs_delalloc_release_metadata(inode, len); - - if (root->root_key.objectid != BTRFS_DATA_RELOC_TREE_OBJECTID - && do_list) - btrfs_free_reserved_data_space(inode, len); - - spin_lock(&root->fs_info->delalloc_lock); - root->fs_info->delalloc_bytes -= len; - BTRFS_I(inode)->delalloc_bytes -= len; - - if (do_list && BTRFS_I(inode)->delalloc_bytes == 0 && - !list_empty(&BTRFS_I(inode)->delalloc_inodes)) { - 
list_del_init(&BTRFS_I(inode)->delalloc_inodes); - } - spin_unlock(&root->fs_info->delalloc_lock); - } -} - -/* - * extent_io.c merge_bio_hook, this must check the chunk tree to make sure - * we don't create bios that span stripes or chunks - */ -int btrfs_merge_bio_hook(struct page *page, unsigned long offset, - size_t size, struct bio *bio, - unsigned long bio_flags) -{ - struct btrfs_root *root = BTRFS_I(page->mapping->host)->root; - struct btrfs_mapping_tree *map_tree; - u64 logical = (u64)bio->bi_sector << 9; - u64 length = 0; - u64 map_length; - int ret; - - if (bio_flags & EXTENT_BIO_COMPRESSED) - return 0; - - length = bio->bi_size; - map_tree = &root->fs_info->mapping_tree; - map_length = length; - ret = btrfs_map_block(map_tree, READ, logical, - &map_length, NULL, 0); - /* Will always return 0 or 1 with map_multi == NULL */ - BUG_ON(ret < 0); - if (map_length < length + size) - return 1; - return 0; -} - -/* - * in order to insert checksums into the metadata in large chunks, - * we wait until bio submission time. All the pages in the bio are - * checksummed and sums are attached onto the ordered extent record. - * - * At IO completion time the cums attached on the ordered extent record - * are inserted into the btree - */ -static int __btrfs_submit_bio_start(struct inode *inode, int rw, - struct bio *bio, int mirror_num, - unsigned long bio_flags, - u64 bio_offset) -{ - struct btrfs_root *root = BTRFS_I(inode)->root; - int ret = 0; - - ret = btrfs_csum_one_bio(root, inode, bio, 0, 0); - BUG_ON(ret); /* -ENOMEM */ - return 0; -} - -/* - * in order to insert checksums into the metadata in large chunks, - * we wait until bio submission time. All the pages in the bio are - * checksummed and sums are attached onto the ordered extent record. - * - * At IO completion time the cums attached on the ordered extent record - * are inserted into the btree - */ -static int __btrfs_submit_bio_done(struct inode *inode, int rw, struct bio *bio, - int mirror_num, unsigned long bio_flags, - u64 bio_offset) -{ - struct btrfs_root *root = BTRFS_I(inode)->root; - return btrfs_map_bio(root, rw, bio, mirror_num, 1); -} - -/* - * extent_io.c submission hook. This does the right thing for csum calculation - * on write, or reading the csums from the tree before a read - */ -static int btrfs_submit_bio_hook(struct inode *inode, int rw, struct bio *bio, - int mirror_num, unsigned long bio_flags, - u64 bio_offset) -{ - struct btrfs_root *root = BTRFS_I(inode)->root; - int ret = 0; - int skip_sum; - int metadata = 0; - - skip_sum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM; - - if (btrfs_is_free_space_inode(root, inode)) - metadata = 2; - - ret = btrfs_bio_wq_end_io(root->fs_info, bio, metadata); - if (ret) - return ret; - - if (!(rw & REQ_WRITE)) { - if (bio_flags & EXTENT_BIO_COMPRESSED) { - return btrfs_submit_compressed_read(inode, bio, - mirror_num, bio_flags); - } else if (!skip_sum) { - ret = btrfs_lookup_bio_sums(root, inode, bio, NULL); - if (ret) - return ret; - } - goto mapit; - } else if (!skip_sum) { - /* csum items have already been cloned */ - if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID) - goto mapit; - /* we're doing a write, do the async checksumming */ - return btrfs_wq_submit_bio(BTRFS_I(inode)->root->fs_info, - inode, rw, bio, mirror_num, - bio_flags, bio_offset, - __btrfs_submit_bio_start, - __btrfs_submit_bio_done); - } - -mapit: - return btrfs_map_bio(root, rw, bio, mirror_num, 0); -} - -/* - * given a list of ordered sums record them in the inode. 
This happens - * at IO completion time based on sums calculated at bio submission time. - */ -static noinline int add_pending_csums(struct btrfs_trans_handle *trans, - struct inode *inode, u64 file_offset, - struct list_head *list) -{ - struct btrfs_ordered_sum *sum; - - list_for_each_entry(sum, list, list) { - btrfs_csum_file_blocks(trans, - BTRFS_I(inode)->root->fs_info->csum_root, sum); - } - return 0; -} - -int btrfs_set_extent_delalloc(struct inode *inode, u64 start, u64 end, - struct extent_state **cached_state) -{ - if ((end & (PAGE_CACHE_SIZE - 1)) == 0) - WARN_ON(1); - return set_extent_delalloc(&BTRFS_I(inode)->io_tree, start, end, - cached_state, GFP_NOFS); -} - -/* see btrfs_writepage_start_hook for details on why this is required */ -struct btrfs_writepage_fixup { - struct page *page; - struct btrfs_work work; -}; - -static void btrfs_writepage_fixup_worker(struct btrfs_work *work) -{ - struct btrfs_writepage_fixup *fixup; - struct btrfs_ordered_extent *ordered; - struct extent_state *cached_state = NULL; - struct page *page; - struct inode *inode; - u64 page_start; - u64 page_end; - int ret; - - fixup = container_of(work, struct btrfs_writepage_fixup, work); - page = fixup->page; -again: - lock_page(page); - if (!page->mapping || !PageDirty(page) || !PageChecked(page)) { - ClearPageChecked(page); - goto out_page; - } - - inode = page->mapping->host; - page_start = page_offset(page); - page_end = page_offset(page) + PAGE_CACHE_SIZE - 1; - - lock_extent_bits(&BTRFS_I(inode)->io_tree, page_start, page_end, 0, - &cached_state); - - /* already ordered? We're done */ - if (PagePrivate2(page)) - goto out; - - ordered = btrfs_lookup_ordered_extent(inode, page_start); - if (ordered) { - unlock_extent_cached(&BTRFS_I(inode)->io_tree, page_start, - page_end, &cached_state, GFP_NOFS); - unlock_page(page); - btrfs_start_ordered_extent(inode, ordered, 1); - btrfs_put_ordered_extent(ordered); - goto again; - } - - ret = btrfs_delalloc_reserve_space(inode, PAGE_CACHE_SIZE); - if (ret) { - mapping_set_error(page->mapping, ret); - end_extent_writepage(page, ret, page_start, page_end); - ClearPageChecked(page); - goto out; - } - - btrfs_set_extent_delalloc(inode, page_start, page_end, &cached_state); - ClearPageChecked(page); - set_page_dirty(page); -out: - unlock_extent_cached(&BTRFS_I(inode)->io_tree, page_start, page_end, - &cached_state, GFP_NOFS); -out_page: - unlock_page(page); - page_cache_release(page); - kfree(fixup); -} - -/* - * There are a few paths in the higher layers of the kernel that directly - * set the page dirty bit without asking the filesystem if it is a - * good idea. This causes problems because we want to make sure COW - * properly happens and the data=ordered rules are followed. - * - * In our case any range that doesn't have the ORDERED bit set - * hasn't been properly setup for IO. We kick off an async process - * to fix it up. The async helper will wait for ordered extents, set - * the delalloc bit and make it safe to write the page. 
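[Editor's note] The start hook below boils down to a three-way verdict per page; a sketch of that decision with illustrative names (in the real code the three outcomes are returned as 0, -EAGAIN and -EBUSY):

#include <stdbool.h>

enum fixup_verdict {
        WRITE_IT,               /* range was set up properly, let writeback run */
        TRY_AGAIN_LATER,        /* fixup already queued, or no memory right now */
        QUEUED_FIXUP,           /* async worker will redo the delalloc setup */
};

static enum fixup_verdict writepage_start_verdict(bool ordered_bit_was_set,
                                                  bool already_checked,
                                                  bool fixup_allocated)
{
        if (ordered_bit_was_set)
                return WRITE_IT;
        if (already_checked || !fixup_allocated)
                return TRY_AGAIN_LATER;
        return QUEUED_FIXUP;
}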
- */ -static int btrfs_writepage_start_hook(struct page *page, u64 start, u64 end) -{ - struct inode *inode = page->mapping->host; - struct btrfs_writepage_fixup *fixup; - struct btrfs_root *root = BTRFS_I(inode)->root; - - /* this page is properly in the ordered list */ - if (TestClearPagePrivate2(page)) - return 0; - - if (PageChecked(page)) - return -EAGAIN; - - fixup = kzalloc(sizeof(*fixup), GFP_NOFS); - if (!fixup) - return -EAGAIN; - - SetPageChecked(page); - page_cache_get(page); - fixup->work.func = btrfs_writepage_fixup_worker; - fixup->page = page; - btrfs_queue_worker(&root->fs_info->fixup_workers, &fixup->work); - return -EBUSY; -} - -static int insert_reserved_file_extent(struct btrfs_trans_handle *trans, - struct inode *inode, u64 file_pos, - u64 disk_bytenr, u64 disk_num_bytes, - u64 num_bytes, u64 ram_bytes, - u8 compression, u8 encryption, - u16 other_encoding, int extent_type) -{ - struct btrfs_root *root = BTRFS_I(inode)->root; - struct btrfs_file_extent_item *fi; - struct btrfs_path *path; - struct extent_buffer *leaf; - struct btrfs_key ins; - u64 hint; - int ret; - - path = btrfs_alloc_path(); - if (!path) - return -ENOMEM; - - path->leave_spinning = 1; - - /* - * we may be replacing one extent in the tree with another. - * The new extent is pinned in the extent map, and we don't want - * to drop it from the cache until it is completely in the btree. - * - * So, tell btrfs_drop_extents to leave this extent in the cache. - * the caller is expected to unpin it and allow it to be merged - * with the others. - */ - ret = btrfs_drop_extents(trans, inode, file_pos, file_pos + num_bytes, - &hint, 0); - if (ret) - goto out; - - ins.objectid = btrfs_ino(inode); - ins.offset = file_pos; - ins.type = BTRFS_EXTENT_DATA_KEY; - ret = btrfs_insert_empty_item(trans, root, path, &ins, sizeof(*fi)); - if (ret) - goto out; - leaf = path->nodes[0]; - fi = btrfs_item_ptr(leaf, path->slots[0], - struct btrfs_file_extent_item); - btrfs_set_file_extent_generation(leaf, fi, trans->transid); - btrfs_set_file_extent_type(leaf, fi, extent_type); - btrfs_set_file_extent_disk_bytenr(leaf, fi, disk_bytenr); - btrfs_set_file_extent_disk_num_bytes(leaf, fi, disk_num_bytes); - btrfs_set_file_extent_offset(leaf, fi, 0); - btrfs_set_file_extent_num_bytes(leaf, fi, num_bytes); - btrfs_set_file_extent_ram_bytes(leaf, fi, ram_bytes); - btrfs_set_file_extent_compression(leaf, fi, compression); - btrfs_set_file_extent_encryption(leaf, fi, encryption); - btrfs_set_file_extent_other_encoding(leaf, fi, other_encoding); - - btrfs_unlock_up_safe(path, 1); - btrfs_set_lock_blocking(leaf); - - btrfs_mark_buffer_dirty(leaf); - - inode_add_bytes(inode, num_bytes); - - ins.objectid = disk_bytenr; - ins.offset = disk_num_bytes; - ins.type = BTRFS_EXTENT_ITEM_KEY; - ret = btrfs_alloc_reserved_file_extent(trans, root, - root->root_key.objectid, - btrfs_ino(inode), file_pos, &ins); -out: - btrfs_free_path(path); - - return ret; -} - -/* - * helper function for btrfs_finish_ordered_io, this - * just reads in some of the csum leaves to prime them into ram - * before we start the transaction. It limits the amount of btree - * reads required while inside the transaction. - */ -/* as ordered data IO finishes, this gets called so we can finish - * an ordered extent if the range of bytes in the file it covers are - * fully written. 
- */ -static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end) -{ - struct btrfs_root *root = BTRFS_I(inode)->root; - struct btrfs_trans_handle *trans = NULL; - struct btrfs_ordered_extent *ordered_extent = NULL; - struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; - struct extent_state *cached_state = NULL; - int compress_type = 0; - int ret; - bool nolock; - - ret = btrfs_dec_test_ordered_pending(inode, &ordered_extent, start, - end - start + 1); - if (!ret) - return 0; - BUG_ON(!ordered_extent); /* Logic error */ - - nolock = btrfs_is_free_space_inode(root, inode); - - if (test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags)) { - BUG_ON(!list_empty(&ordered_extent->list)); /* Logic error */ - ret = btrfs_ordered_update_i_size(inode, 0, ordered_extent); - if (!ret) { - if (nolock) - trans = btrfs_join_transaction_nolock(root); - else - trans = btrfs_join_transaction(root); - if (IS_ERR(trans)) - return PTR_ERR(trans); - trans->block_rsv = &root->fs_info->delalloc_block_rsv; - ret = btrfs_update_inode_fallback(trans, root, inode); - if (ret) /* -ENOMEM or corruption */ - btrfs_abort_transaction(trans, root, ret); - } - goto out; - } - - lock_extent_bits(io_tree, ordered_extent->file_offset, - ordered_extent->file_offset + ordered_extent->len - 1, - 0, &cached_state); - - if (nolock) - trans = btrfs_join_transaction_nolock(root); - else - trans = btrfs_join_transaction(root); - if (IS_ERR(trans)) { - ret = PTR_ERR(trans); - trans = NULL; - goto out_unlock; - } - trans->block_rsv = &root->fs_info->delalloc_block_rsv; - - if (test_bit(BTRFS_ORDERED_COMPRESSED, &ordered_extent->flags)) - compress_type = ordered_extent->compress_type; - if (test_bit(BTRFS_ORDERED_PREALLOC, &ordered_extent->flags)) { - BUG_ON(compress_type); - ret = btrfs_mark_extent_written(trans, inode, - ordered_extent->file_offset, - ordered_extent->file_offset + - ordered_extent->len); - } else { - BUG_ON(root == root->fs_info->tree_root); - ret = insert_reserved_file_extent(trans, inode, - ordered_extent->file_offset, - ordered_extent->start, - ordered_extent->disk_len, - ordered_extent->len, - ordered_extent->len, - compress_type, 0, 0, - BTRFS_FILE_EXTENT_REG); - unpin_extent_cache(&BTRFS_I(inode)->extent_tree, - ordered_extent->file_offset, - ordered_extent->len); - } - unlock_extent_cached(io_tree, ordered_extent->file_offset, - ordered_extent->file_offset + - ordered_extent->len - 1, &cached_state, GFP_NOFS); - if (ret < 0) { - btrfs_abort_transaction(trans, root, ret); - goto out; - } - - add_pending_csums(trans, inode, ordered_extent->file_offset, - &ordered_extent->list); - - ret = btrfs_ordered_update_i_size(inode, 0, ordered_extent); - if (!ret || !test_bit(BTRFS_ORDERED_PREALLOC, &ordered_extent->flags)) { - ret = btrfs_update_inode_fallback(trans, root, inode); - if (ret) { /* -ENOMEM or corruption */ - btrfs_abort_transaction(trans, root, ret); - goto out; - } - } - ret = 0; -out: - if (root != root->fs_info->tree_root) - btrfs_delalloc_release_metadata(inode, ordered_extent->len); - if (trans) { - if (nolock) - btrfs_end_transaction_nolock(trans, root); - else - btrfs_end_transaction(trans, root); - } - - /* once for us */ - btrfs_put_ordered_extent(ordered_extent); - /* once for the tree */ - btrfs_put_ordered_extent(ordered_extent); - - return 0; -out_unlock: - unlock_extent_cached(io_tree, ordered_extent->file_offset, - ordered_extent->file_offset + - ordered_extent->len - 1, &cached_state, GFP_NOFS); - goto out; -} - -static int btrfs_writepage_end_io_hook(struct page *page, 
u64 start, u64 end, - struct extent_state *state, int uptodate) -{ - trace_btrfs_writepage_end_io_hook(page, start, end, uptodate); - - ClearPagePrivate2(page); - return btrfs_finish_ordered_io(page->mapping->host, start, end); -} - -/* - * when reads are done, we need to check csums to verify the data is correct - * if there's a match, we allow the bio to finish. If not, the code in - * extent_io.c will try to find good copies for us. - */ -static int btrfs_readpage_end_io_hook(struct page *page, u64 start, u64 end, - struct extent_state *state, int mirror) -{ - size_t offset = start - ((u64)page->index << PAGE_CACHE_SHIFT); - struct inode *inode = page->mapping->host; - struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; - char *kaddr; - u64 private = ~(u32)0; - int ret; - struct btrfs_root *root = BTRFS_I(inode)->root; - u32 csum = ~(u32)0; - - if (PageChecked(page)) { - ClearPageChecked(page); - goto good; - } - - if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM) - goto good; - - if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID && - test_range_bit(io_tree, start, end, EXTENT_NODATASUM, 1, NULL)) { - clear_extent_bits(io_tree, start, end, EXTENT_NODATASUM, - GFP_NOFS); - return 0; - } - - if (state && state->start == start) { - private = state->private; - ret = 0; - } else { - ret = get_state_private(io_tree, start, &private); - } - kaddr = kmap_atomic(page); - if (ret) - goto zeroit; - - csum = btrfs_csum_data(root, kaddr + offset, csum, end - start + 1); - btrfs_csum_final(csum, (char *)&csum); - if (csum != private) - goto zeroit; - - kunmap_atomic(kaddr); -good: - return 0; - -zeroit: - printk_ratelimited(KERN_INFO "btrfs csum failed ino %llu off %llu csum %u " - "private %llu\n", - (unsigned long long)btrfs_ino(page->mapping->host), - (unsigned long long)start, csum, - (unsigned long long)private); - memset(kaddr + offset, 1, end - start + 1); - flush_dcache_page(page); - kunmap_atomic(kaddr); - if (private == 0) - return 0; - return -EIO; -} - -struct delayed_iput { - struct list_head list; - struct inode *inode; -}; - -/* JDM: If this is fs-wide, why can't we add a pointer to - * btrfs_inode instead and avoid the allocation? */ -void btrfs_add_delayed_iput(struct inode *inode) -{ - struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info; - struct delayed_iput *delayed; - - if (atomic_add_unless(&inode->i_count, -1, 1)) - return; - - delayed = kmalloc(sizeof(*delayed), GFP_NOFS | __GFP_NOFAIL); - delayed->inode = inode; - - spin_lock(&fs_info->delayed_iput_lock); - list_add_tail(&delayed->list, &fs_info->delayed_iputs); - spin_unlock(&fs_info->delayed_iput_lock); -} - -void btrfs_run_delayed_iputs(struct btrfs_root *root) -{ - LIST_HEAD(list); - struct btrfs_fs_info *fs_info = root->fs_info; - struct delayed_iput *delayed; - int empty; - - spin_lock(&fs_info->delayed_iput_lock); - empty = list_empty(&fs_info->delayed_iputs); - spin_unlock(&fs_info->delayed_iput_lock); - if (empty) - return; - - down_read(&root->fs_info->cleanup_work_sem); - spin_lock(&fs_info->delayed_iput_lock); - list_splice_init(&fs_info->delayed_iputs, &list); - spin_unlock(&fs_info->delayed_iput_lock); - - while (!list_empty(&list)) { - delayed = list_entry(list.next, struct delayed_iput, list); - list_del(&delayed->list); - iput(delayed->inode); - kfree(delayed); - } - up_read(&root->fs_info->cleanup_work_sem); -} - -enum btrfs_orphan_cleanup_state { - ORPHAN_CLEANUP_STARTED = 1, - ORPHAN_CLEANUP_DONE = 2, -}; - -/* - * This is called in transaction commit time. 
If there are no orphan - * files in the subvolume, it removes orphan item and frees block_rsv - * structure. - */ -void btrfs_orphan_commit_root(struct btrfs_trans_handle *trans, - struct btrfs_root *root) -{ - struct btrfs_block_rsv *block_rsv; - int ret; - - if (!list_empty(&root->orphan_list) || - root->orphan_cleanup_state != ORPHAN_CLEANUP_DONE) - return; - - spin_lock(&root->orphan_lock); - if (!list_empty(&root->orphan_list)) { - spin_unlock(&root->orphan_lock); - return; - } - - if (root->orphan_cleanup_state != ORPHAN_CLEANUP_DONE) { - spin_unlock(&root->orphan_lock); - return; - } - - block_rsv = root->orphan_block_rsv; - root->orphan_block_rsv = NULL; - spin_unlock(&root->orphan_lock); - - if (root->orphan_item_inserted && - btrfs_root_refs(&root->root_item) > 0) { - ret = btrfs_del_orphan_item(trans, root->fs_info->tree_root, - root->root_key.objectid); - BUG_ON(ret); - root->orphan_item_inserted = 0; - } - - if (block_rsv) { - WARN_ON(block_rsv->size > 0); - btrfs_free_block_rsv(root, block_rsv); - } -} - -/* - * This creates an orphan entry for the given inode in case something goes - * wrong in the middle of an unlink/truncate. - * - * NOTE: caller of this function should reserve 5 units of metadata for - * this function. - */ -int btrfs_orphan_add(struct btrfs_trans_handle *trans, struct inode *inode) -{ - struct btrfs_root *root = BTRFS_I(inode)->root; - struct btrfs_block_rsv *block_rsv = NULL; - int reserve = 0; - int insert = 0; - int ret; - - if (!root->orphan_block_rsv) { - block_rsv = btrfs_alloc_block_rsv(root); - if (!block_rsv) - return -ENOMEM; - } - - spin_lock(&root->orphan_lock); - if (!root->orphan_block_rsv) { - root->orphan_block_rsv = block_rsv; - } else if (block_rsv) { - btrfs_free_block_rsv(root, block_rsv); - block_rsv = NULL; - } - - if (list_empty(&BTRFS_I(inode)->i_orphan)) { - list_add(&BTRFS_I(inode)->i_orphan, &root->orphan_list); -#if 0 - /* - * For proper ENOSPC handling, we should do orphan - * cleanup when mounting. But this introduces backward - * compatibility issue. - */ - if (!xchg(&root->orphan_item_inserted, 1)) - insert = 2; - else - insert = 1; -#endif - insert = 1; - } - - if (!BTRFS_I(inode)->orphan_meta_reserved) { - BTRFS_I(inode)->orphan_meta_reserved = 1; - reserve = 1; - } - spin_unlock(&root->orphan_lock); - - /* grab metadata reservation from transaction handle */ - if (reserve) { - ret = btrfs_orphan_reserve_metadata(trans, inode); - BUG_ON(ret); /* -ENOSPC in reservation; Logic error? JDM */ - } - - /* insert an orphan item to track this unlinked/truncated file */ - if (insert >= 1) { - ret = btrfs_insert_orphan_item(trans, root, btrfs_ino(inode)); - if (ret && ret != -EEXIST) { - btrfs_abort_transaction(trans, root, ret); - return ret; - } - ret = 0; - } - - /* insert an orphan item to track subvolume contains orphan files */ - if (insert >= 2) { - ret = btrfs_insert_orphan_item(trans, root->fs_info->tree_root, - root->root_key.objectid); - if (ret && ret != -EEXIST) { - btrfs_abort_transaction(trans, root, ret); - return ret; - } - } - return 0; -} - -/* - * We have done the truncate/delete so we can go ahead and remove the orphan - * item for this particular inode. 
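btrfs_orphan_add and btrfs_orphan_del above bracket an unlink or truncate with an on-disk orphan item keyed by the inode number, so a crash in the middle leaves a record that btrfs_orphan_cleanup can finish at the next mount. A rough userspace model of that protocol (illustrative only; the array standing in for persisted items and all names here are invented):

/* Toy model of the orphan-item protocol: add before unlink, delete after,
 * and let a mount-time sweep finish whatever a crash left behind. */
#include <stdio.h>
#include <stdbool.h>
#include <stdint.h>

#define MAX_ORPHANS 64

static uint64_t orphans[MAX_ORPHANS];   /* stands in for persisted orphan items */
static int nr_orphans;

static bool orphan_add(uint64_t ino)
{
	if (nr_orphans == MAX_ORPHANS)
		return false;
	orphans[nr_orphans++] = ino;        /* recorded before the unlink starts */
	return true;
}

static void orphan_del(uint64_t ino)
{
	for (int i = 0; i < nr_orphans; i++) {
		if (orphans[i] == ino) {
			orphans[i] = orphans[--nr_orphans]; /* unlink finished cleanly */
			return;
		}
	}
}

static void orphan_cleanup(void)
{
	/* mount-time sweep: every surviving entry is an unlink/truncate that
	 * never completed, so finish deleting it now */
	while (nr_orphans > 0) {
		uint64_t ino = orphans[--nr_orphans];
		printf("cleanup: finishing delete of inode %llu\n",
		       (unsigned long long)ino);
	}
}

int main(void)
{
	orphan_add(257);
	orphan_del(257);        /* normal case: add, do the work, del */

	orphan_add(258);        /* crash here: the item survives ... */
	orphan_cleanup();       /* ... and the next mount finishes the job */
	return 0;
}
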
- */ -int btrfs_orphan_del(struct btrfs_trans_handle *trans, struct inode *inode) -{ - struct btrfs_root *root = BTRFS_I(inode)->root; - int delete_item = 0; - int release_rsv = 0; - int ret = 0; - - spin_lock(&root->orphan_lock); - if (!list_empty(&BTRFS_I(inode)->i_orphan)) { - list_del_init(&BTRFS_I(inode)->i_orphan); - delete_item = 1; - } - - if (BTRFS_I(inode)->orphan_meta_reserved) { - BTRFS_I(inode)->orphan_meta_reserved = 0; - release_rsv = 1; - } - spin_unlock(&root->orphan_lock); - - if (trans && delete_item) { - ret = btrfs_del_orphan_item(trans, root, btrfs_ino(inode)); - BUG_ON(ret); /* -ENOMEM or corruption (JDM: Recheck) */ - } - - if (release_rsv) - btrfs_orphan_release_metadata(inode); - - return 0; -} - -/* - * this cleans up any orphans that may be left on the list from the last use - * of this root. - */ -int btrfs_orphan_cleanup(struct btrfs_root *root) -{ - struct btrfs_path *path; - struct extent_buffer *leaf; - struct btrfs_key key, found_key; - struct btrfs_trans_handle *trans; - struct inode *inode; - u64 last_objectid = 0; - int ret = 0, nr_unlink = 0, nr_truncate = 0; - - if (cmpxchg(&root->orphan_cleanup_state, 0, ORPHAN_CLEANUP_STARTED)) - return 0; - - path = btrfs_alloc_path(); - if (!path) { - ret = -ENOMEM; - goto out; - } - path->reada = -1; - - key.objectid = BTRFS_ORPHAN_OBJECTID; - btrfs_set_key_type(&key, BTRFS_ORPHAN_ITEM_KEY); - key.offset = (u64)-1; - - while (1) { - ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); - if (ret < 0) - goto out; - - /* - * if ret == 0 means we found what we were searching for, which - * is weird, but possible, so only screw with path if we didn't - * find the key and see if we have stuff that matches - */ - if (ret > 0) { - ret = 0; - if (path->slots[0] == 0) - break; - path->slots[0]--; - } - - /* pull out the item */ - leaf = path->nodes[0]; - btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); - - /* make sure the item matches what we want */ - if (found_key.objectid != BTRFS_ORPHAN_OBJECTID) - break; - if (btrfs_key_type(&found_key) != BTRFS_ORPHAN_ITEM_KEY) - break; - - /* release the path since we're done with it */ - btrfs_release_path(path); - - /* - * this is where we are basically btrfs_lookup, without the - * crossing root thing. we store the inode number in the - * offset of the orphan item. - */ - - if (found_key.offset == last_objectid) { - printk(KERN_ERR "btrfs: Error removing orphan entry, " - "stopping orphan cleanup\n"); - ret = -EINVAL; - goto out; - } - - last_objectid = found_key.offset; - - found_key.objectid = found_key.offset; - found_key.type = BTRFS_INODE_ITEM_KEY; - found_key.offset = 0; - inode = btrfs_iget(root->fs_info->sb, &found_key, root, NULL); - ret = PTR_RET(inode); - if (ret && ret != -ESTALE) - goto out; - - if (ret == -ESTALE && root == root->fs_info->tree_root) { - struct btrfs_root *dead_root; - struct btrfs_fs_info *fs_info = root->fs_info; - int is_dead_root = 0; - - /* - * this is an orphan in the tree root. Currently these - * could come from 2 sources: - * a) a snapshot deletion in progress - * b) a free space cache inode - * We need to distinguish those two, as the snapshot - * orphan must not get deleted. 
- * find_dead_roots already ran before us, so if this - * is a snapshot deletion, we should find the root - * in the dead_roots list - */ - spin_lock(&fs_info->trans_lock); - list_for_each_entry(dead_root, &fs_info->dead_roots, - root_list) { - if (dead_root->root_key.objectid == - found_key.objectid) { - is_dead_root = 1; - break; - } - } - spin_unlock(&fs_info->trans_lock); - if (is_dead_root) { - /* prevent this orphan from being found again */ - key.offset = found_key.objectid - 1; - continue; - } - } - /* - * Inode is already gone but the orphan item is still there, - * kill the orphan item. - */ - if (ret == -ESTALE) { - trans = btrfs_start_transaction(root, 1); - if (IS_ERR(trans)) { - ret = PTR_ERR(trans); - goto out; - } - ret = btrfs_del_orphan_item(trans, root, - found_key.objectid); - BUG_ON(ret); /* -ENOMEM or corruption (JDM: Recheck) */ - btrfs_end_transaction(trans, root); - continue; - } - - /* - * add this inode to the orphan list so btrfs_orphan_del does - * the proper thing when we hit it - */ - spin_lock(&root->orphan_lock); - list_add(&BTRFS_I(inode)->i_orphan, &root->orphan_list); - spin_unlock(&root->orphan_lock); - - /* if we have links, this was a truncate, lets do that */ - if (inode->i_nlink) { - if (!S_ISREG(inode->i_mode)) { - WARN_ON(1); - iput(inode); - continue; - } - nr_truncate++; - ret = btrfs_truncate(inode); - } else { - nr_unlink++; - } - - /* this will do delete_inode and everything for us */ - iput(inode); - if (ret) - goto out; - } - /* release the path since we're done with it */ - btrfs_release_path(path); - - root->orphan_cleanup_state = ORPHAN_CLEANUP_DONE; - - if (root->orphan_block_rsv) - btrfs_block_rsv_release(root, root->orphan_block_rsv, - (u64)-1); - - if (root->orphan_block_rsv || root->orphan_item_inserted) { - trans = btrfs_join_transaction(root); - if (!IS_ERR(trans)) - btrfs_end_transaction(trans, root); - } - - if (nr_unlink) - printk(KERN_INFO "btrfs: unlinked %d orphans\n", nr_unlink); - if (nr_truncate) - printk(KERN_INFO "btrfs: truncated %d orphans\n", nr_truncate); - -out: - if (ret) - printk(KERN_CRIT "btrfs: could not do orphan cleanup %d\n", ret); - btrfs_free_path(path); - return ret; -} - -/* - * very simple check to peek ahead in the leaf looking for xattrs. If we - * don't find any xattrs, we know there can't be any acls. - * - * slot is the slot the inode is in, objectid is the objectid of the inode - */ -static noinline int acls_after_inode_item(struct extent_buffer *leaf, - int slot, u64 objectid) -{ - u32 nritems = btrfs_header_nritems(leaf); - struct btrfs_key found_key; - int scanned = 0; - - slot++; - while (slot < nritems) { - btrfs_item_key_to_cpu(leaf, &found_key, slot); - - /* we found a different objectid, there must not be acls */ - if (found_key.objectid != objectid) - return 0; - - /* we found an xattr, assume we've got an acl */ - if (found_key.type == BTRFS_XATTR_ITEM_KEY) - return 1; - - /* - * we found a key greater than an xattr key, there can't - * be any acls later on - */ - if (found_key.type > BTRFS_XATTR_ITEM_KEY) - return 0; - - slot++; - scanned++; - - /* - * it goes inode, inode backrefs, xattrs, extents, - * so if there are a ton of hard links to an inode there can - * be a lot of backrefs. Don't waste time searching too hard, - * this is just an optimization - */ - if (scanned >= 8) - break; - } - /* we hit the end of the leaf before we found an xattr or - * something larger than an xattr. 
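acls_after_inode_item above exploits the leaf sort order: items for one object are grouped by (objectid, type) and xattrs sort shortly after the inode item, so a bounded forward scan can prove there are no xattrs, and therefore no ACLs, without consulting anything else. A standalone sketch of the same scan over a sorted key array (illustrative; the struct and type values are simplified stand-ins):

/* Peek ahead in a sorted (objectid, type) array for an xattr key. */
#include <stdio.h>
#include <stdint.h>

struct key { uint64_t objectid; uint32_t type; };

enum { INODE_ITEM = 1, XATTR_ITEM = 24, EXTENT_DATA = 108 };

/* returns 1 if the object may have ACLs, 0 if it definitely has none */
static int may_have_acls(const struct key *keys, int nritems, int slot,
			 uint64_t objectid)
{
	int scanned = 0;

	for (slot++; slot < nritems; slot++) {
		if (keys[slot].objectid != objectid)
			return 0;       /* moved on to another object: no xattrs */
		if (keys[slot].type == XATTR_ITEM)
			return 1;       /* found an xattr: assume ACLs exist */
		if (keys[slot].type > XATTR_ITEM)
			return 0;       /* sorted keys: nothing later can be an xattr */
		if (++scanned >= 8)
			break;          /* bounded scan: give up and say maybe */
	}
	return 1;                       /* ran off the array: have to assume ACLs */
}

int main(void)
{
	struct key leaf[] = {
		{ 256, INODE_ITEM },
		{ 256, EXTENT_DATA },
		{ 300, INODE_ITEM },
		{ 300, XATTR_ITEM },
	};
	printf("ino 256: %d\n", may_have_acls(leaf, 4, 0, 256)); /* 0: no xattrs */
	printf("ino 300: %d\n", may_have_acls(leaf, 4, 2, 300)); /* 1: xattr found */
	return 0;
}
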
We have to assume the inode - * has acls - */ - return 1; -} - -/* - * read an inode from the btree into the in-memory inode - */ -static void btrfs_read_locked_inode(struct inode *inode) -{ - struct btrfs_path *path; - struct extent_buffer *leaf; - struct btrfs_inode_item *inode_item; - struct btrfs_timespec *tspec; - struct btrfs_root *root = BTRFS_I(inode)->root; - struct btrfs_key location; - int maybe_acls; - u32 rdev; - int ret; - bool filled = false; - - ret = btrfs_fill_inode(inode, &rdev); - if (!ret) - filled = true; - - path = btrfs_alloc_path(); - if (!path) - goto make_bad; - - path->leave_spinning = 1; - memcpy(&location, &BTRFS_I(inode)->location, sizeof(location)); - - ret = btrfs_lookup_inode(NULL, root, path, &location, 0); - if (ret) - goto make_bad; - - leaf = path->nodes[0]; - - if (filled) - goto cache_acl; - - inode_item = btrfs_item_ptr(leaf, path->slots[0], - struct btrfs_inode_item); - inode->i_mode = btrfs_inode_mode(leaf, inode_item); - set_nlink(inode, btrfs_inode_nlink(leaf, inode_item)); - inode->i_uid = btrfs_inode_uid(leaf, inode_item); - inode->i_gid = btrfs_inode_gid(leaf, inode_item); - btrfs_i_size_write(inode, btrfs_inode_size(leaf, inode_item)); - - tspec = btrfs_inode_atime(inode_item); - inode->i_atime.tv_sec = btrfs_timespec_sec(leaf, tspec); - inode->i_atime.tv_nsec = btrfs_timespec_nsec(leaf, tspec); - - tspec = btrfs_inode_mtime(inode_item); - inode->i_mtime.tv_sec = btrfs_timespec_sec(leaf, tspec); - inode->i_mtime.tv_nsec = btrfs_timespec_nsec(leaf, tspec); - - tspec = btrfs_inode_ctime(inode_item); - inode->i_ctime.tv_sec = btrfs_timespec_sec(leaf, tspec); - inode->i_ctime.tv_nsec = btrfs_timespec_nsec(leaf, tspec); - - inode_set_bytes(inode, btrfs_inode_nbytes(leaf, inode_item)); - BTRFS_I(inode)->generation = btrfs_inode_generation(leaf, inode_item); - BTRFS_I(inode)->sequence = btrfs_inode_sequence(leaf, inode_item); - inode->i_generation = BTRFS_I(inode)->generation; - inode->i_rdev = 0; - rdev = btrfs_inode_rdev(leaf, inode_item); - - BTRFS_I(inode)->index_cnt = (u64)-1; - BTRFS_I(inode)->flags = btrfs_inode_flags(leaf, inode_item); -cache_acl: - /* - * try to precache a NULL acl entry for files that don't have - * any xattrs or acls - */ - maybe_acls = acls_after_inode_item(leaf, path->slots[0], - btrfs_ino(inode)); - if (!maybe_acls) - cache_no_acl(inode); - - btrfs_free_path(path); - - switch (inode->i_mode & S_IFMT) { - case S_IFREG: - inode->i_mapping->a_ops = &btrfs_aops; - inode->i_mapping->backing_dev_info = &root->fs_info->bdi; - BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops; - inode->i_fop = &btrfs_file_operations; - inode->i_op = &btrfs_file_inode_operations; - break; - case S_IFDIR: - inode->i_fop = &btrfs_dir_file_operations; - if (root == root->fs_info->tree_root) - inode->i_op = &btrfs_dir_ro_inode_operations; - else - inode->i_op = &btrfs_dir_inode_operations; - break; - case S_IFLNK: - inode->i_op = &btrfs_symlink_inode_operations; - inode->i_mapping->a_ops = &btrfs_symlink_aops; - inode->i_mapping->backing_dev_info = &root->fs_info->bdi; - break; - default: - inode->i_op = &btrfs_special_inode_operations; - init_special_inode(inode, inode->i_mode, rdev); - break; - } - - btrfs_update_iflags(inode); - return; - -make_bad: - btrfs_free_path(path); - make_bad_inode(inode); -} - -/* - * given a leaf and an inode, copy the inode fields into the leaf - */ -static void fill_inode_item(struct btrfs_trans_handle *trans, - struct extent_buffer *leaf, - struct btrfs_inode_item *item, - struct inode *inode) -{ - 
btrfs_set_inode_uid(leaf, item, inode->i_uid); - btrfs_set_inode_gid(leaf, item, inode->i_gid); - btrfs_set_inode_size(leaf, item, BTRFS_I(inode)->disk_i_size); - btrfs_set_inode_mode(leaf, item, inode->i_mode); - btrfs_set_inode_nlink(leaf, item, inode->i_nlink); - - btrfs_set_timespec_sec(leaf, btrfs_inode_atime(item), - inode->i_atime.tv_sec); - btrfs_set_timespec_nsec(leaf, btrfs_inode_atime(item), - inode->i_atime.tv_nsec); - - btrfs_set_timespec_sec(leaf, btrfs_inode_mtime(item), - inode->i_mtime.tv_sec); - btrfs_set_timespec_nsec(leaf, btrfs_inode_mtime(item), - inode->i_mtime.tv_nsec); - - btrfs_set_timespec_sec(leaf, btrfs_inode_ctime(item), - inode->i_ctime.tv_sec); - btrfs_set_timespec_nsec(leaf, btrfs_inode_ctime(item), - inode->i_ctime.tv_nsec); - - btrfs_set_inode_nbytes(leaf, item, inode_get_bytes(inode)); - btrfs_set_inode_generation(leaf, item, BTRFS_I(inode)->generation); - btrfs_set_inode_sequence(leaf, item, BTRFS_I(inode)->sequence); - btrfs_set_inode_transid(leaf, item, trans->transid); - btrfs_set_inode_rdev(leaf, item, inode->i_rdev); - btrfs_set_inode_flags(leaf, item, BTRFS_I(inode)->flags); - btrfs_set_inode_block_group(leaf, item, 0); -} - -/* - * copy everything in the in-memory inode into the btree. - */ -static noinline int btrfs_update_inode_item(struct btrfs_trans_handle *trans, - struct btrfs_root *root, struct inode *inode) -{ - struct btrfs_inode_item *inode_item; - struct btrfs_path *path; - struct extent_buffer *leaf; - int ret; - - path = btrfs_alloc_path(); - if (!path) - return -ENOMEM; - - path->leave_spinning = 1; - ret = btrfs_lookup_inode(trans, root, path, &BTRFS_I(inode)->location, - 1); - if (ret) { - if (ret > 0) - ret = -ENOENT; - goto failed; - } - - btrfs_unlock_up_safe(path, 1); - leaf = path->nodes[0]; - inode_item = btrfs_item_ptr(leaf, path->slots[0], - struct btrfs_inode_item); - - fill_inode_item(trans, leaf, inode_item, inode); - btrfs_mark_buffer_dirty(leaf); - btrfs_set_inode_last_trans(trans, inode); - ret = 0; -failed: - btrfs_free_path(path); - return ret; -} - -/* - * copy everything in the in-memory inode into the btree. - */ -noinline int btrfs_update_inode(struct btrfs_trans_handle *trans, - struct btrfs_root *root, struct inode *inode) -{ - int ret; - - /* - * If the inode is a free space inode, we can deadlock during commit - * if we put it into the delayed code. - * - * The data relocation inode should also be directly updated - * without delay - */ - if (!btrfs_is_free_space_inode(root, inode) - && root->root_key.objectid != BTRFS_DATA_RELOC_TREE_OBJECTID) { - ret = btrfs_delayed_update_inode(trans, root, inode); - if (!ret) - btrfs_set_inode_last_trans(trans, inode); - return ret; - } - - return btrfs_update_inode_item(trans, root, inode); -} - -static noinline int btrfs_update_inode_fallback(struct btrfs_trans_handle *trans, - struct btrfs_root *root, struct inode *inode) -{ - int ret; - - ret = btrfs_update_inode(trans, root, inode); - if (ret == -ENOSPC) - return btrfs_update_inode_item(trans, root, inode); - return ret; -} - -/* - * unlink helper that gets used here in inode.c and in the tree logging - * recovery code. 
It remove a link in a directory with a given name, and - * also drops the back refs in the inode to the directory - */ -static int __btrfs_unlink_inode(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct inode *dir, struct inode *inode, - const char *name, int name_len) -{ - struct btrfs_path *path; - int ret = 0; - struct extent_buffer *leaf; - struct btrfs_dir_item *di; - struct btrfs_key key; - u64 index; - u64 ino = btrfs_ino(inode); - u64 dir_ino = btrfs_ino(dir); - - path = btrfs_alloc_path(); - if (!path) { - ret = -ENOMEM; - goto out; - } - - path->leave_spinning = 1; - di = btrfs_lookup_dir_item(trans, root, path, dir_ino, - name, name_len, -1); - if (IS_ERR(di)) { - ret = PTR_ERR(di); - goto err; - } - if (!di) { - ret = -ENOENT; - goto err; - } - leaf = path->nodes[0]; - btrfs_dir_item_key_to_cpu(leaf, di, &key); - ret = btrfs_delete_one_dir_name(trans, root, path, di); - if (ret) - goto err; - btrfs_release_path(path); - - ret = btrfs_del_inode_ref(trans, root, name, name_len, ino, - dir_ino, &index); - if (ret) { - printk(KERN_INFO "btrfs failed to delete reference to %.*s, " - "inode %llu parent %llu\n", name_len, name, - (unsigned long long)ino, (unsigned long long)dir_ino); - btrfs_abort_transaction(trans, root, ret); - goto err; - } - - ret = btrfs_delete_delayed_dir_index(trans, root, dir, index); - if (ret) { - btrfs_abort_transaction(trans, root, ret); - goto err; - } - - ret = btrfs_del_inode_ref_in_log(trans, root, name, name_len, - inode, dir_ino); - if (ret != 0 && ret != -ENOENT) { - btrfs_abort_transaction(trans, root, ret); - goto err; - } - - ret = btrfs_del_dir_entries_in_log(trans, root, name, name_len, - dir, index); - if (ret == -ENOENT) - ret = 0; -err: - btrfs_free_path(path); - if (ret) - goto out; - - btrfs_i_size_write(dir, dir->i_size - name_len * 2); - inode->i_ctime = dir->i_mtime = dir->i_ctime = CURRENT_TIME; - btrfs_update_inode(trans, root, dir); -out: - return ret; -} - -int btrfs_unlink_inode(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct inode *dir, struct inode *inode, - const char *name, int name_len) -{ - int ret; - ret = __btrfs_unlink_inode(trans, root, dir, inode, name, name_len); - if (!ret) { - btrfs_drop_nlink(inode); - ret = btrfs_update_inode(trans, root, inode); - } - return ret; -} - - -/* helper to check if there is any shared block in the path */ -static int check_path_shared(struct btrfs_root *root, - struct btrfs_path *path) -{ - struct extent_buffer *eb; - int level; - u64 refs = 1; - - for (level = 0; level < BTRFS_MAX_LEVEL; level++) { - int ret; - - if (!path->nodes[level]) - break; - eb = path->nodes[level]; - if (!btrfs_block_can_be_shared(root, eb)) - continue; - ret = btrfs_lookup_extent_info(NULL, root, eb->start, eb->len, - &refs, NULL); - if (refs > 1) - return 1; - } - return 0; -} - -/* - * helper to start transaction for unlink and rmdir. - * - * unlink and rmdir are special in btrfs, they do not always free space. - * so in enospc case, we should make sure they will free space before - * allowing them to use the global metadata reservation. 
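One thing worth keeping in mind about __btrfs_unlink_inode above: a single name removal has to update several structures in step — the dir item keyed by name hash, the dir index keyed by insertion order, the inode's back reference and the log — and the directory's i_size then drops by name_len * 2 because each name is counted once per directory index. A toy model of that double bookkeeping (illustrative only; nothing here is btrfs API):

/* Toy model: a directory that stores every name in two indexes, like the
 * btrfs dir-item (by name) and dir-index (by insertion order) pair. */
#include <stdio.h>
#include <string.h>

#define MAX_ENTRIES 16
#define NAME_MAX_LEN 32

struct dir {
	char by_name[MAX_ENTRIES][NAME_MAX_LEN];   /* stands in for dir items   */
	char by_index[MAX_ENTRIES][NAME_MAX_LEN];  /* stands in for dir indexes */
	int nr;
	long i_size;
};

static void dir_add(struct dir *d, const char *name)
{
	snprintf(d->by_name[d->nr], NAME_MAX_LEN, "%s", name);
	snprintf(d->by_index[d->nr], NAME_MAX_LEN, "%s", name);
	d->nr++;
	d->i_size += 2 * (long)strlen(name);       /* counted once per index */
}

static int dir_unlink(struct dir *d, const char *name)
{
	for (int i = 0; i < d->nr; i++) {
		if (strcmp(d->by_name[i], name) != 0)
			continue;
		/* both indexes must be updated together, or lookup and
		 * readdir would disagree about what exists */
		memmove(&d->by_name[i], &d->by_name[i + 1],
			(size_t)(d->nr - i - 1) * NAME_MAX_LEN);
		memmove(&d->by_index[i], &d->by_index[i + 1],
			(size_t)(d->nr - i - 1) * NAME_MAX_LEN);
		d->nr--;
		d->i_size -= 2 * (long)strlen(name);
		return 0;
	}
	return -1; /* -ENOENT in the real code */
}

int main(void)
{
	struct dir d = { .nr = 0, .i_size = 0 };

	dir_add(&d, "hello");
	dir_add(&d, "world");
	dir_unlink(&d, "hello");
	printf("entries %d, i_size %ld\n", d.nr, d.i_size); /* 1, 10 */
	return 0;
}
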
- */ -static struct btrfs_trans_handle *__unlink_start_trans(struct inode *dir, - struct dentry *dentry) -{ - struct btrfs_trans_handle *trans; - struct btrfs_root *root = BTRFS_I(dir)->root; - struct btrfs_path *path; - struct btrfs_inode_ref *ref; - struct btrfs_dir_item *di; - struct inode *inode = dentry->d_inode; - u64 index; - int check_link = 1; - int err = -ENOSPC; - int ret; - u64 ino = btrfs_ino(inode); - u64 dir_ino = btrfs_ino(dir); - - /* - * 1 for the possible orphan item - * 1 for the dir item - * 1 for the dir index - * 1 for the inode ref - * 1 for the inode ref in the tree log - * 2 for the dir entries in the log - * 1 for the inode - */ - trans = btrfs_start_transaction(root, 8); - if (!IS_ERR(trans) || PTR_ERR(trans) != -ENOSPC) - return trans; - - if (ino == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID) - return ERR_PTR(-ENOSPC); - - /* check if there is someone else holds reference */ - if (S_ISDIR(inode->i_mode) && atomic_read(&inode->i_count) > 1) - return ERR_PTR(-ENOSPC); - - if (atomic_read(&inode->i_count) > 2) - return ERR_PTR(-ENOSPC); - - if (xchg(&root->fs_info->enospc_unlink, 1)) - return ERR_PTR(-ENOSPC); - - path = btrfs_alloc_path(); - if (!path) { - root->fs_info->enospc_unlink = 0; - return ERR_PTR(-ENOMEM); - } - - /* 1 for the orphan item */ - trans = btrfs_start_transaction(root, 1); - if (IS_ERR(trans)) { - btrfs_free_path(path); - root->fs_info->enospc_unlink = 0; - return trans; - } - - path->skip_locking = 1; - path->search_commit_root = 1; - - ret = btrfs_lookup_inode(trans, root, path, - &BTRFS_I(dir)->location, 0); - if (ret < 0) { - err = ret; - goto out; - } - if (ret == 0) { - if (check_path_shared(root, path)) - goto out; - } else { - check_link = 0; - } - btrfs_release_path(path); - - ret = btrfs_lookup_inode(trans, root, path, - &BTRFS_I(inode)->location, 0); - if (ret < 0) { - err = ret; - goto out; - } - if (ret == 0) { - if (check_path_shared(root, path)) - goto out; - } else { - check_link = 0; - } - btrfs_release_path(path); - - if (ret == 0 && S_ISREG(inode->i_mode)) { - ret = btrfs_lookup_file_extent(trans, root, path, - ino, (u64)-1, 0); - if (ret < 0) { - err = ret; - goto out; - } - BUG_ON(ret == 0); /* Corruption */ - if (check_path_shared(root, path)) - goto out; - btrfs_release_path(path); - } - - if (!check_link) { - err = 0; - goto out; - } - - di = btrfs_lookup_dir_item(trans, root, path, dir_ino, - dentry->d_name.name, dentry->d_name.len, 0); - if (IS_ERR(di)) { - err = PTR_ERR(di); - goto out; - } - if (di) { - if (check_path_shared(root, path)) - goto out; - } else { - err = 0; - goto out; - } - btrfs_release_path(path); - - ref = btrfs_lookup_inode_ref(trans, root, path, - dentry->d_name.name, dentry->d_name.len, - ino, dir_ino, 0); - if (IS_ERR(ref)) { - err = PTR_ERR(ref); - goto out; - } - BUG_ON(!ref); /* Logic error */ - if (check_path_shared(root, path)) - goto out; - index = btrfs_inode_ref_index(path->nodes[0], ref); - btrfs_release_path(path); - - /* - * This is a commit root search, if we can lookup inode item and other - * relative items in the commit root, it means the transaction of - * dir/file creation has been committed, and the dir index item that we - * delay to insert has also been inserted into the commit root. So - * we needn't worry about the delayed insertion of the dir index item - * here. 
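__unlink_start_trans above first tries a normal reservation big enough for the whole unlink; only on ENOSPC does it fall back to proving, via commit-root lookups and check_path_shared, that the operation will not need to COW shared blocks, and then lets it borrow from the global reserve. The control flow boils down to a try / verify / fall-back pattern; a toy model (illustrative, all names invented):

/* Toy "reserve normally, or prove it's safe to use the emergency pool". */
#include <stdio.h>
#include <stdbool.h>

#define TOY_ENOSPC 28

struct space { long free; long global_reserve; };

static bool reserve(struct space *s, long units)
{
	if (s->free < units)
		return false;
	s->free -= units;
	return true;
}

/* "shared" stands in for check_path_shared(): if any block on the paths we
 * will touch is shared with a snapshot, the unlink may not free space. */
static int start_unlink(struct space *s, bool shared)
{
	if (reserve(s, 8))
		return 0;               /* the normal, fully reserved case */
	if (shared)
		return -TOY_ENOSPC;     /* cannot prove the unlink frees space */
	s->global_reserve -= 1;         /* safe to borrow from the global pool */
	return 0;
}

int main(void)
{
	struct space s = { .free = 2, .global_reserve = 100 };
	printf("unshared unlink -> %d\n", start_unlink(&s, false));
	printf("shared unlink   -> %d\n", start_unlink(&s, true));
	return 0;
}
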
- */ - di = btrfs_lookup_dir_index_item(trans, root, path, dir_ino, index, - dentry->d_name.name, dentry->d_name.len, 0); - if (IS_ERR(di)) { - err = PTR_ERR(di); - goto out; - } - BUG_ON(ret == -ENOENT); - if (check_path_shared(root, path)) - goto out; - - err = 0; -out: - btrfs_free_path(path); - /* Migrate the orphan reservation over */ - if (!err) - err = btrfs_block_rsv_migrate(trans->block_rsv, - &root->fs_info->global_block_rsv, - trans->bytes_reserved); - - if (err) { - btrfs_end_transaction(trans, root); - root->fs_info->enospc_unlink = 0; - return ERR_PTR(err); - } - - trans->block_rsv = &root->fs_info->global_block_rsv; - return trans; -} - -static void __unlink_end_trans(struct btrfs_trans_handle *trans, - struct btrfs_root *root) -{ - if (trans->block_rsv == &root->fs_info->global_block_rsv) { - btrfs_block_rsv_release(root, trans->block_rsv, - trans->bytes_reserved); - trans->block_rsv = &root->fs_info->trans_block_rsv; - BUG_ON(!root->fs_info->enospc_unlink); - root->fs_info->enospc_unlink = 0; - } - btrfs_end_transaction(trans, root); -} - -static int btrfs_unlink(struct inode *dir, struct dentry *dentry) -{ - struct btrfs_root *root = BTRFS_I(dir)->root; - struct btrfs_trans_handle *trans; - struct inode *inode = dentry->d_inode; - int ret; - unsigned long nr = 0; - - trans = __unlink_start_trans(dir, dentry); - if (IS_ERR(trans)) - return PTR_ERR(trans); - - btrfs_record_unlink_dir(trans, dir, dentry->d_inode, 0); - - ret = btrfs_unlink_inode(trans, root, dir, dentry->d_inode, - dentry->d_name.name, dentry->d_name.len); - if (ret) - goto out; - - if (inode->i_nlink == 0) { - ret = btrfs_orphan_add(trans, inode); - if (ret) - goto out; - } - -out: - nr = trans->blocks_used; - __unlink_end_trans(trans, root); - btrfs_btree_balance_dirty(root, nr); - return ret; -} - -int btrfs_unlink_subvol(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct inode *dir, u64 objectid, - const char *name, int name_len) -{ - struct btrfs_path *path; - struct extent_buffer *leaf; - struct btrfs_dir_item *di; - struct btrfs_key key; - u64 index; - int ret; - u64 dir_ino = btrfs_ino(dir); - - path = btrfs_alloc_path(); - if (!path) - return -ENOMEM; - - di = btrfs_lookup_dir_item(trans, root, path, dir_ino, - name, name_len, -1); - if (IS_ERR_OR_NULL(di)) { - if (!di) - ret = -ENOENT; - else - ret = PTR_ERR(di); - goto out; - } - - leaf = path->nodes[0]; - btrfs_dir_item_key_to_cpu(leaf, di, &key); - WARN_ON(key.type != BTRFS_ROOT_ITEM_KEY || key.objectid != objectid); - ret = btrfs_delete_one_dir_name(trans, root, path, di); - if (ret) { - btrfs_abort_transaction(trans, root, ret); - goto out; - } - btrfs_release_path(path); - - ret = btrfs_del_root_ref(trans, root->fs_info->tree_root, - objectid, root->root_key.objectid, - dir_ino, &index, name, name_len); - if (ret < 0) { - if (ret != -ENOENT) { - btrfs_abort_transaction(trans, root, ret); - goto out; - } - di = btrfs_search_dir_index_item(root, path, dir_ino, - name, name_len); - if (IS_ERR_OR_NULL(di)) { - if (!di) - ret = -ENOENT; - else - ret = PTR_ERR(di); - btrfs_abort_transaction(trans, root, ret); - goto out; - } - - leaf = path->nodes[0]; - btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); - btrfs_release_path(path); - index = key.offset; - } - btrfs_release_path(path); - - ret = btrfs_delete_delayed_dir_index(trans, root, dir, index); - if (ret) { - btrfs_abort_transaction(trans, root, ret); - goto out; - } - - btrfs_i_size_write(dir, dir->i_size - name_len * 2); - dir->i_mtime = dir->i_ctime = CURRENT_TIME; - ret 
= btrfs_update_inode(trans, root, dir); - if (ret) - btrfs_abort_transaction(trans, root, ret); -out: - btrfs_free_path(path); - return ret; -} - -static int btrfs_rmdir(struct inode *dir, struct dentry *dentry) -{ - struct inode *inode = dentry->d_inode; - int err = 0; - struct btrfs_root *root = BTRFS_I(dir)->root; - struct btrfs_trans_handle *trans; - unsigned long nr = 0; - - if (inode->i_size > BTRFS_EMPTY_DIR_SIZE || - btrfs_ino(inode) == BTRFS_FIRST_FREE_OBJECTID) - return -ENOTEMPTY; - - trans = __unlink_start_trans(dir, dentry); - if (IS_ERR(trans)) - return PTR_ERR(trans); - - if (unlikely(btrfs_ino(inode) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)) { - err = btrfs_unlink_subvol(trans, root, dir, - BTRFS_I(inode)->location.objectid, - dentry->d_name.name, - dentry->d_name.len); - goto out; - } - - err = btrfs_orphan_add(trans, inode); - if (err) - goto out; - - /* now the directory is empty */ - err = btrfs_unlink_inode(trans, root, dir, dentry->d_inode, - dentry->d_name.name, dentry->d_name.len); - if (!err) - btrfs_i_size_write(inode, 0); -out: - nr = trans->blocks_used; - __unlink_end_trans(trans, root); - btrfs_btree_balance_dirty(root, nr); - - return err; -} - -/* - * this can truncate away extent items, csum items and directory items. - * It starts at a high offset and removes keys until it can't find - * any higher than new_size - * - * csum items that cross the new i_size are truncated to the new size - * as well. - * - * min_type is the minimum key type to truncate down to. If set to 0, this - * will kill all the items on this inode, including the INODE_ITEM_KEY. - */ -int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct inode *inode, - u64 new_size, u32 min_type) -{ - struct btrfs_path *path; - struct extent_buffer *leaf; - struct btrfs_file_extent_item *fi; - struct btrfs_key key; - struct btrfs_key found_key; - u64 extent_start = 0; - u64 extent_num_bytes = 0; - u64 extent_offset = 0; - u64 item_end = 0; - u64 mask = root->sectorsize - 1; - u32 found_type = (u8)-1; - int found_extent; - int del_item; - int pending_del_nr = 0; - int pending_del_slot = 0; - int extent_type = -1; - int ret; - int err = 0; - u64 ino = btrfs_ino(inode); - - BUG_ON(new_size > 0 && min_type != BTRFS_EXTENT_DATA_KEY); - - path = btrfs_alloc_path(); - if (!path) - return -ENOMEM; - path->reada = -1; - - if (root->ref_cows || root == root->fs_info->tree_root) - btrfs_drop_extent_cache(inode, new_size & (~mask), (u64)-1, 0); - - /* - * This function is also used to drop the items in the log tree before - * we relog the inode, so if root != BTRFS_I(inode)->root, it means - * it is used to drop the loged items. So we shouldn't kill the delayed - * items. 
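btrfs_truncate_inode_items walks the inode's items from the highest key downward, deleting everything wholly beyond new_size and shrinking the extent that straddles the cut, with the kept length rounded up to a full sector. A compact userspace sketch of that walk over a sorted (offset, len) list (illustrative only):

/* Toy truncate: drop extents past new_size, shrink the straddler. */
#include <stdio.h>
#include <stdint.h>

struct extent { uint64_t offset; uint64_t len; };

#define SECTORSIZE 4096ULL

static int truncate_extents(struct extent *ext, int nr, uint64_t new_size)
{
	/* walk from the last (highest offset) extent downward */
	for (int i = nr - 1; i >= 0; i--) {
		if (ext[i].offset >= new_size) {
			nr = i;                 /* entirely past the cut: delete */
			continue;
		}
		if (ext[i].offset + ext[i].len > new_size) {
			/* straddles new_size: keep it, but shrink to the cut,
			 * rounded up to a full sector like the real code does */
			uint64_t keep = new_size - ext[i].offset;
			ext[i].len = (keep + SECTORSIZE - 1) & ~(SECTORSIZE - 1);
		}
		break;                          /* everything below is untouched */
	}
	return nr;                              /* new extent count */
}

int main(void)
{
	struct extent ext[] = {
		{ 0,      65536 },
		{ 65536,  65536 },
		{ 131072, 65536 },
	};
	int nr = truncate_extents(ext, 3, 70000);
	for (int i = 0; i < nr; i++)
		printf("extent %d: offset %llu len %llu\n", i,
		       (unsigned long long)ext[i].offset,
		       (unsigned long long)ext[i].len);
	return 0;
}
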
- */ - if (min_type == 0 && root == BTRFS_I(inode)->root) - btrfs_kill_delayed_inode_items(inode); - - key.objectid = ino; - key.offset = (u64)-1; - key.type = (u8)-1; - -search_again: - path->leave_spinning = 1; - ret = btrfs_search_slot(trans, root, &key, path, -1, 1); - if (ret < 0) { - err = ret; - goto out; - } - - if (ret > 0) { - /* there are no items in the tree for us to truncate, we're - * done - */ - if (path->slots[0] == 0) - goto out; - path->slots[0]--; - } - - while (1) { - fi = NULL; - leaf = path->nodes[0]; - btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); - found_type = btrfs_key_type(&found_key); - - if (found_key.objectid != ino) - break; - - if (found_type < min_type) - break; - - item_end = found_key.offset; - if (found_type == BTRFS_EXTENT_DATA_KEY) { - fi = btrfs_item_ptr(leaf, path->slots[0], - struct btrfs_file_extent_item); - extent_type = btrfs_file_extent_type(leaf, fi); - if (extent_type != BTRFS_FILE_EXTENT_INLINE) { - item_end += - btrfs_file_extent_num_bytes(leaf, fi); - } else if (extent_type == BTRFS_FILE_EXTENT_INLINE) { - item_end += btrfs_file_extent_inline_len(leaf, - fi); - } - item_end--; - } - if (found_type > min_type) { - del_item = 1; - } else { - if (item_end < new_size) - break; - if (found_key.offset >= new_size) - del_item = 1; - else - del_item = 0; - } - found_extent = 0; - /* FIXME, shrink the extent if the ref count is only 1 */ - if (found_type != BTRFS_EXTENT_DATA_KEY) - goto delete; - - if (extent_type != BTRFS_FILE_EXTENT_INLINE) { - u64 num_dec; - extent_start = btrfs_file_extent_disk_bytenr(leaf, fi); - if (!del_item) { - u64 orig_num_bytes = - btrfs_file_extent_num_bytes(leaf, fi); - extent_num_bytes = new_size - - found_key.offset + root->sectorsize - 1; - extent_num_bytes = extent_num_bytes & - ~((u64)root->sectorsize - 1); - btrfs_set_file_extent_num_bytes(leaf, fi, - extent_num_bytes); - num_dec = (orig_num_bytes - - extent_num_bytes); - if (root->ref_cows && extent_start != 0) - inode_sub_bytes(inode, num_dec); - btrfs_mark_buffer_dirty(leaf); - } else { - extent_num_bytes = - btrfs_file_extent_disk_num_bytes(leaf, - fi); - extent_offset = found_key.offset - - btrfs_file_extent_offset(leaf, fi); - - /* FIXME blocksize != 4096 */ - num_dec = btrfs_file_extent_num_bytes(leaf, fi); - if (extent_start != 0) { - found_extent = 1; - if (root->ref_cows) - inode_sub_bytes(inode, num_dec); - } - } - } else if (extent_type == BTRFS_FILE_EXTENT_INLINE) { - /* - * we can't truncate inline items that have had - * special encodings - */ - if (!del_item && - btrfs_file_extent_compression(leaf, fi) == 0 && - btrfs_file_extent_encryption(leaf, fi) == 0 && - btrfs_file_extent_other_encoding(leaf, fi) == 0) { - u32 size = new_size - found_key.offset; - - if (root->ref_cows) { - inode_sub_bytes(inode, item_end + 1 - - new_size); - } - size = - btrfs_file_extent_calc_inline_size(size); - btrfs_truncate_item(trans, root, path, - size, 1); - } else if (root->ref_cows) { - inode_sub_bytes(inode, item_end + 1 - - found_key.offset); - } - } -delete: - if (del_item) { - if (!pending_del_nr) { - /* no pending yet, add ourselves */ - pending_del_slot = path->slots[0]; - pending_del_nr = 1; - } else if (pending_del_nr && - path->slots[0] + 1 == pending_del_slot) { - /* hop on the pending chunk */ - pending_del_nr++; - pending_del_slot = path->slots[0]; - } else { - BUG(); - } - } else { - break; - } - if (found_extent && (root->ref_cows || - root == root->fs_info->tree_root)) { - btrfs_set_path_blocking(path); - ret = btrfs_free_extent(trans, 
root, extent_start, - extent_num_bytes, 0, - btrfs_header_owner(leaf), - ino, extent_offset, 0); - BUG_ON(ret); - } - - if (found_type == BTRFS_INODE_ITEM_KEY) - break; - - if (path->slots[0] == 0 || - path->slots[0] != pending_del_slot) { - if (root->ref_cows && - BTRFS_I(inode)->location.objectid != - BTRFS_FREE_INO_OBJECTID) { - err = -EAGAIN; - goto out; - } - if (pending_del_nr) { - ret = btrfs_del_items(trans, root, path, - pending_del_slot, - pending_del_nr); - if (ret) { - btrfs_abort_transaction(trans, - root, ret); - goto error; - } - pending_del_nr = 0; - } - btrfs_release_path(path); - goto search_again; - } else { - path->slots[0]--; - } - } -out: - if (pending_del_nr) { - ret = btrfs_del_items(trans, root, path, pending_del_slot, - pending_del_nr); - if (ret) - btrfs_abort_transaction(trans, root, ret); - } -error: - btrfs_free_path(path); - return err; -} - -/* - * taken from block_truncate_page, but does cow as it zeros out - * any bytes left in the last page in the file. - */ -static int btrfs_truncate_page(struct address_space *mapping, loff_t from) -{ - struct inode *inode = mapping->host; - struct btrfs_root *root = BTRFS_I(inode)->root; - struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; - struct btrfs_ordered_extent *ordered; - struct extent_state *cached_state = NULL; - char *kaddr; - u32 blocksize = root->sectorsize; - pgoff_t index = from >> PAGE_CACHE_SHIFT; - unsigned offset = from & (PAGE_CACHE_SIZE-1); - struct page *page; - gfp_t mask = btrfs_alloc_write_mask(mapping); - int ret = 0; - u64 page_start; - u64 page_end; - - if ((offset & (blocksize - 1)) == 0) - goto out; - ret = btrfs_delalloc_reserve_space(inode, PAGE_CACHE_SIZE); - if (ret) - goto out; - - ret = -ENOMEM; -again: - page = find_or_create_page(mapping, index, mask); - if (!page) { - btrfs_delalloc_release_space(inode, PAGE_CACHE_SIZE); - goto out; - } - - page_start = page_offset(page); - page_end = page_start + PAGE_CACHE_SIZE - 1; - - if (!PageUptodate(page)) { - ret = btrfs_readpage(NULL, page); - lock_page(page); - if (page->mapping != mapping) { - unlock_page(page); - page_cache_release(page); - goto again; - } - if (!PageUptodate(page)) { - ret = -EIO; - goto out_unlock; - } - } - wait_on_page_writeback(page); - - lock_extent_bits(io_tree, page_start, page_end, 0, &cached_state); - set_page_extent_mapped(page); - - ordered = btrfs_lookup_ordered_extent(inode, page_start); - if (ordered) { - unlock_extent_cached(io_tree, page_start, page_end, - &cached_state, GFP_NOFS); - unlock_page(page); - page_cache_release(page); - btrfs_start_ordered_extent(inode, ordered, 1); - btrfs_put_ordered_extent(ordered); - goto again; - } - - clear_extent_bit(&BTRFS_I(inode)->io_tree, page_start, page_end, - EXTENT_DIRTY | EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING, - 0, 0, &cached_state, GFP_NOFS); - - ret = btrfs_set_extent_delalloc(inode, page_start, page_end, - &cached_state); - if (ret) { - unlock_extent_cached(io_tree, page_start, page_end, - &cached_state, GFP_NOFS); - goto out_unlock; - } - - ret = 0; - if (offset != PAGE_CACHE_SIZE) { - kaddr = kmap(page); - memset(kaddr + offset, 0, PAGE_CACHE_SIZE - offset); - flush_dcache_page(page); - kunmap(page); - } - ClearPageChecked(page); - set_page_dirty(page); - unlock_extent_cached(io_tree, page_start, page_end, &cached_state, - GFP_NOFS); - -out_unlock: - if (ret) - btrfs_delalloc_release_space(inode, PAGE_CACHE_SIZE); - unlock_page(page); - page_cache_release(page); -out: - return ret; -} - -/* - * This function puts in dummy file extents for the 
area we're creating a hole - * for. So if we are truncating this file to a larger size we need to insert - * these file extents so that btrfs_get_extent will return a EXTENT_MAP_HOLE for - * the range between oldsize and size - */ -int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size) -{ - struct btrfs_trans_handle *trans; - struct btrfs_root *root = BTRFS_I(inode)->root; - struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; - struct extent_map *em = NULL; - struct extent_state *cached_state = NULL; - u64 mask = root->sectorsize - 1; - u64 hole_start = (oldsize + mask) & ~mask; - u64 block_end = (size + mask) & ~mask; - u64 last_byte; - u64 cur_offset; - u64 hole_size; - int err = 0; - - if (size <= hole_start) - return 0; - - while (1) { - struct btrfs_ordered_extent *ordered; - btrfs_wait_ordered_range(inode, hole_start, - block_end - hole_start); - lock_extent_bits(io_tree, hole_start, block_end - 1, 0, - &cached_state); - ordered = btrfs_lookup_ordered_extent(inode, hole_start); - if (!ordered) - break; - unlock_extent_cached(io_tree, hole_start, block_end - 1, - &cached_state, GFP_NOFS); - btrfs_put_ordered_extent(ordered); - } - - cur_offset = hole_start; - while (1) { - em = btrfs_get_extent(inode, NULL, 0, cur_offset, - block_end - cur_offset, 0); - if (IS_ERR(em)) { - err = PTR_ERR(em); - break; - } - last_byte = min(extent_map_end(em), block_end); - last_byte = (last_byte + mask) & ~mask; - if (!test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) { - u64 hint_byte = 0; - hole_size = last_byte - cur_offset; - - trans = btrfs_start_transaction(root, 3); - if (IS_ERR(trans)) { - err = PTR_ERR(trans); - break; - } - - err = btrfs_drop_extents(trans, inode, cur_offset, - cur_offset + hole_size, - &hint_byte, 1); - if (err) { - btrfs_abort_transaction(trans, root, err); - btrfs_end_transaction(trans, root); - break; - } - - err = btrfs_insert_file_extent(trans, root, - btrfs_ino(inode), cur_offset, 0, - 0, hole_size, 0, hole_size, - 0, 0, 0); - if (err) { - btrfs_abort_transaction(trans, root, err); - btrfs_end_transaction(trans, root); - break; - } - - btrfs_drop_extent_cache(inode, hole_start, - last_byte - 1, 0); - - btrfs_update_inode(trans, root, inode); - btrfs_end_transaction(trans, root); - } - free_extent_map(em); - em = NULL; - cur_offset = last_byte; - if (cur_offset >= block_end) - break; - } - - free_extent_map(em); - unlock_extent_cached(io_tree, hole_start, block_end - 1, &cached_state, - GFP_NOFS); - return err; -} - -static int btrfs_setsize(struct inode *inode, loff_t newsize) -{ - struct btrfs_root *root = BTRFS_I(inode)->root; - struct btrfs_trans_handle *trans; - loff_t oldsize = i_size_read(inode); - int ret; - - if (newsize == oldsize) - return 0; - - if (newsize > oldsize) { - truncate_pagecache(inode, oldsize, newsize); - ret = btrfs_cont_expand(inode, oldsize, newsize); - if (ret) - return ret; - - trans = btrfs_start_transaction(root, 1); - if (IS_ERR(trans)) - return PTR_ERR(trans); - - i_size_write(inode, newsize); - btrfs_ordered_update_i_size(inode, i_size_read(inode), NULL); - ret = btrfs_update_inode(trans, root, inode); - btrfs_end_transaction(trans, root); - } else { - - /* - * We're truncating a file that used to have good data down to - * zero. Make sure it gets into the ordered flush list so that - * any new writes get down to disk quickly. 
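btrfs_cont_expand above works on sector-aligned boundaries: the hole starts at oldsize rounded up to the sector size and ends at the new size rounded up the same way, using the usual (x + mask) & ~mask trick with mask = sectorsize - 1. A few lines of standalone C showing what that arithmetic yields (illustrative):

/* The (x + mask) & ~mask round-up used for hole_start and block_end. */
#include <stdio.h>
#include <stdint.h>

static uint64_t round_up_to(uint64_t x, uint64_t sectorsize)
{
	uint64_t mask = sectorsize - 1;      /* sectorsize must be a power of two */
	return (x + mask) & ~mask;
}

int main(void)
{
	uint64_t sectorsize = 4096;
	uint64_t oldsize = 10000, newsize = 20000;

	uint64_t hole_start = round_up_to(oldsize, sectorsize); /* 12288 */
	uint64_t block_end  = round_up_to(newsize, sectorsize); /* 20480 */

	printf("hole extents cover [%llu, %llu)\n",
	       (unsigned long long)hole_start,
	       (unsigned long long)block_end);
	/* bytes between oldsize and hole_start sit inside the old final
	 * sector and are not covered by these hole extents */
	return 0;
}
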
- */ - if (newsize == 0) - BTRFS_I(inode)->ordered_data_close = 1; - - /* we don't support swapfiles, so vmtruncate shouldn't fail */ - truncate_setsize(inode, newsize); - ret = btrfs_truncate(inode); - } - - return ret; -} - -static int btrfs_setattr(struct dentry *dentry, struct iattr *attr) -{ - struct inode *inode = dentry->d_inode; - struct btrfs_root *root = BTRFS_I(inode)->root; - int err; - - if (btrfs_root_readonly(root)) - return -EROFS; - - err = inode_change_ok(inode, attr); - if (err) - return err; - - if (S_ISREG(inode->i_mode) && (attr->ia_valid & ATTR_SIZE)) { - err = btrfs_setsize(inode, attr->ia_size); - if (err) - return err; - } - - if (attr->ia_valid) { - setattr_copy(inode, attr); - err = btrfs_dirty_inode(inode); - - if (!err && attr->ia_valid & ATTR_MODE) - err = btrfs_acl_chmod(inode); - } - - return err; -} - -void btrfs_evict_inode(struct inode *inode) -{ - struct btrfs_trans_handle *trans; - struct btrfs_root *root = BTRFS_I(inode)->root; - struct btrfs_block_rsv *rsv, *global_rsv; - u64 min_size = btrfs_calc_trunc_metadata_size(root, 1); - unsigned long nr; - int ret; - - trace_btrfs_inode_evict(inode); - - truncate_inode_pages(&inode->i_data, 0); - if (inode->i_nlink && (btrfs_root_refs(&root->root_item) != 0 || - btrfs_is_free_space_inode(root, inode))) - goto no_delete; - - if (is_bad_inode(inode)) { - btrfs_orphan_del(NULL, inode); - goto no_delete; - } - /* do we really want it for ->i_nlink > 0 and zero btrfs_root_refs? */ - btrfs_wait_ordered_range(inode, 0, (u64)-1); - - if (root->fs_info->log_root_recovering) { - BUG_ON(!list_empty(&BTRFS_I(inode)->i_orphan)); - goto no_delete; - } - - if (inode->i_nlink > 0) { - BUG_ON(btrfs_root_refs(&root->root_item) != 0); - goto no_delete; - } - - rsv = btrfs_alloc_block_rsv(root); - if (!rsv) { - btrfs_orphan_del(NULL, inode); - goto no_delete; - } - rsv->size = min_size; - global_rsv = &root->fs_info->global_block_rsv; - - btrfs_i_size_write(inode, 0); - - /* - * This is a bit simpler than btrfs_truncate since - * - * 1) We've already reserved our space for our orphan item in the - * unlink. - * 2) We're going to delete the inode item, so we don't need to update - * it at all. - * - * So we just need to reserve some slack space in case we add bytes when - * doing the truncate. - */ - while (1) { - ret = btrfs_block_rsv_refill_noflush(root, rsv, min_size); - - /* - * Try and steal from the global reserve since we will - * likely not use this space anyway, we want to try as - * hard as possible to get this to work. 
- */ - if (ret) - ret = btrfs_block_rsv_migrate(global_rsv, rsv, min_size); - - if (ret) { - printk(KERN_WARNING "Could not get space for a " - "delete, will truncate on mount %d\n", ret); - btrfs_orphan_del(NULL, inode); - btrfs_free_block_rsv(root, rsv); - goto no_delete; - } - - trans = btrfs_start_transaction(root, 0); - if (IS_ERR(trans)) { - btrfs_orphan_del(NULL, inode); - btrfs_free_block_rsv(root, rsv); - goto no_delete; - } - - trans->block_rsv = rsv; - - ret = btrfs_truncate_inode_items(trans, root, inode, 0, 0); - if (ret != -EAGAIN) - break; - - nr = trans->blocks_used; - btrfs_end_transaction(trans, root); - trans = NULL; - btrfs_btree_balance_dirty(root, nr); - } - - btrfs_free_block_rsv(root, rsv); - - if (ret == 0) { - trans->block_rsv = root->orphan_block_rsv; - ret = btrfs_orphan_del(trans, inode); - BUG_ON(ret); - } - - trans->block_rsv = &root->fs_info->trans_block_rsv; - if (!(root == root->fs_info->tree_root || - root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID)) - btrfs_return_ino(root, btrfs_ino(inode)); - - nr = trans->blocks_used; - btrfs_end_transaction(trans, root); - btrfs_btree_balance_dirty(root, nr); -no_delete: - end_writeback(inode); - return; -} - -/* - * this returns the key found in the dir entry in the location pointer. - * If no dir entries were found, location->objectid is 0. - */ -static int btrfs_inode_by_name(struct inode *dir, struct dentry *dentry, - struct btrfs_key *location) -{ - const char *name = dentry->d_name.name; - int namelen = dentry->d_name.len; - struct btrfs_dir_item *di; - struct btrfs_path *path; - struct btrfs_root *root = BTRFS_I(dir)->root; - int ret = 0; - - path = btrfs_alloc_path(); - if (!path) - return -ENOMEM; - - di = btrfs_lookup_dir_item(NULL, root, path, btrfs_ino(dir), name, - namelen, 0); - if (IS_ERR(di)) - ret = PTR_ERR(di); - - if (IS_ERR_OR_NULL(di)) - goto out_err; - - btrfs_dir_item_key_to_cpu(path->nodes[0], di, location); -out: - btrfs_free_path(path); - return ret; -out_err: - location->objectid = 0; - goto out; -} - -/* - * when we hit a tree root in a directory, the btrfs part of the inode - * needs to be changed to reflect the root directory of the tree root. This - * is kind of like crossing a mount point. 
- */ -static int fixup_tree_root_location(struct btrfs_root *root, - struct inode *dir, - struct dentry *dentry, - struct btrfs_key *location, - struct btrfs_root **sub_root) -{ - struct btrfs_path *path; - struct btrfs_root *new_root; - struct btrfs_root_ref *ref; - struct extent_buffer *leaf; - int ret; - int err = 0; - - path = btrfs_alloc_path(); - if (!path) { - err = -ENOMEM; - goto out; - } - - err = -ENOENT; - ret = btrfs_find_root_ref(root->fs_info->tree_root, path, - BTRFS_I(dir)->root->root_key.objectid, - location->objectid); - if (ret) { - if (ret < 0) - err = ret; - goto out; - } - - leaf = path->nodes[0]; - ref = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_root_ref); - if (btrfs_root_ref_dirid(leaf, ref) != btrfs_ino(dir) || - btrfs_root_ref_name_len(leaf, ref) != dentry->d_name.len) - goto out; - - ret = memcmp_extent_buffer(leaf, dentry->d_name.name, - (unsigned long)(ref + 1), - dentry->d_name.len); - if (ret) - goto out; - - btrfs_release_path(path); - - new_root = btrfs_read_fs_root_no_name(root->fs_info, location); - if (IS_ERR(new_root)) { - err = PTR_ERR(new_root); - goto out; - } - - if (btrfs_root_refs(&new_root->root_item) == 0) { - err = -ENOENT; - goto out; - } - - *sub_root = new_root; - location->objectid = btrfs_root_dirid(&new_root->root_item); - location->type = BTRFS_INODE_ITEM_KEY; - location->offset = 0; - err = 0; -out: - btrfs_free_path(path); - return err; -} - -static void inode_tree_add(struct inode *inode) -{ - struct btrfs_root *root = BTRFS_I(inode)->root; - struct btrfs_inode *entry; - struct rb_node **p; - struct rb_node *parent; - u64 ino = btrfs_ino(inode); -again: - p = &root->inode_tree.rb_node; - parent = NULL; - - if (inode_unhashed(inode)) - return; - - spin_lock(&root->inode_lock); - while (*p) { - parent = *p; - entry = rb_entry(parent, struct btrfs_inode, rb_node); - - if (ino < btrfs_ino(&entry->vfs_inode)) - p = &parent->rb_left; - else if (ino > btrfs_ino(&entry->vfs_inode)) - p = &parent->rb_right; - else { - WARN_ON(!(entry->vfs_inode.i_state & - (I_WILL_FREE | I_FREEING))); - rb_erase(parent, &root->inode_tree); - RB_CLEAR_NODE(parent); - spin_unlock(&root->inode_lock); - goto again; - } - } - rb_link_node(&BTRFS_I(inode)->rb_node, parent, p); - rb_insert_color(&BTRFS_I(inode)->rb_node, &root->inode_tree); - spin_unlock(&root->inode_lock); -} - -static void inode_tree_del(struct inode *inode) -{ - struct btrfs_root *root = BTRFS_I(inode)->root; - int empty = 0; - - spin_lock(&root->inode_lock); - if (!RB_EMPTY_NODE(&BTRFS_I(inode)->rb_node)) { - rb_erase(&BTRFS_I(inode)->rb_node, &root->inode_tree); - RB_CLEAR_NODE(&BTRFS_I(inode)->rb_node); - empty = RB_EMPTY_ROOT(&root->inode_tree); - } - spin_unlock(&root->inode_lock); - - /* - * Free space cache has inodes in the tree root, but the tree root has a - * root_refs of 0, so this could end up dropping the tree root as a - * snapshot, so we need the extra !root->fs_info->tree_root check to - * make sure we don't drop it. 
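inode_tree_add above keeps every in-memory inode of a root in a tree ordered by inode number, which is what lets btrfs_invalidate_inodes walk them in order and resume from the objectid just past the last one it handled. A minimal sketch of that keying with an unbalanced binary search tree (the kernel uses an rbtree; this standalone toy only shows the ordering and the "first ino >= x" resume step):

/* Minimal ino-keyed search tree: insert and "first ino >= x" lookup. */
#include <stdio.h>
#include <stdint.h>
#include <stdlib.h>

struct node {
	uint64_t ino;
	struct node *left, *right;
};

static struct node *insert(struct node *root, uint64_t ino)
{
	if (!root) {
		struct node *n = calloc(1, sizeof(*n));
		n->ino = ino;
		return n;
	}
	if (ino < root->ino)
		root->left = insert(root->left, ino);
	else if (ino > root->ino)
		root->right = insert(root->right, ino);
	return root;                    /* duplicates are ignored */
}

/* find the smallest ino >= objectid, the way the invalidate walk resumes */
static struct node *lower_bound(struct node *root, uint64_t objectid)
{
	struct node *best = NULL;
	while (root) {
		if (root->ino >= objectid) {
			best = root;
			root = root->left;
		} else {
			root = root->right;
		}
	}
	return best;
}

int main(void)
{
	struct node *root = NULL;
	uint64_t inos[] = { 260, 256, 258, 300 };
	for (int i = 0; i < 4; i++)
		root = insert(root, inos[i]);

	/* walks 256, 258, 260, 300 in order; nodes are leaked for brevity */
	for (struct node *n = lower_bound(root, 0); n;
	     n = lower_bound(root, n->ino + 1))
		printf("ino %llu\n", (unsigned long long)n->ino);
	return 0;
}
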
- */ - if (empty && btrfs_root_refs(&root->root_item) == 0 && - root != root->fs_info->tree_root) { - synchronize_srcu(&root->fs_info->subvol_srcu); - spin_lock(&root->inode_lock); - empty = RB_EMPTY_ROOT(&root->inode_tree); - spin_unlock(&root->inode_lock); - if (empty) - btrfs_add_dead_root(root); - } -} - -void btrfs_invalidate_inodes(struct btrfs_root *root) -{ - struct rb_node *node; - struct rb_node *prev; - struct btrfs_inode *entry; - struct inode *inode; - u64 objectid = 0; - - WARN_ON(btrfs_root_refs(&root->root_item) != 0); - - spin_lock(&root->inode_lock); -again: - node = root->inode_tree.rb_node; - prev = NULL; - while (node) { - prev = node; - entry = rb_entry(node, struct btrfs_inode, rb_node); - - if (objectid < btrfs_ino(&entry->vfs_inode)) - node = node->rb_left; - else if (objectid > btrfs_ino(&entry->vfs_inode)) - node = node->rb_right; - else - break; - } - if (!node) { - while (prev) { - entry = rb_entry(prev, struct btrfs_inode, rb_node); - if (objectid <= btrfs_ino(&entry->vfs_inode)) { - node = prev; - break; - } - prev = rb_next(prev); - } - } - while (node) { - entry = rb_entry(node, struct btrfs_inode, rb_node); - objectid = btrfs_ino(&entry->vfs_inode) + 1; - inode = igrab(&entry->vfs_inode); - if (inode) { - spin_unlock(&root->inode_lock); - if (atomic_read(&inode->i_count) > 1) - d_prune_aliases(inode); - /* - * btrfs_drop_inode will have it removed from - * the inode cache when its usage count - * hits zero. - */ - iput(inode); - cond_resched(); - spin_lock(&root->inode_lock); - goto again; - } - - if (cond_resched_lock(&root->inode_lock)) - goto again; - - node = rb_next(node); - } - spin_unlock(&root->inode_lock); -} - -static int btrfs_init_locked_inode(struct inode *inode, void *p) -{ - struct btrfs_iget_args *args = p; - inode->i_ino = args->ino; - BTRFS_I(inode)->root = args->root; - btrfs_set_inode_space_info(args->root, inode); - return 0; -} - -static int btrfs_find_actor(struct inode *inode, void *opaque) -{ - struct btrfs_iget_args *args = opaque; - return args->ino == btrfs_ino(inode) && - args->root == BTRFS_I(inode)->root; -} - -static struct inode *btrfs_iget_locked(struct super_block *s, - u64 objectid, - struct btrfs_root *root) -{ - struct inode *inode; - struct btrfs_iget_args args; - args.ino = objectid; - args.root = root; - - inode = iget5_locked(s, objectid, btrfs_find_actor, - btrfs_init_locked_inode, - (void *)&args); - return inode; -} - -/* Get an inode object given its location and corresponding root. 
- * Returns in *is_new if the inode was read from disk - */ -struct inode *btrfs_iget(struct super_block *s, struct btrfs_key *location, - struct btrfs_root *root, int *new) -{ - struct inode *inode; - - inode = btrfs_iget_locked(s, location->objectid, root); - if (!inode) - return ERR_PTR(-ENOMEM); - - if (inode->i_state & I_NEW) { - BTRFS_I(inode)->root = root; - memcpy(&BTRFS_I(inode)->location, location, sizeof(*location)); - btrfs_read_locked_inode(inode); - if (!is_bad_inode(inode)) { - inode_tree_add(inode); - unlock_new_inode(inode); - if (new) - *new = 1; - } else { - unlock_new_inode(inode); - iput(inode); - inode = ERR_PTR(-ESTALE); - } - } - - return inode; -} - -static struct inode *new_simple_dir(struct super_block *s, - struct btrfs_key *key, - struct btrfs_root *root) -{ - struct inode *inode = new_inode(s); - - if (!inode) - return ERR_PTR(-ENOMEM); - - BTRFS_I(inode)->root = root; - memcpy(&BTRFS_I(inode)->location, key, sizeof(*key)); - BTRFS_I(inode)->dummy_inode = 1; - - inode->i_ino = BTRFS_EMPTY_SUBVOL_DIR_OBJECTID; - inode->i_op = &btrfs_dir_ro_inode_operations; - inode->i_fop = &simple_dir_operations; - inode->i_mode = S_IFDIR | S_IRUGO | S_IWUSR | S_IXUGO; - inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME; - - return inode; -} - -struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry) -{ - struct inode *inode; - struct btrfs_root *root = BTRFS_I(dir)->root; - struct btrfs_root *sub_root = root; - struct btrfs_key location; - int index; - int ret = 0; - - if (dentry->d_name.len > BTRFS_NAME_LEN) - return ERR_PTR(-ENAMETOOLONG); - - if (unlikely(d_need_lookup(dentry))) { - memcpy(&location, dentry->d_fsdata, sizeof(struct btrfs_key)); - kfree(dentry->d_fsdata); - dentry->d_fsdata = NULL; - /* This thing is hashed, drop it for now */ - d_drop(dentry); - } else { - ret = btrfs_inode_by_name(dir, dentry, &location); - } - - if (ret < 0) - return ERR_PTR(ret); - - if (location.objectid == 0) - return NULL; - - if (location.type == BTRFS_INODE_ITEM_KEY) { - inode = btrfs_iget(dir->i_sb, &location, root, NULL); - return inode; - } - - BUG_ON(location.type != BTRFS_ROOT_ITEM_KEY); - - index = srcu_read_lock(&root->fs_info->subvol_srcu); - ret = fixup_tree_root_location(root, dir, dentry, - &location, &sub_root); - if (ret < 0) { - if (ret != -ENOENT) - inode = ERR_PTR(ret); - else - inode = new_simple_dir(dir->i_sb, &location, sub_root); - } else { - inode = btrfs_iget(dir->i_sb, &location, sub_root, NULL); - } - srcu_read_unlock(&root->fs_info->subvol_srcu, index); - - if (!IS_ERR(inode) && root != sub_root) { - down_read(&root->fs_info->cleanup_work_sem); - if (!(inode->i_sb->s_flags & MS_RDONLY)) - ret = btrfs_orphan_cleanup(sub_root); - up_read(&root->fs_info->cleanup_work_sem); - if (ret) - inode = ERR_PTR(ret); - } - - return inode; -} - -static int btrfs_dentry_delete(const struct dentry *dentry) -{ - struct btrfs_root *root; - struct inode *inode = dentry->d_inode; - - if (!inode && !IS_ROOT(dentry)) - inode = dentry->d_parent->d_inode; - - if (inode) { - root = BTRFS_I(inode)->root; - if (btrfs_root_refs(&root->root_item) == 0) - return 1; - - if (btrfs_ino(inode) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID) - return 1; - } - return 0; -} - -static void btrfs_dentry_release(struct dentry *dentry) -{ - if (dentry->d_fsdata) - kfree(dentry->d_fsdata); -} - -static struct dentry *btrfs_lookup(struct inode *dir, struct dentry *dentry, - struct nameidata *nd) -{ - struct dentry *ret; - - ret = d_splice_alias(btrfs_lookup_dentry(dir, dentry), 
dentry); - if (unlikely(d_need_lookup(dentry))) { - spin_lock(&dentry->d_lock); - dentry->d_flags &= ~DCACHE_NEED_LOOKUP; - spin_unlock(&dentry->d_lock); - } - return ret; -} - -unsigned char btrfs_filetype_table[] = { - DT_UNKNOWN, DT_REG, DT_DIR, DT_CHR, DT_BLK, DT_FIFO, DT_SOCK, DT_LNK -}; - -static int btrfs_real_readdir(struct file *filp, void *dirent, - filldir_t filldir) -{ - struct inode *inode = filp->f_dentry->d_inode; - struct btrfs_root *root = BTRFS_I(inode)->root; - struct btrfs_item *item; - struct btrfs_dir_item *di; - struct btrfs_key key; - struct btrfs_key found_key; - struct btrfs_path *path; - struct list_head ins_list; - struct list_head del_list; - int ret; - struct extent_buffer *leaf; - int slot; - unsigned char d_type; - int over = 0; - u32 di_cur; - u32 di_total; - u32 di_len; - int key_type = BTRFS_DIR_INDEX_KEY; - char tmp_name[32]; - char *name_ptr; - int name_len; - int is_curr = 0; /* filp->f_pos points to the current index? */ - - /* FIXME, use a real flag for deciding about the key type */ - if (root->fs_info->tree_root == root) - key_type = BTRFS_DIR_ITEM_KEY; - - /* special case for "." */ - if (filp->f_pos == 0) { - over = filldir(dirent, ".", 1, - filp->f_pos, btrfs_ino(inode), DT_DIR); - if (over) - return 0; - filp->f_pos = 1; - } - /* special case for .., just use the back ref */ - if (filp->f_pos == 1) { - u64 pino = parent_ino(filp->f_path.dentry); - over = filldir(dirent, "..", 2, - filp->f_pos, pino, DT_DIR); - if (over) - return 0; - filp->f_pos = 2; - } - path = btrfs_alloc_path(); - if (!path) - return -ENOMEM; - - path->reada = 1; - - if (key_type == BTRFS_DIR_INDEX_KEY) { - INIT_LIST_HEAD(&ins_list); - INIT_LIST_HEAD(&del_list); - btrfs_get_delayed_items(inode, &ins_list, &del_list); - } - - btrfs_set_key_type(&key, key_type); - key.offset = filp->f_pos; - key.objectid = btrfs_ino(inode); - - ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); - if (ret < 0) - goto err; - - while (1) { - leaf = path->nodes[0]; - slot = path->slots[0]; - if (slot >= btrfs_header_nritems(leaf)) { - ret = btrfs_next_leaf(root, path); - if (ret < 0) - goto err; - else if (ret > 0) - break; - continue; - } - - item = btrfs_item_nr(leaf, slot); - btrfs_item_key_to_cpu(leaf, &found_key, slot); - - if (found_key.objectid != key.objectid) - break; - if (btrfs_key_type(&found_key) != key_type) - break; - if (found_key.offset < filp->f_pos) - goto next; - if (key_type == BTRFS_DIR_INDEX_KEY && - btrfs_should_delete_dir_index(&del_list, - found_key.offset)) - goto next; - - filp->f_pos = found_key.offset; - is_curr = 1; - - di = btrfs_item_ptr(leaf, slot, struct btrfs_dir_item); - di_cur = 0; - di_total = btrfs_item_size(leaf, item); - - while (di_cur < di_total) { - struct btrfs_key location; - - if (verify_dir_item(root, leaf, di)) - break; - - name_len = btrfs_dir_name_len(leaf, di); - if (name_len <= sizeof(tmp_name)) { - name_ptr = tmp_name; - } else { - name_ptr = kmalloc(name_len, GFP_NOFS); - if (!name_ptr) { - ret = -ENOMEM; - goto err; - } - } - read_extent_buffer(leaf, name_ptr, - (unsigned long)(di + 1), name_len); - - d_type = btrfs_filetype_table[btrfs_dir_type(leaf, di)]; - btrfs_dir_item_key_to_cpu(leaf, di, &location); - - - /* is this a reference to our own snapshot? If so - * skip it. - * - * In contrast to old kernels, we insert the snapshot's - * dir item and dir index after it has been created, so - * we won't find a reference to our own snapshot. We - * still keep the following code for backward - * compatibility. 
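btrfs_real_readdir above is driven entirely by f_pos: positions 0 and 1 synthesize "." and "..", and everything else comes from DIR_INDEX keys at or beyond f_pos, so successive getdents calls resume exactly where the previous one stopped. A small, portable userspace illustration of position-driven directory iteration using telldir/seekdir (not btrfs-specific):

/* Resume a directory walk from a saved position with telldir/seekdir. */
#include <stdio.h>
#include <dirent.h>

int main(int argc, char **argv)
{
	const char *path = argc > 1 ? argv[1] : ".";
	DIR *dir = opendir(path);
	struct dirent *de;
	long pos;
	int count = 0;

	if (!dir) {
		perror("opendir");
		return 1;
	}

	/* read the first two entries, then remember the position */
	while (count < 2 && (de = readdir(dir)) != NULL) {
		printf("first pass: %s\n", de->d_name);
		count++;
	}
	pos = telldir(dir);

	/* drain the rest of the directory */
	while ((de = readdir(dir)) != NULL)
		;

	/* jump back to the saved position and continue from there, the way a
	 * later getdents call resumes from the stored f_pos */
	seekdir(dir, pos);
	while ((de = readdir(dir)) != NULL)
		printf("resumed:    %s\n", de->d_name);

	closedir(dir);
	return 0;
}
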
- */ - if (location.type == BTRFS_ROOT_ITEM_KEY && - location.objectid == root->root_key.objectid) { - over = 0; - goto skip; - } - over = filldir(dirent, name_ptr, name_len, - found_key.offset, location.objectid, - d_type); - -skip: - if (name_ptr != tmp_name) - kfree(name_ptr); - - if (over) - goto nopos; - di_len = btrfs_dir_name_len(leaf, di) + - btrfs_dir_data_len(leaf, di) + sizeof(*di); - di_cur += di_len; - di = (struct btrfs_dir_item *)((char *)di + di_len); - } -next: - path->slots[0]++; - } - - if (key_type == BTRFS_DIR_INDEX_KEY) { - if (is_curr) - filp->f_pos++; - ret = btrfs_readdir_delayed_dir_index(filp, dirent, filldir, - &ins_list); - if (ret) - goto nopos; - } - - /* Reached end of directory/root. Bump pos past the last item. */ - if (key_type == BTRFS_DIR_INDEX_KEY) - /* - * 32-bit glibc will use getdents64, but then strtol - - * so the last number we can serve is this. - */ - filp->f_pos = 0x7fffffff; - else - filp->f_pos++; -nopos: - ret = 0; -err: - if (key_type == BTRFS_DIR_INDEX_KEY) - btrfs_put_delayed_items(&ins_list, &del_list); - btrfs_free_path(path); - return ret; -} - -int btrfs_write_inode(struct inode *inode, struct writeback_control *wbc) -{ - struct btrfs_root *root = BTRFS_I(inode)->root; - struct btrfs_trans_handle *trans; - int ret = 0; - bool nolock = false; - - if (BTRFS_I(inode)->dummy_inode) - return 0; - - if (btrfs_fs_closing(root->fs_info) && btrfs_is_free_space_inode(root, inode)) - nolock = true; - - if (wbc->sync_mode == WB_SYNC_ALL) { - if (nolock) - trans = btrfs_join_transaction_nolock(root); - else - trans = btrfs_join_transaction(root); - if (IS_ERR(trans)) - return PTR_ERR(trans); - if (nolock) - ret = btrfs_end_transaction_nolock(trans, root); - else - ret = btrfs_commit_transaction(trans, root); - } - return ret; -} - -/* - * This is somewhat expensive, updating the tree every time the - * inode changes. But, it is most likely to find the inode in cache. - * FIXME, needs more benchmarking...there are no reasons other than performance - * to keep or drop this code. - */ -int btrfs_dirty_inode(struct inode *inode) -{ - struct btrfs_root *root = BTRFS_I(inode)->root; - struct btrfs_trans_handle *trans; - int ret; - - if (BTRFS_I(inode)->dummy_inode) - return 0; - - trans = btrfs_join_transaction(root); - if (IS_ERR(trans)) - return PTR_ERR(trans); - - ret = btrfs_update_inode(trans, root, inode); - if (ret && ret == -ENOSPC) { - /* whoops, lets try again with the full transaction */ - btrfs_end_transaction(trans, root); - trans = btrfs_start_transaction(root, 1); - if (IS_ERR(trans)) - return PTR_ERR(trans); - - ret = btrfs_update_inode(trans, root, inode); - } - btrfs_end_transaction(trans, root); - if (BTRFS_I(inode)->delayed_node) - btrfs_balance_delayed_items(root); - - return ret; -} - -/* - * This is a copy of file_update_time. We need this so we can return error on - * ENOSPC for updating the inode in the case of file write and mmap writes. - */ -int btrfs_update_time(struct file *file) -{ - struct inode *inode = file->f_path.dentry->d_inode; - struct timespec now; - int ret; - enum { S_MTIME = 1, S_CTIME = 2, S_VERSION = 4 } sync_it = 0; - - /* First try to exhaust all avenues to not sync */ - if (IS_NOCMTIME(inode)) - return 0; - - now = current_fs_time(inode->i_sb); - if (!timespec_equal(&inode->i_mtime, &now)) - sync_it = S_MTIME; - - if (!timespec_equal(&inode->i_ctime, &now)) - sync_it |= S_CTIME; - - if (IS_I_VERSION(inode)) - sync_it |= S_VERSION; - - if (!sync_it) - return 0; - - /* Finally allowed to write? Takes lock. 
*/ - if (mnt_want_write_file(file)) - return 0; - - /* Only change inode inside the lock region */ - if (sync_it & S_VERSION) - inode_inc_iversion(inode); - if (sync_it & S_CTIME) - inode->i_ctime = now; - if (sync_it & S_MTIME) - inode->i_mtime = now; - ret = btrfs_dirty_inode(inode); - if (!ret) - mark_inode_dirty_sync(inode); - mnt_drop_write(file->f_path.mnt); - return ret; -} - -/* - * find the highest existing sequence number in a directory - * and then set the in-memory index_cnt variable to reflect - * free sequence numbers - */ -static int btrfs_set_inode_index_count(struct inode *inode) -{ - struct btrfs_root *root = BTRFS_I(inode)->root; - struct btrfs_key key, found_key; - struct btrfs_path *path; - struct extent_buffer *leaf; - int ret; - - key.objectid = btrfs_ino(inode); - btrfs_set_key_type(&key, BTRFS_DIR_INDEX_KEY); - key.offset = (u64)-1; - - path = btrfs_alloc_path(); - if (!path) - return -ENOMEM; - - ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); - if (ret < 0) - goto out; - /* FIXME: we should be able to handle this */ - if (ret == 0) - goto out; - ret = 0; - - /* - * MAGIC NUMBER EXPLANATION: - * since we search a directory based on f_pos we have to start at 2 - * since '.' and '..' have f_pos of 0 and 1 respectively, so everybody - * else has to start at 2 - */ - if (path->slots[0] == 0) { - BTRFS_I(inode)->index_cnt = 2; - goto out; - } - - path->slots[0]--; - - leaf = path->nodes[0]; - btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); - - if (found_key.objectid != btrfs_ino(inode) || - btrfs_key_type(&found_key) != BTRFS_DIR_INDEX_KEY) { - BTRFS_I(inode)->index_cnt = 2; - goto out; - } - - BTRFS_I(inode)->index_cnt = found_key.offset + 1; -out: - btrfs_free_path(path); - return ret; -} - -/* - * helper to find a free sequence number in a given directory. This current - * code is very simple, later versions will do smarter things in the btree - */ -int btrfs_set_inode_index(struct inode *dir, u64 *index) -{ - int ret = 0; - - if (BTRFS_I(dir)->index_cnt == (u64)-1) { - ret = btrfs_inode_delayed_dir_index_count(dir); - if (ret) { - ret = btrfs_set_inode_index_count(dir); - if (ret) - return ret; - } - } - - *index = BTRFS_I(dir)->index_cnt; - BTRFS_I(dir)->index_cnt++; - - return ret; -} - -static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct inode *dir, - const char *name, int name_len, - u64 ref_objectid, u64 objectid, - umode_t mode, u64 *index) -{ - struct inode *inode; - struct btrfs_inode_item *inode_item; - struct btrfs_key *location; - struct btrfs_path *path; - struct btrfs_inode_ref *ref; - struct btrfs_key key[2]; - u32 sizes[2]; - unsigned long ptr; - int ret; - int owner; - - path = btrfs_alloc_path(); - if (!path) - return ERR_PTR(-ENOMEM); - - inode = new_inode(root->fs_info->sb); - if (!inode) { - btrfs_free_path(path); - return ERR_PTR(-ENOMEM); - } - - /* - * we have to initialize this early, so we can reclaim the inode - * number if we fail afterwards in this function. 
- */ - inode->i_ino = objectid; - - if (dir) { - trace_btrfs_inode_request(dir); - - ret = btrfs_set_inode_index(dir, index); - if (ret) { - btrfs_free_path(path); - iput(inode); - return ERR_PTR(ret); - } - } - /* - * index_cnt is ignored for everything but a dir, - * btrfs_get_inode_index_count has an explanation for the magic - * number - */ - BTRFS_I(inode)->index_cnt = 2; - BTRFS_I(inode)->root = root; - BTRFS_I(inode)->generation = trans->transid; - inode->i_generation = BTRFS_I(inode)->generation; - btrfs_set_inode_space_info(root, inode); - - if (S_ISDIR(mode)) - owner = 0; - else - owner = 1; - - key[0].objectid = objectid; - btrfs_set_key_type(&key[0], BTRFS_INODE_ITEM_KEY); - key[0].offset = 0; - - key[1].objectid = objectid; - btrfs_set_key_type(&key[1], BTRFS_INODE_REF_KEY); - key[1].offset = ref_objectid; - - sizes[0] = sizeof(struct btrfs_inode_item); - sizes[1] = name_len + sizeof(*ref); - - path->leave_spinning = 1; - ret = btrfs_insert_empty_items(trans, root, path, key, sizes, 2); - if (ret != 0) - goto fail; - - inode_init_owner(inode, dir, mode); - inode_set_bytes(inode, 0); - inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME; - inode_item = btrfs_item_ptr(path->nodes[0], path->slots[0], - struct btrfs_inode_item); - fill_inode_item(trans, path->nodes[0], inode_item, inode); - - ref = btrfs_item_ptr(path->nodes[0], path->slots[0] + 1, - struct btrfs_inode_ref); - btrfs_set_inode_ref_name_len(path->nodes[0], ref, name_len); - btrfs_set_inode_ref_index(path->nodes[0], ref, *index); - ptr = (unsigned long)(ref + 1); - write_extent_buffer(path->nodes[0], name, ptr, name_len); - - btrfs_mark_buffer_dirty(path->nodes[0]); - btrfs_free_path(path); - - location = &BTRFS_I(inode)->location; - location->objectid = objectid; - location->offset = 0; - btrfs_set_key_type(location, BTRFS_INODE_ITEM_KEY); - - btrfs_inherit_iflags(inode, dir); - - if (S_ISREG(mode)) { - if (btrfs_test_opt(root, NODATASUM)) - BTRFS_I(inode)->flags |= BTRFS_INODE_NODATASUM; - if (btrfs_test_opt(root, NODATACOW) || - (BTRFS_I(dir)->flags & BTRFS_INODE_NODATACOW)) - BTRFS_I(inode)->flags |= BTRFS_INODE_NODATACOW; - } - - insert_inode_hash(inode); - inode_tree_add(inode); - - trace_btrfs_inode_new(inode); - btrfs_set_inode_last_trans(trans, inode); - - return inode; -fail: - if (dir) - BTRFS_I(dir)->index_cnt--; - btrfs_free_path(path); - iput(inode); - return ERR_PTR(ret); -} - -static inline u8 btrfs_inode_type(struct inode *inode) -{ - return btrfs_type_by_mode[(inode->i_mode & S_IFMT) >> S_SHIFT]; -} - -/* - * utility function to add 'inode' into 'parent_inode' with - * a give name and a given sequence number. - * if 'add_backref' is true, also insert a backref from the - * inode to the parent directory. 
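As a rough orientation for the function below: one directory link is recorded as up to three tree items, a DIR_ITEM keyed by the name hash, a DIR_INDEX keyed by the readdir sequence number, and an INODE_REF back-reference on the inode. A minimal sketch of how those keys line up (not part of the original file; the helper name sketch_link_keys is hypothetical, btrfs_name_hash() is the hash helper from hash.h):

static void sketch_link_keys(u64 parent_ino, u64 ino, u64 index,
			     const char *name, int name_len,
			     struct btrfs_key *dir_item,
			     struct btrfs_key *dir_index,
			     struct btrfs_key *inode_ref)
{
	dir_item->objectid = parent_ino;		/* the directory */
	btrfs_set_key_type(dir_item, BTRFS_DIR_ITEM_KEY);
	dir_item->offset = btrfs_name_hash(name, name_len);

	dir_index->objectid = parent_ino;
	btrfs_set_key_type(dir_index, BTRFS_DIR_INDEX_KEY);
	dir_index->offset = index;			/* readdir order */

	inode_ref->objectid = ino;			/* the linked inode */
	btrfs_set_key_type(inode_ref, BTRFS_INODE_REF_KEY);
	inode_ref->offset = parent_ino;			/* back reference */
}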
- */ -int btrfs_add_link(struct btrfs_trans_handle *trans, - struct inode *parent_inode, struct inode *inode, - const char *name, int name_len, int add_backref, u64 index) -{ - int ret = 0; - struct btrfs_key key; - struct btrfs_root *root = BTRFS_I(parent_inode)->root; - u64 ino = btrfs_ino(inode); - u64 parent_ino = btrfs_ino(parent_inode); - - if (unlikely(ino == BTRFS_FIRST_FREE_OBJECTID)) { - memcpy(&key, &BTRFS_I(inode)->root->root_key, sizeof(key)); - } else { - key.objectid = ino; - btrfs_set_key_type(&key, BTRFS_INODE_ITEM_KEY); - key.offset = 0; - } - - if (unlikely(ino == BTRFS_FIRST_FREE_OBJECTID)) { - ret = btrfs_add_root_ref(trans, root->fs_info->tree_root, - key.objectid, root->root_key.objectid, - parent_ino, index, name, name_len); - } else if (add_backref) { - ret = btrfs_insert_inode_ref(trans, root, name, name_len, ino, - parent_ino, index); - } - - /* Nothing to clean up yet */ - if (ret) - return ret; - - ret = btrfs_insert_dir_item(trans, root, name, name_len, - parent_inode, &key, - btrfs_inode_type(inode), index); - if (ret == -EEXIST) - goto fail_dir_item; - else if (ret) { - btrfs_abort_transaction(trans, root, ret); - return ret; - } - - btrfs_i_size_write(parent_inode, parent_inode->i_size + - name_len * 2); - parent_inode->i_mtime = parent_inode->i_ctime = CURRENT_TIME; - ret = btrfs_update_inode(trans, root, parent_inode); - if (ret) - btrfs_abort_transaction(trans, root, ret); - return ret; - -fail_dir_item: - if (unlikely(ino == BTRFS_FIRST_FREE_OBJECTID)) { - u64 local_index; - int err; - err = btrfs_del_root_ref(trans, root->fs_info->tree_root, - key.objectid, root->root_key.objectid, - parent_ino, &local_index, name, name_len); - - } else if (add_backref) { - u64 local_index; - int err; - - err = btrfs_del_inode_ref(trans, root, name, name_len, - ino, parent_ino, &local_index); - } - return ret; -} - -static int btrfs_add_nondir(struct btrfs_trans_handle *trans, - struct inode *dir, struct dentry *dentry, - struct inode *inode, int backref, u64 index) -{ - int err = btrfs_add_link(trans, dir, inode, - dentry->d_name.name, dentry->d_name.len, - backref, index); - if (err > 0) - err = -EEXIST; - return err; -} - -static int btrfs_mknod(struct inode *dir, struct dentry *dentry, - umode_t mode, dev_t rdev) -{ - struct btrfs_trans_handle *trans; - struct btrfs_root *root = BTRFS_I(dir)->root; - struct inode *inode = NULL; - int err; - int drop_inode = 0; - u64 objectid; - unsigned long nr = 0; - u64 index = 0; - - if (!new_valid_dev(rdev)) - return -EINVAL; - - /* - * 2 for inode item and ref - * 2 for dir items - * 1 for xattr if selinux is on - */ - trans = btrfs_start_transaction(root, 5); - if (IS_ERR(trans)) - return PTR_ERR(trans); - - err = btrfs_find_free_ino(root, &objectid); - if (err) - goto out_unlock; - - inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name, - dentry->d_name.len, btrfs_ino(dir), objectid, - mode, &index); - if (IS_ERR(inode)) { - err = PTR_ERR(inode); - goto out_unlock; - } - - err = btrfs_init_inode_security(trans, inode, dir, &dentry->d_name); - if (err) { - drop_inode = 1; - goto out_unlock; - } - - /* - * If the active LSM wants to access the inode during - * d_instantiate it needs these. Smack checks to see - * if the filesystem supports xattrs by looking at the - * ops vector. 
- */ - - inode->i_op = &btrfs_special_inode_operations; - err = btrfs_add_nondir(trans, dir, dentry, inode, 0, index); - if (err) - drop_inode = 1; - else { - init_special_inode(inode, inode->i_mode, rdev); - btrfs_update_inode(trans, root, inode); - d_instantiate(dentry, inode); - } -out_unlock: - nr = trans->blocks_used; - btrfs_end_transaction(trans, root); - btrfs_btree_balance_dirty(root, nr); - if (drop_inode) { - inode_dec_link_count(inode); - iput(inode); - } - return err; -} - -static int btrfs_create(struct inode *dir, struct dentry *dentry, - umode_t mode, struct nameidata *nd) -{ - struct btrfs_trans_handle *trans; - struct btrfs_root *root = BTRFS_I(dir)->root; - struct inode *inode = NULL; - int drop_inode = 0; - int err; - unsigned long nr = 0; - u64 objectid; - u64 index = 0; - - /* - * 2 for inode item and ref - * 2 for dir items - * 1 for xattr if selinux is on - */ - trans = btrfs_start_transaction(root, 5); - if (IS_ERR(trans)) - return PTR_ERR(trans); - - err = btrfs_find_free_ino(root, &objectid); - if (err) - goto out_unlock; - - inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name, - dentry->d_name.len, btrfs_ino(dir), objectid, - mode, &index); - if (IS_ERR(inode)) { - err = PTR_ERR(inode); - goto out_unlock; - } - - err = btrfs_init_inode_security(trans, inode, dir, &dentry->d_name); - if (err) { - drop_inode = 1; - goto out_unlock; - } - - /* - * If the active LSM wants to access the inode during - * d_instantiate it needs these. Smack checks to see - * if the filesystem supports xattrs by looking at the - * ops vector. - */ - inode->i_fop = &btrfs_file_operations; - inode->i_op = &btrfs_file_inode_operations; - - err = btrfs_add_nondir(trans, dir, dentry, inode, 0, index); - if (err) - drop_inode = 1; - else { - inode->i_mapping->a_ops = &btrfs_aops; - inode->i_mapping->backing_dev_info = &root->fs_info->bdi; - BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops; - d_instantiate(dentry, inode); - } -out_unlock: - nr = trans->blocks_used; - btrfs_end_transaction(trans, root); - if (drop_inode) { - inode_dec_link_count(inode); - iput(inode); - } - btrfs_btree_balance_dirty(root, nr); - return err; -} - -static int btrfs_link(struct dentry *old_dentry, struct inode *dir, - struct dentry *dentry) -{ - struct btrfs_trans_handle *trans; - struct btrfs_root *root = BTRFS_I(dir)->root; - struct inode *inode = old_dentry->d_inode; - u64 index; - unsigned long nr = 0; - int err; - int drop_inode = 0; - - /* do not allow sys_link's with other subvols of the same device */ - if (root->objectid != BTRFS_I(inode)->root->objectid) - return -EXDEV; - - if (inode->i_nlink == ~0U) - return -EMLINK; - - err = btrfs_set_inode_index(dir, &index); - if (err) - goto fail; - - /* - * 2 items for inode and inode ref - * 2 items for dir items - * 1 item for parent inode - */ - trans = btrfs_start_transaction(root, 5); - if (IS_ERR(trans)) { - err = PTR_ERR(trans); - goto fail; - } - - btrfs_inc_nlink(inode); - inode->i_ctime = CURRENT_TIME; - ihold(inode); - - err = btrfs_add_nondir(trans, dir, dentry, inode, 1, index); - - if (err) { - drop_inode = 1; - } else { - struct dentry *parent = dentry->d_parent; - err = btrfs_update_inode(trans, root, inode); - if (err) - goto fail; - d_instantiate(dentry, inode); - btrfs_log_new_name(trans, inode, NULL, parent); - } - - nr = trans->blocks_used; - btrfs_end_transaction(trans, root); -fail: - if (drop_inode) { - inode_dec_link_count(inode); - iput(inode); - } - btrfs_btree_balance_dirty(root, nr); - return err; -} - -static int 
btrfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) -{ - struct inode *inode = NULL; - struct btrfs_trans_handle *trans; - struct btrfs_root *root = BTRFS_I(dir)->root; - int err = 0; - int drop_on_err = 0; - u64 objectid = 0; - u64 index = 0; - unsigned long nr = 1; - - /* - * 2 items for inode and ref - * 2 items for dir items - * 1 for xattr if selinux is on - */ - trans = btrfs_start_transaction(root, 5); - if (IS_ERR(trans)) - return PTR_ERR(trans); - - err = btrfs_find_free_ino(root, &objectid); - if (err) - goto out_fail; - - inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name, - dentry->d_name.len, btrfs_ino(dir), objectid, - S_IFDIR | mode, &index); - if (IS_ERR(inode)) { - err = PTR_ERR(inode); - goto out_fail; - } - - drop_on_err = 1; - - err = btrfs_init_inode_security(trans, inode, dir, &dentry->d_name); - if (err) - goto out_fail; - - inode->i_op = &btrfs_dir_inode_operations; - inode->i_fop = &btrfs_dir_file_operations; - - btrfs_i_size_write(inode, 0); - err = btrfs_update_inode(trans, root, inode); - if (err) - goto out_fail; - - err = btrfs_add_link(trans, dir, inode, dentry->d_name.name, - dentry->d_name.len, 0, index); - if (err) - goto out_fail; - - d_instantiate(dentry, inode); - drop_on_err = 0; - -out_fail: - nr = trans->blocks_used; - btrfs_end_transaction(trans, root); - if (drop_on_err) - iput(inode); - btrfs_btree_balance_dirty(root, nr); - return err; -} - -/* helper for btfs_get_extent. Given an existing extent in the tree, - * and an extent that you want to insert, deal with overlap and insert - * the new extent into the tree. - */ -static int merge_extent_mapping(struct extent_map_tree *em_tree, - struct extent_map *existing, - struct extent_map *em, - u64 map_start, u64 map_len) -{ - u64 start_diff; - - BUG_ON(map_start < em->start || map_start >= extent_map_end(em)); - start_diff = map_start - em->start; - em->start = map_start; - em->len = map_len; - if (em->block_start < EXTENT_MAP_LAST_BYTE && - !test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) { - em->block_start += start_diff; - em->block_len -= start_diff; - } - return add_extent_mapping(em_tree, em); -} - -static noinline int uncompress_inline(struct btrfs_path *path, - struct inode *inode, struct page *page, - size_t pg_offset, u64 extent_offset, - struct btrfs_file_extent_item *item) -{ - int ret; - struct extent_buffer *leaf = path->nodes[0]; - char *tmp; - size_t max_size; - unsigned long inline_size; - unsigned long ptr; - int compress_type; - - WARN_ON(pg_offset != 0); - compress_type = btrfs_file_extent_compression(leaf, item); - max_size = btrfs_file_extent_ram_bytes(leaf, item); - inline_size = btrfs_file_extent_inline_item_len(leaf, - btrfs_item_nr(leaf, path->slots[0])); - tmp = kmalloc(inline_size, GFP_NOFS); - if (!tmp) - return -ENOMEM; - ptr = btrfs_file_extent_inline_start(item); - - read_extent_buffer(leaf, tmp, ptr, inline_size); - - max_size = min_t(unsigned long, PAGE_CACHE_SIZE, max_size); - ret = btrfs_decompress(compress_type, tmp, page, - extent_offset, inline_size, max_size); - if (ret) { - char *kaddr = kmap_atomic(page); - unsigned long copy_size = min_t(u64, - PAGE_CACHE_SIZE - pg_offset, - max_size - extent_offset); - memset(kaddr + pg_offset, 0, copy_size); - kunmap_atomic(kaddr); - } - kfree(tmp); - return 0; -} - -/* - * a bit scary, this does extent mapping from logical file offset to the disk. - * the ugly parts come from merging extents from the disk with the in-ram - * representation. 
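A minimal usage sketch of the lookup defined below (not part of the original file; the wrapper name sketch_lookup_mapping is hypothetical): the returned extent_map covers [em->start, em->start + em->len) and em->block_start is either a disk byte number or one of the special values such as EXTENT_MAP_HOLE or EXTENT_MAP_INLINE.

static int sketch_lookup_mapping(struct inode *inode, u64 start, u64 len)
{
	struct extent_map *em;

	/* no page, no offset, create == 0: read-only mapping lookup */
	em = btrfs_get_extent(inode, NULL, 0, start, len, 0);
	if (IS_ERR(em))
		return PTR_ERR(em);

	if (em->block_start == EXTENT_MAP_HOLE)
		printk(KERN_INFO "btrfs: hole at %llu\n",
		       (unsigned long long)em->start);

	free_extent_map(em);
	return 0;
}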
This gets more complex because of the data=ordered code, - * where the in-ram extents might be locked pending data=ordered completion. - * - * This also copies inline extents directly into the page. - */ - -struct extent_map *btrfs_get_extent(struct inode *inode, struct page *page, - size_t pg_offset, u64 start, u64 len, - int create) -{ - int ret; - int err = 0; - u64 bytenr; - u64 extent_start = 0; - u64 extent_end = 0; - u64 objectid = btrfs_ino(inode); - u32 found_type; - struct btrfs_path *path = NULL; - struct btrfs_root *root = BTRFS_I(inode)->root; - struct btrfs_file_extent_item *item; - struct extent_buffer *leaf; - struct btrfs_key found_key; - struct extent_map *em = NULL; - struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree; - struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; - struct btrfs_trans_handle *trans = NULL; - int compress_type; - -again: - read_lock(&em_tree->lock); - em = lookup_extent_mapping(em_tree, start, len); - if (em) - em->bdev = root->fs_info->fs_devices->latest_bdev; - read_unlock(&em_tree->lock); - - if (em) { - if (em->start > start || em->start + em->len <= start) - free_extent_map(em); - else if (em->block_start == EXTENT_MAP_INLINE && page) - free_extent_map(em); - else - goto out; - } - em = alloc_extent_map(); - if (!em) { - err = -ENOMEM; - goto out; - } - em->bdev = root->fs_info->fs_devices->latest_bdev; - em->start = EXTENT_MAP_HOLE; - em->orig_start = EXTENT_MAP_HOLE; - em->len = (u64)-1; - em->block_len = (u64)-1; - - if (!path) { - path = btrfs_alloc_path(); - if (!path) { - err = -ENOMEM; - goto out; - } - /* - * Chances are we'll be called again, so go ahead and do - * readahead - */ - path->reada = 1; - } - - ret = btrfs_lookup_file_extent(trans, root, path, - objectid, start, trans != NULL); - if (ret < 0) { - err = ret; - goto out; - } - - if (ret != 0) { - if (path->slots[0] == 0) - goto not_found; - path->slots[0]--; - } - - leaf = path->nodes[0]; - item = btrfs_item_ptr(leaf, path->slots[0], - struct btrfs_file_extent_item); - /* are we inside the extent that was found? 
*/ - btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); - found_type = btrfs_key_type(&found_key); - if (found_key.objectid != objectid || - found_type != BTRFS_EXTENT_DATA_KEY) { - goto not_found; - } - - found_type = btrfs_file_extent_type(leaf, item); - extent_start = found_key.offset; - compress_type = btrfs_file_extent_compression(leaf, item); - if (found_type == BTRFS_FILE_EXTENT_REG || - found_type == BTRFS_FILE_EXTENT_PREALLOC) { - extent_end = extent_start + - btrfs_file_extent_num_bytes(leaf, item); - } else if (found_type == BTRFS_FILE_EXTENT_INLINE) { - size_t size; - size = btrfs_file_extent_inline_len(leaf, item); - extent_end = (extent_start + size + root->sectorsize - 1) & - ~((u64)root->sectorsize - 1); - } - - if (start >= extent_end) { - path->slots[0]++; - if (path->slots[0] >= btrfs_header_nritems(leaf)) { - ret = btrfs_next_leaf(root, path); - if (ret < 0) { - err = ret; - goto out; - } - if (ret > 0) - goto not_found; - leaf = path->nodes[0]; - } - btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); - if (found_key.objectid != objectid || - found_key.type != BTRFS_EXTENT_DATA_KEY) - goto not_found; - if (start + len <= found_key.offset) - goto not_found; - em->start = start; - em->len = found_key.offset - start; - goto not_found_em; - } - - if (found_type == BTRFS_FILE_EXTENT_REG || - found_type == BTRFS_FILE_EXTENT_PREALLOC) { - em->start = extent_start; - em->len = extent_end - extent_start; - em->orig_start = extent_start - - btrfs_file_extent_offset(leaf, item); - bytenr = btrfs_file_extent_disk_bytenr(leaf, item); - if (bytenr == 0) { - em->block_start = EXTENT_MAP_HOLE; - goto insert; - } - if (compress_type != BTRFS_COMPRESS_NONE) { - set_bit(EXTENT_FLAG_COMPRESSED, &em->flags); - em->compress_type = compress_type; - em->block_start = bytenr; - em->block_len = btrfs_file_extent_disk_num_bytes(leaf, - item); - } else { - bytenr += btrfs_file_extent_offset(leaf, item); - em->block_start = bytenr; - em->block_len = em->len; - if (found_type == BTRFS_FILE_EXTENT_PREALLOC) - set_bit(EXTENT_FLAG_PREALLOC, &em->flags); - } - goto insert; - } else if (found_type == BTRFS_FILE_EXTENT_INLINE) { - unsigned long ptr; - char *map; - size_t size; - size_t extent_offset; - size_t copy_size; - - em->block_start = EXTENT_MAP_INLINE; - if (!page || create) { - em->start = extent_start; - em->len = extent_end - extent_start; - goto out; - } - - size = btrfs_file_extent_inline_len(leaf, item); - extent_offset = page_offset(page) + pg_offset - extent_start; - copy_size = min_t(u64, PAGE_CACHE_SIZE - pg_offset, - size - extent_offset); - em->start = extent_start + extent_offset; - em->len = (copy_size + root->sectorsize - 1) & - ~((u64)root->sectorsize - 1); - em->orig_start = EXTENT_MAP_INLINE; - if (compress_type) { - set_bit(EXTENT_FLAG_COMPRESSED, &em->flags); - em->compress_type = compress_type; - } - ptr = btrfs_file_extent_inline_start(item) + extent_offset; - if (create == 0 && !PageUptodate(page)) { - if (btrfs_file_extent_compression(leaf, item) != - BTRFS_COMPRESS_NONE) { - ret = uncompress_inline(path, inode, page, - pg_offset, - extent_offset, item); - BUG_ON(ret); /* -ENOMEM */ - } else { - map = kmap(page); - read_extent_buffer(leaf, map + pg_offset, ptr, - copy_size); - if (pg_offset + copy_size < PAGE_CACHE_SIZE) { - memset(map + pg_offset + copy_size, 0, - PAGE_CACHE_SIZE - pg_offset - - copy_size); - } - kunmap(page); - } - flush_dcache_page(page); - } else if (create && PageUptodate(page)) { - BUG(); - if (!trans) { - kunmap(page); - 
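The mask arithmetic used for inline extents above is a plain round-up to the sector size; a minimal sketch (helper name hypothetical, not part of the original file), e.g. a 100 byte inline extent on a 4096 byte sector yields a 4096 byte em->len:

static inline u64 sketch_round_up_to_sector(u64 bytes, u64 sectorsize)
{
	/* sectorsize is a power of two, so mask off the remainder */
	return (bytes + sectorsize - 1) & ~(sectorsize - 1);
}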
free_extent_map(em); - em = NULL; - - btrfs_release_path(path); - trans = btrfs_join_transaction(root); - - if (IS_ERR(trans)) - return ERR_CAST(trans); - goto again; - } - map = kmap(page); - write_extent_buffer(leaf, map + pg_offset, ptr, - copy_size); - kunmap(page); - btrfs_mark_buffer_dirty(leaf); - } - set_extent_uptodate(io_tree, em->start, - extent_map_end(em) - 1, NULL, GFP_NOFS); - goto insert; - } else { - printk(KERN_ERR "btrfs unknown found_type %d\n", found_type); - WARN_ON(1); - } -not_found: - em->start = start; - em->len = len; -not_found_em: - em->block_start = EXTENT_MAP_HOLE; - set_bit(EXTENT_FLAG_VACANCY, &em->flags); -insert: - btrfs_release_path(path); - if (em->start > start || extent_map_end(em) <= start) { - printk(KERN_ERR "Btrfs: bad extent! em: [%llu %llu] passed " - "[%llu %llu]\n", (unsigned long long)em->start, - (unsigned long long)em->len, - (unsigned long long)start, - (unsigned long long)len); - err = -EIO; - goto out; - } - - err = 0; - write_lock(&em_tree->lock); - ret = add_extent_mapping(em_tree, em); - /* it is possible that someone inserted the extent into the tree - * while we had the lock dropped. It is also possible that - * an overlapping map exists in the tree - */ - if (ret == -EEXIST) { - struct extent_map *existing; - - ret = 0; - - existing = lookup_extent_mapping(em_tree, start, len); - if (existing && (existing->start > start || - existing->start + existing->len <= start)) { - free_extent_map(existing); - existing = NULL; - } - if (!existing) { - existing = lookup_extent_mapping(em_tree, em->start, - em->len); - if (existing) { - err = merge_extent_mapping(em_tree, existing, - em, start, - root->sectorsize); - free_extent_map(existing); - if (err) { - free_extent_map(em); - em = NULL; - } - } else { - err = -EIO; - free_extent_map(em); - em = NULL; - } - } else { - free_extent_map(em); - em = existing; - err = 0; - } - } - write_unlock(&em_tree->lock); -out: - - trace_btrfs_get_extent(root, em); - - if (path) - btrfs_free_path(path); - if (trans) { - ret = btrfs_end_transaction(trans, root); - if (!err) - err = ret; - } - if (err) { - free_extent_map(em); - return ERR_PTR(err); - } - BUG_ON(!em); /* Error is always set */ - return em; -} - -struct extent_map *btrfs_get_extent_fiemap(struct inode *inode, struct page *page, - size_t pg_offset, u64 start, u64 len, - int create) -{ - struct extent_map *em; - struct extent_map *hole_em = NULL; - u64 range_start = start; - u64 end; - u64 found; - u64 found_end; - int err = 0; - - em = btrfs_get_extent(inode, page, pg_offset, start, len, create); - if (IS_ERR(em)) - return em; - if (em) { - /* - * if our em maps to a hole, there might - * actually be delalloc bytes behind it - */ - if (em->block_start != EXTENT_MAP_HOLE) - return em; - else - hole_em = em; - } - - /* check to see if we've wrapped (len == -1 or similar) */ - end = start + len; - if (end < start) - end = (u64)-1; - else - end -= 1; - - em = NULL; - - /* ok, we didn't find anything, lets look for delalloc */ - found = count_range_bits(&BTRFS_I(inode)->io_tree, &range_start, - end, len, EXTENT_DELALLOC, 1); - found_end = range_start + found; - if (found_end < range_start) - found_end = (u64)-1; - - /* - * we didn't find anything useful, return - * the original results from get_extent() - */ - if (range_start > end || found_end <= start) { - em = hole_em; - hole_em = NULL; - goto out; - } - - /* adjust the range_start to make sure it doesn't - * go backwards from the start they passed in - */ - range_start = max(start,range_start); 
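The clamping just above intersects the delalloc range reported by count_range_bits() with the caller's window; a minimal sketch of that interval arithmetic (helper name hypothetical, not part of the original file), e.g. a query of [8192, 16383] against delalloc [4096, 12287] reports the overlap [8192, 12287]:

static inline u64 sketch_overlap_len(u64 q_start, u64 q_end,
				     u64 d_start, u64 d_end)
{
	u64 lo = max(q_start, d_start);
	u64 hi = min(q_end, d_end);

	return hi >= lo ? hi - lo + 1 : 0;
}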
- found = found_end - range_start; - - if (found > 0) { - u64 hole_start = start; - u64 hole_len = len; - - em = alloc_extent_map(); - if (!em) { - err = -ENOMEM; - goto out; - } - /* - * when btrfs_get_extent can't find anything it - * returns one huge hole - * - * make sure what it found really fits our range, and - * adjust to make sure it is based on the start from - * the caller - */ - if (hole_em) { - u64 calc_end = extent_map_end(hole_em); - - if (calc_end <= start || (hole_em->start > end)) { - free_extent_map(hole_em); - hole_em = NULL; - } else { - hole_start = max(hole_em->start, start); - hole_len = calc_end - hole_start; - } - } - em->bdev = NULL; - if (hole_em && range_start > hole_start) { - /* our hole starts before our delalloc, so we - * have to return just the parts of the hole - * that go until the delalloc starts - */ - em->len = min(hole_len, - range_start - hole_start); - em->start = hole_start; - em->orig_start = hole_start; - /* - * don't adjust block start at all, - * it is fixed at EXTENT_MAP_HOLE - */ - em->block_start = hole_em->block_start; - em->block_len = hole_len; - } else { - em->start = range_start; - em->len = found; - em->orig_start = range_start; - em->block_start = EXTENT_MAP_DELALLOC; - em->block_len = found; - } - } else if (hole_em) { - return hole_em; - } -out: - - free_extent_map(hole_em); - if (err) { - free_extent_map(em); - return ERR_PTR(err); - } - return em; -} - -static struct extent_map *btrfs_new_extent_direct(struct inode *inode, - struct extent_map *em, - u64 start, u64 len) -{ - struct btrfs_root *root = BTRFS_I(inode)->root; - struct btrfs_trans_handle *trans; - struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree; - struct btrfs_key ins; - u64 alloc_hint; - int ret; - bool insert = false; - - /* - * Ok if the extent map we looked up is a hole and is for the exact - * range we want, there is no reason to allocate a new one, however if - * it is not right then we need to free this one and drop the cache for - * our range. - */ - if (em->block_start != EXTENT_MAP_HOLE || em->start != start || - em->len != len) { - free_extent_map(em); - em = NULL; - insert = true; - btrfs_drop_extent_cache(inode, start, start + len - 1, 0); - } - - trans = btrfs_join_transaction(root); - if (IS_ERR(trans)) - return ERR_CAST(trans); - - if (start <= BTRFS_I(inode)->disk_i_size && len < 64 * 1024) - btrfs_add_inode_defrag(trans, inode); - - trans->block_rsv = &root->fs_info->delalloc_block_rsv; - - alloc_hint = get_extent_allocation_hint(inode, start, len); - ret = btrfs_reserve_extent(trans, root, len, root->sectorsize, 0, - alloc_hint, &ins, 1); - if (ret) { - em = ERR_PTR(ret); - goto out; - } - - if (!em) { - em = alloc_extent_map(); - if (!em) { - em = ERR_PTR(-ENOMEM); - goto out; - } - } - - em->start = start; - em->orig_start = em->start; - em->len = ins.offset; - - em->block_start = ins.objectid; - em->block_len = ins.offset; - em->bdev = root->fs_info->fs_devices->latest_bdev; - - /* - * We need to do this because if we're using the original em we searched - * for, we could have EXTENT_FLAG_VACANCY set, and we don't want that. 
- */ - em->flags = 0; - set_bit(EXTENT_FLAG_PINNED, &em->flags); - - while (insert) { - write_lock(&em_tree->lock); - ret = add_extent_mapping(em_tree, em); - write_unlock(&em_tree->lock); - if (ret != -EEXIST) - break; - btrfs_drop_extent_cache(inode, start, start + em->len - 1, 0); - } - - ret = btrfs_add_ordered_extent_dio(inode, start, ins.objectid, - ins.offset, ins.offset, 0); - if (ret) { - btrfs_free_reserved_extent(root, ins.objectid, ins.offset); - em = ERR_PTR(ret); - } -out: - btrfs_end_transaction(trans, root); - return em; -} - -/* - * returns 1 when the nocow is safe, < 1 on error, 0 if the - * block must be cow'd - */ -static noinline int can_nocow_odirect(struct btrfs_trans_handle *trans, - struct inode *inode, u64 offset, u64 len) -{ - struct btrfs_path *path; - int ret; - struct extent_buffer *leaf; - struct btrfs_root *root = BTRFS_I(inode)->root; - struct btrfs_file_extent_item *fi; - struct btrfs_key key; - u64 disk_bytenr; - u64 backref_offset; - u64 extent_end; - u64 num_bytes; - int slot; - int found_type; - - path = btrfs_alloc_path(); - if (!path) - return -ENOMEM; - - ret = btrfs_lookup_file_extent(trans, root, path, btrfs_ino(inode), - offset, 0); - if (ret < 0) - goto out; - - slot = path->slots[0]; - if (ret == 1) { - if (slot == 0) { - /* can't find the item, must cow */ - ret = 0; - goto out; - } - slot--; - } - ret = 0; - leaf = path->nodes[0]; - btrfs_item_key_to_cpu(leaf, &key, slot); - if (key.objectid != btrfs_ino(inode) || - key.type != BTRFS_EXTENT_DATA_KEY) { - /* not our file or wrong item type, must cow */ - goto out; - } - - if (key.offset > offset) { - /* Wrong offset, must cow */ - goto out; - } - - fi = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item); - found_type = btrfs_file_extent_type(leaf, fi); - if (found_type != BTRFS_FILE_EXTENT_REG && - found_type != BTRFS_FILE_EXTENT_PREALLOC) { - /* not a regular extent, must cow */ - goto out; - } - disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi); - backref_offset = btrfs_file_extent_offset(leaf, fi); - - extent_end = key.offset + btrfs_file_extent_num_bytes(leaf, fi); - if (extent_end < offset + len) { - /* extent doesn't include our full range, must cow */ - goto out; - } - - if (btrfs_extent_readonly(root, disk_bytenr)) - goto out; - - /* - * look for other files referencing this extent, if we - * find any we must cow - */ - if (btrfs_cross_ref_exist(trans, root, btrfs_ino(inode), - key.offset - backref_offset, disk_bytenr)) - goto out; - - /* - * adjust disk_bytenr and num_bytes to cover just the bytes - * in this extent we are about to write. If there - * are any csums in that range we have to cow in order - * to keep the csums correct - */ - disk_bytenr += backref_offset; - disk_bytenr += offset - key.offset; - num_bytes = min(offset + len, extent_end) - offset; - if (csum_exist_in_range(root, disk_bytenr, num_bytes)) - goto out; - /* - * all of the above have passed, it is safe to overwrite this extent - * without cow - */ - ret = 1; -out: - btrfs_free_path(path); - return ret; -} - -static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock, - struct buffer_head *bh_result, int create) -{ - struct extent_map *em; - struct btrfs_root *root = BTRFS_I(inode)->root; - u64 start = iblock << inode->i_blkbits; - u64 len = bh_result->b_size; - struct btrfs_trans_handle *trans; - - em = btrfs_get_extent(inode, NULL, 0, start, len, 0); - if (IS_ERR(em)) - return PTR_ERR(em); - - /* - * Ok for INLINE and COMPRESSED extents we need to fallback on buffered - * io. 
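The direct IO entry point above converts the block-based get_block interface into byte offsets before looking up extents; a minimal sketch of that conversion (helper name hypothetical, not part of the original file), e.g. with i_blkbits == 12, block 3 maps to byte offset 12288:

static inline u64 sketch_block_to_bytes(sector_t iblock, unsigned int blkbits)
{
	return (u64)iblock << blkbits;
}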
INLINE is special, and we could probably kludge it in here, but - * it's still buffered so for safety lets just fall back to the generic - * buffered path. - * - * For COMPRESSED we _have_ to read the entire extent in so we can - * decompress it, so there will be buffering required no matter what we - * do, so go ahead and fallback to buffered. - * - * We return -ENOTBLK because thats what makes DIO go ahead and go back - * to buffered IO. Don't blame me, this is the price we pay for using - * the generic code. - */ - if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags) || - em->block_start == EXTENT_MAP_INLINE) { - free_extent_map(em); - return -ENOTBLK; - } - - /* Just a good old fashioned hole, return */ - if (!create && (em->block_start == EXTENT_MAP_HOLE || - test_bit(EXTENT_FLAG_PREALLOC, &em->flags))) { - free_extent_map(em); - /* DIO will do one hole at a time, so just unlock a sector */ - unlock_extent(&BTRFS_I(inode)->io_tree, start, - start + root->sectorsize - 1); - return 0; - } - - /* - * We don't allocate a new extent in the following cases - * - * 1) The inode is marked as NODATACOW. In this case we'll just use the - * existing extent. - * 2) The extent is marked as PREALLOC. We're good to go here and can - * just use the extent. - * - */ - if (!create) { - len = em->len - (start - em->start); - goto map; - } - - if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags) || - ((BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW) && - em->block_start != EXTENT_MAP_HOLE)) { - int type; - int ret; - u64 block_start; - - if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) - type = BTRFS_ORDERED_PREALLOC; - else - type = BTRFS_ORDERED_NOCOW; - len = min(len, em->len - (start - em->start)); - block_start = em->block_start + (start - em->start); - - /* - * we're not going to log anything, but we do need - * to make sure the current transaction stays open - * while we look for nocow cross refs - */ - trans = btrfs_join_transaction(root); - if (IS_ERR(trans)) - goto must_cow; - - if (can_nocow_odirect(trans, inode, start, len) == 1) { - ret = btrfs_add_ordered_extent_dio(inode, start, - block_start, len, len, type); - btrfs_end_transaction(trans, root); - if (ret) { - free_extent_map(em); - return ret; - } - goto unlock; - } - btrfs_end_transaction(trans, root); - } -must_cow: - /* - * this will cow the extent, reset the len in case we changed - * it above - */ - len = bh_result->b_size; - em = btrfs_new_extent_direct(inode, em, start, len); - if (IS_ERR(em)) - return PTR_ERR(em); - len = min(len, em->len - (start - em->start)); -unlock: - clear_extent_bit(&BTRFS_I(inode)->io_tree, start, start + len - 1, - EXTENT_LOCKED | EXTENT_DELALLOC | EXTENT_DIRTY, 1, - 0, NULL, GFP_NOFS); -map: - bh_result->b_blocknr = (em->block_start + (start - em->start)) >> - inode->i_blkbits; - bh_result->b_size = len; - bh_result->b_bdev = em->bdev; - set_buffer_mapped(bh_result); - if (create && !test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) - set_buffer_new(bh_result); - - free_extent_map(em); - - return 0; -} - -struct btrfs_dio_private { - struct inode *inode; - u64 logical_offset; - u64 disk_bytenr; - u64 bytes; - u32 *csums; - void *private; - - /* number of bios pending for this dio */ - atomic_t pending_bios; - - /* IO errors */ - int errors; - - struct bio *orig_bio; -}; - -static void btrfs_endio_direct_read(struct bio *bio, int err) -{ - struct btrfs_dio_private *dip = bio->bi_private; - struct bio_vec *bvec_end = bio->bi_io_vec + bio->bi_vcnt - 1; - struct bio_vec *bvec = bio->bi_io_vec; - struct inode *inode = 
dip->inode; - struct btrfs_root *root = BTRFS_I(inode)->root; - u64 start; - u32 *private = dip->csums; - - start = dip->logical_offset; - do { - if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)) { - struct page *page = bvec->bv_page; - char *kaddr; - u32 csum = ~(u32)0; - unsigned long flags; - - local_irq_save(flags); - kaddr = kmap_atomic(page); - csum = btrfs_csum_data(root, kaddr + bvec->bv_offset, - csum, bvec->bv_len); - btrfs_csum_final(csum, (char *)&csum); - kunmap_atomic(kaddr); - local_irq_restore(flags); - - flush_dcache_page(bvec->bv_page); - if (csum != *private) { - printk(KERN_ERR "btrfs csum failed ino %llu off" - " %llu csum %u private %u\n", - (unsigned long long)btrfs_ino(inode), - (unsigned long long)start, - csum, *private); - err = -EIO; - } - } - - start += bvec->bv_len; - private++; - bvec++; - } while (bvec <= bvec_end); - - unlock_extent(&BTRFS_I(inode)->io_tree, dip->logical_offset, - dip->logical_offset + dip->bytes - 1); - bio->bi_private = dip->private; - - kfree(dip->csums); - kfree(dip); - - /* If we had a csum failure make sure to clear the uptodate flag */ - if (err) - clear_bit(BIO_UPTODATE, &bio->bi_flags); - dio_end_io(bio, err); -} - -static void btrfs_endio_direct_write(struct bio *bio, int err) -{ - struct btrfs_dio_private *dip = bio->bi_private; - struct inode *inode = dip->inode; - struct btrfs_root *root = BTRFS_I(inode)->root; - struct btrfs_trans_handle *trans; - struct btrfs_ordered_extent *ordered = NULL; - struct extent_state *cached_state = NULL; - u64 ordered_offset = dip->logical_offset; - u64 ordered_bytes = dip->bytes; - int ret; - - if (err) - goto out_done; -again: - ret = btrfs_dec_test_first_ordered_pending(inode, &ordered, - &ordered_offset, - ordered_bytes); - if (!ret) - goto out_test; - - BUG_ON(!ordered); - - trans = btrfs_join_transaction(root); - if (IS_ERR(trans)) { - err = -ENOMEM; - goto out; - } - trans->block_rsv = &root->fs_info->delalloc_block_rsv; - - if (test_bit(BTRFS_ORDERED_NOCOW, &ordered->flags)) { - ret = btrfs_ordered_update_i_size(inode, 0, ordered); - if (!ret) - err = btrfs_update_inode_fallback(trans, root, inode); - goto out; - } - - lock_extent_bits(&BTRFS_I(inode)->io_tree, ordered->file_offset, - ordered->file_offset + ordered->len - 1, 0, - &cached_state); - - if (test_bit(BTRFS_ORDERED_PREALLOC, &ordered->flags)) { - ret = btrfs_mark_extent_written(trans, inode, - ordered->file_offset, - ordered->file_offset + - ordered->len); - if (ret) { - err = ret; - goto out_unlock; - } - } else { - ret = insert_reserved_file_extent(trans, inode, - ordered->file_offset, - ordered->start, - ordered->disk_len, - ordered->len, - ordered->len, - 0, 0, 0, - BTRFS_FILE_EXTENT_REG); - unpin_extent_cache(&BTRFS_I(inode)->extent_tree, - ordered->file_offset, ordered->len); - if (ret) { - err = ret; - WARN_ON(1); - goto out_unlock; - } - } - - add_pending_csums(trans, inode, ordered->file_offset, &ordered->list); - ret = btrfs_ordered_update_i_size(inode, 0, ordered); - if (!ret || !test_bit(BTRFS_ORDERED_PREALLOC, &ordered->flags)) - btrfs_update_inode_fallback(trans, root, inode); - ret = 0; -out_unlock: - unlock_extent_cached(&BTRFS_I(inode)->io_tree, ordered->file_offset, - ordered->file_offset + ordered->len - 1, - &cached_state, GFP_NOFS); -out: - btrfs_delalloc_release_metadata(inode, ordered->len); - btrfs_end_transaction(trans, root); - ordered_offset = ordered->file_offset + ordered->len; - btrfs_put_ordered_extent(ordered); - btrfs_put_ordered_extent(ordered); - -out_test: - /* - * our bio might span 
multiple ordered extents. If we haven't - * completed the accounting for the whole dio, go back and try again - */ - if (ordered_offset < dip->logical_offset + dip->bytes) { - ordered_bytes = dip->logical_offset + dip->bytes - - ordered_offset; - goto again; - } -out_done: - bio->bi_private = dip->private; - - kfree(dip->csums); - kfree(dip); - - /* If we had an error make sure to clear the uptodate flag */ - if (err) - clear_bit(BIO_UPTODATE, &bio->bi_flags); - dio_end_io(bio, err); -} - -static int __btrfs_submit_bio_start_direct_io(struct inode *inode, int rw, - struct bio *bio, int mirror_num, - unsigned long bio_flags, u64 offset) -{ - int ret; - struct btrfs_root *root = BTRFS_I(inode)->root; - ret = btrfs_csum_one_bio(root, inode, bio, offset, 1); - BUG_ON(ret); /* -ENOMEM */ - return 0; -} - -static void btrfs_end_dio_bio(struct bio *bio, int err) -{ - struct btrfs_dio_private *dip = bio->bi_private; - - if (err) { - printk(KERN_ERR "btrfs direct IO failed ino %llu rw %lu " - "sector %#Lx len %u err no %d\n", - (unsigned long long)btrfs_ino(dip->inode), bio->bi_rw, - (unsigned long long)bio->bi_sector, bio->bi_size, err); - dip->errors = 1; - - /* - * before atomic variable goto zero, we must make sure - * dip->errors is perceived to be set. - */ - smp_mb__before_atomic_dec(); - } - - /* if there are more bios still pending for this dio, just exit */ - if (!atomic_dec_and_test(&dip->pending_bios)) - goto out; - - if (dip->errors) - bio_io_error(dip->orig_bio); - else { - set_bit(BIO_UPTODATE, &dip->orig_bio->bi_flags); - bio_endio(dip->orig_bio, 0); - } -out: - bio_put(bio); -} - -static struct bio *btrfs_dio_bio_alloc(struct block_device *bdev, - u64 first_sector, gfp_t gfp_flags) -{ - int nr_vecs = bio_get_nr_vecs(bdev); - return btrfs_bio_alloc(bdev, first_sector, nr_vecs, gfp_flags); -} - -static inline int __btrfs_submit_dio_bio(struct bio *bio, struct inode *inode, - int rw, u64 file_offset, int skip_sum, - u32 *csums, int async_submit) -{ - int write = rw & REQ_WRITE; - struct btrfs_root *root = BTRFS_I(inode)->root; - int ret; - - bio_get(bio); - ret = btrfs_bio_wq_end_io(root->fs_info, bio, 0); - if (ret) - goto err; - - if (skip_sum) - goto map; - - if (write && async_submit) { - ret = btrfs_wq_submit_bio(root->fs_info, - inode, rw, bio, 0, 0, - file_offset, - __btrfs_submit_bio_start_direct_io, - __btrfs_submit_bio_done); - goto err; - } else if (write) { - /* - * If we aren't doing async submit, calculate the csum of the - * bio now. 
- */ - ret = btrfs_csum_one_bio(root, inode, bio, file_offset, 1); - if (ret) - goto err; - } else if (!skip_sum) { - ret = btrfs_lookup_bio_sums_dio(root, inode, bio, - file_offset, csums); - if (ret) - goto err; - } - -map: - ret = btrfs_map_bio(root, rw, bio, 0, async_submit); -err: - bio_put(bio); - return ret; -} - -static int btrfs_submit_direct_hook(int rw, struct btrfs_dio_private *dip, - int skip_sum) -{ - struct inode *inode = dip->inode; - struct btrfs_root *root = BTRFS_I(inode)->root; - struct btrfs_mapping_tree *map_tree = &root->fs_info->mapping_tree; - struct bio *bio; - struct bio *orig_bio = dip->orig_bio; - struct bio_vec *bvec = orig_bio->bi_io_vec; - u64 start_sector = orig_bio->bi_sector; - u64 file_offset = dip->logical_offset; - u64 submit_len = 0; - u64 map_length; - int nr_pages = 0; - u32 *csums = dip->csums; - int ret = 0; - int async_submit = 0; - int write = rw & REQ_WRITE; - - map_length = orig_bio->bi_size; - ret = btrfs_map_block(map_tree, READ, start_sector << 9, - &map_length, NULL, 0); - if (ret) { - bio_put(orig_bio); - return -EIO; - } - - if (map_length >= orig_bio->bi_size) { - bio = orig_bio; - goto submit; - } - - async_submit = 1; - bio = btrfs_dio_bio_alloc(orig_bio->bi_bdev, start_sector, GFP_NOFS); - if (!bio) - return -ENOMEM; - bio->bi_private = dip; - bio->bi_end_io = btrfs_end_dio_bio; - atomic_inc(&dip->pending_bios); - - while (bvec <= (orig_bio->bi_io_vec + orig_bio->bi_vcnt - 1)) { - if (unlikely(map_length < submit_len + bvec->bv_len || - bio_add_page(bio, bvec->bv_page, bvec->bv_len, - bvec->bv_offset) < bvec->bv_len)) { - /* - * inc the count before we submit the bio so - * we know the end IO handler won't happen before - * we inc the count. Otherwise, the dip might get freed - * before we're done setting it up - */ - atomic_inc(&dip->pending_bios); - ret = __btrfs_submit_dio_bio(bio, inode, rw, - file_offset, skip_sum, - csums, async_submit); - if (ret) { - bio_put(bio); - atomic_dec(&dip->pending_bios); - goto out_err; - } - - /* Write's use the ordered csums */ - if (!write && !skip_sum) - csums = csums + nr_pages; - start_sector += submit_len >> 9; - file_offset += submit_len; - - submit_len = 0; - nr_pages = 0; - - bio = btrfs_dio_bio_alloc(orig_bio->bi_bdev, - start_sector, GFP_NOFS); - if (!bio) - goto out_err; - bio->bi_private = dip; - bio->bi_end_io = btrfs_end_dio_bio; - - map_length = orig_bio->bi_size; - ret = btrfs_map_block(map_tree, READ, start_sector << 9, - &map_length, NULL, 0); - if (ret) { - bio_put(bio); - goto out_err; - } - } else { - submit_len += bvec->bv_len; - nr_pages ++; - bvec++; - } - } - -submit: - ret = __btrfs_submit_dio_bio(bio, inode, rw, file_offset, skip_sum, - csums, async_submit); - if (!ret) - return 0; - - bio_put(bio); -out_err: - dip->errors = 1; - /* - * before atomic variable goto zero, we must - * make sure dip->errors is perceived to be set. 
- */ - smp_mb__before_atomic_dec(); - if (atomic_dec_and_test(&dip->pending_bios)) - bio_io_error(dip->orig_bio); - - /* bio_end_io() will handle error, so we needn't return it */ - return 0; -} - -static void btrfs_submit_direct(int rw, struct bio *bio, struct inode *inode, - loff_t file_offset) -{ - struct btrfs_root *root = BTRFS_I(inode)->root; - struct btrfs_dio_private *dip; - struct bio_vec *bvec = bio->bi_io_vec; - int skip_sum; - int write = rw & REQ_WRITE; - int ret = 0; - - skip_sum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM; - - dip = kmalloc(sizeof(*dip), GFP_NOFS); - if (!dip) { - ret = -ENOMEM; - goto free_ordered; - } - dip->csums = NULL; - - /* Write's use the ordered csum stuff, so we don't need dip->csums */ - if (!write && !skip_sum) { - dip->csums = kmalloc(sizeof(u32) * bio->bi_vcnt, GFP_NOFS); - if (!dip->csums) { - kfree(dip); - ret = -ENOMEM; - goto free_ordered; - } - } - - dip->private = bio->bi_private; - dip->inode = inode; - dip->logical_offset = file_offset; - - dip->bytes = 0; - do { - dip->bytes += bvec->bv_len; - bvec++; - } while (bvec <= (bio->bi_io_vec + bio->bi_vcnt - 1)); - - dip->disk_bytenr = (u64)bio->bi_sector << 9; - bio->bi_private = dip; - dip->errors = 0; - dip->orig_bio = bio; - atomic_set(&dip->pending_bios, 0); - - if (write) - bio->bi_end_io = btrfs_endio_direct_write; - else - bio->bi_end_io = btrfs_endio_direct_read; - - ret = btrfs_submit_direct_hook(rw, dip, skip_sum); - if (!ret) - return; -free_ordered: - /* - * If this is a write, we need to clean up the reserved space and kill - * the ordered extent. - */ - if (write) { - struct btrfs_ordered_extent *ordered; - ordered = btrfs_lookup_ordered_extent(inode, file_offset); - if (!test_bit(BTRFS_ORDERED_PREALLOC, &ordered->flags) && - !test_bit(BTRFS_ORDERED_NOCOW, &ordered->flags)) - btrfs_free_reserved_extent(root, ordered->start, - ordered->disk_len); - btrfs_put_ordered_extent(ordered); - btrfs_put_ordered_extent(ordered); - } - bio_endio(bio, ret); -} - -static ssize_t check_direct_IO(struct btrfs_root *root, int rw, struct kiocb *iocb, - const struct iovec *iov, loff_t offset, - unsigned long nr_segs) -{ - int seg; - int i; - size_t size; - unsigned long addr; - unsigned blocksize_mask = root->sectorsize - 1; - ssize_t retval = -EINVAL; - loff_t end = offset; - - if (offset & blocksize_mask) - goto out; - - /* Check the memory alignment. Blocks cannot straddle pages */ - for (seg = 0; seg < nr_segs; seg++) { - addr = (unsigned long)iov[seg].iov_base; - size = iov[seg].iov_len; - end += size; - if ((addr & blocksize_mask) || (size & blocksize_mask)) - goto out; - - /* If this is a write we don't need to check anymore */ - if (rw & WRITE) - continue; - - /* - * Check to make sure we don't have duplicate iov_base's in this - * iovec, if so return EINVAL, otherwise we'll get csum errors - * when reading back. 
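A minimal sketch of the alignment rule checked above (helper name hypothetical, not part of the original file): the file offset, each user buffer address and each segment length must all be multiples of the sector size, e.g. with a 4096 byte sector, offset 8192 with a page aligned 4096 byte buffer passes, while offset 8200 forces the fall back to buffered IO:

static inline int sketch_dio_aligned(u64 offset, unsigned long addr,
				     size_t size, u64 sectorsize)
{
	u64 mask = sectorsize - 1;

	return !((offset & mask) | (addr & mask) | ((u64)size & mask));
}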
- */ - for (i = seg + 1; i < nr_segs; i++) { - if (iov[seg].iov_base == iov[i].iov_base) - goto out; - } - } - retval = 0; -out: - return retval; -} -static ssize_t btrfs_direct_IO(int rw, struct kiocb *iocb, - const struct iovec *iov, loff_t offset, - unsigned long nr_segs) -{ - struct file *file = iocb->ki_filp; - struct inode *inode = file->f_mapping->host; - struct btrfs_ordered_extent *ordered; - struct extent_state *cached_state = NULL; - u64 lockstart, lockend; - ssize_t ret; - int writing = rw & WRITE; - int write_bits = 0; - size_t count = iov_length(iov, nr_segs); - - if (check_direct_IO(BTRFS_I(inode)->root, rw, iocb, iov, - offset, nr_segs)) { - return 0; - } - - lockstart = offset; - lockend = offset + count - 1; - - if (writing) { - ret = btrfs_delalloc_reserve_space(inode, count); - if (ret) - goto out; - } - - while (1) { - lock_extent_bits(&BTRFS_I(inode)->io_tree, lockstart, lockend, - 0, &cached_state); - /* - * We're concerned with the entire range that we're going to be - * doing DIO to, so we need to make sure theres no ordered - * extents in this range. - */ - ordered = btrfs_lookup_ordered_range(inode, lockstart, - lockend - lockstart + 1); - if (!ordered) - break; - unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart, lockend, - &cached_state, GFP_NOFS); - btrfs_start_ordered_extent(inode, ordered, 1); - btrfs_put_ordered_extent(ordered); - cond_resched(); - } - - /* - * we don't use btrfs_set_extent_delalloc because we don't want - * the dirty or uptodate bits - */ - if (writing) { - write_bits = EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING; - ret = set_extent_bit(&BTRFS_I(inode)->io_tree, lockstart, lockend, - EXTENT_DELALLOC, NULL, &cached_state, - GFP_NOFS); - if (ret) { - clear_extent_bit(&BTRFS_I(inode)->io_tree, lockstart, - lockend, EXTENT_LOCKED | write_bits, - 1, 0, &cached_state, GFP_NOFS); - goto out; - } - } - - free_extent_state(cached_state); - cached_state = NULL; - - ret = __blockdev_direct_IO(rw, iocb, inode, - BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev, - iov, offset, nr_segs, btrfs_get_blocks_direct, NULL, - btrfs_submit_direct, 0); - - if (ret < 0 && ret != -EIOCBQUEUED) { - clear_extent_bit(&BTRFS_I(inode)->io_tree, offset, - offset + iov_length(iov, nr_segs) - 1, - EXTENT_LOCKED | write_bits, 1, 0, - &cached_state, GFP_NOFS); - } else if (ret >= 0 && ret < iov_length(iov, nr_segs)) { - /* - * We're falling back to buffered, unlock the section we didn't - * do IO on. 
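When __blockdev_direct_IO() completes only part of the request, the untouched tail of the locked range still has to be cleaned up, as done just below; a minimal sketch of the range arithmetic (helper name hypothetical, not part of the original file), e.g. offset 0, count 8192 and ret 4096 leave [4096, 8191] to unlock:

static inline void sketch_dio_tail(loff_t offset, size_t count, ssize_t ret,
				   u64 *tail_start, u64 *tail_end)
{
	*tail_start = offset + ret;		/* first byte not written */
	*tail_end = offset + count - 1;		/* end of the locked range */
}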
- */ - clear_extent_bit(&BTRFS_I(inode)->io_tree, offset + ret, - offset + iov_length(iov, nr_segs) - 1, - EXTENT_LOCKED | write_bits, 1, 0, - &cached_state, GFP_NOFS); - } -out: - free_extent_state(cached_state); - return ret; -} - -static int btrfs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, - __u64 start, __u64 len) -{ - return extent_fiemap(inode, fieinfo, start, len, btrfs_get_extent_fiemap); -} - -int btrfs_readpage(struct file *file, struct page *page) -{ - struct extent_io_tree *tree; - tree = &BTRFS_I(page->mapping->host)->io_tree; - return extent_read_full_page(tree, page, btrfs_get_extent, 0); -} - -static int btrfs_writepage(struct page *page, struct writeback_control *wbc) -{ - struct extent_io_tree *tree; - - - if (current->flags & PF_MEMALLOC) { - redirty_page_for_writepage(wbc, page); - unlock_page(page); - return 0; - } - tree = &BTRFS_I(page->mapping->host)->io_tree; - return extent_write_full_page(tree, page, btrfs_get_extent, wbc); -} - -int btrfs_writepages(struct address_space *mapping, - struct writeback_control *wbc) -{ - struct extent_io_tree *tree; - - tree = &BTRFS_I(mapping->host)->io_tree; - return extent_writepages(tree, mapping, btrfs_get_extent, wbc); -} - -static int -btrfs_readpages(struct file *file, struct address_space *mapping, - struct list_head *pages, unsigned nr_pages) -{ - struct extent_io_tree *tree; - tree = &BTRFS_I(mapping->host)->io_tree; - return extent_readpages(tree, mapping, pages, nr_pages, - btrfs_get_extent); -} -static int __btrfs_releasepage(struct page *page, gfp_t gfp_flags) -{ - struct extent_io_tree *tree; - struct extent_map_tree *map; - int ret; - - tree = &BTRFS_I(page->mapping->host)->io_tree; - map = &BTRFS_I(page->mapping->host)->extent_tree; - ret = try_release_extent_mapping(map, tree, page, gfp_flags); - if (ret == 1) { - ClearPagePrivate(page); - set_page_private(page, 0); - page_cache_release(page); - } - return ret; -} - -static int btrfs_releasepage(struct page *page, gfp_t gfp_flags) -{ - if (PageWriteback(page) || PageDirty(page)) - return 0; - return __btrfs_releasepage(page, gfp_flags & GFP_NOFS); -} - -static void btrfs_invalidatepage(struct page *page, unsigned long offset) -{ - struct extent_io_tree *tree; - struct btrfs_ordered_extent *ordered; - struct extent_state *cached_state = NULL; - u64 page_start = page_offset(page); - u64 page_end = page_start + PAGE_CACHE_SIZE - 1; - - - /* - * we have the page locked, so new writeback can't start, - * and the dirty bit won't be cleared while we are here. 
- * - * Wait for IO on this page so that we can safely clear - * the PagePrivate2 bit and do ordered accounting - */ - wait_on_page_writeback(page); - - tree = &BTRFS_I(page->mapping->host)->io_tree; - if (offset) { - btrfs_releasepage(page, GFP_NOFS); - return; - } - lock_extent_bits(tree, page_start, page_end, 0, &cached_state); - ordered = btrfs_lookup_ordered_extent(page->mapping->host, - page_offset(page)); - if (ordered) { - /* - * IO on this page will never be started, so we need - * to account for any ordered extents now - */ - clear_extent_bit(tree, page_start, page_end, - EXTENT_DIRTY | EXTENT_DELALLOC | - EXTENT_LOCKED | EXTENT_DO_ACCOUNTING, 1, 0, - &cached_state, GFP_NOFS); - /* - * whoever cleared the private bit is responsible - * for the finish_ordered_io - */ - if (TestClearPagePrivate2(page)) { - btrfs_finish_ordered_io(page->mapping->host, - page_start, page_end); - } - btrfs_put_ordered_extent(ordered); - cached_state = NULL; - lock_extent_bits(tree, page_start, page_end, 0, &cached_state); - } - clear_extent_bit(tree, page_start, page_end, - EXTENT_LOCKED | EXTENT_DIRTY | EXTENT_DELALLOC | - EXTENT_DO_ACCOUNTING, 1, 1, &cached_state, GFP_NOFS); - __btrfs_releasepage(page, GFP_NOFS); - - ClearPageChecked(page); - if (PagePrivate(page)) { - ClearPagePrivate(page); - set_page_private(page, 0); - page_cache_release(page); - } -} - -/* - * btrfs_page_mkwrite() is not allowed to change the file size as it gets - * called from a page fault handler when a page is first dirtied. Hence we must - * be careful to check for EOF conditions here. We set the page up correctly - * for a written page which means we get ENOSPC checking when writing into - * holes and correct delalloc and unwritten extent mapping on filesystems that - * support these features. - * - * We are not allowed to take the i_mutex here so we have to play games to - * protect against truncate races as the page could now be beyond EOF. Because - * vmtruncate() writes the inode size before removing pages, once we have the - * page lock we can determine safely if the page is beyond EOF. If it is not - * beyond EOF, then the page is guaranteed safe against truncation until we - * unlock the page. 
- */ -int btrfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) -{ - struct page *page = vmf->page; - struct inode *inode = fdentry(vma->vm_file)->d_inode; - struct btrfs_root *root = BTRFS_I(inode)->root; - struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; - struct btrfs_ordered_extent *ordered; - struct extent_state *cached_state = NULL; - char *kaddr; - unsigned long zero_start; - loff_t size; - int ret; - int reserved = 0; - u64 page_start; - u64 page_end; - - ret = btrfs_delalloc_reserve_space(inode, PAGE_CACHE_SIZE); - if (!ret) { - ret = btrfs_update_time(vma->vm_file); - reserved = 1; - } - if (ret) { - if (ret == -ENOMEM) - ret = VM_FAULT_OOM; - else /* -ENOSPC, -EIO, etc */ - ret = VM_FAULT_SIGBUS; - if (reserved) - goto out; - goto out_noreserve; - } - - ret = VM_FAULT_NOPAGE; /* make the VM retry the fault */ -again: - lock_page(page); - size = i_size_read(inode); - page_start = page_offset(page); - page_end = page_start + PAGE_CACHE_SIZE - 1; - - if ((page->mapping != inode->i_mapping) || - (page_start >= size)) { - /* page got truncated out from underneath us */ - goto out_unlock; - } - wait_on_page_writeback(page); - - lock_extent_bits(io_tree, page_start, page_end, 0, &cached_state); - set_page_extent_mapped(page); - - /* - * we can't set the delalloc bits if there are pending ordered - * extents. Drop our locks and wait for them to finish - */ - ordered = btrfs_lookup_ordered_extent(inode, page_start); - if (ordered) { - unlock_extent_cached(io_tree, page_start, page_end, - &cached_state, GFP_NOFS); - unlock_page(page); - btrfs_start_ordered_extent(inode, ordered, 1); - btrfs_put_ordered_extent(ordered); - goto again; - } - - /* - * XXX - page_mkwrite gets called every time the page is dirtied, even - * if it was already dirty, so for space accounting reasons we need to - * clear any delalloc bits for the range we are fixing to save. There - * is probably a better way to do this, but for now keep consistent with - * prepare_pages in the normal write path. 
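/*
 * btrfs_page_mkwrite() above runs the first time a shared writable mapping
 * dirties a page: that is where the delalloc space reservation (and thus the
 * ENOSPC check) happens, and a failed reservation surfaces as SIGBUS rather
 * than a write() error.  A minimal userspace sketch of the path that reaches
 * it, using only plain POSIX calls (the file name is illustrative):
 */
#include <fcntl.h>
#include <string.h>
#include <sys/mman.h>
#include <unistd.h>

int main(void)
{
    int fd = open("testfile", O_RDWR | O_CREAT, 0644);
    char *p;

    if (fd < 0 || ftruncate(fd, 4096) < 0)
        return 1;
    p = mmap(NULL, 4096, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
    if (p == MAP_FAILED)
        return 1;

    /* The first store faults the page writable -> ->page_mkwrite(). */
    memset(p, 'x', 4096);

    /* Writeback turns the delalloc range into an ordered extent. */
    msync(p, 4096, MS_SYNC);
    munmap(p, 4096);
    close(fd);
    return 0;
}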
- */ - clear_extent_bit(&BTRFS_I(inode)->io_tree, page_start, page_end, - EXTENT_DIRTY | EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING, - 0, 0, &cached_state, GFP_NOFS); - - ret = btrfs_set_extent_delalloc(inode, page_start, page_end, - &cached_state); - if (ret) { - unlock_extent_cached(io_tree, page_start, page_end, - &cached_state, GFP_NOFS); - ret = VM_FAULT_SIGBUS; - goto out_unlock; - } - ret = 0; - - /* page is wholly or partially inside EOF */ - if (page_start + PAGE_CACHE_SIZE > size) - zero_start = size & ~PAGE_CACHE_MASK; - else - zero_start = PAGE_CACHE_SIZE; - - if (zero_start != PAGE_CACHE_SIZE) { - kaddr = kmap(page); - memset(kaddr + zero_start, 0, PAGE_CACHE_SIZE - zero_start); - flush_dcache_page(page); - kunmap(page); - } - ClearPageChecked(page); - set_page_dirty(page); - SetPageUptodate(page); - - BTRFS_I(inode)->last_trans = root->fs_info->generation; - BTRFS_I(inode)->last_sub_trans = BTRFS_I(inode)->root->log_transid; - - unlock_extent_cached(io_tree, page_start, page_end, &cached_state, GFP_NOFS); - -out_unlock: - if (!ret) - return VM_FAULT_LOCKED; - unlock_page(page); -out: - btrfs_delalloc_release_space(inode, PAGE_CACHE_SIZE); -out_noreserve: - return ret; -} - -static int btrfs_truncate(struct inode *inode) -{ - struct btrfs_root *root = BTRFS_I(inode)->root; - struct btrfs_block_rsv *rsv; - int ret; - int err = 0; - struct btrfs_trans_handle *trans; - unsigned long nr; - u64 mask = root->sectorsize - 1; - u64 min_size = btrfs_calc_trunc_metadata_size(root, 1); - - ret = btrfs_truncate_page(inode->i_mapping, inode->i_size); - if (ret) - return ret; - - btrfs_wait_ordered_range(inode, inode->i_size & (~mask), (u64)-1); - btrfs_ordered_update_i_size(inode, inode->i_size, NULL); - - /* - * Yes ladies and gentelment, this is indeed ugly. The fact is we have - * 3 things going on here - * - * 1) We need to reserve space for our orphan item and the space to - * delete our orphan item. Lord knows we don't want to have a dangling - * orphan item because we didn't reserve space to remove it. - * - * 2) We need to reserve space to update our inode. - * - * 3) We need to have something to cache all the space that is going to - * be free'd up by the truncate operation, but also have some slack - * space reserved in case it uses space during the truncate (thank you - * very much snapshotting). - * - * And we need these to all be seperate. The fact is we can use alot of - * space doing the truncate, and we have no earthly idea how much space - * we will use, so we need the truncate reservation to be seperate so it - * doesn't end up using space reserved for updating the inode or - * removing the orphan item. We also need to be able to stop the - * transaction and start a new one, which means we need to be able to - * update the inode several times, and we have no idea of knowing how - * many times that will be, so we can't just reserve 1 item for the - * entirety of the opration, so that has to be done seperately as well. - * Then there is the orphan item, which does indeed need to be held on - * to for the whole operation, and we need nobody to touch this reserved - * space except the orphan code. - * - * So that leaves us with - * - * 1) root->orphan_block_rsv - for the orphan deletion. - * 2) rsv - for the truncate reservation, which we will steal from the - * transaction reservation. - * 3) fs_info->trans_block_rsv - this will have 1 items worth left for - * updating the inode. 
- */ - rsv = btrfs_alloc_block_rsv(root); - if (!rsv) - return -ENOMEM; - rsv->size = min_size; - - /* - * 1 for the truncate slack space - * 1 for the orphan item we're going to add - * 1 for the orphan item deletion - * 1 for updating the inode. - */ - trans = btrfs_start_transaction(root, 4); - if (IS_ERR(trans)) { - err = PTR_ERR(trans); - goto out; - } - - /* Migrate the slack space for the truncate to our reserve */ - ret = btrfs_block_rsv_migrate(&root->fs_info->trans_block_rsv, rsv, - min_size); - BUG_ON(ret); - - ret = btrfs_orphan_add(trans, inode); - if (ret) { - btrfs_end_transaction(trans, root); - goto out; - } - - /* - * setattr is responsible for setting the ordered_data_close flag, - * but that is only tested during the last file release. That - * could happen well after the next commit, leaving a great big - * window where new writes may get lost if someone chooses to write - * to this file after truncating to zero - * - * The inode doesn't have any dirty data here, and so if we commit - * this is a noop. If someone immediately starts writing to the inode - * it is very likely we'll catch some of their writes in this - * transaction, and the commit will find this file on the ordered - * data list with good things to send down. - * - * This is a best effort solution, there is still a window where - * using truncate to replace the contents of the file will - * end up with a zero length file after a crash. - */ - if (inode->i_size == 0 && BTRFS_I(inode)->ordered_data_close) - btrfs_add_ordered_operation(trans, root, inode); - - while (1) { - ret = btrfs_block_rsv_refill(root, rsv, min_size); - if (ret) { - /* - * This can only happen with the original transaction we - * started above, every other time we shouldn't have a - * transaction started yet. - */ - if (ret == -EAGAIN) - goto end_trans; - err = ret; - break; - } - - if (!trans) { - /* Just need the 1 for updating the inode */ - trans = btrfs_start_transaction(root, 1); - if (IS_ERR(trans)) { - ret = err = PTR_ERR(trans); - trans = NULL; - break; - } - } - - trans->block_rsv = rsv; - - ret = btrfs_truncate_inode_items(trans, root, inode, - inode->i_size, - BTRFS_EXTENT_DATA_KEY); - if (ret != -EAGAIN) { - err = ret; - break; - } - - trans->block_rsv = &root->fs_info->trans_block_rsv; - ret = btrfs_update_inode(trans, root, inode); - if (ret) { - err = ret; - break; - } -end_trans: - nr = trans->blocks_used; - btrfs_end_transaction(trans, root); - trans = NULL; - btrfs_btree_balance_dirty(root, nr); - } - - if (ret == 0 && inode->i_nlink > 0) { - trans->block_rsv = root->orphan_block_rsv; - ret = btrfs_orphan_del(trans, inode); - if (ret) - err = ret; - } else if (ret && inode->i_nlink > 0) { - /* - * Failed to do the truncate, remove us from the in memory - * orphan list. - */ - ret = btrfs_orphan_del(NULL, inode); - } - - if (trans) { - trans->block_rsv = &root->fs_info->trans_block_rsv; - ret = btrfs_update_inode(trans, root, inode); - if (ret && !err) - err = ret; - - nr = trans->blocks_used; - ret = btrfs_end_transaction(trans, root); - btrfs_btree_balance_dirty(root, nr); - } - -out: - btrfs_free_block_rsv(root, rsv); - - if (ret && !err) - err = ret; - - return err; -} - -/* - * create a new subvolume directory/inode (helper for the ioctl). 
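/*
 * The ordered_data_close handling above is a best-effort guard for the common
 * "truncate the file, then rewrite it" pattern, which the comment notes can
 * still leave a zero-length file after a crash.  The robust userspace idiom
 * is to write a temporary file, fsync it, and rename() it over the original.
 * A minimal sketch under that assumption (file names are illustrative only):
 */
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

static int replace_file(const char *path, const char *data, size_t len)
{
    char tmp[4096];
    int fd;

    snprintf(tmp, sizeof(tmp), "%s.tmp", path);
    fd = open(tmp, O_WRONLY | O_CREAT | O_TRUNC, 0644);
    if (fd < 0)
        return -1;
    if (write(fd, data, len) != (ssize_t)len ||  /* new contents */
        fsync(fd) < 0 ||                         /* on disk before the rename */
        close(fd) < 0 ||
        rename(tmp, path) < 0)                   /* atomic swap of the name */
        return -1;
    return 0;
}

int main(void)
{
    return replace_file("config.txt", "hello\n", 6) ? 1 : 0;
}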
- */ -int btrfs_create_subvol_root(struct btrfs_trans_handle *trans, - struct btrfs_root *new_root, u64 new_dirid) -{ - struct inode *inode; - int err; - u64 index = 0; - - inode = btrfs_new_inode(trans, new_root, NULL, "..", 2, - new_dirid, new_dirid, - S_IFDIR | (~current_umask() & S_IRWXUGO), - &index); - if (IS_ERR(inode)) - return PTR_ERR(inode); - inode->i_op = &btrfs_dir_inode_operations; - inode->i_fop = &btrfs_dir_file_operations; - - set_nlink(inode, 1); - btrfs_i_size_write(inode, 0); - - err = btrfs_update_inode(trans, new_root, inode); - - iput(inode); - return err; -} - -struct inode *btrfs_alloc_inode(struct super_block *sb) -{ - struct btrfs_inode *ei; - struct inode *inode; - - ei = kmem_cache_alloc(btrfs_inode_cachep, GFP_NOFS); - if (!ei) - return NULL; - - ei->root = NULL; - ei->space_info = NULL; - ei->generation = 0; - ei->sequence = 0; - ei->last_trans = 0; - ei->last_sub_trans = 0; - ei->logged_trans = 0; - ei->delalloc_bytes = 0; - ei->disk_i_size = 0; - ei->flags = 0; - ei->csum_bytes = 0; - ei->index_cnt = (u64)-1; - ei->last_unlink_trans = 0; - - spin_lock_init(&ei->lock); - ei->outstanding_extents = 0; - ei->reserved_extents = 0; - - ei->ordered_data_close = 0; - ei->orphan_meta_reserved = 0; - ei->dummy_inode = 0; - ei->in_defrag = 0; - ei->delalloc_meta_reserved = 0; - ei->force_compress = BTRFS_COMPRESS_NONE; - - ei->delayed_node = NULL; - - inode = &ei->vfs_inode; - extent_map_tree_init(&ei->extent_tree); - extent_io_tree_init(&ei->io_tree, &inode->i_data); - extent_io_tree_init(&ei->io_failure_tree, &inode->i_data); - ei->io_tree.track_uptodate = 1; - ei->io_failure_tree.track_uptodate = 1; - mutex_init(&ei->log_mutex); - mutex_init(&ei->delalloc_mutex); - btrfs_ordered_inode_tree_init(&ei->ordered_tree); - INIT_LIST_HEAD(&ei->i_orphan); - INIT_LIST_HEAD(&ei->delalloc_inodes); - INIT_LIST_HEAD(&ei->ordered_operations); - RB_CLEAR_NODE(&ei->rb_node); - - return inode; -} - -static void btrfs_i_callback(struct rcu_head *head) -{ - struct inode *inode = container_of(head, struct inode, i_rcu); - kmem_cache_free(btrfs_inode_cachep, BTRFS_I(inode)); -} - -void btrfs_destroy_inode(struct inode *inode) -{ - struct btrfs_ordered_extent *ordered; - struct btrfs_root *root = BTRFS_I(inode)->root; - - WARN_ON(!list_empty(&inode->i_dentry)); - WARN_ON(inode->i_data.nrpages); - WARN_ON(BTRFS_I(inode)->outstanding_extents); - WARN_ON(BTRFS_I(inode)->reserved_extents); - WARN_ON(BTRFS_I(inode)->delalloc_bytes); - WARN_ON(BTRFS_I(inode)->csum_bytes); - - /* - * This can happen where we create an inode, but somebody else also - * created the same inode and we need to destroy the one we already - * created. - */ - if (!root) - goto free; - - /* - * Make sure we're properly removed from the ordered operation - * lists. 
- */ - smp_mb(); - if (!list_empty(&BTRFS_I(inode)->ordered_operations)) { - spin_lock(&root->fs_info->ordered_extent_lock); - list_del_init(&BTRFS_I(inode)->ordered_operations); - spin_unlock(&root->fs_info->ordered_extent_lock); - } - - spin_lock(&root->orphan_lock); - if (!list_empty(&BTRFS_I(inode)->i_orphan)) { - printk(KERN_INFO "BTRFS: inode %llu still on the orphan list\n", - (unsigned long long)btrfs_ino(inode)); - list_del_init(&BTRFS_I(inode)->i_orphan); - } - spin_unlock(&root->orphan_lock); - - while (1) { - ordered = btrfs_lookup_first_ordered_extent(inode, (u64)-1); - if (!ordered) - break; - else { - printk(KERN_ERR "btrfs found ordered " - "extent %llu %llu on inode cleanup\n", - (unsigned long long)ordered->file_offset, - (unsigned long long)ordered->len); - btrfs_remove_ordered_extent(inode, ordered); - btrfs_put_ordered_extent(ordered); - btrfs_put_ordered_extent(ordered); - } - } - inode_tree_del(inode); - btrfs_drop_extent_cache(inode, 0, (u64)-1, 0); -free: - btrfs_remove_delayed_node(inode); - call_rcu(&inode->i_rcu, btrfs_i_callback); -} - -int btrfs_drop_inode(struct inode *inode) -{ - struct btrfs_root *root = BTRFS_I(inode)->root; - - if (btrfs_root_refs(&root->root_item) == 0 && - !btrfs_is_free_space_inode(root, inode)) - return 1; - else - return generic_drop_inode(inode); -} - -static void init_once(void *foo) -{ - struct btrfs_inode *ei = (struct btrfs_inode *) foo; - - inode_init_once(&ei->vfs_inode); -} - -void btrfs_destroy_cachep(void) -{ - if (btrfs_inode_cachep) - kmem_cache_destroy(btrfs_inode_cachep); - if (btrfs_trans_handle_cachep) - kmem_cache_destroy(btrfs_trans_handle_cachep); - if (btrfs_transaction_cachep) - kmem_cache_destroy(btrfs_transaction_cachep); - if (btrfs_path_cachep) - kmem_cache_destroy(btrfs_path_cachep); - if (btrfs_free_space_cachep) - kmem_cache_destroy(btrfs_free_space_cachep); -} - -int btrfs_init_cachep(void) -{ - btrfs_inode_cachep = kmem_cache_create("btrfs_inode_cache", - sizeof(struct btrfs_inode), 0, - SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, init_once); - if (!btrfs_inode_cachep) - goto fail; - - btrfs_trans_handle_cachep = kmem_cache_create("btrfs_trans_handle_cache", - sizeof(struct btrfs_trans_handle), 0, - SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL); - if (!btrfs_trans_handle_cachep) - goto fail; - - btrfs_transaction_cachep = kmem_cache_create("btrfs_transaction_cache", - sizeof(struct btrfs_transaction), 0, - SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL); - if (!btrfs_transaction_cachep) - goto fail; - - btrfs_path_cachep = kmem_cache_create("btrfs_path_cache", - sizeof(struct btrfs_path), 0, - SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL); - if (!btrfs_path_cachep) - goto fail; - - btrfs_free_space_cachep = kmem_cache_create("btrfs_free_space_cache", - sizeof(struct btrfs_free_space), 0, - SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL); - if (!btrfs_free_space_cachep) - goto fail; - - return 0; -fail: - btrfs_destroy_cachep(); - return -ENOMEM; -} - -static int btrfs_getattr(struct vfsmount *mnt, - struct dentry *dentry, struct kstat *stat) -{ - struct inode *inode = dentry->d_inode; - u32 blocksize = inode->i_sb->s_blocksize; - - generic_fillattr(inode, stat); - stat->dev = BTRFS_I(inode)->root->anon_dev; - stat->blksize = PAGE_CACHE_SIZE; - stat->blocks = (ALIGN(inode_get_bytes(inode), blocksize) + - ALIGN(BTRFS_I(inode)->delalloc_bytes, blocksize)) >> 9; - return 0; -} - -/* - * If a file is moved, it will inherit the cow and compression flags of the new - * directory. 
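/*
 * btrfs_getattr() above reports a per-subvolume anonymous device number in
 * st_dev and folds not-yet-written delalloc bytes into st_blocks.  One
 * practical consequence: comparing st_dev of two paths is enough to tell
 * whether they live in different subvolumes.  A small sketch using only
 * plain stat(2):
 */
#include <stdio.h>
#include <sys/stat.h>

int main(int argc, char **argv)
{
    struct stat a, b;

    if (argc < 3 || stat(argv[1], &a) < 0 || stat(argv[2], &b) < 0)
        return 1;
    printf("%s: dev %llu blocks %lld\n", argv[1],
           (unsigned long long)a.st_dev, (long long)a.st_blocks);
    printf("%s: dev %llu blocks %lld\n", argv[2],
           (unsigned long long)b.st_dev, (long long)b.st_blocks);
    printf(a.st_dev == b.st_dev ? "same subvolume/filesystem\n"
                                : "different subvolumes or filesystems\n");
    return 0;
}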
- */ -static void fixup_inode_flags(struct inode *dir, struct inode *inode) -{ - struct btrfs_inode *b_dir = BTRFS_I(dir); - struct btrfs_inode *b_inode = BTRFS_I(inode); - - if (b_dir->flags & BTRFS_INODE_NODATACOW) - b_inode->flags |= BTRFS_INODE_NODATACOW; - else - b_inode->flags &= ~BTRFS_INODE_NODATACOW; - - if (b_dir->flags & BTRFS_INODE_COMPRESS) - b_inode->flags |= BTRFS_INODE_COMPRESS; - else - b_inode->flags &= ~BTRFS_INODE_COMPRESS; -} - -static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry, - struct inode *new_dir, struct dentry *new_dentry) -{ - struct btrfs_trans_handle *trans; - struct btrfs_root *root = BTRFS_I(old_dir)->root; - struct btrfs_root *dest = BTRFS_I(new_dir)->root; - struct inode *new_inode = new_dentry->d_inode; - struct inode *old_inode = old_dentry->d_inode; - struct timespec ctime = CURRENT_TIME; - u64 index = 0; - u64 root_objectid; - int ret; - u64 old_ino = btrfs_ino(old_inode); - - if (btrfs_ino(new_dir) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID) - return -EPERM; - - /* we only allow rename subvolume link between subvolumes */ - if (old_ino != BTRFS_FIRST_FREE_OBJECTID && root != dest) - return -EXDEV; - - if (old_ino == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID || - (new_inode && btrfs_ino(new_inode) == BTRFS_FIRST_FREE_OBJECTID)) - return -ENOTEMPTY; - - if (S_ISDIR(old_inode->i_mode) && new_inode && - new_inode->i_size > BTRFS_EMPTY_DIR_SIZE) - return -ENOTEMPTY; - /* - * we're using rename to replace one file with another. - * and the replacement file is large. Start IO on it now so - * we don't add too much work to the end of the transaction - */ - if (new_inode && S_ISREG(old_inode->i_mode) && new_inode->i_size && - old_inode->i_size > BTRFS_ORDERED_OPERATIONS_FLUSH_LIMIT) - filemap_flush(old_inode->i_mapping); - - /* close the racy window with snapshot create/destroy ioctl */ - if (old_ino == BTRFS_FIRST_FREE_OBJECTID) - down_read(&root->fs_info->subvol_sem); - /* - * We want to reserve the absolute worst case amount of items. So if - * both inodes are subvols and we need to unlink them then that would - * require 4 item modifications, but if they are both normal inodes it - * would require 5 item modifications, so we'll assume their normal - * inodes. So 5 * 2 is 10, plus 1 for the new link, so 11 total items - * should cover the worst case number of items we'll modify. - */ - trans = btrfs_start_transaction(root, 20); - if (IS_ERR(trans)) { - ret = PTR_ERR(trans); - goto out_notrans; - } - - if (dest != root) - btrfs_record_root_in_trans(trans, dest); - - ret = btrfs_set_inode_index(new_dir, &index); - if (ret) - goto out_fail; - - if (unlikely(old_ino == BTRFS_FIRST_FREE_OBJECTID)) { - /* force full log commit if subvolume involved. */ - root->fs_info->last_trans_log_full_commit = trans->transid; - } else { - ret = btrfs_insert_inode_ref(trans, dest, - new_dentry->d_name.name, - new_dentry->d_name.len, - old_ino, - btrfs_ino(new_dir), index); - if (ret) - goto out_fail; - /* - * this is an ugly little race, but the rename is required - * to make sure that if we crash, the inode is either at the - * old name or the new one. pinning the log transaction lets - * us make sure we don't allow a log commit to come in after - * we unlink the name but before we add the new name back in. - */ - btrfs_pin_log_trans(root); - } - /* - * make sure the inode gets flushed if it is replacing - * something. 
- */ - if (new_inode && new_inode->i_size && S_ISREG(old_inode->i_mode)) - btrfs_add_ordered_operation(trans, root, old_inode); - - old_dir->i_ctime = old_dir->i_mtime = ctime; - new_dir->i_ctime = new_dir->i_mtime = ctime; - old_inode->i_ctime = ctime; - - if (old_dentry->d_parent != new_dentry->d_parent) - btrfs_record_unlink_dir(trans, old_dir, old_inode, 1); - - if (unlikely(old_ino == BTRFS_FIRST_FREE_OBJECTID)) { - root_objectid = BTRFS_I(old_inode)->root->root_key.objectid; - ret = btrfs_unlink_subvol(trans, root, old_dir, root_objectid, - old_dentry->d_name.name, - old_dentry->d_name.len); - } else { - ret = __btrfs_unlink_inode(trans, root, old_dir, - old_dentry->d_inode, - old_dentry->d_name.name, - old_dentry->d_name.len); - if (!ret) - ret = btrfs_update_inode(trans, root, old_inode); - } - if (ret) { - btrfs_abort_transaction(trans, root, ret); - goto out_fail; - } - - if (new_inode) { - new_inode->i_ctime = CURRENT_TIME; - if (unlikely(btrfs_ino(new_inode) == - BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)) { - root_objectid = BTRFS_I(new_inode)->location.objectid; - ret = btrfs_unlink_subvol(trans, dest, new_dir, - root_objectid, - new_dentry->d_name.name, - new_dentry->d_name.len); - BUG_ON(new_inode->i_nlink == 0); - } else { - ret = btrfs_unlink_inode(trans, dest, new_dir, - new_dentry->d_inode, - new_dentry->d_name.name, - new_dentry->d_name.len); - } - if (!ret && new_inode->i_nlink == 0) { - ret = btrfs_orphan_add(trans, new_dentry->d_inode); - BUG_ON(ret); - } - if (ret) { - btrfs_abort_transaction(trans, root, ret); - goto out_fail; - } - } - - fixup_inode_flags(new_dir, old_inode); - - ret = btrfs_add_link(trans, new_dir, old_inode, - new_dentry->d_name.name, - new_dentry->d_name.len, 0, index); - if (ret) { - btrfs_abort_transaction(trans, root, ret); - goto out_fail; - } - - if (old_ino != BTRFS_FIRST_FREE_OBJECTID) { - struct dentry *parent = new_dentry->d_parent; - btrfs_log_new_name(trans, old_inode, old_dir, parent); - btrfs_end_log_trans(root); - } -out_fail: - btrfs_end_transaction(trans, root); -out_notrans: - if (old_ino == BTRFS_FIRST_FREE_OBJECTID) - up_read(&root->fs_info->subvol_sem); - - return ret; -} - -/* - * some fairly slow code that needs optimization. This walks the list - * of all the inodes with pending delalloc and forces them to disk. 
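/*
 * btrfs_rename() above refuses to move a plain inode between two different
 * subvolume roots and returns -EXDEV, just as the VFS does for a cross-mount
 * move.  Userspace therefore needs the usual mv(1) fallback: try rename(2)
 * first, copy and unlink when it fails with EXDEV.  A sketch of that fallback
 * (data only; metadata and fsync are omitted for brevity):
 */
#include <errno.h>
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

static int copy_file(const char *src, const char *dst)
{
    char buf[65536];
    ssize_t n;
    int in = open(src, O_RDONLY);
    int out = open(dst, O_WRONLY | O_CREAT | O_TRUNC, 0644);

    if (in < 0 || out < 0)
        return -1;
    while ((n = read(in, buf, sizeof(buf))) > 0)
        if (write(out, buf, n) != n)
            return -1;
    close(in);
    return (n < 0 || close(out) < 0) ? -1 : 0;
}

static int move_file(const char *src, const char *dst)
{
    if (rename(src, dst) == 0)
        return 0;
    if (errno != EXDEV)             /* real error, give up */
        return -1;
    /* Different subvolume or filesystem: copy the data, then unlink. */
    if (copy_file(src, dst) < 0)
        return -1;
    return unlink(src);
}

int main(int argc, char **argv)
{
    return (argc == 3 && move_file(argv[1], argv[2]) == 0) ? 0 : 1;
}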
- */ -int btrfs_start_delalloc_inodes(struct btrfs_root *root, int delay_iput) -{ - struct list_head *head = &root->fs_info->delalloc_inodes; - struct btrfs_inode *binode; - struct inode *inode; - - if (root->fs_info->sb->s_flags & MS_RDONLY) - return -EROFS; - - spin_lock(&root->fs_info->delalloc_lock); - while (!list_empty(head)) { - binode = list_entry(head->next, struct btrfs_inode, - delalloc_inodes); - inode = igrab(&binode->vfs_inode); - if (!inode) - list_del_init(&binode->delalloc_inodes); - spin_unlock(&root->fs_info->delalloc_lock); - if (inode) { - filemap_flush(inode->i_mapping); - if (delay_iput) - btrfs_add_delayed_iput(inode); - else - iput(inode); - } - cond_resched(); - spin_lock(&root->fs_info->delalloc_lock); - } - spin_unlock(&root->fs_info->delalloc_lock); - - /* the filemap_flush will queue IO into the worker threads, but - * we have to make sure the IO is actually started and that - * ordered extents get created before we return - */ - atomic_inc(&root->fs_info->async_submit_draining); - while (atomic_read(&root->fs_info->nr_async_submits) || - atomic_read(&root->fs_info->async_delalloc_pages)) { - wait_event(root->fs_info->async_submit_wait, - (atomic_read(&root->fs_info->nr_async_submits) == 0 && - atomic_read(&root->fs_info->async_delalloc_pages) == 0)); - } - atomic_dec(&root->fs_info->async_submit_draining); - return 0; -} - -static int btrfs_symlink(struct inode *dir, struct dentry *dentry, - const char *symname) -{ - struct btrfs_trans_handle *trans; - struct btrfs_root *root = BTRFS_I(dir)->root; - struct btrfs_path *path; - struct btrfs_key key; - struct inode *inode = NULL; - int err; - int drop_inode = 0; - u64 objectid; - u64 index = 0 ; - int name_len; - int datasize; - unsigned long ptr; - struct btrfs_file_extent_item *ei; - struct extent_buffer *leaf; - unsigned long nr = 0; - - name_len = strlen(symname) + 1; - if (name_len > BTRFS_MAX_INLINE_DATA_SIZE(root)) - return -ENAMETOOLONG; - - /* - * 2 items for inode item and ref - * 2 items for dir items - * 1 item for xattr if selinux is on - */ - trans = btrfs_start_transaction(root, 5); - if (IS_ERR(trans)) - return PTR_ERR(trans); - - err = btrfs_find_free_ino(root, &objectid); - if (err) - goto out_unlock; - - inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name, - dentry->d_name.len, btrfs_ino(dir), objectid, - S_IFLNK|S_IRWXUGO, &index); - if (IS_ERR(inode)) { - err = PTR_ERR(inode); - goto out_unlock; - } - - err = btrfs_init_inode_security(trans, inode, dir, &dentry->d_name); - if (err) { - drop_inode = 1; - goto out_unlock; - } - - /* - * If the active LSM wants to access the inode during - * d_instantiate it needs these. Smack checks to see - * if the filesystem supports xattrs by looking at the - * ops vector. 
- */ - inode->i_fop = &btrfs_file_operations; - inode->i_op = &btrfs_file_inode_operations; - - err = btrfs_add_nondir(trans, dir, dentry, inode, 0, index); - if (err) - drop_inode = 1; - else { - inode->i_mapping->a_ops = &btrfs_aops; - inode->i_mapping->backing_dev_info = &root->fs_info->bdi; - BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops; - } - if (drop_inode) - goto out_unlock; - - path = btrfs_alloc_path(); - if (!path) { - err = -ENOMEM; - drop_inode = 1; - goto out_unlock; - } - key.objectid = btrfs_ino(inode); - key.offset = 0; - btrfs_set_key_type(&key, BTRFS_EXTENT_DATA_KEY); - datasize = btrfs_file_extent_calc_inline_size(name_len); - err = btrfs_insert_empty_item(trans, root, path, &key, - datasize); - if (err) { - drop_inode = 1; - btrfs_free_path(path); - goto out_unlock; - } - leaf = path->nodes[0]; - ei = btrfs_item_ptr(leaf, path->slots[0], - struct btrfs_file_extent_item); - btrfs_set_file_extent_generation(leaf, ei, trans->transid); - btrfs_set_file_extent_type(leaf, ei, - BTRFS_FILE_EXTENT_INLINE); - btrfs_set_file_extent_encryption(leaf, ei, 0); - btrfs_set_file_extent_compression(leaf, ei, 0); - btrfs_set_file_extent_other_encoding(leaf, ei, 0); - btrfs_set_file_extent_ram_bytes(leaf, ei, name_len); - - ptr = btrfs_file_extent_inline_start(ei); - write_extent_buffer(leaf, symname, ptr, name_len); - btrfs_mark_buffer_dirty(leaf); - btrfs_free_path(path); - - inode->i_op = &btrfs_symlink_inode_operations; - inode->i_mapping->a_ops = &btrfs_symlink_aops; - inode->i_mapping->backing_dev_info = &root->fs_info->bdi; - inode_set_bytes(inode, name_len); - btrfs_i_size_write(inode, name_len - 1); - err = btrfs_update_inode(trans, root, inode); - if (err) - drop_inode = 1; - -out_unlock: - if (!err) - d_instantiate(dentry, inode); - nr = trans->blocks_used; - btrfs_end_transaction(trans, root); - if (drop_inode) { - inode_dec_link_count(inode); - iput(inode); - } - btrfs_btree_balance_dirty(root, nr); - return err; -} - -static int __btrfs_prealloc_file_range(struct inode *inode, int mode, - u64 start, u64 num_bytes, u64 min_size, - loff_t actual_len, u64 *alloc_hint, - struct btrfs_trans_handle *trans) -{ - struct btrfs_root *root = BTRFS_I(inode)->root; - struct btrfs_key ins; - u64 cur_offset = start; - u64 i_size; - int ret = 0; - bool own_trans = true; - - if (trans) - own_trans = false; - while (num_bytes > 0) { - if (own_trans) { - trans = btrfs_start_transaction(root, 3); - if (IS_ERR(trans)) { - ret = PTR_ERR(trans); - break; - } - } - - ret = btrfs_reserve_extent(trans, root, num_bytes, min_size, - 0, *alloc_hint, &ins, 1); - if (ret) { - if (own_trans) - btrfs_end_transaction(trans, root); - break; - } - - ret = insert_reserved_file_extent(trans, inode, - cur_offset, ins.objectid, - ins.offset, ins.offset, - ins.offset, 0, 0, 0, - BTRFS_FILE_EXTENT_PREALLOC); - if (ret) { - btrfs_abort_transaction(trans, root, ret); - if (own_trans) - btrfs_end_transaction(trans, root); - break; - } - btrfs_drop_extent_cache(inode, cur_offset, - cur_offset + ins.offset -1, 0); - - num_bytes -= ins.offset; - cur_offset += ins.offset; - *alloc_hint = ins.objectid + ins.offset; - - inode->i_ctime = CURRENT_TIME; - BTRFS_I(inode)->flags |= BTRFS_INODE_PREALLOC; - if (!(mode & FALLOC_FL_KEEP_SIZE) && - (actual_len > inode->i_size) && - (cur_offset > inode->i_size)) { - if (cur_offset > actual_len) - i_size = actual_len; - else - i_size = cur_offset; - i_size_write(inode, i_size); - btrfs_ordered_update_i_size(inode, i_size, NULL); - } - - ret = btrfs_update_inode(trans, root, 
inode); - - if (ret) { - btrfs_abort_transaction(trans, root, ret); - if (own_trans) - btrfs_end_transaction(trans, root); - break; - } - - if (own_trans) - btrfs_end_transaction(trans, root); - } - return ret; -} - -int btrfs_prealloc_file_range(struct inode *inode, int mode, - u64 start, u64 num_bytes, u64 min_size, - loff_t actual_len, u64 *alloc_hint) -{ - return __btrfs_prealloc_file_range(inode, mode, start, num_bytes, - min_size, actual_len, alloc_hint, - NULL); -} - -int btrfs_prealloc_file_range_trans(struct inode *inode, - struct btrfs_trans_handle *trans, int mode, - u64 start, u64 num_bytes, u64 min_size, - loff_t actual_len, u64 *alloc_hint) -{ - return __btrfs_prealloc_file_range(inode, mode, start, num_bytes, - min_size, actual_len, alloc_hint, trans); -} - -static int btrfs_set_page_dirty(struct page *page) -{ - return __set_page_dirty_nobuffers(page); -} - -static int btrfs_permission(struct inode *inode, int mask) -{ - struct btrfs_root *root = BTRFS_I(inode)->root; - umode_t mode = inode->i_mode; - - if (mask & MAY_WRITE && - (S_ISREG(mode) || S_ISDIR(mode) || S_ISLNK(mode))) { - if (btrfs_root_readonly(root)) - return -EROFS; - if (BTRFS_I(inode)->flags & BTRFS_INODE_READONLY) - return -EACCES; - } - return generic_permission(inode, mask); -} - -static const struct inode_operations btrfs_dir_inode_operations = { - .getattr = btrfs_getattr, - .lookup = btrfs_lookup, - .create = btrfs_create, - .unlink = btrfs_unlink, - .link = btrfs_link, - .mkdir = btrfs_mkdir, - .rmdir = btrfs_rmdir, - .rename = btrfs_rename, - .symlink = btrfs_symlink, - .setattr = btrfs_setattr, - .mknod = btrfs_mknod, - .setxattr = btrfs_setxattr, - .getxattr = btrfs_getxattr, - .listxattr = btrfs_listxattr, - .removexattr = btrfs_removexattr, - .permission = btrfs_permission, - .get_acl = btrfs_get_acl, -}; -static const struct inode_operations btrfs_dir_ro_inode_operations = { - .lookup = btrfs_lookup, - .permission = btrfs_permission, - .get_acl = btrfs_get_acl, -}; - -static const struct file_operations btrfs_dir_file_operations = { - .llseek = generic_file_llseek, - .read = generic_read_dir, - .readdir = btrfs_real_readdir, - .unlocked_ioctl = btrfs_ioctl, -#ifdef CONFIG_COMPAT - .compat_ioctl = btrfs_ioctl, -#endif - .release = btrfs_release_file, - .fsync = btrfs_sync_file, -}; - -static struct extent_io_ops btrfs_extent_io_ops = { - .fill_delalloc = run_delalloc_range, - .submit_bio_hook = btrfs_submit_bio_hook, - .merge_bio_hook = btrfs_merge_bio_hook, - .readpage_end_io_hook = btrfs_readpage_end_io_hook, - .writepage_end_io_hook = btrfs_writepage_end_io_hook, - .writepage_start_hook = btrfs_writepage_start_hook, - .set_bit_hook = btrfs_set_bit_hook, - .clear_bit_hook = btrfs_clear_bit_hook, - .merge_extent_hook = btrfs_merge_extent_hook, - .split_extent_hook = btrfs_split_extent_hook, -}; - -/* - * btrfs doesn't support the bmap operation because swapfiles - * use bmap to make a mapping of extents in the file. They assume - * these extents won't change over the life of the file and they - * use the bmap result to do IO directly to the drive. - * - * the btrfs bmap call would return logical addresses that aren't - * suitable for IO and they also will change frequently as COW - * operations happen. So, swapfile + btrfs == corruption. - * - * For now we're avoiding this by dropping bmap. 
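/*
 * btrfs_prealloc_file_range() above is the engine behind fallocate(2) on
 * btrfs: it reserves PREALLOC extents so later writes into the range cannot
 * fail with ENOSPC, and only grows i_size when FALLOC_FL_KEEP_SIZE is not
 * set.  A minimal userspace sketch (the file name is illustrative):
 */
#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
    int fd = open("prealloc.bin", O_RDWR | O_CREAT, 0644);

    if (fd < 0)
        return 1;
    /* Reserve 16 MiB of space without changing the visible file size. */
    if (fallocate(fd, FALLOC_FL_KEEP_SIZE, 0, 16 << 20) < 0) {
        perror("fallocate");
        return 1;
    }
    /* The same call without KEEP_SIZE would also extend i_size to 16 MiB. */
    close(fd);
    return 0;
}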
- */ -static const struct address_space_operations btrfs_aops = { - .readpage = btrfs_readpage, - .writepage = btrfs_writepage, - .writepages = btrfs_writepages, - .readpages = btrfs_readpages, - .direct_IO = btrfs_direct_IO, - .invalidatepage = btrfs_invalidatepage, - .releasepage = btrfs_releasepage, - .set_page_dirty = btrfs_set_page_dirty, - .error_remove_page = generic_error_remove_page, -}; - -static const struct address_space_operations btrfs_symlink_aops = { - .readpage = btrfs_readpage, - .writepage = btrfs_writepage, - .invalidatepage = btrfs_invalidatepage, - .releasepage = btrfs_releasepage, -}; - -static const struct inode_operations btrfs_file_inode_operations = { - .getattr = btrfs_getattr, - .setattr = btrfs_setattr, - .setxattr = btrfs_setxattr, - .getxattr = btrfs_getxattr, - .listxattr = btrfs_listxattr, - .removexattr = btrfs_removexattr, - .permission = btrfs_permission, - .fiemap = btrfs_fiemap, - .get_acl = btrfs_get_acl, -}; -static const struct inode_operations btrfs_special_inode_operations = { - .getattr = btrfs_getattr, - .setattr = btrfs_setattr, - .permission = btrfs_permission, - .setxattr = btrfs_setxattr, - .getxattr = btrfs_getxattr, - .listxattr = btrfs_listxattr, - .removexattr = btrfs_removexattr, - .get_acl = btrfs_get_acl, -}; -static const struct inode_operations btrfs_symlink_inode_operations = { - .readlink = generic_readlink, - .follow_link = page_follow_link_light, - .put_link = page_put_link, - .getattr = btrfs_getattr, - .setattr = btrfs_setattr, - .permission = btrfs_permission, - .setxattr = btrfs_setxattr, - .getxattr = btrfs_getxattr, - .listxattr = btrfs_listxattr, - .removexattr = btrfs_removexattr, - .get_acl = btrfs_get_acl, -}; - -const struct dentry_operations btrfs_dentry_operations = { - .d_delete = btrfs_dentry_delete, - .d_release = btrfs_dentry_release, -}; diff --git a/ANDROID_3.4.5/fs/btrfs/ioctl.c b/ANDROID_3.4.5/fs/btrfs/ioctl.c deleted file mode 100644 index 14f8e1fa..00000000 --- a/ANDROID_3.4.5/fs/btrfs/ioctl.c +++ /dev/null @@ -1,3430 +0,0 @@ -/* - * Copyright (C) 2007 Oracle. All rights reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public - * License v2 as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public - * License along with this program; if not, write to the - * Free Software Foundation, Inc., 59 Temple Place - Suite 330, - * Boston, MA 021110-1307, USA. 
- */ - -#include <linux/kernel.h> -#include <linux/bio.h> -#include <linux/buffer_head.h> -#include <linux/file.h> -#include <linux/fs.h> -#include <linux/fsnotify.h> -#include <linux/pagemap.h> -#include <linux/highmem.h> -#include <linux/time.h> -#include <linux/init.h> -#include <linux/string.h> -#include <linux/backing-dev.h> -#include <linux/mount.h> -#include <linux/mpage.h> -#include <linux/namei.h> -#include <linux/swap.h> -#include <linux/writeback.h> -#include <linux/statfs.h> -#include <linux/compat.h> -#include <linux/bit_spinlock.h> -#include <linux/security.h> -#include <linux/xattr.h> -#include <linux/vmalloc.h> -#include <linux/slab.h> -#include <linux/blkdev.h> -#include "compat.h" -#include "ctree.h" -#include "disk-io.h" -#include "transaction.h" -#include "btrfs_inode.h" -#include "ioctl.h" -#include "print-tree.h" -#include "volumes.h" -#include "locking.h" -#include "inode-map.h" -#include "backref.h" - -/* Mask out flags that are inappropriate for the given type of inode. */ -static inline __u32 btrfs_mask_flags(umode_t mode, __u32 flags) -{ - if (S_ISDIR(mode)) - return flags; - else if (S_ISREG(mode)) - return flags & ~FS_DIRSYNC_FL; - else - return flags & (FS_NODUMP_FL | FS_NOATIME_FL); -} - -/* - * Export inode flags to the format expected by the FS_IOC_GETFLAGS ioctl. - */ -static unsigned int btrfs_flags_to_ioctl(unsigned int flags) -{ - unsigned int iflags = 0; - - if (flags & BTRFS_INODE_SYNC) - iflags |= FS_SYNC_FL; - if (flags & BTRFS_INODE_IMMUTABLE) - iflags |= FS_IMMUTABLE_FL; - if (flags & BTRFS_INODE_APPEND) - iflags |= FS_APPEND_FL; - if (flags & BTRFS_INODE_NODUMP) - iflags |= FS_NODUMP_FL; - if (flags & BTRFS_INODE_NOATIME) - iflags |= FS_NOATIME_FL; - if (flags & BTRFS_INODE_DIRSYNC) - iflags |= FS_DIRSYNC_FL; - if (flags & BTRFS_INODE_NODATACOW) - iflags |= FS_NOCOW_FL; - - if ((flags & BTRFS_INODE_COMPRESS) && !(flags & BTRFS_INODE_NOCOMPRESS)) - iflags |= FS_COMPR_FL; - else if (flags & BTRFS_INODE_NOCOMPRESS) - iflags |= FS_NOCOMP_FL; - - return iflags; -} - -/* - * Update inode->i_flags based on the btrfs internal flags. - */ -void btrfs_update_iflags(struct inode *inode) -{ - struct btrfs_inode *ip = BTRFS_I(inode); - - inode->i_flags &= ~(S_SYNC|S_APPEND|S_IMMUTABLE|S_NOATIME|S_DIRSYNC); - - if (ip->flags & BTRFS_INODE_SYNC) - inode->i_flags |= S_SYNC; - if (ip->flags & BTRFS_INODE_IMMUTABLE) - inode->i_flags |= S_IMMUTABLE; - if (ip->flags & BTRFS_INODE_APPEND) - inode->i_flags |= S_APPEND; - if (ip->flags & BTRFS_INODE_NOATIME) - inode->i_flags |= S_NOATIME; - if (ip->flags & BTRFS_INODE_DIRSYNC) - inode->i_flags |= S_DIRSYNC; -} - -/* - * Inherit flags from the parent inode. - * - * Currently only the compression flags and the cow flags are inherited. 
- */ -void btrfs_inherit_iflags(struct inode *inode, struct inode *dir) -{ - unsigned int flags; - - if (!dir) - return; - - flags = BTRFS_I(dir)->flags; - - if (flags & BTRFS_INODE_NOCOMPRESS) { - BTRFS_I(inode)->flags &= ~BTRFS_INODE_COMPRESS; - BTRFS_I(inode)->flags |= BTRFS_INODE_NOCOMPRESS; - } else if (flags & BTRFS_INODE_COMPRESS) { - BTRFS_I(inode)->flags &= ~BTRFS_INODE_NOCOMPRESS; - BTRFS_I(inode)->flags |= BTRFS_INODE_COMPRESS; - } - - if (flags & BTRFS_INODE_NODATACOW) - BTRFS_I(inode)->flags |= BTRFS_INODE_NODATACOW; - - btrfs_update_iflags(inode); -} - -static int btrfs_ioctl_getflags(struct file *file, void __user *arg) -{ - struct btrfs_inode *ip = BTRFS_I(file->f_path.dentry->d_inode); - unsigned int flags = btrfs_flags_to_ioctl(ip->flags); - - if (copy_to_user(arg, &flags, sizeof(flags))) - return -EFAULT; - return 0; -} - -static int check_flags(unsigned int flags) -{ - if (flags & ~(FS_IMMUTABLE_FL | FS_APPEND_FL | \ - FS_NOATIME_FL | FS_NODUMP_FL | \ - FS_SYNC_FL | FS_DIRSYNC_FL | \ - FS_NOCOMP_FL | FS_COMPR_FL | - FS_NOCOW_FL)) - return -EOPNOTSUPP; - - if ((flags & FS_NOCOMP_FL) && (flags & FS_COMPR_FL)) - return -EINVAL; - - return 0; -} - -static int btrfs_ioctl_setflags(struct file *file, void __user *arg) -{ - struct inode *inode = file->f_path.dentry->d_inode; - struct btrfs_inode *ip = BTRFS_I(inode); - struct btrfs_root *root = ip->root; - struct btrfs_trans_handle *trans; - unsigned int flags, oldflags; - int ret; - u64 ip_oldflags; - unsigned int i_oldflags; - - if (btrfs_root_readonly(root)) - return -EROFS; - - if (copy_from_user(&flags, arg, sizeof(flags))) - return -EFAULT; - - ret = check_flags(flags); - if (ret) - return ret; - - if (!inode_owner_or_capable(inode)) - return -EACCES; - - mutex_lock(&inode->i_mutex); - - ip_oldflags = ip->flags; - i_oldflags = inode->i_flags; - - flags = btrfs_mask_flags(inode->i_mode, flags); - oldflags = btrfs_flags_to_ioctl(ip->flags); - if ((flags ^ oldflags) & (FS_APPEND_FL | FS_IMMUTABLE_FL)) { - if (!capable(CAP_LINUX_IMMUTABLE)) { - ret = -EPERM; - goto out_unlock; - } - } - - ret = mnt_want_write_file(file); - if (ret) - goto out_unlock; - - if (flags & FS_SYNC_FL) - ip->flags |= BTRFS_INODE_SYNC; - else - ip->flags &= ~BTRFS_INODE_SYNC; - if (flags & FS_IMMUTABLE_FL) - ip->flags |= BTRFS_INODE_IMMUTABLE; - else - ip->flags &= ~BTRFS_INODE_IMMUTABLE; - if (flags & FS_APPEND_FL) - ip->flags |= BTRFS_INODE_APPEND; - else - ip->flags &= ~BTRFS_INODE_APPEND; - if (flags & FS_NODUMP_FL) - ip->flags |= BTRFS_INODE_NODUMP; - else - ip->flags &= ~BTRFS_INODE_NODUMP; - if (flags & FS_NOATIME_FL) - ip->flags |= BTRFS_INODE_NOATIME; - else - ip->flags &= ~BTRFS_INODE_NOATIME; - if (flags & FS_DIRSYNC_FL) - ip->flags |= BTRFS_INODE_DIRSYNC; - else - ip->flags &= ~BTRFS_INODE_DIRSYNC; - if (flags & FS_NOCOW_FL) - ip->flags |= BTRFS_INODE_NODATACOW; - else - ip->flags &= ~BTRFS_INODE_NODATACOW; - - /* - * The COMPRESS flag can only be changed by users, while the NOCOMPRESS - * flag may be changed automatically if compression code won't make - * things smaller. 
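/*
 * btrfs_ioctl_getflags()/setflags() above implement the generic
 * FS_IOC_GETFLAGS / FS_IOC_SETFLAGS interface (what lsattr/chattr use), with
 * FS_NOCOW_FL and FS_COMPR_FL mapped onto the btrfs inode flags.  A sketch
 * that reads the flags and sets the NOCOW bit, which is only effective on a
 * file that has no data yet:
 */
#include <stdio.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <linux/fs.h>

int main(int argc, char **argv)
{
    int flags;
    int fd = argc > 1 ? open(argv[1], O_RDONLY) : -1;

    if (fd < 0)
        return 1;
    if (ioctl(fd, FS_IOC_GETFLAGS, &flags) < 0)
        return 1;
    printf("flags: 0x%x%s%s\n", (unsigned)flags,
           (flags & FS_NOCOW_FL) ? " [nocow]" : "",
           (flags & FS_COMPR_FL) ? " [compress]" : "");

    /* Equivalent of "chattr +C". */
    flags |= FS_NOCOW_FL;
    if (ioctl(fd, FS_IOC_SETFLAGS, &flags) < 0)
        perror("FS_IOC_SETFLAGS");
    close(fd);
    return 0;
}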
- */ - if (flags & FS_NOCOMP_FL) { - ip->flags &= ~BTRFS_INODE_COMPRESS; - ip->flags |= BTRFS_INODE_NOCOMPRESS; - } else if (flags & FS_COMPR_FL) { - ip->flags |= BTRFS_INODE_COMPRESS; - ip->flags &= ~BTRFS_INODE_NOCOMPRESS; - } else { - ip->flags &= ~(BTRFS_INODE_COMPRESS | BTRFS_INODE_NOCOMPRESS); - } - - trans = btrfs_start_transaction(root, 1); - if (IS_ERR(trans)) { - ret = PTR_ERR(trans); - goto out_drop; - } - - btrfs_update_iflags(inode); - inode->i_ctime = CURRENT_TIME; - ret = btrfs_update_inode(trans, root, inode); - - btrfs_end_transaction(trans, root); - out_drop: - if (ret) { - ip->flags = ip_oldflags; - inode->i_flags = i_oldflags; - } - - mnt_drop_write_file(file); - out_unlock: - mutex_unlock(&inode->i_mutex); - return ret; -} - -static int btrfs_ioctl_getversion(struct file *file, int __user *arg) -{ - struct inode *inode = file->f_path.dentry->d_inode; - - return put_user(inode->i_generation, arg); -} - -static noinline int btrfs_ioctl_fitrim(struct file *file, void __user *arg) -{ - struct btrfs_fs_info *fs_info = btrfs_sb(fdentry(file)->d_sb); - struct btrfs_device *device; - struct request_queue *q; - struct fstrim_range range; - u64 minlen = ULLONG_MAX; - u64 num_devices = 0; - u64 total_bytes = btrfs_super_total_bytes(fs_info->super_copy); - int ret; - - if (!capable(CAP_SYS_ADMIN)) - return -EPERM; - - rcu_read_lock(); - list_for_each_entry_rcu(device, &fs_info->fs_devices->devices, - dev_list) { - if (!device->bdev) - continue; - q = bdev_get_queue(device->bdev); - if (blk_queue_discard(q)) { - num_devices++; - minlen = min((u64)q->limits.discard_granularity, - minlen); - } - } - rcu_read_unlock(); - - if (!num_devices) - return -EOPNOTSUPP; - if (copy_from_user(&range, arg, sizeof(range))) - return -EFAULT; - if (range.start > total_bytes) - return -EINVAL; - - range.len = min(range.len, total_bytes - range.start); - range.minlen = max(range.minlen, minlen); - ret = btrfs_trim_fs(fs_info->tree_root, &range); - if (ret < 0) - return ret; - - if (copy_to_user(arg, &range, sizeof(range))) - return -EFAULT; - - return 0; -} - -static noinline int create_subvol(struct btrfs_root *root, - struct dentry *dentry, - char *name, int namelen, - u64 *async_transid) -{ - struct btrfs_trans_handle *trans; - struct btrfs_key key; - struct btrfs_root_item root_item; - struct btrfs_inode_item *inode_item; - struct extent_buffer *leaf; - struct btrfs_root *new_root; - struct dentry *parent = dentry->d_parent; - struct inode *dir; - int ret; - int err; - u64 objectid; - u64 new_dirid = BTRFS_FIRST_FREE_OBJECTID; - u64 index = 0; - - ret = btrfs_find_free_objectid(root->fs_info->tree_root, &objectid); - if (ret) - return ret; - - dir = parent->d_inode; - - /* - * 1 - inode item - * 2 - refs - * 1 - root item - * 2 - dir items - */ - trans = btrfs_start_transaction(root, 6); - if (IS_ERR(trans)) - return PTR_ERR(trans); - - leaf = btrfs_alloc_free_block(trans, root, root->leafsize, - 0, objectid, NULL, 0, 0, 0, 0); - if (IS_ERR(leaf)) { - ret = PTR_ERR(leaf); - goto fail; - } - - memset_extent_buffer(leaf, 0, 0, sizeof(struct btrfs_header)); - btrfs_set_header_bytenr(leaf, leaf->start); - btrfs_set_header_generation(leaf, trans->transid); - btrfs_set_header_backref_rev(leaf, BTRFS_MIXED_BACKREF_REV); - btrfs_set_header_owner(leaf, objectid); - - write_extent_buffer(leaf, root->fs_info->fsid, - (unsigned long)btrfs_header_fsid(leaf), - BTRFS_FSID_SIZE); - write_extent_buffer(leaf, root->fs_info->chunk_tree_uuid, - (unsigned long)btrfs_header_chunk_tree_uuid(leaf), - BTRFS_UUID_SIZE); 
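/*
 * btrfs_ioctl_fitrim() above backs the generic FITRIM ioctl (what fstrim(8)
 * issues): it clamps the requested range to the filesystem size, raises
 * minlen to the largest discard granularity of the member devices, and
 * returns the trimmed byte count in range.len.  A minimal sketch, issued on
 * any open file or directory of the mounted filesystem (needs CAP_SYS_ADMIN;
 * the default path is illustrative):
 */
#include <stdio.h>
#include <fcntl.h>
#include <limits.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <linux/fs.h>

int main(int argc, char **argv)
{
    struct fstrim_range range = {
        .start  = 0,
        .len    = ULLONG_MAX,   /* whole filesystem */
        .minlen = 0,
    };
    int fd = open(argc > 1 ? argv[1] : "/", O_RDONLY);

    if (fd < 0)
        return 1;
    if (ioctl(fd, FITRIM, &range) < 0) {
        perror("FITRIM");
        return 1;
    }
    printf("trimmed %llu bytes\n", (unsigned long long)range.len);
    close(fd);
    return 0;
}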
- btrfs_mark_buffer_dirty(leaf); - - inode_item = &root_item.inode; - memset(inode_item, 0, sizeof(*inode_item)); - inode_item->generation = cpu_to_le64(1); - inode_item->size = cpu_to_le64(3); - inode_item->nlink = cpu_to_le32(1); - inode_item->nbytes = cpu_to_le64(root->leafsize); - inode_item->mode = cpu_to_le32(S_IFDIR | 0755); - - root_item.flags = 0; - root_item.byte_limit = 0; - inode_item->flags = cpu_to_le64(BTRFS_INODE_ROOT_ITEM_INIT); - - btrfs_set_root_bytenr(&root_item, leaf->start); - btrfs_set_root_generation(&root_item, trans->transid); - btrfs_set_root_level(&root_item, 0); - btrfs_set_root_refs(&root_item, 1); - btrfs_set_root_used(&root_item, leaf->len); - btrfs_set_root_last_snapshot(&root_item, 0); - - memset(&root_item.drop_progress, 0, sizeof(root_item.drop_progress)); - root_item.drop_level = 0; - - btrfs_tree_unlock(leaf); - free_extent_buffer(leaf); - leaf = NULL; - - btrfs_set_root_dirid(&root_item, new_dirid); - - key.objectid = objectid; - key.offset = 0; - btrfs_set_key_type(&key, BTRFS_ROOT_ITEM_KEY); - ret = btrfs_insert_root(trans, root->fs_info->tree_root, &key, - &root_item); - if (ret) - goto fail; - - key.offset = (u64)-1; - new_root = btrfs_read_fs_root_no_name(root->fs_info, &key); - if (IS_ERR(new_root)) { - btrfs_abort_transaction(trans, root, PTR_ERR(new_root)); - ret = PTR_ERR(new_root); - goto fail; - } - - btrfs_record_root_in_trans(trans, new_root); - - ret = btrfs_create_subvol_root(trans, new_root, new_dirid); - if (ret) { - /* We potentially lose an unused inode item here */ - btrfs_abort_transaction(trans, root, ret); - goto fail; - } - - /* - * insert the directory item - */ - ret = btrfs_set_inode_index(dir, &index); - if (ret) { - btrfs_abort_transaction(trans, root, ret); - goto fail; - } - - ret = btrfs_insert_dir_item(trans, root, - name, namelen, dir, &key, - BTRFS_FT_DIR, index); - if (ret) { - btrfs_abort_transaction(trans, root, ret); - goto fail; - } - - btrfs_i_size_write(dir, dir->i_size + namelen * 2); - ret = btrfs_update_inode(trans, root, dir); - BUG_ON(ret); - - ret = btrfs_add_root_ref(trans, root->fs_info->tree_root, - objectid, root->root_key.objectid, - btrfs_ino(dir), index, name, namelen); - - BUG_ON(ret); - - d_instantiate(dentry, btrfs_lookup_dentry(dir, dentry)); -fail: - if (async_transid) { - *async_transid = trans->transid; - err = btrfs_commit_transaction_async(trans, root, 1); - } else { - err = btrfs_commit_transaction(trans, root); - } - if (err && !ret) - ret = err; - return ret; -} - -static int create_snapshot(struct btrfs_root *root, struct dentry *dentry, - char *name, int namelen, u64 *async_transid, - bool readonly) -{ - struct inode *inode; - struct btrfs_pending_snapshot *pending_snapshot; - struct btrfs_trans_handle *trans; - int ret; - - if (!root->ref_cows) - return -EINVAL; - - pending_snapshot = kzalloc(sizeof(*pending_snapshot), GFP_NOFS); - if (!pending_snapshot) - return -ENOMEM; - - btrfs_init_block_rsv(&pending_snapshot->block_rsv); - pending_snapshot->dentry = dentry; - pending_snapshot->root = root; - pending_snapshot->readonly = readonly; - - trans = btrfs_start_transaction(root->fs_info->extent_root, 5); - if (IS_ERR(trans)) { - ret = PTR_ERR(trans); - goto fail; - } - - ret = btrfs_snap_reserve_metadata(trans, pending_snapshot); - BUG_ON(ret); - - spin_lock(&root->fs_info->trans_lock); - list_add(&pending_snapshot->list, - &trans->transaction->pending_snapshots); - spin_unlock(&root->fs_info->trans_lock); - if (async_transid) { - *async_transid = trans->transid; - ret = 
btrfs_commit_transaction_async(trans, - root->fs_info->extent_root, 1); - } else { - ret = btrfs_commit_transaction(trans, - root->fs_info->extent_root); - } - BUG_ON(ret); - - ret = pending_snapshot->error; - if (ret) - goto fail; - - ret = btrfs_orphan_cleanup(pending_snapshot->snap); - if (ret) - goto fail; - - inode = btrfs_lookup_dentry(dentry->d_parent->d_inode, dentry); - if (IS_ERR(inode)) { - ret = PTR_ERR(inode); - goto fail; - } - BUG_ON(!inode); - d_instantiate(dentry, inode); - ret = 0; -fail: - kfree(pending_snapshot); - return ret; -} - -/* copy of check_sticky in fs/namei.c() -* It's inline, so penalty for filesystems that don't use sticky bit is -* minimal. -*/ -static inline int btrfs_check_sticky(struct inode *dir, struct inode *inode) -{ - uid_t fsuid = current_fsuid(); - - if (!(dir->i_mode & S_ISVTX)) - return 0; - if (inode->i_uid == fsuid) - return 0; - if (dir->i_uid == fsuid) - return 0; - return !capable(CAP_FOWNER); -} - -/* copy of may_delete in fs/namei.c() - * Check whether we can remove a link victim from directory dir, check - * whether the type of victim is right. - * 1. We can't do it if dir is read-only (done in permission()) - * 2. We should have write and exec permissions on dir - * 3. We can't remove anything from append-only dir - * 4. We can't do anything with immutable dir (done in permission()) - * 5. If the sticky bit on dir is set we should either - * a. be owner of dir, or - * b. be owner of victim, or - * c. have CAP_FOWNER capability - * 6. If the victim is append-only or immutable we can't do antyhing with - * links pointing to it. - * 7. If we were asked to remove a directory and victim isn't one - ENOTDIR. - * 8. If we were asked to remove a non-directory and victim isn't one - EISDIR. - * 9. We can't remove a root or mountpoint. - * 10. We don't allow removal of NFS sillyrenamed files; it's handled by - * nfs_async_unlink(). - */ - -static int btrfs_may_delete(struct inode *dir,struct dentry *victim,int isdir) -{ - int error; - - if (!victim->d_inode) - return -ENOENT; - - BUG_ON(victim->d_parent->d_inode != dir); - audit_inode_child(victim, dir); - - error = inode_permission(dir, MAY_WRITE | MAY_EXEC); - if (error) - return error; - if (IS_APPEND(dir)) - return -EPERM; - if (btrfs_check_sticky(dir, victim->d_inode)|| - IS_APPEND(victim->d_inode)|| - IS_IMMUTABLE(victim->d_inode) || IS_SWAPFILE(victim->d_inode)) - return -EPERM; - if (isdir) { - if (!S_ISDIR(victim->d_inode->i_mode)) - return -ENOTDIR; - if (IS_ROOT(victim)) - return -EBUSY; - } else if (S_ISDIR(victim->d_inode->i_mode)) - return -EISDIR; - if (IS_DEADDIR(dir)) - return -ENOENT; - if (victim->d_flags & DCACHE_NFSFS_RENAMED) - return -EBUSY; - return 0; -} - -/* copy of may_create in fs/namei.c() */ -static inline int btrfs_may_create(struct inode *dir, struct dentry *child) -{ - if (child->d_inode) - return -EEXIST; - if (IS_DEADDIR(dir)) - return -ENOENT; - return inode_permission(dir, MAY_WRITE | MAY_EXEC); -} - -/* - * Create a new subvolume below @parent. This is largely modeled after - * sys_mkdirat and vfs_mkdir, but we only do a single component lookup - * inside this filesystem so it's quite a bit simpler. 
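/*
 * create_subvol()/create_snapshot() above are reached from userspace through
 * BTRFS_IOC_SUBVOL_CREATE and BTRFS_IOC_SNAP_CREATE, issued on a file
 * descriptor for the parent directory.  A sketch of the caller side; it
 * assumes a toolchain that ships the <linux/btrfs.h> UAPI header (on trees of
 * this vintage the same definitions live in fs/btrfs/ioctl.h), and the mount
 * point path is illustrative:
 */
#include <stdio.h>
#include <string.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <linux/btrfs.h>

int main(void)
{
    struct btrfs_ioctl_vol_args args;
    int src;
    int dir = open("/mnt/data", O_RDONLY);  /* parent directory */

    if (dir < 0)
        return 1;

    /* Equivalent of "btrfs subvolume create /mnt/data/vol1". */
    memset(&args, 0, sizeof(args));
    strncpy(args.name, "vol1", sizeof(args.name) - 1);
    if (ioctl(dir, BTRFS_IOC_SUBVOL_CREATE, &args) < 0)
        perror("BTRFS_IOC_SUBVOL_CREATE");

    /* Equivalent of "btrfs subvolume snapshot /mnt/data/vol1 /mnt/data/snap1". */
    src = open("/mnt/data/vol1", O_RDONLY);
    if (src >= 0) {
        memset(&args, 0, sizeof(args));
        args.fd = src;                       /* source subvolume */
        strncpy(args.name, "snap1", sizeof(args.name) - 1);
        if (ioctl(dir, BTRFS_IOC_SNAP_CREATE, &args) < 0)
            perror("BTRFS_IOC_SNAP_CREATE");
        close(src);
    }
    close(dir);
    return 0;
}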
- */ -static noinline int btrfs_mksubvol(struct path *parent, - char *name, int namelen, - struct btrfs_root *snap_src, - u64 *async_transid, bool readonly) -{ - struct inode *dir = parent->dentry->d_inode; - struct dentry *dentry; - int error; - - mutex_lock_nested(&dir->i_mutex, I_MUTEX_PARENT); - - dentry = lookup_one_len(name, parent->dentry, namelen); - error = PTR_ERR(dentry); - if (IS_ERR(dentry)) - goto out_unlock; - - error = -EEXIST; - if (dentry->d_inode) - goto out_dput; - - error = mnt_want_write(parent->mnt); - if (error) - goto out_dput; - - error = btrfs_may_create(dir, dentry); - if (error) - goto out_drop_write; - - down_read(&BTRFS_I(dir)->root->fs_info->subvol_sem); - - if (btrfs_root_refs(&BTRFS_I(dir)->root->root_item) == 0) - goto out_up_read; - - if (snap_src) { - error = create_snapshot(snap_src, dentry, - name, namelen, async_transid, readonly); - } else { - error = create_subvol(BTRFS_I(dir)->root, dentry, - name, namelen, async_transid); - } - if (!error) - fsnotify_mkdir(dir, dentry); -out_up_read: - up_read(&BTRFS_I(dir)->root->fs_info->subvol_sem); -out_drop_write: - mnt_drop_write(parent->mnt); -out_dput: - dput(dentry); -out_unlock: - mutex_unlock(&dir->i_mutex); - return error; -} - -/* - * When we're defragging a range, we don't want to kick it off again - * if it is really just waiting for delalloc to send it down. - * If we find a nice big extent or delalloc range for the bytes in the - * file you want to defrag, we return 0 to let you know to skip this - * part of the file - */ -static int check_defrag_in_cache(struct inode *inode, u64 offset, int thresh) -{ - struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; - struct extent_map *em = NULL; - struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree; - u64 end; - - read_lock(&em_tree->lock); - em = lookup_extent_mapping(em_tree, offset, PAGE_CACHE_SIZE); - read_unlock(&em_tree->lock); - - if (em) { - end = extent_map_end(em); - free_extent_map(em); - if (end - offset > thresh) - return 0; - } - /* if we already have a nice delalloc here, just stop */ - thresh /= 2; - end = count_range_bits(io_tree, &offset, offset + thresh, - thresh, EXTENT_DELALLOC, 1); - if (end >= thresh) - return 0; - return 1; -} - -/* - * helper function to walk through a file and find extents - * newer than a specific transid, and smaller than thresh. 
- * - * This is used by the defragging code to find new and small - * extents - */ -static int find_new_extents(struct btrfs_root *root, - struct inode *inode, u64 newer_than, - u64 *off, int thresh) -{ - struct btrfs_path *path; - struct btrfs_key min_key; - struct btrfs_key max_key; - struct extent_buffer *leaf; - struct btrfs_file_extent_item *extent; - int type; - int ret; - u64 ino = btrfs_ino(inode); - - path = btrfs_alloc_path(); - if (!path) - return -ENOMEM; - - min_key.objectid = ino; - min_key.type = BTRFS_EXTENT_DATA_KEY; - min_key.offset = *off; - - max_key.objectid = ino; - max_key.type = (u8)-1; - max_key.offset = (u64)-1; - - path->keep_locks = 1; - - while(1) { - ret = btrfs_search_forward(root, &min_key, &max_key, - path, 0, newer_than); - if (ret != 0) - goto none; - if (min_key.objectid != ino) - goto none; - if (min_key.type != BTRFS_EXTENT_DATA_KEY) - goto none; - - leaf = path->nodes[0]; - extent = btrfs_item_ptr(leaf, path->slots[0], - struct btrfs_file_extent_item); - - type = btrfs_file_extent_type(leaf, extent); - if (type == BTRFS_FILE_EXTENT_REG && - btrfs_file_extent_num_bytes(leaf, extent) < thresh && - check_defrag_in_cache(inode, min_key.offset, thresh)) { - *off = min_key.offset; - btrfs_free_path(path); - return 0; - } - - if (min_key.offset == (u64)-1) - goto none; - - min_key.offset++; - btrfs_release_path(path); - } -none: - btrfs_free_path(path); - return -ENOENT; -} - -/* - * Validaty check of prev em and next em: - * 1) no prev/next em - * 2) prev/next em is an hole/inline extent - */ -static int check_adjacent_extents(struct inode *inode, struct extent_map *em) -{ - struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree; - struct extent_map *prev = NULL, *next = NULL; - int ret = 0; - - read_lock(&em_tree->lock); - prev = lookup_extent_mapping(em_tree, em->start - 1, (u64)-1); - next = lookup_extent_mapping(em_tree, em->start + em->len, (u64)-1); - read_unlock(&em_tree->lock); - - if ((!prev || prev->block_start >= EXTENT_MAP_LAST_BYTE) && - (!next || next->block_start >= EXTENT_MAP_LAST_BYTE)) - ret = 1; - free_extent_map(prev); - free_extent_map(next); - - return ret; -} - -static int should_defrag_range(struct inode *inode, u64 start, u64 len, - int thresh, u64 *last_len, u64 *skip, - u64 *defrag_end) -{ - struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; - struct extent_map *em = NULL; - struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree; - int ret = 1; - - /* - * make sure that once we start defragging an extent, we keep on - * defragging it - */ - if (start < *defrag_end) - return 1; - - *skip = 0; - - /* - * hopefully we have this extent in the tree already, try without - * the full extent lock - */ - read_lock(&em_tree->lock); - em = lookup_extent_mapping(em_tree, start, len); - read_unlock(&em_tree->lock); - - if (!em) { - /* get the big lock and read metadata off disk */ - lock_extent(io_tree, start, start + len - 1); - em = btrfs_get_extent(inode, NULL, 0, start, len, 0); - unlock_extent(io_tree, start, start + len - 1); - - if (IS_ERR(em)) - return 0; - } - - /* this will cover holes, and inline extents */ - if (em->block_start >= EXTENT_MAP_LAST_BYTE) { - ret = 0; - goto out; - } - - /* If we have nothing to merge with us, just skip. 
*/ - if (check_adjacent_extents(inode, em)) { - ret = 0; - goto out; - } - - /* - * we hit a real extent, if it is big don't bother defragging it again - */ - if ((*last_len == 0 || *last_len >= thresh) && em->len >= thresh) - ret = 0; - -out: - /* - * last_len ends up being a counter of how many bytes we've defragged. - * every time we choose not to defrag an extent, we reset *last_len - * so that the next tiny extent will force a defrag. - * - * The end result of this is that tiny extents before a single big - * extent will force at least part of that big extent to be defragged. - */ - if (ret) { - *defrag_end = extent_map_end(em); - } else { - *last_len = 0; - *skip = extent_map_end(em); - *defrag_end = 0; - } - - free_extent_map(em); - return ret; -} - -/* - * it doesn't do much good to defrag one or two pages - * at a time. This pulls in a nice chunk of pages - * to COW and defrag. - * - * It also makes sure the delalloc code has enough - * dirty data to avoid making new small extents as part - * of the defrag - * - * It's a good idea to start RA on this range - * before calling this. - */ -static int cluster_pages_for_defrag(struct inode *inode, - struct page **pages, - unsigned long start_index, - int num_pages) -{ - unsigned long file_end; - u64 isize = i_size_read(inode); - u64 page_start; - u64 page_end; - u64 page_cnt; - int ret; - int i; - int i_done; - struct btrfs_ordered_extent *ordered; - struct extent_state *cached_state = NULL; - struct extent_io_tree *tree; - gfp_t mask = btrfs_alloc_write_mask(inode->i_mapping); - - file_end = (isize - 1) >> PAGE_CACHE_SHIFT; - if (!isize || start_index > file_end) - return 0; - - page_cnt = min_t(u64, (u64)num_pages, (u64)file_end - start_index + 1); - - ret = btrfs_delalloc_reserve_space(inode, - page_cnt << PAGE_CACHE_SHIFT); - if (ret) - return ret; - i_done = 0; - tree = &BTRFS_I(inode)->io_tree; - - /* step one, lock all the pages */ - for (i = 0; i < page_cnt; i++) { - struct page *page; -again: - page = find_or_create_page(inode->i_mapping, - start_index + i, mask); - if (!page) - break; - - page_start = page_offset(page); - page_end = page_start + PAGE_CACHE_SIZE - 1; - while (1) { - lock_extent(tree, page_start, page_end); - ordered = btrfs_lookup_ordered_extent(inode, - page_start); - unlock_extent(tree, page_start, page_end); - if (!ordered) - break; - - unlock_page(page); - btrfs_start_ordered_extent(inode, ordered, 1); - btrfs_put_ordered_extent(ordered); - lock_page(page); - /* - * we unlocked the page above, so we need check if - * it was released or not. 
- */ - if (page->mapping != inode->i_mapping) { - unlock_page(page); - page_cache_release(page); - goto again; - } - } - - if (!PageUptodate(page)) { - btrfs_readpage(NULL, page); - lock_page(page); - if (!PageUptodate(page)) { - unlock_page(page); - page_cache_release(page); - ret = -EIO; - break; - } - } - - if (page->mapping != inode->i_mapping) { - unlock_page(page); - page_cache_release(page); - goto again; - } - - pages[i] = page; - i_done++; - } - if (!i_done || ret) - goto out; - - if (!(inode->i_sb->s_flags & MS_ACTIVE)) - goto out; - - /* - * so now we have a nice long stream of locked - * and up to date pages, lets wait on them - */ - for (i = 0; i < i_done; i++) - wait_on_page_writeback(pages[i]); - - page_start = page_offset(pages[0]); - page_end = page_offset(pages[i_done - 1]) + PAGE_CACHE_SIZE; - - lock_extent_bits(&BTRFS_I(inode)->io_tree, - page_start, page_end - 1, 0, &cached_state); - clear_extent_bit(&BTRFS_I(inode)->io_tree, page_start, - page_end - 1, EXTENT_DIRTY | EXTENT_DELALLOC | - EXTENT_DO_ACCOUNTING, 0, 0, &cached_state, - GFP_NOFS); - - if (i_done != page_cnt) { - spin_lock(&BTRFS_I(inode)->lock); - BTRFS_I(inode)->outstanding_extents++; - spin_unlock(&BTRFS_I(inode)->lock); - btrfs_delalloc_release_space(inode, - (page_cnt - i_done) << PAGE_CACHE_SHIFT); - } - - - btrfs_set_extent_delalloc(inode, page_start, page_end - 1, - &cached_state); - - unlock_extent_cached(&BTRFS_I(inode)->io_tree, - page_start, page_end - 1, &cached_state, - GFP_NOFS); - - for (i = 0; i < i_done; i++) { - clear_page_dirty_for_io(pages[i]); - ClearPageChecked(pages[i]); - set_page_extent_mapped(pages[i]); - set_page_dirty(pages[i]); - unlock_page(pages[i]); - page_cache_release(pages[i]); - } - return i_done; -out: - for (i = 0; i < i_done; i++) { - unlock_page(pages[i]); - page_cache_release(pages[i]); - } - btrfs_delalloc_release_space(inode, page_cnt << PAGE_CACHE_SHIFT); - return ret; - -} - -int btrfs_defrag_file(struct inode *inode, struct file *file, - struct btrfs_ioctl_defrag_range_args *range, - u64 newer_than, unsigned long max_to_defrag) -{ - struct btrfs_root *root = BTRFS_I(inode)->root; - struct btrfs_super_block *disk_super; - struct file_ra_state *ra = NULL; - unsigned long last_index; - u64 isize = i_size_read(inode); - u64 features; - u64 last_len = 0; - u64 skip = 0; - u64 defrag_end = 0; - u64 newer_off = range->start; - unsigned long i; - unsigned long ra_index = 0; - int ret; - int defrag_count = 0; - int compress_type = BTRFS_COMPRESS_ZLIB; - int extent_thresh = range->extent_thresh; - int max_cluster = (256 * 1024) >> PAGE_CACHE_SHIFT; - int cluster = max_cluster; - u64 new_align = ~((u64)128 * 1024 - 1); - struct page **pages = NULL; - - if (extent_thresh == 0) - extent_thresh = 256 * 1024; - - if (range->flags & BTRFS_DEFRAG_RANGE_COMPRESS) { - if (range->compress_type > BTRFS_COMPRESS_TYPES) - return -EINVAL; - if (range->compress_type) - compress_type = range->compress_type; - } - - if (isize == 0) - return 0; - - /* - * if we were not given a file, allocate a readahead - * context - */ - if (!file) { - ra = kzalloc(sizeof(*ra), GFP_NOFS); - if (!ra) - return -ENOMEM; - file_ra_state_init(ra, inode->i_mapping); - } else { - ra = &file->f_ra; - } - - pages = kmalloc(sizeof(struct page *) * max_cluster, - GFP_NOFS); - if (!pages) { - ret = -ENOMEM; - goto out_ra; - } - - /* find the last page to defrag */ - if (range->start + range->len > range->start) { - last_index = min_t(u64, isize - 1, - range->start + range->len - 1) >> PAGE_CACHE_SHIFT; - } else { - 
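		/*
		 * Editor's note (not part of the deleted source): the check just
		 * above guards against u64 overflow.  When the caller leaves
		 * range->len at its default of (u64)-1 ("defrag to EOF", as set
		 * in btrfs_ioctl_defrag() further below), range->start +
		 * range->len wraps around, so this branch falls back to using
		 * i_size to find the last page instead.
		 */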
last_index = (isize - 1) >> PAGE_CACHE_SHIFT; - } - - if (newer_than) { - ret = find_new_extents(root, inode, newer_than, - &newer_off, 64 * 1024); - if (!ret) { - range->start = newer_off; - /* - * we always align our defrag to help keep - * the extents in the file evenly spaced - */ - i = (newer_off & new_align) >> PAGE_CACHE_SHIFT; - } else - goto out_ra; - } else { - i = range->start >> PAGE_CACHE_SHIFT; - } - if (!max_to_defrag) - max_to_defrag = last_index + 1; - - /* - * make writeback starts from i, so the defrag range can be - * written sequentially. - */ - if (i < inode->i_mapping->writeback_index) - inode->i_mapping->writeback_index = i; - - while (i <= last_index && defrag_count < max_to_defrag && - (i < (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >> - PAGE_CACHE_SHIFT)) { - /* - * make sure we stop running if someone unmounts - * the FS - */ - if (!(inode->i_sb->s_flags & MS_ACTIVE)) - break; - - if (!should_defrag_range(inode, (u64)i << PAGE_CACHE_SHIFT, - PAGE_CACHE_SIZE, extent_thresh, - &last_len, &skip, &defrag_end)) { - unsigned long next; - /* - * the should_defrag function tells us how much to skip - * bump our counter by the suggested amount - */ - next = (skip + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; - i = max(i + 1, next); - continue; - } - - if (!newer_than) { - cluster = (PAGE_CACHE_ALIGN(defrag_end) >> - PAGE_CACHE_SHIFT) - i; - cluster = min(cluster, max_cluster); - } else { - cluster = max_cluster; - } - - if (range->flags & BTRFS_DEFRAG_RANGE_COMPRESS) - BTRFS_I(inode)->force_compress = compress_type; - - if (i + cluster > ra_index) { - ra_index = max(i, ra_index); - btrfs_force_ra(inode->i_mapping, ra, file, ra_index, - cluster); - ra_index += max_cluster; - } - - mutex_lock(&inode->i_mutex); - ret = cluster_pages_for_defrag(inode, pages, i, cluster); - if (ret < 0) { - mutex_unlock(&inode->i_mutex); - goto out_ra; - } - - defrag_count += ret; - balance_dirty_pages_ratelimited_nr(inode->i_mapping, ret); - mutex_unlock(&inode->i_mutex); - - if (newer_than) { - if (newer_off == (u64)-1) - break; - - if (ret > 0) - i += ret; - - newer_off = max(newer_off + 1, - (u64)i << PAGE_CACHE_SHIFT); - - ret = find_new_extents(root, inode, - newer_than, &newer_off, - 64 * 1024); - if (!ret) { - range->start = newer_off; - i = (newer_off & new_align) >> PAGE_CACHE_SHIFT; - } else { - break; - } - } else { - if (ret > 0) { - i += ret; - last_len += ret << PAGE_CACHE_SHIFT; - } else { - i++; - last_len = 0; - } - } - } - - if ((range->flags & BTRFS_DEFRAG_RANGE_START_IO)) - filemap_flush(inode->i_mapping); - - if ((range->flags & BTRFS_DEFRAG_RANGE_COMPRESS)) { - /* the filemap_flush will queue IO into the worker threads, but - * we have to make sure the IO is actually started and that - * ordered extents get created before we return - */ - atomic_inc(&root->fs_info->async_submit_draining); - while (atomic_read(&root->fs_info->nr_async_submits) || - atomic_read(&root->fs_info->async_delalloc_pages)) { - wait_event(root->fs_info->async_submit_wait, - (atomic_read(&root->fs_info->nr_async_submits) == 0 && - atomic_read(&root->fs_info->async_delalloc_pages) == 0)); - } - atomic_dec(&root->fs_info->async_submit_draining); - - mutex_lock(&inode->i_mutex); - BTRFS_I(inode)->force_compress = BTRFS_COMPRESS_NONE; - mutex_unlock(&inode->i_mutex); - } - - disk_super = root->fs_info->super_copy; - features = btrfs_super_incompat_flags(disk_super); - if (range->compress_type == BTRFS_COMPRESS_LZO) { - features |= BTRFS_FEATURE_INCOMPAT_COMPRESS_LZO; - 
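/*
 * Editor's sketch (not part of the deleted source): how userspace drives the
 * defrag path above through BTRFS_IOC_DEFRAG_RANGE.  Assumes the ioctl and
 * struct btrfs_ioctl_defrag_range_args are available, e.g. from
 * <linux/btrfs.h> on kernels that export them; the helper name is made up
 * for illustration.  Field values mirror what btrfs_ioctl_defrag() accepts.
 */
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <sys/ioctl.h>
#include <unistd.h>
#include <linux/btrfs.h>

static int defrag_with_compression(const char *path)
{
	struct btrfs_ioctl_defrag_range_args args;
	int ret, fd = open(path, O_RDWR);

	if (fd < 0)
		return -1;

	memset(&args, 0, sizeof(args));
	args.start = 0;
	args.len = (__u64)-1;			/* whole file */
	args.extent_thresh = 256 * 1024;	/* same default the kernel applies */
	args.flags = BTRFS_DEFRAG_RANGE_COMPRESS | BTRFS_DEFRAG_RANGE_START_IO;
	args.compress_type = 2;			/* 2 == LZO in this kernel's enum (assumption) */

	ret = ioctl(fd, BTRFS_IOC_DEFRAG_RANGE, &args);
	if (ret < 0)
		perror("BTRFS_IOC_DEFRAG_RANGE");

	close(fd);
	return ret;
}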
btrfs_set_super_incompat_flags(disk_super, features); - } - - ret = defrag_count; - -out_ra: - if (!file) - kfree(ra); - kfree(pages); - return ret; -} - -static noinline int btrfs_ioctl_resize(struct btrfs_root *root, - void __user *arg) -{ - u64 new_size; - u64 old_size; - u64 devid = 1; - struct btrfs_ioctl_vol_args *vol_args; - struct btrfs_trans_handle *trans; - struct btrfs_device *device = NULL; - char *sizestr; - char *devstr = NULL; - int ret = 0; - int mod = 0; - - if (root->fs_info->sb->s_flags & MS_RDONLY) - return -EROFS; - - if (!capable(CAP_SYS_ADMIN)) - return -EPERM; - - mutex_lock(&root->fs_info->volume_mutex); - if (root->fs_info->balance_ctl) { - printk(KERN_INFO "btrfs: balance in progress\n"); - ret = -EINVAL; - goto out; - } - - vol_args = memdup_user(arg, sizeof(*vol_args)); - if (IS_ERR(vol_args)) { - ret = PTR_ERR(vol_args); - goto out; - } - - vol_args->name[BTRFS_PATH_NAME_MAX] = '\0'; - - sizestr = vol_args->name; - devstr = strchr(sizestr, ':'); - if (devstr) { - char *end; - sizestr = devstr + 1; - *devstr = '\0'; - devstr = vol_args->name; - devid = simple_strtoull(devstr, &end, 10); - printk(KERN_INFO "btrfs: resizing devid %llu\n", - (unsigned long long)devid); - } - device = btrfs_find_device(root, devid, NULL, NULL); - if (!device) { - printk(KERN_INFO "btrfs: resizer unable to find device %llu\n", - (unsigned long long)devid); - ret = -EINVAL; - goto out_free; - } - if (!strcmp(sizestr, "max")) - new_size = device->bdev->bd_inode->i_size; - else { - if (sizestr[0] == '-') { - mod = -1; - sizestr++; - } else if (sizestr[0] == '+') { - mod = 1; - sizestr++; - } - new_size = memparse(sizestr, NULL); - if (new_size == 0) { - ret = -EINVAL; - goto out_free; - } - } - - old_size = device->total_bytes; - - if (mod < 0) { - if (new_size > old_size) { - ret = -EINVAL; - goto out_free; - } - new_size = old_size - new_size; - } else if (mod > 0) { - new_size = old_size + new_size; - } - - if (new_size < 256 * 1024 * 1024) { - ret = -EINVAL; - goto out_free; - } - if (new_size > device->bdev->bd_inode->i_size) { - ret = -EFBIG; - goto out_free; - } - - do_div(new_size, root->sectorsize); - new_size *= root->sectorsize; - - printk(KERN_INFO "btrfs: new size for %s is %llu\n", - device->name, (unsigned long long)new_size); - - if (new_size > old_size) { - trans = btrfs_start_transaction(root, 0); - if (IS_ERR(trans)) { - ret = PTR_ERR(trans); - goto out_free; - } - ret = btrfs_grow_device(trans, device, new_size); - btrfs_commit_transaction(trans, root); - } else if (new_size < old_size) { - ret = btrfs_shrink_device(device, new_size); - } - -out_free: - kfree(vol_args); -out: - mutex_unlock(&root->fs_info->volume_mutex); - return ret; -} - -static noinline int btrfs_ioctl_snap_create_transid(struct file *file, - char *name, - unsigned long fd, - int subvol, - u64 *transid, - bool readonly) -{ - struct btrfs_root *root = BTRFS_I(fdentry(file)->d_inode)->root; - struct file *src_file; - int namelen; - int ret = 0; - - if (root->fs_info->sb->s_flags & MS_RDONLY) - return -EROFS; - - namelen = strlen(name); - if (strchr(name, '/')) { - ret = -EINVAL; - goto out; - } - - if (name[0] == '.' && - (namelen == 1 || (name[1] == '.' 
&& namelen == 2))) { - ret = -EEXIST; - goto out; - } - - if (subvol) { - ret = btrfs_mksubvol(&file->f_path, name, namelen, - NULL, transid, readonly); - } else { - struct inode *src_inode; - src_file = fget(fd); - if (!src_file) { - ret = -EINVAL; - goto out; - } - - src_inode = src_file->f_path.dentry->d_inode; - if (src_inode->i_sb != file->f_path.dentry->d_inode->i_sb) { - printk(KERN_INFO "btrfs: Snapshot src from " - "another FS\n"); - ret = -EINVAL; - fput(src_file); - goto out; - } - ret = btrfs_mksubvol(&file->f_path, name, namelen, - BTRFS_I(src_inode)->root, - transid, readonly); - fput(src_file); - } -out: - return ret; -} - -static noinline int btrfs_ioctl_snap_create(struct file *file, - void __user *arg, int subvol) -{ - struct btrfs_ioctl_vol_args *vol_args; - int ret; - - vol_args = memdup_user(arg, sizeof(*vol_args)); - if (IS_ERR(vol_args)) - return PTR_ERR(vol_args); - vol_args->name[BTRFS_PATH_NAME_MAX] = '\0'; - - ret = btrfs_ioctl_snap_create_transid(file, vol_args->name, - vol_args->fd, subvol, - NULL, false); - - kfree(vol_args); - return ret; -} - -static noinline int btrfs_ioctl_snap_create_v2(struct file *file, - void __user *arg, int subvol) -{ - struct btrfs_ioctl_vol_args_v2 *vol_args; - int ret; - u64 transid = 0; - u64 *ptr = NULL; - bool readonly = false; - - vol_args = memdup_user(arg, sizeof(*vol_args)); - if (IS_ERR(vol_args)) - return PTR_ERR(vol_args); - vol_args->name[BTRFS_SUBVOL_NAME_MAX] = '\0'; - - if (vol_args->flags & - ~(BTRFS_SUBVOL_CREATE_ASYNC | BTRFS_SUBVOL_RDONLY)) { - ret = -EOPNOTSUPP; - goto out; - } - - if (vol_args->flags & BTRFS_SUBVOL_CREATE_ASYNC) - ptr = &transid; - if (vol_args->flags & BTRFS_SUBVOL_RDONLY) - readonly = true; - - ret = btrfs_ioctl_snap_create_transid(file, vol_args->name, - vol_args->fd, subvol, - ptr, readonly); - - if (ret == 0 && ptr && - copy_to_user(arg + - offsetof(struct btrfs_ioctl_vol_args_v2, - transid), ptr, sizeof(*ptr))) - ret = -EFAULT; -out: - kfree(vol_args); - return ret; -} - -static noinline int btrfs_ioctl_subvol_getflags(struct file *file, - void __user *arg) -{ - struct inode *inode = fdentry(file)->d_inode; - struct btrfs_root *root = BTRFS_I(inode)->root; - int ret = 0; - u64 flags = 0; - - if (btrfs_ino(inode) != BTRFS_FIRST_FREE_OBJECTID) - return -EINVAL; - - down_read(&root->fs_info->subvol_sem); - if (btrfs_root_readonly(root)) - flags |= BTRFS_SUBVOL_RDONLY; - up_read(&root->fs_info->subvol_sem); - - if (copy_to_user(arg, &flags, sizeof(flags))) - ret = -EFAULT; - - return ret; -} - -static noinline int btrfs_ioctl_subvol_setflags(struct file *file, - void __user *arg) -{ - struct inode *inode = fdentry(file)->d_inode; - struct btrfs_root *root = BTRFS_I(inode)->root; - struct btrfs_trans_handle *trans; - u64 root_flags; - u64 flags; - int ret = 0; - - if (root->fs_info->sb->s_flags & MS_RDONLY) - return -EROFS; - - if (btrfs_ino(inode) != BTRFS_FIRST_FREE_OBJECTID) - return -EINVAL; - - if (copy_from_user(&flags, arg, sizeof(flags))) - return -EFAULT; - - if (flags & BTRFS_SUBVOL_CREATE_ASYNC) - return -EINVAL; - - if (flags & ~BTRFS_SUBVOL_RDONLY) - return -EOPNOTSUPP; - - if (!inode_owner_or_capable(inode)) - return -EACCES; - - down_write(&root->fs_info->subvol_sem); - - /* nothing to do */ - if (!!(flags & BTRFS_SUBVOL_RDONLY) == btrfs_root_readonly(root)) - goto out; - - root_flags = btrfs_root_flags(&root->root_item); - if (flags & BTRFS_SUBVOL_RDONLY) - btrfs_set_root_flags(&root->root_item, - root_flags | BTRFS_ROOT_SUBVOL_RDONLY); - else - 
btrfs_set_root_flags(&root->root_item, - root_flags & ~BTRFS_ROOT_SUBVOL_RDONLY); - - trans = btrfs_start_transaction(root, 1); - if (IS_ERR(trans)) { - ret = PTR_ERR(trans); - goto out_reset; - } - - ret = btrfs_update_root(trans, root->fs_info->tree_root, - &root->root_key, &root->root_item); - - btrfs_commit_transaction(trans, root); -out_reset: - if (ret) - btrfs_set_root_flags(&root->root_item, root_flags); -out: - up_write(&root->fs_info->subvol_sem); - return ret; -} - -/* - * helper to check if the subvolume references other subvolumes - */ -static noinline int may_destroy_subvol(struct btrfs_root *root) -{ - struct btrfs_path *path; - struct btrfs_key key; - int ret; - - path = btrfs_alloc_path(); - if (!path) - return -ENOMEM; - - key.objectid = root->root_key.objectid; - key.type = BTRFS_ROOT_REF_KEY; - key.offset = (u64)-1; - - ret = btrfs_search_slot(NULL, root->fs_info->tree_root, - &key, path, 0, 0); - if (ret < 0) - goto out; - BUG_ON(ret == 0); - - ret = 0; - if (path->slots[0] > 0) { - path->slots[0]--; - btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); - if (key.objectid == root->root_key.objectid && - key.type == BTRFS_ROOT_REF_KEY) - ret = -ENOTEMPTY; - } -out: - btrfs_free_path(path); - return ret; -} - -static noinline int key_in_sk(struct btrfs_key *key, - struct btrfs_ioctl_search_key *sk) -{ - struct btrfs_key test; - int ret; - - test.objectid = sk->min_objectid; - test.type = sk->min_type; - test.offset = sk->min_offset; - - ret = btrfs_comp_cpu_keys(key, &test); - if (ret < 0) - return 0; - - test.objectid = sk->max_objectid; - test.type = sk->max_type; - test.offset = sk->max_offset; - - ret = btrfs_comp_cpu_keys(key, &test); - if (ret > 0) - return 0; - return 1; -} - -static noinline int copy_to_sk(struct btrfs_root *root, - struct btrfs_path *path, - struct btrfs_key *key, - struct btrfs_ioctl_search_key *sk, - char *buf, - unsigned long *sk_offset, - int *num_found) -{ - u64 found_transid; - struct extent_buffer *leaf; - struct btrfs_ioctl_search_header sh; - unsigned long item_off; - unsigned long item_len; - int nritems; - int i; - int slot; - int ret = 0; - - leaf = path->nodes[0]; - slot = path->slots[0]; - nritems = btrfs_header_nritems(leaf); - - if (btrfs_header_generation(leaf) > sk->max_transid) { - i = nritems; - goto advance_key; - } - found_transid = btrfs_header_generation(leaf); - - for (i = slot; i < nritems; i++) { - item_off = btrfs_item_ptr_offset(leaf, i); - item_len = btrfs_item_size_nr(leaf, i); - - if (item_len > BTRFS_SEARCH_ARGS_BUFSIZE) - item_len = 0; - - if (sizeof(sh) + item_len + *sk_offset > - BTRFS_SEARCH_ARGS_BUFSIZE) { - ret = 1; - goto overflow; - } - - btrfs_item_key_to_cpu(leaf, key, i); - if (!key_in_sk(key, sk)) - continue; - - sh.objectid = key->objectid; - sh.offset = key->offset; - sh.type = key->type; - sh.len = item_len; - sh.transid = found_transid; - - /* copy search result header */ - memcpy(buf + *sk_offset, &sh, sizeof(sh)); - *sk_offset += sizeof(sh); - - if (item_len) { - char *p = buf + *sk_offset; - /* copy the item */ - read_extent_buffer(leaf, p, - item_off, item_len); - *sk_offset += item_len; - } - (*num_found)++; - - if (*num_found >= sk->nr_items) - break; - } -advance_key: - ret = 0; - if (key->offset < (u64)-1 && key->offset < sk->max_offset) - key->offset++; - else if (key->type < (u8)-1 && key->type < sk->max_type) { - key->offset = 0; - key->type++; - } else if (key->objectid < (u64)-1 && key->objectid < sk->max_objectid) { - key->offset = 0; - key->type = 0; - key->objectid++; - } 
else - ret = 1; -overflow: - return ret; -} - -static noinline int search_ioctl(struct inode *inode, - struct btrfs_ioctl_search_args *args) -{ - struct btrfs_root *root; - struct btrfs_key key; - struct btrfs_key max_key; - struct btrfs_path *path; - struct btrfs_ioctl_search_key *sk = &args->key; - struct btrfs_fs_info *info = BTRFS_I(inode)->root->fs_info; - int ret; - int num_found = 0; - unsigned long sk_offset = 0; - - path = btrfs_alloc_path(); - if (!path) - return -ENOMEM; - - if (sk->tree_id == 0) { - /* search the root of the inode that was passed */ - root = BTRFS_I(inode)->root; - } else { - key.objectid = sk->tree_id; - key.type = BTRFS_ROOT_ITEM_KEY; - key.offset = (u64)-1; - root = btrfs_read_fs_root_no_name(info, &key); - if (IS_ERR(root)) { - printk(KERN_ERR "could not find root %llu\n", - sk->tree_id); - btrfs_free_path(path); - return -ENOENT; - } - } - - key.objectid = sk->min_objectid; - key.type = sk->min_type; - key.offset = sk->min_offset; - - max_key.objectid = sk->max_objectid; - max_key.type = sk->max_type; - max_key.offset = sk->max_offset; - - path->keep_locks = 1; - - while(1) { - ret = btrfs_search_forward(root, &key, &max_key, path, 0, - sk->min_transid); - if (ret != 0) { - if (ret > 0) - ret = 0; - goto err; - } - ret = copy_to_sk(root, path, &key, sk, args->buf, - &sk_offset, &num_found); - btrfs_release_path(path); - if (ret || num_found >= sk->nr_items) - break; - - } - ret = 0; -err: - sk->nr_items = num_found; - btrfs_free_path(path); - return ret; -} - -static noinline int btrfs_ioctl_tree_search(struct file *file, - void __user *argp) -{ - struct btrfs_ioctl_search_args *args; - struct inode *inode; - int ret; - - if (!capable(CAP_SYS_ADMIN)) - return -EPERM; - - args = memdup_user(argp, sizeof(*args)); - if (IS_ERR(args)) - return PTR_ERR(args); - - inode = fdentry(file)->d_inode; - ret = search_ioctl(inode, args); - if (ret == 0 && copy_to_user(argp, args, sizeof(*args))) - ret = -EFAULT; - kfree(args); - return ret; -} - -/* - * Search INODE_REFs to identify path name of 'dirid' directory - * in a 'tree_id' tree. and sets path name to 'name'. 
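/*
 * Editor's sketch (not part of the deleted source): minimal userspace use of
 * BTRFS_IOC_TREE_SEARCH as served by search_ioctl()/copy_to_sk() above.
 * tree_id == 0 means "search the tree of the fd's own subvolume"; the result
 * buffer is a sequence of btrfs_ioctl_search_header records, each followed
 * by sh.len bytes of raw item data.  Needs CAP_SYS_ADMIN, as the handler
 * enforces.  Assumes <linux/btrfs.h> provides the definitions; the helper
 * name is illustrative only.
 */
#include <stdio.h>
#include <string.h>
#include <sys/ioctl.h>
#include <linux/btrfs.h>

static int dump_keys(int fd)
{
	struct btrfs_ioctl_search_args args;
	struct btrfs_ioctl_search_header *sh;
	unsigned long off;
	unsigned int i;

	memset(&args, 0, sizeof(args));
	args.key.tree_id = 0;			/* tree of the inode passed in */
	args.key.max_objectid = (__u64)-1;
	args.key.max_offset = (__u64)-1;
	args.key.max_transid = (__u64)-1;
	args.key.max_type = 255;
	args.key.nr_items = 64;			/* at most 64 items per call */

	if (ioctl(fd, BTRFS_IOC_TREE_SEARCH, &args) < 0)
		return -1;

	/* on return, nr_items holds the number of records actually copied */
	for (i = 0, off = 0; i < args.key.nr_items; i++) {
		sh = (struct btrfs_ioctl_search_header *)(args.buf + off);
		printf("objectid %llu type %u offset %llu len %u\n",
		       (unsigned long long)sh->objectid, sh->type,
		       (unsigned long long)sh->offset, sh->len);
		off += sizeof(*sh) + sh->len;
	}
	return 0;
}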
- */ -static noinline int btrfs_search_path_in_tree(struct btrfs_fs_info *info, - u64 tree_id, u64 dirid, char *name) -{ - struct btrfs_root *root; - struct btrfs_key key; - char *ptr; - int ret = -1; - int slot; - int len; - int total_len = 0; - struct btrfs_inode_ref *iref; - struct extent_buffer *l; - struct btrfs_path *path; - - if (dirid == BTRFS_FIRST_FREE_OBJECTID) { - name[0]='\0'; - return 0; - } - - path = btrfs_alloc_path(); - if (!path) - return -ENOMEM; - - ptr = &name[BTRFS_INO_LOOKUP_PATH_MAX]; - - key.objectid = tree_id; - key.type = BTRFS_ROOT_ITEM_KEY; - key.offset = (u64)-1; - root = btrfs_read_fs_root_no_name(info, &key); - if (IS_ERR(root)) { - printk(KERN_ERR "could not find root %llu\n", tree_id); - ret = -ENOENT; - goto out; - } - - key.objectid = dirid; - key.type = BTRFS_INODE_REF_KEY; - key.offset = (u64)-1; - - while(1) { - ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); - if (ret < 0) - goto out; - - l = path->nodes[0]; - slot = path->slots[0]; - if (ret > 0 && slot > 0) - slot--; - btrfs_item_key_to_cpu(l, &key, slot); - - if (ret > 0 && (key.objectid != dirid || - key.type != BTRFS_INODE_REF_KEY)) { - ret = -ENOENT; - goto out; - } - - iref = btrfs_item_ptr(l, slot, struct btrfs_inode_ref); - len = btrfs_inode_ref_name_len(l, iref); - ptr -= len + 1; - total_len += len + 1; - if (ptr < name) - goto out; - - *(ptr + len) = '/'; - read_extent_buffer(l, ptr,(unsigned long)(iref + 1), len); - - if (key.offset == BTRFS_FIRST_FREE_OBJECTID) - break; - - btrfs_release_path(path); - key.objectid = key.offset; - key.offset = (u64)-1; - dirid = key.objectid; - } - if (ptr < name) - goto out; - memmove(name, ptr, total_len); - name[total_len]='\0'; - ret = 0; -out: - btrfs_free_path(path); - return ret; -} - -static noinline int btrfs_ioctl_ino_lookup(struct file *file, - void __user *argp) -{ - struct btrfs_ioctl_ino_lookup_args *args; - struct inode *inode; - int ret; - - if (!capable(CAP_SYS_ADMIN)) - return -EPERM; - - args = memdup_user(argp, sizeof(*args)); - if (IS_ERR(args)) - return PTR_ERR(args); - - inode = fdentry(file)->d_inode; - - if (args->treeid == 0) - args->treeid = BTRFS_I(inode)->root->root_key.objectid; - - ret = btrfs_search_path_in_tree(BTRFS_I(inode)->root->fs_info, - args->treeid, args->objectid, - args->name); - - if (ret == 0 && copy_to_user(argp, args, sizeof(*args))) - ret = -EFAULT; - - kfree(args); - return ret; -} - -static noinline int btrfs_ioctl_snap_destroy(struct file *file, - void __user *arg) -{ - struct dentry *parent = fdentry(file); - struct dentry *dentry; - struct inode *dir = parent->d_inode; - struct inode *inode; - struct btrfs_root *root = BTRFS_I(dir)->root; - struct btrfs_root *dest = NULL; - struct btrfs_ioctl_vol_args *vol_args; - struct btrfs_trans_handle *trans; - int namelen; - int ret; - int err = 0; - - vol_args = memdup_user(arg, sizeof(*vol_args)); - if (IS_ERR(vol_args)) - return PTR_ERR(vol_args); - - vol_args->name[BTRFS_PATH_NAME_MAX] = '\0'; - namelen = strlen(vol_args->name); - if (strchr(vol_args->name, '/') || - strncmp(vol_args->name, "..", namelen) == 0) { - err = -EINVAL; - goto out; - } - - err = mnt_want_write_file(file); - if (err) - goto out; - - mutex_lock_nested(&dir->i_mutex, I_MUTEX_PARENT); - dentry = lookup_one_len(vol_args->name, parent, namelen); - if (IS_ERR(dentry)) { - err = PTR_ERR(dentry); - goto out_unlock_dir; - } - - if (!dentry->d_inode) { - err = -ENOENT; - goto out_dput; - } - - inode = dentry->d_inode; - dest = BTRFS_I(inode)->root; - if (!capable(CAP_SYS_ADMIN)){ - /* 
- * Regular user. Only allow this with a special mount - * option, when the user has write+exec access to the - * subvol root, and when rmdir(2) would have been - * allowed. - * - * Note that this is _not_ check that the subvol is - * empty or doesn't contain data that we wouldn't - * otherwise be able to delete. - * - * Users who want to delete empty subvols should try - * rmdir(2). - */ - err = -EPERM; - if (!btrfs_test_opt(root, USER_SUBVOL_RM_ALLOWED)) - goto out_dput; - - /* - * Do not allow deletion if the parent dir is the same - * as the dir to be deleted. That means the ioctl - * must be called on the dentry referencing the root - * of the subvol, not a random directory contained - * within it. - */ - err = -EINVAL; - if (root == dest) - goto out_dput; - - err = inode_permission(inode, MAY_WRITE | MAY_EXEC); - if (err) - goto out_dput; - - /* check if subvolume may be deleted by a non-root user */ - err = btrfs_may_delete(dir, dentry, 1); - if (err) - goto out_dput; - } - - if (btrfs_ino(inode) != BTRFS_FIRST_FREE_OBJECTID) { - err = -EINVAL; - goto out_dput; - } - - mutex_lock(&inode->i_mutex); - err = d_invalidate(dentry); - if (err) - goto out_unlock; - - down_write(&root->fs_info->subvol_sem); - - err = may_destroy_subvol(dest); - if (err) - goto out_up_write; - - trans = btrfs_start_transaction(root, 0); - if (IS_ERR(trans)) { - err = PTR_ERR(trans); - goto out_up_write; - } - trans->block_rsv = &root->fs_info->global_block_rsv; - - ret = btrfs_unlink_subvol(trans, root, dir, - dest->root_key.objectid, - dentry->d_name.name, - dentry->d_name.len); - if (ret) { - err = ret; - btrfs_abort_transaction(trans, root, ret); - goto out_end_trans; - } - - btrfs_record_root_in_trans(trans, dest); - - memset(&dest->root_item.drop_progress, 0, - sizeof(dest->root_item.drop_progress)); - dest->root_item.drop_level = 0; - btrfs_set_root_refs(&dest->root_item, 0); - - if (!xchg(&dest->orphan_item_inserted, 1)) { - ret = btrfs_insert_orphan_item(trans, - root->fs_info->tree_root, - dest->root_key.objectid); - if (ret) { - btrfs_abort_transaction(trans, root, ret); - err = ret; - goto out_end_trans; - } - } -out_end_trans: - ret = btrfs_end_transaction(trans, root); - if (ret && !err) - err = ret; - inode->i_flags |= S_DEAD; -out_up_write: - up_write(&root->fs_info->subvol_sem); -out_unlock: - mutex_unlock(&inode->i_mutex); - if (!err) { - shrink_dcache_sb(root->fs_info->sb); - btrfs_invalidate_inodes(dest); - d_delete(dentry); - } -out_dput: - dput(dentry); -out_unlock_dir: - mutex_unlock(&dir->i_mutex); - mnt_drop_write_file(file); -out: - kfree(vol_args); - return err; -} - -static int btrfs_ioctl_defrag(struct file *file, void __user *argp) -{ - struct inode *inode = fdentry(file)->d_inode; - struct btrfs_root *root = BTRFS_I(inode)->root; - struct btrfs_ioctl_defrag_range_args *range; - int ret; - - if (btrfs_root_readonly(root)) - return -EROFS; - - ret = mnt_want_write_file(file); - if (ret) - return ret; - - switch (inode->i_mode & S_IFMT) { - case S_IFDIR: - if (!capable(CAP_SYS_ADMIN)) { - ret = -EPERM; - goto out; - } - ret = btrfs_defrag_root(root, 0); - if (ret) - goto out; - ret = btrfs_defrag_root(root->fs_info->extent_root, 0); - break; - case S_IFREG: - if (!(file->f_mode & FMODE_WRITE)) { - ret = -EINVAL; - goto out; - } - - range = kzalloc(sizeof(*range), GFP_KERNEL); - if (!range) { - ret = -ENOMEM; - goto out; - } - - if (argp) { - if (copy_from_user(range, argp, - sizeof(*range))) { - ret = -EFAULT; - kfree(range); - goto out; - } - /* compression requires us to start 
the IO */ - if ((range->flags & BTRFS_DEFRAG_RANGE_COMPRESS)) { - range->flags |= BTRFS_DEFRAG_RANGE_START_IO; - range->extent_thresh = (u32)-1; - } - } else { - /* the rest are all set to zero by kzalloc */ - range->len = (u64)-1; - } - ret = btrfs_defrag_file(fdentry(file)->d_inode, file, - range, 0, 0); - if (ret > 0) - ret = 0; - kfree(range); - break; - default: - ret = -EINVAL; - } -out: - mnt_drop_write_file(file); - return ret; -} - -static long btrfs_ioctl_add_dev(struct btrfs_root *root, void __user *arg) -{ - struct btrfs_ioctl_vol_args *vol_args; - int ret; - - if (!capable(CAP_SYS_ADMIN)) - return -EPERM; - - mutex_lock(&root->fs_info->volume_mutex); - if (root->fs_info->balance_ctl) { - printk(KERN_INFO "btrfs: balance in progress\n"); - ret = -EINVAL; - goto out; - } - - vol_args = memdup_user(arg, sizeof(*vol_args)); - if (IS_ERR(vol_args)) { - ret = PTR_ERR(vol_args); - goto out; - } - - vol_args->name[BTRFS_PATH_NAME_MAX] = '\0'; - ret = btrfs_init_new_device(root, vol_args->name); - - kfree(vol_args); -out: - mutex_unlock(&root->fs_info->volume_mutex); - return ret; -} - -static long btrfs_ioctl_rm_dev(struct btrfs_root *root, void __user *arg) -{ - struct btrfs_ioctl_vol_args *vol_args; - int ret; - - if (!capable(CAP_SYS_ADMIN)) - return -EPERM; - - if (root->fs_info->sb->s_flags & MS_RDONLY) - return -EROFS; - - mutex_lock(&root->fs_info->volume_mutex); - if (root->fs_info->balance_ctl) { - printk(KERN_INFO "btrfs: balance in progress\n"); - ret = -EINVAL; - goto out; - } - - vol_args = memdup_user(arg, sizeof(*vol_args)); - if (IS_ERR(vol_args)) { - ret = PTR_ERR(vol_args); - goto out; - } - - vol_args->name[BTRFS_PATH_NAME_MAX] = '\0'; - ret = btrfs_rm_device(root, vol_args->name); - - kfree(vol_args); -out: - mutex_unlock(&root->fs_info->volume_mutex); - return ret; -} - -static long btrfs_ioctl_fs_info(struct btrfs_root *root, void __user *arg) -{ - struct btrfs_ioctl_fs_info_args *fi_args; - struct btrfs_device *device; - struct btrfs_device *next; - struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices; - int ret = 0; - - if (!capable(CAP_SYS_ADMIN)) - return -EPERM; - - fi_args = kzalloc(sizeof(*fi_args), GFP_KERNEL); - if (!fi_args) - return -ENOMEM; - - fi_args->num_devices = fs_devices->num_devices; - memcpy(&fi_args->fsid, root->fs_info->fsid, sizeof(fi_args->fsid)); - - mutex_lock(&fs_devices->device_list_mutex); - list_for_each_entry_safe(device, next, &fs_devices->devices, dev_list) { - if (device->devid > fi_args->max_id) - fi_args->max_id = device->devid; - } - mutex_unlock(&fs_devices->device_list_mutex); - - if (copy_to_user(arg, fi_args, sizeof(*fi_args))) - ret = -EFAULT; - - kfree(fi_args); - return ret; -} - -static long btrfs_ioctl_dev_info(struct btrfs_root *root, void __user *arg) -{ - struct btrfs_ioctl_dev_info_args *di_args; - struct btrfs_device *dev; - struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices; - int ret = 0; - char *s_uuid = NULL; - char empty_uuid[BTRFS_UUID_SIZE] = {0}; - - if (!capable(CAP_SYS_ADMIN)) - return -EPERM; - - di_args = memdup_user(arg, sizeof(*di_args)); - if (IS_ERR(di_args)) - return PTR_ERR(di_args); - - if (memcmp(empty_uuid, di_args->uuid, BTRFS_UUID_SIZE) != 0) - s_uuid = di_args->uuid; - - mutex_lock(&fs_devices->device_list_mutex); - dev = btrfs_find_device(root, di_args->devid, s_uuid, NULL); - mutex_unlock(&fs_devices->device_list_mutex); - - if (!dev) { - ret = -ENODEV; - goto out; - } - - di_args->devid = dev->devid; - di_args->bytes_used = dev->bytes_used; - 
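/*
 * Editor's sketch (not part of the deleted source): enumerating devices from
 * userspace via btrfs_ioctl_fs_info() and btrfs_ioctl_dev_info() here.
 * BTRFS_IOC_FS_INFO reports num_devices and the highest devid (max_id);
 * devids in between may be holes, so a BTRFS_IOC_DEV_INFO failure is simply
 * skipped.  A zeroed uuid means "look up by devid only", matching the
 * empty_uuid check above.  Needs CAP_SYS_ADMIN; assumes <linux/btrfs.h>
 * provides the definitions, and mnt_fd is an open fd somewhere on the
 * mounted filesystem.  The helper name is illustrative only.
 */
#include <stdio.h>
#include <string.h>
#include <sys/ioctl.h>
#include <linux/btrfs.h>

static int list_devices(int mnt_fd)
{
	struct btrfs_ioctl_fs_info_args fi;
	struct btrfs_ioctl_dev_info_args di;
	__u64 devid;

	memset(&fi, 0, sizeof(fi));
	if (ioctl(mnt_fd, BTRFS_IOC_FS_INFO, &fi) < 0)
		return -1;

	for (devid = 0; devid <= fi.max_id; devid++) {
		memset(&di, 0, sizeof(di));
		di.devid = devid;		/* uuid stays zeroed: search by devid */
		if (ioctl(mnt_fd, BTRFS_IOC_DEV_INFO, &di) < 0)
			continue;		/* hole in the devid space */
		printf("devid %llu path %s used %llu/%llu bytes\n",
		       (unsigned long long)di.devid, (char *)di.path,
		       (unsigned long long)di.bytes_used,
		       (unsigned long long)di.total_bytes);
	}
	return 0;
}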
di_args->total_bytes = dev->total_bytes; - memcpy(di_args->uuid, dev->uuid, sizeof(di_args->uuid)); - if (dev->name) - strncpy(di_args->path, dev->name, sizeof(di_args->path)); - else - di_args->path[0] = '\0'; - -out: - if (ret == 0 && copy_to_user(arg, di_args, sizeof(*di_args))) - ret = -EFAULT; - - kfree(di_args); - return ret; -} - -static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd, - u64 off, u64 olen, u64 destoff) -{ - struct inode *inode = fdentry(file)->d_inode; - struct btrfs_root *root = BTRFS_I(inode)->root; - struct file *src_file; - struct inode *src; - struct btrfs_trans_handle *trans; - struct btrfs_path *path; - struct extent_buffer *leaf; - char *buf; - struct btrfs_key key; - u32 nritems; - int slot; - int ret; - u64 len = olen; - u64 bs = root->fs_info->sb->s_blocksize; - u64 hint_byte; - - /* - * TODO: - * - split compressed inline extents. annoying: we need to - * decompress into destination's address_space (the file offset - * may change, so source mapping won't do), then recompress (or - * otherwise reinsert) a subrange. - * - allow ranges within the same file to be cloned (provided - * they don't overlap)? - */ - - /* the destination must be opened for writing */ - if (!(file->f_mode & FMODE_WRITE) || (file->f_flags & O_APPEND)) - return -EINVAL; - - if (btrfs_root_readonly(root)) - return -EROFS; - - ret = mnt_want_write_file(file); - if (ret) - return ret; - - src_file = fget(srcfd); - if (!src_file) { - ret = -EBADF; - goto out_drop_write; - } - - src = src_file->f_dentry->d_inode; - - ret = -EINVAL; - if (src == inode) - goto out_fput; - - /* the src must be open for reading */ - if (!(src_file->f_mode & FMODE_READ)) - goto out_fput; - - /* don't make the dst file partly checksummed */ - if ((BTRFS_I(src)->flags & BTRFS_INODE_NODATASUM) != - (BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)) - goto out_fput; - - ret = -EISDIR; - if (S_ISDIR(src->i_mode) || S_ISDIR(inode->i_mode)) - goto out_fput; - - ret = -EXDEV; - if (src->i_sb != inode->i_sb || BTRFS_I(src)->root != root) - goto out_fput; - - ret = -ENOMEM; - buf = vmalloc(btrfs_level_size(root, 0)); - if (!buf) - goto out_fput; - - path = btrfs_alloc_path(); - if (!path) { - vfree(buf); - goto out_fput; - } - path->reada = 2; - - if (inode < src) { - mutex_lock_nested(&inode->i_mutex, I_MUTEX_PARENT); - mutex_lock_nested(&src->i_mutex, I_MUTEX_CHILD); - } else { - mutex_lock_nested(&src->i_mutex, I_MUTEX_PARENT); - mutex_lock_nested(&inode->i_mutex, I_MUTEX_CHILD); - } - - /* determine range to clone */ - ret = -EINVAL; - if (off + len > src->i_size || off + len < off) - goto out_unlock; - if (len == 0) - olen = len = src->i_size - off; - /* if we extend to eof, continue to block boundary */ - if (off + len == src->i_size) - len = ALIGN(src->i_size, bs) - off; - - /* verify the end result is block aligned */ - if (!IS_ALIGNED(off, bs) || !IS_ALIGNED(off + len, bs) || - !IS_ALIGNED(destoff, bs)) - goto out_unlock; - - if (destoff > inode->i_size) { - ret = btrfs_cont_expand(inode, inode->i_size, destoff); - if (ret) - goto out_unlock; - } - - /* truncate page cache pages from target inode range */ - truncate_inode_pages_range(&inode->i_data, destoff, - PAGE_CACHE_ALIGN(destoff + len) - 1); - - /* do any pending delalloc/csum calc on src, one way or - another, and lock file content */ - while (1) { - struct btrfs_ordered_extent *ordered; - lock_extent(&BTRFS_I(src)->io_tree, off, off+len); - ordered = btrfs_lookup_first_ordered_extent(src, off+len); - if (!ordered && - 
!test_range_bit(&BTRFS_I(src)->io_tree, off, off+len, - EXTENT_DELALLOC, 0, NULL)) - break; - unlock_extent(&BTRFS_I(src)->io_tree, off, off+len); - if (ordered) - btrfs_put_ordered_extent(ordered); - btrfs_wait_ordered_range(src, off, len); - } - - /* clone data */ - key.objectid = btrfs_ino(src); - key.type = BTRFS_EXTENT_DATA_KEY; - key.offset = 0; - - while (1) { - /* - * note the key will change type as we walk through the - * tree. - */ - ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); - if (ret < 0) - goto out; - - nritems = btrfs_header_nritems(path->nodes[0]); - if (path->slots[0] >= nritems) { - ret = btrfs_next_leaf(root, path); - if (ret < 0) - goto out; - if (ret > 0) - break; - nritems = btrfs_header_nritems(path->nodes[0]); - } - leaf = path->nodes[0]; - slot = path->slots[0]; - - btrfs_item_key_to_cpu(leaf, &key, slot); - if (btrfs_key_type(&key) > BTRFS_EXTENT_DATA_KEY || - key.objectid != btrfs_ino(src)) - break; - - if (btrfs_key_type(&key) == BTRFS_EXTENT_DATA_KEY) { - struct btrfs_file_extent_item *extent; - int type; - u32 size; - struct btrfs_key new_key; - u64 disko = 0, diskl = 0; - u64 datao = 0, datal = 0; - u8 comp; - u64 endoff; - - size = btrfs_item_size_nr(leaf, slot); - read_extent_buffer(leaf, buf, - btrfs_item_ptr_offset(leaf, slot), - size); - - extent = btrfs_item_ptr(leaf, slot, - struct btrfs_file_extent_item); - comp = btrfs_file_extent_compression(leaf, extent); - type = btrfs_file_extent_type(leaf, extent); - if (type == BTRFS_FILE_EXTENT_REG || - type == BTRFS_FILE_EXTENT_PREALLOC) { - disko = btrfs_file_extent_disk_bytenr(leaf, - extent); - diskl = btrfs_file_extent_disk_num_bytes(leaf, - extent); - datao = btrfs_file_extent_offset(leaf, extent); - datal = btrfs_file_extent_num_bytes(leaf, - extent); - } else if (type == BTRFS_FILE_EXTENT_INLINE) { - /* take upper bound, may be compressed */ - datal = btrfs_file_extent_ram_bytes(leaf, - extent); - } - btrfs_release_path(path); - - if (key.offset + datal <= off || - key.offset >= off+len) - goto next; - - memcpy(&new_key, &key, sizeof(new_key)); - new_key.objectid = btrfs_ino(inode); - if (off <= key.offset) - new_key.offset = key.offset + destoff - off; - else - new_key.offset = destoff; - - /* - * 1 - adjusting old extent (we may have to split it) - * 1 - add new extent - * 1 - inode update - */ - trans = btrfs_start_transaction(root, 3); - if (IS_ERR(trans)) { - ret = PTR_ERR(trans); - goto out; - } - - if (type == BTRFS_FILE_EXTENT_REG || - type == BTRFS_FILE_EXTENT_PREALLOC) { - /* - * a | --- range to clone ---| b - * | ------------- extent ------------- | - */ - - /* substract range b */ - if (key.offset + datal > off + len) - datal = off + len - key.offset; - - /* substract range a */ - if (off > key.offset) { - datao += off - key.offset; - datal -= off - key.offset; - } - - ret = btrfs_drop_extents(trans, inode, - new_key.offset, - new_key.offset + datal, - &hint_byte, 1); - if (ret) { - btrfs_abort_transaction(trans, root, - ret); - btrfs_end_transaction(trans, root); - goto out; - } - - ret = btrfs_insert_empty_item(trans, root, path, - &new_key, size); - if (ret) { - btrfs_abort_transaction(trans, root, - ret); - btrfs_end_transaction(trans, root); - goto out; - } - - leaf = path->nodes[0]; - slot = path->slots[0]; - write_extent_buffer(leaf, buf, - btrfs_item_ptr_offset(leaf, slot), - size); - - extent = btrfs_item_ptr(leaf, slot, - struct btrfs_file_extent_item); - - /* disko == 0 means it's a hole */ - if (!disko) - datao = 0; - - btrfs_set_file_extent_offset(leaf, extent, - 
datao); - btrfs_set_file_extent_num_bytes(leaf, extent, - datal); - if (disko) { - inode_add_bytes(inode, datal); - ret = btrfs_inc_extent_ref(trans, root, - disko, diskl, 0, - root->root_key.objectid, - btrfs_ino(inode), - new_key.offset - datao, - 0); - if (ret) { - btrfs_abort_transaction(trans, - root, - ret); - btrfs_end_transaction(trans, - root); - goto out; - - } - } - } else if (type == BTRFS_FILE_EXTENT_INLINE) { - u64 skip = 0; - u64 trim = 0; - if (off > key.offset) { - skip = off - key.offset; - new_key.offset += skip; - } - - if (key.offset + datal > off+len) - trim = key.offset + datal - (off+len); - - if (comp && (skip || trim)) { - ret = -EINVAL; - btrfs_end_transaction(trans, root); - goto out; - } - size -= skip + trim; - datal -= skip + trim; - - ret = btrfs_drop_extents(trans, inode, - new_key.offset, - new_key.offset + datal, - &hint_byte, 1); - if (ret) { - btrfs_abort_transaction(trans, root, - ret); - btrfs_end_transaction(trans, root); - goto out; - } - - ret = btrfs_insert_empty_item(trans, root, path, - &new_key, size); - if (ret) { - btrfs_abort_transaction(trans, root, - ret); - btrfs_end_transaction(trans, root); - goto out; - } - - if (skip) { - u32 start = - btrfs_file_extent_calc_inline_size(0); - memmove(buf+start, buf+start+skip, - datal); - } - - leaf = path->nodes[0]; - slot = path->slots[0]; - write_extent_buffer(leaf, buf, - btrfs_item_ptr_offset(leaf, slot), - size); - inode_add_bytes(inode, datal); - } - - btrfs_mark_buffer_dirty(leaf); - btrfs_release_path(path); - - inode->i_mtime = inode->i_ctime = CURRENT_TIME; - - /* - * we round up to the block size at eof when - * determining which extents to clone above, - * but shouldn't round up the file size - */ - endoff = new_key.offset + datal; - if (endoff > destoff+olen) - endoff = destoff+olen; - if (endoff > inode->i_size) - btrfs_i_size_write(inode, endoff); - - ret = btrfs_update_inode(trans, root, inode); - if (ret) { - btrfs_abort_transaction(trans, root, ret); - btrfs_end_transaction(trans, root); - goto out; - } - ret = btrfs_end_transaction(trans, root); - } -next: - btrfs_release_path(path); - key.offset++; - } - ret = 0; -out: - btrfs_release_path(path); - unlock_extent(&BTRFS_I(src)->io_tree, off, off+len); -out_unlock: - mutex_unlock(&src->i_mutex); - mutex_unlock(&inode->i_mutex); - vfree(buf); - btrfs_free_path(path); -out_fput: - fput(src_file); -out_drop_write: - mnt_drop_write_file(file); - return ret; -} - -static long btrfs_ioctl_clone_range(struct file *file, void __user *argp) -{ - struct btrfs_ioctl_clone_range_args args; - - if (copy_from_user(&args, argp, sizeof(args))) - return -EFAULT; - return btrfs_ioctl_clone(file, args.src_fd, args.src_offset, - args.src_length, args.dest_offset); -} - -/* - * there are many ways the trans_start and trans_end ioctls can lead - * to deadlocks. They should only be used by applications that - * basically own the machine, and have a very in depth understanding - * of all the possible deadlocks and enospc problems. 
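/*
 * Editor's sketch (not part of the deleted source): reflinking a file with
 * BTRFS_IOC_CLONE_RANGE, the ioctl served by btrfs_ioctl_clone_range() above.
 * As enforced there, both files must live on the same btrfs, the destination
 * must be open for writing (and not O_APPEND), and offsets/length must be
 * block aligned; src_length == 0 means "clone from src_offset to EOF".
 * Assumes <linux/btrfs.h> provides struct btrfs_ioctl_clone_range_args; the
 * helper name is illustrative only.
 */
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <sys/ioctl.h>
#include <unistd.h>
#include <linux/btrfs.h>

static int reflink_file(const char *src_path, const char *dst_path)
{
	struct btrfs_ioctl_clone_range_args args;
	int src = open(src_path, O_RDONLY);
	int dst = open(dst_path, O_WRONLY | O_CREAT | O_TRUNC, 0644);
	int ret = -1;

	if (src < 0 || dst < 0)
		goto out;

	memset(&args, 0, sizeof(args));
	args.src_fd = src;
	args.src_offset = 0;
	args.src_length = 0;	/* 0 == clone to the end of the source file */
	args.dest_offset = 0;

	ret = ioctl(dst, BTRFS_IOC_CLONE_RANGE, &args);
	if (ret < 0)
		perror("BTRFS_IOC_CLONE_RANGE");
out:
	if (src >= 0)
		close(src);
	if (dst >= 0)
		close(dst);
	return ret;
}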
- */ -static long btrfs_ioctl_trans_start(struct file *file) -{ - struct inode *inode = fdentry(file)->d_inode; - struct btrfs_root *root = BTRFS_I(inode)->root; - struct btrfs_trans_handle *trans; - int ret; - - ret = -EPERM; - if (!capable(CAP_SYS_ADMIN)) - goto out; - - ret = -EINPROGRESS; - if (file->private_data) - goto out; - - ret = -EROFS; - if (btrfs_root_readonly(root)) - goto out; - - ret = mnt_want_write_file(file); - if (ret) - goto out; - - atomic_inc(&root->fs_info->open_ioctl_trans); - - ret = -ENOMEM; - trans = btrfs_start_ioctl_transaction(root); - if (IS_ERR(trans)) - goto out_drop; - - file->private_data = trans; - return 0; - -out_drop: - atomic_dec(&root->fs_info->open_ioctl_trans); - mnt_drop_write_file(file); -out: - return ret; -} - -static long btrfs_ioctl_default_subvol(struct file *file, void __user *argp) -{ - struct inode *inode = fdentry(file)->d_inode; - struct btrfs_root *root = BTRFS_I(inode)->root; - struct btrfs_root *new_root; - struct btrfs_dir_item *di; - struct btrfs_trans_handle *trans; - struct btrfs_path *path; - struct btrfs_key location; - struct btrfs_disk_key disk_key; - struct btrfs_super_block *disk_super; - u64 features; - u64 objectid = 0; - u64 dir_id; - - if (!capable(CAP_SYS_ADMIN)) - return -EPERM; - - if (copy_from_user(&objectid, argp, sizeof(objectid))) - return -EFAULT; - - if (!objectid) - objectid = root->root_key.objectid; - - location.objectid = objectid; - location.type = BTRFS_ROOT_ITEM_KEY; - location.offset = (u64)-1; - - new_root = btrfs_read_fs_root_no_name(root->fs_info, &location); - if (IS_ERR(new_root)) - return PTR_ERR(new_root); - - if (btrfs_root_refs(&new_root->root_item) == 0) - return -ENOENT; - - path = btrfs_alloc_path(); - if (!path) - return -ENOMEM; - path->leave_spinning = 1; - - trans = btrfs_start_transaction(root, 1); - if (IS_ERR(trans)) { - btrfs_free_path(path); - return PTR_ERR(trans); - } - - dir_id = btrfs_super_root_dir(root->fs_info->super_copy); - di = btrfs_lookup_dir_item(trans, root->fs_info->tree_root, path, - dir_id, "default", 7, 1); - if (IS_ERR_OR_NULL(di)) { - btrfs_free_path(path); - btrfs_end_transaction(trans, root); - printk(KERN_ERR "Umm, you don't have the default dir item, " - "this isn't going to work\n"); - return -ENOENT; - } - - btrfs_cpu_key_to_disk(&disk_key, &new_root->root_key); - btrfs_set_dir_item_key(path->nodes[0], di, &disk_key); - btrfs_mark_buffer_dirty(path->nodes[0]); - btrfs_free_path(path); - - disk_super = root->fs_info->super_copy; - features = btrfs_super_incompat_flags(disk_super); - if (!(features & BTRFS_FEATURE_INCOMPAT_DEFAULT_SUBVOL)) { - features |= BTRFS_FEATURE_INCOMPAT_DEFAULT_SUBVOL; - btrfs_set_super_incompat_flags(disk_super, features); - } - btrfs_end_transaction(trans, root); - - return 0; -} - -static void get_block_group_info(struct list_head *groups_list, - struct btrfs_ioctl_space_info *space) -{ - struct btrfs_block_group_cache *block_group; - - space->total_bytes = 0; - space->used_bytes = 0; - space->flags = 0; - list_for_each_entry(block_group, groups_list, list) { - space->flags = block_group->flags; - space->total_bytes += block_group->key.offset; - space->used_bytes += - btrfs_block_group_used(&block_group->item); - } -} - -long btrfs_ioctl_space_info(struct btrfs_root *root, void __user *arg) -{ - struct btrfs_ioctl_space_args space_args; - struct btrfs_ioctl_space_info space; - struct btrfs_ioctl_space_info *dest; - struct btrfs_ioctl_space_info *dest_orig; - struct btrfs_ioctl_space_info __user *user_dest; - struct 
btrfs_space_info *info; - u64 types[] = {BTRFS_BLOCK_GROUP_DATA, - BTRFS_BLOCK_GROUP_SYSTEM, - BTRFS_BLOCK_GROUP_METADATA, - BTRFS_BLOCK_GROUP_DATA | BTRFS_BLOCK_GROUP_METADATA}; - int num_types = 4; - int alloc_size; - int ret = 0; - u64 slot_count = 0; - int i, c; - - if (copy_from_user(&space_args, - (struct btrfs_ioctl_space_args __user *)arg, - sizeof(space_args))) - return -EFAULT; - - for (i = 0; i < num_types; i++) { - struct btrfs_space_info *tmp; - - info = NULL; - rcu_read_lock(); - list_for_each_entry_rcu(tmp, &root->fs_info->space_info, - list) { - if (tmp->flags == types[i]) { - info = tmp; - break; - } - } - rcu_read_unlock(); - - if (!info) - continue; - - down_read(&info->groups_sem); - for (c = 0; c < BTRFS_NR_RAID_TYPES; c++) { - if (!list_empty(&info->block_groups[c])) - slot_count++; - } - up_read(&info->groups_sem); - } - - /* space_slots == 0 means they are asking for a count */ - if (space_args.space_slots == 0) { - space_args.total_spaces = slot_count; - goto out; - } - - slot_count = min_t(u64, space_args.space_slots, slot_count); - - alloc_size = sizeof(*dest) * slot_count; - - /* we generally have at most 6 or so space infos, one for each raid - * level. So, a whole page should be more than enough for everyone - */ - if (alloc_size > PAGE_CACHE_SIZE) - return -ENOMEM; - - space_args.total_spaces = 0; - dest = kmalloc(alloc_size, GFP_NOFS); - if (!dest) - return -ENOMEM; - dest_orig = dest; - - /* now we have a buffer to copy into */ - for (i = 0; i < num_types; i++) { - struct btrfs_space_info *tmp; - - if (!slot_count) - break; - - info = NULL; - rcu_read_lock(); - list_for_each_entry_rcu(tmp, &root->fs_info->space_info, - list) { - if (tmp->flags == types[i]) { - info = tmp; - break; - } - } - rcu_read_unlock(); - - if (!info) - continue; - down_read(&info->groups_sem); - for (c = 0; c < BTRFS_NR_RAID_TYPES; c++) { - if (!list_empty(&info->block_groups[c])) { - get_block_group_info(&info->block_groups[c], - &space); - memcpy(dest, &space, sizeof(space)); - dest++; - space_args.total_spaces++; - slot_count--; - } - if (!slot_count) - break; - } - up_read(&info->groups_sem); - } - - user_dest = (struct btrfs_ioctl_space_info *) - (arg + sizeof(struct btrfs_ioctl_space_args)); - - if (copy_to_user(user_dest, dest_orig, alloc_size)) - ret = -EFAULT; - - kfree(dest_orig); -out: - if (ret == 0 && copy_to_user(arg, &space_args, sizeof(space_args))) - ret = -EFAULT; - - return ret; -} - -/* - * there are many ways the trans_start and trans_end ioctls can lead - * to deadlocks. They should only be used by applications that - * basically own the machine, and have a very in depth understanding - * of all the possible deadlocks and enospc problems. 
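/*
 * Editor's sketch (not part of the deleted source): the two-step protocol
 * btrfs_ioctl_space_info() above expects from userspace.  A first call with
 * space_slots == 0 only reports how many entries exist; the caller then
 * allocates room for that many btrfs_ioctl_space_info records after the
 * header and calls again.  Assumes <linux/btrfs.h> provides the definitions
 * and mnt_fd is an open fd on the filesystem; the helper name is
 * illustrative only.
 */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/ioctl.h>
#include <linux/btrfs.h>

static int print_space_info(int mnt_fd)
{
	struct btrfs_ioctl_space_args probe, *args;
	__u64 i;
	int ret;

	memset(&probe, 0, sizeof(probe));	/* space_slots == 0: ask for a count */
	if (ioctl(mnt_fd, BTRFS_IOC_SPACE_INFO, &probe) < 0)
		return -1;

	args = calloc(1, sizeof(*args) +
		      probe.total_spaces * sizeof(struct btrfs_ioctl_space_info));
	if (!args)
		return -1;
	args->space_slots = probe.total_spaces;

	ret = ioctl(mnt_fd, BTRFS_IOC_SPACE_INFO, args);
	if (ret == 0) {
		for (i = 0; i < args->total_spaces; i++)
			printf("flags 0x%llx total %llu used %llu\n",
			       (unsigned long long)args->spaces[i].flags,
			       (unsigned long long)args->spaces[i].total_bytes,
			       (unsigned long long)args->spaces[i].used_bytes);
	}
	free(args);
	return ret;
}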
- */ -long btrfs_ioctl_trans_end(struct file *file) -{ - struct inode *inode = fdentry(file)->d_inode; - struct btrfs_root *root = BTRFS_I(inode)->root; - struct btrfs_trans_handle *trans; - - trans = file->private_data; - if (!trans) - return -EINVAL; - file->private_data = NULL; - - btrfs_end_transaction(trans, root); - - atomic_dec(&root->fs_info->open_ioctl_trans); - - mnt_drop_write_file(file); - return 0; -} - -static noinline long btrfs_ioctl_start_sync(struct file *file, void __user *argp) -{ - struct btrfs_root *root = BTRFS_I(file->f_dentry->d_inode)->root; - struct btrfs_trans_handle *trans; - u64 transid; - int ret; - - trans = btrfs_start_transaction(root, 0); - if (IS_ERR(trans)) - return PTR_ERR(trans); - transid = trans->transid; - ret = btrfs_commit_transaction_async(trans, root, 0); - if (ret) { - btrfs_end_transaction(trans, root); - return ret; - } - - if (argp) - if (copy_to_user(argp, &transid, sizeof(transid))) - return -EFAULT; - return 0; -} - -static noinline long btrfs_ioctl_wait_sync(struct file *file, void __user *argp) -{ - struct btrfs_root *root = BTRFS_I(file->f_dentry->d_inode)->root; - u64 transid; - - if (argp) { - if (copy_from_user(&transid, argp, sizeof(transid))) - return -EFAULT; - } else { - transid = 0; /* current trans */ - } - return btrfs_wait_for_commit(root, transid); -} - -static long btrfs_ioctl_scrub(struct btrfs_root *root, void __user *arg) -{ - int ret; - struct btrfs_ioctl_scrub_args *sa; - - if (!capable(CAP_SYS_ADMIN)) - return -EPERM; - - sa = memdup_user(arg, sizeof(*sa)); - if (IS_ERR(sa)) - return PTR_ERR(sa); - - ret = btrfs_scrub_dev(root, sa->devid, sa->start, sa->end, - &sa->progress, sa->flags & BTRFS_SCRUB_READONLY); - - if (copy_to_user(arg, sa, sizeof(*sa))) - ret = -EFAULT; - - kfree(sa); - return ret; -} - -static long btrfs_ioctl_scrub_cancel(struct btrfs_root *root, void __user *arg) -{ - if (!capable(CAP_SYS_ADMIN)) - return -EPERM; - - return btrfs_scrub_cancel(root); -} - -static long btrfs_ioctl_scrub_progress(struct btrfs_root *root, - void __user *arg) -{ - struct btrfs_ioctl_scrub_args *sa; - int ret; - - if (!capable(CAP_SYS_ADMIN)) - return -EPERM; - - sa = memdup_user(arg, sizeof(*sa)); - if (IS_ERR(sa)) - return PTR_ERR(sa); - - ret = btrfs_scrub_progress(root, sa->devid, &sa->progress); - - if (copy_to_user(arg, sa, sizeof(*sa))) - ret = -EFAULT; - - kfree(sa); - return ret; -} - -static long btrfs_ioctl_ino_to_path(struct btrfs_root *root, void __user *arg) -{ - int ret = 0; - int i; - u64 rel_ptr; - int size; - struct btrfs_ioctl_ino_path_args *ipa = NULL; - struct inode_fs_paths *ipath = NULL; - struct btrfs_path *path; - - if (!capable(CAP_SYS_ADMIN)) - return -EPERM; - - path = btrfs_alloc_path(); - if (!path) { - ret = -ENOMEM; - goto out; - } - - ipa = memdup_user(arg, sizeof(*ipa)); - if (IS_ERR(ipa)) { - ret = PTR_ERR(ipa); - ipa = NULL; - goto out; - } - - size = min_t(u32, ipa->size, 4096); - ipath = init_ipath(size, root, path); - if (IS_ERR(ipath)) { - ret = PTR_ERR(ipath); - ipath = NULL; - goto out; - } - - ret = paths_from_inode(ipa->inum, ipath); - if (ret < 0) - goto out; - - for (i = 0; i < ipath->fspath->elem_cnt; ++i) { - rel_ptr = ipath->fspath->val[i] - - (u64)(unsigned long)ipath->fspath->val; - ipath->fspath->val[i] = rel_ptr; - } - - ret = copy_to_user((void *)(unsigned long)ipa->fspath, - (void *)(unsigned long)ipath->fspath, size); - if (ret) { - ret = -EFAULT; - goto out; - } - -out: - btrfs_free_path(path); - free_ipath(ipath); - kfree(ipa); - - return ret; -} - -static int 
build_ino_list(u64 inum, u64 offset, u64 root, void *ctx) -{ - struct btrfs_data_container *inodes = ctx; - const size_t c = 3 * sizeof(u64); - - if (inodes->bytes_left >= c) { - inodes->bytes_left -= c; - inodes->val[inodes->elem_cnt] = inum; - inodes->val[inodes->elem_cnt + 1] = offset; - inodes->val[inodes->elem_cnt + 2] = root; - inodes->elem_cnt += 3; - } else { - inodes->bytes_missing += c - inodes->bytes_left; - inodes->bytes_left = 0; - inodes->elem_missed += 3; - } - - return 0; -} - -static long btrfs_ioctl_logical_to_ino(struct btrfs_root *root, - void __user *arg) -{ - int ret = 0; - int size; - u64 extent_item_pos; - struct btrfs_ioctl_logical_ino_args *loi; - struct btrfs_data_container *inodes = NULL; - struct btrfs_path *path = NULL; - struct btrfs_key key; - - if (!capable(CAP_SYS_ADMIN)) - return -EPERM; - - loi = memdup_user(arg, sizeof(*loi)); - if (IS_ERR(loi)) { - ret = PTR_ERR(loi); - loi = NULL; - goto out; - } - - path = btrfs_alloc_path(); - if (!path) { - ret = -ENOMEM; - goto out; - } - - size = min_t(u32, loi->size, 4096); - inodes = init_data_container(size); - if (IS_ERR(inodes)) { - ret = PTR_ERR(inodes); - inodes = NULL; - goto out; - } - - ret = extent_from_logical(root->fs_info, loi->logical, path, &key); - btrfs_release_path(path); - - if (ret & BTRFS_EXTENT_FLAG_TREE_BLOCK) - ret = -ENOENT; - if (ret < 0) - goto out; - - extent_item_pos = loi->logical - key.objectid; - ret = iterate_extent_inodes(root->fs_info, key.objectid, - extent_item_pos, 0, build_ino_list, - inodes); - - if (ret < 0) - goto out; - - ret = copy_to_user((void *)(unsigned long)loi->inodes, - (void *)(unsigned long)inodes, size); - if (ret) - ret = -EFAULT; - -out: - btrfs_free_path(path); - kfree(inodes); - kfree(loi); - - return ret; -} - -void update_ioctl_balance_args(struct btrfs_fs_info *fs_info, int lock, - struct btrfs_ioctl_balance_args *bargs) -{ - struct btrfs_balance_control *bctl = fs_info->balance_ctl; - - bargs->flags = bctl->flags; - - if (atomic_read(&fs_info->balance_running)) - bargs->state |= BTRFS_BALANCE_STATE_RUNNING; - if (atomic_read(&fs_info->balance_pause_req)) - bargs->state |= BTRFS_BALANCE_STATE_PAUSE_REQ; - if (atomic_read(&fs_info->balance_cancel_req)) - bargs->state |= BTRFS_BALANCE_STATE_CANCEL_REQ; - - memcpy(&bargs->data, &bctl->data, sizeof(bargs->data)); - memcpy(&bargs->meta, &bctl->meta, sizeof(bargs->meta)); - memcpy(&bargs->sys, &bctl->sys, sizeof(bargs->sys)); - - if (lock) { - spin_lock(&fs_info->balance_lock); - memcpy(&bargs->stat, &bctl->stat, sizeof(bargs->stat)); - spin_unlock(&fs_info->balance_lock); - } else { - memcpy(&bargs->stat, &bctl->stat, sizeof(bargs->stat)); - } -} - -static long btrfs_ioctl_balance(struct btrfs_root *root, void __user *arg) -{ - struct btrfs_fs_info *fs_info = root->fs_info; - struct btrfs_ioctl_balance_args *bargs; - struct btrfs_balance_control *bctl; - int ret; - - if (!capable(CAP_SYS_ADMIN)) - return -EPERM; - - if (fs_info->sb->s_flags & MS_RDONLY) - return -EROFS; - - mutex_lock(&fs_info->volume_mutex); - mutex_lock(&fs_info->balance_mutex); - - if (arg) { - bargs = memdup_user(arg, sizeof(*bargs)); - if (IS_ERR(bargs)) { - ret = PTR_ERR(bargs); - goto out; - } - - if (bargs->flags & BTRFS_BALANCE_RESUME) { - if (!fs_info->balance_ctl) { - ret = -ENOTCONN; - goto out_bargs; - } - - bctl = fs_info->balance_ctl; - spin_lock(&fs_info->balance_lock); - bctl->flags |= BTRFS_BALANCE_RESUME; - spin_unlock(&fs_info->balance_lock); - - goto do_balance; - } - } else { - bargs = NULL; - } - - if 
(fs_info->balance_ctl) { - ret = -EINPROGRESS; - goto out_bargs; - } - - bctl = kzalloc(sizeof(*bctl), GFP_NOFS); - if (!bctl) { - ret = -ENOMEM; - goto out_bargs; - } - - bctl->fs_info = fs_info; - if (arg) { - memcpy(&bctl->data, &bargs->data, sizeof(bctl->data)); - memcpy(&bctl->meta, &bargs->meta, sizeof(bctl->meta)); - memcpy(&bctl->sys, &bargs->sys, sizeof(bctl->sys)); - - bctl->flags = bargs->flags; - } else { - /* balance everything - no filters */ - bctl->flags |= BTRFS_BALANCE_TYPE_MASK; - } - -do_balance: - ret = btrfs_balance(bctl, bargs); - /* - * bctl is freed in __cancel_balance or in free_fs_info if - * restriper was paused all the way until unmount - */ - if (arg) { - if (copy_to_user(arg, bargs, sizeof(*bargs))) - ret = -EFAULT; - } - -out_bargs: - kfree(bargs); -out: - mutex_unlock(&fs_info->balance_mutex); - mutex_unlock(&fs_info->volume_mutex); - return ret; -} - -static long btrfs_ioctl_balance_ctl(struct btrfs_root *root, int cmd) -{ - if (!capable(CAP_SYS_ADMIN)) - return -EPERM; - - switch (cmd) { - case BTRFS_BALANCE_CTL_PAUSE: - return btrfs_pause_balance(root->fs_info); - case BTRFS_BALANCE_CTL_CANCEL: - return btrfs_cancel_balance(root->fs_info); - } - - return -EINVAL; -} - -static long btrfs_ioctl_balance_progress(struct btrfs_root *root, - void __user *arg) -{ - struct btrfs_fs_info *fs_info = root->fs_info; - struct btrfs_ioctl_balance_args *bargs; - int ret = 0; - - if (!capable(CAP_SYS_ADMIN)) - return -EPERM; - - mutex_lock(&fs_info->balance_mutex); - if (!fs_info->balance_ctl) { - ret = -ENOTCONN; - goto out; - } - - bargs = kzalloc(sizeof(*bargs), GFP_NOFS); - if (!bargs) { - ret = -ENOMEM; - goto out; - } - - update_ioctl_balance_args(fs_info, 1, bargs); - - if (copy_to_user(arg, bargs, sizeof(*bargs))) - ret = -EFAULT; - - kfree(bargs); -out: - mutex_unlock(&fs_info->balance_mutex); - return ret; -} - -long btrfs_ioctl(struct file *file, unsigned int - cmd, unsigned long arg) -{ - struct btrfs_root *root = BTRFS_I(fdentry(file)->d_inode)->root; - void __user *argp = (void __user *)arg; - - switch (cmd) { - case FS_IOC_GETFLAGS: - return btrfs_ioctl_getflags(file, argp); - case FS_IOC_SETFLAGS: - return btrfs_ioctl_setflags(file, argp); - case FS_IOC_GETVERSION: - return btrfs_ioctl_getversion(file, argp); - case FITRIM: - return btrfs_ioctl_fitrim(file, argp); - case BTRFS_IOC_SNAP_CREATE: - return btrfs_ioctl_snap_create(file, argp, 0); - case BTRFS_IOC_SNAP_CREATE_V2: - return btrfs_ioctl_snap_create_v2(file, argp, 0); - case BTRFS_IOC_SUBVOL_CREATE: - return btrfs_ioctl_snap_create(file, argp, 1); - case BTRFS_IOC_SNAP_DESTROY: - return btrfs_ioctl_snap_destroy(file, argp); - case BTRFS_IOC_SUBVOL_GETFLAGS: - return btrfs_ioctl_subvol_getflags(file, argp); - case BTRFS_IOC_SUBVOL_SETFLAGS: - return btrfs_ioctl_subvol_setflags(file, argp); - case BTRFS_IOC_DEFAULT_SUBVOL: - return btrfs_ioctl_default_subvol(file, argp); - case BTRFS_IOC_DEFRAG: - return btrfs_ioctl_defrag(file, NULL); - case BTRFS_IOC_DEFRAG_RANGE: - return btrfs_ioctl_defrag(file, argp); - case BTRFS_IOC_RESIZE: - return btrfs_ioctl_resize(root, argp); - case BTRFS_IOC_ADD_DEV: - return btrfs_ioctl_add_dev(root, argp); - case BTRFS_IOC_RM_DEV: - return btrfs_ioctl_rm_dev(root, argp); - case BTRFS_IOC_FS_INFO: - return btrfs_ioctl_fs_info(root, argp); - case BTRFS_IOC_DEV_INFO: - return btrfs_ioctl_dev_info(root, argp); - case BTRFS_IOC_BALANCE: - return btrfs_ioctl_balance(root, NULL); - case BTRFS_IOC_CLONE: - return btrfs_ioctl_clone(file, arg, 0, 0, 0); - case 
BTRFS_IOC_CLONE_RANGE: - return btrfs_ioctl_clone_range(file, argp); - case BTRFS_IOC_TRANS_START: - return btrfs_ioctl_trans_start(file); - case BTRFS_IOC_TRANS_END: - return btrfs_ioctl_trans_end(file); - case BTRFS_IOC_TREE_SEARCH: - return btrfs_ioctl_tree_search(file, argp); - case BTRFS_IOC_INO_LOOKUP: - return btrfs_ioctl_ino_lookup(file, argp); - case BTRFS_IOC_INO_PATHS: - return btrfs_ioctl_ino_to_path(root, argp); - case BTRFS_IOC_LOGICAL_INO: - return btrfs_ioctl_logical_to_ino(root, argp); - case BTRFS_IOC_SPACE_INFO: - return btrfs_ioctl_space_info(root, argp); - case BTRFS_IOC_SYNC: - btrfs_sync_fs(file->f_dentry->d_sb, 1); - return 0; - case BTRFS_IOC_START_SYNC: - return btrfs_ioctl_start_sync(file, argp); - case BTRFS_IOC_WAIT_SYNC: - return btrfs_ioctl_wait_sync(file, argp); - case BTRFS_IOC_SCRUB: - return btrfs_ioctl_scrub(root, argp); - case BTRFS_IOC_SCRUB_CANCEL: - return btrfs_ioctl_scrub_cancel(root, argp); - case BTRFS_IOC_SCRUB_PROGRESS: - return btrfs_ioctl_scrub_progress(root, argp); - case BTRFS_IOC_BALANCE_V2: - return btrfs_ioctl_balance(root, argp); - case BTRFS_IOC_BALANCE_CTL: - return btrfs_ioctl_balance_ctl(root, arg); - case BTRFS_IOC_BALANCE_PROGRESS: - return btrfs_ioctl_balance_progress(root, argp); - } - - return -ENOTTY; -} diff --git a/ANDROID_3.4.5/fs/btrfs/ioctl.h b/ANDROID_3.4.5/fs/btrfs/ioctl.h deleted file mode 100644 index 086e6bda..00000000 --- a/ANDROID_3.4.5/fs/btrfs/ioctl.h +++ /dev/null @@ -1,334 +0,0 @@ -/* - * Copyright (C) 2007 Oracle. All rights reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public - * License v2 as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public - * License along with this program; if not, write to the - * Free Software Foundation, Inc., 59 Temple Place - Suite 330, - * Boston, MA 021110-1307, USA. 
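[Editor's note] The switch above is the single entry point for every btrfs ioctl; userspace reaches it through ioctl(2) on any open file or directory of the mounted filesystem. A minimal sketch of driving the BTRFS_IOC_SYNC case follows (the mount path is illustrative; the request number matches the BTRFS_IOC_SYNC definition in ioctl.h further down):

#include <stdio.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/ioctl.h>

#define BTRFS_IOCTL_MAGIC 0x94
#define BTRFS_IOC_SYNC    _IO(BTRFS_IOCTL_MAGIC, 8)  /* as defined in ioctl.h */

int main(void)
{
	int fd = open("/mnt/btrfs", O_RDONLY);   /* any file or dir on the fs */

	if (fd < 0) {
		perror("open");
		return 1;
	}
	if (ioctl(fd, BTRFS_IOC_SYNC) < 0)       /* dispatches to btrfs_sync_fs() */
		perror("BTRFS_IOC_SYNC");
	close(fd);
	return 0;
}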
- */ - -#ifndef __IOCTL_ -#define __IOCTL_ -#include <linux/ioctl.h> - -#define BTRFS_IOCTL_MAGIC 0x94 -#define BTRFS_VOL_NAME_MAX 255 - -/* this should be 4k */ -#define BTRFS_PATH_NAME_MAX 4087 -struct btrfs_ioctl_vol_args { - __s64 fd; - char name[BTRFS_PATH_NAME_MAX + 1]; -}; - -#define BTRFS_SUBVOL_CREATE_ASYNC (1ULL << 0) -#define BTRFS_SUBVOL_RDONLY (1ULL << 1) -#define BTRFS_FSID_SIZE 16 -#define BTRFS_UUID_SIZE 16 - -#define BTRFS_SUBVOL_NAME_MAX 4039 -struct btrfs_ioctl_vol_args_v2 { - __s64 fd; - __u64 transid; - __u64 flags; - __u64 unused[4]; - char name[BTRFS_SUBVOL_NAME_MAX + 1]; -}; - -/* - * structure to report errors and progress to userspace, either as a - * result of a finished scrub, a canceled scrub or a progress inquiry - */ -struct btrfs_scrub_progress { - __u64 data_extents_scrubbed; /* # of data extents scrubbed */ - __u64 tree_extents_scrubbed; /* # of tree extents scrubbed */ - __u64 data_bytes_scrubbed; /* # of data bytes scrubbed */ - __u64 tree_bytes_scrubbed; /* # of tree bytes scrubbed */ - __u64 read_errors; /* # of read errors encountered (EIO) */ - __u64 csum_errors; /* # of failed csum checks */ - __u64 verify_errors; /* # of occurences, where the metadata - * of a tree block did not match the - * expected values, like generation or - * logical */ - __u64 no_csum; /* # of 4k data block for which no csum - * is present, probably the result of - * data written with nodatasum */ - __u64 csum_discards; /* # of csum for which no data was found - * in the extent tree. */ - __u64 super_errors; /* # of bad super blocks encountered */ - __u64 malloc_errors; /* # of internal kmalloc errors. These - * will likely cause an incomplete - * scrub */ - __u64 uncorrectable_errors; /* # of errors where either no intact - * copy was found or the writeback - * failed */ - __u64 corrected_errors; /* # of errors corrected */ - __u64 last_physical; /* last physical address scrubbed. In - * case a scrub was aborted, this can - * be used to restart the scrub */ - __u64 unverified_errors; /* # of occurences where a read for a - * full (64k) bio failed, but the re- - * check succeeded for each 4k piece. - * Intermittent error. 
*/ -}; - -#define BTRFS_SCRUB_READONLY 1 -struct btrfs_ioctl_scrub_args { - __u64 devid; /* in */ - __u64 start; /* in */ - __u64 end; /* in */ - __u64 flags; /* in */ - struct btrfs_scrub_progress progress; /* out */ - /* pad to 1k */ - __u64 unused[(1024-32-sizeof(struct btrfs_scrub_progress))/8]; -}; - -#define BTRFS_DEVICE_PATH_NAME_MAX 1024 -struct btrfs_ioctl_dev_info_args { - __u64 devid; /* in/out */ - __u8 uuid[BTRFS_UUID_SIZE]; /* in/out */ - __u64 bytes_used; /* out */ - __u64 total_bytes; /* out */ - __u64 unused[379]; /* pad to 4k */ - __u8 path[BTRFS_DEVICE_PATH_NAME_MAX]; /* out */ -}; - -struct btrfs_ioctl_fs_info_args { - __u64 max_id; /* out */ - __u64 num_devices; /* out */ - __u8 fsid[BTRFS_FSID_SIZE]; /* out */ - __u64 reserved[124]; /* pad to 1k */ -}; - -/* balance control ioctl modes */ -#define BTRFS_BALANCE_CTL_PAUSE 1 -#define BTRFS_BALANCE_CTL_CANCEL 2 - -/* - * this is packed, because it should be exactly the same as its disk - * byte order counterpart (struct btrfs_disk_balance_args) - */ -struct btrfs_balance_args { - __u64 profiles; - __u64 usage; - __u64 devid; - __u64 pstart; - __u64 pend; - __u64 vstart; - __u64 vend; - - __u64 target; - - __u64 flags; - - __u64 unused[8]; -} __attribute__ ((__packed__)); - -/* report balance progress to userspace */ -struct btrfs_balance_progress { - __u64 expected; /* estimated # of chunks that will be - * relocated to fulfill the request */ - __u64 considered; /* # of chunks we have considered so far */ - __u64 completed; /* # of chunks relocated so far */ -}; - -#define BTRFS_BALANCE_STATE_RUNNING (1ULL << 0) -#define BTRFS_BALANCE_STATE_PAUSE_REQ (1ULL << 1) -#define BTRFS_BALANCE_STATE_CANCEL_REQ (1ULL << 2) - -struct btrfs_ioctl_balance_args { - __u64 flags; /* in/out */ - __u64 state; /* out */ - - struct btrfs_balance_args data; /* in/out */ - struct btrfs_balance_args meta; /* in/out */ - struct btrfs_balance_args sys; /* in/out */ - - struct btrfs_balance_progress stat; /* out */ - - __u64 unused[72]; /* pad to 1k */ -}; - -#define BTRFS_INO_LOOKUP_PATH_MAX 4080 -struct btrfs_ioctl_ino_lookup_args { - __u64 treeid; - __u64 objectid; - char name[BTRFS_INO_LOOKUP_PATH_MAX]; -}; - -struct btrfs_ioctl_search_key { - /* which root are we searching. 
0 is the tree of tree roots */ - __u64 tree_id; - - /* keys returned will be >= min and <= max */ - __u64 min_objectid; - __u64 max_objectid; - - /* keys returned will be >= min and <= max */ - __u64 min_offset; - __u64 max_offset; - - /* max and min transids to search for */ - __u64 min_transid; - __u64 max_transid; - - /* keys returned will be >= min and <= max */ - __u32 min_type; - __u32 max_type; - - /* - * how many items did userland ask for, and how many are we - * returning - */ - __u32 nr_items; - - /* align to 64 bits */ - __u32 unused; - - /* some extra for later */ - __u64 unused1; - __u64 unused2; - __u64 unused3; - __u64 unused4; -}; - -struct btrfs_ioctl_search_header { - __u64 transid; - __u64 objectid; - __u64 offset; - __u32 type; - __u32 len; -}; - -#define BTRFS_SEARCH_ARGS_BUFSIZE (4096 - sizeof(struct btrfs_ioctl_search_key)) -/* - * the buf is an array of search headers where - * each header is followed by the actual item - * the type field is expanded to 32 bits for alignment - */ -struct btrfs_ioctl_search_args { - struct btrfs_ioctl_search_key key; - char buf[BTRFS_SEARCH_ARGS_BUFSIZE]; -}; - -struct btrfs_ioctl_clone_range_args { - __s64 src_fd; - __u64 src_offset, src_length; - __u64 dest_offset; -}; - -/* flags for the defrag range ioctl */ -#define BTRFS_DEFRAG_RANGE_COMPRESS 1 -#define BTRFS_DEFRAG_RANGE_START_IO 2 - -struct btrfs_ioctl_space_info { - __u64 flags; - __u64 total_bytes; - __u64 used_bytes; -}; - -struct btrfs_ioctl_space_args { - __u64 space_slots; - __u64 total_spaces; - struct btrfs_ioctl_space_info spaces[0]; -}; - -struct btrfs_data_container { - __u32 bytes_left; /* out -- bytes not needed to deliver output */ - __u32 bytes_missing; /* out -- additional bytes needed for result */ - __u32 elem_cnt; /* out */ - __u32 elem_missed; /* out */ - __u64 val[0]; /* out */ -}; - -struct btrfs_ioctl_ino_path_args { - __u64 inum; /* in */ - __u64 size; /* in */ - __u64 reserved[4]; - /* struct btrfs_data_container *fspath; out */ - __u64 fspath; /* out */ -}; - -struct btrfs_ioctl_logical_ino_args { - __u64 logical; /* in */ - __u64 size; /* in */ - __u64 reserved[4]; - /* struct btrfs_data_container *inodes; out */ - __u64 inodes; -}; - -#define BTRFS_IOC_SNAP_CREATE _IOW(BTRFS_IOCTL_MAGIC, 1, \ - struct btrfs_ioctl_vol_args) -#define BTRFS_IOC_DEFRAG _IOW(BTRFS_IOCTL_MAGIC, 2, \ - struct btrfs_ioctl_vol_args) -#define BTRFS_IOC_RESIZE _IOW(BTRFS_IOCTL_MAGIC, 3, \ - struct btrfs_ioctl_vol_args) -#define BTRFS_IOC_SCAN_DEV _IOW(BTRFS_IOCTL_MAGIC, 4, \ - struct btrfs_ioctl_vol_args) -/* trans start and trans end are dangerous, and only for - * use by applications that know how to avoid the - * resulting deadlocks - */ -#define BTRFS_IOC_TRANS_START _IO(BTRFS_IOCTL_MAGIC, 6) -#define BTRFS_IOC_TRANS_END _IO(BTRFS_IOCTL_MAGIC, 7) -#define BTRFS_IOC_SYNC _IO(BTRFS_IOCTL_MAGIC, 8) - -#define BTRFS_IOC_CLONE _IOW(BTRFS_IOCTL_MAGIC, 9, int) -#define BTRFS_IOC_ADD_DEV _IOW(BTRFS_IOCTL_MAGIC, 10, \ - struct btrfs_ioctl_vol_args) -#define BTRFS_IOC_RM_DEV _IOW(BTRFS_IOCTL_MAGIC, 11, \ - struct btrfs_ioctl_vol_args) -#define BTRFS_IOC_BALANCE _IOW(BTRFS_IOCTL_MAGIC, 12, \ - struct btrfs_ioctl_vol_args) - -#define BTRFS_IOC_CLONE_RANGE _IOW(BTRFS_IOCTL_MAGIC, 13, \ - struct btrfs_ioctl_clone_range_args) - -#define BTRFS_IOC_SUBVOL_CREATE _IOW(BTRFS_IOCTL_MAGIC, 14, \ - struct btrfs_ioctl_vol_args) -#define BTRFS_IOC_SNAP_DESTROY _IOW(BTRFS_IOCTL_MAGIC, 15, \ - struct btrfs_ioctl_vol_args) -#define BTRFS_IOC_DEFRAG_RANGE _IOW(BTRFS_IOCTL_MAGIC, 16, \ - struct 
btrfs_ioctl_defrag_range_args) -#define BTRFS_IOC_TREE_SEARCH _IOWR(BTRFS_IOCTL_MAGIC, 17, \ - struct btrfs_ioctl_search_args) -#define BTRFS_IOC_INO_LOOKUP _IOWR(BTRFS_IOCTL_MAGIC, 18, \ - struct btrfs_ioctl_ino_lookup_args) -#define BTRFS_IOC_DEFAULT_SUBVOL _IOW(BTRFS_IOCTL_MAGIC, 19, u64) -#define BTRFS_IOC_SPACE_INFO _IOWR(BTRFS_IOCTL_MAGIC, 20, \ - struct btrfs_ioctl_space_args) -#define BTRFS_IOC_START_SYNC _IOR(BTRFS_IOCTL_MAGIC, 24, __u64) -#define BTRFS_IOC_WAIT_SYNC _IOW(BTRFS_IOCTL_MAGIC, 22, __u64) -#define BTRFS_IOC_SNAP_CREATE_V2 _IOW(BTRFS_IOCTL_MAGIC, 23, \ - struct btrfs_ioctl_vol_args_v2) -#define BTRFS_IOC_SUBVOL_GETFLAGS _IOW(BTRFS_IOCTL_MAGIC, 25, __u64) -#define BTRFS_IOC_SUBVOL_SETFLAGS _IOW(BTRFS_IOCTL_MAGIC, 26, __u64) -#define BTRFS_IOC_SCRUB _IOWR(BTRFS_IOCTL_MAGIC, 27, \ - struct btrfs_ioctl_scrub_args) -#define BTRFS_IOC_SCRUB_CANCEL _IO(BTRFS_IOCTL_MAGIC, 28) -#define BTRFS_IOC_SCRUB_PROGRESS _IOWR(BTRFS_IOCTL_MAGIC, 29, \ - struct btrfs_ioctl_scrub_args) -#define BTRFS_IOC_DEV_INFO _IOWR(BTRFS_IOCTL_MAGIC, 30, \ - struct btrfs_ioctl_dev_info_args) -#define BTRFS_IOC_FS_INFO _IOR(BTRFS_IOCTL_MAGIC, 31, \ - struct btrfs_ioctl_fs_info_args) -#define BTRFS_IOC_BALANCE_V2 _IOWR(BTRFS_IOCTL_MAGIC, 32, \ - struct btrfs_ioctl_balance_args) -#define BTRFS_IOC_BALANCE_CTL _IOW(BTRFS_IOCTL_MAGIC, 33, int) -#define BTRFS_IOC_BALANCE_PROGRESS _IOR(BTRFS_IOCTL_MAGIC, 34, \ - struct btrfs_ioctl_balance_args) -#define BTRFS_IOC_INO_PATHS _IOWR(BTRFS_IOCTL_MAGIC, 35, \ - struct btrfs_ioctl_ino_path_args) -#define BTRFS_IOC_LOGICAL_INO _IOWR(BTRFS_IOCTL_MAGIC, 36, \ - struct btrfs_ioctl_ino_path_args) - -#endif diff --git a/ANDROID_3.4.5/fs/btrfs/locking.c b/ANDROID_3.4.5/fs/btrfs/locking.c deleted file mode 100644 index 272f9112..00000000 --- a/ANDROID_3.4.5/fs/btrfs/locking.c +++ /dev/null @@ -1,267 +0,0 @@ -/* - * Copyright (C) 2008 Oracle. All rights reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public - * License v2 as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public - * License along with this program; if not, write to the - * Free Software Foundation, Inc., 59 Temple Place - Suite 330, - * Boston, MA 021110-1307, USA. - */ -#include <linux/sched.h> -#include <linux/pagemap.h> -#include <linux/spinlock.h> -#include <linux/page-flags.h> -#include <asm/bug.h> -#include "ctree.h" -#include "extent_io.h" -#include "locking.h" - -void btrfs_assert_tree_read_locked(struct extent_buffer *eb); - -/* - * if we currently have a spinning reader or writer lock - * (indicated by the rw flag) this will bump the count - * of blocking holders and drop the spinlock. 
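[Editor's note] Returning briefly to the ioctl numbers just listed: BTRFS_IOC_LOGICAL_INO is the userspace side of btrfs_ioctl_logical_to_ino()/build_ino_list() shown earlier, resolving a logical byte address to (inode, offset, root) triples. A hedged sketch follows; the structures mirror ioctl.h (the header encodes the ioctl with struct btrfs_ioctl_ino_path_args, which has the same 56-byte size, so the request number is identical), and the handler requires CAP_SYS_ADMIN:

#include <stdio.h>
#include <stdlib.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <linux/types.h>

struct btrfs_data_container {
	__u32 bytes_left, bytes_missing, elem_cnt, elem_missed;
	__u64 val[];                      /* (inum, offset, root) triples */
};

struct btrfs_ioctl_logical_ino_args {
	__u64 logical;                    /* in: logical byte address */
	__u64 size;                       /* in: size of the container below */
	__u64 reserved[4];
	__u64 inodes;                     /* in: pointer to the container */
};

#define BTRFS_IOC_LOGICAL_INO _IOWR(0x94, 36, struct btrfs_ioctl_logical_ino_args)

int main(int argc, char **argv)
{
	struct btrfs_data_container *c = calloc(1, 4096);
	struct btrfs_ioctl_logical_ino_args args = { .size = 4096 };
	int fd;
	__u32 i;

	if (argc < 3 || !c)
		return 1;
	args.logical = strtoull(argv[2], NULL, 0);
	args.inodes = (__u64)(unsigned long)c;
	fd = open(argv[1], O_RDONLY);     /* any path on the btrfs mount */
	if (fd < 0 || ioctl(fd, BTRFS_IOC_LOGICAL_INO, &args) < 0) {
		perror("BTRFS_IOC_LOGICAL_INO");
		return 1;
	}
	for (i = 0; i < c->elem_cnt; i += 3)  /* triples built by build_ino_list() */
		printf("inode %llu offset %llu root %llu\n",
		       (unsigned long long)c->val[i],
		       (unsigned long long)c->val[i + 1],
		       (unsigned long long)c->val[i + 2]);
	close(fd);
	return 0;
}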
- */ -void btrfs_set_lock_blocking_rw(struct extent_buffer *eb, int rw) -{ - if (eb->lock_nested) { - read_lock(&eb->lock); - if (eb->lock_nested && current->pid == eb->lock_owner) { - read_unlock(&eb->lock); - return; - } - read_unlock(&eb->lock); - } - if (rw == BTRFS_WRITE_LOCK) { - if (atomic_read(&eb->blocking_writers) == 0) { - WARN_ON(atomic_read(&eb->spinning_writers) != 1); - atomic_dec(&eb->spinning_writers); - btrfs_assert_tree_locked(eb); - atomic_inc(&eb->blocking_writers); - write_unlock(&eb->lock); - } - } else if (rw == BTRFS_READ_LOCK) { - btrfs_assert_tree_read_locked(eb); - atomic_inc(&eb->blocking_readers); - WARN_ON(atomic_read(&eb->spinning_readers) == 0); - atomic_dec(&eb->spinning_readers); - read_unlock(&eb->lock); - } - return; -} - -/* - * if we currently have a blocking lock, take the spinlock - * and drop our blocking count - */ -void btrfs_clear_lock_blocking_rw(struct extent_buffer *eb, int rw) -{ - if (eb->lock_nested) { - read_lock(&eb->lock); - if (&eb->lock_nested && current->pid == eb->lock_owner) { - read_unlock(&eb->lock); - return; - } - read_unlock(&eb->lock); - } - if (rw == BTRFS_WRITE_LOCK_BLOCKING) { - BUG_ON(atomic_read(&eb->blocking_writers) != 1); - write_lock(&eb->lock); - WARN_ON(atomic_read(&eb->spinning_writers)); - atomic_inc(&eb->spinning_writers); - if (atomic_dec_and_test(&eb->blocking_writers)) - wake_up(&eb->write_lock_wq); - } else if (rw == BTRFS_READ_LOCK_BLOCKING) { - BUG_ON(atomic_read(&eb->blocking_readers) == 0); - read_lock(&eb->lock); - atomic_inc(&eb->spinning_readers); - if (atomic_dec_and_test(&eb->blocking_readers)) - wake_up(&eb->read_lock_wq); - } - return; -} - -/* - * take a spinning read lock. This will wait for any blocking - * writers - */ -void btrfs_tree_read_lock(struct extent_buffer *eb) -{ -again: - read_lock(&eb->lock); - if (atomic_read(&eb->blocking_writers) && - current->pid == eb->lock_owner) { - /* - * This extent is already write-locked by our thread. We allow - * an additional read lock to be added because it's for the same - * thread. btrfs_find_all_roots() depends on this as it may be - * called on a partly (write-)locked tree. 
- */ - BUG_ON(eb->lock_nested); - eb->lock_nested = 1; - read_unlock(&eb->lock); - return; - } - read_unlock(&eb->lock); - wait_event(eb->write_lock_wq, atomic_read(&eb->blocking_writers) == 0); - read_lock(&eb->lock); - if (atomic_read(&eb->blocking_writers)) { - read_unlock(&eb->lock); - goto again; - } - atomic_inc(&eb->read_locks); - atomic_inc(&eb->spinning_readers); -} - -/* - * returns 1 if we get the read lock and 0 if we don't - * this won't wait for blocking writers - */ -int btrfs_try_tree_read_lock(struct extent_buffer *eb) -{ - if (atomic_read(&eb->blocking_writers)) - return 0; - - read_lock(&eb->lock); - if (atomic_read(&eb->blocking_writers)) { - read_unlock(&eb->lock); - return 0; - } - atomic_inc(&eb->read_locks); - atomic_inc(&eb->spinning_readers); - return 1; -} - -/* - * returns 1 if we get the read lock and 0 if we don't - * this won't wait for blocking writers or readers - */ -int btrfs_try_tree_write_lock(struct extent_buffer *eb) -{ - if (atomic_read(&eb->blocking_writers) || - atomic_read(&eb->blocking_readers)) - return 0; - write_lock(&eb->lock); - if (atomic_read(&eb->blocking_writers) || - atomic_read(&eb->blocking_readers)) { - write_unlock(&eb->lock); - return 0; - } - atomic_inc(&eb->write_locks); - atomic_inc(&eb->spinning_writers); - eb->lock_owner = current->pid; - return 1; -} - -/* - * drop a spinning read lock - */ -void btrfs_tree_read_unlock(struct extent_buffer *eb) -{ - if (eb->lock_nested) { - read_lock(&eb->lock); - if (eb->lock_nested && current->pid == eb->lock_owner) { - eb->lock_nested = 0; - read_unlock(&eb->lock); - return; - } - read_unlock(&eb->lock); - } - btrfs_assert_tree_read_locked(eb); - WARN_ON(atomic_read(&eb->spinning_readers) == 0); - atomic_dec(&eb->spinning_readers); - atomic_dec(&eb->read_locks); - read_unlock(&eb->lock); -} - -/* - * drop a blocking read lock - */ -void btrfs_tree_read_unlock_blocking(struct extent_buffer *eb) -{ - if (eb->lock_nested) { - read_lock(&eb->lock); - if (eb->lock_nested && current->pid == eb->lock_owner) { - eb->lock_nested = 0; - read_unlock(&eb->lock); - return; - } - read_unlock(&eb->lock); - } - btrfs_assert_tree_read_locked(eb); - WARN_ON(atomic_read(&eb->blocking_readers) == 0); - if (atomic_dec_and_test(&eb->blocking_readers)) - wake_up(&eb->read_lock_wq); - atomic_dec(&eb->read_locks); -} - -/* - * take a spinning write lock. This will wait for both - * blocking readers or writers - */ -void btrfs_tree_lock(struct extent_buffer *eb) -{ -again: - wait_event(eb->read_lock_wq, atomic_read(&eb->blocking_readers) == 0); - wait_event(eb->write_lock_wq, atomic_read(&eb->blocking_writers) == 0); - write_lock(&eb->lock); - if (atomic_read(&eb->blocking_readers)) { - write_unlock(&eb->lock); - wait_event(eb->read_lock_wq, - atomic_read(&eb->blocking_readers) == 0); - goto again; - } - if (atomic_read(&eb->blocking_writers)) { - write_unlock(&eb->lock); - wait_event(eb->write_lock_wq, - atomic_read(&eb->blocking_writers) == 0); - goto again; - } - WARN_ON(atomic_read(&eb->spinning_writers)); - atomic_inc(&eb->spinning_writers); - atomic_inc(&eb->write_locks); - eb->lock_owner = current->pid; -} - -/* - * drop a spinning or a blocking write lock. 
- */ -void btrfs_tree_unlock(struct extent_buffer *eb) -{ - int blockers = atomic_read(&eb->blocking_writers); - - BUG_ON(blockers > 1); - - btrfs_assert_tree_locked(eb); - atomic_dec(&eb->write_locks); - - if (blockers) { - WARN_ON(atomic_read(&eb->spinning_writers)); - atomic_dec(&eb->blocking_writers); - smp_wmb(); - wake_up(&eb->write_lock_wq); - } else { - WARN_ON(atomic_read(&eb->spinning_writers) != 1); - atomic_dec(&eb->spinning_writers); - write_unlock(&eb->lock); - } -} - -void btrfs_assert_tree_locked(struct extent_buffer *eb) -{ - BUG_ON(!atomic_read(&eb->write_locks)); -} - -void btrfs_assert_tree_read_locked(struct extent_buffer *eb) -{ - BUG_ON(!atomic_read(&eb->read_locks)); -} diff --git a/ANDROID_3.4.5/fs/btrfs/locking.h b/ANDROID_3.4.5/fs/btrfs/locking.h deleted file mode 100644 index ca52681e..00000000 --- a/ANDROID_3.4.5/fs/btrfs/locking.h +++ /dev/null @@ -1,61 +0,0 @@ -/* - * Copyright (C) 2008 Oracle. All rights reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public - * License v2 as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public - * License along with this program; if not, write to the - * Free Software Foundation, Inc., 59 Temple Place - Suite 330, - * Boston, MA 021110-1307, USA. - */ - -#ifndef __BTRFS_LOCKING_ -#define __BTRFS_LOCKING_ - -#define BTRFS_WRITE_LOCK 1 -#define BTRFS_READ_LOCK 2 -#define BTRFS_WRITE_LOCK_BLOCKING 3 -#define BTRFS_READ_LOCK_BLOCKING 4 - -void btrfs_tree_lock(struct extent_buffer *eb); -void btrfs_tree_unlock(struct extent_buffer *eb); -int btrfs_try_spin_lock(struct extent_buffer *eb); - -void btrfs_tree_read_lock(struct extent_buffer *eb); -void btrfs_tree_read_unlock(struct extent_buffer *eb); -void btrfs_tree_read_unlock_blocking(struct extent_buffer *eb); -void btrfs_set_lock_blocking_rw(struct extent_buffer *eb, int rw); -void btrfs_clear_lock_blocking_rw(struct extent_buffer *eb, int rw); -void btrfs_assert_tree_locked(struct extent_buffer *eb); -int btrfs_try_tree_read_lock(struct extent_buffer *eb); -int btrfs_try_tree_write_lock(struct extent_buffer *eb); - -static inline void btrfs_tree_unlock_rw(struct extent_buffer *eb, int rw) -{ - if (rw == BTRFS_WRITE_LOCK || rw == BTRFS_WRITE_LOCK_BLOCKING) - btrfs_tree_unlock(eb); - else if (rw == BTRFS_READ_LOCK_BLOCKING) - btrfs_tree_read_unlock_blocking(eb); - else if (rw == BTRFS_READ_LOCK) - btrfs_tree_read_unlock(eb); - else - BUG(); -} - -static inline void btrfs_set_lock_blocking(struct extent_buffer *eb) -{ - btrfs_set_lock_blocking_rw(eb, BTRFS_WRITE_LOCK); -} - -static inline void btrfs_clear_lock_blocking(struct extent_buffer *eb) -{ - btrfs_clear_lock_blocking_rw(eb, BTRFS_WRITE_LOCK_BLOCKING); -} -#endif diff --git a/ANDROID_3.4.5/fs/btrfs/lzo.c b/ANDROID_3.4.5/fs/btrfs/lzo.c deleted file mode 100644 index 743b86fa..00000000 --- a/ANDROID_3.4.5/fs/btrfs/lzo.c +++ /dev/null @@ -1,427 +0,0 @@ -/* - * Copyright (C) 2008 Oracle. All rights reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public - * License v2 as published by the Free Software Foundation. 
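[Editor's note] Before the LZO code begins, it is worth showing how the locking primitives above fit together. The sketch below is illustrative only (a hypothetical caller, kernel-style and not compiled here): the usual pattern is to take the spinning write lock, switch it to blocking around anything that may sleep, switch back, and release.

/* illustrative caller, not part of this tree */
static void example_update_eb(struct extent_buffer *eb)
{
	btrfs_tree_lock(eb);            /* spinning write lock */
	btrfs_set_lock_blocking(eb);    /* about to do something that can sleep */

	/* ... blocking work, e.g. reading another tree block from disk ... */

	btrfs_clear_lock_blocking(eb);  /* back to the spinning flavour */
	btrfs_tree_unlock(eb);          /* copes with either flavour */
}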
- * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public - * License along with this program; if not, write to the - * Free Software Foundation, Inc., 59 Temple Place - Suite 330, - * Boston, MA 021110-1307, USA. - */ - -#include <linux/kernel.h> -#include <linux/slab.h> -#include <linux/vmalloc.h> -#include <linux/init.h> -#include <linux/err.h> -#include <linux/sched.h> -#include <linux/pagemap.h> -#include <linux/bio.h> -#include <linux/lzo.h> -#include "compression.h" - -#define LZO_LEN 4 - -struct workspace { - void *mem; - void *buf; /* where compressed data goes */ - void *cbuf; /* where decompressed data goes */ - struct list_head list; -}; - -static void lzo_free_workspace(struct list_head *ws) -{ - struct workspace *workspace = list_entry(ws, struct workspace, list); - - vfree(workspace->buf); - vfree(workspace->cbuf); - vfree(workspace->mem); - kfree(workspace); -} - -static struct list_head *lzo_alloc_workspace(void) -{ - struct workspace *workspace; - - workspace = kzalloc(sizeof(*workspace), GFP_NOFS); - if (!workspace) - return ERR_PTR(-ENOMEM); - - workspace->mem = vmalloc(LZO1X_MEM_COMPRESS); - workspace->buf = vmalloc(lzo1x_worst_compress(PAGE_CACHE_SIZE)); - workspace->cbuf = vmalloc(lzo1x_worst_compress(PAGE_CACHE_SIZE)); - if (!workspace->mem || !workspace->buf || !workspace->cbuf) - goto fail; - - INIT_LIST_HEAD(&workspace->list); - - return &workspace->list; -fail: - lzo_free_workspace(&workspace->list); - return ERR_PTR(-ENOMEM); -} - -static inline void write_compress_length(char *buf, size_t len) -{ - __le32 dlen; - - dlen = cpu_to_le32(len); - memcpy(buf, &dlen, LZO_LEN); -} - -static inline size_t read_compress_length(char *buf) -{ - __le32 dlen; - - memcpy(&dlen, buf, LZO_LEN); - return le32_to_cpu(dlen); -} - -static int lzo_compress_pages(struct list_head *ws, - struct address_space *mapping, - u64 start, unsigned long len, - struct page **pages, - unsigned long nr_dest_pages, - unsigned long *out_pages, - unsigned long *total_in, - unsigned long *total_out, - unsigned long max_out) -{ - struct workspace *workspace = list_entry(ws, struct workspace, list); - int ret = 0; - char *data_in; - char *cpage_out; - int nr_pages = 0; - struct page *in_page = NULL; - struct page *out_page = NULL; - unsigned long bytes_left; - - size_t in_len; - size_t out_len; - char *buf; - unsigned long tot_in = 0; - unsigned long tot_out = 0; - unsigned long pg_bytes_left; - unsigned long out_offset; - unsigned long bytes; - - *out_pages = 0; - *total_out = 0; - *total_in = 0; - - in_page = find_get_page(mapping, start >> PAGE_CACHE_SHIFT); - data_in = kmap(in_page); - - /* - * store the size of all chunks of compressed data in - * the first 4 bytes - */ - out_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM); - if (out_page == NULL) { - ret = -ENOMEM; - goto out; - } - cpage_out = kmap(out_page); - out_offset = LZO_LEN; - tot_out = LZO_LEN; - pages[0] = out_page; - nr_pages = 1; - pg_bytes_left = PAGE_CACHE_SIZE - LZO_LEN; - - /* compress at most one page of data each time */ - in_len = min(len, PAGE_CACHE_SIZE); - while (tot_in < len) { - ret = lzo1x_1_compress(data_in, in_len, workspace->cbuf, - &out_len, workspace->mem); - if (ret != LZO_E_OK) { - printk(KERN_DEBUG "btrfs deflate in loop returned %d\n", - ret); - ret = -1; - 
goto out; - } - - /* store the size of this chunk of compressed data */ - write_compress_length(cpage_out + out_offset, out_len); - tot_out += LZO_LEN; - out_offset += LZO_LEN; - pg_bytes_left -= LZO_LEN; - - tot_in += in_len; - tot_out += out_len; - - /* copy bytes from the working buffer into the pages */ - buf = workspace->cbuf; - while (out_len) { - bytes = min_t(unsigned long, pg_bytes_left, out_len); - - memcpy(cpage_out + out_offset, buf, bytes); - - out_len -= bytes; - pg_bytes_left -= bytes; - buf += bytes; - out_offset += bytes; - - /* - * we need another page for writing out. - * - * Note if there's less than 4 bytes left, we just - * skip to a new page. - */ - if ((out_len == 0 && pg_bytes_left < LZO_LEN) || - pg_bytes_left == 0) { - if (pg_bytes_left) { - memset(cpage_out + out_offset, 0, - pg_bytes_left); - tot_out += pg_bytes_left; - } - - /* we're done, don't allocate new page */ - if (out_len == 0 && tot_in >= len) - break; - - kunmap(out_page); - if (nr_pages == nr_dest_pages) { - out_page = NULL; - ret = -1; - goto out; - } - - out_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM); - if (out_page == NULL) { - ret = -ENOMEM; - goto out; - } - cpage_out = kmap(out_page); - pages[nr_pages++] = out_page; - - pg_bytes_left = PAGE_CACHE_SIZE; - out_offset = 0; - } - } - - /* we're making it bigger, give up */ - if (tot_in > 8192 && tot_in < tot_out) - goto out; - - /* we're all done */ - if (tot_in >= len) - break; - - if (tot_out > max_out) - break; - - bytes_left = len - tot_in; - kunmap(in_page); - page_cache_release(in_page); - - start += PAGE_CACHE_SIZE; - in_page = find_get_page(mapping, start >> PAGE_CACHE_SHIFT); - data_in = kmap(in_page); - in_len = min(bytes_left, PAGE_CACHE_SIZE); - } - - if (tot_out > tot_in) - goto out; - - /* store the size of all chunks of compressed data */ - cpage_out = kmap(pages[0]); - write_compress_length(cpage_out, tot_out); - - kunmap(pages[0]); - - ret = 0; - *total_out = tot_out; - *total_in = tot_in; -out: - *out_pages = nr_pages; - if (out_page) - kunmap(out_page); - - if (in_page) { - kunmap(in_page); - page_cache_release(in_page); - } - - return ret; -} - -static int lzo_decompress_biovec(struct list_head *ws, - struct page **pages_in, - u64 disk_start, - struct bio_vec *bvec, - int vcnt, - size_t srclen) -{ - struct workspace *workspace = list_entry(ws, struct workspace, list); - int ret = 0, ret2; - char *data_in; - unsigned long page_in_index = 0; - unsigned long page_out_index = 0; - unsigned long total_pages_in = (srclen + PAGE_CACHE_SIZE - 1) / - PAGE_CACHE_SIZE; - unsigned long buf_start; - unsigned long buf_offset = 0; - unsigned long bytes; - unsigned long working_bytes; - unsigned long pg_offset; - - size_t in_len; - size_t out_len; - unsigned long in_offset; - unsigned long in_page_bytes_left; - unsigned long tot_in; - unsigned long tot_out; - unsigned long tot_len; - char *buf; - bool may_late_unmap, need_unmap; - - data_in = kmap(pages_in[0]); - tot_len = read_compress_length(data_in); - - tot_in = LZO_LEN; - in_offset = LZO_LEN; - tot_len = min_t(size_t, srclen, tot_len); - in_page_bytes_left = PAGE_CACHE_SIZE - LZO_LEN; - - tot_out = 0; - pg_offset = 0; - - while (tot_in < tot_len) { - in_len = read_compress_length(data_in + in_offset); - in_page_bytes_left -= LZO_LEN; - in_offset += LZO_LEN; - tot_in += LZO_LEN; - - tot_in += in_len; - working_bytes = in_len; - may_late_unmap = need_unmap = false; - - /* fast path: avoid using the working buffer */ - if (in_page_bytes_left >= in_len) { - buf = data_in + in_offset; - bytes 
= in_len; - may_late_unmap = true; - goto cont; - } - - /* copy bytes from the pages into the working buffer */ - buf = workspace->cbuf; - buf_offset = 0; - while (working_bytes) { - bytes = min(working_bytes, in_page_bytes_left); - - memcpy(buf + buf_offset, data_in + in_offset, bytes); - buf_offset += bytes; -cont: - working_bytes -= bytes; - in_page_bytes_left -= bytes; - in_offset += bytes; - - /* check if we need to pick another page */ - if ((working_bytes == 0 && in_page_bytes_left < LZO_LEN) - || in_page_bytes_left == 0) { - tot_in += in_page_bytes_left; - - if (working_bytes == 0 && tot_in >= tot_len) - break; - - if (page_in_index + 1 >= total_pages_in) { - ret = -1; - goto done; - } - - if (may_late_unmap) - need_unmap = true; - else - kunmap(pages_in[page_in_index]); - - data_in = kmap(pages_in[++page_in_index]); - - in_page_bytes_left = PAGE_CACHE_SIZE; - in_offset = 0; - } - } - - out_len = lzo1x_worst_compress(PAGE_CACHE_SIZE); - ret = lzo1x_decompress_safe(buf, in_len, workspace->buf, - &out_len); - if (need_unmap) - kunmap(pages_in[page_in_index - 1]); - if (ret != LZO_E_OK) { - printk(KERN_WARNING "btrfs decompress failed\n"); - ret = -1; - break; - } - - buf_start = tot_out; - tot_out += out_len; - - ret2 = btrfs_decompress_buf2page(workspace->buf, buf_start, - tot_out, disk_start, - bvec, vcnt, - &page_out_index, &pg_offset); - if (ret2 == 0) - break; - } -done: - kunmap(pages_in[page_in_index]); - return ret; -} - -static int lzo_decompress(struct list_head *ws, unsigned char *data_in, - struct page *dest_page, - unsigned long start_byte, - size_t srclen, size_t destlen) -{ - struct workspace *workspace = list_entry(ws, struct workspace, list); - size_t in_len; - size_t out_len; - size_t tot_len; - int ret = 0; - char *kaddr; - unsigned long bytes; - - BUG_ON(srclen < LZO_LEN); - - tot_len = read_compress_length(data_in); - data_in += LZO_LEN; - - in_len = read_compress_length(data_in); - data_in += LZO_LEN; - - out_len = PAGE_CACHE_SIZE; - ret = lzo1x_decompress_safe(data_in, in_len, workspace->buf, &out_len); - if (ret != LZO_E_OK) { - printk(KERN_WARNING "btrfs decompress failed!\n"); - ret = -1; - goto out; - } - - if (out_len < start_byte) { - ret = -1; - goto out; - } - - bytes = min_t(unsigned long, destlen, out_len - start_byte); - - kaddr = kmap_atomic(dest_page); - memcpy(kaddr, workspace->buf + start_byte, bytes); - kunmap_atomic(kaddr); -out: - return ret; -} - -struct btrfs_compress_op btrfs_lzo_compress = { - .alloc_workspace = lzo_alloc_workspace, - .free_workspace = lzo_free_workspace, - .compress_pages = lzo_compress_pages, - .decompress_biovec = lzo_decompress_biovec, - .decompress = lzo_decompress, -}; diff --git a/ANDROID_3.4.5/fs/btrfs/ordered-data.c b/ANDROID_3.4.5/fs/btrfs/ordered-data.c deleted file mode 100644 index bbf6d0d9..00000000 --- a/ANDROID_3.4.5/fs/btrfs/ordered-data.c +++ /dev/null @@ -1,977 +0,0 @@ -/* - * Copyright (C) 2007 Oracle. All rights reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public - * License v2 as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. 
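[Editor's note] The compressed stream produced by the LZO code above uses a simple framing: a 4-byte little-endian total length at the front, then repeated (4-byte length, compressed bytes) segments, with zero padding so a length header is never left with fewer than 4 bytes at the end of a page. A small userspace sketch that walks such a buffer and prints segment sizes is given below; page-padding handling is omitted for brevity and the demonstration payload is not real LZO data, only the framing:

#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* mirrors read_compress_length(): 4-byte little-endian length */
static uint32_t read_len(const unsigned char *p)
{
	uint32_t v;

	memcpy(&v, p, 4);
	return v;            /* assumes a little-endian host for this sketch */
}

static void walk_lzo_stream(const unsigned char *buf, size_t buflen)
{
	uint32_t total = read_len(buf);   /* length of the whole stream */
	size_t off = 4;

	printf("total compressed bytes: %u\n", total);
	while (off + 4 <= total && off + 4 <= buflen) {
		uint32_t seg = read_len(buf + off);

		printf("segment at offset %zu: %u compressed bytes\n", off, seg);
		off += 4 + seg;
	}
}

int main(void)
{
	unsigned char buf[32] = { 0 };
	uint32_t total = 14, seg = 6;     /* hand-built: one 6-byte segment */

	memcpy(buf, &total, 4);
	memcpy(buf + 4, &seg, 4);
	memcpy(buf + 8, "abcdef", 6);     /* placeholder, not real LZO output */
	walk_lzo_stream(buf, sizeof(buf));
	return 0;
}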
- * - * You should have received a copy of the GNU General Public - * License along with this program; if not, write to the - * Free Software Foundation, Inc., 59 Temple Place - Suite 330, - * Boston, MA 021110-1307, USA. - */ - -#include <linux/slab.h> -#include <linux/blkdev.h> -#include <linux/writeback.h> -#include <linux/pagevec.h> -#include "ctree.h" -#include "transaction.h" -#include "btrfs_inode.h" -#include "extent_io.h" - -static u64 entry_end(struct btrfs_ordered_extent *entry) -{ - if (entry->file_offset + entry->len < entry->file_offset) - return (u64)-1; - return entry->file_offset + entry->len; -} - -/* returns NULL if the insertion worked, or it returns the node it did find - * in the tree - */ -static struct rb_node *tree_insert(struct rb_root *root, u64 file_offset, - struct rb_node *node) -{ - struct rb_node **p = &root->rb_node; - struct rb_node *parent = NULL; - struct btrfs_ordered_extent *entry; - - while (*p) { - parent = *p; - entry = rb_entry(parent, struct btrfs_ordered_extent, rb_node); - - if (file_offset < entry->file_offset) - p = &(*p)->rb_left; - else if (file_offset >= entry_end(entry)) - p = &(*p)->rb_right; - else - return parent; - } - - rb_link_node(node, parent, p); - rb_insert_color(node, root); - return NULL; -} - -static void ordered_data_tree_panic(struct inode *inode, int errno, - u64 offset) -{ - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); - btrfs_panic(fs_info, errno, "Inconsistency in ordered tree at offset " - "%llu\n", (unsigned long long)offset); -} - -/* - * look for a given offset in the tree, and if it can't be found return the - * first lesser offset - */ -static struct rb_node *__tree_search(struct rb_root *root, u64 file_offset, - struct rb_node **prev_ret) -{ - struct rb_node *n = root->rb_node; - struct rb_node *prev = NULL; - struct rb_node *test; - struct btrfs_ordered_extent *entry; - struct btrfs_ordered_extent *prev_entry = NULL; - - while (n) { - entry = rb_entry(n, struct btrfs_ordered_extent, rb_node); - prev = n; - prev_entry = entry; - - if (file_offset < entry->file_offset) - n = n->rb_left; - else if (file_offset >= entry_end(entry)) - n = n->rb_right; - else - return n; - } - if (!prev_ret) - return NULL; - - while (prev && file_offset >= entry_end(prev_entry)) { - test = rb_next(prev); - if (!test) - break; - prev_entry = rb_entry(test, struct btrfs_ordered_extent, - rb_node); - if (file_offset < entry_end(prev_entry)) - break; - - prev = test; - } - if (prev) - prev_entry = rb_entry(prev, struct btrfs_ordered_extent, - rb_node); - while (prev && file_offset < entry_end(prev_entry)) { - test = rb_prev(prev); - if (!test) - break; - prev_entry = rb_entry(test, struct btrfs_ordered_extent, - rb_node); - prev = test; - } - *prev_ret = prev; - return NULL; -} - -/* - * helper to check if a given offset is inside a given entry - */ -static int offset_in_entry(struct btrfs_ordered_extent *entry, u64 file_offset) -{ - if (file_offset < entry->file_offset || - entry->file_offset + entry->len <= file_offset) - return 0; - return 1; -} - -static int range_overlaps(struct btrfs_ordered_extent *entry, u64 file_offset, - u64 len) -{ - if (file_offset + len <= entry->file_offset || - entry->file_offset + entry->len <= file_offset) - return 0; - return 1; -} - -/* - * look find the first ordered struct that has this offset, otherwise - * the first one less than this offset - */ -static inline struct rb_node *tree_search(struct btrfs_ordered_inode_tree *tree, - u64 file_offset) -{ - struct rb_root *root = &tree->tree; - 
struct rb_node *prev = NULL; - struct rb_node *ret; - struct btrfs_ordered_extent *entry; - - if (tree->last) { - entry = rb_entry(tree->last, struct btrfs_ordered_extent, - rb_node); - if (offset_in_entry(entry, file_offset)) - return tree->last; - } - ret = __tree_search(root, file_offset, &prev); - if (!ret) - ret = prev; - if (ret) - tree->last = ret; - return ret; -} - -/* allocate and add a new ordered_extent into the per-inode tree. - * file_offset is the logical offset in the file - * - * start is the disk block number of an extent already reserved in the - * extent allocation tree - * - * len is the length of the extent - * - * The tree is given a single reference on the ordered extent that was - * inserted. - */ -static int __btrfs_add_ordered_extent(struct inode *inode, u64 file_offset, - u64 start, u64 len, u64 disk_len, - int type, int dio, int compress_type) -{ - struct btrfs_ordered_inode_tree *tree; - struct rb_node *node; - struct btrfs_ordered_extent *entry; - - tree = &BTRFS_I(inode)->ordered_tree; - entry = kzalloc(sizeof(*entry), GFP_NOFS); - if (!entry) - return -ENOMEM; - - entry->file_offset = file_offset; - entry->start = start; - entry->len = len; - entry->disk_len = disk_len; - entry->bytes_left = len; - entry->inode = inode; - entry->compress_type = compress_type; - if (type != BTRFS_ORDERED_IO_DONE && type != BTRFS_ORDERED_COMPLETE) - set_bit(type, &entry->flags); - - if (dio) - set_bit(BTRFS_ORDERED_DIRECT, &entry->flags); - - /* one ref for the tree */ - atomic_set(&entry->refs, 1); - init_waitqueue_head(&entry->wait); - INIT_LIST_HEAD(&entry->list); - INIT_LIST_HEAD(&entry->root_extent_list); - - trace_btrfs_ordered_extent_add(inode, entry); - - spin_lock(&tree->lock); - node = tree_insert(&tree->tree, file_offset, - &entry->rb_node); - if (node) - ordered_data_tree_panic(inode, -EEXIST, file_offset); - spin_unlock(&tree->lock); - - spin_lock(&BTRFS_I(inode)->root->fs_info->ordered_extent_lock); - list_add_tail(&entry->root_extent_list, - &BTRFS_I(inode)->root->fs_info->ordered_extents); - spin_unlock(&BTRFS_I(inode)->root->fs_info->ordered_extent_lock); - - return 0; -} - -int btrfs_add_ordered_extent(struct inode *inode, u64 file_offset, - u64 start, u64 len, u64 disk_len, int type) -{ - return __btrfs_add_ordered_extent(inode, file_offset, start, len, - disk_len, type, 0, - BTRFS_COMPRESS_NONE); -} - -int btrfs_add_ordered_extent_dio(struct inode *inode, u64 file_offset, - u64 start, u64 len, u64 disk_len, int type) -{ - return __btrfs_add_ordered_extent(inode, file_offset, start, len, - disk_len, type, 1, - BTRFS_COMPRESS_NONE); -} - -int btrfs_add_ordered_extent_compress(struct inode *inode, u64 file_offset, - u64 start, u64 len, u64 disk_len, - int type, int compress_type) -{ - return __btrfs_add_ordered_extent(inode, file_offset, start, len, - disk_len, type, 0, - compress_type); -} - -/* - * Add a struct btrfs_ordered_sum into the list of checksums to be inserted - * when an ordered extent is finished. If the list covers more than one - * ordered extent, it is split across multiples. - */ -void btrfs_add_ordered_sum(struct inode *inode, - struct btrfs_ordered_extent *entry, - struct btrfs_ordered_sum *sum) -{ - struct btrfs_ordered_inode_tree *tree; - - tree = &BTRFS_I(inode)->ordered_tree; - spin_lock(&tree->lock); - list_add_tail(&sum->list, &entry->list); - spin_unlock(&tree->lock); -} - -/* - * this is used to account for finished IO across a given range - * of the file. The IO may span ordered extents. 
If - * a given ordered_extent is completely done, 1 is returned, otherwise - * 0. - * - * test_and_set_bit on a flag in the struct btrfs_ordered_extent is used - * to make sure this function only returns 1 once for a given ordered extent. - * - * file_offset is updated to one byte past the range that is recorded as - * complete. This allows you to walk forward in the file. - */ -int btrfs_dec_test_first_ordered_pending(struct inode *inode, - struct btrfs_ordered_extent **cached, - u64 *file_offset, u64 io_size) -{ - struct btrfs_ordered_inode_tree *tree; - struct rb_node *node; - struct btrfs_ordered_extent *entry = NULL; - int ret; - u64 dec_end; - u64 dec_start; - u64 to_dec; - - tree = &BTRFS_I(inode)->ordered_tree; - spin_lock(&tree->lock); - node = tree_search(tree, *file_offset); - if (!node) { - ret = 1; - goto out; - } - - entry = rb_entry(node, struct btrfs_ordered_extent, rb_node); - if (!offset_in_entry(entry, *file_offset)) { - ret = 1; - goto out; - } - - dec_start = max(*file_offset, entry->file_offset); - dec_end = min(*file_offset + io_size, entry->file_offset + - entry->len); - *file_offset = dec_end; - if (dec_start > dec_end) { - printk(KERN_CRIT "bad ordering dec_start %llu end %llu\n", - (unsigned long long)dec_start, - (unsigned long long)dec_end); - } - to_dec = dec_end - dec_start; - if (to_dec > entry->bytes_left) { - printk(KERN_CRIT "bad ordered accounting left %llu size %llu\n", - (unsigned long long)entry->bytes_left, - (unsigned long long)to_dec); - } - entry->bytes_left -= to_dec; - if (entry->bytes_left == 0) - ret = test_and_set_bit(BTRFS_ORDERED_IO_DONE, &entry->flags); - else - ret = 1; -out: - if (!ret && cached && entry) { - *cached = entry; - atomic_inc(&entry->refs); - } - spin_unlock(&tree->lock); - return ret == 0; -} - -/* - * this is used to account for finished IO across a given range - * of the file. The IO should not span ordered extents. If - * a given ordered_extent is completely done, 1 is returned, otherwise - * 0. - * - * test_and_set_bit on a flag in the struct btrfs_ordered_extent is used - * to make sure this function only returns 1 once for a given ordered extent. - */ -int btrfs_dec_test_ordered_pending(struct inode *inode, - struct btrfs_ordered_extent **cached, - u64 file_offset, u64 io_size) -{ - struct btrfs_ordered_inode_tree *tree; - struct rb_node *node; - struct btrfs_ordered_extent *entry = NULL; - int ret; - - tree = &BTRFS_I(inode)->ordered_tree; - spin_lock(&tree->lock); - node = tree_search(tree, file_offset); - if (!node) { - ret = 1; - goto out; - } - - entry = rb_entry(node, struct btrfs_ordered_extent, rb_node); - if (!offset_in_entry(entry, file_offset)) { - ret = 1; - goto out; - } - - if (io_size > entry->bytes_left) { - printk(KERN_CRIT "bad ordered accounting left %llu size %llu\n", - (unsigned long long)entry->bytes_left, - (unsigned long long)io_size); - } - entry->bytes_left -= io_size; - if (entry->bytes_left == 0) - ret = test_and_set_bit(BTRFS_ORDERED_IO_DONE, &entry->flags); - else - ret = 1; -out: - if (!ret && cached && entry) { - *cached = entry; - atomic_inc(&entry->refs); - } - spin_unlock(&tree->lock); - return ret == 0; -} - -/* - * used to drop a reference on an ordered extent. 
This will free - * the extent if the last reference is dropped - */ -void btrfs_put_ordered_extent(struct btrfs_ordered_extent *entry) -{ - struct list_head *cur; - struct btrfs_ordered_sum *sum; - - trace_btrfs_ordered_extent_put(entry->inode, entry); - - if (atomic_dec_and_test(&entry->refs)) { - while (!list_empty(&entry->list)) { - cur = entry->list.next; - sum = list_entry(cur, struct btrfs_ordered_sum, list); - list_del(&sum->list); - kfree(sum); - } - kfree(entry); - } -} - -/* - * remove an ordered extent from the tree. No references are dropped - * and you must wake_up entry->wait. You must hold the tree lock - * while you call this function. - */ -static void __btrfs_remove_ordered_extent(struct inode *inode, - struct btrfs_ordered_extent *entry) -{ - struct btrfs_ordered_inode_tree *tree; - struct btrfs_root *root = BTRFS_I(inode)->root; - struct rb_node *node; - - tree = &BTRFS_I(inode)->ordered_tree; - node = &entry->rb_node; - rb_erase(node, &tree->tree); - tree->last = NULL; - set_bit(BTRFS_ORDERED_COMPLETE, &entry->flags); - - spin_lock(&root->fs_info->ordered_extent_lock); - list_del_init(&entry->root_extent_list); - - trace_btrfs_ordered_extent_remove(inode, entry); - - /* - * we have no more ordered extents for this inode and - * no dirty pages. We can safely remove it from the - * list of ordered extents - */ - if (RB_EMPTY_ROOT(&tree->tree) && - !mapping_tagged(inode->i_mapping, PAGECACHE_TAG_DIRTY)) { - list_del_init(&BTRFS_I(inode)->ordered_operations); - } - spin_unlock(&root->fs_info->ordered_extent_lock); -} - -/* - * remove an ordered extent from the tree. No references are dropped - * but any waiters are woken. - */ -void btrfs_remove_ordered_extent(struct inode *inode, - struct btrfs_ordered_extent *entry) -{ - struct btrfs_ordered_inode_tree *tree; - - tree = &BTRFS_I(inode)->ordered_tree; - spin_lock(&tree->lock); - __btrfs_remove_ordered_extent(inode, entry); - spin_unlock(&tree->lock); - wake_up(&entry->wait); -} - -/* - * wait for all the ordered extents in a root. This is done when balancing - * space between drives. - */ -void btrfs_wait_ordered_extents(struct btrfs_root *root, - int nocow_only, int delay_iput) -{ - struct list_head splice; - struct list_head *cur; - struct btrfs_ordered_extent *ordered; - struct inode *inode; - - INIT_LIST_HEAD(&splice); - - spin_lock(&root->fs_info->ordered_extent_lock); - list_splice_init(&root->fs_info->ordered_extents, &splice); - while (!list_empty(&splice)) { - cur = splice.next; - ordered = list_entry(cur, struct btrfs_ordered_extent, - root_extent_list); - if (nocow_only && - !test_bit(BTRFS_ORDERED_NOCOW, &ordered->flags) && - !test_bit(BTRFS_ORDERED_PREALLOC, &ordered->flags)) { - list_move(&ordered->root_extent_list, - &root->fs_info->ordered_extents); - cond_resched_lock(&root->fs_info->ordered_extent_lock); - continue; - } - - list_del_init(&ordered->root_extent_list); - atomic_inc(&ordered->refs); - - /* - * the inode may be getting freed (in sys_unlink path). - */ - inode = igrab(ordered->inode); - - spin_unlock(&root->fs_info->ordered_extent_lock); - - if (inode) { - btrfs_start_ordered_extent(inode, ordered, 1); - btrfs_put_ordered_extent(ordered); - if (delay_iput) - btrfs_add_delayed_iput(inode); - else - iput(inode); - } else { - btrfs_put_ordered_extent(ordered); - } - - spin_lock(&root->fs_info->ordered_extent_lock); - } - spin_unlock(&root->fs_info->ordered_extent_lock); -} - -/* - * this is used during transaction commit to write all the inodes - * added to the ordered operation list. 
These files must be fully on - * disk before the transaction commits. - * - * we have two modes here, one is to just start the IO via filemap_flush - * and the other is to wait for all the io. When we wait, we have an - * extra check to make sure the ordered operation list really is empty - * before we return - */ -void btrfs_run_ordered_operations(struct btrfs_root *root, int wait) -{ - struct btrfs_inode *btrfs_inode; - struct inode *inode; - struct list_head splice; - - INIT_LIST_HEAD(&splice); - - mutex_lock(&root->fs_info->ordered_operations_mutex); - spin_lock(&root->fs_info->ordered_extent_lock); -again: - list_splice_init(&root->fs_info->ordered_operations, &splice); - - while (!list_empty(&splice)) { - btrfs_inode = list_entry(splice.next, struct btrfs_inode, - ordered_operations); - - inode = &btrfs_inode->vfs_inode; - - list_del_init(&btrfs_inode->ordered_operations); - - /* - * the inode may be getting freed (in sys_unlink path). - */ - inode = igrab(inode); - - if (!wait && inode) { - list_add_tail(&BTRFS_I(inode)->ordered_operations, - &root->fs_info->ordered_operations); - } - spin_unlock(&root->fs_info->ordered_extent_lock); - - if (inode) { - if (wait) - btrfs_wait_ordered_range(inode, 0, (u64)-1); - else - filemap_flush(inode->i_mapping); - btrfs_add_delayed_iput(inode); - } - - cond_resched(); - spin_lock(&root->fs_info->ordered_extent_lock); - } - if (wait && !list_empty(&root->fs_info->ordered_operations)) - goto again; - - spin_unlock(&root->fs_info->ordered_extent_lock); - mutex_unlock(&root->fs_info->ordered_operations_mutex); -} - -/* - * Used to start IO or wait for a given ordered extent to finish. - * - * If wait is one, this effectively waits on page writeback for all the pages - * in the extent, and it waits on the io completion code to insert - * metadata into the btree corresponding to the extent - */ -void btrfs_start_ordered_extent(struct inode *inode, - struct btrfs_ordered_extent *entry, - int wait) -{ - u64 start = entry->file_offset; - u64 end = start + entry->len - 1; - - trace_btrfs_ordered_extent_start(inode, entry); - - /* - * pages in the range can be dirty, clean or writeback. We - * start IO on any dirty ones so the wait doesn't stall waiting - * for pdflush to find them - */ - if (!test_bit(BTRFS_ORDERED_DIRECT, &entry->flags)) - filemap_fdatawrite_range(inode->i_mapping, start, end); - if (wait) { - wait_event(entry->wait, test_bit(BTRFS_ORDERED_COMPLETE, - &entry->flags)); - } -} - -/* - * Used to wait on ordered extents across a large range of bytes. - */ -void btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len) -{ - u64 end; - u64 orig_end; - struct btrfs_ordered_extent *ordered; - int found; - - if (start + len < start) { - orig_end = INT_LIMIT(loff_t); - } else { - orig_end = start + len - 1; - if (orig_end > INT_LIMIT(loff_t)) - orig_end = INT_LIMIT(loff_t); - } -again: - /* start IO across the range first to instantiate any delalloc - * extents - */ - filemap_fdatawrite_range(inode->i_mapping, start, orig_end); - - /* The compression code will leave pages locked but return from - * writepage without setting the page writeback. Starting again - * with WB_SYNC_ALL will end up waiting for the IO to actually start. 
- */ - filemap_fdatawrite_range(inode->i_mapping, start, orig_end); - - filemap_fdatawait_range(inode->i_mapping, start, orig_end); - - end = orig_end; - found = 0; - while (1) { - ordered = btrfs_lookup_first_ordered_extent(inode, end); - if (!ordered) - break; - if (ordered->file_offset > orig_end) { - btrfs_put_ordered_extent(ordered); - break; - } - if (ordered->file_offset + ordered->len < start) { - btrfs_put_ordered_extent(ordered); - break; - } - found++; - btrfs_start_ordered_extent(inode, ordered, 1); - end = ordered->file_offset; - btrfs_put_ordered_extent(ordered); - if (end == 0 || end == start) - break; - end--; - } - if (found || test_range_bit(&BTRFS_I(inode)->io_tree, start, orig_end, - EXTENT_DELALLOC, 0, NULL)) { - schedule_timeout(1); - goto again; - } -} - -/* - * find an ordered extent corresponding to file_offset. return NULL if - * nothing is found, otherwise take a reference on the extent and return it - */ -struct btrfs_ordered_extent *btrfs_lookup_ordered_extent(struct inode *inode, - u64 file_offset) -{ - struct btrfs_ordered_inode_tree *tree; - struct rb_node *node; - struct btrfs_ordered_extent *entry = NULL; - - tree = &BTRFS_I(inode)->ordered_tree; - spin_lock(&tree->lock); - node = tree_search(tree, file_offset); - if (!node) - goto out; - - entry = rb_entry(node, struct btrfs_ordered_extent, rb_node); - if (!offset_in_entry(entry, file_offset)) - entry = NULL; - if (entry) - atomic_inc(&entry->refs); -out: - spin_unlock(&tree->lock); - return entry; -} - -/* Since the DIO code tries to lock a wide area we need to look for any ordered - * extents that exist in the range, rather than just the start of the range. - */ -struct btrfs_ordered_extent *btrfs_lookup_ordered_range(struct inode *inode, - u64 file_offset, - u64 len) -{ - struct btrfs_ordered_inode_tree *tree; - struct rb_node *node; - struct btrfs_ordered_extent *entry = NULL; - - tree = &BTRFS_I(inode)->ordered_tree; - spin_lock(&tree->lock); - node = tree_search(tree, file_offset); - if (!node) { - node = tree_search(tree, file_offset + len); - if (!node) - goto out; - } - - while (1) { - entry = rb_entry(node, struct btrfs_ordered_extent, rb_node); - if (range_overlaps(entry, file_offset, len)) - break; - - if (entry->file_offset >= file_offset + len) { - entry = NULL; - break; - } - entry = NULL; - node = rb_next(node); - if (!node) - break; - } -out: - if (entry) - atomic_inc(&entry->refs); - spin_unlock(&tree->lock); - return entry; -} - -/* - * lookup and return any extent before 'file_offset'. NULL is returned - * if none is found - */ -struct btrfs_ordered_extent * -btrfs_lookup_first_ordered_extent(struct inode *inode, u64 file_offset) -{ - struct btrfs_ordered_inode_tree *tree; - struct rb_node *node; - struct btrfs_ordered_extent *entry = NULL; - - tree = &BTRFS_I(inode)->ordered_tree; - spin_lock(&tree->lock); - node = tree_search(tree, file_offset); - if (!node) - goto out; - - entry = rb_entry(node, struct btrfs_ordered_extent, rb_node); - atomic_inc(&entry->refs); -out: - spin_unlock(&tree->lock); - return entry; -} - -/* - * After an extent is done, call this to conditionally update the on disk - * i_size. i_size is updated to cover any fully written part of the file. 
- */ -int btrfs_ordered_update_i_size(struct inode *inode, u64 offset, - struct btrfs_ordered_extent *ordered) -{ - struct btrfs_ordered_inode_tree *tree = &BTRFS_I(inode)->ordered_tree; - struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; - u64 disk_i_size; - u64 new_i_size; - u64 i_size_test; - u64 i_size = i_size_read(inode); - struct rb_node *node; - struct rb_node *prev = NULL; - struct btrfs_ordered_extent *test; - int ret = 1; - - if (ordered) - offset = entry_end(ordered); - else - offset = ALIGN(offset, BTRFS_I(inode)->root->sectorsize); - - spin_lock(&tree->lock); - disk_i_size = BTRFS_I(inode)->disk_i_size; - - /* truncate file */ - if (disk_i_size > i_size) { - BTRFS_I(inode)->disk_i_size = i_size; - ret = 0; - goto out; - } - - /* - * if the disk i_size is already at the inode->i_size, or - * this ordered extent is inside the disk i_size, we're done - */ - if (disk_i_size == i_size || offset <= disk_i_size) { - goto out; - } - - /* - * we can't update the disk_isize if there are delalloc bytes - * between disk_i_size and this ordered extent - */ - if (test_range_bit(io_tree, disk_i_size, offset - 1, - EXTENT_DELALLOC, 0, NULL)) { - goto out; - } - /* - * walk backward from this ordered extent to disk_i_size. - * if we find an ordered extent then we can't update disk i_size - * yet - */ - if (ordered) { - node = rb_prev(&ordered->rb_node); - } else { - prev = tree_search(tree, offset); - /* - * we insert file extents without involving ordered struct, - * so there should be no ordered struct cover this offset - */ - if (prev) { - test = rb_entry(prev, struct btrfs_ordered_extent, - rb_node); - BUG_ON(offset_in_entry(test, offset)); - } - node = prev; - } - while (node) { - test = rb_entry(node, struct btrfs_ordered_extent, rb_node); - if (test->file_offset + test->len <= disk_i_size) - break; - if (test->file_offset >= i_size) - break; - if (test->file_offset >= disk_i_size) - goto out; - node = rb_prev(node); - } - new_i_size = min_t(u64, offset, i_size); - - /* - * at this point, we know we can safely update i_size to at least - * the offset from this ordered extent. But, we need to - * walk forward and see if ios from higher up in the file have - * finished. - */ - if (ordered) { - node = rb_next(&ordered->rb_node); - } else { - if (prev) - node = rb_next(prev); - else - node = rb_first(&tree->tree); - } - i_size_test = 0; - if (node) { - /* - * do we have an area where IO might have finished - * between our ordered extent and the next one. - */ - test = rb_entry(node, struct btrfs_ordered_extent, rb_node); - if (test->file_offset > offset) - i_size_test = test->file_offset; - } else { - i_size_test = i_size; - } - - /* - * i_size_test is the end of a region after this ordered - * extent where there are no ordered extents. As long as there - * are no delalloc bytes in this area, it is safe to update - * disk_i_size to the end of the region. 
- */ - if (i_size_test > offset && - !test_range_bit(io_tree, offset, i_size_test - 1, - EXTENT_DELALLOC, 0, NULL)) { - new_i_size = min_t(u64, i_size_test, i_size); - } - BTRFS_I(inode)->disk_i_size = new_i_size; - ret = 0; -out: - /* - * we need to remove the ordered extent with the tree lock held - * so that other people calling this function don't find our fully - * processed ordered entry and skip updating the i_size - */ - if (ordered) - __btrfs_remove_ordered_extent(inode, ordered); - spin_unlock(&tree->lock); - if (ordered) - wake_up(&ordered->wait); - return ret; -} - -/* - * search the ordered extents for one corresponding to 'offset' and - * try to find a checksum. This is used because we allow pages to - * be reclaimed before their checksum is actually put into the btree - */ -int btrfs_find_ordered_sum(struct inode *inode, u64 offset, u64 disk_bytenr, - u32 *sum) -{ - struct btrfs_ordered_sum *ordered_sum; - struct btrfs_sector_sum *sector_sums; - struct btrfs_ordered_extent *ordered; - struct btrfs_ordered_inode_tree *tree = &BTRFS_I(inode)->ordered_tree; - unsigned long num_sectors; - unsigned long i; - u32 sectorsize = BTRFS_I(inode)->root->sectorsize; - int ret = 1; - - ordered = btrfs_lookup_ordered_extent(inode, offset); - if (!ordered) - return 1; - - spin_lock(&tree->lock); - list_for_each_entry_reverse(ordered_sum, &ordered->list, list) { - if (disk_bytenr >= ordered_sum->bytenr) { - num_sectors = ordered_sum->len / sectorsize; - sector_sums = ordered_sum->sums; - for (i = 0; i < num_sectors; i++) { - if (sector_sums[i].bytenr == disk_bytenr) { - *sum = sector_sums[i].sum; - ret = 0; - goto out; - } - } - } - } -out: - spin_unlock(&tree->lock); - btrfs_put_ordered_extent(ordered); - return ret; -} - - -/* - * add a given inode to the list of inodes that must be fully on - * disk before a transaction commit finishes. - * - * This basically gives us the ext3 style data=ordered mode, and it is mostly - * used to make sure renamed files are fully on disk. - * - * It is a noop if the inode is already fully on disk. - * - * If trans is not null, we'll do a friendly check for a transaction that - * is already flushing things and force the IO down ourselves. - */ -void btrfs_add_ordered_operation(struct btrfs_trans_handle *trans, - struct btrfs_root *root, struct inode *inode) -{ - u64 last_mod; - - last_mod = max(BTRFS_I(inode)->generation, BTRFS_I(inode)->last_trans); - - /* - * if this file hasn't been changed since the last transaction - * commit, we can safely return without doing anything - */ - if (last_mod < root->fs_info->last_trans_committed) - return; - - /* - * the transaction is already committing. Just start the IO and - * don't bother with all of this list nonsense - */ - if (trans && root->fs_info->running_transaction->blocked) { - btrfs_wait_ordered_range(inode, 0, (u64)-1); - return; - } - - spin_lock(&root->fs_info->ordered_extent_lock); - if (list_empty(&BTRFS_I(inode)->ordered_operations)) { - list_add_tail(&BTRFS_I(inode)->ordered_operations, - &root->fs_info->ordered_operations); - } - spin_unlock(&root->fs_info->ordered_extent_lock); -} diff --git a/ANDROID_3.4.5/fs/btrfs/ordered-data.h b/ANDROID_3.4.5/fs/btrfs/ordered-data.h deleted file mode 100644 index c355ad4d..00000000 --- a/ANDROID_3.4.5/fs/btrfs/ordered-data.h +++ /dev/null @@ -1,179 +0,0 @@ -/* - * Copyright (C) 2007 Oracle. All rights reserved. 
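[Editor's note] Taken together, the ordered-data functions above define a small lifecycle. The kernel-style sketch below is illustrative only (a hypothetical caller, not code from this tree) and follows the reference rules stated in the comments: the tree holds one reference from insertion, and the *cached pointer filled in by the dec_test helper holds another.

/* illustrative only: hypothetical caller showing the typical flow */
static void example_ordered_lifecycle(struct inode *inode, u64 file_offset,
				      u64 disk_start, u64 len)
{
	struct btrfs_ordered_extent *ordered = NULL;

	/* write submission: record the pending extent (type 0 sets no flag bit) */
	btrfs_add_ordered_extent(inode, file_offset, disk_start, len, len, 0);

	/* ... bios complete; end-io accounting for the finished bytes ... */
	if (btrfs_dec_test_ordered_pending(inode, &ordered, file_offset, len)) {
		/* whole extent done: roll disk_i_size forward; this also
		 * removes the entry from the tree */
		btrfs_ordered_update_i_size(inode, 0, ordered);

		btrfs_put_ordered_extent(ordered);  /* ref taken for *cached */
		btrfs_put_ordered_extent(ordered);  /* ref held by the tree */
	}
}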
- * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public - * License v2 as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public - * License along with this program; if not, write to the - * Free Software Foundation, Inc., 59 Temple Place - Suite 330, - * Boston, MA 021110-1307, USA. - */ - -#ifndef __BTRFS_ORDERED_DATA__ -#define __BTRFS_ORDERED_DATA__ - -/* one of these per inode */ -struct btrfs_ordered_inode_tree { - spinlock_t lock; - struct rb_root tree; - struct rb_node *last; -}; - -/* - * these are used to collect checksums done just before bios submission. - * They are attached via a list into the ordered extent, and - * checksum items are inserted into the tree after all the blocks in - * the ordered extent are on disk - */ -struct btrfs_sector_sum { - /* bytenr on disk */ - u64 bytenr; - u32 sum; -}; - -struct btrfs_ordered_sum { - /* bytenr is the start of this extent on disk */ - u64 bytenr; - - /* - * this is the length in bytes covered by the sums array below. - */ - unsigned long len; - struct list_head list; - /* last field is a variable length array of btrfs_sector_sums */ - struct btrfs_sector_sum sums[]; -}; - -/* - * bits for the flags field: - * - * BTRFS_ORDERED_IO_DONE is set when all of the blocks are written. - * It is used to make sure metadata is inserted into the tree only once - * per extent. - * - * BTRFS_ORDERED_COMPLETE is set when the extent is removed from the - * rbtree, just before waking any waiters. It is used to indicate the - * IO is done and any metadata is inserted into the tree. 
- */ -#define BTRFS_ORDERED_IO_DONE 0 /* set when all the pages are written */ - -#define BTRFS_ORDERED_COMPLETE 1 /* set when removed from the tree */ - -#define BTRFS_ORDERED_NOCOW 2 /* set when we want to write in place */ - -#define BTRFS_ORDERED_COMPRESSED 3 /* writing a zlib compressed extent */ - -#define BTRFS_ORDERED_PREALLOC 4 /* set when writing to prealloced extent */ - -#define BTRFS_ORDERED_DIRECT 5 /* set when we're doing DIO with this extent */ - -struct btrfs_ordered_extent { - /* logical offset in the file */ - u64 file_offset; - - /* disk byte number */ - u64 start; - - /* ram length of the extent in bytes */ - u64 len; - - /* extent length on disk */ - u64 disk_len; - - /* number of bytes that still need writing */ - u64 bytes_left; - - /* flags (described above) */ - unsigned long flags; - - /* compression algorithm */ - int compress_type; - - /* reference count */ - atomic_t refs; - - /* the inode we belong to */ - struct inode *inode; - - /* list of checksums for insertion when the extent io is done */ - struct list_head list; - - /* used to wait for the BTRFS_ORDERED_COMPLETE bit */ - wait_queue_head_t wait; - - /* our friendly rbtree entry */ - struct rb_node rb_node; - - /* a per root list of all the pending ordered extents */ - struct list_head root_extent_list; -}; - - -/* - * calculates the total size you need to allocate for an ordered sum - * structure spanning 'bytes' in the file - */ -static inline int btrfs_ordered_sum_size(struct btrfs_root *root, - unsigned long bytes) -{ - unsigned long num_sectors = (bytes + root->sectorsize - 1) / - root->sectorsize; - num_sectors++; - return sizeof(struct btrfs_ordered_sum) + - num_sectors * sizeof(struct btrfs_sector_sum); -} - -static inline void -btrfs_ordered_inode_tree_init(struct btrfs_ordered_inode_tree *t) -{ - spin_lock_init(&t->lock); - t->tree = RB_ROOT; - t->last = NULL; -} - -void btrfs_put_ordered_extent(struct btrfs_ordered_extent *entry); -void btrfs_remove_ordered_extent(struct inode *inode, - struct btrfs_ordered_extent *entry); -int btrfs_dec_test_ordered_pending(struct inode *inode, - struct btrfs_ordered_extent **cached, - u64 file_offset, u64 io_size); -int btrfs_dec_test_first_ordered_pending(struct inode *inode, - struct btrfs_ordered_extent **cached, - u64 *file_offset, u64 io_size); -int btrfs_add_ordered_extent(struct inode *inode, u64 file_offset, - u64 start, u64 len, u64 disk_len, int type); -int btrfs_add_ordered_extent_dio(struct inode *inode, u64 file_offset, - u64 start, u64 len, u64 disk_len, int type); -int btrfs_add_ordered_extent_compress(struct inode *inode, u64 file_offset, - u64 start, u64 len, u64 disk_len, - int type, int compress_type); -void btrfs_add_ordered_sum(struct inode *inode, - struct btrfs_ordered_extent *entry, - struct btrfs_ordered_sum *sum); -struct btrfs_ordered_extent *btrfs_lookup_ordered_extent(struct inode *inode, - u64 file_offset); -void btrfs_start_ordered_extent(struct inode *inode, - struct btrfs_ordered_extent *entry, int wait); -void btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len); -struct btrfs_ordered_extent * -btrfs_lookup_first_ordered_extent(struct inode * inode, u64 file_offset); -struct btrfs_ordered_extent *btrfs_lookup_ordered_range(struct inode *inode, - u64 file_offset, - u64 len); -int btrfs_ordered_update_i_size(struct inode *inode, u64 offset, - struct btrfs_ordered_extent *ordered); -int btrfs_find_ordered_sum(struct inode *inode, u64 offset, u64 disk_bytenr, u32 *sum); -void btrfs_run_ordered_operations(struct 
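
A minimal user-space model of the arithmetic in btrfs_ordered_sum_size() above, assuming a 4096-byte sectorsize; the two struct layouts are simplified stand-ins, not the kernel definitions. It shows the rounding up to whole sectors plus the one extra slack slot that num_sectors++ adds.

#include <stdio.h>

/* simplified stand-ins for btrfs_sector_sum / btrfs_ordered_sum */
struct sector_sum { unsigned long long bytenr; unsigned int sum; };
struct ordered_sum_hdr { unsigned long long bytenr; unsigned long len; };

static unsigned long ordered_sum_size(unsigned long bytes, unsigned long sectorsize)
{
	/* one checksum slot per sector, rounded up, plus one spare slot */
	unsigned long num_sectors = (bytes + sectorsize - 1) / sectorsize + 1;

	return sizeof(struct ordered_sum_hdr) +
	       num_sectors * sizeof(struct sector_sum);
}

int main(void)
{
	/* a 16 KiB ordered extent needs 4 + 1 = 5 checksum slots */
	printf("%lu bytes\n", ordered_sum_size(16 * 1024, 4096));
	return 0;
}
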
btrfs_root *root, int wait); -void btrfs_add_ordered_operation(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct inode *inode); -void btrfs_wait_ordered_extents(struct btrfs_root *root, - int nocow_only, int delay_iput); -#endif diff --git a/ANDROID_3.4.5/fs/btrfs/orphan.c b/ANDROID_3.4.5/fs/btrfs/orphan.c deleted file mode 100644 index 24cad169..00000000 --- a/ANDROID_3.4.5/fs/btrfs/orphan.c +++ /dev/null @@ -1,91 +0,0 @@ -/* - * Copyright (C) 2008 Red Hat. All rights reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public - * License v2 as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public - * License along with this program; if not, write to the - * Free Software Foundation, Inc., 59 Temple Place - Suite 330, - * Boston, MA 021110-1307, USA. - */ - -#include "ctree.h" -#include "disk-io.h" - -int btrfs_insert_orphan_item(struct btrfs_trans_handle *trans, - struct btrfs_root *root, u64 offset) -{ - struct btrfs_path *path; - struct btrfs_key key; - int ret = 0; - - key.objectid = BTRFS_ORPHAN_OBJECTID; - btrfs_set_key_type(&key, BTRFS_ORPHAN_ITEM_KEY); - key.offset = offset; - - path = btrfs_alloc_path(); - if (!path) - return -ENOMEM; - - ret = btrfs_insert_empty_item(trans, root, path, &key, 0); - - btrfs_free_path(path); - return ret; -} - -int btrfs_del_orphan_item(struct btrfs_trans_handle *trans, - struct btrfs_root *root, u64 offset) -{ - struct btrfs_path *path; - struct btrfs_key key; - int ret = 0; - - key.objectid = BTRFS_ORPHAN_OBJECTID; - btrfs_set_key_type(&key, BTRFS_ORPHAN_ITEM_KEY); - key.offset = offset; - - path = btrfs_alloc_path(); - if (!path) - return -ENOMEM; - - ret = btrfs_search_slot(trans, root, &key, path, -1, 1); - if (ret < 0) - goto out; - if (ret) { /* JDM: Really? */ - ret = -ENOENT; - goto out; - } - - ret = btrfs_del_item(trans, root, path); - -out: - btrfs_free_path(path); - return ret; -} - -int btrfs_find_orphan_item(struct btrfs_root *root, u64 offset) -{ - struct btrfs_path *path; - struct btrfs_key key; - int ret; - - key.objectid = BTRFS_ORPHAN_OBJECTID; - key.type = BTRFS_ORPHAN_ITEM_KEY; - key.offset = offset; - - path = btrfs_alloc_path(); - if (!path) - return -ENOMEM; - - ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); - - btrfs_free_path(path); - return ret; -} diff --git a/ANDROID_3.4.5/fs/btrfs/print-tree.c b/ANDROID_3.4.5/fs/btrfs/print-tree.c deleted file mode 100644 index f38e4524..00000000 --- a/ANDROID_3.4.5/fs/btrfs/print-tree.c +++ /dev/null @@ -1,342 +0,0 @@ -/* - * Copyright (C) 2007 Oracle. All rights reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public - * License v2 as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. 
- * - * You should have received a copy of the GNU General Public - * License along with this program; if not, write to the - * Free Software Foundation, Inc., 59 Temple Place - Suite 330, - * Boston, MA 021110-1307, USA. - */ - -#include "ctree.h" -#include "disk-io.h" -#include "print-tree.h" - -static void print_chunk(struct extent_buffer *eb, struct btrfs_chunk *chunk) -{ - int num_stripes = btrfs_chunk_num_stripes(eb, chunk); - int i; - printk(KERN_INFO "\t\tchunk length %llu owner %llu type %llu " - "num_stripes %d\n", - (unsigned long long)btrfs_chunk_length(eb, chunk), - (unsigned long long)btrfs_chunk_owner(eb, chunk), - (unsigned long long)btrfs_chunk_type(eb, chunk), - num_stripes); - for (i = 0 ; i < num_stripes ; i++) { - printk(KERN_INFO "\t\t\tstripe %d devid %llu offset %llu\n", i, - (unsigned long long)btrfs_stripe_devid_nr(eb, chunk, i), - (unsigned long long)btrfs_stripe_offset_nr(eb, chunk, i)); - } -} -static void print_dev_item(struct extent_buffer *eb, - struct btrfs_dev_item *dev_item) -{ - printk(KERN_INFO "\t\tdev item devid %llu " - "total_bytes %llu bytes used %llu\n", - (unsigned long long)btrfs_device_id(eb, dev_item), - (unsigned long long)btrfs_device_total_bytes(eb, dev_item), - (unsigned long long)btrfs_device_bytes_used(eb, dev_item)); -} -static void print_extent_data_ref(struct extent_buffer *eb, - struct btrfs_extent_data_ref *ref) -{ - printk(KERN_INFO "\t\textent data backref root %llu " - "objectid %llu offset %llu count %u\n", - (unsigned long long)btrfs_extent_data_ref_root(eb, ref), - (unsigned long long)btrfs_extent_data_ref_objectid(eb, ref), - (unsigned long long)btrfs_extent_data_ref_offset(eb, ref), - btrfs_extent_data_ref_count(eb, ref)); -} - -static void print_extent_item(struct extent_buffer *eb, int slot) -{ - struct btrfs_extent_item *ei; - struct btrfs_extent_inline_ref *iref; - struct btrfs_extent_data_ref *dref; - struct btrfs_shared_data_ref *sref; - struct btrfs_disk_key key; - unsigned long end; - unsigned long ptr; - int type; - u32 item_size = btrfs_item_size_nr(eb, slot); - u64 flags; - u64 offset; - - if (item_size < sizeof(*ei)) { -#ifdef BTRFS_COMPAT_EXTENT_TREE_V0 - struct btrfs_extent_item_v0 *ei0; - BUG_ON(item_size != sizeof(*ei0)); - ei0 = btrfs_item_ptr(eb, slot, struct btrfs_extent_item_v0); - printk(KERN_INFO "\t\textent refs %u\n", - btrfs_extent_refs_v0(eb, ei0)); - return; -#else - BUG(); -#endif - } - - ei = btrfs_item_ptr(eb, slot, struct btrfs_extent_item); - flags = btrfs_extent_flags(eb, ei); - - printk(KERN_INFO "\t\textent refs %llu gen %llu flags %llu\n", - (unsigned long long)btrfs_extent_refs(eb, ei), - (unsigned long long)btrfs_extent_generation(eb, ei), - (unsigned long long)flags); - - if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) { - struct btrfs_tree_block_info *info; - info = (struct btrfs_tree_block_info *)(ei + 1); - btrfs_tree_block_key(eb, info, &key); - printk(KERN_INFO "\t\ttree block key (%llu %x %llu) " - "level %d\n", - (unsigned long long)btrfs_disk_key_objectid(&key), - key.type, - (unsigned long long)btrfs_disk_key_offset(&key), - btrfs_tree_block_level(eb, info)); - iref = (struct btrfs_extent_inline_ref *)(info + 1); - } else { - iref = (struct btrfs_extent_inline_ref *)(ei + 1); - } - - ptr = (unsigned long)iref; - end = (unsigned long)ei + item_size; - while (ptr < end) { - iref = (struct btrfs_extent_inline_ref *)ptr; - type = btrfs_extent_inline_ref_type(eb, iref); - offset = btrfs_extent_inline_ref_offset(eb, iref); - switch (type) { - case BTRFS_TREE_BLOCK_REF_KEY: - 
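
A small sketch of the layout decision made in print_extent_item() above: when the extent item describes a tree block, a btrfs_tree_block_info header sits between the extent item and the first inline backref, otherwise the refs follow the item directly. The struct sizes and the flag value below are illustrative stand-ins, not the on-disk btrfs definitions.

#include <stddef.h>
#include <stdint.h>

/* illustrative stand-ins for the on-disk structures */
struct extent_item { uint64_t refs, generation, flags; };
struct tree_block_info { uint8_t key[17]; uint8_t level; };

#define FLAG_TREE_BLOCK (1ULL << 1)	/* stand-in for BTRFS_EXTENT_FLAG_TREE_BLOCK */

static size_t first_inline_ref_offset(uint64_t flags)
{
	size_t off = sizeof(struct extent_item);

	/* tree blocks carry an extra (key, level) header before the refs */
	if (flags & FLAG_TREE_BLOCK)
		off += sizeof(struct tree_block_info);
	return off;
}
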
printk(KERN_INFO "\t\ttree block backref " - "root %llu\n", (unsigned long long)offset); - break; - case BTRFS_SHARED_BLOCK_REF_KEY: - printk(KERN_INFO "\t\tshared block backref " - "parent %llu\n", (unsigned long long)offset); - break; - case BTRFS_EXTENT_DATA_REF_KEY: - dref = (struct btrfs_extent_data_ref *)(&iref->offset); - print_extent_data_ref(eb, dref); - break; - case BTRFS_SHARED_DATA_REF_KEY: - sref = (struct btrfs_shared_data_ref *)(iref + 1); - printk(KERN_INFO "\t\tshared data backref " - "parent %llu count %u\n", - (unsigned long long)offset, - btrfs_shared_data_ref_count(eb, sref)); - break; - default: - BUG(); - } - ptr += btrfs_extent_inline_ref_size(type); - } - WARN_ON(ptr > end); -} - -#ifdef BTRFS_COMPAT_EXTENT_TREE_V0 -static void print_extent_ref_v0(struct extent_buffer *eb, int slot) -{ - struct btrfs_extent_ref_v0 *ref0; - - ref0 = btrfs_item_ptr(eb, slot, struct btrfs_extent_ref_v0); - printk("\t\textent back ref root %llu gen %llu " - "owner %llu num_refs %lu\n", - (unsigned long long)btrfs_ref_root_v0(eb, ref0), - (unsigned long long)btrfs_ref_generation_v0(eb, ref0), - (unsigned long long)btrfs_ref_objectid_v0(eb, ref0), - (unsigned long)btrfs_ref_count_v0(eb, ref0)); -} -#endif - -void btrfs_print_leaf(struct btrfs_root *root, struct extent_buffer *l) -{ - int i; - u32 type, nr; - struct btrfs_item *item; - struct btrfs_root_item *ri; - struct btrfs_dir_item *di; - struct btrfs_inode_item *ii; - struct btrfs_block_group_item *bi; - struct btrfs_file_extent_item *fi; - struct btrfs_extent_data_ref *dref; - struct btrfs_shared_data_ref *sref; - struct btrfs_dev_extent *dev_extent; - struct btrfs_key key; - struct btrfs_key found_key; - - if (!l) - return; - - nr = btrfs_header_nritems(l); - - printk(KERN_INFO "leaf %llu total ptrs %d free space %d\n", - (unsigned long long)btrfs_header_bytenr(l), nr, - btrfs_leaf_free_space(root, l)); - for (i = 0 ; i < nr ; i++) { - item = btrfs_item_nr(l, i); - btrfs_item_key_to_cpu(l, &key, i); - type = btrfs_key_type(&key); - printk(KERN_INFO "\titem %d key (%llu %x %llu) itemoff %d " - "itemsize %d\n", - i, - (unsigned long long)key.objectid, type, - (unsigned long long)key.offset, - btrfs_item_offset(l, item), btrfs_item_size(l, item)); - switch (type) { - case BTRFS_INODE_ITEM_KEY: - ii = btrfs_item_ptr(l, i, struct btrfs_inode_item); - printk(KERN_INFO "\t\tinode generation %llu size %llu " - "mode %o\n", - (unsigned long long) - btrfs_inode_generation(l, ii), - (unsigned long long)btrfs_inode_size(l, ii), - btrfs_inode_mode(l, ii)); - break; - case BTRFS_DIR_ITEM_KEY: - di = btrfs_item_ptr(l, i, struct btrfs_dir_item); - btrfs_dir_item_key_to_cpu(l, di, &found_key); - printk(KERN_INFO "\t\tdir oid %llu type %u\n", - (unsigned long long)found_key.objectid, - btrfs_dir_type(l, di)); - break; - case BTRFS_ROOT_ITEM_KEY: - ri = btrfs_item_ptr(l, i, struct btrfs_root_item); - printk(KERN_INFO "\t\troot data bytenr %llu refs %u\n", - (unsigned long long) - btrfs_disk_root_bytenr(l, ri), - btrfs_disk_root_refs(l, ri)); - break; - case BTRFS_EXTENT_ITEM_KEY: - print_extent_item(l, i); - break; - case BTRFS_TREE_BLOCK_REF_KEY: - printk(KERN_INFO "\t\ttree block backref\n"); - break; - case BTRFS_SHARED_BLOCK_REF_KEY: - printk(KERN_INFO "\t\tshared block backref\n"); - break; - case BTRFS_EXTENT_DATA_REF_KEY: - dref = btrfs_item_ptr(l, i, - struct btrfs_extent_data_ref); - print_extent_data_ref(l, dref); - break; - case BTRFS_SHARED_DATA_REF_KEY: - sref = btrfs_item_ptr(l, i, - struct btrfs_shared_data_ref); - printk(KERN_INFO 
"\t\tshared data backref count %u\n", - btrfs_shared_data_ref_count(l, sref)); - break; - case BTRFS_EXTENT_DATA_KEY: - fi = btrfs_item_ptr(l, i, - struct btrfs_file_extent_item); - if (btrfs_file_extent_type(l, fi) == - BTRFS_FILE_EXTENT_INLINE) { - printk(KERN_INFO "\t\tinline extent data " - "size %u\n", - btrfs_file_extent_inline_len(l, fi)); - break; - } - printk(KERN_INFO "\t\textent data disk bytenr %llu " - "nr %llu\n", - (unsigned long long) - btrfs_file_extent_disk_bytenr(l, fi), - (unsigned long long) - btrfs_file_extent_disk_num_bytes(l, fi)); - printk(KERN_INFO "\t\textent data offset %llu " - "nr %llu ram %llu\n", - (unsigned long long) - btrfs_file_extent_offset(l, fi), - (unsigned long long) - btrfs_file_extent_num_bytes(l, fi), - (unsigned long long) - btrfs_file_extent_ram_bytes(l, fi)); - break; - case BTRFS_EXTENT_REF_V0_KEY: -#ifdef BTRFS_COMPAT_EXTENT_TREE_V0 - print_extent_ref_v0(l, i); -#else - BUG(); -#endif - break; - case BTRFS_BLOCK_GROUP_ITEM_KEY: - bi = btrfs_item_ptr(l, i, - struct btrfs_block_group_item); - printk(KERN_INFO "\t\tblock group used %llu\n", - (unsigned long long) - btrfs_disk_block_group_used(l, bi)); - break; - case BTRFS_CHUNK_ITEM_KEY: - print_chunk(l, btrfs_item_ptr(l, i, - struct btrfs_chunk)); - break; - case BTRFS_DEV_ITEM_KEY: - print_dev_item(l, btrfs_item_ptr(l, i, - struct btrfs_dev_item)); - break; - case BTRFS_DEV_EXTENT_KEY: - dev_extent = btrfs_item_ptr(l, i, - struct btrfs_dev_extent); - printk(KERN_INFO "\t\tdev extent chunk_tree %llu\n" - "\t\tchunk objectid %llu chunk offset %llu " - "length %llu\n", - (unsigned long long) - btrfs_dev_extent_chunk_tree(l, dev_extent), - (unsigned long long) - btrfs_dev_extent_chunk_objectid(l, dev_extent), - (unsigned long long) - btrfs_dev_extent_chunk_offset(l, dev_extent), - (unsigned long long) - btrfs_dev_extent_length(l, dev_extent)); - }; - } -} - -void btrfs_print_tree(struct btrfs_root *root, struct extent_buffer *c) -{ - int i; u32 nr; - struct btrfs_key key; - int level; - - if (!c) - return; - nr = btrfs_header_nritems(c); - level = btrfs_header_level(c); - if (level == 0) { - btrfs_print_leaf(root, c); - return; - } - printk(KERN_INFO "node %llu level %d total ptrs %d free spc %u\n", - (unsigned long long)btrfs_header_bytenr(c), - level, nr, - (u32)BTRFS_NODEPTRS_PER_BLOCK(root) - nr); - for (i = 0; i < nr; i++) { - btrfs_node_key_to_cpu(c, &key, i); - printk(KERN_INFO "\tkey %d (%llu %u %llu) block %llu\n", - i, - (unsigned long long)key.objectid, - key.type, - (unsigned long long)key.offset, - (unsigned long long)btrfs_node_blockptr(c, i)); - } - for (i = 0; i < nr; i++) { - struct extent_buffer *next = read_tree_block(root, - btrfs_node_blockptr(c, i), - btrfs_level_size(root, level - 1), - btrfs_node_ptr_generation(c, i)); - if (btrfs_is_leaf(next) && - level != 1) - BUG(); - if (btrfs_header_level(next) != - level - 1) - BUG(); - btrfs_print_tree(root, next); - free_extent_buffer(next); - } -} diff --git a/ANDROID_3.4.5/fs/btrfs/print-tree.h b/ANDROID_3.4.5/fs/btrfs/print-tree.h deleted file mode 100644 index da75efe5..00000000 --- a/ANDROID_3.4.5/fs/btrfs/print-tree.h +++ /dev/null @@ -1,23 +0,0 @@ -/* - * Copyright (C) 2007 Oracle. All rights reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public - * License v2 as published by the Free Software Foundation. 
- * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public - * License along with this program; if not, write to the - * Free Software Foundation, Inc., 59 Temple Place - Suite 330, - * Boston, MA 021110-1307, USA. - */ - -#ifndef __PRINT_TREE_ -#define __PRINT_TREE_ -void btrfs_print_leaf(struct btrfs_root *root, struct extent_buffer *l); -void btrfs_print_tree(struct btrfs_root *root, struct extent_buffer *t); -#endif diff --git a/ANDROID_3.4.5/fs/btrfs/reada.c b/ANDROID_3.4.5/fs/btrfs/reada.c deleted file mode 100644 index ac5d0108..00000000 --- a/ANDROID_3.4.5/fs/btrfs/reada.c +++ /dev/null @@ -1,961 +0,0 @@ -/* - * Copyright (C) 2011 STRATO. All rights reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public - * License v2 as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public - * License along with this program; if not, write to the - * Free Software Foundation, Inc., 59 Temple Place - Suite 330, - * Boston, MA 021110-1307, USA. - */ - -#include <linux/sched.h> -#include <linux/pagemap.h> -#include <linux/writeback.h> -#include <linux/blkdev.h> -#include <linux/rbtree.h> -#include <linux/slab.h> -#include <linux/workqueue.h> -#include "ctree.h" -#include "volumes.h" -#include "disk-io.h" -#include "transaction.h" - -#undef DEBUG - -/* - * This is the implementation for the generic read ahead framework. - * - * To trigger a readahead, btrfs_reada_add must be called. It will start - * a read ahead for the given range [start, end) on tree root. The returned - * handle can either be used to wait on the readahead to finish - * (btrfs_reada_wait), or to send it to the background (btrfs_reada_detach). - * - * The read ahead works as follows: - * On btrfs_reada_add, the root of the tree is inserted into a radix_tree. - * reada_start_machine will then search for extents to prefetch and trigger - * some reads. When a read finishes for a node, all contained node/leaf - * pointers that lie in the given range will also be enqueued. The reads will - * be triggered in sequential order, thus giving a big win over a naive - * enumeration. It will also make use of multi-device layouts. Each disk - * will have its on read pointer and all disks will by utilized in parallel. - * Also will no two disks read both sides of a mirror simultaneously, as this - * would waste seeking capacity. Instead both disks will read different parts - * of the filesystem. - * Any number of readaheads can be started in parallel. The read order will be - * determined globally, i.e. 2 parallel readaheads will normally finish faster - * than the 2 started one after another. 
- */ - -#define MAX_IN_FLIGHT 6 - -struct reada_extctl { - struct list_head list; - struct reada_control *rc; - u64 generation; -}; - -struct reada_extent { - u64 logical; - struct btrfs_key top; - u32 blocksize; - int err; - struct list_head extctl; - struct kref refcnt; - spinlock_t lock; - struct reada_zone *zones[BTRFS_MAX_MIRRORS]; - int nzones; - struct btrfs_device *scheduled_for; -}; - -struct reada_zone { - u64 start; - u64 end; - u64 elems; - struct list_head list; - spinlock_t lock; - int locked; - struct btrfs_device *device; - struct btrfs_device *devs[BTRFS_MAX_MIRRORS]; /* full list, incl - * self */ - int ndevs; - struct kref refcnt; -}; - -struct reada_machine_work { - struct btrfs_work work; - struct btrfs_fs_info *fs_info; -}; - -static void reada_extent_put(struct btrfs_fs_info *, struct reada_extent *); -static void reada_control_release(struct kref *kref); -static void reada_zone_release(struct kref *kref); -static void reada_start_machine(struct btrfs_fs_info *fs_info); -static void __reada_start_machine(struct btrfs_fs_info *fs_info); - -static int reada_add_block(struct reada_control *rc, u64 logical, - struct btrfs_key *top, int level, u64 generation); - -/* recurses */ -/* in case of err, eb might be NULL */ -static int __readahead_hook(struct btrfs_root *root, struct extent_buffer *eb, - u64 start, int err) -{ - int level = 0; - int nritems; - int i; - u64 bytenr; - u64 generation; - struct reada_extent *re; - struct btrfs_fs_info *fs_info = root->fs_info; - struct list_head list; - unsigned long index = start >> PAGE_CACHE_SHIFT; - struct btrfs_device *for_dev; - - if (eb) - level = btrfs_header_level(eb); - - /* find extent */ - spin_lock(&fs_info->reada_lock); - re = radix_tree_lookup(&fs_info->reada_tree, index); - if (re) - kref_get(&re->refcnt); - spin_unlock(&fs_info->reada_lock); - - if (!re) - return -1; - - spin_lock(&re->lock); - /* - * just take the full list from the extent. afterwards we - * don't need the lock anymore - */ - list_replace_init(&re->extctl, &list); - for_dev = re->scheduled_for; - re->scheduled_for = NULL; - spin_unlock(&re->lock); - - if (err == 0) { - nritems = level ? btrfs_header_nritems(eb) : 0; - generation = btrfs_header_generation(eb); - /* - * FIXME: currently we just set nritems to 0 if this is a leaf, - * effectively ignoring the content. In a next step we could - * trigger more readahead depending from the content, e.g. - * fetch the checksums for the extents in the leaf. - */ - } else { - /* - * this is the error case, the extent buffer has not been - * read correctly. We won't access anything from it and - * just cleanup our data structures. Effectively this will - * cut the branch below this node from read ahead. - */ - nritems = 0; - generation = 0; - } - - for (i = 0; i < nritems; i++) { - struct reada_extctl *rec; - u64 n_gen; - struct btrfs_key key; - struct btrfs_key next_key; - - btrfs_node_key_to_cpu(eb, &key, i); - if (i + 1 < nritems) - btrfs_node_key_to_cpu(eb, &next_key, i + 1); - else - next_key = re->top; - bytenr = btrfs_node_blockptr(eb, i); - n_gen = btrfs_node_ptr_generation(eb, i); - - list_for_each_entry(rec, &list, list) { - struct reada_control *rc = rec->rc; - - /* - * if the generation doesn't match, just ignore this - * extctl. This will probably cut off a branch from - * prefetch. Alternatively one could start a new (sub-) - * prefetch for this branch, starting again from root. 
- * FIXME: move the generation check out of this loop - */ -#ifdef DEBUG - if (rec->generation != generation) { - printk(KERN_DEBUG "generation mismatch for " - "(%llu,%d,%llu) %llu != %llu\n", - key.objectid, key.type, key.offset, - rec->generation, generation); - } -#endif - if (rec->generation == generation && - btrfs_comp_cpu_keys(&key, &rc->key_end) < 0 && - btrfs_comp_cpu_keys(&next_key, &rc->key_start) > 0) - reada_add_block(rc, bytenr, &next_key, - level - 1, n_gen); - } - } - /* - * free extctl records - */ - while (!list_empty(&list)) { - struct reada_control *rc; - struct reada_extctl *rec; - - rec = list_first_entry(&list, struct reada_extctl, list); - list_del(&rec->list); - rc = rec->rc; - kfree(rec); - - kref_get(&rc->refcnt); - if (atomic_dec_and_test(&rc->elems)) { - kref_put(&rc->refcnt, reada_control_release); - wake_up(&rc->wait); - } - kref_put(&rc->refcnt, reada_control_release); - - reada_extent_put(fs_info, re); /* one ref for each entry */ - } - reada_extent_put(fs_info, re); /* our ref */ - if (for_dev) - atomic_dec(&for_dev->reada_in_flight); - - return 0; -} - -/* - * start is passed separately in case eb in NULL, which may be the case with - * failed I/O - */ -int btree_readahead_hook(struct btrfs_root *root, struct extent_buffer *eb, - u64 start, int err) -{ - int ret; - - ret = __readahead_hook(root, eb, start, err); - - reada_start_machine(root->fs_info); - - return ret; -} - -static struct reada_zone *reada_find_zone(struct btrfs_fs_info *fs_info, - struct btrfs_device *dev, u64 logical, - struct btrfs_bio *bbio) -{ - int ret; - struct reada_zone *zone; - struct btrfs_block_group_cache *cache = NULL; - u64 start; - u64 end; - int i; - - zone = NULL; - spin_lock(&fs_info->reada_lock); - ret = radix_tree_gang_lookup(&dev->reada_zones, (void **)&zone, - logical >> PAGE_CACHE_SHIFT, 1); - if (ret == 1) - kref_get(&zone->refcnt); - spin_unlock(&fs_info->reada_lock); - - if (ret == 1) { - if (logical >= zone->start && logical < zone->end) - return zone; - spin_lock(&fs_info->reada_lock); - kref_put(&zone->refcnt, reada_zone_release); - spin_unlock(&fs_info->reada_lock); - } - - cache = btrfs_lookup_block_group(fs_info, logical); - if (!cache) - return NULL; - - start = cache->key.objectid; - end = start + cache->key.offset - 1; - btrfs_put_block_group(cache); - - zone = kzalloc(sizeof(*zone), GFP_NOFS); - if (!zone) - return NULL; - - zone->start = start; - zone->end = end; - INIT_LIST_HEAD(&zone->list); - spin_lock_init(&zone->lock); - zone->locked = 0; - kref_init(&zone->refcnt); - zone->elems = 0; - zone->device = dev; /* our device always sits at index 0 */ - for (i = 0; i < bbio->num_stripes; ++i) { - /* bounds have already been checked */ - zone->devs[i] = bbio->stripes[i].dev; - } - zone->ndevs = bbio->num_stripes; - - spin_lock(&fs_info->reada_lock); - ret = radix_tree_insert(&dev->reada_zones, - (unsigned long)(zone->end >> PAGE_CACHE_SHIFT), - zone); - - if (ret == -EEXIST) { - kfree(zone); - ret = radix_tree_gang_lookup(&dev->reada_zones, (void **)&zone, - logical >> PAGE_CACHE_SHIFT, 1); - if (ret == 1) - kref_get(&zone->refcnt); - } - spin_unlock(&fs_info->reada_lock); - - return zone; -} - -static struct reada_extent *reada_find_extent(struct btrfs_root *root, - u64 logical, - struct btrfs_key *top, int level) -{ - int ret; - struct reada_extent *re = NULL; - struct reada_extent *re_exist = NULL; - struct btrfs_fs_info *fs_info = root->fs_info; - struct btrfs_mapping_tree *map_tree = &fs_info->mapping_tree; - struct btrfs_bio *bbio = NULL; - struct 
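
A compact restatement of the filter used in __readahead_hook() above: a child pointer is enqueued only when its key span [key, next_key) overlaps the range requested by the reada_control and the stored generation still matches the block that was read. The helper below assumes btrfs keys order as the tuple (objectid, type, offset); the struct is a user-space stand-in.

#include <stdint.h>

struct key { uint64_t objectid; uint8_t type; uint64_t offset; };

static int key_cmp(const struct key *a, const struct key *b)
{
	if (a->objectid != b->objectid)
		return a->objectid < b->objectid ? -1 : 1;
	if (a->type != b->type)
		return a->type < b->type ? -1 : 1;
	if (a->offset != b->offset)
		return a->offset < b->offset ? -1 : 1;
	return 0;
}

/* mirrors the condition guarding reada_add_block() in the hook above */
static int should_enqueue(const struct key *key, const struct key *next_key,
			  const struct key *rc_start, const struct key *rc_end,
			  uint64_t rec_gen, uint64_t node_gen)
{
	return rec_gen == node_gen &&
	       key_cmp(key, rc_end) < 0 &&
	       key_cmp(next_key, rc_start) > 0;
}
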
btrfs_device *dev; - struct btrfs_device *prev_dev; - u32 blocksize; - u64 length; - int nzones = 0; - int i; - unsigned long index = logical >> PAGE_CACHE_SHIFT; - - spin_lock(&fs_info->reada_lock); - re = radix_tree_lookup(&fs_info->reada_tree, index); - if (re) - kref_get(&re->refcnt); - spin_unlock(&fs_info->reada_lock); - - if (re) - return re; - - re = kzalloc(sizeof(*re), GFP_NOFS); - if (!re) - return NULL; - - blocksize = btrfs_level_size(root, level); - re->logical = logical; - re->blocksize = blocksize; - re->top = *top; - INIT_LIST_HEAD(&re->extctl); - spin_lock_init(&re->lock); - kref_init(&re->refcnt); - - /* - * map block - */ - length = blocksize; - ret = btrfs_map_block(map_tree, REQ_WRITE, logical, &length, &bbio, 0); - if (ret || !bbio || length < blocksize) - goto error; - - if (bbio->num_stripes > BTRFS_MAX_MIRRORS) { - printk(KERN_ERR "btrfs readahead: more than %d copies not " - "supported", BTRFS_MAX_MIRRORS); - goto error; - } - - for (nzones = 0; nzones < bbio->num_stripes; ++nzones) { - struct reada_zone *zone; - - dev = bbio->stripes[nzones].dev; - zone = reada_find_zone(fs_info, dev, logical, bbio); - if (!zone) - break; - - re->zones[nzones] = zone; - spin_lock(&zone->lock); - if (!zone->elems) - kref_get(&zone->refcnt); - ++zone->elems; - spin_unlock(&zone->lock); - spin_lock(&fs_info->reada_lock); - kref_put(&zone->refcnt, reada_zone_release); - spin_unlock(&fs_info->reada_lock); - } - re->nzones = nzones; - if (nzones == 0) { - /* not a single zone found, error and out */ - goto error; - } - - /* insert extent in reada_tree + all per-device trees, all or nothing */ - spin_lock(&fs_info->reada_lock); - ret = radix_tree_insert(&fs_info->reada_tree, index, re); - if (ret == -EEXIST) { - re_exist = radix_tree_lookup(&fs_info->reada_tree, index); - BUG_ON(!re_exist); - kref_get(&re_exist->refcnt); - spin_unlock(&fs_info->reada_lock); - goto error; - } - if (ret) { - spin_unlock(&fs_info->reada_lock); - goto error; - } - prev_dev = NULL; - for (i = 0; i < nzones; ++i) { - dev = bbio->stripes[i].dev; - if (dev == prev_dev) { - /* - * in case of DUP, just add the first zone. As both - * are on the same device, there's nothing to gain - * from adding both. 
- * Also, it wouldn't work, as the tree is per device - * and adding would fail with EEXIST - */ - continue; - } - prev_dev = dev; - ret = radix_tree_insert(&dev->reada_extents, index, re); - if (ret) { - while (--i >= 0) { - dev = bbio->stripes[i].dev; - BUG_ON(dev == NULL); - radix_tree_delete(&dev->reada_extents, index); - } - BUG_ON(fs_info == NULL); - radix_tree_delete(&fs_info->reada_tree, index); - spin_unlock(&fs_info->reada_lock); - goto error; - } - } - spin_unlock(&fs_info->reada_lock); - - kfree(bbio); - return re; - -error: - while (nzones) { - struct reada_zone *zone; - - --nzones; - zone = re->zones[nzones]; - kref_get(&zone->refcnt); - spin_lock(&zone->lock); - --zone->elems; - if (zone->elems == 0) { - /* - * no fs_info->reada_lock needed, as this can't be - * the last ref - */ - kref_put(&zone->refcnt, reada_zone_release); - } - spin_unlock(&zone->lock); - - spin_lock(&fs_info->reada_lock); - kref_put(&zone->refcnt, reada_zone_release); - spin_unlock(&fs_info->reada_lock); - } - kfree(bbio); - kfree(re); - return re_exist; -} - -static void reada_kref_dummy(struct kref *kr) -{ -} - -static void reada_extent_put(struct btrfs_fs_info *fs_info, - struct reada_extent *re) -{ - int i; - unsigned long index = re->logical >> PAGE_CACHE_SHIFT; - - spin_lock(&fs_info->reada_lock); - if (!kref_put(&re->refcnt, reada_kref_dummy)) { - spin_unlock(&fs_info->reada_lock); - return; - } - - radix_tree_delete(&fs_info->reada_tree, index); - for (i = 0; i < re->nzones; ++i) { - struct reada_zone *zone = re->zones[i]; - - radix_tree_delete(&zone->device->reada_extents, index); - } - - spin_unlock(&fs_info->reada_lock); - - for (i = 0; i < re->nzones; ++i) { - struct reada_zone *zone = re->zones[i]; - - kref_get(&zone->refcnt); - spin_lock(&zone->lock); - --zone->elems; - if (zone->elems == 0) { - /* no fs_info->reada_lock needed, as this can't be - * the last ref */ - kref_put(&zone->refcnt, reada_zone_release); - } - spin_unlock(&zone->lock); - - spin_lock(&fs_info->reada_lock); - kref_put(&zone->refcnt, reada_zone_release); - spin_unlock(&fs_info->reada_lock); - } - if (re->scheduled_for) - atomic_dec(&re->scheduled_for->reada_in_flight); - - kfree(re); -} - -static void reada_zone_release(struct kref *kref) -{ - struct reada_zone *zone = container_of(kref, struct reada_zone, refcnt); - - radix_tree_delete(&zone->device->reada_zones, - zone->end >> PAGE_CACHE_SHIFT); - - kfree(zone); -} - -static void reada_control_release(struct kref *kref) -{ - struct reada_control *rc = container_of(kref, struct reada_control, - refcnt); - - kfree(rc); -} - -static int reada_add_block(struct reada_control *rc, u64 logical, - struct btrfs_key *top, int level, u64 generation) -{ - struct btrfs_root *root = rc->root; - struct reada_extent *re; - struct reada_extctl *rec; - - re = reada_find_extent(root, logical, top, level); /* takes one ref */ - if (!re) - return -1; - - rec = kzalloc(sizeof(*rec), GFP_NOFS); - if (!rec) { - reada_extent_put(root->fs_info, re); - return -1; - } - - rec->rc = rc; - rec->generation = generation; - atomic_inc(&rc->elems); - - spin_lock(&re->lock); - list_add_tail(&rec->list, &re->extctl); - spin_unlock(&re->lock); - - /* leave the ref on the extent */ - - return 0; -} - -/* - * called with fs_info->reada_lock held - */ -static void reada_peer_zones_set_lock(struct reada_zone *zone, int lock) -{ - int i; - unsigned long index = zone->end >> PAGE_CACHE_SHIFT; - - for (i = 0; i < zone->ndevs; ++i) { - struct reada_zone *peer; - peer = 
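
reada_extent_put() above passes a no-op release function to kref_put() and performs the real teardown itself, because the final reference has to be dropped while fs_info->reada_lock is held so that no concurrent lookup can revive an extent that is about to be unlinked from the radix trees. A rough user-space model of that shape; the linked list stands in for the radix-tree index and all names are placeholders.

#include <pthread.h>
#include <stdlib.h>

struct entry {
	int refs;		/* protected by cache->lock in this model */
	struct entry *next;
};

struct cache {
	pthread_mutex_t lock;
	struct entry *head;	/* stand-in for the radix-tree index */
};

static void entry_put(struct cache *c, struct entry *e)
{
	pthread_mutex_lock(&c->lock);
	if (--e->refs > 0) {
		pthread_mutex_unlock(&c->lock);
		return;			/* somebody else still holds it */
	}
	/* unlink while the lock is held so no lookup can find it any more */
	for (struct entry **p = &c->head; *p; p = &(*p)->next) {
		if (*p == e) {
			*p = e->next;
			break;
		}
	}
	pthread_mutex_unlock(&c->lock);
	free(e);			/* safe: we dropped the last reference */
}
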
radix_tree_lookup(&zone->devs[i]->reada_zones, index); - if (peer && peer->device != zone->device) - peer->locked = lock; - } -} - -/* - * called with fs_info->reada_lock held - */ -static int reada_pick_zone(struct btrfs_device *dev) -{ - struct reada_zone *top_zone = NULL; - struct reada_zone *top_locked_zone = NULL; - u64 top_elems = 0; - u64 top_locked_elems = 0; - unsigned long index = 0; - int ret; - - if (dev->reada_curr_zone) { - reada_peer_zones_set_lock(dev->reada_curr_zone, 0); - kref_put(&dev->reada_curr_zone->refcnt, reada_zone_release); - dev->reada_curr_zone = NULL; - } - /* pick the zone with the most elements */ - while (1) { - struct reada_zone *zone; - - ret = radix_tree_gang_lookup(&dev->reada_zones, - (void **)&zone, index, 1); - if (ret == 0) - break; - index = (zone->end >> PAGE_CACHE_SHIFT) + 1; - if (zone->locked) { - if (zone->elems > top_locked_elems) { - top_locked_elems = zone->elems; - top_locked_zone = zone; - } - } else { - if (zone->elems > top_elems) { - top_elems = zone->elems; - top_zone = zone; - } - } - } - if (top_zone) - dev->reada_curr_zone = top_zone; - else if (top_locked_zone) - dev->reada_curr_zone = top_locked_zone; - else - return 0; - - dev->reada_next = dev->reada_curr_zone->start; - kref_get(&dev->reada_curr_zone->refcnt); - reada_peer_zones_set_lock(dev->reada_curr_zone, 1); - - return 1; -} - -static int reada_start_machine_dev(struct btrfs_fs_info *fs_info, - struct btrfs_device *dev) -{ - struct reada_extent *re = NULL; - int mirror_num = 0; - struct extent_buffer *eb = NULL; - u64 logical; - u32 blocksize; - int ret; - int i; - int need_kick = 0; - - spin_lock(&fs_info->reada_lock); - if (dev->reada_curr_zone == NULL) { - ret = reada_pick_zone(dev); - if (!ret) { - spin_unlock(&fs_info->reada_lock); - return 0; - } - } - /* - * FIXME currently we issue the reads one extent at a time. 
If we have - * a contiguous block of extents, we could also coagulate them or use - * plugging to speed things up - */ - ret = radix_tree_gang_lookup(&dev->reada_extents, (void **)&re, - dev->reada_next >> PAGE_CACHE_SHIFT, 1); - if (ret == 0 || re->logical >= dev->reada_curr_zone->end) { - ret = reada_pick_zone(dev); - if (!ret) { - spin_unlock(&fs_info->reada_lock); - return 0; - } - re = NULL; - ret = radix_tree_gang_lookup(&dev->reada_extents, (void **)&re, - dev->reada_next >> PAGE_CACHE_SHIFT, 1); - } - if (ret == 0) { - spin_unlock(&fs_info->reada_lock); - return 0; - } - dev->reada_next = re->logical + re->blocksize; - kref_get(&re->refcnt); - - spin_unlock(&fs_info->reada_lock); - - /* - * find mirror num - */ - for (i = 0; i < re->nzones; ++i) { - if (re->zones[i]->device == dev) { - mirror_num = i + 1; - break; - } - } - logical = re->logical; - blocksize = re->blocksize; - - spin_lock(&re->lock); - if (re->scheduled_for == NULL) { - re->scheduled_for = dev; - need_kick = 1; - } - spin_unlock(&re->lock); - - reada_extent_put(fs_info, re); - - if (!need_kick) - return 0; - - atomic_inc(&dev->reada_in_flight); - ret = reada_tree_block_flagged(fs_info->extent_root, logical, blocksize, - mirror_num, &eb); - if (ret) - __readahead_hook(fs_info->extent_root, NULL, logical, ret); - else if (eb) - __readahead_hook(fs_info->extent_root, eb, eb->start, ret); - - if (eb) - free_extent_buffer(eb); - - return 1; - -} - -static void reada_start_machine_worker(struct btrfs_work *work) -{ - struct reada_machine_work *rmw; - struct btrfs_fs_info *fs_info; - - rmw = container_of(work, struct reada_machine_work, work); - fs_info = rmw->fs_info; - - kfree(rmw); - - __reada_start_machine(fs_info); -} - -static void __reada_start_machine(struct btrfs_fs_info *fs_info) -{ - struct btrfs_device *device; - struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; - u64 enqueued; - u64 total = 0; - int i; - - do { - enqueued = 0; - list_for_each_entry(device, &fs_devices->devices, dev_list) { - if (atomic_read(&device->reada_in_flight) < - MAX_IN_FLIGHT) - enqueued += reada_start_machine_dev(fs_info, - device); - } - total += enqueued; - } while (enqueued && total < 10000); - - if (enqueued == 0) - return; - - /* - * If everything is already in the cache, this is effectively single - * threaded. To a) not hold the caller for too long and b) to utilize - * more cores, we broke the loop above after 10000 iterations and now - * enqueue to workers to finish it. This will distribute the load to - * the cores. 
- */ - for (i = 0; i < 2; ++i) - reada_start_machine(fs_info); -} - -static void reada_start_machine(struct btrfs_fs_info *fs_info) -{ - struct reada_machine_work *rmw; - - rmw = kzalloc(sizeof(*rmw), GFP_NOFS); - if (!rmw) { - /* FIXME we cannot handle this properly right now */ - BUG(); - } - rmw->work.func = reada_start_machine_worker; - rmw->fs_info = fs_info; - - btrfs_queue_worker(&fs_info->readahead_workers, &rmw->work); -} - -#ifdef DEBUG -static void dump_devs(struct btrfs_fs_info *fs_info, int all) -{ - struct btrfs_device *device; - struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; - unsigned long index; - int ret; - int i; - int j; - int cnt; - - spin_lock(&fs_info->reada_lock); - list_for_each_entry(device, &fs_devices->devices, dev_list) { - printk(KERN_DEBUG "dev %lld has %d in flight\n", device->devid, - atomic_read(&device->reada_in_flight)); - index = 0; - while (1) { - struct reada_zone *zone; - ret = radix_tree_gang_lookup(&device->reada_zones, - (void **)&zone, index, 1); - if (ret == 0) - break; - printk(KERN_DEBUG " zone %llu-%llu elems %llu locked " - "%d devs", zone->start, zone->end, zone->elems, - zone->locked); - for (j = 0; j < zone->ndevs; ++j) { - printk(KERN_CONT " %lld", - zone->devs[j]->devid); - } - if (device->reada_curr_zone == zone) - printk(KERN_CONT " curr off %llu", - device->reada_next - zone->start); - printk(KERN_CONT "\n"); - index = (zone->end >> PAGE_CACHE_SHIFT) + 1; - } - cnt = 0; - index = 0; - while (all) { - struct reada_extent *re = NULL; - - ret = radix_tree_gang_lookup(&device->reada_extents, - (void **)&re, index, 1); - if (ret == 0) - break; - printk(KERN_DEBUG - " re: logical %llu size %u empty %d for %lld", - re->logical, re->blocksize, - list_empty(&re->extctl), re->scheduled_for ? - re->scheduled_for->devid : -1); - - for (i = 0; i < re->nzones; ++i) { - printk(KERN_CONT " zone %llu-%llu devs", - re->zones[i]->start, - re->zones[i]->end); - for (j = 0; j < re->zones[i]->ndevs; ++j) { - printk(KERN_CONT " %lld", - re->zones[i]->devs[j]->devid); - } - } - printk(KERN_CONT "\n"); - index = (re->logical >> PAGE_CACHE_SHIFT) + 1; - if (++cnt > 15) - break; - } - } - - index = 0; - cnt = 0; - while (all) { - struct reada_extent *re = NULL; - - ret = radix_tree_gang_lookup(&fs_info->reada_tree, (void **)&re, - index, 1); - if (ret == 0) - break; - if (!re->scheduled_for) { - index = (re->logical >> PAGE_CACHE_SHIFT) + 1; - continue; - } - printk(KERN_DEBUG - "re: logical %llu size %u list empty %d for %lld", - re->logical, re->blocksize, list_empty(&re->extctl), - re->scheduled_for ? 
re->scheduled_for->devid : -1); - for (i = 0; i < re->nzones; ++i) { - printk(KERN_CONT " zone %llu-%llu devs", - re->zones[i]->start, - re->zones[i]->end); - for (i = 0; i < re->nzones; ++i) { - printk(KERN_CONT " zone %llu-%llu devs", - re->zones[i]->start, - re->zones[i]->end); - for (j = 0; j < re->zones[i]->ndevs; ++j) { - printk(KERN_CONT " %lld", - re->zones[i]->devs[j]->devid); - } - } - } - printk(KERN_CONT "\n"); - index = (re->logical >> PAGE_CACHE_SHIFT) + 1; - } - spin_unlock(&fs_info->reada_lock); -} -#endif - -/* - * interface - */ -struct reada_control *btrfs_reada_add(struct btrfs_root *root, - struct btrfs_key *key_start, struct btrfs_key *key_end) -{ - struct reada_control *rc; - u64 start; - u64 generation; - int level; - struct extent_buffer *node; - static struct btrfs_key max_key = { - .objectid = (u64)-1, - .type = (u8)-1, - .offset = (u64)-1 - }; - - rc = kzalloc(sizeof(*rc), GFP_NOFS); - if (!rc) - return ERR_PTR(-ENOMEM); - - rc->root = root; - rc->key_start = *key_start; - rc->key_end = *key_end; - atomic_set(&rc->elems, 0); - init_waitqueue_head(&rc->wait); - kref_init(&rc->refcnt); - kref_get(&rc->refcnt); /* one ref for having elements */ - - node = btrfs_root_node(root); - start = node->start; - level = btrfs_header_level(node); - generation = btrfs_header_generation(node); - free_extent_buffer(node); - - reada_add_block(rc, start, &max_key, level, generation); - - reada_start_machine(root->fs_info); - - return rc; -} - -#ifdef DEBUG -int btrfs_reada_wait(void *handle) -{ - struct reada_control *rc = handle; - - while (atomic_read(&rc->elems)) { - wait_event_timeout(rc->wait, atomic_read(&rc->elems) == 0, - 5 * HZ); - dump_devs(rc->root->fs_info, rc->elems < 10 ? 1 : 0); - } - - dump_devs(rc->root->fs_info, rc->elems < 10 ? 1 : 0); - - kref_put(&rc->refcnt, reada_control_release); - - return 0; -} -#else -int btrfs_reada_wait(void *handle) -{ - struct reada_control *rc = handle; - - while (atomic_read(&rc->elems)) { - wait_event(rc->wait, atomic_read(&rc->elems) == 0); - } - - kref_put(&rc->refcnt, reada_control_release); - - return 0; -} -#endif - -void btrfs_reada_detach(void *handle) -{ - struct reada_control *rc = handle; - - kref_put(&rc->refcnt, reada_control_release); -} diff --git a/ANDROID_3.4.5/fs/btrfs/relocation.c b/ANDROID_3.4.5/fs/btrfs/relocation.c deleted file mode 100644 index 646ee21b..00000000 --- a/ANDROID_3.4.5/fs/btrfs/relocation.c +++ /dev/null @@ -1,4464 +0,0 @@ -/* - * Copyright (C) 2009 Oracle. All rights reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public - * License v2 as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public - * License along with this program; if not, write to the - * Free Software Foundation, Inc., 59 Temple Place - Suite 330, - * Boston, MA 021110-1307, USA. 
- */ - -#include <linux/sched.h> -#include <linux/pagemap.h> -#include <linux/writeback.h> -#include <linux/blkdev.h> -#include <linux/rbtree.h> -#include <linux/slab.h> -#include "ctree.h" -#include "disk-io.h" -#include "transaction.h" -#include "volumes.h" -#include "locking.h" -#include "btrfs_inode.h" -#include "async-thread.h" -#include "free-space-cache.h" -#include "inode-map.h" - -/* - * backref_node, mapping_node and tree_block start with this - */ -struct tree_entry { - struct rb_node rb_node; - u64 bytenr; -}; - -/* - * present a tree block in the backref cache - */ -struct backref_node { - struct rb_node rb_node; - u64 bytenr; - - u64 new_bytenr; - /* objectid of tree block owner, can be not uptodate */ - u64 owner; - /* link to pending, changed or detached list */ - struct list_head list; - /* list of upper level blocks reference this block */ - struct list_head upper; - /* list of child blocks in the cache */ - struct list_head lower; - /* NULL if this node is not tree root */ - struct btrfs_root *root; - /* extent buffer got by COW the block */ - struct extent_buffer *eb; - /* level of tree block */ - unsigned int level:8; - /* is the block in non-reference counted tree */ - unsigned int cowonly:1; - /* 1 if no child node in the cache */ - unsigned int lowest:1; - /* is the extent buffer locked */ - unsigned int locked:1; - /* has the block been processed */ - unsigned int processed:1; - /* have backrefs of this block been checked */ - unsigned int checked:1; - /* - * 1 if corresponding block has been cowed but some upper - * level block pointers may not point to the new location - */ - unsigned int pending:1; - /* - * 1 if the backref node isn't connected to any other - * backref node. - */ - unsigned int detached:1; -}; - -/* - * present a block pointer in the backref cache - */ -struct backref_edge { - struct list_head list[2]; - struct backref_node *node[2]; -}; - -#define LOWER 0 -#define UPPER 1 - -struct backref_cache { - /* red black tree of all backref nodes in the cache */ - struct rb_root rb_root; - /* for passing backref nodes to btrfs_reloc_cow_block */ - struct backref_node *path[BTRFS_MAX_LEVEL]; - /* - * list of blocks that have been cowed but some block - * pointers in upper level blocks may not reflect the - * new location - */ - struct list_head pending[BTRFS_MAX_LEVEL]; - /* list of backref nodes with no child node */ - struct list_head leaves; - /* list of blocks that have been cowed in current transaction */ - struct list_head changed; - /* list of detached backref node. 
*/ - struct list_head detached; - - u64 last_trans; - - int nr_nodes; - int nr_edges; -}; - -/* - * map address of tree root to tree - */ -struct mapping_node { - struct rb_node rb_node; - u64 bytenr; - void *data; -}; - -struct mapping_tree { - struct rb_root rb_root; - spinlock_t lock; -}; - -/* - * present a tree block to process - */ -struct tree_block { - struct rb_node rb_node; - u64 bytenr; - struct btrfs_key key; - unsigned int level:8; - unsigned int key_ready:1; -}; - -#define MAX_EXTENTS 128 - -struct file_extent_cluster { - u64 start; - u64 end; - u64 boundary[MAX_EXTENTS]; - unsigned int nr; -}; - -struct reloc_control { - /* block group to relocate */ - struct btrfs_block_group_cache *block_group; - /* extent tree */ - struct btrfs_root *extent_root; - /* inode for moving data */ - struct inode *data_inode; - - struct btrfs_block_rsv *block_rsv; - - struct backref_cache backref_cache; - - struct file_extent_cluster cluster; - /* tree blocks have been processed */ - struct extent_io_tree processed_blocks; - /* map start of tree root to corresponding reloc tree */ - struct mapping_tree reloc_root_tree; - /* list of reloc trees */ - struct list_head reloc_roots; - /* size of metadata reservation for merging reloc trees */ - u64 merging_rsv_size; - /* size of relocated tree nodes */ - u64 nodes_relocated; - - u64 search_start; - u64 extents_found; - - unsigned int stage:8; - unsigned int create_reloc_tree:1; - unsigned int merge_reloc_tree:1; - unsigned int found_file_extent:1; - unsigned int commit_transaction:1; -}; - -/* stages of data relocation */ -#define MOVE_DATA_EXTENTS 0 -#define UPDATE_DATA_PTRS 1 - -static void remove_backref_node(struct backref_cache *cache, - struct backref_node *node); -static void __mark_block_processed(struct reloc_control *rc, - struct backref_node *node); - -static void mapping_tree_init(struct mapping_tree *tree) -{ - tree->rb_root = RB_ROOT; - spin_lock_init(&tree->lock); -} - -static void backref_cache_init(struct backref_cache *cache) -{ - int i; - cache->rb_root = RB_ROOT; - for (i = 0; i < BTRFS_MAX_LEVEL; i++) - INIT_LIST_HEAD(&cache->pending[i]); - INIT_LIST_HEAD(&cache->changed); - INIT_LIST_HEAD(&cache->detached); - INIT_LIST_HEAD(&cache->leaves); -} - -static void backref_cache_cleanup(struct backref_cache *cache) -{ - struct backref_node *node; - int i; - - while (!list_empty(&cache->detached)) { - node = list_entry(cache->detached.next, - struct backref_node, list); - remove_backref_node(cache, node); - } - - while (!list_empty(&cache->leaves)) { - node = list_entry(cache->leaves.next, - struct backref_node, lower); - remove_backref_node(cache, node); - } - - cache->last_trans = 0; - - for (i = 0; i < BTRFS_MAX_LEVEL; i++) - BUG_ON(!list_empty(&cache->pending[i])); - BUG_ON(!list_empty(&cache->changed)); - BUG_ON(!list_empty(&cache->detached)); - BUG_ON(!RB_EMPTY_ROOT(&cache->rb_root)); - BUG_ON(cache->nr_nodes); - BUG_ON(cache->nr_edges); -} - -static struct backref_node *alloc_backref_node(struct backref_cache *cache) -{ - struct backref_node *node; - - node = kzalloc(sizeof(*node), GFP_NOFS); - if (node) { - INIT_LIST_HEAD(&node->list); - INIT_LIST_HEAD(&node->upper); - INIT_LIST_HEAD(&node->lower); - RB_CLEAR_NODE(&node->rb_node); - cache->nr_nodes++; - } - return node; -} - -static void free_backref_node(struct backref_cache *cache, - struct backref_node *node) -{ - if (node) { - cache->nr_nodes--; - kfree(node); - } -} - -static struct backref_edge *alloc_backref_edge(struct backref_cache *cache) -{ - struct backref_edge 
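
alloc_backref_node()/free_backref_node() above maintain the nr_nodes (and, below, nr_edges) counters solely so that backref_cache_cleanup() can assert the cache really is empty when relocation finishes. A minimal model of that leak check, with invented names:

#include <assert.h>
#include <stdlib.h>

struct cache { int nr_nodes; };

static void *cache_alloc(struct cache *c, size_t sz)
{
	void *p = calloc(1, sz);

	if (p)
		c->nr_nodes++;	/* count every live allocation */
	return p;
}

static void cache_free(struct cache *c, void *p)
{
	if (p) {
		c->nr_nodes--;
		free(p);
	}
}

static void cache_cleanup(struct cache *c)
{
	/* every allocation must have been returned by now */
	assert(c->nr_nodes == 0);
}
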
*edge; - - edge = kzalloc(sizeof(*edge), GFP_NOFS); - if (edge) - cache->nr_edges++; - return edge; -} - -static void free_backref_edge(struct backref_cache *cache, - struct backref_edge *edge) -{ - if (edge) { - cache->nr_edges--; - kfree(edge); - } -} - -static struct rb_node *tree_insert(struct rb_root *root, u64 bytenr, - struct rb_node *node) -{ - struct rb_node **p = &root->rb_node; - struct rb_node *parent = NULL; - struct tree_entry *entry; - - while (*p) { - parent = *p; - entry = rb_entry(parent, struct tree_entry, rb_node); - - if (bytenr < entry->bytenr) - p = &(*p)->rb_left; - else if (bytenr > entry->bytenr) - p = &(*p)->rb_right; - else - return parent; - } - - rb_link_node(node, parent, p); - rb_insert_color(node, root); - return NULL; -} - -static struct rb_node *tree_search(struct rb_root *root, u64 bytenr) -{ - struct rb_node *n = root->rb_node; - struct tree_entry *entry; - - while (n) { - entry = rb_entry(n, struct tree_entry, rb_node); - - if (bytenr < entry->bytenr) - n = n->rb_left; - else if (bytenr > entry->bytenr) - n = n->rb_right; - else - return n; - } - return NULL; -} - -void backref_tree_panic(struct rb_node *rb_node, int errno, - u64 bytenr) -{ - - struct btrfs_fs_info *fs_info = NULL; - struct backref_node *bnode = rb_entry(rb_node, struct backref_node, - rb_node); - if (bnode->root) - fs_info = bnode->root->fs_info; - btrfs_panic(fs_info, errno, "Inconsistency in backref cache " - "found at offset %llu\n", (unsigned long long)bytenr); -} - -/* - * walk up backref nodes until reach node presents tree root - */ -static struct backref_node *walk_up_backref(struct backref_node *node, - struct backref_edge *edges[], - int *index) -{ - struct backref_edge *edge; - int idx = *index; - - while (!list_empty(&node->upper)) { - edge = list_entry(node->upper.next, - struct backref_edge, list[LOWER]); - edges[idx++] = edge; - node = edge->node[UPPER]; - } - BUG_ON(node->detached); - *index = idx; - return node; -} - -/* - * walk down backref nodes to find start of next reference path - */ -static struct backref_node *walk_down_backref(struct backref_edge *edges[], - int *index) -{ - struct backref_edge *edge; - struct backref_node *lower; - int idx = *index; - - while (idx > 0) { - edge = edges[idx - 1]; - lower = edge->node[LOWER]; - if (list_is_last(&edge->list[LOWER], &lower->upper)) { - idx--; - continue; - } - edge = list_entry(edge->list[LOWER].next, - struct backref_edge, list[LOWER]); - edges[idx - 1] = edge; - *index = idx; - return edge->node[UPPER]; - } - *index = 0; - return NULL; -} - -static void unlock_node_buffer(struct backref_node *node) -{ - if (node->locked) { - btrfs_tree_unlock(node->eb); - node->locked = 0; - } -} - -static void drop_node_buffer(struct backref_node *node) -{ - if (node->eb) { - unlock_node_buffer(node); - free_extent_buffer(node->eb); - node->eb = NULL; - } -} - -static void drop_backref_node(struct backref_cache *tree, - struct backref_node *node) -{ - BUG_ON(!list_empty(&node->upper)); - - drop_node_buffer(node); - list_del(&node->list); - list_del(&node->lower); - if (!RB_EMPTY_NODE(&node->rb_node)) - rb_erase(&node->rb_node, &tree->rb_root); - free_backref_node(tree, node); -} - -/* - * remove a backref node from the backref cache - */ -static void remove_backref_node(struct backref_cache *cache, - struct backref_node *node) -{ - struct backref_node *upper; - struct backref_edge *edge; - - if (!node) - return; - - BUG_ON(!node->lowest && !node->detached); - while (!list_empty(&node->upper)) { - edge = 
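
tree_insert()/tree_search() above serve several different caches because backref_node, mapping_node and tree_block all begin with the same rb_node-plus-bytenr prefix (the tree_entry noted earlier), so one keyed routine works for all of them. A user-space sketch of that pattern; a plain binary search tree stands in for the kernel rb-tree, and returning the clashing entry on a duplicate key mirrors tree_insert().

#include <stddef.h>
#include <stdint.h>

struct tree_entry {
	struct tree_entry *left, *right;
	uint64_t bytenr;
};

struct mapping_node {		/* any cached object just embeds the prefix */
	struct tree_entry entry;
	void *data;
};

static struct tree_entry *tree_insert(struct tree_entry **root,
				      struct tree_entry *e)
{
	e->left = e->right = NULL;
	while (*root) {
		if (e->bytenr < (*root)->bytenr)
			root = &(*root)->left;
		else if (e->bytenr > (*root)->bytenr)
			root = &(*root)->right;
		else
			return *root;	/* duplicate key: hand back the clash */
	}
	*root = e;
	return NULL;
}

static struct tree_entry *tree_search(struct tree_entry *root, uint64_t bytenr)
{
	while (root && root->bytenr != bytenr)
		root = bytenr < root->bytenr ? root->left : root->right;
	return root;
}
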
list_entry(node->upper.next, struct backref_edge, - list[LOWER]); - upper = edge->node[UPPER]; - list_del(&edge->list[LOWER]); - list_del(&edge->list[UPPER]); - free_backref_edge(cache, edge); - - if (RB_EMPTY_NODE(&upper->rb_node)) { - BUG_ON(!list_empty(&node->upper)); - drop_backref_node(cache, node); - node = upper; - node->lowest = 1; - continue; - } - /* - * add the node to leaf node list if no other - * child block cached. - */ - if (list_empty(&upper->lower)) { - list_add_tail(&upper->lower, &cache->leaves); - upper->lowest = 1; - } - } - - drop_backref_node(cache, node); -} - -static void update_backref_node(struct backref_cache *cache, - struct backref_node *node, u64 bytenr) -{ - struct rb_node *rb_node; - rb_erase(&node->rb_node, &cache->rb_root); - node->bytenr = bytenr; - rb_node = tree_insert(&cache->rb_root, node->bytenr, &node->rb_node); - if (rb_node) - backref_tree_panic(rb_node, -EEXIST, bytenr); -} - -/* - * update backref cache after a transaction commit - */ -static int update_backref_cache(struct btrfs_trans_handle *trans, - struct backref_cache *cache) -{ - struct backref_node *node; - int level = 0; - - if (cache->last_trans == 0) { - cache->last_trans = trans->transid; - return 0; - } - - if (cache->last_trans == trans->transid) - return 0; - - /* - * detached nodes are used to avoid unnecessary backref - * lookup. transaction commit changes the extent tree. - * so the detached nodes are no longer useful. - */ - while (!list_empty(&cache->detached)) { - node = list_entry(cache->detached.next, - struct backref_node, list); - remove_backref_node(cache, node); - } - - while (!list_empty(&cache->changed)) { - node = list_entry(cache->changed.next, - struct backref_node, list); - list_del_init(&node->list); - BUG_ON(node->pending); - update_backref_node(cache, node, node->new_bytenr); - } - - /* - * some nodes can be left in the pending list if there were - * errors during processing the pending nodes. - */ - for (level = 0; level < BTRFS_MAX_LEVEL; level++) { - list_for_each_entry(node, &cache->pending[level], list) { - BUG_ON(!node->pending); - if (node->bytenr == node->new_bytenr) - continue; - update_backref_node(cache, node, node->new_bytenr); - } - } - - cache->last_trans = 0; - return 1; -} - - -static int should_ignore_root(struct btrfs_root *root) -{ - struct btrfs_root *reloc_root; - - if (!root->ref_cows) - return 0; - - reloc_root = root->reloc_root; - if (!reloc_root) - return 0; - - if (btrfs_root_last_snapshot(&reloc_root->root_item) == - root->fs_info->running_transaction->transid - 1) - return 0; - /* - * if there is reloc tree and it was created in previous - * transaction backref lookup can find the reloc tree, - * so backref node for the fs tree root is useless for - * relocation. 
- */ - return 1; -} -/* - * find reloc tree by address of tree root - */ -static struct btrfs_root *find_reloc_root(struct reloc_control *rc, - u64 bytenr) -{ - struct rb_node *rb_node; - struct mapping_node *node; - struct btrfs_root *root = NULL; - - spin_lock(&rc->reloc_root_tree.lock); - rb_node = tree_search(&rc->reloc_root_tree.rb_root, bytenr); - if (rb_node) { - node = rb_entry(rb_node, struct mapping_node, rb_node); - root = (struct btrfs_root *)node->data; - } - spin_unlock(&rc->reloc_root_tree.lock); - return root; -} - -static int is_cowonly_root(u64 root_objectid) -{ - if (root_objectid == BTRFS_ROOT_TREE_OBJECTID || - root_objectid == BTRFS_EXTENT_TREE_OBJECTID || - root_objectid == BTRFS_CHUNK_TREE_OBJECTID || - root_objectid == BTRFS_DEV_TREE_OBJECTID || - root_objectid == BTRFS_TREE_LOG_OBJECTID || - root_objectid == BTRFS_CSUM_TREE_OBJECTID) - return 1; - return 0; -} - -static struct btrfs_root *read_fs_root(struct btrfs_fs_info *fs_info, - u64 root_objectid) -{ - struct btrfs_key key; - - key.objectid = root_objectid; - key.type = BTRFS_ROOT_ITEM_KEY; - if (is_cowonly_root(root_objectid)) - key.offset = 0; - else - key.offset = (u64)-1; - - return btrfs_read_fs_root_no_name(fs_info, &key); -} - -#ifdef BTRFS_COMPAT_EXTENT_TREE_V0 -static noinline_for_stack -struct btrfs_root *find_tree_root(struct reloc_control *rc, - struct extent_buffer *leaf, - struct btrfs_extent_ref_v0 *ref0) -{ - struct btrfs_root *root; - u64 root_objectid = btrfs_ref_root_v0(leaf, ref0); - u64 generation = btrfs_ref_generation_v0(leaf, ref0); - - BUG_ON(root_objectid == BTRFS_TREE_RELOC_OBJECTID); - - root = read_fs_root(rc->extent_root->fs_info, root_objectid); - BUG_ON(IS_ERR(root)); - - if (root->ref_cows && - generation != btrfs_root_generation(&root->root_item)) - return NULL; - - return root; -} -#endif - -static noinline_for_stack -int find_inline_backref(struct extent_buffer *leaf, int slot, - unsigned long *ptr, unsigned long *end) -{ - struct btrfs_extent_item *ei; - struct btrfs_tree_block_info *bi; - u32 item_size; - - item_size = btrfs_item_size_nr(leaf, slot); -#ifdef BTRFS_COMPAT_EXTENT_TREE_V0 - if (item_size < sizeof(*ei)) { - WARN_ON(item_size != sizeof(struct btrfs_extent_item_v0)); - return 1; - } -#endif - ei = btrfs_item_ptr(leaf, slot, struct btrfs_extent_item); - WARN_ON(!(btrfs_extent_flags(leaf, ei) & - BTRFS_EXTENT_FLAG_TREE_BLOCK)); - - if (item_size <= sizeof(*ei) + sizeof(*bi)) { - WARN_ON(item_size < sizeof(*ei) + sizeof(*bi)); - return 1; - } - - bi = (struct btrfs_tree_block_info *)(ei + 1); - *ptr = (unsigned long)(bi + 1); - *end = (unsigned long)ei + item_size; - return 0; -} - -/* - * build backref tree for a given tree block. root of the backref tree - * corresponds the tree block, leaves of the backref tree correspond - * roots of b-trees that reference the tree block. - * - * the basic idea of this function is check backrefs of a given block - * to find upper level blocks that refernece the block, and then check - * bakcrefs of these upper level blocks recursively. the recursion stop - * when tree root is reached or backrefs for the block is cached. - * - * NOTE: if we find backrefs for a block are cached, we know backrefs - * for all upper level blocks that directly/indirectly reference the - * block are also cached. 
- */ -static noinline_for_stack -struct backref_node *build_backref_tree(struct reloc_control *rc, - struct btrfs_key *node_key, - int level, u64 bytenr) -{ - struct backref_cache *cache = &rc->backref_cache; - struct btrfs_path *path1; - struct btrfs_path *path2; - struct extent_buffer *eb; - struct btrfs_root *root; - struct backref_node *cur; - struct backref_node *upper; - struct backref_node *lower; - struct backref_node *node = NULL; - struct backref_node *exist = NULL; - struct backref_edge *edge; - struct rb_node *rb_node; - struct btrfs_key key; - unsigned long end; - unsigned long ptr; - LIST_HEAD(list); - LIST_HEAD(useless); - int cowonly; - int ret; - int err = 0; - - path1 = btrfs_alloc_path(); - path2 = btrfs_alloc_path(); - if (!path1 || !path2) { - err = -ENOMEM; - goto out; - } - path1->reada = 1; - path2->reada = 2; - - node = alloc_backref_node(cache); - if (!node) { - err = -ENOMEM; - goto out; - } - - node->bytenr = bytenr; - node->level = level; - node->lowest = 1; - cur = node; -again: - end = 0; - ptr = 0; - key.objectid = cur->bytenr; - key.type = BTRFS_EXTENT_ITEM_KEY; - key.offset = (u64)-1; - - path1->search_commit_root = 1; - path1->skip_locking = 1; - ret = btrfs_search_slot(NULL, rc->extent_root, &key, path1, - 0, 0); - if (ret < 0) { - err = ret; - goto out; - } - BUG_ON(!ret || !path1->slots[0]); - - path1->slots[0]--; - - WARN_ON(cur->checked); - if (!list_empty(&cur->upper)) { - /* - * the backref was added previously when processing - * backref of type BTRFS_TREE_BLOCK_REF_KEY - */ - BUG_ON(!list_is_singular(&cur->upper)); - edge = list_entry(cur->upper.next, struct backref_edge, - list[LOWER]); - BUG_ON(!list_empty(&edge->list[UPPER])); - exist = edge->node[UPPER]; - /* - * add the upper level block to pending list if we need - * check its backrefs - */ - if (!exist->checked) - list_add_tail(&edge->list[UPPER], &list); - } else { - exist = NULL; - } - - while (1) { - cond_resched(); - eb = path1->nodes[0]; - - if (ptr >= end) { - if (path1->slots[0] >= btrfs_header_nritems(eb)) { - ret = btrfs_next_leaf(rc->extent_root, path1); - if (ret < 0) { - err = ret; - goto out; - } - if (ret > 0) - break; - eb = path1->nodes[0]; - } - - btrfs_item_key_to_cpu(eb, &key, path1->slots[0]); - if (key.objectid != cur->bytenr) { - WARN_ON(exist); - break; - } - - if (key.type == BTRFS_EXTENT_ITEM_KEY) { - ret = find_inline_backref(eb, path1->slots[0], - &ptr, &end); - if (ret) - goto next; - } - } - - if (ptr < end) { - /* update key for inline back ref */ - struct btrfs_extent_inline_ref *iref; - iref = (struct btrfs_extent_inline_ref *)ptr; - key.type = btrfs_extent_inline_ref_type(eb, iref); - key.offset = btrfs_extent_inline_ref_offset(eb, iref); - WARN_ON(key.type != BTRFS_TREE_BLOCK_REF_KEY && - key.type != BTRFS_SHARED_BLOCK_REF_KEY); - } - - if (exist && - ((key.type == BTRFS_TREE_BLOCK_REF_KEY && - exist->owner == key.offset) || - (key.type == BTRFS_SHARED_BLOCK_REF_KEY && - exist->bytenr == key.offset))) { - exist = NULL; - goto next; - } - -#ifdef BTRFS_COMPAT_EXTENT_TREE_V0 - if (key.type == BTRFS_SHARED_BLOCK_REF_KEY || - key.type == BTRFS_EXTENT_REF_V0_KEY) { - if (key.type == BTRFS_EXTENT_REF_V0_KEY) { - struct btrfs_extent_ref_v0 *ref0; - ref0 = btrfs_item_ptr(eb, path1->slots[0], - struct btrfs_extent_ref_v0); - if (key.objectid == key.offset) { - root = find_tree_root(rc, eb, ref0); - if (root && !should_ignore_root(root)) - cur->root = root; - else - list_add(&cur->list, &useless); - break; - } - if (is_cowonly_root(btrfs_ref_root_v0(eb, - ref0))) - 
cur->cowonly = 1; - } -#else - BUG_ON(key.type == BTRFS_EXTENT_REF_V0_KEY); - if (key.type == BTRFS_SHARED_BLOCK_REF_KEY) { -#endif - if (key.objectid == key.offset) { - /* - * only root blocks of reloc trees use - * backref of this type. - */ - root = find_reloc_root(rc, cur->bytenr); - BUG_ON(!root); - cur->root = root; - break; - } - - edge = alloc_backref_edge(cache); - if (!edge) { - err = -ENOMEM; - goto out; - } - rb_node = tree_search(&cache->rb_root, key.offset); - if (!rb_node) { - upper = alloc_backref_node(cache); - if (!upper) { - free_backref_edge(cache, edge); - err = -ENOMEM; - goto out; - } - upper->bytenr = key.offset; - upper->level = cur->level + 1; - /* - * backrefs for the upper level block isn't - * cached, add the block to pending list - */ - list_add_tail(&edge->list[UPPER], &list); - } else { - upper = rb_entry(rb_node, struct backref_node, - rb_node); - BUG_ON(!upper->checked); - INIT_LIST_HEAD(&edge->list[UPPER]); - } - list_add_tail(&edge->list[LOWER], &cur->upper); - edge->node[LOWER] = cur; - edge->node[UPPER] = upper; - - goto next; - } else if (key.type != BTRFS_TREE_BLOCK_REF_KEY) { - goto next; - } - - /* key.type == BTRFS_TREE_BLOCK_REF_KEY */ - root = read_fs_root(rc->extent_root->fs_info, key.offset); - if (IS_ERR(root)) { - err = PTR_ERR(root); - goto out; - } - - if (!root->ref_cows) - cur->cowonly = 1; - - if (btrfs_root_level(&root->root_item) == cur->level) { - /* tree root */ - BUG_ON(btrfs_root_bytenr(&root->root_item) != - cur->bytenr); - if (should_ignore_root(root)) - list_add(&cur->list, &useless); - else - cur->root = root; - break; - } - - level = cur->level + 1; - - /* - * searching the tree to find upper level blocks - * reference the block. - */ - path2->search_commit_root = 1; - path2->skip_locking = 1; - path2->lowest_level = level; - ret = btrfs_search_slot(NULL, root, node_key, path2, 0, 0); - path2->lowest_level = 0; - if (ret < 0) { - err = ret; - goto out; - } - if (ret > 0 && path2->slots[level] > 0) - path2->slots[level]--; - - eb = path2->nodes[level]; - WARN_ON(btrfs_node_blockptr(eb, path2->slots[level]) != - cur->bytenr); - - lower = cur; - for (; level < BTRFS_MAX_LEVEL; level++) { - if (!path2->nodes[level]) { - BUG_ON(btrfs_root_bytenr(&root->root_item) != - lower->bytenr); - if (should_ignore_root(root)) - list_add(&lower->list, &useless); - else - lower->root = root; - break; - } - - edge = alloc_backref_edge(cache); - if (!edge) { - err = -ENOMEM; - goto out; - } - - eb = path2->nodes[level]; - rb_node = tree_search(&cache->rb_root, eb->start); - if (!rb_node) { - upper = alloc_backref_node(cache); - if (!upper) { - free_backref_edge(cache, edge); - err = -ENOMEM; - goto out; - } - upper->bytenr = eb->start; - upper->owner = btrfs_header_owner(eb); - upper->level = lower->level + 1; - if (!root->ref_cows) - upper->cowonly = 1; - - /* - * if we know the block isn't shared - * we can void checking its backrefs. - */ - if (btrfs_block_can_be_shared(root, eb)) - upper->checked = 0; - else - upper->checked = 1; - - /* - * add the block to pending list if we - * need check its backrefs. only block - * at 'cur->level + 1' is added to the - * tail of pending list. this guarantees - * we check backrefs from lower level - * blocks to upper level blocks. 
- */ - if (!upper->checked && - level == cur->level + 1) { - list_add_tail(&edge->list[UPPER], - &list); - } else - INIT_LIST_HEAD(&edge->list[UPPER]); - } else { - upper = rb_entry(rb_node, struct backref_node, - rb_node); - BUG_ON(!upper->checked); - INIT_LIST_HEAD(&edge->list[UPPER]); - if (!upper->owner) - upper->owner = btrfs_header_owner(eb); - } - list_add_tail(&edge->list[LOWER], &lower->upper); - edge->node[LOWER] = lower; - edge->node[UPPER] = upper; - - if (rb_node) - break; - lower = upper; - upper = NULL; - } - btrfs_release_path(path2); -next: - if (ptr < end) { - ptr += btrfs_extent_inline_ref_size(key.type); - if (ptr >= end) { - WARN_ON(ptr > end); - ptr = 0; - end = 0; - } - } - if (ptr >= end) - path1->slots[0]++; - } - btrfs_release_path(path1); - - cur->checked = 1; - WARN_ON(exist); - - /* the pending list isn't empty, take the first block to process */ - if (!list_empty(&list)) { - edge = list_entry(list.next, struct backref_edge, list[UPPER]); - list_del_init(&edge->list[UPPER]); - cur = edge->node[UPPER]; - goto again; - } - - /* - * everything goes well, connect backref nodes and insert backref nodes - * into the cache. - */ - BUG_ON(!node->checked); - cowonly = node->cowonly; - if (!cowonly) { - rb_node = tree_insert(&cache->rb_root, node->bytenr, - &node->rb_node); - if (rb_node) - backref_tree_panic(rb_node, -EEXIST, node->bytenr); - list_add_tail(&node->lower, &cache->leaves); - } - - list_for_each_entry(edge, &node->upper, list[LOWER]) - list_add_tail(&edge->list[UPPER], &list); - - while (!list_empty(&list)) { - edge = list_entry(list.next, struct backref_edge, list[UPPER]); - list_del_init(&edge->list[UPPER]); - upper = edge->node[UPPER]; - if (upper->detached) { - list_del(&edge->list[LOWER]); - lower = edge->node[LOWER]; - free_backref_edge(cache, edge); - if (list_empty(&lower->upper)) - list_add(&lower->list, &useless); - continue; - } - - if (!RB_EMPTY_NODE(&upper->rb_node)) { - if (upper->lowest) { - list_del_init(&upper->lower); - upper->lowest = 0; - } - - list_add_tail(&edge->list[UPPER], &upper->lower); - continue; - } - - BUG_ON(!upper->checked); - BUG_ON(cowonly != upper->cowonly); - if (!cowonly) { - rb_node = tree_insert(&cache->rb_root, upper->bytenr, - &upper->rb_node); - if (rb_node) - backref_tree_panic(rb_node, -EEXIST, - upper->bytenr); - } - - list_add_tail(&edge->list[UPPER], &upper->lower); - - list_for_each_entry(edge, &upper->upper, list[LOWER]) - list_add_tail(&edge->list[UPPER], &list); - } - /* - * process useless backref nodes. backref nodes for tree leaves - * are deleted from the cache. backref nodes for upper level - * tree blocks are left in the cache to avoid unnecessary backref - * lookup. 
- */ - while (!list_empty(&useless)) { - upper = list_entry(useless.next, struct backref_node, list); - list_del_init(&upper->list); - BUG_ON(!list_empty(&upper->upper)); - if (upper == node) - node = NULL; - if (upper->lowest) { - list_del_init(&upper->lower); - upper->lowest = 0; - } - while (!list_empty(&upper->lower)) { - edge = list_entry(upper->lower.next, - struct backref_edge, list[UPPER]); - list_del(&edge->list[UPPER]); - list_del(&edge->list[LOWER]); - lower = edge->node[LOWER]; - free_backref_edge(cache, edge); - - if (list_empty(&lower->upper)) - list_add(&lower->list, &useless); - } - __mark_block_processed(rc, upper); - if (upper->level > 0) { - list_add(&upper->list, &cache->detached); - upper->detached = 1; - } else { - rb_erase(&upper->rb_node, &cache->rb_root); - free_backref_node(cache, upper); - } - } -out: - btrfs_free_path(path1); - btrfs_free_path(path2); - if (err) { - while (!list_empty(&useless)) { - lower = list_entry(useless.next, - struct backref_node, upper); - list_del_init(&lower->upper); - } - upper = node; - INIT_LIST_HEAD(&list); - while (upper) { - if (RB_EMPTY_NODE(&upper->rb_node)) { - list_splice_tail(&upper->upper, &list); - free_backref_node(cache, upper); - } - - if (list_empty(&list)) - break; - - edge = list_entry(list.next, struct backref_edge, - list[LOWER]); - list_del(&edge->list[LOWER]); - upper = edge->node[UPPER]; - free_backref_edge(cache, edge); - } - return ERR_PTR(err); - } - BUG_ON(node && node->detached); - return node; -} - -/* - * helper to add backref node for the newly created snapshot. - * the backref node is created by cloning backref node that - * corresponds to root of source tree - */ -static int clone_backref_node(struct btrfs_trans_handle *trans, - struct reloc_control *rc, - struct btrfs_root *src, - struct btrfs_root *dest) -{ - struct btrfs_root *reloc_root = src->reloc_root; - struct backref_cache *cache = &rc->backref_cache; - struct backref_node *node = NULL; - struct backref_node *new_node; - struct backref_edge *edge; - struct backref_edge *new_edge; - struct rb_node *rb_node; - - if (cache->last_trans > 0) - update_backref_cache(trans, cache); - - rb_node = tree_search(&cache->rb_root, src->commit_root->start); - if (rb_node) { - node = rb_entry(rb_node, struct backref_node, rb_node); - if (node->detached) - node = NULL; - else - BUG_ON(node->new_bytenr != reloc_root->node->start); - } - - if (!node) { - rb_node = tree_search(&cache->rb_root, - reloc_root->commit_root->start); - if (rb_node) { - node = rb_entry(rb_node, struct backref_node, - rb_node); - BUG_ON(node->detached); - } - } - - if (!node) - return 0; - - new_node = alloc_backref_node(cache); - if (!new_node) - return -ENOMEM; - - new_node->bytenr = dest->node->start; - new_node->level = node->level; - new_node->lowest = node->lowest; - new_node->checked = 1; - new_node->root = dest; - - if (!node->lowest) { - list_for_each_entry(edge, &node->lower, list[UPPER]) { - new_edge = alloc_backref_edge(cache); - if (!new_edge) - goto fail; - - new_edge->node[UPPER] = new_node; - new_edge->node[LOWER] = edge->node[LOWER]; - list_add_tail(&new_edge->list[UPPER], - &new_node->lower); - } - } else { - list_add_tail(&new_node->lower, &cache->leaves); - } - - rb_node = tree_insert(&cache->rb_root, new_node->bytenr, - &new_node->rb_node); - if (rb_node) - backref_tree_panic(rb_node, -EEXIST, new_node->bytenr); - - if (!new_node->lowest) { - list_for_each_entry(new_edge, &new_node->lower, list[UPPER]) { - list_add_tail(&new_edge->list[LOWER], - 
&new_edge->node[LOWER]->upper); - } - } - return 0; -fail: - while (!list_empty(&new_node->lower)) { - new_edge = list_entry(new_node->lower.next, - struct backref_edge, list[UPPER]); - list_del(&new_edge->list[UPPER]); - free_backref_edge(cache, new_edge); - } - free_backref_node(cache, new_node); - return -ENOMEM; -} - -/* - * helper to add 'address of tree root -> reloc tree' mapping - */ -static int __must_check __add_reloc_root(struct btrfs_root *root) -{ - struct rb_node *rb_node; - struct mapping_node *node; - struct reloc_control *rc = root->fs_info->reloc_ctl; - - node = kmalloc(sizeof(*node), GFP_NOFS); - if (!node) - return -ENOMEM; - - node->bytenr = root->node->start; - node->data = root; - - spin_lock(&rc->reloc_root_tree.lock); - rb_node = tree_insert(&rc->reloc_root_tree.rb_root, - node->bytenr, &node->rb_node); - spin_unlock(&rc->reloc_root_tree.lock); - if (rb_node) { - kfree(node); - btrfs_panic(root->fs_info, -EEXIST, "Duplicate root found " - "for start=%llu while inserting into relocation " - "tree\n"); - } - - list_add_tail(&root->root_list, &rc->reloc_roots); - return 0; -} - -/* - * helper to update/delete the 'address of tree root -> reloc tree' - * mapping - */ -static int __update_reloc_root(struct btrfs_root *root, int del) -{ - struct rb_node *rb_node; - struct mapping_node *node = NULL; - struct reloc_control *rc = root->fs_info->reloc_ctl; - - spin_lock(&rc->reloc_root_tree.lock); - rb_node = tree_search(&rc->reloc_root_tree.rb_root, - root->commit_root->start); - if (rb_node) { - node = rb_entry(rb_node, struct mapping_node, rb_node); - rb_erase(&node->rb_node, &rc->reloc_root_tree.rb_root); - } - spin_unlock(&rc->reloc_root_tree.lock); - - BUG_ON((struct btrfs_root *)node->data != root); - - if (!del) { - spin_lock(&rc->reloc_root_tree.lock); - node->bytenr = root->node->start; - rb_node = tree_insert(&rc->reloc_root_tree.rb_root, - node->bytenr, &node->rb_node); - spin_unlock(&rc->reloc_root_tree.lock); - if (rb_node) - backref_tree_panic(rb_node, -EEXIST, node->bytenr); - } else { - spin_lock(&root->fs_info->trans_lock); - list_del_init(&root->root_list); - spin_unlock(&root->fs_info->trans_lock); - kfree(node); - } - return 0; -} - -static struct btrfs_root *create_reloc_root(struct btrfs_trans_handle *trans, - struct btrfs_root *root, u64 objectid) -{ - struct btrfs_root *reloc_root; - struct extent_buffer *eb; - struct btrfs_root_item *root_item; - struct btrfs_key root_key; - int ret; - - root_item = kmalloc(sizeof(*root_item), GFP_NOFS); - BUG_ON(!root_item); - - root_key.objectid = BTRFS_TREE_RELOC_OBJECTID; - root_key.type = BTRFS_ROOT_ITEM_KEY; - root_key.offset = objectid; - - if (root->root_key.objectid == objectid) { - /* called by btrfs_init_reloc_root */ - ret = btrfs_copy_root(trans, root, root->commit_root, &eb, - BTRFS_TREE_RELOC_OBJECTID); - BUG_ON(ret); - - btrfs_set_root_last_snapshot(&root->root_item, - trans->transid - 1); - } else { - /* - * called by btrfs_reloc_post_snapshot_hook. - * the source tree is a reloc tree, all tree blocks - * modified after it was created have RELOC flag - * set in their headers. so it's OK to not update - * the 'last_snapshot'. 
- */ - ret = btrfs_copy_root(trans, root, root->node, &eb, - BTRFS_TREE_RELOC_OBJECTID); - BUG_ON(ret); - } - - memcpy(root_item, &root->root_item, sizeof(*root_item)); - btrfs_set_root_bytenr(root_item, eb->start); - btrfs_set_root_level(root_item, btrfs_header_level(eb)); - btrfs_set_root_generation(root_item, trans->transid); - - if (root->root_key.objectid == objectid) { - btrfs_set_root_refs(root_item, 0); - memset(&root_item->drop_progress, 0, - sizeof(struct btrfs_disk_key)); - root_item->drop_level = 0; - } - - btrfs_tree_unlock(eb); - free_extent_buffer(eb); - - ret = btrfs_insert_root(trans, root->fs_info->tree_root, - &root_key, root_item); - BUG_ON(ret); - kfree(root_item); - - reloc_root = btrfs_read_fs_root_no_radix(root->fs_info->tree_root, - &root_key); - BUG_ON(IS_ERR(reloc_root)); - reloc_root->last_trans = trans->transid; - return reloc_root; -} - -/* - * create reloc tree for a given fs tree. reloc tree is just a - * snapshot of the fs tree with special root objectid. - */ -int btrfs_init_reloc_root(struct btrfs_trans_handle *trans, - struct btrfs_root *root) -{ - struct btrfs_root *reloc_root; - struct reloc_control *rc = root->fs_info->reloc_ctl; - int clear_rsv = 0; - int ret; - - if (root->reloc_root) { - reloc_root = root->reloc_root; - reloc_root->last_trans = trans->transid; - return 0; - } - - if (!rc || !rc->create_reloc_tree || - root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) - return 0; - - if (!trans->block_rsv) { - trans->block_rsv = rc->block_rsv; - clear_rsv = 1; - } - reloc_root = create_reloc_root(trans, root, root->root_key.objectid); - if (clear_rsv) - trans->block_rsv = NULL; - - ret = __add_reloc_root(reloc_root); - BUG_ON(ret < 0); - root->reloc_root = reloc_root; - return 0; -} - -/* - * update root item of reloc tree - */ -int btrfs_update_reloc_root(struct btrfs_trans_handle *trans, - struct btrfs_root *root) -{ - struct btrfs_root *reloc_root; - struct btrfs_root_item *root_item; - int del = 0; - int ret; - - if (!root->reloc_root) - goto out; - - reloc_root = root->reloc_root; - root_item = &reloc_root->root_item; - - if (root->fs_info->reloc_ctl->merge_reloc_tree && - btrfs_root_refs(root_item) == 0) { - root->reloc_root = NULL; - del = 1; - } - - __update_reloc_root(reloc_root, del); - - if (reloc_root->commit_root != reloc_root->node) { - btrfs_set_root_node(root_item, reloc_root->node); - free_extent_buffer(reloc_root->commit_root); - reloc_root->commit_root = btrfs_root_node(reloc_root); - } - - ret = btrfs_update_root(trans, root->fs_info->tree_root, - &reloc_root->root_key, root_item); - BUG_ON(ret); - -out: - return 0; -} - -/* - * helper to find first cached inode with inode number >= objectid - * in a subvolume - */ -static struct inode *find_next_inode(struct btrfs_root *root, u64 objectid) -{ - struct rb_node *node; - struct rb_node *prev; - struct btrfs_inode *entry; - struct inode *inode; - - spin_lock(&root->inode_lock); -again: - node = root->inode_tree.rb_node; - prev = NULL; - while (node) { - prev = node; - entry = rb_entry(node, struct btrfs_inode, rb_node); - - if (objectid < btrfs_ino(&entry->vfs_inode)) - node = node->rb_left; - else if (objectid > btrfs_ino(&entry->vfs_inode)) - node = node->rb_right; - else - break; - } - if (!node) { - while (prev) { - entry = rb_entry(prev, struct btrfs_inode, rb_node); - if (objectid <= btrfs_ino(&entry->vfs_inode)) { - node = prev; - break; - } - prev = rb_next(prev); - } - } - while (node) { - entry = rb_entry(node, struct btrfs_inode, rb_node); - inode = 
igrab(&entry->vfs_inode); - if (inode) { - spin_unlock(&root->inode_lock); - return inode; - } - - objectid = btrfs_ino(&entry->vfs_inode) + 1; - if (cond_resched_lock(&root->inode_lock)) - goto again; - - node = rb_next(node); - } - spin_unlock(&root->inode_lock); - return NULL; -} - -static int in_block_group(u64 bytenr, - struct btrfs_block_group_cache *block_group) -{ - if (bytenr >= block_group->key.objectid && - bytenr < block_group->key.objectid + block_group->key.offset) - return 1; - return 0; -} - -/* - * get new location of data - */ -static int get_new_location(struct inode *reloc_inode, u64 *new_bytenr, - u64 bytenr, u64 num_bytes) -{ - struct btrfs_root *root = BTRFS_I(reloc_inode)->root; - struct btrfs_path *path; - struct btrfs_file_extent_item *fi; - struct extent_buffer *leaf; - int ret; - - path = btrfs_alloc_path(); - if (!path) - return -ENOMEM; - - bytenr -= BTRFS_I(reloc_inode)->index_cnt; - ret = btrfs_lookup_file_extent(NULL, root, path, btrfs_ino(reloc_inode), - bytenr, 0); - if (ret < 0) - goto out; - if (ret > 0) { - ret = -ENOENT; - goto out; - } - - leaf = path->nodes[0]; - fi = btrfs_item_ptr(leaf, path->slots[0], - struct btrfs_file_extent_item); - - BUG_ON(btrfs_file_extent_offset(leaf, fi) || - btrfs_file_extent_compression(leaf, fi) || - btrfs_file_extent_encryption(leaf, fi) || - btrfs_file_extent_other_encoding(leaf, fi)); - - if (num_bytes != btrfs_file_extent_disk_num_bytes(leaf, fi)) { - ret = 1; - goto out; - } - - *new_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi); - ret = 0; -out: - btrfs_free_path(path); - return ret; -} - -/* - * update file extent items in the tree leaf to point to - * the new locations. - */ -static noinline_for_stack -int replace_file_extents(struct btrfs_trans_handle *trans, - struct reloc_control *rc, - struct btrfs_root *root, - struct extent_buffer *leaf) -{ - struct btrfs_key key; - struct btrfs_file_extent_item *fi; - struct inode *inode = NULL; - u64 parent; - u64 bytenr; - u64 new_bytenr = 0; - u64 num_bytes; - u64 end; - u32 nritems; - u32 i; - int ret; - int first = 1; - int dirty = 0; - - if (rc->stage != UPDATE_DATA_PTRS) - return 0; - - /* reloc trees always use full backref */ - if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) - parent = leaf->start; - else - parent = 0; - - nritems = btrfs_header_nritems(leaf); - for (i = 0; i < nritems; i++) { - cond_resched(); - btrfs_item_key_to_cpu(leaf, &key, i); - if (key.type != BTRFS_EXTENT_DATA_KEY) - continue; - fi = btrfs_item_ptr(leaf, i, struct btrfs_file_extent_item); - if (btrfs_file_extent_type(leaf, fi) == - BTRFS_FILE_EXTENT_INLINE) - continue; - bytenr = btrfs_file_extent_disk_bytenr(leaf, fi); - num_bytes = btrfs_file_extent_disk_num_bytes(leaf, fi); - if (bytenr == 0) - continue; - if (!in_block_group(bytenr, rc->block_group)) - continue; - - /* - * if we are modifying block in fs tree, wait for readpage - * to complete and drop the extent cache - */ - if (root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID) { - if (first) { - inode = find_next_inode(root, key.objectid); - first = 0; - } else if (inode && btrfs_ino(inode) < key.objectid) { - btrfs_add_delayed_iput(inode); - inode = find_next_inode(root, key.objectid); - } - if (inode && btrfs_ino(inode) == key.objectid) { - end = key.offset + - btrfs_file_extent_num_bytes(leaf, fi); - WARN_ON(!IS_ALIGNED(key.offset, - root->sectorsize)); - WARN_ON(!IS_ALIGNED(end, root->sectorsize)); - end--; - ret = try_lock_extent(&BTRFS_I(inode)->io_tree, - key.offset, end); - if (!ret) - continue; - - 
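	/*
	 * try_lock_extent() succeeded, so no readpage is currently using
	 * this file range; the cached extent mappings for [key.offset, end]
	 * are dropped below and the range lock released again before the
	 * disk bytenr in the file extent item is rewritten.
	 */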
btrfs_drop_extent_cache(inode, key.offset, end, - 1); - unlock_extent(&BTRFS_I(inode)->io_tree, - key.offset, end); - } - } - - ret = get_new_location(rc->data_inode, &new_bytenr, - bytenr, num_bytes); - if (ret > 0) { - WARN_ON(1); - continue; - } - BUG_ON(ret < 0); - - btrfs_set_file_extent_disk_bytenr(leaf, fi, new_bytenr); - dirty = 1; - - key.offset -= btrfs_file_extent_offset(leaf, fi); - ret = btrfs_inc_extent_ref(trans, root, new_bytenr, - num_bytes, parent, - btrfs_header_owner(leaf), - key.objectid, key.offset, 1); - BUG_ON(ret); - - ret = btrfs_free_extent(trans, root, bytenr, num_bytes, - parent, btrfs_header_owner(leaf), - key.objectid, key.offset, 1); - BUG_ON(ret); - } - if (dirty) - btrfs_mark_buffer_dirty(leaf); - if (inode) - btrfs_add_delayed_iput(inode); - return 0; -} - -static noinline_for_stack -int memcmp_node_keys(struct extent_buffer *eb, int slot, - struct btrfs_path *path, int level) -{ - struct btrfs_disk_key key1; - struct btrfs_disk_key key2; - btrfs_node_key(eb, &key1, slot); - btrfs_node_key(path->nodes[level], &key2, path->slots[level]); - return memcmp(&key1, &key2, sizeof(key1)); -} - -/* - * try to replace tree blocks in fs tree with the new blocks - * in reloc tree. tree blocks haven't been modified since the - * reloc tree was create can be replaced. - * - * if a block was replaced, level of the block + 1 is returned. - * if no block got replaced, 0 is returned. if there are other - * errors, a negative error number is returned. - */ -static noinline_for_stack -int replace_path(struct btrfs_trans_handle *trans, - struct btrfs_root *dest, struct btrfs_root *src, - struct btrfs_path *path, struct btrfs_key *next_key, - int lowest_level, int max_level) -{ - struct extent_buffer *eb; - struct extent_buffer *parent; - struct btrfs_key key; - u64 old_bytenr; - u64 new_bytenr; - u64 old_ptr_gen; - u64 new_ptr_gen; - u64 last_snapshot; - u32 blocksize; - int cow = 0; - int level; - int ret; - int slot; - - BUG_ON(src->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID); - BUG_ON(dest->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID); - - last_snapshot = btrfs_root_last_snapshot(&src->root_item); -again: - slot = path->slots[lowest_level]; - btrfs_node_key_to_cpu(path->nodes[lowest_level], &key, slot); - - eb = btrfs_lock_root_node(dest); - btrfs_set_lock_blocking(eb); - level = btrfs_header_level(eb); - - if (level < lowest_level) { - btrfs_tree_unlock(eb); - free_extent_buffer(eb); - return 0; - } - - if (cow) { - ret = btrfs_cow_block(trans, dest, eb, NULL, 0, &eb); - BUG_ON(ret); - } - btrfs_set_lock_blocking(eb); - - if (next_key) { - next_key->objectid = (u64)-1; - next_key->type = (u8)-1; - next_key->offset = (u64)-1; - } - - parent = eb; - while (1) { - level = btrfs_header_level(parent); - BUG_ON(level < lowest_level); - - ret = btrfs_bin_search(parent, &key, level, &slot); - if (ret && slot > 0) - slot--; - - if (next_key && slot + 1 < btrfs_header_nritems(parent)) - btrfs_node_key_to_cpu(parent, next_key, slot + 1); - - old_bytenr = btrfs_node_blockptr(parent, slot); - blocksize = btrfs_level_size(dest, level - 1); - old_ptr_gen = btrfs_node_ptr_generation(parent, slot); - - if (level <= max_level) { - eb = path->nodes[level]; - new_bytenr = btrfs_node_blockptr(eb, - path->slots[level]); - new_ptr_gen = btrfs_node_ptr_generation(eb, - path->slots[level]); - } else { - new_bytenr = 0; - new_ptr_gen = 0; - } - - if (new_bytenr > 0 && new_bytenr == old_bytenr) { - WARN_ON(1); - ret = level; - break; - } - - if (new_bytenr == 0 || old_ptr_gen > last_snapshot 
|| - memcmp_node_keys(parent, slot, path, level)) { - if (level <= lowest_level) { - ret = 0; - break; - } - - eb = read_tree_block(dest, old_bytenr, blocksize, - old_ptr_gen); - BUG_ON(!eb); - btrfs_tree_lock(eb); - if (cow) { - ret = btrfs_cow_block(trans, dest, eb, parent, - slot, &eb); - BUG_ON(ret); - } - btrfs_set_lock_blocking(eb); - - btrfs_tree_unlock(parent); - free_extent_buffer(parent); - - parent = eb; - continue; - } - - if (!cow) { - btrfs_tree_unlock(parent); - free_extent_buffer(parent); - cow = 1; - goto again; - } - - btrfs_node_key_to_cpu(path->nodes[level], &key, - path->slots[level]); - btrfs_release_path(path); - - path->lowest_level = level; - ret = btrfs_search_slot(trans, src, &key, path, 0, 1); - path->lowest_level = 0; - BUG_ON(ret); - - /* - * swap blocks in fs tree and reloc tree. - */ - btrfs_set_node_blockptr(parent, slot, new_bytenr); - btrfs_set_node_ptr_generation(parent, slot, new_ptr_gen); - btrfs_mark_buffer_dirty(parent); - - btrfs_set_node_blockptr(path->nodes[level], - path->slots[level], old_bytenr); - btrfs_set_node_ptr_generation(path->nodes[level], - path->slots[level], old_ptr_gen); - btrfs_mark_buffer_dirty(path->nodes[level]); - - ret = btrfs_inc_extent_ref(trans, src, old_bytenr, blocksize, - path->nodes[level]->start, - src->root_key.objectid, level - 1, 0, - 1); - BUG_ON(ret); - ret = btrfs_inc_extent_ref(trans, dest, new_bytenr, blocksize, - 0, dest->root_key.objectid, level - 1, - 0, 1); - BUG_ON(ret); - - ret = btrfs_free_extent(trans, src, new_bytenr, blocksize, - path->nodes[level]->start, - src->root_key.objectid, level - 1, 0, - 1); - BUG_ON(ret); - - ret = btrfs_free_extent(trans, dest, old_bytenr, blocksize, - 0, dest->root_key.objectid, level - 1, - 0, 1); - BUG_ON(ret); - - btrfs_unlock_up_safe(path, 0); - - ret = level; - break; - } - btrfs_tree_unlock(parent); - free_extent_buffer(parent); - return ret; -} - -/* - * helper to find next relocated block in reloc tree - */ -static noinline_for_stack -int walk_up_reloc_tree(struct btrfs_root *root, struct btrfs_path *path, - int *level) -{ - struct extent_buffer *eb; - int i; - u64 last_snapshot; - u32 nritems; - - last_snapshot = btrfs_root_last_snapshot(&root->root_item); - - for (i = 0; i < *level; i++) { - free_extent_buffer(path->nodes[i]); - path->nodes[i] = NULL; - } - - for (i = *level; i < BTRFS_MAX_LEVEL && path->nodes[i]; i++) { - eb = path->nodes[i]; - nritems = btrfs_header_nritems(eb); - while (path->slots[i] + 1 < nritems) { - path->slots[i]++; - if (btrfs_node_ptr_generation(eb, path->slots[i]) <= - last_snapshot) - continue; - - *level = i; - return 0; - } - free_extent_buffer(path->nodes[i]); - path->nodes[i] = NULL; - } - return 1; -} - -/* - * walk down reloc tree to find relocated block of lowest level - */ -static noinline_for_stack -int walk_down_reloc_tree(struct btrfs_root *root, struct btrfs_path *path, - int *level) -{ - struct extent_buffer *eb = NULL; - int i; - u64 bytenr; - u64 ptr_gen = 0; - u64 last_snapshot; - u32 blocksize; - u32 nritems; - - last_snapshot = btrfs_root_last_snapshot(&root->root_item); - - for (i = *level; i > 0; i--) { - eb = path->nodes[i]; - nritems = btrfs_header_nritems(eb); - while (path->slots[i] < nritems) { - ptr_gen = btrfs_node_ptr_generation(eb, path->slots[i]); - if (ptr_gen > last_snapshot) - break; - path->slots[i]++; - } - if (path->slots[i] >= nritems) { - if (i == *level) - break; - *level = i + 1; - return 0; - } - if (i == 1) { - *level = i; - return 0; - } - - bytenr = btrfs_node_blockptr(eb, path->slots[i]); 
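	/*
	 * this slot points at a child whose pointer generation is newer
	 * than the reloc root's last snapshot, i.e. a block the reloc tree
	 * has relocated; read it and continue walking down from there.
	 */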
- blocksize = btrfs_level_size(root, i - 1); - eb = read_tree_block(root, bytenr, blocksize, ptr_gen); - BUG_ON(btrfs_header_level(eb) != i - 1); - path->nodes[i - 1] = eb; - path->slots[i - 1] = 0; - } - return 1; -} - -/* - * invalidate extent cache for file extents whose key in range of - * [min_key, max_key) - */ -static int invalidate_extent_cache(struct btrfs_root *root, - struct btrfs_key *min_key, - struct btrfs_key *max_key) -{ - struct inode *inode = NULL; - u64 objectid; - u64 start, end; - u64 ino; - - objectid = min_key->objectid; - while (1) { - cond_resched(); - iput(inode); - - if (objectid > max_key->objectid) - break; - - inode = find_next_inode(root, objectid); - if (!inode) - break; - ino = btrfs_ino(inode); - - if (ino > max_key->objectid) { - iput(inode); - break; - } - - objectid = ino + 1; - if (!S_ISREG(inode->i_mode)) - continue; - - if (unlikely(min_key->objectid == ino)) { - if (min_key->type > BTRFS_EXTENT_DATA_KEY) - continue; - if (min_key->type < BTRFS_EXTENT_DATA_KEY) - start = 0; - else { - start = min_key->offset; - WARN_ON(!IS_ALIGNED(start, root->sectorsize)); - } - } else { - start = 0; - } - - if (unlikely(max_key->objectid == ino)) { - if (max_key->type < BTRFS_EXTENT_DATA_KEY) - continue; - if (max_key->type > BTRFS_EXTENT_DATA_KEY) { - end = (u64)-1; - } else { - if (max_key->offset == 0) - continue; - end = max_key->offset; - WARN_ON(!IS_ALIGNED(end, root->sectorsize)); - end--; - } - } else { - end = (u64)-1; - } - - /* the lock_extent waits for readpage to complete */ - lock_extent(&BTRFS_I(inode)->io_tree, start, end); - btrfs_drop_extent_cache(inode, start, end, 1); - unlock_extent(&BTRFS_I(inode)->io_tree, start, end); - } - return 0; -} - -static int find_next_key(struct btrfs_path *path, int level, - struct btrfs_key *key) - -{ - while (level < BTRFS_MAX_LEVEL) { - if (!path->nodes[level]) - break; - if (path->slots[level] + 1 < - btrfs_header_nritems(path->nodes[level])) { - btrfs_node_key_to_cpu(path->nodes[level], key, - path->slots[level] + 1); - return 0; - } - level++; - } - return 1; -} - -/* - * merge the relocated tree blocks in reloc tree with corresponding - * fs tree. 
- */ -static noinline_for_stack int merge_reloc_root(struct reloc_control *rc, - struct btrfs_root *root) -{ - LIST_HEAD(inode_list); - struct btrfs_key key; - struct btrfs_key next_key; - struct btrfs_trans_handle *trans; - struct btrfs_root *reloc_root; - struct btrfs_root_item *root_item; - struct btrfs_path *path; - struct extent_buffer *leaf; - unsigned long nr; - int level; - int max_level; - int replaced = 0; - int ret; - int err = 0; - u32 min_reserved; - - path = btrfs_alloc_path(); - if (!path) - return -ENOMEM; - path->reada = 1; - - reloc_root = root->reloc_root; - root_item = &reloc_root->root_item; - - if (btrfs_disk_key_objectid(&root_item->drop_progress) == 0) { - level = btrfs_root_level(root_item); - extent_buffer_get(reloc_root->node); - path->nodes[level] = reloc_root->node; - path->slots[level] = 0; - } else { - btrfs_disk_key_to_cpu(&key, &root_item->drop_progress); - - level = root_item->drop_level; - BUG_ON(level == 0); - path->lowest_level = level; - ret = btrfs_search_slot(NULL, reloc_root, &key, path, 0, 0); - path->lowest_level = 0; - if (ret < 0) { - btrfs_free_path(path); - return ret; - } - - btrfs_node_key_to_cpu(path->nodes[level], &next_key, - path->slots[level]); - WARN_ON(memcmp(&key, &next_key, sizeof(key))); - - btrfs_unlock_up_safe(path, 0); - } - - min_reserved = root->nodesize * (BTRFS_MAX_LEVEL - 1) * 2; - memset(&next_key, 0, sizeof(next_key)); - - while (1) { - trans = btrfs_start_transaction(root, 0); - BUG_ON(IS_ERR(trans)); - trans->block_rsv = rc->block_rsv; - - ret = btrfs_block_rsv_refill(root, rc->block_rsv, min_reserved); - if (ret) { - BUG_ON(ret != -EAGAIN); - ret = btrfs_commit_transaction(trans, root); - BUG_ON(ret); - continue; - } - - replaced = 0; - max_level = level; - - ret = walk_down_reloc_tree(reloc_root, path, &level); - if (ret < 0) { - err = ret; - goto out; - } - if (ret > 0) - break; - - if (!find_next_key(path, level, &key) && - btrfs_comp_cpu_keys(&next_key, &key) >= 0) { - ret = 0; - } else { - ret = replace_path(trans, root, reloc_root, path, - &next_key, level, max_level); - } - if (ret < 0) { - err = ret; - goto out; - } - - if (ret > 0) { - level = ret; - btrfs_node_key_to_cpu(path->nodes[level], &key, - path->slots[level]); - replaced = 1; - } - - ret = walk_up_reloc_tree(reloc_root, path, &level); - if (ret > 0) - break; - - BUG_ON(level == 0); - /* - * save the merging progress in the drop_progress. - * this is OK since root refs == 1 in this case. - */ - btrfs_node_key(path->nodes[level], &root_item->drop_progress, - path->slots[level]); - root_item->drop_level = level; - - nr = trans->blocks_used; - btrfs_end_transaction_throttle(trans, root); - - btrfs_btree_balance_dirty(root, nr); - - if (replaced && rc->stage == UPDATE_DATA_PTRS) - invalidate_extent_cache(root, &key, &next_key); - } - - /* - * handle the case only one block in the fs tree need to be - * relocated and the block is tree root. 
- */ - leaf = btrfs_lock_root_node(root); - ret = btrfs_cow_block(trans, root, leaf, NULL, 0, &leaf); - btrfs_tree_unlock(leaf); - free_extent_buffer(leaf); - if (ret < 0) - err = ret; -out: - btrfs_free_path(path); - - if (err == 0) { - memset(&root_item->drop_progress, 0, - sizeof(root_item->drop_progress)); - root_item->drop_level = 0; - btrfs_set_root_refs(root_item, 0); - btrfs_update_reloc_root(trans, root); - } - - nr = trans->blocks_used; - btrfs_end_transaction_throttle(trans, root); - - btrfs_btree_balance_dirty(root, nr); - - if (replaced && rc->stage == UPDATE_DATA_PTRS) - invalidate_extent_cache(root, &key, &next_key); - - return err; -} - -static noinline_for_stack -int prepare_to_merge(struct reloc_control *rc, int err) -{ - struct btrfs_root *root = rc->extent_root; - struct btrfs_root *reloc_root; - struct btrfs_trans_handle *trans; - LIST_HEAD(reloc_roots); - u64 num_bytes = 0; - int ret; - - mutex_lock(&root->fs_info->reloc_mutex); - rc->merging_rsv_size += root->nodesize * (BTRFS_MAX_LEVEL - 1) * 2; - rc->merging_rsv_size += rc->nodes_relocated * 2; - mutex_unlock(&root->fs_info->reloc_mutex); - -again: - if (!err) { - num_bytes = rc->merging_rsv_size; - ret = btrfs_block_rsv_add(root, rc->block_rsv, num_bytes); - if (ret) - err = ret; - } - - trans = btrfs_join_transaction(rc->extent_root); - if (IS_ERR(trans)) { - if (!err) - btrfs_block_rsv_release(rc->extent_root, - rc->block_rsv, num_bytes); - return PTR_ERR(trans); - } - - if (!err) { - if (num_bytes != rc->merging_rsv_size) { - btrfs_end_transaction(trans, rc->extent_root); - btrfs_block_rsv_release(rc->extent_root, - rc->block_rsv, num_bytes); - goto again; - } - } - - rc->merge_reloc_tree = 1; - - while (!list_empty(&rc->reloc_roots)) { - reloc_root = list_entry(rc->reloc_roots.next, - struct btrfs_root, root_list); - list_del_init(&reloc_root->root_list); - - root = read_fs_root(reloc_root->fs_info, - reloc_root->root_key.offset); - BUG_ON(IS_ERR(root)); - BUG_ON(root->reloc_root != reloc_root); - - /* - * set reference count to 1, so btrfs_recover_relocation - * knows it should resumes merging - */ - if (!err) - btrfs_set_root_refs(&reloc_root->root_item, 1); - btrfs_update_reloc_root(trans, root); - - list_add(&reloc_root->root_list, &reloc_roots); - } - - list_splice(&reloc_roots, &rc->reloc_roots); - - if (!err) - btrfs_commit_transaction(trans, rc->extent_root); - else - btrfs_end_transaction(trans, rc->extent_root); - return err; -} - -static noinline_for_stack -int merge_reloc_roots(struct reloc_control *rc) -{ - struct btrfs_root *root; - struct btrfs_root *reloc_root; - LIST_HEAD(reloc_roots); - int found = 0; - int ret; -again: - root = rc->extent_root; - - /* - * this serializes us with btrfs_record_root_in_transaction, - * we have to make sure nobody is in the middle of - * adding their roots to the list while we are - * doing this splice - */ - mutex_lock(&root->fs_info->reloc_mutex); - list_splice_init(&rc->reloc_roots, &reloc_roots); - mutex_unlock(&root->fs_info->reloc_mutex); - - while (!list_empty(&reloc_roots)) { - found = 1; - reloc_root = list_entry(reloc_roots.next, - struct btrfs_root, root_list); - - if (btrfs_root_refs(&reloc_root->root_item) > 0) { - root = read_fs_root(reloc_root->fs_info, - reloc_root->root_key.offset); - BUG_ON(IS_ERR(root)); - BUG_ON(root->reloc_root != reloc_root); - - ret = merge_reloc_root(rc, root); - BUG_ON(ret); - } else { - list_del_init(&reloc_root->root_list); - } - ret = btrfs_drop_snapshot(reloc_root, rc->block_rsv, 0, 1); - BUG_ON(ret < 0); - } - - if 
(found) { - found = 0; - goto again; - } - BUG_ON(!RB_EMPTY_ROOT(&rc->reloc_root_tree.rb_root)); - return 0; -} - -static void free_block_list(struct rb_root *blocks) -{ - struct tree_block *block; - struct rb_node *rb_node; - while ((rb_node = rb_first(blocks))) { - block = rb_entry(rb_node, struct tree_block, rb_node); - rb_erase(rb_node, blocks); - kfree(block); - } -} - -static int record_reloc_root_in_trans(struct btrfs_trans_handle *trans, - struct btrfs_root *reloc_root) -{ - struct btrfs_root *root; - - if (reloc_root->last_trans == trans->transid) - return 0; - - root = read_fs_root(reloc_root->fs_info, reloc_root->root_key.offset); - BUG_ON(IS_ERR(root)); - BUG_ON(root->reloc_root != reloc_root); - - return btrfs_record_root_in_trans(trans, root); -} - -static noinline_for_stack -struct btrfs_root *select_reloc_root(struct btrfs_trans_handle *trans, - struct reloc_control *rc, - struct backref_node *node, - struct backref_edge *edges[], int *nr) -{ - struct backref_node *next; - struct btrfs_root *root; - int index = 0; - - next = node; - while (1) { - cond_resched(); - next = walk_up_backref(next, edges, &index); - root = next->root; - BUG_ON(!root); - BUG_ON(!root->ref_cows); - - if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) { - record_reloc_root_in_trans(trans, root); - break; - } - - btrfs_record_root_in_trans(trans, root); - root = root->reloc_root; - - if (next->new_bytenr != root->node->start) { - BUG_ON(next->new_bytenr); - BUG_ON(!list_empty(&next->list)); - next->new_bytenr = root->node->start; - next->root = root; - list_add_tail(&next->list, - &rc->backref_cache.changed); - __mark_block_processed(rc, next); - break; - } - - WARN_ON(1); - root = NULL; - next = walk_down_backref(edges, &index); - if (!next || next->level <= node->level) - break; - } - if (!root) - return NULL; - - *nr = index; - next = node; - /* setup backref node path for btrfs_reloc_cow_block */ - while (1) { - rc->backref_cache.path[next->level] = next; - if (--index < 0) - break; - next = edges[index]->node[UPPER]; - } - return root; -} - -/* - * select a tree root for relocation. return NULL if the block - * is reference counted. we should use do_relocation() in this - * case. return a tree root pointer if the block isn't reference - * counted. return -ENOENT if the block is root of reloc tree. 
- */ -static noinline_for_stack -struct btrfs_root *select_one_root(struct btrfs_trans_handle *trans, - struct backref_node *node) -{ - struct backref_node *next; - struct btrfs_root *root; - struct btrfs_root *fs_root = NULL; - struct backref_edge *edges[BTRFS_MAX_LEVEL - 1]; - int index = 0; - - next = node; - while (1) { - cond_resched(); - next = walk_up_backref(next, edges, &index); - root = next->root; - BUG_ON(!root); - - /* no other choice for non-references counted tree */ - if (!root->ref_cows) - return root; - - if (root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID) - fs_root = root; - - if (next != node) - return NULL; - - next = walk_down_backref(edges, &index); - if (!next || next->level <= node->level) - break; - } - - if (!fs_root) - return ERR_PTR(-ENOENT); - return fs_root; -} - -static noinline_for_stack -u64 calcu_metadata_size(struct reloc_control *rc, - struct backref_node *node, int reserve) -{ - struct backref_node *next = node; - struct backref_edge *edge; - struct backref_edge *edges[BTRFS_MAX_LEVEL - 1]; - u64 num_bytes = 0; - int index = 0; - - BUG_ON(reserve && node->processed); - - while (next) { - cond_resched(); - while (1) { - if (next->processed && (reserve || next != node)) - break; - - num_bytes += btrfs_level_size(rc->extent_root, - next->level); - - if (list_empty(&next->upper)) - break; - - edge = list_entry(next->upper.next, - struct backref_edge, list[LOWER]); - edges[index++] = edge; - next = edge->node[UPPER]; - } - next = walk_down_backref(edges, &index); - } - return num_bytes; -} - -static int reserve_metadata_space(struct btrfs_trans_handle *trans, - struct reloc_control *rc, - struct backref_node *node) -{ - struct btrfs_root *root = rc->extent_root; - u64 num_bytes; - int ret; - - num_bytes = calcu_metadata_size(rc, node, 1) * 2; - - trans->block_rsv = rc->block_rsv; - ret = btrfs_block_rsv_add(root, rc->block_rsv, num_bytes); - if (ret) { - if (ret == -EAGAIN) - rc->commit_transaction = 1; - return ret; - } - - return 0; -} - -static void release_metadata_space(struct reloc_control *rc, - struct backref_node *node) -{ - u64 num_bytes = calcu_metadata_size(rc, node, 0) * 2; - btrfs_block_rsv_release(rc->extent_root, rc->block_rsv, num_bytes); -} - -/* - * relocate a block tree, and then update pointers in upper level - * blocks that reference the block to point to the new location. - * - * if called by link_to_upper, the block has already been relocated. - * in that case this function just updates pointers. 
- */ -static int do_relocation(struct btrfs_trans_handle *trans, - struct reloc_control *rc, - struct backref_node *node, - struct btrfs_key *key, - struct btrfs_path *path, int lowest) -{ - struct backref_node *upper; - struct backref_edge *edge; - struct backref_edge *edges[BTRFS_MAX_LEVEL - 1]; - struct btrfs_root *root; - struct extent_buffer *eb; - u32 blocksize; - u64 bytenr; - u64 generation; - int nr; - int slot; - int ret; - int err = 0; - - BUG_ON(lowest && node->eb); - - path->lowest_level = node->level + 1; - rc->backref_cache.path[node->level] = node; - list_for_each_entry(edge, &node->upper, list[LOWER]) { - cond_resched(); - - upper = edge->node[UPPER]; - root = select_reloc_root(trans, rc, upper, edges, &nr); - BUG_ON(!root); - - if (upper->eb && !upper->locked) { - if (!lowest) { - ret = btrfs_bin_search(upper->eb, key, - upper->level, &slot); - BUG_ON(ret); - bytenr = btrfs_node_blockptr(upper->eb, slot); - if (node->eb->start == bytenr) - goto next; - } - drop_node_buffer(upper); - } - - if (!upper->eb) { - ret = btrfs_search_slot(trans, root, key, path, 0, 1); - if (ret < 0) { - err = ret; - break; - } - BUG_ON(ret > 0); - - if (!upper->eb) { - upper->eb = path->nodes[upper->level]; - path->nodes[upper->level] = NULL; - } else { - BUG_ON(upper->eb != path->nodes[upper->level]); - } - - upper->locked = 1; - path->locks[upper->level] = 0; - - slot = path->slots[upper->level]; - btrfs_release_path(path); - } else { - ret = btrfs_bin_search(upper->eb, key, upper->level, - &slot); - BUG_ON(ret); - } - - bytenr = btrfs_node_blockptr(upper->eb, slot); - if (lowest) { - BUG_ON(bytenr != node->bytenr); - } else { - if (node->eb->start == bytenr) - goto next; - } - - blocksize = btrfs_level_size(root, node->level); - generation = btrfs_node_ptr_generation(upper->eb, slot); - eb = read_tree_block(root, bytenr, blocksize, generation); - if (!eb) { - err = -EIO; - goto next; - } - btrfs_tree_lock(eb); - btrfs_set_lock_blocking(eb); - - if (!node->eb) { - ret = btrfs_cow_block(trans, root, eb, upper->eb, - slot, &eb); - btrfs_tree_unlock(eb); - free_extent_buffer(eb); - if (ret < 0) { - err = ret; - goto next; - } - BUG_ON(node->eb != eb); - } else { - btrfs_set_node_blockptr(upper->eb, slot, - node->eb->start); - btrfs_set_node_ptr_generation(upper->eb, slot, - trans->transid); - btrfs_mark_buffer_dirty(upper->eb); - - ret = btrfs_inc_extent_ref(trans, root, - node->eb->start, blocksize, - upper->eb->start, - btrfs_header_owner(upper->eb), - node->level, 0, 1); - BUG_ON(ret); - - ret = btrfs_drop_subtree(trans, root, eb, upper->eb); - BUG_ON(ret); - } -next: - if (!upper->pending) - drop_node_buffer(upper); - else - unlock_node_buffer(upper); - if (err) - break; - } - - if (!err && node->pending) { - drop_node_buffer(node); - list_move_tail(&node->list, &rc->backref_cache.changed); - node->pending = 0; - } - - path->lowest_level = 0; - BUG_ON(err == -ENOSPC); - return err; -} - -static int link_to_upper(struct btrfs_trans_handle *trans, - struct reloc_control *rc, - struct backref_node *node, - struct btrfs_path *path) -{ - struct btrfs_key key; - - btrfs_node_key_to_cpu(node->eb, &key, 0); - return do_relocation(trans, rc, node, &key, path, 0); -} - -static int finish_pending_nodes(struct btrfs_trans_handle *trans, - struct reloc_control *rc, - struct btrfs_path *path, int err) -{ - LIST_HEAD(list); - struct backref_cache *cache = &rc->backref_cache; - struct backref_node *node; - int level; - int ret; - - for (level = 0; level < BTRFS_MAX_LEVEL; level++) { - while 
(!list_empty(&cache->pending[level])) { - node = list_entry(cache->pending[level].next, - struct backref_node, list); - list_move_tail(&node->list, &list); - BUG_ON(!node->pending); - - if (!err) { - ret = link_to_upper(trans, rc, node, path); - if (ret < 0) - err = ret; - } - } - list_splice_init(&list, &cache->pending[level]); - } - return err; -} - -static void mark_block_processed(struct reloc_control *rc, - u64 bytenr, u32 blocksize) -{ - set_extent_bits(&rc->processed_blocks, bytenr, bytenr + blocksize - 1, - EXTENT_DIRTY, GFP_NOFS); -} - -static void __mark_block_processed(struct reloc_control *rc, - struct backref_node *node) -{ - u32 blocksize; - if (node->level == 0 || - in_block_group(node->bytenr, rc->block_group)) { - blocksize = btrfs_level_size(rc->extent_root, node->level); - mark_block_processed(rc, node->bytenr, blocksize); - } - node->processed = 1; -} - -/* - * mark a block and all blocks directly/indirectly reference the block - * as processed. - */ -static void update_processed_blocks(struct reloc_control *rc, - struct backref_node *node) -{ - struct backref_node *next = node; - struct backref_edge *edge; - struct backref_edge *edges[BTRFS_MAX_LEVEL - 1]; - int index = 0; - - while (next) { - cond_resched(); - while (1) { - if (next->processed) - break; - - __mark_block_processed(rc, next); - - if (list_empty(&next->upper)) - break; - - edge = list_entry(next->upper.next, - struct backref_edge, list[LOWER]); - edges[index++] = edge; - next = edge->node[UPPER]; - } - next = walk_down_backref(edges, &index); - } -} - -static int tree_block_processed(u64 bytenr, u32 blocksize, - struct reloc_control *rc) -{ - if (test_range_bit(&rc->processed_blocks, bytenr, - bytenr + blocksize - 1, EXTENT_DIRTY, 1, NULL)) - return 1; - return 0; -} - -static int get_tree_block_key(struct reloc_control *rc, - struct tree_block *block) -{ - struct extent_buffer *eb; - - BUG_ON(block->key_ready); - eb = read_tree_block(rc->extent_root, block->bytenr, - block->key.objectid, block->key.offset); - BUG_ON(!eb); - WARN_ON(btrfs_header_level(eb) != block->level); - if (block->level == 0) - btrfs_item_key_to_cpu(eb, &block->key, 0); - else - btrfs_node_key_to_cpu(eb, &block->key, 0); - free_extent_buffer(eb); - block->key_ready = 1; - return 0; -} - -static int reada_tree_block(struct reloc_control *rc, - struct tree_block *block) -{ - BUG_ON(block->key_ready); - readahead_tree_block(rc->extent_root, block->bytenr, - block->key.objectid, block->key.offset); - return 0; -} - -/* - * helper function to relocate a tree block - */ -static int relocate_tree_block(struct btrfs_trans_handle *trans, - struct reloc_control *rc, - struct backref_node *node, - struct btrfs_key *key, - struct btrfs_path *path) -{ - struct btrfs_root *root; - int release = 0; - int ret = 0; - - if (!node) - return 0; - - BUG_ON(node->processed); - root = select_one_root(trans, node); - if (root == ERR_PTR(-ENOENT)) { - update_processed_blocks(rc, node); - goto out; - } - - if (!root || root->ref_cows) { - ret = reserve_metadata_space(trans, rc, node); - if (ret) - goto out; - release = 1; - } - - if (root) { - if (root->ref_cows) { - BUG_ON(node->new_bytenr); - BUG_ON(!list_empty(&node->list)); - btrfs_record_root_in_trans(trans, root); - root = root->reloc_root; - node->new_bytenr = root->node->start; - node->root = root; - list_add_tail(&node->list, &rc->backref_cache.changed); - } else { - path->lowest_level = node->level; - ret = btrfs_search_slot(trans, root, key, path, 0, 1); - btrfs_release_path(path); - if (ret > 0) 
- ret = 0; - } - if (!ret) - update_processed_blocks(rc, node); - } else { - ret = do_relocation(trans, rc, node, key, path, 1); - } -out: - if (ret || node->level == 0 || node->cowonly) { - if (release) - release_metadata_space(rc, node); - remove_backref_node(&rc->backref_cache, node); - } - return ret; -} - -/* - * relocate a list of blocks - */ -static noinline_for_stack -int relocate_tree_blocks(struct btrfs_trans_handle *trans, - struct reloc_control *rc, struct rb_root *blocks) -{ - struct backref_node *node; - struct btrfs_path *path; - struct tree_block *block; - struct rb_node *rb_node; - int ret; - int err = 0; - - path = btrfs_alloc_path(); - if (!path) - return -ENOMEM; - - rb_node = rb_first(blocks); - while (rb_node) { - block = rb_entry(rb_node, struct tree_block, rb_node); - if (!block->key_ready) - reada_tree_block(rc, block); - rb_node = rb_next(rb_node); - } - - rb_node = rb_first(blocks); - while (rb_node) { - block = rb_entry(rb_node, struct tree_block, rb_node); - if (!block->key_ready) - get_tree_block_key(rc, block); - rb_node = rb_next(rb_node); - } - - rb_node = rb_first(blocks); - while (rb_node) { - block = rb_entry(rb_node, struct tree_block, rb_node); - - node = build_backref_tree(rc, &block->key, - block->level, block->bytenr); - if (IS_ERR(node)) { - err = PTR_ERR(node); - goto out; - } - - ret = relocate_tree_block(trans, rc, node, &block->key, - path); - if (ret < 0) { - if (ret != -EAGAIN || rb_node == rb_first(blocks)) - err = ret; - goto out; - } - rb_node = rb_next(rb_node); - } -out: - free_block_list(blocks); - err = finish_pending_nodes(trans, rc, path, err); - - btrfs_free_path(path); - return err; -} - -static noinline_for_stack -int prealloc_file_extent_cluster(struct inode *inode, - struct file_extent_cluster *cluster) -{ - u64 alloc_hint = 0; - u64 start; - u64 end; - u64 offset = BTRFS_I(inode)->index_cnt; - u64 num_bytes; - int nr = 0; - int ret = 0; - - BUG_ON(cluster->start != cluster->boundary[0]); - mutex_lock(&inode->i_mutex); - - ret = btrfs_check_data_free_space(inode, cluster->end + - 1 - cluster->start); - if (ret) - goto out; - - while (nr < cluster->nr) { - start = cluster->boundary[nr] - offset; - if (nr + 1 < cluster->nr) - end = cluster->boundary[nr + 1] - 1 - offset; - else - end = cluster->end - offset; - - lock_extent(&BTRFS_I(inode)->io_tree, start, end); - num_bytes = end + 1 - start; - ret = btrfs_prealloc_file_range(inode, 0, start, - num_bytes, num_bytes, - end + 1, &alloc_hint); - unlock_extent(&BTRFS_I(inode)->io_tree, start, end); - if (ret) - break; - nr++; - } - btrfs_free_reserved_data_space(inode, cluster->end + - 1 - cluster->start); -out: - mutex_unlock(&inode->i_mutex); - return ret; -} - -static noinline_for_stack -int setup_extent_mapping(struct inode *inode, u64 start, u64 end, - u64 block_start) -{ - struct btrfs_root *root = BTRFS_I(inode)->root; - struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree; - struct extent_map *em; - int ret = 0; - - em = alloc_extent_map(); - if (!em) - return -ENOMEM; - - em->start = start; - em->len = end + 1 - start; - em->block_len = em->len; - em->block_start = block_start; - em->bdev = root->fs_info->fs_devices->latest_bdev; - set_bit(EXTENT_FLAG_PINNED, &em->flags); - - lock_extent(&BTRFS_I(inode)->io_tree, start, end); - while (1) { - write_lock(&em_tree->lock); - ret = add_extent_mapping(em_tree, em); - write_unlock(&em_tree->lock); - if (ret != -EEXIST) { - free_extent_map(em); - break; - } - btrfs_drop_extent_cache(inode, start, end, 0); - } - 
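	/*
	 * the loop above retries until add_extent_mapping() returns
	 * something other than -EEXIST, dropping cached extents that
	 * collide with the new pinned mapping; once it exits, the local
	 * reference to 'em' has been released and only the range lock
	 * taken before the loop remains to be dropped.
	 */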
unlock_extent(&BTRFS_I(inode)->io_tree, start, end); - return ret; -} - -static int relocate_file_extent_cluster(struct inode *inode, - struct file_extent_cluster *cluster) -{ - u64 page_start; - u64 page_end; - u64 offset = BTRFS_I(inode)->index_cnt; - unsigned long index; - unsigned long last_index; - struct page *page; - struct file_ra_state *ra; - gfp_t mask = btrfs_alloc_write_mask(inode->i_mapping); - int nr = 0; - int ret = 0; - - if (!cluster->nr) - return 0; - - ra = kzalloc(sizeof(*ra), GFP_NOFS); - if (!ra) - return -ENOMEM; - - ret = prealloc_file_extent_cluster(inode, cluster); - if (ret) - goto out; - - file_ra_state_init(ra, inode->i_mapping); - - ret = setup_extent_mapping(inode, cluster->start - offset, - cluster->end - offset, cluster->start); - if (ret) - goto out; - - index = (cluster->start - offset) >> PAGE_CACHE_SHIFT; - last_index = (cluster->end - offset) >> PAGE_CACHE_SHIFT; - while (index <= last_index) { - ret = btrfs_delalloc_reserve_metadata(inode, PAGE_CACHE_SIZE); - if (ret) - goto out; - - page = find_lock_page(inode->i_mapping, index); - if (!page) { - page_cache_sync_readahead(inode->i_mapping, - ra, NULL, index, - last_index + 1 - index); - page = find_or_create_page(inode->i_mapping, index, - mask); - if (!page) { - btrfs_delalloc_release_metadata(inode, - PAGE_CACHE_SIZE); - ret = -ENOMEM; - goto out; - } - } - - if (PageReadahead(page)) { - page_cache_async_readahead(inode->i_mapping, - ra, NULL, page, index, - last_index + 1 - index); - } - - if (!PageUptodate(page)) { - btrfs_readpage(NULL, page); - lock_page(page); - if (!PageUptodate(page)) { - unlock_page(page); - page_cache_release(page); - btrfs_delalloc_release_metadata(inode, - PAGE_CACHE_SIZE); - ret = -EIO; - goto out; - } - } - - page_start = (u64)page->index << PAGE_CACHE_SHIFT; - page_end = page_start + PAGE_CACHE_SIZE - 1; - - lock_extent(&BTRFS_I(inode)->io_tree, page_start, page_end); - - set_page_extent_mapped(page); - - if (nr < cluster->nr && - page_start + offset == cluster->boundary[nr]) { - set_extent_bits(&BTRFS_I(inode)->io_tree, - page_start, page_end, - EXTENT_BOUNDARY, GFP_NOFS); - nr++; - } - - btrfs_set_extent_delalloc(inode, page_start, page_end, NULL); - set_page_dirty(page); - - unlock_extent(&BTRFS_I(inode)->io_tree, - page_start, page_end); - unlock_page(page); - page_cache_release(page); - - index++; - balance_dirty_pages_ratelimited(inode->i_mapping); - btrfs_throttle(BTRFS_I(inode)->root); - } - WARN_ON(nr != cluster->nr); -out: - kfree(ra); - return ret; -} - -static noinline_for_stack -int relocate_data_extent(struct inode *inode, struct btrfs_key *extent_key, - struct file_extent_cluster *cluster) -{ - int ret; - - if (cluster->nr > 0 && extent_key->objectid != cluster->end + 1) { - ret = relocate_file_extent_cluster(inode, cluster); - if (ret) - return ret; - cluster->nr = 0; - } - - if (!cluster->nr) - cluster->start = extent_key->objectid; - else - BUG_ON(cluster->nr >= MAX_EXTENTS); - cluster->end = extent_key->objectid + extent_key->offset - 1; - cluster->boundary[cluster->nr] = extent_key->objectid; - cluster->nr++; - - if (cluster->nr >= MAX_EXTENTS) { - ret = relocate_file_extent_cluster(inode, cluster); - if (ret) - return ret; - cluster->nr = 0; - } - return 0; -} - -#ifdef BTRFS_COMPAT_EXTENT_TREE_V0 -static int get_ref_objectid_v0(struct reloc_control *rc, - struct btrfs_path *path, - struct btrfs_key *extent_key, - u64 *ref_objectid, int *path_change) -{ - struct btrfs_key key; - struct extent_buffer *leaf; - struct btrfs_extent_ref_v0 *ref0; - int 
ret; - int slot; - - leaf = path->nodes[0]; - slot = path->slots[0]; - while (1) { - if (slot >= btrfs_header_nritems(leaf)) { - ret = btrfs_next_leaf(rc->extent_root, path); - if (ret < 0) - return ret; - BUG_ON(ret > 0); - leaf = path->nodes[0]; - slot = path->slots[0]; - if (path_change) - *path_change = 1; - } - btrfs_item_key_to_cpu(leaf, &key, slot); - if (key.objectid != extent_key->objectid) - return -ENOENT; - - if (key.type != BTRFS_EXTENT_REF_V0_KEY) { - slot++; - continue; - } - ref0 = btrfs_item_ptr(leaf, slot, - struct btrfs_extent_ref_v0); - *ref_objectid = btrfs_ref_objectid_v0(leaf, ref0); - break; - } - return 0; -} -#endif - -/* - * helper to add a tree block to the list. - * the major work is getting the generation and level of the block - */ -static int add_tree_block(struct reloc_control *rc, - struct btrfs_key *extent_key, - struct btrfs_path *path, - struct rb_root *blocks) -{ - struct extent_buffer *eb; - struct btrfs_extent_item *ei; - struct btrfs_tree_block_info *bi; - struct tree_block *block; - struct rb_node *rb_node; - u32 item_size; - int level = -1; - int generation; - - eb = path->nodes[0]; - item_size = btrfs_item_size_nr(eb, path->slots[0]); - - if (item_size >= sizeof(*ei) + sizeof(*bi)) { - ei = btrfs_item_ptr(eb, path->slots[0], - struct btrfs_extent_item); - bi = (struct btrfs_tree_block_info *)(ei + 1); - generation = btrfs_extent_generation(eb, ei); - level = btrfs_tree_block_level(eb, bi); - } else { -#ifdef BTRFS_COMPAT_EXTENT_TREE_V0 - u64 ref_owner; - int ret; - - BUG_ON(item_size != sizeof(struct btrfs_extent_item_v0)); - ret = get_ref_objectid_v0(rc, path, extent_key, - &ref_owner, NULL); - if (ret < 0) - return ret; - BUG_ON(ref_owner >= BTRFS_MAX_LEVEL); - level = (int)ref_owner; - /* FIXME: get real generation */ - generation = 0; -#else - BUG(); -#endif - } - - btrfs_release_path(path); - - BUG_ON(level == -1); - - block = kmalloc(sizeof(*block), GFP_NOFS); - if (!block) - return -ENOMEM; - - block->bytenr = extent_key->objectid; - block->key.objectid = extent_key->offset; - block->key.offset = generation; - block->level = level; - block->key_ready = 0; - - rb_node = tree_insert(blocks, block->bytenr, &block->rb_node); - if (rb_node) - backref_tree_panic(rb_node, -EEXIST, block->bytenr); - - return 0; -} - -/* - * helper to add tree blocks for backref of type BTRFS_SHARED_DATA_REF_KEY - */ -static int __add_tree_block(struct reloc_control *rc, - u64 bytenr, u32 blocksize, - struct rb_root *blocks) -{ - struct btrfs_path *path; - struct btrfs_key key; - int ret; - - if (tree_block_processed(bytenr, blocksize, rc)) - return 0; - - if (tree_search(blocks, bytenr)) - return 0; - - path = btrfs_alloc_path(); - if (!path) - return -ENOMEM; - - key.objectid = bytenr; - key.type = BTRFS_EXTENT_ITEM_KEY; - key.offset = blocksize; - - path->search_commit_root = 1; - path->skip_locking = 1; - ret = btrfs_search_slot(NULL, rc->extent_root, &key, path, 0, 0); - if (ret < 0) - goto out; - BUG_ON(ret); - - btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); - ret = add_tree_block(rc, &key, path, blocks); -out: - btrfs_free_path(path); - return ret; -} - -/* - * helper to check if the block use full backrefs for pointers in it - */ -static int block_use_full_backref(struct reloc_control *rc, - struct extent_buffer *eb) -{ - u64 flags; - int ret; - - if (btrfs_header_flag(eb, BTRFS_HEADER_FLAG_RELOC) || - btrfs_header_backref_rev(eb) < BTRFS_MIXED_BACKREF_REV) - return 1; - - ret = btrfs_lookup_extent_info(NULL, rc->extent_root, - eb->start, 
eb->len, NULL, &flags); - BUG_ON(ret); - - if (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF) - ret = 1; - else - ret = 0; - return ret; -} - -static int delete_block_group_cache(struct btrfs_fs_info *fs_info, - struct inode *inode, u64 ino) -{ - struct btrfs_key key; - struct btrfs_path *path; - struct btrfs_root *root = fs_info->tree_root; - struct btrfs_trans_handle *trans; - unsigned long nr; - int ret = 0; - - if (inode) - goto truncate; - - key.objectid = ino; - key.type = BTRFS_INODE_ITEM_KEY; - key.offset = 0; - - inode = btrfs_iget(fs_info->sb, &key, root, NULL); - if (IS_ERR_OR_NULL(inode) || is_bad_inode(inode)) { - if (inode && !IS_ERR(inode)) - iput(inode); - return -ENOENT; - } - -truncate: - path = btrfs_alloc_path(); - if (!path) { - ret = -ENOMEM; - goto out; - } - - trans = btrfs_join_transaction(root); - if (IS_ERR(trans)) { - btrfs_free_path(path); - ret = PTR_ERR(trans); - goto out; - } - - ret = btrfs_truncate_free_space_cache(root, trans, path, inode); - - btrfs_free_path(path); - nr = trans->blocks_used; - btrfs_end_transaction(trans, root); - btrfs_btree_balance_dirty(root, nr); -out: - iput(inode); - return ret; -} - -/* - * helper to add tree blocks for backref of type BTRFS_EXTENT_DATA_REF_KEY - * this function scans fs tree to find blocks reference the data extent - */ -static int find_data_references(struct reloc_control *rc, - struct btrfs_key *extent_key, - struct extent_buffer *leaf, - struct btrfs_extent_data_ref *ref, - struct rb_root *blocks) -{ - struct btrfs_path *path; - struct tree_block *block; - struct btrfs_root *root; - struct btrfs_file_extent_item *fi; - struct rb_node *rb_node; - struct btrfs_key key; - u64 ref_root; - u64 ref_objectid; - u64 ref_offset; - u32 ref_count; - u32 nritems; - int err = 0; - int added = 0; - int counted; - int ret; - - ref_root = btrfs_extent_data_ref_root(leaf, ref); - ref_objectid = btrfs_extent_data_ref_objectid(leaf, ref); - ref_offset = btrfs_extent_data_ref_offset(leaf, ref); - ref_count = btrfs_extent_data_ref_count(leaf, ref); - - /* - * This is an extent belonging to the free space cache, lets just delete - * it and redo the search. 
- */ - if (ref_root == BTRFS_ROOT_TREE_OBJECTID) { - ret = delete_block_group_cache(rc->extent_root->fs_info, - NULL, ref_objectid); - if (ret != -ENOENT) - return ret; - ret = 0; - } - - path = btrfs_alloc_path(); - if (!path) - return -ENOMEM; - path->reada = 1; - - root = read_fs_root(rc->extent_root->fs_info, ref_root); - if (IS_ERR(root)) { - err = PTR_ERR(root); - goto out; - } - - key.objectid = ref_objectid; - key.type = BTRFS_EXTENT_DATA_KEY; - if (ref_offset > ((u64)-1 << 32)) - key.offset = 0; - else - key.offset = ref_offset; - - path->search_commit_root = 1; - path->skip_locking = 1; - ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); - if (ret < 0) { - err = ret; - goto out; - } - - leaf = path->nodes[0]; - nritems = btrfs_header_nritems(leaf); - /* - * the references in tree blocks that use full backrefs - * are not counted in - */ - if (block_use_full_backref(rc, leaf)) - counted = 0; - else - counted = 1; - rb_node = tree_search(blocks, leaf->start); - if (rb_node) { - if (counted) - added = 1; - else - path->slots[0] = nritems; - } - - while (ref_count > 0) { - while (path->slots[0] >= nritems) { - ret = btrfs_next_leaf(root, path); - if (ret < 0) { - err = ret; - goto out; - } - if (ret > 0) { - WARN_ON(1); - goto out; - } - - leaf = path->nodes[0]; - nritems = btrfs_header_nritems(leaf); - added = 0; - - if (block_use_full_backref(rc, leaf)) - counted = 0; - else - counted = 1; - rb_node = tree_search(blocks, leaf->start); - if (rb_node) { - if (counted) - added = 1; - else - path->slots[0] = nritems; - } - } - - btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); - if (key.objectid != ref_objectid || - key.type != BTRFS_EXTENT_DATA_KEY) { - WARN_ON(1); - break; - } - - fi = btrfs_item_ptr(leaf, path->slots[0], - struct btrfs_file_extent_item); - - if (btrfs_file_extent_type(leaf, fi) == - BTRFS_FILE_EXTENT_INLINE) - goto next; - - if (btrfs_file_extent_disk_bytenr(leaf, fi) != - extent_key->objectid) - goto next; - - key.offset -= btrfs_file_extent_offset(leaf, fi); - if (key.offset != ref_offset) - goto next; - - if (counted) - ref_count--; - if (added) - goto next; - - if (!tree_block_processed(leaf->start, leaf->len, rc)) { - block = kmalloc(sizeof(*block), GFP_NOFS); - if (!block) { - err = -ENOMEM; - break; - } - block->bytenr = leaf->start; - btrfs_item_key_to_cpu(leaf, &block->key, 0); - block->level = 0; - block->key_ready = 1; - rb_node = tree_insert(blocks, block->bytenr, - &block->rb_node); - if (rb_node) - backref_tree_panic(rb_node, -EEXIST, - block->bytenr); - } - if (counted) - added = 1; - else - path->slots[0] = nritems; -next: - path->slots[0]++; - - } -out: - btrfs_free_path(path); - return err; -} - -/* - * hepler to find all tree blocks that reference a given data extent - */ -static noinline_for_stack -int add_data_references(struct reloc_control *rc, - struct btrfs_key *extent_key, - struct btrfs_path *path, - struct rb_root *blocks) -{ - struct btrfs_key key; - struct extent_buffer *eb; - struct btrfs_extent_data_ref *dref; - struct btrfs_extent_inline_ref *iref; - unsigned long ptr; - unsigned long end; - u32 blocksize = btrfs_level_size(rc->extent_root, 0); - int ret; - int err = 0; - - eb = path->nodes[0]; - ptr = btrfs_item_ptr_offset(eb, path->slots[0]); - end = ptr + btrfs_item_size_nr(eb, path->slots[0]); -#ifdef BTRFS_COMPAT_EXTENT_TREE_V0 - if (ptr + sizeof(struct btrfs_extent_item_v0) == end) - ptr = end; - else -#endif - ptr += sizeof(struct btrfs_extent_item); - - while (ptr < end) { - iref = (struct btrfs_extent_inline_ref 
*)ptr; - key.type = btrfs_extent_inline_ref_type(eb, iref); - if (key.type == BTRFS_SHARED_DATA_REF_KEY) { - key.offset = btrfs_extent_inline_ref_offset(eb, iref); - ret = __add_tree_block(rc, key.offset, blocksize, - blocks); - } else if (key.type == BTRFS_EXTENT_DATA_REF_KEY) { - dref = (struct btrfs_extent_data_ref *)(&iref->offset); - ret = find_data_references(rc, extent_key, - eb, dref, blocks); - } else { - BUG(); - } - ptr += btrfs_extent_inline_ref_size(key.type); - } - WARN_ON(ptr > end); - - while (1) { - cond_resched(); - eb = path->nodes[0]; - if (path->slots[0] >= btrfs_header_nritems(eb)) { - ret = btrfs_next_leaf(rc->extent_root, path); - if (ret < 0) { - err = ret; - break; - } - if (ret > 0) - break; - eb = path->nodes[0]; - } - - btrfs_item_key_to_cpu(eb, &key, path->slots[0]); - if (key.objectid != extent_key->objectid) - break; - -#ifdef BTRFS_COMPAT_EXTENT_TREE_V0 - if (key.type == BTRFS_SHARED_DATA_REF_KEY || - key.type == BTRFS_EXTENT_REF_V0_KEY) { -#else - BUG_ON(key.type == BTRFS_EXTENT_REF_V0_KEY); - if (key.type == BTRFS_SHARED_DATA_REF_KEY) { -#endif - ret = __add_tree_block(rc, key.offset, blocksize, - blocks); - } else if (key.type == BTRFS_EXTENT_DATA_REF_KEY) { - dref = btrfs_item_ptr(eb, path->slots[0], - struct btrfs_extent_data_ref); - ret = find_data_references(rc, extent_key, - eb, dref, blocks); - } else { - ret = 0; - } - if (ret) { - err = ret; - break; - } - path->slots[0]++; - } - btrfs_release_path(path); - if (err) - free_block_list(blocks); - return err; -} - -/* - * hepler to find next unprocessed extent - */ -static noinline_for_stack -int find_next_extent(struct btrfs_trans_handle *trans, - struct reloc_control *rc, struct btrfs_path *path, - struct btrfs_key *extent_key) -{ - struct btrfs_key key; - struct extent_buffer *leaf; - u64 start, end, last; - int ret; - - last = rc->block_group->key.objectid + rc->block_group->key.offset; - while (1) { - cond_resched(); - if (rc->search_start >= last) { - ret = 1; - break; - } - - key.objectid = rc->search_start; - key.type = BTRFS_EXTENT_ITEM_KEY; - key.offset = 0; - - path->search_commit_root = 1; - path->skip_locking = 1; - ret = btrfs_search_slot(NULL, rc->extent_root, &key, path, - 0, 0); - if (ret < 0) - break; -next: - leaf = path->nodes[0]; - if (path->slots[0] >= btrfs_header_nritems(leaf)) { - ret = btrfs_next_leaf(rc->extent_root, path); - if (ret != 0) - break; - leaf = path->nodes[0]; - } - - btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); - if (key.objectid >= last) { - ret = 1; - break; - } - - if (key.type != BTRFS_EXTENT_ITEM_KEY || - key.objectid + key.offset <= rc->search_start) { - path->slots[0]++; - goto next; - } - - ret = find_first_extent_bit(&rc->processed_blocks, - key.objectid, &start, &end, - EXTENT_DIRTY); - - if (ret == 0 && start <= key.objectid) { - btrfs_release_path(path); - rc->search_start = end + 1; - } else { - rc->search_start = key.objectid + key.offset; - memcpy(extent_key, &key, sizeof(key)); - return 0; - } - } - btrfs_release_path(path); - return ret; -} - -static void set_reloc_control(struct reloc_control *rc) -{ - struct btrfs_fs_info *fs_info = rc->extent_root->fs_info; - - mutex_lock(&fs_info->reloc_mutex); - fs_info->reloc_ctl = rc; - mutex_unlock(&fs_info->reloc_mutex); -} - -static void unset_reloc_control(struct reloc_control *rc) -{ - struct btrfs_fs_info *fs_info = rc->extent_root->fs_info; - - mutex_lock(&fs_info->reloc_mutex); - fs_info->reloc_ctl = NULL; - mutex_unlock(&fs_info->reloc_mutex); -} - -static int check_extent_flags(u64 
flags) -{ - if ((flags & BTRFS_EXTENT_FLAG_DATA) && - (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK)) - return 1; - if (!(flags & BTRFS_EXTENT_FLAG_DATA) && - !(flags & BTRFS_EXTENT_FLAG_TREE_BLOCK)) - return 1; - if ((flags & BTRFS_EXTENT_FLAG_DATA) && - (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF)) - return 1; - return 0; -} - -static noinline_for_stack -int prepare_to_relocate(struct reloc_control *rc) -{ - struct btrfs_trans_handle *trans; - int ret; - - rc->block_rsv = btrfs_alloc_block_rsv(rc->extent_root); - if (!rc->block_rsv) - return -ENOMEM; - - /* - * reserve some space for creating reloc trees. - * btrfs_init_reloc_root will use them when there - * is no reservation in transaction handle. - */ - ret = btrfs_block_rsv_add(rc->extent_root, rc->block_rsv, - rc->extent_root->nodesize * 256); - if (ret) - return ret; - - memset(&rc->cluster, 0, sizeof(rc->cluster)); - rc->search_start = rc->block_group->key.objectid; - rc->extents_found = 0; - rc->nodes_relocated = 0; - rc->merging_rsv_size = 0; - - rc->create_reloc_tree = 1; - set_reloc_control(rc); - - trans = btrfs_join_transaction(rc->extent_root); - BUG_ON(IS_ERR(trans)); - btrfs_commit_transaction(trans, rc->extent_root); - return 0; -} - -static noinline_for_stack int relocate_block_group(struct reloc_control *rc) -{ - struct rb_root blocks = RB_ROOT; - struct btrfs_key key; - struct btrfs_trans_handle *trans = NULL; - struct btrfs_path *path; - struct btrfs_extent_item *ei; - unsigned long nr; - u64 flags; - u32 item_size; - int ret; - int err = 0; - int progress = 0; - - path = btrfs_alloc_path(); - if (!path) - return -ENOMEM; - path->reada = 1; - - ret = prepare_to_relocate(rc); - if (ret) { - err = ret; - goto out_free; - } - - while (1) { - progress++; - trans = btrfs_start_transaction(rc->extent_root, 0); - BUG_ON(IS_ERR(trans)); -restart: - if (update_backref_cache(trans, &rc->backref_cache)) { - btrfs_end_transaction(trans, rc->extent_root); - continue; - } - - ret = find_next_extent(trans, rc, path, &key); - if (ret < 0) - err = ret; - if (ret != 0) - break; - - rc->extents_found++; - - ei = btrfs_item_ptr(path->nodes[0], path->slots[0], - struct btrfs_extent_item); - item_size = btrfs_item_size_nr(path->nodes[0], path->slots[0]); - if (item_size >= sizeof(*ei)) { - flags = btrfs_extent_flags(path->nodes[0], ei); - ret = check_extent_flags(flags); - BUG_ON(ret); - - } else { -#ifdef BTRFS_COMPAT_EXTENT_TREE_V0 - u64 ref_owner; - int path_change = 0; - - BUG_ON(item_size != - sizeof(struct btrfs_extent_item_v0)); - ret = get_ref_objectid_v0(rc, path, &key, &ref_owner, - &path_change); - if (ref_owner < BTRFS_FIRST_FREE_OBJECTID) - flags = BTRFS_EXTENT_FLAG_TREE_BLOCK; - else - flags = BTRFS_EXTENT_FLAG_DATA; - - if (path_change) { - btrfs_release_path(path); - - path->search_commit_root = 1; - path->skip_locking = 1; - ret = btrfs_search_slot(NULL, rc->extent_root, - &key, path, 0, 0); - if (ret < 0) { - err = ret; - break; - } - BUG_ON(ret > 0); - } -#else - BUG(); -#endif - } - - if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) { - ret = add_tree_block(rc, &key, path, &blocks); - } else if (rc->stage == UPDATE_DATA_PTRS && - (flags & BTRFS_EXTENT_FLAG_DATA)) { - ret = add_data_references(rc, &key, path, &blocks); - } else { - btrfs_release_path(path); - ret = 0; - } - if (ret < 0) { - err = ret; - break; - } - - if (!RB_EMPTY_ROOT(&blocks)) { - ret = relocate_tree_blocks(trans, rc, &blocks); - if (ret < 0) { - if (ret != -EAGAIN) { - err = ret; - break; - } - rc->extents_found--; - rc->search_start = key.objectid; - } - } - - ret = 
btrfs_block_rsv_check(rc->extent_root, rc->block_rsv, 5); - if (ret < 0) { - if (ret != -ENOSPC) { - err = ret; - WARN_ON(1); - break; - } - rc->commit_transaction = 1; - } - - if (rc->commit_transaction) { - rc->commit_transaction = 0; - ret = btrfs_commit_transaction(trans, rc->extent_root); - BUG_ON(ret); - } else { - nr = trans->blocks_used; - btrfs_end_transaction_throttle(trans, rc->extent_root); - btrfs_btree_balance_dirty(rc->extent_root, nr); - } - trans = NULL; - - if (rc->stage == MOVE_DATA_EXTENTS && - (flags & BTRFS_EXTENT_FLAG_DATA)) { - rc->found_file_extent = 1; - ret = relocate_data_extent(rc->data_inode, - &key, &rc->cluster); - if (ret < 0) { - err = ret; - break; - } - } - } - if (trans && progress && err == -ENOSPC) { - ret = btrfs_force_chunk_alloc(trans, rc->extent_root, - rc->block_group->flags); - if (ret == 0) { - err = 0; - progress = 0; - goto restart; - } - } - - btrfs_release_path(path); - clear_extent_bits(&rc->processed_blocks, 0, (u64)-1, EXTENT_DIRTY, - GFP_NOFS); - - if (trans) { - nr = trans->blocks_used; - btrfs_end_transaction_throttle(trans, rc->extent_root); - btrfs_btree_balance_dirty(rc->extent_root, nr); - } - - if (!err) { - ret = relocate_file_extent_cluster(rc->data_inode, - &rc->cluster); - if (ret < 0) - err = ret; - } - - rc->create_reloc_tree = 0; - set_reloc_control(rc); - - backref_cache_cleanup(&rc->backref_cache); - btrfs_block_rsv_release(rc->extent_root, rc->block_rsv, (u64)-1); - - err = prepare_to_merge(rc, err); - - merge_reloc_roots(rc); - - rc->merge_reloc_tree = 0; - unset_reloc_control(rc); - btrfs_block_rsv_release(rc->extent_root, rc->block_rsv, (u64)-1); - - /* get rid of pinned extents */ - trans = btrfs_join_transaction(rc->extent_root); - if (IS_ERR(trans)) - err = PTR_ERR(trans); - else - btrfs_commit_transaction(trans, rc->extent_root); -out_free: - btrfs_free_block_rsv(rc->extent_root, rc->block_rsv); - btrfs_free_path(path); - return err; -} - -static int __insert_orphan_inode(struct btrfs_trans_handle *trans, - struct btrfs_root *root, u64 objectid) -{ - struct btrfs_path *path; - struct btrfs_inode_item *item; - struct extent_buffer *leaf; - int ret; - - path = btrfs_alloc_path(); - if (!path) - return -ENOMEM; - - ret = btrfs_insert_empty_inode(trans, root, path, objectid); - if (ret) - goto out; - - leaf = path->nodes[0]; - item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_inode_item); - memset_extent_buffer(leaf, 0, (unsigned long)item, sizeof(*item)); - btrfs_set_inode_generation(leaf, item, 1); - btrfs_set_inode_size(leaf, item, 0); - btrfs_set_inode_mode(leaf, item, S_IFREG | 0600); - btrfs_set_inode_flags(leaf, item, BTRFS_INODE_NOCOMPRESS | - BTRFS_INODE_PREALLOC); - btrfs_mark_buffer_dirty(leaf); - btrfs_release_path(path); -out: - btrfs_free_path(path); - return ret; -} - -/* - * helper to create inode for data relocation. 
- * the inode is in data relocation tree and its link count is 0 - */ -static noinline_for_stack -struct inode *create_reloc_inode(struct btrfs_fs_info *fs_info, - struct btrfs_block_group_cache *group) -{ - struct inode *inode = NULL; - struct btrfs_trans_handle *trans; - struct btrfs_root *root; - struct btrfs_key key; - unsigned long nr; - u64 objectid = BTRFS_FIRST_FREE_OBJECTID; - int err = 0; - - root = read_fs_root(fs_info, BTRFS_DATA_RELOC_TREE_OBJECTID); - if (IS_ERR(root)) - return ERR_CAST(root); - - trans = btrfs_start_transaction(root, 6); - if (IS_ERR(trans)) - return ERR_CAST(trans); - - err = btrfs_find_free_objectid(root, &objectid); - if (err) - goto out; - - err = __insert_orphan_inode(trans, root, objectid); - BUG_ON(err); - - key.objectid = objectid; - key.type = BTRFS_INODE_ITEM_KEY; - key.offset = 0; - inode = btrfs_iget(root->fs_info->sb, &key, root, NULL); - BUG_ON(IS_ERR(inode) || is_bad_inode(inode)); - BTRFS_I(inode)->index_cnt = group->key.objectid; - - err = btrfs_orphan_add(trans, inode); -out: - nr = trans->blocks_used; - btrfs_end_transaction(trans, root); - btrfs_btree_balance_dirty(root, nr); - if (err) { - if (inode) - iput(inode); - inode = ERR_PTR(err); - } - return inode; -} - -static struct reloc_control *alloc_reloc_control(void) -{ - struct reloc_control *rc; - - rc = kzalloc(sizeof(*rc), GFP_NOFS); - if (!rc) - return NULL; - - INIT_LIST_HEAD(&rc->reloc_roots); - backref_cache_init(&rc->backref_cache); - mapping_tree_init(&rc->reloc_root_tree); - extent_io_tree_init(&rc->processed_blocks, NULL); - return rc; -} - -/* - * function to relocate all extents in a block group. - */ -int btrfs_relocate_block_group(struct btrfs_root *extent_root, u64 group_start) -{ - struct btrfs_fs_info *fs_info = extent_root->fs_info; - struct reloc_control *rc; - struct inode *inode; - struct btrfs_path *path; - int ret; - int rw = 0; - int err = 0; - - rc = alloc_reloc_control(); - if (!rc) - return -ENOMEM; - - rc->extent_root = extent_root; - - rc->block_group = btrfs_lookup_block_group(fs_info, group_start); - BUG_ON(!rc->block_group); - - if (!rc->block_group->ro) { - ret = btrfs_set_block_group_ro(extent_root, rc->block_group); - if (ret) { - err = ret; - goto out; - } - rw = 1; - } - - path = btrfs_alloc_path(); - if (!path) { - err = -ENOMEM; - goto out; - } - - inode = lookup_free_space_inode(fs_info->tree_root, rc->block_group, - path); - btrfs_free_path(path); - - if (!IS_ERR(inode)) - ret = delete_block_group_cache(fs_info, inode, 0); - else - ret = PTR_ERR(inode); - - if (ret && ret != -ENOENT) { - err = ret; - goto out; - } - - rc->data_inode = create_reloc_inode(fs_info, rc->block_group); - if (IS_ERR(rc->data_inode)) { - err = PTR_ERR(rc->data_inode); - rc->data_inode = NULL; - goto out; - } - - printk(KERN_INFO "btrfs: relocating block group %llu flags %llu\n", - (unsigned long long)rc->block_group->key.objectid, - (unsigned long long)rc->block_group->flags); - - btrfs_start_delalloc_inodes(fs_info->tree_root, 0); - btrfs_wait_ordered_extents(fs_info->tree_root, 0, 0); - - while (1) { - mutex_lock(&fs_info->cleaner_mutex); - - btrfs_clean_old_snapshots(fs_info->tree_root); - ret = relocate_block_group(rc); - - mutex_unlock(&fs_info->cleaner_mutex); - if (ret < 0) { - err = ret; - goto out; - } - - if (rc->extents_found == 0) - break; - - printk(KERN_INFO "btrfs: found %llu extents\n", - (unsigned long long)rc->extents_found); - - if (rc->stage == MOVE_DATA_EXTENTS && rc->found_file_extent) { - btrfs_wait_ordered_range(rc->data_inode, 0, (u64)-1); - 
invalidate_mapping_pages(rc->data_inode->i_mapping, - 0, -1); - rc->stage = UPDATE_DATA_PTRS; - } - } - - filemap_write_and_wait_range(fs_info->btree_inode->i_mapping, - rc->block_group->key.objectid, - rc->block_group->key.objectid + - rc->block_group->key.offset - 1); - - WARN_ON(rc->block_group->pinned > 0); - WARN_ON(rc->block_group->reserved > 0); - WARN_ON(btrfs_block_group_used(&rc->block_group->item) > 0); -out: - if (err && rw) - btrfs_set_block_group_rw(extent_root, rc->block_group); - iput(rc->data_inode); - btrfs_put_block_group(rc->block_group); - kfree(rc); - return err; -} - -static noinline_for_stack int mark_garbage_root(struct btrfs_root *root) -{ - struct btrfs_trans_handle *trans; - int ret, err; - - trans = btrfs_start_transaction(root->fs_info->tree_root, 0); - if (IS_ERR(trans)) - return PTR_ERR(trans); - - memset(&root->root_item.drop_progress, 0, - sizeof(root->root_item.drop_progress)); - root->root_item.drop_level = 0; - btrfs_set_root_refs(&root->root_item, 0); - ret = btrfs_update_root(trans, root->fs_info->tree_root, - &root->root_key, &root->root_item); - - err = btrfs_end_transaction(trans, root->fs_info->tree_root); - if (err) - return err; - return ret; -} - -/* - * recover relocation interrupted by system crash. - * - * this function resumes merging reloc trees with corresponding fs trees. - * this is important for keeping the sharing of tree blocks - */ -int btrfs_recover_relocation(struct btrfs_root *root) -{ - LIST_HEAD(reloc_roots); - struct btrfs_key key; - struct btrfs_root *fs_root; - struct btrfs_root *reloc_root; - struct btrfs_path *path; - struct extent_buffer *leaf; - struct reloc_control *rc = NULL; - struct btrfs_trans_handle *trans; - int ret; - int err = 0; - - path = btrfs_alloc_path(); - if (!path) - return -ENOMEM; - path->reada = -1; - - key.objectid = BTRFS_TREE_RELOC_OBJECTID; - key.type = BTRFS_ROOT_ITEM_KEY; - key.offset = (u64)-1; - - while (1) { - ret = btrfs_search_slot(NULL, root->fs_info->tree_root, &key, - path, 0, 0); - if (ret < 0) { - err = ret; - goto out; - } - if (ret > 0) { - if (path->slots[0] == 0) - break; - path->slots[0]--; - } - leaf = path->nodes[0]; - btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); - btrfs_release_path(path); - - if (key.objectid != BTRFS_TREE_RELOC_OBJECTID || - key.type != BTRFS_ROOT_ITEM_KEY) - break; - - reloc_root = btrfs_read_fs_root_no_radix(root, &key); - if (IS_ERR(reloc_root)) { - err = PTR_ERR(reloc_root); - goto out; - } - - list_add(&reloc_root->root_list, &reloc_roots); - - if (btrfs_root_refs(&reloc_root->root_item) > 0) { - fs_root = read_fs_root(root->fs_info, - reloc_root->root_key.offset); - if (IS_ERR(fs_root)) { - ret = PTR_ERR(fs_root); - if (ret != -ENOENT) { - err = ret; - goto out; - } - ret = mark_garbage_root(reloc_root); - if (ret < 0) { - err = ret; - goto out; - } - } - } - - if (key.offset == 0) - break; - - key.offset--; - } - btrfs_release_path(path); - - if (list_empty(&reloc_roots)) - goto out; - - rc = alloc_reloc_control(); - if (!rc) { - err = -ENOMEM; - goto out; - } - - rc->extent_root = root->fs_info->extent_root; - - set_reloc_control(rc); - - trans = btrfs_join_transaction(rc->extent_root); - if (IS_ERR(trans)) { - unset_reloc_control(rc); - err = PTR_ERR(trans); - goto out_free; - } - - rc->merge_reloc_tree = 1; - - while (!list_empty(&reloc_roots)) { - reloc_root = list_entry(reloc_roots.next, - struct btrfs_root, root_list); - list_del(&reloc_root->root_list); - - if (btrfs_root_refs(&reloc_root->root_item) == 0) { - 
list_add_tail(&reloc_root->root_list, - &rc->reloc_roots); - continue; - } - - fs_root = read_fs_root(root->fs_info, - reloc_root->root_key.offset); - if (IS_ERR(fs_root)) { - err = PTR_ERR(fs_root); - goto out_free; - } - - err = __add_reloc_root(reloc_root); - BUG_ON(err < 0); /* -ENOMEM or logic error */ - fs_root->reloc_root = reloc_root; - } - - err = btrfs_commit_transaction(trans, rc->extent_root); - if (err) - goto out_free; - - merge_reloc_roots(rc); - - unset_reloc_control(rc); - - trans = btrfs_join_transaction(rc->extent_root); - if (IS_ERR(trans)) - err = PTR_ERR(trans); - else - err = btrfs_commit_transaction(trans, rc->extent_root); -out_free: - kfree(rc); -out: - while (!list_empty(&reloc_roots)) { - reloc_root = list_entry(reloc_roots.next, - struct btrfs_root, root_list); - list_del(&reloc_root->root_list); - free_extent_buffer(reloc_root->node); - free_extent_buffer(reloc_root->commit_root); - kfree(reloc_root); - } - btrfs_free_path(path); - - if (err == 0) { - /* cleanup orphan inode in data relocation tree */ - fs_root = read_fs_root(root->fs_info, - BTRFS_DATA_RELOC_TREE_OBJECTID); - if (IS_ERR(fs_root)) - err = PTR_ERR(fs_root); - else - err = btrfs_orphan_cleanup(fs_root); - } - return err; -} - -/* - * helper to add ordered checksum for data relocation. - * - * cloning checksum properly handles the nodatasum extents. - * it also saves CPU time to re-calculate the checksum. - */ -int btrfs_reloc_clone_csums(struct inode *inode, u64 file_pos, u64 len) -{ - struct btrfs_ordered_sum *sums; - struct btrfs_sector_sum *sector_sum; - struct btrfs_ordered_extent *ordered; - struct btrfs_root *root = BTRFS_I(inode)->root; - size_t offset; - int ret; - u64 disk_bytenr; - LIST_HEAD(list); - - ordered = btrfs_lookup_ordered_extent(inode, file_pos); - BUG_ON(ordered->file_offset != file_pos || ordered->len != len); - - disk_bytenr = file_pos + BTRFS_I(inode)->index_cnt; - ret = btrfs_lookup_csums_range(root->fs_info->csum_root, disk_bytenr, - disk_bytenr + len - 1, &list, 0); - if (ret) - goto out; - - while (!list_empty(&list)) { - sums = list_entry(list.next, struct btrfs_ordered_sum, list); - list_del_init(&sums->list); - - sector_sum = sums->sums; - sums->bytenr = ordered->start; - - offset = 0; - while (offset < sums->len) { - sector_sum->bytenr += ordered->start - disk_bytenr; - sector_sum++; - offset += root->sectorsize; - } - - btrfs_add_ordered_sum(inode, ordered, sums); - } -out: - btrfs_put_ordered_extent(ordered); - return ret; -} - -void btrfs_reloc_cow_block(struct btrfs_trans_handle *trans, - struct btrfs_root *root, struct extent_buffer *buf, - struct extent_buffer *cow) -{ - struct reloc_control *rc; - struct backref_node *node; - int first_cow = 0; - int level; - int ret; - - rc = root->fs_info->reloc_ctl; - if (!rc) - return; - - BUG_ON(rc->stage == UPDATE_DATA_PTRS && - root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID); - - level = btrfs_header_level(buf); - if (btrfs_header_generation(buf) <= - btrfs_root_last_snapshot(&root->root_item)) - first_cow = 1; - - if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID && - rc->create_reloc_tree) { - WARN_ON(!first_cow && level == 0); - - node = rc->backref_cache.path[level]; - BUG_ON(node->bytenr != buf->start && - node->new_bytenr != buf->start); - - drop_node_buffer(node); - extent_buffer_get(cow); - node->eb = cow; - node->new_bytenr = cow->start; - - if (!node->pending) { - list_move_tail(&node->list, - &rc->backref_cache.pending[level]); - node->pending = 1; - } - - if (first_cow) - 
__mark_block_processed(rc, node); - - if (first_cow && level > 0) - rc->nodes_relocated += buf->len; - } - - if (level == 0 && first_cow && rc->stage == UPDATE_DATA_PTRS) { - ret = replace_file_extents(trans, rc, root, cow); - BUG_ON(ret); - } -} - -/* - * called before creating snapshot. it calculates metadata reservation - * requried for relocating tree blocks in the snapshot - */ -void btrfs_reloc_pre_snapshot(struct btrfs_trans_handle *trans, - struct btrfs_pending_snapshot *pending, - u64 *bytes_to_reserve) -{ - struct btrfs_root *root; - struct reloc_control *rc; - - root = pending->root; - if (!root->reloc_root) - return; - - rc = root->fs_info->reloc_ctl; - if (!rc->merge_reloc_tree) - return; - - root = root->reloc_root; - BUG_ON(btrfs_root_refs(&root->root_item) == 0); - /* - * relocation is in the stage of merging trees. the space - * used by merging a reloc tree is twice the size of - * relocated tree nodes in the worst case. half for cowing - * the reloc tree, half for cowing the fs tree. the space - * used by cowing the reloc tree will be freed after the - * tree is dropped. if we create snapshot, cowing the fs - * tree may use more space than it frees. so we need - * reserve extra space. - */ - *bytes_to_reserve += rc->nodes_relocated; -} - -/* - * called after snapshot is created. migrate block reservation - * and create reloc root for the newly created snapshot - */ -int btrfs_reloc_post_snapshot(struct btrfs_trans_handle *trans, - struct btrfs_pending_snapshot *pending) -{ - struct btrfs_root *root = pending->root; - struct btrfs_root *reloc_root; - struct btrfs_root *new_root; - struct reloc_control *rc; - int ret; - - if (!root->reloc_root) - return 0; - - rc = root->fs_info->reloc_ctl; - rc->merging_rsv_size += rc->nodes_relocated; - - if (rc->merge_reloc_tree) { - ret = btrfs_block_rsv_migrate(&pending->block_rsv, - rc->block_rsv, - rc->nodes_relocated); - if (ret) - return ret; - } - - new_root = pending->snap; - reloc_root = create_reloc_root(trans, root->reloc_root, - new_root->root_key.objectid); - if (IS_ERR(reloc_root)) - return PTR_ERR(reloc_root); - - ret = __add_reloc_root(reloc_root); - BUG_ON(ret < 0); - new_root->reloc_root = reloc_root; - - if (rc->create_reloc_tree) - ret = clone_backref_node(trans, rc, root, reloc_root); - return ret; -} diff --git a/ANDROID_3.4.5/fs/btrfs/root-tree.c b/ANDROID_3.4.5/fs/btrfs/root-tree.c deleted file mode 100644 index 24fb8ce4..00000000 --- a/ANDROID_3.4.5/fs/btrfs/root-tree.c +++ /dev/null @@ -1,456 +0,0 @@ -/* - * Copyright (C) 2007 Oracle. All rights reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public - * License v2 as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public - * License along with this program; if not, write to the - * Free Software Foundation, Inc., 59 Temple Place - Suite 330, - * Boston, MA 021110-1307, USA. - */ - -#include "ctree.h" -#include "transaction.h" -#include "disk-io.h" -#include "print-tree.h" - -/* - * lookup the root with the highest offset for a given objectid. The key we do - * find is copied into 'key'. If we find something return 0, otherwise 1, < 0 - * on error. 
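
btrfs_find_last_root() relies on a common btree idiom: search for (objectid, ROOT_ITEM, (u64)-1), a key that can never match exactly, then step back one slot to land on the highest-offset root item for that objectid. A minimal stand-alone sketch of the same idiom over a sorted array (the helper names below are invented for the example, not kernel APIs):

#include <stdio.h>
#include <stdint.h>

struct key { uint64_t objectid, offset; };

/* index of the first element >= target, i.e. where the search would land */
static size_t lower_bound(const struct key *k, size_t n, struct key target)
{
        size_t lo = 0, hi = n;

        while (lo < hi) {
                size_t mid = (lo + hi) / 2;
                if (k[mid].objectid < target.objectid ||
                    (k[mid].objectid == target.objectid &&
                     k[mid].offset < target.offset))
                        lo = mid + 1;
                else
                        hi = mid;
        }
        return lo;
}

int main(void)
{
        struct key items[] = { {256, 0}, {256, 10}, {256, 30}, {257, 0} };
        struct key search = { .objectid = 256, .offset = UINT64_MAX };
        size_t slot = lower_bound(items, 4, search);

        /* step back one slot and re-check the key, as the kernel code does */
        if (slot == 0 || items[slot - 1].objectid != 256)
                printf("not found\n");
        else
                printf("last root item: offset %llu\n",
                       (unsigned long long)items[slot - 1].offset);
        return 0;
}
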
- */ -int btrfs_find_last_root(struct btrfs_root *root, u64 objectid, - struct btrfs_root_item *item, struct btrfs_key *key) -{ - struct btrfs_path *path; - struct btrfs_key search_key; - struct btrfs_key found_key; - struct extent_buffer *l; - int ret; - int slot; - - search_key.objectid = objectid; - search_key.type = BTRFS_ROOT_ITEM_KEY; - search_key.offset = (u64)-1; - - path = btrfs_alloc_path(); - if (!path) - return -ENOMEM; - ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0); - if (ret < 0) - goto out; - - BUG_ON(ret == 0); - if (path->slots[0] == 0) { - ret = 1; - goto out; - } - l = path->nodes[0]; - slot = path->slots[0] - 1; - btrfs_item_key_to_cpu(l, &found_key, slot); - if (found_key.objectid != objectid || - found_key.type != BTRFS_ROOT_ITEM_KEY) { - ret = 1; - goto out; - } - if (item) - read_extent_buffer(l, item, btrfs_item_ptr_offset(l, slot), - sizeof(*item)); - if (key) - memcpy(key, &found_key, sizeof(found_key)); - ret = 0; -out: - btrfs_free_path(path); - return ret; -} - -void btrfs_set_root_node(struct btrfs_root_item *item, - struct extent_buffer *node) -{ - btrfs_set_root_bytenr(item, node->start); - btrfs_set_root_level(item, btrfs_header_level(node)); - btrfs_set_root_generation(item, btrfs_header_generation(node)); -} - -/* - * copy the data in 'item' into the btree - */ -int btrfs_update_root(struct btrfs_trans_handle *trans, struct btrfs_root - *root, struct btrfs_key *key, struct btrfs_root_item - *item) -{ - struct btrfs_path *path; - struct extent_buffer *l; - int ret; - int slot; - unsigned long ptr; - - path = btrfs_alloc_path(); - if (!path) - return -ENOMEM; - - ret = btrfs_search_slot(trans, root, key, path, 0, 1); - if (ret < 0) { - btrfs_abort_transaction(trans, root, ret); - goto out; - } - - if (ret != 0) { - btrfs_print_leaf(root, path->nodes[0]); - printk(KERN_CRIT "unable to update root key %llu %u %llu\n", - (unsigned long long)key->objectid, key->type, - (unsigned long long)key->offset); - BUG_ON(1); - } - - l = path->nodes[0]; - slot = path->slots[0]; - ptr = btrfs_item_ptr_offset(l, slot); - write_extent_buffer(l, item, ptr, sizeof(*item)); - btrfs_mark_buffer_dirty(path->nodes[0]); -out: - btrfs_free_path(path); - return ret; -} - -int btrfs_insert_root(struct btrfs_trans_handle *trans, struct btrfs_root *root, - struct btrfs_key *key, struct btrfs_root_item *item) -{ - return btrfs_insert_item(trans, root, key, item, sizeof(*item)); -} - -/* - * at mount time we want to find all the old transaction snapshots that were in - * the process of being deleted if we crashed. This is any root item with an - * offset lower than the latest root. They need to be queued for deletion to - * finish what was happening when we crashed. 
- */ -int btrfs_find_dead_roots(struct btrfs_root *root, u64 objectid) -{ - struct btrfs_root *dead_root; - struct btrfs_root_item *ri; - struct btrfs_key key; - struct btrfs_key found_key; - struct btrfs_path *path; - int ret; - u32 nritems; - struct extent_buffer *leaf; - int slot; - - key.objectid = objectid; - btrfs_set_key_type(&key, BTRFS_ROOT_ITEM_KEY); - key.offset = 0; - path = btrfs_alloc_path(); - if (!path) - return -ENOMEM; - -again: - ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); - if (ret < 0) - goto err; - while (1) { - leaf = path->nodes[0]; - nritems = btrfs_header_nritems(leaf); - slot = path->slots[0]; - if (slot >= nritems) { - ret = btrfs_next_leaf(root, path); - if (ret) - break; - leaf = path->nodes[0]; - nritems = btrfs_header_nritems(leaf); - slot = path->slots[0]; - } - btrfs_item_key_to_cpu(leaf, &key, slot); - if (btrfs_key_type(&key) != BTRFS_ROOT_ITEM_KEY) - goto next; - - if (key.objectid < objectid) - goto next; - - if (key.objectid > objectid) - break; - - ri = btrfs_item_ptr(leaf, slot, struct btrfs_root_item); - if (btrfs_disk_root_refs(leaf, ri) != 0) - goto next; - - memcpy(&found_key, &key, sizeof(key)); - key.offset++; - btrfs_release_path(path); - dead_root = - btrfs_read_fs_root_no_radix(root->fs_info->tree_root, - &found_key); - if (IS_ERR(dead_root)) { - ret = PTR_ERR(dead_root); - goto err; - } - - ret = btrfs_add_dead_root(dead_root); - if (ret) - goto err; - goto again; -next: - slot++; - path->slots[0]++; - } - ret = 0; -err: - btrfs_free_path(path); - return ret; -} - -int btrfs_find_orphan_roots(struct btrfs_root *tree_root) -{ - struct extent_buffer *leaf; - struct btrfs_path *path; - struct btrfs_key key; - struct btrfs_key root_key; - struct btrfs_root *root; - int err = 0; - int ret; - - path = btrfs_alloc_path(); - if (!path) - return -ENOMEM; - - key.objectid = BTRFS_ORPHAN_OBJECTID; - key.type = BTRFS_ORPHAN_ITEM_KEY; - key.offset = 0; - - root_key.type = BTRFS_ROOT_ITEM_KEY; - root_key.offset = (u64)-1; - - while (1) { - ret = btrfs_search_slot(NULL, tree_root, &key, path, 0, 0); - if (ret < 0) { - err = ret; - break; - } - - leaf = path->nodes[0]; - if (path->slots[0] >= btrfs_header_nritems(leaf)) { - ret = btrfs_next_leaf(tree_root, path); - if (ret < 0) - err = ret; - if (ret != 0) - break; - leaf = path->nodes[0]; - } - - btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); - btrfs_release_path(path); - - if (key.objectid != BTRFS_ORPHAN_OBJECTID || - key.type != BTRFS_ORPHAN_ITEM_KEY) - break; - - root_key.objectid = key.offset; - key.offset++; - - root = btrfs_read_fs_root_no_name(tree_root->fs_info, - &root_key); - if (!IS_ERR(root)) - continue; - - ret = PTR_ERR(root); - if (ret != -ENOENT) { - err = ret; - break; - } - - ret = btrfs_find_dead_roots(tree_root, root_key.objectid); - if (ret) { - err = ret; - break; - } - } - - btrfs_free_path(path); - return err; -} - -/* drop the root item for 'key' from 'root' */ -int btrfs_del_root(struct btrfs_trans_handle *trans, struct btrfs_root *root, - struct btrfs_key *key) -{ - struct btrfs_path *path; - int ret; - struct btrfs_root_item *ri; - struct extent_buffer *leaf; - - path = btrfs_alloc_path(); - if (!path) - return -ENOMEM; - ret = btrfs_search_slot(trans, root, key, path, -1, 1); - if (ret < 0) - goto out; - - BUG_ON(ret != 0); - leaf = path->nodes[0]; - ri = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_root_item); - - ret = btrfs_del_item(trans, root, path); -out: - btrfs_free_path(path); - return ret; -} - -int btrfs_del_root_ref(struct btrfs_trans_handle 
*trans, - struct btrfs_root *tree_root, - u64 root_id, u64 ref_id, u64 dirid, u64 *sequence, - const char *name, int name_len) - -{ - struct btrfs_path *path; - struct btrfs_root_ref *ref; - struct extent_buffer *leaf; - struct btrfs_key key; - unsigned long ptr; - int err = 0; - int ret; - - path = btrfs_alloc_path(); - if (!path) - return -ENOMEM; - - key.objectid = root_id; - key.type = BTRFS_ROOT_BACKREF_KEY; - key.offset = ref_id; -again: - ret = btrfs_search_slot(trans, tree_root, &key, path, -1, 1); - BUG_ON(ret < 0); - if (ret == 0) { - leaf = path->nodes[0]; - ref = btrfs_item_ptr(leaf, path->slots[0], - struct btrfs_root_ref); - - WARN_ON(btrfs_root_ref_dirid(leaf, ref) != dirid); - WARN_ON(btrfs_root_ref_name_len(leaf, ref) != name_len); - ptr = (unsigned long)(ref + 1); - WARN_ON(memcmp_extent_buffer(leaf, name, ptr, name_len)); - *sequence = btrfs_root_ref_sequence(leaf, ref); - - ret = btrfs_del_item(trans, tree_root, path); - if (ret) { - err = ret; - goto out; - } - } else - err = -ENOENT; - - if (key.type == BTRFS_ROOT_BACKREF_KEY) { - btrfs_release_path(path); - key.objectid = ref_id; - key.type = BTRFS_ROOT_REF_KEY; - key.offset = root_id; - goto again; - } - -out: - btrfs_free_path(path); - return err; -} - -int btrfs_find_root_ref(struct btrfs_root *tree_root, - struct btrfs_path *path, - u64 root_id, u64 ref_id) -{ - struct btrfs_key key; - int ret; - - key.objectid = root_id; - key.type = BTRFS_ROOT_REF_KEY; - key.offset = ref_id; - - ret = btrfs_search_slot(NULL, tree_root, &key, path, 0, 0); - return ret; -} - -/* - * add a btrfs_root_ref item. type is either BTRFS_ROOT_REF_KEY - * or BTRFS_ROOT_BACKREF_KEY. - * - * The dirid, sequence, name and name_len refer to the directory entry - * that is referencing the root. - * - * For a forward ref, the root_id is the id of the tree referencing - * the root and ref_id is the id of the subvol or snapshot. - * - * For a back ref the root_id is the id of the subvol or snapshot and - * ref_id is the id of the tree referencing it. - * - * Will return 0, -ENOMEM, or anything from the CoW path - */ -int btrfs_add_root_ref(struct btrfs_trans_handle *trans, - struct btrfs_root *tree_root, - u64 root_id, u64 ref_id, u64 dirid, u64 sequence, - const char *name, int name_len) -{ - struct btrfs_key key; - int ret; - struct btrfs_path *path; - struct btrfs_root_ref *ref; - struct extent_buffer *leaf; - unsigned long ptr; - - path = btrfs_alloc_path(); - if (!path) - return -ENOMEM; - - key.objectid = root_id; - key.type = BTRFS_ROOT_BACKREF_KEY; - key.offset = ref_id; -again: - ret = btrfs_insert_empty_item(trans, tree_root, path, &key, - sizeof(*ref) + name_len); - if (ret) { - btrfs_abort_transaction(trans, tree_root, ret); - btrfs_free_path(path); - return ret; - } - - leaf = path->nodes[0]; - ref = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_root_ref); - btrfs_set_root_ref_dirid(leaf, ref, dirid); - btrfs_set_root_ref_sequence(leaf, ref, sequence); - btrfs_set_root_ref_name_len(leaf, ref, name_len); - ptr = (unsigned long)(ref + 1); - write_extent_buffer(leaf, name, ptr, name_len); - btrfs_mark_buffer_dirty(leaf); - - if (key.type == BTRFS_ROOT_BACKREF_KEY) { - btrfs_release_path(path); - key.objectid = ref_id; - key.type = BTRFS_ROOT_REF_KEY; - key.offset = root_id; - goto again; - } - - btrfs_free_path(path); - return 0; -} - -/* - * Old btrfs forgets to init root_item->flags and root_item->byte_limit - * for subvolumes. 
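
The forward/back reference pair described above means each subvolume link is recorded twice, with objectid and offset swapped, which is why btrfs_add_root_ref() and btrfs_del_root_ref() jump back to the again: label to handle the second item. A small stand-alone illustration of the two keys (the key type constants are shown only for illustration):

#include <stdio.h>
#include <stdint.h>

struct key { uint64_t objectid; uint8_t type; uint64_t offset; };

enum { ROOT_BACKREF_KEY = 144, ROOT_REF_KEY = 156 };

int main(void)
{
        uint64_t root_id = 5;   /* tree holding the directory entry */
        uint64_t ref_id = 257;  /* subvolume/snapshot being referenced */

        struct key backref = { root_id, ROOT_BACKREF_KEY, ref_id };
        struct key fwdref  = { ref_id,  ROOT_REF_KEY,     root_id };

        printf("backref key (%llu %u %llu), forward key (%llu %u %llu)\n",
               (unsigned long long)backref.objectid, (unsigned)backref.type,
               (unsigned long long)backref.offset,
               (unsigned long long)fwdref.objectid, (unsigned)fwdref.type,
               (unsigned long long)fwdref.offset);
        return 0;
}
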
To work around this problem, we steal a bit from - * root_item->inode_item->flags, and use it to indicate if those fields - * have been properly initialized. - */ -void btrfs_check_and_init_root_item(struct btrfs_root_item *root_item) -{ - u64 inode_flags = le64_to_cpu(root_item->inode.flags); - - if (!(inode_flags & BTRFS_INODE_ROOT_ITEM_INIT)) { - inode_flags |= BTRFS_INODE_ROOT_ITEM_INIT; - root_item->inode.flags = cpu_to_le64(inode_flags); - root_item->flags = 0; - root_item->byte_limit = 0; - } -} diff --git a/ANDROID_3.4.5/fs/btrfs/scrub.c b/ANDROID_3.4.5/fs/btrfs/scrub.c deleted file mode 100644 index 2f3d6f91..00000000 --- a/ANDROID_3.4.5/fs/btrfs/scrub.c +++ /dev/null @@ -1,2440 +0,0 @@ -/* - * Copyright (C) 2011 STRATO. All rights reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public - * License v2 as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public - * License along with this program; if not, write to the - * Free Software Foundation, Inc., 59 Temple Place - Suite 330, - * Boston, MA 021110-1307, USA. - */ - -#include <linux/blkdev.h> -#include <linux/ratelimit.h> -#include "ctree.h" -#include "volumes.h" -#include "disk-io.h" -#include "ordered-data.h" -#include "transaction.h" -#include "backref.h" -#include "extent_io.h" -#include "check-integrity.h" - -/* - * This is only the first step towards a full-features scrub. It reads all - * extent and super block and verifies the checksums. In case a bad checksum - * is found or the extent cannot be read, good data will be written back if - * any can be found. 
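
The checksums scrub verifies are CRC-32C (Castagnoli). A generic bitwise CRC-32C routine is sketched below for illustration only; the kernel uses the optimized crc32c() helper with its own seed and on-disk layout, so this is not expected to reproduce btrfs checksums byte for byte:

#include <stdio.h>
#include <stdint.h>
#include <string.h>

static uint32_t crc32c(uint32_t crc, const void *data, size_t len)
{
        const uint8_t *p = data;

        crc = ~crc;
        while (len--) {
                crc ^= *p++;
                for (int i = 0; i < 8; i++)
                        crc = (crc >> 1) ^ (0x82F63B78 & -(crc & 1));
        }
        return ~crc;
}

int main(void)
{
        const char buf[] = "123456789";

        /* the well-known CRC-32C check value for "123456789" is 0xE3069283 */
        printf("crc32c = 0x%08X\n", crc32c(0, buf, strlen(buf)));
        return 0;
}
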
- * - * Future enhancements: - * - In case an unrepairable extent is encountered, track which files are - * affected and report them - * - track and record media errors, throw out bad devices - * - add a mode to also read unallocated space - */ - -struct scrub_block; -struct scrub_dev; - -#define SCRUB_PAGES_PER_BIO 16 /* 64k per bio */ -#define SCRUB_BIOS_PER_DEV 16 /* 1 MB per device in flight */ -#define SCRUB_MAX_PAGES_PER_BLOCK 16 /* 64k per node/leaf/sector */ - -struct scrub_page { - struct scrub_block *sblock; - struct page *page; - struct block_device *bdev; - u64 flags; /* extent flags */ - u64 generation; - u64 logical; - u64 physical; - struct { - unsigned int mirror_num:8; - unsigned int have_csum:1; - unsigned int io_error:1; - }; - u8 csum[BTRFS_CSUM_SIZE]; -}; - -struct scrub_bio { - int index; - struct scrub_dev *sdev; - struct bio *bio; - int err; - u64 logical; - u64 physical; - struct scrub_page *pagev[SCRUB_PAGES_PER_BIO]; - int page_count; - int next_free; - struct btrfs_work work; -}; - -struct scrub_block { - struct scrub_page pagev[SCRUB_MAX_PAGES_PER_BLOCK]; - int page_count; - atomic_t outstanding_pages; - atomic_t ref_count; /* free mem on transition to zero */ - struct scrub_dev *sdev; - struct { - unsigned int header_error:1; - unsigned int checksum_error:1; - unsigned int no_io_error_seen:1; - }; -}; - -struct scrub_dev { - struct scrub_bio *bios[SCRUB_BIOS_PER_DEV]; - struct btrfs_device *dev; - int first_free; - int curr; - atomic_t in_flight; - atomic_t fixup_cnt; - spinlock_t list_lock; - wait_queue_head_t list_wait; - u16 csum_size; - struct list_head csum_list; - atomic_t cancel_req; - int readonly; - int pages_per_bio; /* <= SCRUB_PAGES_PER_BIO */ - u32 sectorsize; - u32 nodesize; - u32 leafsize; - /* - * statistics - */ - struct btrfs_scrub_progress stat; - spinlock_t stat_lock; -}; - -struct scrub_fixup_nodatasum { - struct scrub_dev *sdev; - u64 logical; - struct btrfs_root *root; - struct btrfs_work work; - int mirror_num; -}; - -struct scrub_warning { - struct btrfs_path *path; - u64 extent_item_size; - char *scratch_buf; - char *msg_buf; - const char *errstr; - sector_t sector; - u64 logical; - struct btrfs_device *dev; - int msg_bufsize; - int scratch_bufsize; -}; - - -static int scrub_handle_errored_block(struct scrub_block *sblock_to_check); -static int scrub_setup_recheck_block(struct scrub_dev *sdev, - struct btrfs_mapping_tree *map_tree, - u64 length, u64 logical, - struct scrub_block *sblock); -static int scrub_recheck_block(struct btrfs_fs_info *fs_info, - struct scrub_block *sblock, int is_metadata, - int have_csum, u8 *csum, u64 generation, - u16 csum_size); -static void scrub_recheck_block_checksum(struct btrfs_fs_info *fs_info, - struct scrub_block *sblock, - int is_metadata, int have_csum, - const u8 *csum, u64 generation, - u16 csum_size); -static void scrub_complete_bio_end_io(struct bio *bio, int err); -static int scrub_repair_block_from_good_copy(struct scrub_block *sblock_bad, - struct scrub_block *sblock_good, - int force_write); -static int scrub_repair_page_from_good_copy(struct scrub_block *sblock_bad, - struct scrub_block *sblock_good, - int page_num, int force_write); -static int scrub_checksum_data(struct scrub_block *sblock); -static int scrub_checksum_tree_block(struct scrub_block *sblock); -static int scrub_checksum_super(struct scrub_block *sblock); -static void scrub_block_get(struct scrub_block *sblock); -static void scrub_block_put(struct scrub_block *sblock); -static int scrub_add_page_to_bio(struct scrub_dev 
*sdev, - struct scrub_page *spage); -static int scrub_pages(struct scrub_dev *sdev, u64 logical, u64 len, - u64 physical, u64 flags, u64 gen, int mirror_num, - u8 *csum, int force); -static void scrub_bio_end_io(struct bio *bio, int err); -static void scrub_bio_end_io_worker(struct btrfs_work *work); -static void scrub_block_complete(struct scrub_block *sblock); - - -static void scrub_free_csums(struct scrub_dev *sdev) -{ - while (!list_empty(&sdev->csum_list)) { - struct btrfs_ordered_sum *sum; - sum = list_first_entry(&sdev->csum_list, - struct btrfs_ordered_sum, list); - list_del(&sum->list); - kfree(sum); - } -} - -static noinline_for_stack void scrub_free_dev(struct scrub_dev *sdev) -{ - int i; - - if (!sdev) - return; - - /* this can happen when scrub is cancelled */ - if (sdev->curr != -1) { - struct scrub_bio *sbio = sdev->bios[sdev->curr]; - - for (i = 0; i < sbio->page_count; i++) { - BUG_ON(!sbio->pagev[i]); - BUG_ON(!sbio->pagev[i]->page); - scrub_block_put(sbio->pagev[i]->sblock); - } - bio_put(sbio->bio); - } - - for (i = 0; i < SCRUB_BIOS_PER_DEV; ++i) { - struct scrub_bio *sbio = sdev->bios[i]; - - if (!sbio) - break; - kfree(sbio); - } - - scrub_free_csums(sdev); - kfree(sdev); -} - -static noinline_for_stack -struct scrub_dev *scrub_setup_dev(struct btrfs_device *dev) -{ - struct scrub_dev *sdev; - int i; - struct btrfs_fs_info *fs_info = dev->dev_root->fs_info; - int pages_per_bio; - - pages_per_bio = min_t(int, SCRUB_PAGES_PER_BIO, - bio_get_nr_vecs(dev->bdev)); - sdev = kzalloc(sizeof(*sdev), GFP_NOFS); - if (!sdev) - goto nomem; - sdev->dev = dev; - sdev->pages_per_bio = pages_per_bio; - sdev->curr = -1; - for (i = 0; i < SCRUB_BIOS_PER_DEV; ++i) { - struct scrub_bio *sbio; - - sbio = kzalloc(sizeof(*sbio), GFP_NOFS); - if (!sbio) - goto nomem; - sdev->bios[i] = sbio; - - sbio->index = i; - sbio->sdev = sdev; - sbio->page_count = 0; - sbio->work.func = scrub_bio_end_io_worker; - - if (i != SCRUB_BIOS_PER_DEV-1) - sdev->bios[i]->next_free = i + 1; - else - sdev->bios[i]->next_free = -1; - } - sdev->first_free = 0; - sdev->nodesize = dev->dev_root->nodesize; - sdev->leafsize = dev->dev_root->leafsize; - sdev->sectorsize = dev->dev_root->sectorsize; - atomic_set(&sdev->in_flight, 0); - atomic_set(&sdev->fixup_cnt, 0); - atomic_set(&sdev->cancel_req, 0); - sdev->csum_size = btrfs_super_csum_size(fs_info->super_copy); - INIT_LIST_HEAD(&sdev->csum_list); - - spin_lock_init(&sdev->list_lock); - spin_lock_init(&sdev->stat_lock); - init_waitqueue_head(&sdev->list_wait); - return sdev; - -nomem: - scrub_free_dev(sdev); - return ERR_PTR(-ENOMEM); -} - -static int scrub_print_warning_inode(u64 inum, u64 offset, u64 root, void *ctx) -{ - u64 isize; - u32 nlink; - int ret; - int i; - struct extent_buffer *eb; - struct btrfs_inode_item *inode_item; - struct scrub_warning *swarn = ctx; - struct btrfs_fs_info *fs_info = swarn->dev->dev_root->fs_info; - struct inode_fs_paths *ipath = NULL; - struct btrfs_root *local_root; - struct btrfs_key root_key; - - root_key.objectid = root; - root_key.type = BTRFS_ROOT_ITEM_KEY; - root_key.offset = (u64)-1; - local_root = btrfs_read_fs_root_no_name(fs_info, &root_key); - if (IS_ERR(local_root)) { - ret = PTR_ERR(local_root); - goto err; - } - - ret = inode_item_info(inum, 0, local_root, swarn->path); - if (ret) { - btrfs_release_path(swarn->path); - goto err; - } - - eb = swarn->path->nodes[0]; - inode_item = btrfs_item_ptr(eb, swarn->path->slots[0], - struct btrfs_inode_item); - isize = btrfs_inode_size(eb, inode_item); - nlink = 
btrfs_inode_nlink(eb, inode_item); - btrfs_release_path(swarn->path); - - ipath = init_ipath(4096, local_root, swarn->path); - if (IS_ERR(ipath)) { - ret = PTR_ERR(ipath); - ipath = NULL; - goto err; - } - ret = paths_from_inode(inum, ipath); - - if (ret < 0) - goto err; - - /* - * we deliberately ignore the bit ipath might have been too small to - * hold all of the paths here - */ - for (i = 0; i < ipath->fspath->elem_cnt; ++i) - printk(KERN_WARNING "btrfs: %s at logical %llu on dev " - "%s, sector %llu, root %llu, inode %llu, offset %llu, " - "length %llu, links %u (path: %s)\n", swarn->errstr, - swarn->logical, swarn->dev->name, - (unsigned long long)swarn->sector, root, inum, offset, - min(isize - offset, (u64)PAGE_SIZE), nlink, - (char *)(unsigned long)ipath->fspath->val[i]); - - free_ipath(ipath); - return 0; - -err: - printk(KERN_WARNING "btrfs: %s at logical %llu on dev " - "%s, sector %llu, root %llu, inode %llu, offset %llu: path " - "resolving failed with ret=%d\n", swarn->errstr, - swarn->logical, swarn->dev->name, - (unsigned long long)swarn->sector, root, inum, offset, ret); - - free_ipath(ipath); - return 0; -} - -static void scrub_print_warning(const char *errstr, struct scrub_block *sblock) -{ - struct btrfs_device *dev = sblock->sdev->dev; - struct btrfs_fs_info *fs_info = dev->dev_root->fs_info; - struct btrfs_path *path; - struct btrfs_key found_key; - struct extent_buffer *eb; - struct btrfs_extent_item *ei; - struct scrub_warning swarn; - u32 item_size; - int ret; - u64 ref_root; - u8 ref_level; - unsigned long ptr = 0; - const int bufsize = 4096; - u64 extent_item_pos; - - path = btrfs_alloc_path(); - - swarn.scratch_buf = kmalloc(bufsize, GFP_NOFS); - swarn.msg_buf = kmalloc(bufsize, GFP_NOFS); - BUG_ON(sblock->page_count < 1); - swarn.sector = (sblock->pagev[0].physical) >> 9; - swarn.logical = sblock->pagev[0].logical; - swarn.errstr = errstr; - swarn.dev = dev; - swarn.msg_bufsize = bufsize; - swarn.scratch_bufsize = bufsize; - - if (!path || !swarn.scratch_buf || !swarn.msg_buf) - goto out; - - ret = extent_from_logical(fs_info, swarn.logical, path, &found_key); - if (ret < 0) - goto out; - - extent_item_pos = swarn.logical - found_key.objectid; - swarn.extent_item_size = found_key.offset; - - eb = path->nodes[0]; - ei = btrfs_item_ptr(eb, path->slots[0], struct btrfs_extent_item); - item_size = btrfs_item_size_nr(eb, path->slots[0]); - btrfs_release_path(path); - - if (ret & BTRFS_EXTENT_FLAG_TREE_BLOCK) { - do { - ret = tree_backref_for_extent(&ptr, eb, ei, item_size, - &ref_root, &ref_level); - printk(KERN_WARNING - "btrfs: %s at logical %llu on dev %s, " - "sector %llu: metadata %s (level %d) in tree " - "%llu\n", errstr, swarn.logical, dev->name, - (unsigned long long)swarn.sector, - ref_level ? "node" : "leaf", - ret < 0 ? -1 : ref_level, - ret < 0 ? 
-1 : ref_root); - } while (ret != 1); - } else { - swarn.path = path; - iterate_extent_inodes(fs_info, found_key.objectid, - extent_item_pos, 1, - scrub_print_warning_inode, &swarn); - } - -out: - btrfs_free_path(path); - kfree(swarn.scratch_buf); - kfree(swarn.msg_buf); -} - -static int scrub_fixup_readpage(u64 inum, u64 offset, u64 root, void *ctx) -{ - struct page *page = NULL; - unsigned long index; - struct scrub_fixup_nodatasum *fixup = ctx; - int ret; - int corrected = 0; - struct btrfs_key key; - struct inode *inode = NULL; - u64 end = offset + PAGE_SIZE - 1; - struct btrfs_root *local_root; - - key.objectid = root; - key.type = BTRFS_ROOT_ITEM_KEY; - key.offset = (u64)-1; - local_root = btrfs_read_fs_root_no_name(fixup->root->fs_info, &key); - if (IS_ERR(local_root)) - return PTR_ERR(local_root); - - key.type = BTRFS_INODE_ITEM_KEY; - key.objectid = inum; - key.offset = 0; - inode = btrfs_iget(fixup->root->fs_info->sb, &key, local_root, NULL); - if (IS_ERR(inode)) - return PTR_ERR(inode); - - index = offset >> PAGE_CACHE_SHIFT; - - page = find_or_create_page(inode->i_mapping, index, GFP_NOFS); - if (!page) { - ret = -ENOMEM; - goto out; - } - - if (PageUptodate(page)) { - struct btrfs_mapping_tree *map_tree; - if (PageDirty(page)) { - /* - * we need to write the data to the defect sector. the - * data that was in that sector is not in memory, - * because the page was modified. we must not write the - * modified page to that sector. - * - * TODO: what could be done here: wait for the delalloc - * runner to write out that page (might involve - * COW) and see whether the sector is still - * referenced afterwards. - * - * For the meantime, we'll treat this error - * incorrectable, although there is a chance that a - * later scrub will find the bad sector again and that - * there's no dirty page in memory, then. - */ - ret = -EIO; - goto out; - } - map_tree = &BTRFS_I(inode)->root->fs_info->mapping_tree; - ret = repair_io_failure(map_tree, offset, PAGE_SIZE, - fixup->logical, page, - fixup->mirror_num); - unlock_page(page); - corrected = !ret; - } else { - /* - * we need to get good data first. the general readpage path - * will call repair_io_failure for us, we just have to make - * sure we read the bad mirror. - */ - ret = set_extent_bits(&BTRFS_I(inode)->io_tree, offset, end, - EXTENT_DAMAGED, GFP_NOFS); - if (ret) { - /* set_extent_bits should give proper error */ - WARN_ON(ret > 0); - if (ret > 0) - ret = -EFAULT; - goto out; - } - - ret = extent_read_full_page(&BTRFS_I(inode)->io_tree, page, - btrfs_get_extent, - fixup->mirror_num); - wait_on_page_locked(page); - - corrected = !test_range_bit(&BTRFS_I(inode)->io_tree, offset, - end, EXTENT_DAMAGED, 0, NULL); - if (!corrected) - clear_extent_bits(&BTRFS_I(inode)->io_tree, offset, end, - EXTENT_DAMAGED, GFP_NOFS); - } - -out: - if (page) - put_page(page); - if (inode) - iput(inode); - - if (ret < 0) - return ret; - - if (ret == 0 && corrected) { - /* - * we only need to call readpage for one of the inodes belonging - * to this extent. 
so make iterate_extent_inodes stop - */ - return 1; - } - - return -EIO; -} - -static void scrub_fixup_nodatasum(struct btrfs_work *work) -{ - int ret; - struct scrub_fixup_nodatasum *fixup; - struct scrub_dev *sdev; - struct btrfs_trans_handle *trans = NULL; - struct btrfs_fs_info *fs_info; - struct btrfs_path *path; - int uncorrectable = 0; - - fixup = container_of(work, struct scrub_fixup_nodatasum, work); - sdev = fixup->sdev; - fs_info = fixup->root->fs_info; - - path = btrfs_alloc_path(); - if (!path) { - spin_lock(&sdev->stat_lock); - ++sdev->stat.malloc_errors; - spin_unlock(&sdev->stat_lock); - uncorrectable = 1; - goto out; - } - - trans = btrfs_join_transaction(fixup->root); - if (IS_ERR(trans)) { - uncorrectable = 1; - goto out; - } - - /* - * the idea is to trigger a regular read through the standard path. we - * read a page from the (failed) logical address by specifying the - * corresponding copynum of the failed sector. thus, that readpage is - * expected to fail. - * that is the point where on-the-fly error correction will kick in - * (once it's finished) and rewrite the failed sector if a good copy - * can be found. - */ - ret = iterate_inodes_from_logical(fixup->logical, fixup->root->fs_info, - path, scrub_fixup_readpage, - fixup); - if (ret < 0) { - uncorrectable = 1; - goto out; - } - WARN_ON(ret != 1); - - spin_lock(&sdev->stat_lock); - ++sdev->stat.corrected_errors; - spin_unlock(&sdev->stat_lock); - -out: - if (trans && !IS_ERR(trans)) - btrfs_end_transaction(trans, fixup->root); - if (uncorrectable) { - spin_lock(&sdev->stat_lock); - ++sdev->stat.uncorrectable_errors; - spin_unlock(&sdev->stat_lock); - printk_ratelimited(KERN_ERR - "btrfs: unable to fixup (nodatasum) error at logical %llu on dev %s\n", - (unsigned long long)fixup->logical, sdev->dev->name); - } - - btrfs_free_path(path); - kfree(fixup); - - /* see caller why we're pretending to be paused in the scrub counters */ - mutex_lock(&fs_info->scrub_lock); - atomic_dec(&fs_info->scrubs_running); - atomic_dec(&fs_info->scrubs_paused); - mutex_unlock(&fs_info->scrub_lock); - atomic_dec(&sdev->fixup_cnt); - wake_up(&fs_info->scrub_pause_wait); - wake_up(&sdev->list_wait); -} - -/* - * scrub_handle_errored_block gets called when either verification of the - * pages failed or the bio failed to read, e.g. with EIO. In the latter - * case, this function handles all pages in the bio, even though only one - * may be bad. - * The goal of this function is to repair the errored block by using the - * contents of one of the mirrors. 
- */ -static int scrub_handle_errored_block(struct scrub_block *sblock_to_check) -{ - struct scrub_dev *sdev = sblock_to_check->sdev; - struct btrfs_fs_info *fs_info; - u64 length; - u64 logical; - u64 generation; - unsigned int failed_mirror_index; - unsigned int is_metadata; - unsigned int have_csum; - u8 *csum; - struct scrub_block *sblocks_for_recheck; /* holds one for each mirror */ - struct scrub_block *sblock_bad; - int ret; - int mirror_index; - int page_num; - int success; - static DEFINE_RATELIMIT_STATE(_rs, DEFAULT_RATELIMIT_INTERVAL, - DEFAULT_RATELIMIT_BURST); - - BUG_ON(sblock_to_check->page_count < 1); - fs_info = sdev->dev->dev_root->fs_info; - length = sblock_to_check->page_count * PAGE_SIZE; - logical = sblock_to_check->pagev[0].logical; - generation = sblock_to_check->pagev[0].generation; - BUG_ON(sblock_to_check->pagev[0].mirror_num < 1); - failed_mirror_index = sblock_to_check->pagev[0].mirror_num - 1; - is_metadata = !(sblock_to_check->pagev[0].flags & - BTRFS_EXTENT_FLAG_DATA); - have_csum = sblock_to_check->pagev[0].have_csum; - csum = sblock_to_check->pagev[0].csum; - - /* - * read all mirrors one after the other. This includes to - * re-read the extent or metadata block that failed (that was - * the cause that this fixup code is called) another time, - * page by page this time in order to know which pages - * caused I/O errors and which ones are good (for all mirrors). - * It is the goal to handle the situation when more than one - * mirror contains I/O errors, but the errors do not - * overlap, i.e. the data can be repaired by selecting the - * pages from those mirrors without I/O error on the - * particular pages. One example (with blocks >= 2 * PAGE_SIZE) - * would be that mirror #1 has an I/O error on the first page, - * the second page is good, and mirror #2 has an I/O error on - * the second page, but the first page is good. - * Then the first page of the first mirror can be repaired by - * taking the first page of the second mirror, and the - * second page of the second mirror can be repaired by - * copying the contents of the 2nd page of the 1st mirror. - * One more note: if the pages of one mirror contain I/O - * errors, the checksum cannot be verified. In order to get - * the best data for repairing, the first attempt is to find - * a mirror without I/O errors and with a validated checksum. - * Only if this is not possible, the pages are picked from - * mirrors with I/O errors without considering the checksum. - * If the latter is the case, at the end, the checksum of the - * repaired area is verified in order to correctly maintain - * the statistics. 
- */ - - sblocks_for_recheck = kzalloc(BTRFS_MAX_MIRRORS * - sizeof(*sblocks_for_recheck), - GFP_NOFS); - if (!sblocks_for_recheck) { - spin_lock(&sdev->stat_lock); - sdev->stat.malloc_errors++; - sdev->stat.read_errors++; - sdev->stat.uncorrectable_errors++; - spin_unlock(&sdev->stat_lock); - goto out; - } - - /* setup the context, map the logical blocks and alloc the pages */ - ret = scrub_setup_recheck_block(sdev, &fs_info->mapping_tree, length, - logical, sblocks_for_recheck); - if (ret) { - spin_lock(&sdev->stat_lock); - sdev->stat.read_errors++; - sdev->stat.uncorrectable_errors++; - spin_unlock(&sdev->stat_lock); - goto out; - } - BUG_ON(failed_mirror_index >= BTRFS_MAX_MIRRORS); - sblock_bad = sblocks_for_recheck + failed_mirror_index; - - /* build and submit the bios for the failed mirror, check checksums */ - ret = scrub_recheck_block(fs_info, sblock_bad, is_metadata, have_csum, - csum, generation, sdev->csum_size); - if (ret) { - spin_lock(&sdev->stat_lock); - sdev->stat.read_errors++; - sdev->stat.uncorrectable_errors++; - spin_unlock(&sdev->stat_lock); - goto out; - } - - if (!sblock_bad->header_error && !sblock_bad->checksum_error && - sblock_bad->no_io_error_seen) { - /* - * the error disappeared after reading page by page, or - * the area was part of a huge bio and other parts of the - * bio caused I/O errors, or the block layer merged several - * read requests into one and the error is caused by a - * different bio (usually one of the two latter cases is - * the cause) - */ - spin_lock(&sdev->stat_lock); - sdev->stat.unverified_errors++; - spin_unlock(&sdev->stat_lock); - - goto out; - } - - if (!sblock_bad->no_io_error_seen) { - spin_lock(&sdev->stat_lock); - sdev->stat.read_errors++; - spin_unlock(&sdev->stat_lock); - if (__ratelimit(&_rs)) - scrub_print_warning("i/o error", sblock_to_check); - } else if (sblock_bad->checksum_error) { - spin_lock(&sdev->stat_lock); - sdev->stat.csum_errors++; - spin_unlock(&sdev->stat_lock); - if (__ratelimit(&_rs)) - scrub_print_warning("checksum error", sblock_to_check); - } else if (sblock_bad->header_error) { - spin_lock(&sdev->stat_lock); - sdev->stat.verify_errors++; - spin_unlock(&sdev->stat_lock); - if (__ratelimit(&_rs)) - scrub_print_warning("checksum/header error", - sblock_to_check); - } - - if (sdev->readonly) - goto did_not_correct_error; - - if (!is_metadata && !have_csum) { - struct scrub_fixup_nodatasum *fixup_nodatasum; - - /* - * !is_metadata and !have_csum, this means that the data - * might not be COW'ed, that it might be modified - * concurrently. The general strategy to work on the - * commit root does not help in the case when COW is not - * used. - */ - fixup_nodatasum = kzalloc(sizeof(*fixup_nodatasum), GFP_NOFS); - if (!fixup_nodatasum) - goto did_not_correct_error; - fixup_nodatasum->sdev = sdev; - fixup_nodatasum->logical = logical; - fixup_nodatasum->root = fs_info->extent_root; - fixup_nodatasum->mirror_num = failed_mirror_index + 1; - /* - * increment scrubs_running to prevent cancel requests from - * completing as long as a fixup worker is running. we must also - * increment scrubs_paused to prevent deadlocking on pause - * requests used for transactions commits (as the worker uses a - * transaction context). it is safe to regard the fixup worker - * as paused for all matters practical. effectively, we only - * avoid cancellation requests from completing. 
- */ - mutex_lock(&fs_info->scrub_lock); - atomic_inc(&fs_info->scrubs_running); - atomic_inc(&fs_info->scrubs_paused); - mutex_unlock(&fs_info->scrub_lock); - atomic_inc(&sdev->fixup_cnt); - fixup_nodatasum->work.func = scrub_fixup_nodatasum; - btrfs_queue_worker(&fs_info->scrub_workers, - &fixup_nodatasum->work); - goto out; - } - - /* - * now build and submit the bios for the other mirrors, check - * checksums - */ - for (mirror_index = 0; - mirror_index < BTRFS_MAX_MIRRORS && - sblocks_for_recheck[mirror_index].page_count > 0; - mirror_index++) { - if (mirror_index == failed_mirror_index) - continue; - - /* build and submit the bios, check checksums */ - ret = scrub_recheck_block(fs_info, - sblocks_for_recheck + mirror_index, - is_metadata, have_csum, csum, - generation, sdev->csum_size); - if (ret) - goto did_not_correct_error; - } - - /* - * first try to pick the mirror which is completely without I/O - * errors and also does not have a checksum error. - * If one is found, and if a checksum is present, the full block - * that is known to contain an error is rewritten. Afterwards - * the block is known to be corrected. - * If a mirror is found which is completely correct, and no - * checksum is present, only those pages are rewritten that had - * an I/O error in the block to be repaired, since it cannot be - * determined, which copy of the other pages is better (and it - * could happen otherwise that a correct page would be - * overwritten by a bad one). - */ - for (mirror_index = 0; - mirror_index < BTRFS_MAX_MIRRORS && - sblocks_for_recheck[mirror_index].page_count > 0; - mirror_index++) { - struct scrub_block *sblock_other = sblocks_for_recheck + - mirror_index; - - if (!sblock_other->header_error && - !sblock_other->checksum_error && - sblock_other->no_io_error_seen) { - int force_write = is_metadata || have_csum; - - ret = scrub_repair_block_from_good_copy(sblock_bad, - sblock_other, - force_write); - if (0 == ret) - goto corrected_error; - } - } - - /* - * in case of I/O errors in the area that is supposed to be - * repaired, continue by picking good copies of those pages. - * Select the good pages from mirrors to rewrite bad pages from - * the area to fix. Afterwards verify the checksum of the block - * that is supposed to be repaired. This verification step is - * only done for the purpose of statistic counting and for the - * final scrub report, whether errors remain. - * A perfect algorithm could make use of the checksum and try - * all possible combinations of pages from the different mirrors - * until the checksum verification succeeds. For example, when - * the 2nd page of mirror #1 faces I/O errors, and the 2nd page - * of mirror #2 is readable but the final checksum test fails, - * then the 2nd page of mirror #3 could be tried, whether now - * the final checksum succeedes. But this would be a rare - * exception and is therefore not implemented. At least it is - * avoided that the good copy is overwritten. - * A more useful improvement would be to pick the sectors - * without I/O error based on sector sizes (512 bytes on legacy - * disks) instead of on PAGE_SIZE. Then maybe 512 byte of one - * mirror could be repaired by taking 512 byte of a different - * mirror, even if other 512 byte sectors in the same PAGE_SIZE - * area are unreadable. 
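The comment above describes the fallback repair strategy when no mirror is clean end to end: for every page that had an I/O error in the failed mirror, take the same page from any other mirror that read it without error, and give up only on pages for which no readable copy exists anywhere. The loop that follows implements exactly that. Below is a minimal userspace illustration of the selection step only; the mirror and page counts and the io_error matrix are invented demo values, and the actual rewrite that scrub_repair_page_from_good_copy() performs is left out.

#include <stdio.h>
#include <stdbool.h>

#define MIRRORS 3
#define PAGES   4

/* io_error[m][p] == true means mirror m failed to read page p */
static bool io_error[MIRRORS][PAGES] = {
	{ true,  false, false, false },   /* mirror 0: bad first page  */
	{ false, true,  false, false },   /* mirror 1: bad second page */
	{ false, false, false, false },   /* mirror 2: clean           */
};

/*
 * For each bad page of "bad_mirror", find some other mirror whose copy of
 * that page read without error.  Returns true if every bad page could be
 * sourced from somewhere, mirroring the "success" flag in the scrub code.
 */
static bool plan_repair(int bad_mirror)
{
	bool success = true;

	for (int p = 0; p < PAGES; p++) {
		if (!io_error[bad_mirror][p])
			continue;               /* this page is already fine */

		int src = -1;
		for (int m = 0; m < MIRRORS; m++) {
			if (m != bad_mirror && !io_error[m][p]) {
				src = m;
				break;
			}
		}
		if (src >= 0)
			printf("page %d: copy from mirror %d\n", p, src);
		else
			success = false;        /* no readable copy anywhere */
	}
	return success;
}

int main(void)
{
	printf("repair of mirror 0 is %s\n",
	       plan_repair(0) ? "possible" : "not possible");
	return 0;
}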
- */ - - /* can only fix I/O errors from here on */ - if (sblock_bad->no_io_error_seen) - goto did_not_correct_error; - - success = 1; - for (page_num = 0; page_num < sblock_bad->page_count; page_num++) { - struct scrub_page *page_bad = sblock_bad->pagev + page_num; - - if (!page_bad->io_error) - continue; - - for (mirror_index = 0; - mirror_index < BTRFS_MAX_MIRRORS && - sblocks_for_recheck[mirror_index].page_count > 0; - mirror_index++) { - struct scrub_block *sblock_other = sblocks_for_recheck + - mirror_index; - struct scrub_page *page_other = sblock_other->pagev + - page_num; - - if (!page_other->io_error) { - ret = scrub_repair_page_from_good_copy( - sblock_bad, sblock_other, page_num, 0); - if (0 == ret) { - page_bad->io_error = 0; - break; /* succeeded for this page */ - } - } - } - - if (page_bad->io_error) { - /* did not find a mirror to copy the page from */ - success = 0; - } - } - - if (success) { - if (is_metadata || have_csum) { - /* - * need to verify the checksum now that all - * sectors on disk are repaired (the write - * request for data to be repaired is on its way). - * Just be lazy and use scrub_recheck_block() - * which re-reads the data before the checksum - * is verified, but most likely the data comes out - * of the page cache. - */ - ret = scrub_recheck_block(fs_info, sblock_bad, - is_metadata, have_csum, csum, - generation, sdev->csum_size); - if (!ret && !sblock_bad->header_error && - !sblock_bad->checksum_error && - sblock_bad->no_io_error_seen) - goto corrected_error; - else - goto did_not_correct_error; - } else { -corrected_error: - spin_lock(&sdev->stat_lock); - sdev->stat.corrected_errors++; - spin_unlock(&sdev->stat_lock); - printk_ratelimited(KERN_ERR - "btrfs: fixed up error at logical %llu on dev %s\n", - (unsigned long long)logical, sdev->dev->name); - } - } else { -did_not_correct_error: - spin_lock(&sdev->stat_lock); - sdev->stat.uncorrectable_errors++; - spin_unlock(&sdev->stat_lock); - printk_ratelimited(KERN_ERR - "btrfs: unable to fixup (regular) error at logical %llu on dev %s\n", - (unsigned long long)logical, sdev->dev->name); - } - -out: - if (sblocks_for_recheck) { - for (mirror_index = 0; mirror_index < BTRFS_MAX_MIRRORS; - mirror_index++) { - struct scrub_block *sblock = sblocks_for_recheck + - mirror_index; - int page_index; - - for (page_index = 0; page_index < SCRUB_PAGES_PER_BIO; - page_index++) - if (sblock->pagev[page_index].page) - __free_page( - sblock->pagev[page_index].page); - } - kfree(sblocks_for_recheck); - } - - return 0; -} - -static int scrub_setup_recheck_block(struct scrub_dev *sdev, - struct btrfs_mapping_tree *map_tree, - u64 length, u64 logical, - struct scrub_block *sblocks_for_recheck) -{ - int page_index; - int mirror_index; - int ret; - - /* - * note: the three members sdev, ref_count and outstanding_pages - * are not used (and not set) in the blocks that are used for - * the recheck procedure - */ - - page_index = 0; - while (length > 0) { - u64 sublen = min_t(u64, length, PAGE_SIZE); - u64 mapped_length = sublen; - struct btrfs_bio *bbio = NULL; - - /* - * with a length of PAGE_SIZE, each returned stripe - * represents one mirror - */ - ret = btrfs_map_block(map_tree, WRITE, logical, &mapped_length, - &bbio, 0); - if (ret || !bbio || mapped_length < sublen) { - kfree(bbio); - return -EIO; - } - - BUG_ON(page_index >= SCRUB_PAGES_PER_BIO); - for (mirror_index = 0; mirror_index < (int)bbio->num_stripes; - mirror_index++) { - struct scrub_block *sblock; - struct scrub_page *page; - - if (mirror_index >= 
BTRFS_MAX_MIRRORS) - continue; - - sblock = sblocks_for_recheck + mirror_index; - page = sblock->pagev + page_index; - page->logical = logical; - page->physical = bbio->stripes[mirror_index].physical; - /* for missing devices, bdev is NULL */ - page->bdev = bbio->stripes[mirror_index].dev->bdev; - page->mirror_num = mirror_index + 1; - page->page = alloc_page(GFP_NOFS); - if (!page->page) { - spin_lock(&sdev->stat_lock); - sdev->stat.malloc_errors++; - spin_unlock(&sdev->stat_lock); - return -ENOMEM; - } - sblock->page_count++; - } - kfree(bbio); - length -= sublen; - logical += sublen; - page_index++; - } - - return 0; -} - -/* - * this function will check the on disk data for checksum errors, header - * errors and read I/O errors. If any I/O errors happen, the exact pages - * which are errored are marked as being bad. The goal is to enable scrub - * to take those pages that are not errored from all the mirrors so that - * the pages that are errored in the just handled mirror can be repaired. - */ -static int scrub_recheck_block(struct btrfs_fs_info *fs_info, - struct scrub_block *sblock, int is_metadata, - int have_csum, u8 *csum, u64 generation, - u16 csum_size) -{ - int page_num; - - sblock->no_io_error_seen = 1; - sblock->header_error = 0; - sblock->checksum_error = 0; - - for (page_num = 0; page_num < sblock->page_count; page_num++) { - struct bio *bio; - int ret; - struct scrub_page *page = sblock->pagev + page_num; - DECLARE_COMPLETION_ONSTACK(complete); - - if (page->bdev == NULL) { - page->io_error = 1; - sblock->no_io_error_seen = 0; - continue; - } - - BUG_ON(!page->page); - bio = bio_alloc(GFP_NOFS, 1); - if (!bio) - return -EIO; - bio->bi_bdev = page->bdev; - bio->bi_sector = page->physical >> 9; - bio->bi_end_io = scrub_complete_bio_end_io; - bio->bi_private = &complete; - - ret = bio_add_page(bio, page->page, PAGE_SIZE, 0); - if (PAGE_SIZE != ret) { - bio_put(bio); - return -EIO; - } - btrfsic_submit_bio(READ, bio); - - /* this will also unplug the queue */ - wait_for_completion(&complete); - - page->io_error = !test_bit(BIO_UPTODATE, &bio->bi_flags); - if (!test_bit(BIO_UPTODATE, &bio->bi_flags)) - sblock->no_io_error_seen = 0; - bio_put(bio); - } - - if (sblock->no_io_error_seen) - scrub_recheck_block_checksum(fs_info, sblock, is_metadata, - have_csum, csum, generation, - csum_size); - - return 0; -} - -static void scrub_recheck_block_checksum(struct btrfs_fs_info *fs_info, - struct scrub_block *sblock, - int is_metadata, int have_csum, - const u8 *csum, u64 generation, - u16 csum_size) -{ - int page_num; - u8 calculated_csum[BTRFS_CSUM_SIZE]; - u32 crc = ~(u32)0; - struct btrfs_root *root = fs_info->extent_root; - void *mapped_buffer; - - BUG_ON(!sblock->pagev[0].page); - if (is_metadata) { - struct btrfs_header *h; - - mapped_buffer = kmap_atomic(sblock->pagev[0].page); - h = (struct btrfs_header *)mapped_buffer; - - if (sblock->pagev[0].logical != le64_to_cpu(h->bytenr) || - generation != le64_to_cpu(h->generation) || - memcmp(h->fsid, fs_info->fsid, BTRFS_UUID_SIZE) || - memcmp(h->chunk_tree_uuid, fs_info->chunk_tree_uuid, - BTRFS_UUID_SIZE)) - sblock->header_error = 1; - csum = h->csum; - } else { - if (!have_csum) - return; - - mapped_buffer = kmap_atomic(sblock->pagev[0].page); - } - - for (page_num = 0;;) { - if (page_num == 0 && is_metadata) - crc = btrfs_csum_data(root, - ((u8 *)mapped_buffer) + BTRFS_CSUM_SIZE, - crc, PAGE_SIZE - BTRFS_CSUM_SIZE); - else - crc = btrfs_csum_data(root, mapped_buffer, crc, - PAGE_SIZE); - - kunmap_atomic(mapped_buffer); - 
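scrub_recheck_block() above deliberately re-reads the block one page per synchronous bio, so a failure can be attributed to the exact page instead of failing the whole multi-page submission the way the original scrub bio would. A rough userspace analogue of that per-page attribution, using pread() on an ordinary file; the path and page count are arbitrary demo values and no repair is attempted.

#include <stdio.h>
#include <fcntl.h>
#include <unistd.h>

#define DEMO_PAGE  4096
#define DEMO_PAGES 8

int main(int argc, char **argv)
{
	const char *path = argc > 1 ? argv[1] : "/etc/hostname";
	char buf[DEMO_PAGE];
	int io_error[DEMO_PAGES] = { 0 };   /* one flag per page, like scrub_page */

	int fd = open(path, O_RDONLY);
	if (fd < 0) {
		perror("open");
		return 1;
	}

	/* one small read per page, so an error is pinned to exactly one page */
	for (int i = 0; i < DEMO_PAGES; i++) {
		if (pread(fd, buf, DEMO_PAGE, (off_t)i * DEMO_PAGE) < 0) {
			io_error[i] = 1;
			printf("page %d: read error\n", i);
		}
	}

	int bad = 0;
	for (int i = 0; i < DEMO_PAGES; i++)
		bad += io_error[i];
	printf("%s: %d of %d demo pages failed to read\n", path, bad, DEMO_PAGES);

	close(fd);
	return 0;
}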
page_num++; - if (page_num >= sblock->page_count) - break; - BUG_ON(!sblock->pagev[page_num].page); - - mapped_buffer = kmap_atomic(sblock->pagev[page_num].page); - } - - btrfs_csum_final(crc, calculated_csum); - if (memcmp(calculated_csum, csum, csum_size)) - sblock->checksum_error = 1; -} - -static void scrub_complete_bio_end_io(struct bio *bio, int err) -{ - complete((struct completion *)bio->bi_private); -} - -static int scrub_repair_block_from_good_copy(struct scrub_block *sblock_bad, - struct scrub_block *sblock_good, - int force_write) -{ - int page_num; - int ret = 0; - - for (page_num = 0; page_num < sblock_bad->page_count; page_num++) { - int ret_sub; - - ret_sub = scrub_repair_page_from_good_copy(sblock_bad, - sblock_good, - page_num, - force_write); - if (ret_sub) - ret = ret_sub; - } - - return ret; -} - -static int scrub_repair_page_from_good_copy(struct scrub_block *sblock_bad, - struct scrub_block *sblock_good, - int page_num, int force_write) -{ - struct scrub_page *page_bad = sblock_bad->pagev + page_num; - struct scrub_page *page_good = sblock_good->pagev + page_num; - - BUG_ON(sblock_bad->pagev[page_num].page == NULL); - BUG_ON(sblock_good->pagev[page_num].page == NULL); - if (force_write || sblock_bad->header_error || - sblock_bad->checksum_error || page_bad->io_error) { - struct bio *bio; - int ret; - DECLARE_COMPLETION_ONSTACK(complete); - - bio = bio_alloc(GFP_NOFS, 1); - if (!bio) - return -EIO; - bio->bi_bdev = page_bad->bdev; - bio->bi_sector = page_bad->physical >> 9; - bio->bi_end_io = scrub_complete_bio_end_io; - bio->bi_private = &complete; - - ret = bio_add_page(bio, page_good->page, PAGE_SIZE, 0); - if (PAGE_SIZE != ret) { - bio_put(bio); - return -EIO; - } - btrfsic_submit_bio(WRITE, bio); - - /* this will also unplug the queue */ - wait_for_completion(&complete); - bio_put(bio); - } - - return 0; -} - -static void scrub_checksum(struct scrub_block *sblock) -{ - u64 flags; - int ret; - - BUG_ON(sblock->page_count < 1); - flags = sblock->pagev[0].flags; - ret = 0; - if (flags & BTRFS_EXTENT_FLAG_DATA) - ret = scrub_checksum_data(sblock); - else if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) - ret = scrub_checksum_tree_block(sblock); - else if (flags & BTRFS_EXTENT_FLAG_SUPER) - (void)scrub_checksum_super(sblock); - else - WARN_ON(1); - if (ret) - scrub_handle_errored_block(sblock); -} - -static int scrub_checksum_data(struct scrub_block *sblock) -{ - struct scrub_dev *sdev = sblock->sdev; - u8 csum[BTRFS_CSUM_SIZE]; - u8 *on_disk_csum; - struct page *page; - void *buffer; - u32 crc = ~(u32)0; - int fail = 0; - struct btrfs_root *root = sdev->dev->dev_root; - u64 len; - int index; - - BUG_ON(sblock->page_count < 1); - if (!sblock->pagev[0].have_csum) - return 0; - - on_disk_csum = sblock->pagev[0].csum; - page = sblock->pagev[0].page; - buffer = kmap_atomic(page); - - len = sdev->sectorsize; - index = 0; - for (;;) { - u64 l = min_t(u64, len, PAGE_SIZE); - - crc = btrfs_csum_data(root, buffer, crc, l); - kunmap_atomic(buffer); - len -= l; - if (len == 0) - break; - index++; - BUG_ON(index >= sblock->page_count); - BUG_ON(!sblock->pagev[index].page); - page = sblock->pagev[index].page; - buffer = kmap_atomic(page); - } - - btrfs_csum_final(crc, csum); - if (memcmp(csum, on_disk_csum, sdev->csum_size)) - fail = 1; - - return fail; -} - -static int scrub_checksum_tree_block(struct scrub_block *sblock) -{ - struct scrub_dev *sdev = sblock->sdev; - struct btrfs_header *h; - struct btrfs_root *root = sdev->dev->dev_root; - struct btrfs_fs_info *fs_info = root->fs_info; 
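scrub_recheck_block_checksum() and scrub_checksum_data() above both accumulate the checksum across the block one mapped page at a time, starting from ~0 and letting btrfs_csum_final() store the inverted result. Here is a small standalone sketch of that chunked accumulation; it uses a plain bit-by-bit CRC32C helper in place of the kernel's btrfs_csum_data()/libcrc32c, and a deliberately tiny stand-in for PAGE_SIZE.

#include <stdio.h>
#include <stdint.h>
#include <string.h>

/* bit-by-bit CRC32C (Castagnoli), reflected polynomial 0x82F63B78 */
static uint32_t crc32c(uint32_t crc, const void *buf, size_t len)
{
	const uint8_t *p = buf;

	while (len--) {
		crc ^= *p++;
		for (int i = 0; i < 8; i++)
			crc = (crc >> 1) ^ (0x82F63B78u & -(crc & 1u));
	}
	return crc;
}

#define DEMO_PAGE_SIZE 16   /* tiny stand-in for PAGE_SIZE */

int main(void)
{
	const char data[] = "some metadata or file data spanning pages";
	size_t len = strlen(data);
	uint32_t crc = ~0u;                 /* same seed the scrub code uses */

	/* feed the buffer to the CRC one "page" at a time */
	for (size_t off = 0; off < len; off += DEMO_PAGE_SIZE) {
		size_t l = len - off < DEMO_PAGE_SIZE ? len - off : DEMO_PAGE_SIZE;
		crc = crc32c(crc, data + off, l);
	}

	/* the final inversion corresponds to btrfs_csum_final() */
	printf("crc32c = 0x%08x\n", ~crc);
	return 0;
}

Accumulating chunk by chunk yields the same value as one pass over the whole buffer, which is why the kmap/kunmap-per-page loops above can feed the CRC piecewise.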
- u8 calculated_csum[BTRFS_CSUM_SIZE]; - u8 on_disk_csum[BTRFS_CSUM_SIZE]; - struct page *page; - void *mapped_buffer; - u64 mapped_size; - void *p; - u32 crc = ~(u32)0; - int fail = 0; - int crc_fail = 0; - u64 len; - int index; - - BUG_ON(sblock->page_count < 1); - page = sblock->pagev[0].page; - mapped_buffer = kmap_atomic(page); - h = (struct btrfs_header *)mapped_buffer; - memcpy(on_disk_csum, h->csum, sdev->csum_size); - - /* - * we don't use the getter functions here, as we - * a) don't have an extent buffer and - * b) the page is already kmapped - */ - - if (sblock->pagev[0].logical != le64_to_cpu(h->bytenr)) - ++fail; - - if (sblock->pagev[0].generation != le64_to_cpu(h->generation)) - ++fail; - - if (memcmp(h->fsid, fs_info->fsid, BTRFS_UUID_SIZE)) - ++fail; - - if (memcmp(h->chunk_tree_uuid, fs_info->chunk_tree_uuid, - BTRFS_UUID_SIZE)) - ++fail; - - BUG_ON(sdev->nodesize != sdev->leafsize); - len = sdev->nodesize - BTRFS_CSUM_SIZE; - mapped_size = PAGE_SIZE - BTRFS_CSUM_SIZE; - p = ((u8 *)mapped_buffer) + BTRFS_CSUM_SIZE; - index = 0; - for (;;) { - u64 l = min_t(u64, len, mapped_size); - - crc = btrfs_csum_data(root, p, crc, l); - kunmap_atomic(mapped_buffer); - len -= l; - if (len == 0) - break; - index++; - BUG_ON(index >= sblock->page_count); - BUG_ON(!sblock->pagev[index].page); - page = sblock->pagev[index].page; - mapped_buffer = kmap_atomic(page); - mapped_size = PAGE_SIZE; - p = mapped_buffer; - } - - btrfs_csum_final(crc, calculated_csum); - if (memcmp(calculated_csum, on_disk_csum, sdev->csum_size)) - ++crc_fail; - - return fail || crc_fail; -} - -static int scrub_checksum_super(struct scrub_block *sblock) -{ - struct btrfs_super_block *s; - struct scrub_dev *sdev = sblock->sdev; - struct btrfs_root *root = sdev->dev->dev_root; - struct btrfs_fs_info *fs_info = root->fs_info; - u8 calculated_csum[BTRFS_CSUM_SIZE]; - u8 on_disk_csum[BTRFS_CSUM_SIZE]; - struct page *page; - void *mapped_buffer; - u64 mapped_size; - void *p; - u32 crc = ~(u32)0; - int fail = 0; - u64 len; - int index; - - BUG_ON(sblock->page_count < 1); - page = sblock->pagev[0].page; - mapped_buffer = kmap_atomic(page); - s = (struct btrfs_super_block *)mapped_buffer; - memcpy(on_disk_csum, s->csum, sdev->csum_size); - - if (sblock->pagev[0].logical != le64_to_cpu(s->bytenr)) - ++fail; - - if (sblock->pagev[0].generation != le64_to_cpu(s->generation)) - ++fail; - - if (memcmp(s->fsid, fs_info->fsid, BTRFS_UUID_SIZE)) - ++fail; - - len = BTRFS_SUPER_INFO_SIZE - BTRFS_CSUM_SIZE; - mapped_size = PAGE_SIZE - BTRFS_CSUM_SIZE; - p = ((u8 *)mapped_buffer) + BTRFS_CSUM_SIZE; - index = 0; - for (;;) { - u64 l = min_t(u64, len, mapped_size); - - crc = btrfs_csum_data(root, p, crc, l); - kunmap_atomic(mapped_buffer); - len -= l; - if (len == 0) - break; - index++; - BUG_ON(index >= sblock->page_count); - BUG_ON(!sblock->pagev[index].page); - page = sblock->pagev[index].page; - mapped_buffer = kmap_atomic(page); - mapped_size = PAGE_SIZE; - p = mapped_buffer; - } - - btrfs_csum_final(crc, calculated_csum); - if (memcmp(calculated_csum, on_disk_csum, sdev->csum_size)) - ++fail; - - if (fail) { - /* - * if we find an error in a super block, we just report it. 
- * They will get written with the next transaction commit - * anyway - */ - spin_lock(&sdev->stat_lock); - ++sdev->stat.super_errors; - spin_unlock(&sdev->stat_lock); - } - - return fail; -} - -static void scrub_block_get(struct scrub_block *sblock) -{ - atomic_inc(&sblock->ref_count); -} - -static void scrub_block_put(struct scrub_block *sblock) -{ - if (atomic_dec_and_test(&sblock->ref_count)) { - int i; - - for (i = 0; i < sblock->page_count; i++) - if (sblock->pagev[i].page) - __free_page(sblock->pagev[i].page); - kfree(sblock); - } -} - -static void scrub_submit(struct scrub_dev *sdev) -{ - struct scrub_bio *sbio; - - if (sdev->curr == -1) - return; - - sbio = sdev->bios[sdev->curr]; - sdev->curr = -1; - atomic_inc(&sdev->in_flight); - - btrfsic_submit_bio(READ, sbio->bio); -} - -static int scrub_add_page_to_bio(struct scrub_dev *sdev, - struct scrub_page *spage) -{ - struct scrub_block *sblock = spage->sblock; - struct scrub_bio *sbio; - int ret; - -again: - /* - * grab a fresh bio or wait for one to become available - */ - while (sdev->curr == -1) { - spin_lock(&sdev->list_lock); - sdev->curr = sdev->first_free; - if (sdev->curr != -1) { - sdev->first_free = sdev->bios[sdev->curr]->next_free; - sdev->bios[sdev->curr]->next_free = -1; - sdev->bios[sdev->curr]->page_count = 0; - spin_unlock(&sdev->list_lock); - } else { - spin_unlock(&sdev->list_lock); - wait_event(sdev->list_wait, sdev->first_free != -1); - } - } - sbio = sdev->bios[sdev->curr]; - if (sbio->page_count == 0) { - struct bio *bio; - - sbio->physical = spage->physical; - sbio->logical = spage->logical; - bio = sbio->bio; - if (!bio) { - bio = bio_alloc(GFP_NOFS, sdev->pages_per_bio); - if (!bio) - return -ENOMEM; - sbio->bio = bio; - } - - bio->bi_private = sbio; - bio->bi_end_io = scrub_bio_end_io; - bio->bi_bdev = sdev->dev->bdev; - bio->bi_sector = spage->physical >> 9; - sbio->err = 0; - } else if (sbio->physical + sbio->page_count * PAGE_SIZE != - spage->physical || - sbio->logical + sbio->page_count * PAGE_SIZE != - spage->logical) { - scrub_submit(sdev); - goto again; - } - - sbio->pagev[sbio->page_count] = spage; - ret = bio_add_page(sbio->bio, spage->page, PAGE_SIZE, 0); - if (ret != PAGE_SIZE) { - if (sbio->page_count < 1) { - bio_put(sbio->bio); - sbio->bio = NULL; - return -EIO; - } - scrub_submit(sdev); - goto again; - } - - scrub_block_get(sblock); /* one for the added page */ - atomic_inc(&sblock->outstanding_pages); - sbio->page_count++; - if (sbio->page_count == sdev->pages_per_bio) - scrub_submit(sdev); - - return 0; -} - -static int scrub_pages(struct scrub_dev *sdev, u64 logical, u64 len, - u64 physical, u64 flags, u64 gen, int mirror_num, - u8 *csum, int force) -{ - struct scrub_block *sblock; - int index; - - sblock = kzalloc(sizeof(*sblock), GFP_NOFS); - if (!sblock) { - spin_lock(&sdev->stat_lock); - sdev->stat.malloc_errors++; - spin_unlock(&sdev->stat_lock); - return -ENOMEM; - } - - /* one ref inside this function, plus one for each page later on */ - atomic_set(&sblock->ref_count, 1); - sblock->sdev = sdev; - sblock->no_io_error_seen = 1; - - for (index = 0; len > 0; index++) { - struct scrub_page *spage = sblock->pagev + index; - u64 l = min_t(u64, len, PAGE_SIZE); - - BUG_ON(index >= SCRUB_MAX_PAGES_PER_BLOCK); - spage->page = alloc_page(GFP_NOFS); - if (!spage->page) { - spin_lock(&sdev->stat_lock); - sdev->stat.malloc_errors++; - spin_unlock(&sdev->stat_lock); - while (index > 0) { - index--; - __free_page(sblock->pagev[index].page); - } - kfree(sblock); - return -ENOMEM; - } - 
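scrub_add_page_to_bio() above keeps appending pages to the currently open bio only while both the physical and the logical address continue exactly where the previous page ended; any discontinuity, or a full bio, forces a submit and a fresh bio from the free list. The sketch below reproduces just that batching decision on plain (logical, physical) records; run_add/run_flush and the constants are invented for the example, and the locking, wait queue and bio plumbing are not modeled.

#include <stdio.h>
#include <stdint.h>

#define DEMO_PAGE   4096ULL
#define MAX_PER_RUN 4         /* stands in for pages_per_bio */

struct run {
	uint64_t logical, physical;   /* start of the current batch */
	int count;                    /* pages batched so far */
};

static void run_flush(struct run *r)
{
	if (r->count)
		printf("submit: logical %llu, physical %llu, %d page(s)\n",
		       (unsigned long long)r->logical,
		       (unsigned long long)r->physical, r->count);
	r->count = 0;
}

static void run_add(struct run *r, uint64_t logical, uint64_t physical)
{
	/* break the batch on any physical or logical discontinuity */
	if (r->count &&
	    (r->logical + r->count * DEMO_PAGE != logical ||
	     r->physical + r->count * DEMO_PAGE != physical))
		run_flush(r);

	if (r->count == 0) {
		r->logical = logical;
		r->physical = physical;
	}
	r->count++;
	if (r->count == MAX_PER_RUN)
		run_flush(r);
}

int main(void)
{
	struct run r = { 0 };

	run_add(&r, 0, 1000 * DEMO_PAGE);
	run_add(&r, DEMO_PAGE, 1001 * DEMO_PAGE);        /* contiguous, batched */
	run_add(&r, 10 * DEMO_PAGE, 2000 * DEMO_PAGE);   /* jump: flushes first */
	run_flush(&r);                                   /* push the tail */
	return 0;
}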
spage->sblock = sblock; - spage->bdev = sdev->dev->bdev; - spage->flags = flags; - spage->generation = gen; - spage->logical = logical; - spage->physical = physical; - spage->mirror_num = mirror_num; - if (csum) { - spage->have_csum = 1; - memcpy(spage->csum, csum, sdev->csum_size); - } else { - spage->have_csum = 0; - } - sblock->page_count++; - len -= l; - logical += l; - physical += l; - } - - BUG_ON(sblock->page_count == 0); - for (index = 0; index < sblock->page_count; index++) { - struct scrub_page *spage = sblock->pagev + index; - int ret; - - ret = scrub_add_page_to_bio(sdev, spage); - if (ret) { - scrub_block_put(sblock); - return ret; - } - } - - if (force) - scrub_submit(sdev); - - /* last one frees, either here or in bio completion for last page */ - scrub_block_put(sblock); - return 0; -} - -static void scrub_bio_end_io(struct bio *bio, int err) -{ - struct scrub_bio *sbio = bio->bi_private; - struct scrub_dev *sdev = sbio->sdev; - struct btrfs_fs_info *fs_info = sdev->dev->dev_root->fs_info; - - sbio->err = err; - sbio->bio = bio; - - btrfs_queue_worker(&fs_info->scrub_workers, &sbio->work); -} - -static void scrub_bio_end_io_worker(struct btrfs_work *work) -{ - struct scrub_bio *sbio = container_of(work, struct scrub_bio, work); - struct scrub_dev *sdev = sbio->sdev; - int i; - - BUG_ON(sbio->page_count > SCRUB_PAGES_PER_BIO); - if (sbio->err) { - for (i = 0; i < sbio->page_count; i++) { - struct scrub_page *spage = sbio->pagev[i]; - - spage->io_error = 1; - spage->sblock->no_io_error_seen = 0; - } - } - - /* now complete the scrub_block items that have all pages completed */ - for (i = 0; i < sbio->page_count; i++) { - struct scrub_page *spage = sbio->pagev[i]; - struct scrub_block *sblock = spage->sblock; - - if (atomic_dec_and_test(&sblock->outstanding_pages)) - scrub_block_complete(sblock); - scrub_block_put(sblock); - } - - if (sbio->err) { - /* what is this good for??? 
*/ - sbio->bio->bi_flags &= ~(BIO_POOL_MASK - 1); - sbio->bio->bi_flags |= 1 << BIO_UPTODATE; - sbio->bio->bi_phys_segments = 0; - sbio->bio->bi_idx = 0; - - for (i = 0; i < sbio->page_count; i++) { - struct bio_vec *bi; - bi = &sbio->bio->bi_io_vec[i]; - bi->bv_offset = 0; - bi->bv_len = PAGE_SIZE; - } - } - - bio_put(sbio->bio); - sbio->bio = NULL; - spin_lock(&sdev->list_lock); - sbio->next_free = sdev->first_free; - sdev->first_free = sbio->index; - spin_unlock(&sdev->list_lock); - atomic_dec(&sdev->in_flight); - wake_up(&sdev->list_wait); -} - -static void scrub_block_complete(struct scrub_block *sblock) -{ - if (!sblock->no_io_error_seen) - scrub_handle_errored_block(sblock); - else - scrub_checksum(sblock); -} - -static int scrub_find_csum(struct scrub_dev *sdev, u64 logical, u64 len, - u8 *csum) -{ - struct btrfs_ordered_sum *sum = NULL; - int ret = 0; - unsigned long i; - unsigned long num_sectors; - - while (!list_empty(&sdev->csum_list)) { - sum = list_first_entry(&sdev->csum_list, - struct btrfs_ordered_sum, list); - if (sum->bytenr > logical) - return 0; - if (sum->bytenr + sum->len > logical) - break; - - ++sdev->stat.csum_discards; - list_del(&sum->list); - kfree(sum); - sum = NULL; - } - if (!sum) - return 0; - - num_sectors = sum->len / sdev->sectorsize; - for (i = 0; i < num_sectors; ++i) { - if (sum->sums[i].bytenr == logical) { - memcpy(csum, &sum->sums[i].sum, sdev->csum_size); - ret = 1; - break; - } - } - if (ret && i == num_sectors - 1) { - list_del(&sum->list); - kfree(sum); - } - return ret; -} - -/* scrub extent tries to collect up to 64 kB for each bio */ -static int scrub_extent(struct scrub_dev *sdev, u64 logical, u64 len, - u64 physical, u64 flags, u64 gen, int mirror_num) -{ - int ret; - u8 csum[BTRFS_CSUM_SIZE]; - u32 blocksize; - - if (flags & BTRFS_EXTENT_FLAG_DATA) { - blocksize = sdev->sectorsize; - spin_lock(&sdev->stat_lock); - sdev->stat.data_extents_scrubbed++; - sdev->stat.data_bytes_scrubbed += len; - spin_unlock(&sdev->stat_lock); - } else if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) { - BUG_ON(sdev->nodesize != sdev->leafsize); - blocksize = sdev->nodesize; - spin_lock(&sdev->stat_lock); - sdev->stat.tree_extents_scrubbed++; - sdev->stat.tree_bytes_scrubbed += len; - spin_unlock(&sdev->stat_lock); - } else { - blocksize = sdev->sectorsize; - BUG_ON(1); - } - - while (len) { - u64 l = min_t(u64, len, blocksize); - int have_csum = 0; - - if (flags & BTRFS_EXTENT_FLAG_DATA) { - /* push csums to sbio */ - have_csum = scrub_find_csum(sdev, logical, l, csum); - if (have_csum == 0) - ++sdev->stat.no_csum; - } - ret = scrub_pages(sdev, logical, l, physical, flags, gen, - mirror_num, have_csum ? 
csum : NULL, 0); - if (ret) - return ret; - len -= l; - logical += l; - physical += l; - } - return 0; -} - -static noinline_for_stack int scrub_stripe(struct scrub_dev *sdev, - struct map_lookup *map, int num, u64 base, u64 length) -{ - struct btrfs_path *path; - struct btrfs_fs_info *fs_info = sdev->dev->dev_root->fs_info; - struct btrfs_root *root = fs_info->extent_root; - struct btrfs_root *csum_root = fs_info->csum_root; - struct btrfs_extent_item *extent; - struct blk_plug plug; - u64 flags; - int ret; - int slot; - int i; - u64 nstripes; - struct extent_buffer *l; - struct btrfs_key key; - u64 physical; - u64 logical; - u64 generation; - int mirror_num; - struct reada_control *reada1; - struct reada_control *reada2; - struct btrfs_key key_start; - struct btrfs_key key_end; - - u64 increment = map->stripe_len; - u64 offset; - - nstripes = length; - offset = 0; - do_div(nstripes, map->stripe_len); - if (map->type & BTRFS_BLOCK_GROUP_RAID0) { - offset = map->stripe_len * num; - increment = map->stripe_len * map->num_stripes; - mirror_num = 1; - } else if (map->type & BTRFS_BLOCK_GROUP_RAID10) { - int factor = map->num_stripes / map->sub_stripes; - offset = map->stripe_len * (num / map->sub_stripes); - increment = map->stripe_len * factor; - mirror_num = num % map->sub_stripes + 1; - } else if (map->type & BTRFS_BLOCK_GROUP_RAID1) { - increment = map->stripe_len; - mirror_num = num % map->num_stripes + 1; - } else if (map->type & BTRFS_BLOCK_GROUP_DUP) { - increment = map->stripe_len; - mirror_num = num % map->num_stripes + 1; - } else { - increment = map->stripe_len; - mirror_num = 1; - } - - path = btrfs_alloc_path(); - if (!path) - return -ENOMEM; - - /* - * work on commit root. The related disk blocks are static as - * long as COW is applied. This means, it is save to rewrite - * them to repair disk errors without any race conditions - */ - path->search_commit_root = 1; - path->skip_locking = 1; - - /* - * trigger the readahead for extent tree csum tree and wait for - * completion. During readahead, the scrub is officially paused - * to not hold off transaction commits - */ - logical = base + offset; - - wait_event(sdev->list_wait, - atomic_read(&sdev->in_flight) == 0); - atomic_inc(&fs_info->scrubs_paused); - wake_up(&fs_info->scrub_pause_wait); - - /* FIXME it might be better to start readahead at commit root */ - key_start.objectid = logical; - key_start.type = BTRFS_EXTENT_ITEM_KEY; - key_start.offset = (u64)0; - key_end.objectid = base + offset + nstripes * increment; - key_end.type = BTRFS_EXTENT_ITEM_KEY; - key_end.offset = (u64)0; - reada1 = btrfs_reada_add(root, &key_start, &key_end); - - key_start.objectid = BTRFS_EXTENT_CSUM_OBJECTID; - key_start.type = BTRFS_EXTENT_CSUM_KEY; - key_start.offset = logical; - key_end.objectid = BTRFS_EXTENT_CSUM_OBJECTID; - key_end.type = BTRFS_EXTENT_CSUM_KEY; - key_end.offset = base + offset + nstripes * increment; - reada2 = btrfs_reada_add(csum_root, &key_start, &key_end); - - if (!IS_ERR(reada1)) - btrfs_reada_wait(reada1); - if (!IS_ERR(reada2)) - btrfs_reada_wait(reada2); - - mutex_lock(&fs_info->scrub_lock); - while (atomic_read(&fs_info->scrub_pause_req)) { - mutex_unlock(&fs_info->scrub_lock); - wait_event(fs_info->scrub_pause_wait, - atomic_read(&fs_info->scrub_pause_req) == 0); - mutex_lock(&fs_info->scrub_lock); - } - atomic_dec(&fs_info->scrubs_paused); - mutex_unlock(&fs_info->scrub_lock); - wake_up(&fs_info->scrub_pause_wait); - - /* - * collect all data csums for the stripe to avoid seeking during - * the scrub. 
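The block at the top of scrub_stripe() above boils each RAID profile down to three numbers: this device's starting offset into the chunk, the logical increment between consecutive stripes it owns, and the mirror_num to report. A compact sketch of that mapping for the RAID0 and RAID10 cases follows; the struct and values are illustrative, not the kernel's map_lookup, and RAID1/DUP simply keep the increment equal to stripe_len as in the code above.

#include <stdio.h>
#include <stdint.h>

/* just enough of the chunk geometry for the demo */
struct demo_map {
	uint64_t stripe_len;
	int num_stripes;
	int sub_stripes;     /* 2 for RAID10, ignored otherwise */
	int raid10;          /* 0 = RAID0, 1 = RAID10 */
};

static void stripe_geometry(const struct demo_map *map, int num,
			    uint64_t *offset, uint64_t *increment,
			    int *mirror_num)
{
	if (map->raid10) {
		int factor = map->num_stripes / map->sub_stripes;

		*offset = map->stripe_len * (num / map->sub_stripes);
		*increment = map->stripe_len * factor;
		*mirror_num = num % map->sub_stripes + 1;
	} else {
		*offset = map->stripe_len * num;
		*increment = map->stripe_len * map->num_stripes;
		*mirror_num = 1;
	}
}

int main(void)
{
	struct demo_map raid10 = { .stripe_len = 65536, .num_stripes = 4,
				   .sub_stripes = 2, .raid10 = 1 };
	uint64_t offset, increment;
	int mirror;

	for (int num = 0; num < raid10.num_stripes; num++) {
		stripe_geometry(&raid10, num, &offset, &increment, &mirror);
		printf("device slot %d: offset %llu, increment %llu, mirror %d\n",
		       num, (unsigned long long)offset,
		       (unsigned long long)increment, mirror);
	}
	return 0;
}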
This might currently (crc32) end up to be about 1MB - */ - blk_start_plug(&plug); - - /* - * now find all extents for each stripe and scrub them - */ - logical = base + offset; - physical = map->stripes[num].physical; - ret = 0; - for (i = 0; i < nstripes; ++i) { - /* - * canceled? - */ - if (atomic_read(&fs_info->scrub_cancel_req) || - atomic_read(&sdev->cancel_req)) { - ret = -ECANCELED; - goto out; - } - /* - * check to see if we have to pause - */ - if (atomic_read(&fs_info->scrub_pause_req)) { - /* push queued extents */ - scrub_submit(sdev); - wait_event(sdev->list_wait, - atomic_read(&sdev->in_flight) == 0); - atomic_inc(&fs_info->scrubs_paused); - wake_up(&fs_info->scrub_pause_wait); - mutex_lock(&fs_info->scrub_lock); - while (atomic_read(&fs_info->scrub_pause_req)) { - mutex_unlock(&fs_info->scrub_lock); - wait_event(fs_info->scrub_pause_wait, - atomic_read(&fs_info->scrub_pause_req) == 0); - mutex_lock(&fs_info->scrub_lock); - } - atomic_dec(&fs_info->scrubs_paused); - mutex_unlock(&fs_info->scrub_lock); - wake_up(&fs_info->scrub_pause_wait); - } - - ret = btrfs_lookup_csums_range(csum_root, logical, - logical + map->stripe_len - 1, - &sdev->csum_list, 1); - if (ret) - goto out; - - key.objectid = logical; - key.type = BTRFS_EXTENT_ITEM_KEY; - key.offset = (u64)0; - - ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); - if (ret < 0) - goto out; - if (ret > 0) { - ret = btrfs_previous_item(root, path, 0, - BTRFS_EXTENT_ITEM_KEY); - if (ret < 0) - goto out; - if (ret > 0) { - /* there's no smaller item, so stick with the - * larger one */ - btrfs_release_path(path); - ret = btrfs_search_slot(NULL, root, &key, - path, 0, 0); - if (ret < 0) - goto out; - } - } - - while (1) { - l = path->nodes[0]; - slot = path->slots[0]; - if (slot >= btrfs_header_nritems(l)) { - ret = btrfs_next_leaf(root, path); - if (ret == 0) - continue; - if (ret < 0) - goto out; - - break; - } - btrfs_item_key_to_cpu(l, &key, slot); - - if (key.objectid + key.offset <= logical) - goto next; - - if (key.objectid >= logical + map->stripe_len) - break; - - if (btrfs_key_type(&key) != BTRFS_EXTENT_ITEM_KEY) - goto next; - - extent = btrfs_item_ptr(l, slot, - struct btrfs_extent_item); - flags = btrfs_extent_flags(l, extent); - generation = btrfs_extent_generation(l, extent); - - if (key.objectid < logical && - (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK)) { - printk(KERN_ERR - "btrfs scrub: tree block %llu spanning " - "stripes, ignored. logical=%llu\n", - (unsigned long long)key.objectid, - (unsigned long long)logical); - goto next; - } - - /* - * trim extent to this stripe - */ - if (key.objectid < logical) { - key.offset -= logical - key.objectid; - key.objectid = logical; - } - if (key.objectid + key.offset > - logical + map->stripe_len) { - key.offset = logical + map->stripe_len - - key.objectid; - } - - ret = scrub_extent(sdev, key.objectid, key.offset, - key.objectid - logical + physical, - flags, generation, mirror_num); - if (ret) - goto out; - -next: - path->slots[0]++; - } - btrfs_release_path(path); - logical += increment; - physical += map->stripe_len; - spin_lock(&sdev->stat_lock); - sdev->stat.last_physical = physical; - spin_unlock(&sdev->stat_lock); - } - /* push queued extents */ - scrub_submit(sdev); - -out: - blk_finish_plug(&plug); - btrfs_free_path(path); - return ret < 0 ? 
ret : 0; -} - -static noinline_for_stack int scrub_chunk(struct scrub_dev *sdev, - u64 chunk_tree, u64 chunk_objectid, u64 chunk_offset, u64 length, - u64 dev_offset) -{ - struct btrfs_mapping_tree *map_tree = - &sdev->dev->dev_root->fs_info->mapping_tree; - struct map_lookup *map; - struct extent_map *em; - int i; - int ret = -EINVAL; - - read_lock(&map_tree->map_tree.lock); - em = lookup_extent_mapping(&map_tree->map_tree, chunk_offset, 1); - read_unlock(&map_tree->map_tree.lock); - - if (!em) - return -EINVAL; - - map = (struct map_lookup *)em->bdev; - if (em->start != chunk_offset) - goto out; - - if (em->len < length) - goto out; - - for (i = 0; i < map->num_stripes; ++i) { - if (map->stripes[i].dev == sdev->dev && - map->stripes[i].physical == dev_offset) { - ret = scrub_stripe(sdev, map, i, chunk_offset, length); - if (ret) - goto out; - } - } -out: - free_extent_map(em); - - return ret; -} - -static noinline_for_stack -int scrub_enumerate_chunks(struct scrub_dev *sdev, u64 start, u64 end) -{ - struct btrfs_dev_extent *dev_extent = NULL; - struct btrfs_path *path; - struct btrfs_root *root = sdev->dev->dev_root; - struct btrfs_fs_info *fs_info = root->fs_info; - u64 length; - u64 chunk_tree; - u64 chunk_objectid; - u64 chunk_offset; - int ret; - int slot; - struct extent_buffer *l; - struct btrfs_key key; - struct btrfs_key found_key; - struct btrfs_block_group_cache *cache; - - path = btrfs_alloc_path(); - if (!path) - return -ENOMEM; - - path->reada = 2; - path->search_commit_root = 1; - path->skip_locking = 1; - - key.objectid = sdev->dev->devid; - key.offset = 0ull; - key.type = BTRFS_DEV_EXTENT_KEY; - - - while (1) { - ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); - if (ret < 0) - break; - if (ret > 0) { - if (path->slots[0] >= - btrfs_header_nritems(path->nodes[0])) { - ret = btrfs_next_leaf(root, path); - if (ret) - break; - } - } - - l = path->nodes[0]; - slot = path->slots[0]; - - btrfs_item_key_to_cpu(l, &found_key, slot); - - if (found_key.objectid != sdev->dev->devid) - break; - - if (btrfs_key_type(&found_key) != BTRFS_DEV_EXTENT_KEY) - break; - - if (found_key.offset >= end) - break; - - if (found_key.offset < key.offset) - break; - - dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent); - length = btrfs_dev_extent_length(l, dev_extent); - - if (found_key.offset + length <= start) { - key.offset = found_key.offset + length; - btrfs_release_path(path); - continue; - } - - chunk_tree = btrfs_dev_extent_chunk_tree(l, dev_extent); - chunk_objectid = btrfs_dev_extent_chunk_objectid(l, dev_extent); - chunk_offset = btrfs_dev_extent_chunk_offset(l, dev_extent); - - /* - * get a reference on the corresponding block group to prevent - * the chunk from going away while we scrub it - */ - cache = btrfs_lookup_block_group(fs_info, chunk_offset); - if (!cache) { - ret = -ENOENT; - break; - } - ret = scrub_chunk(sdev, chunk_tree, chunk_objectid, - chunk_offset, length, found_key.offset); - btrfs_put_block_group(cache); - if (ret) - break; - - key.offset = found_key.offset + length; - btrfs_release_path(path); - } - - btrfs_free_path(path); - - /* - * ret can still be 1 from search_slot or next_leaf, - * that's not an error - */ - return ret < 0 ? 
ret : 0; -} - -static noinline_for_stack int scrub_supers(struct scrub_dev *sdev) -{ - int i; - u64 bytenr; - u64 gen; - int ret; - struct btrfs_device *device = sdev->dev; - struct btrfs_root *root = device->dev_root; - - if (root->fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR) - return -EIO; - - gen = root->fs_info->last_trans_committed; - - for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) { - bytenr = btrfs_sb_offset(i); - if (bytenr + BTRFS_SUPER_INFO_SIZE > device->total_bytes) - break; - - ret = scrub_pages(sdev, bytenr, BTRFS_SUPER_INFO_SIZE, bytenr, - BTRFS_EXTENT_FLAG_SUPER, gen, i, NULL, 1); - if (ret) - return ret; - } - wait_event(sdev->list_wait, atomic_read(&sdev->in_flight) == 0); - - return 0; -} - -/* - * get a reference count on fs_info->scrub_workers. start worker if necessary - */ -static noinline_for_stack int scrub_workers_get(struct btrfs_root *root) -{ - struct btrfs_fs_info *fs_info = root->fs_info; - int ret = 0; - - mutex_lock(&fs_info->scrub_lock); - if (fs_info->scrub_workers_refcnt == 0) { - btrfs_init_workers(&fs_info->scrub_workers, "scrub", - fs_info->thread_pool_size, &fs_info->generic_worker); - fs_info->scrub_workers.idle_thresh = 4; - ret = btrfs_start_workers(&fs_info->scrub_workers); - if (ret) - goto out; - } - ++fs_info->scrub_workers_refcnt; -out: - mutex_unlock(&fs_info->scrub_lock); - - return ret; -} - -static noinline_for_stack void scrub_workers_put(struct btrfs_root *root) -{ - struct btrfs_fs_info *fs_info = root->fs_info; - - mutex_lock(&fs_info->scrub_lock); - if (--fs_info->scrub_workers_refcnt == 0) - btrfs_stop_workers(&fs_info->scrub_workers); - WARN_ON(fs_info->scrub_workers_refcnt < 0); - mutex_unlock(&fs_info->scrub_lock); -} - - -int btrfs_scrub_dev(struct btrfs_root *root, u64 devid, u64 start, u64 end, - struct btrfs_scrub_progress *progress, int readonly) -{ - struct scrub_dev *sdev; - struct btrfs_fs_info *fs_info = root->fs_info; - int ret; - struct btrfs_device *dev; - - if (btrfs_fs_closing(root->fs_info)) - return -EINVAL; - - /* - * check some assumptions - */ - if (root->nodesize != root->leafsize) { - printk(KERN_ERR - "btrfs_scrub: size assumption nodesize == leafsize (%d == %d) fails\n", - root->nodesize, root->leafsize); - return -EINVAL; - } - - if (root->nodesize > BTRFS_STRIPE_LEN) { - /* - * in this case scrub is unable to calculate the checksum - * the way scrub is implemented. Do not handle this - * situation at all because it won't ever happen. 
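scrub_supers() above queues each of the BTRFS_SUPER_MIRROR_MAX superblock copies for reading and stops as soon as a copy's offset would fall past the end of the device. btrfs_sb_offset() itself lives elsewhere in this tree; assuming the usual btrfs layout of copies at 64KiB, 64MiB and 256GiB, the bounds check can be pictured like this (device size and constants are demo values).

#include <stdio.h>
#include <stdint.h>

#define MIRROR_MAX   3
#define MIRROR_SHIFT 12
#define SUPER_SIZE   4096ULL

/* assumed equivalent of btrfs_sb_offset(): 64KiB, 64MiB, 256GiB */
static uint64_t sb_offset(int mirror)
{
	uint64_t start = 16ULL * 1024;

	if (mirror)
		return start << (MIRROR_SHIFT * mirror);
	return 64ULL * 1024;
}

int main(void)
{
	uint64_t device_bytes = 100ULL * 1024 * 1024 * 1024;   /* 100 GiB disk */

	for (int i = 0; i < MIRROR_MAX; i++) {
		uint64_t bytenr = sb_offset(i);

		if (bytenr + SUPER_SIZE > device_bytes) {
			printf("copy %d at %llu: beyond device, skipped\n",
			       i, (unsigned long long)bytenr);
			break;
		}
		printf("copy %d at %llu: would be scrubbed\n",
		       i, (unsigned long long)bytenr);
	}
	return 0;
}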
- */ - printk(KERN_ERR - "btrfs_scrub: size assumption nodesize <= BTRFS_STRIPE_LEN (%d <= %d) fails\n", - root->nodesize, BTRFS_STRIPE_LEN); - return -EINVAL; - } - - if (root->sectorsize != PAGE_SIZE) { - /* not supported for data w/o checksums */ - printk(KERN_ERR - "btrfs_scrub: size assumption sectorsize != PAGE_SIZE (%d != %lld) fails\n", - root->sectorsize, (unsigned long long)PAGE_SIZE); - return -EINVAL; - } - - ret = scrub_workers_get(root); - if (ret) - return ret; - - mutex_lock(&root->fs_info->fs_devices->device_list_mutex); - dev = btrfs_find_device(root, devid, NULL, NULL); - if (!dev || dev->missing) { - mutex_unlock(&root->fs_info->fs_devices->device_list_mutex); - scrub_workers_put(root); - return -ENODEV; - } - mutex_lock(&fs_info->scrub_lock); - - if (!dev->in_fs_metadata) { - mutex_unlock(&fs_info->scrub_lock); - mutex_unlock(&root->fs_info->fs_devices->device_list_mutex); - scrub_workers_put(root); - return -ENODEV; - } - - if (dev->scrub_device) { - mutex_unlock(&fs_info->scrub_lock); - mutex_unlock(&root->fs_info->fs_devices->device_list_mutex); - scrub_workers_put(root); - return -EINPROGRESS; - } - sdev = scrub_setup_dev(dev); - if (IS_ERR(sdev)) { - mutex_unlock(&fs_info->scrub_lock); - mutex_unlock(&root->fs_info->fs_devices->device_list_mutex); - scrub_workers_put(root); - return PTR_ERR(sdev); - } - sdev->readonly = readonly; - dev->scrub_device = sdev; - - atomic_inc(&fs_info->scrubs_running); - mutex_unlock(&fs_info->scrub_lock); - mutex_unlock(&root->fs_info->fs_devices->device_list_mutex); - - down_read(&fs_info->scrub_super_lock); - ret = scrub_supers(sdev); - up_read(&fs_info->scrub_super_lock); - - if (!ret) - ret = scrub_enumerate_chunks(sdev, start, end); - - wait_event(sdev->list_wait, atomic_read(&sdev->in_flight) == 0); - atomic_dec(&fs_info->scrubs_running); - wake_up(&fs_info->scrub_pause_wait); - - wait_event(sdev->list_wait, atomic_read(&sdev->fixup_cnt) == 0); - - if (progress) - memcpy(progress, &sdev->stat, sizeof(*progress)); - - mutex_lock(&fs_info->scrub_lock); - dev->scrub_device = NULL; - mutex_unlock(&fs_info->scrub_lock); - - scrub_free_dev(sdev); - scrub_workers_put(root); - - return ret; -} - -void btrfs_scrub_pause(struct btrfs_root *root) -{ - struct btrfs_fs_info *fs_info = root->fs_info; - - mutex_lock(&fs_info->scrub_lock); - atomic_inc(&fs_info->scrub_pause_req); - while (atomic_read(&fs_info->scrubs_paused) != - atomic_read(&fs_info->scrubs_running)) { - mutex_unlock(&fs_info->scrub_lock); - wait_event(fs_info->scrub_pause_wait, - atomic_read(&fs_info->scrubs_paused) == - atomic_read(&fs_info->scrubs_running)); - mutex_lock(&fs_info->scrub_lock); - } - mutex_unlock(&fs_info->scrub_lock); -} - -void btrfs_scrub_continue(struct btrfs_root *root) -{ - struct btrfs_fs_info *fs_info = root->fs_info; - - atomic_dec(&fs_info->scrub_pause_req); - wake_up(&fs_info->scrub_pause_wait); -} - -void btrfs_scrub_pause_super(struct btrfs_root *root) -{ - down_write(&root->fs_info->scrub_super_lock); -} - -void btrfs_scrub_continue_super(struct btrfs_root *root) -{ - up_write(&root->fs_info->scrub_super_lock); -} - -int __btrfs_scrub_cancel(struct btrfs_fs_info *fs_info) -{ - - mutex_lock(&fs_info->scrub_lock); - if (!atomic_read(&fs_info->scrubs_running)) { - mutex_unlock(&fs_info->scrub_lock); - return -ENOTCONN; - } - - atomic_inc(&fs_info->scrub_cancel_req); - while (atomic_read(&fs_info->scrubs_running)) { - mutex_unlock(&fs_info->scrub_lock); - wait_event(fs_info->scrub_pause_wait, - atomic_read(&fs_info->scrubs_running) == 0); - 
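btrfs_scrub_pause() and btrfs_scrub_continue() above coordinate with the workers purely through counters and a wait queue: the pauser raises scrub_pause_req and waits until scrubs_paused catches up with scrubs_running, while each worker parks itself in the "check to see if we have to pause" block of scrub_stripe() until the request is withdrawn. Below is a rough userspace analogue with one worker thread, using a mutex and condition variable in place of the kernel's atomics and wait queue; all names and timings are invented for the demo. Build with -pthread.

#include <pthread.h>
#include <stdio.h>
#include <unistd.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t cond = PTHREAD_COND_INITIALIZER;
static int pause_req, scrubs_running, scrubs_paused;

/* worker side: analogue of the "check to see if we have to pause" block */
static void maybe_pause(void)
{
	pthread_mutex_lock(&lock);
	if (pause_req) {
		scrubs_paused++;
		pthread_cond_broadcast(&cond);     /* let the pausing thread see us */
		while (pause_req)
			pthread_cond_wait(&cond, &lock);
		scrubs_paused--;
	}
	pthread_mutex_unlock(&lock);
}

static void *scrub_worker(void *arg)
{
	(void)arg;
	pthread_mutex_lock(&lock);
	scrubs_running++;
	pthread_mutex_unlock(&lock);

	for (int i = 0; i < 50; i++) {
		usleep(1000);                      /* stands in for scrubbing one stripe */
		maybe_pause();
	}

	pthread_mutex_lock(&lock);
	scrubs_running--;
	pthread_cond_broadcast(&cond);             /* like waking scrub_pause_wait on exit */
	pthread_mutex_unlock(&lock);
	return NULL;
}

int main(void)
{
	pthread_t t;

	pthread_create(&t, NULL, scrub_worker, NULL);
	usleep(5000);

	/* pause: wait until every running worker has parked itself */
	pthread_mutex_lock(&lock);
	pause_req = 1;
	while (scrubs_paused != scrubs_running)
		pthread_cond_wait(&cond, &lock);
	printf("workers parked: paused=%d running=%d\n", scrubs_paused, scrubs_running);

	/* continue: drop the request and wake the parked workers */
	pause_req = 0;
	pthread_cond_broadcast(&cond);
	pthread_mutex_unlock(&lock);

	pthread_join(t, NULL);
	return 0;
}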
mutex_lock(&fs_info->scrub_lock); - } - atomic_dec(&fs_info->scrub_cancel_req); - mutex_unlock(&fs_info->scrub_lock); - - return 0; -} - -int btrfs_scrub_cancel(struct btrfs_root *root) -{ - return __btrfs_scrub_cancel(root->fs_info); -} - -int btrfs_scrub_cancel_dev(struct btrfs_root *root, struct btrfs_device *dev) -{ - struct btrfs_fs_info *fs_info = root->fs_info; - struct scrub_dev *sdev; - - mutex_lock(&fs_info->scrub_lock); - sdev = dev->scrub_device; - if (!sdev) { - mutex_unlock(&fs_info->scrub_lock); - return -ENOTCONN; - } - atomic_inc(&sdev->cancel_req); - while (dev->scrub_device) { - mutex_unlock(&fs_info->scrub_lock); - wait_event(fs_info->scrub_pause_wait, - dev->scrub_device == NULL); - mutex_lock(&fs_info->scrub_lock); - } - mutex_unlock(&fs_info->scrub_lock); - - return 0; -} - -int btrfs_scrub_cancel_devid(struct btrfs_root *root, u64 devid) -{ - struct btrfs_fs_info *fs_info = root->fs_info; - struct btrfs_device *dev; - int ret; - - /* - * we have to hold the device_list_mutex here so the device - * does not go away in cancel_dev. FIXME: find a better solution - */ - mutex_lock(&fs_info->fs_devices->device_list_mutex); - dev = btrfs_find_device(root, devid, NULL, NULL); - if (!dev) { - mutex_unlock(&fs_info->fs_devices->device_list_mutex); - return -ENODEV; - } - ret = btrfs_scrub_cancel_dev(root, dev); - mutex_unlock(&fs_info->fs_devices->device_list_mutex); - - return ret; -} - -int btrfs_scrub_progress(struct btrfs_root *root, u64 devid, - struct btrfs_scrub_progress *progress) -{ - struct btrfs_device *dev; - struct scrub_dev *sdev = NULL; - - mutex_lock(&root->fs_info->fs_devices->device_list_mutex); - dev = btrfs_find_device(root, devid, NULL, NULL); - if (dev) - sdev = dev->scrub_device; - if (sdev) - memcpy(progress, &sdev->stat, sizeof(*progress)); - mutex_unlock(&root->fs_info->fs_devices->device_list_mutex); - - return dev ? (sdev ? 0 : -ENOTCONN) : -ENODEV; -} diff --git a/ANDROID_3.4.5/fs/btrfs/struct-funcs.c b/ANDROID_3.4.5/fs/btrfs/struct-funcs.c deleted file mode 100644 index c6ffa581..00000000 --- a/ANDROID_3.4.5/fs/btrfs/struct-funcs.c +++ /dev/null @@ -1,140 +0,0 @@ -/* - * Copyright (C) 2007 Oracle. All rights reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public - * License v2 as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public - * License along with this program; if not, write to the - * Free Software Foundation, Inc., 59 Temple Place - Suite 330, - * Boston, MA 021110-1307, USA. - */ - -#include <linux/highmem.h> - -/* this is some deeply nasty code. ctree.h has a different - * definition for this BTRFS_SETGET_FUNCS macro, behind a #ifndef - * - * The end result is that anyone who #includes ctree.h gets a - * declaration for the btrfs_set_foo functions and btrfs_foo functions - * - * This file declares the macros and then #includes ctree.h, which results - * in cpp creating the function here based on the template below. - * - * These setget functions do all the extent_buffer related mapping - * required to efficiently read and write specific fields in the extent - * buffers. 
Every pointer to metadata items in btrfs is really just - * an unsigned long offset into the extent buffer which has been - * cast to a specific type. This gives us all the gcc type checking. - * - * The extent buffer api is used to do all the kmapping and page - * spanning work required to get extent buffers in highmem and have - * a metadata blocksize different from the page size. - * - * The macro starts with a simple function prototype declaration so that - * sparse won't complain about it being static. - */ - -#define BTRFS_SETGET_FUNCS(name, type, member, bits) \ -u##bits btrfs_##name(struct extent_buffer *eb, type *s); \ -void btrfs_set_##name(struct extent_buffer *eb, type *s, u##bits val); \ -void btrfs_set_token_##name(struct extent_buffer *eb, type *s, u##bits val, struct btrfs_map_token *token); \ -u##bits btrfs_token_##name(struct extent_buffer *eb, \ - type *s, struct btrfs_map_token *token) \ -{ \ - unsigned long part_offset = (unsigned long)s; \ - unsigned long offset = part_offset + offsetof(type, member); \ - type *p; \ - int err; \ - char *kaddr; \ - unsigned long map_start; \ - unsigned long map_len; \ - unsigned long mem_len = sizeof(((type *)0)->member); \ - u##bits res; \ - if (token && token->kaddr && token->offset <= offset && \ - token->eb == eb && \ - (token->offset + PAGE_CACHE_SIZE >= offset + mem_len)) { \ - kaddr = token->kaddr; \ - p = (type *)(kaddr + part_offset - token->offset); \ - res = le##bits##_to_cpu(p->member); \ - return res; \ - } \ - err = map_private_extent_buffer(eb, offset, \ - mem_len, \ - &kaddr, &map_start, &map_len); \ - if (err) { \ - __le##bits leres; \ - read_eb_member(eb, s, type, member, &leres); \ - return le##bits##_to_cpu(leres); \ - } \ - p = (type *)(kaddr + part_offset - map_start); \ - res = le##bits##_to_cpu(p->member); \ - if (token) { \ - token->kaddr = kaddr; \ - token->offset = map_start; \ - token->eb = eb; \ - } \ - return res; \ -} \ -void btrfs_set_token_##name(struct extent_buffer *eb, \ - type *s, u##bits val, struct btrfs_map_token *token) \ -{ \ - unsigned long part_offset = (unsigned long)s; \ - unsigned long offset = part_offset + offsetof(type, member); \ - type *p; \ - int err; \ - char *kaddr; \ - unsigned long map_start; \ - unsigned long map_len; \ - unsigned long mem_len = sizeof(((type *)0)->member); \ - if (token && token->kaddr && token->offset <= offset && \ - token->eb == eb && \ - (token->offset + PAGE_CACHE_SIZE >= offset + mem_len)) { \ - kaddr = token->kaddr; \ - p = (type *)(kaddr + part_offset - token->offset); \ - p->member = cpu_to_le##bits(val); \ - return; \ - } \ - err = map_private_extent_buffer(eb, offset, \ - mem_len, \ - &kaddr, &map_start, &map_len); \ - if (err) { \ - __le##bits val2; \ - val2 = cpu_to_le##bits(val); \ - write_eb_member(eb, s, type, member, &val2); \ - return; \ - } \ - p = (type *)(kaddr + part_offset - map_start); \ - p->member = cpu_to_le##bits(val); \ - if (token) { \ - token->kaddr = kaddr; \ - token->offset = map_start; \ - token->eb = eb; \ - } \ -} \ -void btrfs_set_##name(struct extent_buffer *eb, \ - type *s, u##bits val) \ -{ \ - btrfs_set_token_##name(eb, s, val, NULL); \ -} \ -u##bits btrfs_##name(struct extent_buffer *eb, \ - type *s) \ -{ \ - return btrfs_token_##name(eb, s, NULL); \ -} \ - -#include "ctree.h" - -void btrfs_node_key(struct extent_buffer *eb, - struct btrfs_disk_key *disk_key, int nr) -{ - unsigned long ptr = btrfs_node_key_ptr_offset(nr); - read_eb_member(eb, (struct btrfs_key_ptr *)ptr, - struct btrfs_key_ptr, key, disk_key); -} diff 
--git a/ANDROID_3.4.5/fs/btrfs/super.c b/ANDROID_3.4.5/fs/btrfs/super.c deleted file mode 100644 index c5f8fca4..00000000 --- a/ANDROID_3.4.5/fs/btrfs/super.c +++ /dev/null @@ -1,1578 +0,0 @@ -/* - * Copyright (C) 2007 Oracle. All rights reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public - * License v2 as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public - * License along with this program; if not, write to the - * Free Software Foundation, Inc., 59 Temple Place - Suite 330, - * Boston, MA 021110-1307, USA. - */ - -#include <linux/blkdev.h> -#include <linux/module.h> -#include <linux/buffer_head.h> -#include <linux/fs.h> -#include <linux/pagemap.h> -#include <linux/highmem.h> -#include <linux/time.h> -#include <linux/init.h> -#include <linux/seq_file.h> -#include <linux/string.h> -#include <linux/backing-dev.h> -#include <linux/mount.h> -#include <linux/mpage.h> -#include <linux/swap.h> -#include <linux/writeback.h> -#include <linux/statfs.h> -#include <linux/compat.h> -#include <linux/parser.h> -#include <linux/ctype.h> -#include <linux/namei.h> -#include <linux/miscdevice.h> -#include <linux/magic.h> -#include <linux/slab.h> -#include <linux/cleancache.h> -#include <linux/ratelimit.h> -#include "compat.h" -#include "delayed-inode.h" -#include "ctree.h" -#include "disk-io.h" -#include "transaction.h" -#include "btrfs_inode.h" -#include "ioctl.h" -#include "print-tree.h" -#include "xattr.h" -#include "volumes.h" -#include "version.h" -#include "export.h" -#include "compression.h" - -#define CREATE_TRACE_POINTS -#include <trace/events/btrfs.h> - -static const struct super_operations btrfs_super_ops; -static struct file_system_type btrfs_fs_type; - -static const char *btrfs_decode_error(struct btrfs_fs_info *fs_info, int errno, - char nbuf[16]) -{ - char *errstr = NULL; - - switch (errno) { - case -EIO: - errstr = "IO failure"; - break; - case -ENOMEM: - errstr = "Out of memory"; - break; - case -EROFS: - errstr = "Readonly filesystem"; - break; - case -EEXIST: - errstr = "Object already exists"; - break; - default: - if (nbuf) { - if (snprintf(nbuf, 16, "error %d", -errno) >= 0) - errstr = nbuf; - } - break; - } - - return errstr; -} - -static void __save_error_info(struct btrfs_fs_info *fs_info) -{ - /* - * today we only save the error info into ram. Long term we'll - * also send it down to the disk - */ - fs_info->fs_state = BTRFS_SUPER_FLAG_ERROR; -} - -/* NOTE: - * We move write_super stuff at umount in order to avoid deadlock - * for umount hold all lock. - */ -static void save_error_info(struct btrfs_fs_info *fs_info) -{ - __save_error_info(fs_info); -} - -/* btrfs handle error by forcing the filesystem readonly */ -static void btrfs_handle_error(struct btrfs_fs_info *fs_info) -{ - struct super_block *sb = fs_info->sb; - - if (sb->s_flags & MS_RDONLY) - return; - - if (fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR) { - sb->s_flags |= MS_RDONLY; - printk(KERN_INFO "btrfs is forced readonly\n"); - __btrfs_scrub_cancel(fs_info); -// WARN_ON(1); - } -} - -/* - * __btrfs_std_error decodes expected errors from the caller and - * invokes the approciate error response. 
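The error path above first decodes the errno into a human-readable string, falling back to formatting the raw number into a small caller-supplied buffer. A minimal userspace sketch of that decode-with-fallback shape, with only a few codes handled and names chosen for illustration:

#include <errno.h>
#include <stdio.h>

static const char *decode_error(int errnum, char nbuf[16])
{
	switch (errnum) {
	case -EIO:    return "IO failure";
	case -ENOMEM: return "Out of memory";
	case -EROFS:  return "Readonly filesystem";
	default:
		snprintf(nbuf, 16, "error %d", -errnum);
		return nbuf;
	}
}

int main(void)
{
	char nbuf[16];

	printf("%s\n", decode_error(-EIO, nbuf));     /* "IO failure" */
	printf("%s\n", decode_error(-ENOSPC, nbuf));  /* "error 28" */
	return 0;
}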
- */ -void __btrfs_std_error(struct btrfs_fs_info *fs_info, const char *function, - unsigned int line, int errno, const char *fmt, ...) -{ - struct super_block *sb = fs_info->sb; - char nbuf[16]; - const char *errstr; - va_list args; - va_start(args, fmt); - - /* - * Special case: if the error is EROFS, and we're already - * under MS_RDONLY, then it is safe here. - */ - if (errno == -EROFS && (sb->s_flags & MS_RDONLY)) - return; - - errstr = btrfs_decode_error(fs_info, errno, nbuf); - if (fmt) { - struct va_format vaf = { - .fmt = fmt, - .va = &args, - }; - - printk(KERN_CRIT "BTRFS error (device %s) in %s:%d: %s (%pV)\n", - sb->s_id, function, line, errstr, &vaf); - } else { - printk(KERN_CRIT "BTRFS error (device %s) in %s:%d: %s\n", - sb->s_id, function, line, errstr); - } - - /* Don't go through full error handling during mount */ - if (sb->s_flags & MS_BORN) { - save_error_info(fs_info); - btrfs_handle_error(fs_info); - } - va_end(args); -} - -const char *logtypes[] = { - "emergency", - "alert", - "critical", - "error", - "warning", - "notice", - "info", - "debug", -}; - -void btrfs_printk(struct btrfs_fs_info *fs_info, const char *fmt, ...) -{ - struct super_block *sb = fs_info->sb; - char lvl[4]; - struct va_format vaf; - va_list args; - const char *type = logtypes[4]; - - va_start(args, fmt); - - if (fmt[0] == '<' && isdigit(fmt[1]) && fmt[2] == '>') { - strncpy(lvl, fmt, 3); - fmt += 3; - type = logtypes[fmt[1] - '0']; - } else - *lvl = '\0'; - - vaf.fmt = fmt; - vaf.va = &args; - printk("%sBTRFS %s (device %s): %pV", lvl, type, sb->s_id, &vaf); -} - -/* - * We only mark the transaction aborted and then set the file system read-only. - * This will prevent new transactions from starting or trying to join this - * one. - * - * This means that error recovery at the call site is limited to freeing - * any local memory allocations and passing the error code up without - * further cleanup. The transaction should complete as it normally would - * in the call path but will return -EIO. - * - * We'll complete the cleanup in btrfs_end_transaction and - * btrfs_commit_transaction. - */ -void __btrfs_abort_transaction(struct btrfs_trans_handle *trans, - struct btrfs_root *root, const char *function, - unsigned int line, int errno) -{ - WARN_ONCE(1, KERN_DEBUG "btrfs: Transaction aborted"); - trans->aborted = errno; - /* Nothing used. The other threads that have joined this - * transaction may be able to continue. */ - if (!trans->blocks_used) { - btrfs_printk(root->fs_info, "Aborting unused transaction.\n"); - return; - } - trans->transaction->aborted = errno; - __btrfs_std_error(root->fs_info, function, line, errno, NULL); -} -/* - * __btrfs_panic decodes unexpected, fatal errors from the caller, - * issues an alert, and either panics or BUGs, depending on mount options. - */ -void __btrfs_panic(struct btrfs_fs_info *fs_info, const char *function, - unsigned int line, int errno, const char *fmt, ...) 
-{ - char nbuf[16]; - char *s_id = "<unknown>"; - const char *errstr; - struct va_format vaf = { .fmt = fmt }; - va_list args; - - if (fs_info) - s_id = fs_info->sb->s_id; - - va_start(args, fmt); - vaf.va = &args; - - errstr = btrfs_decode_error(fs_info, errno, nbuf); - if (fs_info->mount_opt & BTRFS_MOUNT_PANIC_ON_FATAL_ERROR) - panic(KERN_CRIT "BTRFS panic (device %s) in %s:%d: %pV (%s)\n", - s_id, function, line, &vaf, errstr); - - printk(KERN_CRIT "BTRFS panic (device %s) in %s:%d: %pV (%s)\n", - s_id, function, line, &vaf, errstr); - va_end(args); - /* Caller calls BUG() */ -} - -static void btrfs_put_super(struct super_block *sb) -{ - (void)close_ctree(btrfs_sb(sb)->tree_root); - /* FIXME: need to fix VFS to return error? */ - /* AV: return it _where_? ->put_super() can be triggered by any number - * of async events, up to and including delivery of SIGKILL to the - * last process that kept it busy. Or segfault in the aforementioned - * process... Whom would you report that to? - */ -} - -enum { - Opt_degraded, Opt_subvol, Opt_subvolid, Opt_device, Opt_nodatasum, - Opt_nodatacow, Opt_max_inline, Opt_alloc_start, Opt_nobarrier, Opt_ssd, - Opt_nossd, Opt_ssd_spread, Opt_thread_pool, Opt_noacl, Opt_compress, - Opt_compress_type, Opt_compress_force, Opt_compress_force_type, - Opt_notreelog, Opt_ratio, Opt_flushoncommit, Opt_discard, - Opt_space_cache, Opt_clear_cache, Opt_user_subvol_rm_allowed, - Opt_enospc_debug, Opt_subvolrootid, Opt_defrag, Opt_inode_cache, - Opt_no_space_cache, Opt_recovery, Opt_skip_balance, - Opt_check_integrity, Opt_check_integrity_including_extent_data, - Opt_check_integrity_print_mask, Opt_fatal_errors, - Opt_err, -}; - -static match_table_t tokens = { - {Opt_degraded, "degraded"}, - {Opt_subvol, "subvol=%s"}, - {Opt_subvolid, "subvolid=%d"}, - {Opt_device, "device=%s"}, - {Opt_nodatasum, "nodatasum"}, - {Opt_nodatacow, "nodatacow"}, - {Opt_nobarrier, "nobarrier"}, - {Opt_max_inline, "max_inline=%s"}, - {Opt_alloc_start, "alloc_start=%s"}, - {Opt_thread_pool, "thread_pool=%d"}, - {Opt_compress, "compress"}, - {Opt_compress_type, "compress=%s"}, - {Opt_compress_force, "compress-force"}, - {Opt_compress_force_type, "compress-force=%s"}, - {Opt_ssd, "ssd"}, - {Opt_ssd_spread, "ssd_spread"}, - {Opt_nossd, "nossd"}, - {Opt_noacl, "noacl"}, - {Opt_notreelog, "notreelog"}, - {Opt_flushoncommit, "flushoncommit"}, - {Opt_ratio, "metadata_ratio=%d"}, - {Opt_discard, "discard"}, - {Opt_space_cache, "space_cache"}, - {Opt_clear_cache, "clear_cache"}, - {Opt_user_subvol_rm_allowed, "user_subvol_rm_allowed"}, - {Opt_enospc_debug, "enospc_debug"}, - {Opt_subvolrootid, "subvolrootid=%d"}, - {Opt_defrag, "autodefrag"}, - {Opt_inode_cache, "inode_cache"}, - {Opt_no_space_cache, "nospace_cache"}, - {Opt_recovery, "recovery"}, - {Opt_skip_balance, "skip_balance"}, - {Opt_check_integrity, "check_int"}, - {Opt_check_integrity_including_extent_data, "check_int_data"}, - {Opt_check_integrity_print_mask, "check_int_print_mask=%d"}, - {Opt_fatal_errors, "fatal_errors=%s"}, - {Opt_err, NULL}, -}; - -/* - * Regular mount options parser. Everything that is needed only when - * reading in a new superblock is parsed here. - * XXX JDM: This needs to be cleaned up for remount. 
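Both option-parsing passes in this file duplicate the options string before walking it, because strsep() writes NUL bytes into whatever it scans. A minimal userspace sketch of that duplicate-then-strsep pattern; the option names and messages are illustrative, and strdup()/strsep() stand in for the kernel's kstrdup()/strsep():

#define _DEFAULT_SOURCE
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

static void parse_opts(const char *options)
{
	char *copy, *orig, *p;

	copy = strdup(options);           /* parse a private copy */
	if (!copy)
		return;
	orig = copy;                      /* strsep() advances 'copy' */

	while ((p = strsep(&copy, ",")) != NULL) {
		if (!*p)                  /* skip empty ",," tokens */
			continue;
		if (strcmp(p, "degraded") == 0)
			printf("allowing degraded mounts\n");
		else if (strncmp(p, "subvol=", 7) == 0)
			printf("subvolume '%s'\n", p + 7);
		else
			printf("unrecognized option '%s'\n", p);
	}
	free(orig);                       /* free via the saved pointer */
}

int main(void)
{
	parse_opts("degraded,subvol=home,bogus");
	return 0;
}

Keeping the original string untouched is what lets the early pass and this full pass both run over the same mount data.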
- */ -int btrfs_parse_options(struct btrfs_root *root, char *options) -{ - struct btrfs_fs_info *info = root->fs_info; - substring_t args[MAX_OPT_ARGS]; - char *p, *num, *orig = NULL; - u64 cache_gen; - int intarg; - int ret = 0; - char *compress_type; - bool compress_force = false; - - cache_gen = btrfs_super_cache_generation(root->fs_info->super_copy); - if (cache_gen) - btrfs_set_opt(info->mount_opt, SPACE_CACHE); - - if (!options) - goto out; - - /* - * strsep changes the string, duplicate it because parse_options - * gets called twice - */ - options = kstrdup(options, GFP_NOFS); - if (!options) - return -ENOMEM; - - orig = options; - - while ((p = strsep(&options, ",")) != NULL) { - int token; - if (!*p) - continue; - - token = match_token(p, tokens, args); - switch (token) { - case Opt_degraded: - printk(KERN_INFO "btrfs: allowing degraded mounts\n"); - btrfs_set_opt(info->mount_opt, DEGRADED); - break; - case Opt_subvol: - case Opt_subvolid: - case Opt_subvolrootid: - case Opt_device: - /* - * These are parsed by btrfs_parse_early_options - * and can be happily ignored here. - */ - break; - case Opt_nodatasum: - printk(KERN_INFO "btrfs: setting nodatasum\n"); - btrfs_set_opt(info->mount_opt, NODATASUM); - break; - case Opt_nodatacow: - printk(KERN_INFO "btrfs: setting nodatacow\n"); - btrfs_set_opt(info->mount_opt, NODATACOW); - btrfs_set_opt(info->mount_opt, NODATASUM); - break; - case Opt_compress_force: - case Opt_compress_force_type: - compress_force = true; - case Opt_compress: - case Opt_compress_type: - if (token == Opt_compress || - token == Opt_compress_force || - strcmp(args[0].from, "zlib") == 0) { - compress_type = "zlib"; - info->compress_type = BTRFS_COMPRESS_ZLIB; - } else if (strcmp(args[0].from, "lzo") == 0) { - compress_type = "lzo"; - info->compress_type = BTRFS_COMPRESS_LZO; - } else { - ret = -EINVAL; - goto out; - } - - btrfs_set_opt(info->mount_opt, COMPRESS); - if (compress_force) { - btrfs_set_opt(info->mount_opt, FORCE_COMPRESS); - pr_info("btrfs: force %s compression\n", - compress_type); - } else - pr_info("btrfs: use %s compression\n", - compress_type); - break; - case Opt_ssd: - printk(KERN_INFO "btrfs: use ssd allocation scheme\n"); - btrfs_set_opt(info->mount_opt, SSD); - break; - case Opt_ssd_spread: - printk(KERN_INFO "btrfs: use spread ssd " - "allocation scheme\n"); - btrfs_set_opt(info->mount_opt, SSD); - btrfs_set_opt(info->mount_opt, SSD_SPREAD); - break; - case Opt_nossd: - printk(KERN_INFO "btrfs: not using ssd allocation " - "scheme\n"); - btrfs_set_opt(info->mount_opt, NOSSD); - btrfs_clear_opt(info->mount_opt, SSD); - btrfs_clear_opt(info->mount_opt, SSD_SPREAD); - break; - case Opt_nobarrier: - printk(KERN_INFO "btrfs: turning off barriers\n"); - btrfs_set_opt(info->mount_opt, NOBARRIER); - break; - case Opt_thread_pool: - intarg = 0; - match_int(&args[0], &intarg); - if (intarg) { - info->thread_pool_size = intarg; - printk(KERN_INFO "btrfs: thread pool %d\n", - info->thread_pool_size); - } - break; - case Opt_max_inline: - num = match_strdup(&args[0]); - if (num) { - info->max_inline = memparse(num, NULL); - kfree(num); - - if (info->max_inline) { - info->max_inline = max_t(u64, - info->max_inline, - root->sectorsize); - } - printk(KERN_INFO "btrfs: max_inline at %llu\n", - (unsigned long long)info->max_inline); - } - break; - case Opt_alloc_start: - num = match_strdup(&args[0]); - if (num) { - info->alloc_start = memparse(num, NULL); - kfree(num); - printk(KERN_INFO - "btrfs: allocations start at %llu\n", - (unsigned long 
long)info->alloc_start); - } - break; - case Opt_noacl: - root->fs_info->sb->s_flags &= ~MS_POSIXACL; - break; - case Opt_notreelog: - printk(KERN_INFO "btrfs: disabling tree log\n"); - btrfs_set_opt(info->mount_opt, NOTREELOG); - break; - case Opt_flushoncommit: - printk(KERN_INFO "btrfs: turning on flush-on-commit\n"); - btrfs_set_opt(info->mount_opt, FLUSHONCOMMIT); - break; - case Opt_ratio: - intarg = 0; - match_int(&args[0], &intarg); - if (intarg) { - info->metadata_ratio = intarg; - printk(KERN_INFO "btrfs: metadata ratio %d\n", - info->metadata_ratio); - } - break; - case Opt_discard: - btrfs_set_opt(info->mount_opt, DISCARD); - break; - case Opt_space_cache: - btrfs_set_opt(info->mount_opt, SPACE_CACHE); - break; - case Opt_no_space_cache: - printk(KERN_INFO "btrfs: disabling disk space caching\n"); - btrfs_clear_opt(info->mount_opt, SPACE_CACHE); - break; - case Opt_inode_cache: - printk(KERN_INFO "btrfs: enabling inode map caching\n"); - btrfs_set_opt(info->mount_opt, INODE_MAP_CACHE); - break; - case Opt_clear_cache: - printk(KERN_INFO "btrfs: force clearing of disk cache\n"); - btrfs_set_opt(info->mount_opt, CLEAR_CACHE); - break; - case Opt_user_subvol_rm_allowed: - btrfs_set_opt(info->mount_opt, USER_SUBVOL_RM_ALLOWED); - break; - case Opt_enospc_debug: - btrfs_set_opt(info->mount_opt, ENOSPC_DEBUG); - break; - case Opt_defrag: - printk(KERN_INFO "btrfs: enabling auto defrag"); - btrfs_set_opt(info->mount_opt, AUTO_DEFRAG); - break; - case Opt_recovery: - printk(KERN_INFO "btrfs: enabling auto recovery"); - btrfs_set_opt(info->mount_opt, RECOVERY); - break; - case Opt_skip_balance: - btrfs_set_opt(info->mount_opt, SKIP_BALANCE); - break; -#ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY - case Opt_check_integrity_including_extent_data: - printk(KERN_INFO "btrfs: enabling check integrity" - " including extent data\n"); - btrfs_set_opt(info->mount_opt, - CHECK_INTEGRITY_INCLUDING_EXTENT_DATA); - btrfs_set_opt(info->mount_opt, CHECK_INTEGRITY); - break; - case Opt_check_integrity: - printk(KERN_INFO "btrfs: enabling check integrity\n"); - btrfs_set_opt(info->mount_opt, CHECK_INTEGRITY); - break; - case Opt_check_integrity_print_mask: - intarg = 0; - match_int(&args[0], &intarg); - if (intarg) { - info->check_integrity_print_mask = intarg; - printk(KERN_INFO "btrfs:" - " check_integrity_print_mask 0x%x\n", - info->check_integrity_print_mask); - } - break; -#else - case Opt_check_integrity_including_extent_data: - case Opt_check_integrity: - case Opt_check_integrity_print_mask: - printk(KERN_ERR "btrfs: support for check_integrity*" - " not compiled in!\n"); - ret = -EINVAL; - goto out; -#endif - case Opt_fatal_errors: - if (strcmp(args[0].from, "panic") == 0) - btrfs_set_opt(info->mount_opt, - PANIC_ON_FATAL_ERROR); - else if (strcmp(args[0].from, "bug") == 0) - btrfs_clear_opt(info->mount_opt, - PANIC_ON_FATAL_ERROR); - else { - ret = -EINVAL; - goto out; - } - break; - case Opt_err: - printk(KERN_INFO "btrfs: unrecognized mount option " - "'%s'\n", p); - ret = -EINVAL; - goto out; - default: - break; - } - } -out: - if (!ret && btrfs_test_opt(root, SPACE_CACHE)) - printk(KERN_INFO "btrfs: disk space caching is enabled\n"); - kfree(orig); - return ret; -} - -/* - * Parse mount options that are required early in the mount process. - * - * All other options will be parsed on much later in the mount process and - * only when we need to allocate a new super block. 
- */ -static int btrfs_parse_early_options(const char *options, fmode_t flags, - void *holder, char **subvol_name, u64 *subvol_objectid, - u64 *subvol_rootid, struct btrfs_fs_devices **fs_devices) -{ - substring_t args[MAX_OPT_ARGS]; - char *device_name, *opts, *orig, *p; - int error = 0; - int intarg; - - if (!options) - return 0; - - /* - * strsep changes the string, duplicate it because parse_options - * gets called twice - */ - opts = kstrdup(options, GFP_KERNEL); - if (!opts) - return -ENOMEM; - orig = opts; - - while ((p = strsep(&opts, ",")) != NULL) { - int token; - if (!*p) - continue; - - token = match_token(p, tokens, args); - switch (token) { - case Opt_subvol: - kfree(*subvol_name); - *subvol_name = match_strdup(&args[0]); - break; - case Opt_subvolid: - intarg = 0; - error = match_int(&args[0], &intarg); - if (!error) { - /* we want the original fs_tree */ - if (!intarg) - *subvol_objectid = - BTRFS_FS_TREE_OBJECTID; - else - *subvol_objectid = intarg; - } - break; - case Opt_subvolrootid: - intarg = 0; - error = match_int(&args[0], &intarg); - if (!error) { - /* we want the original fs_tree */ - if (!intarg) - *subvol_rootid = - BTRFS_FS_TREE_OBJECTID; - else - *subvol_rootid = intarg; - } - break; - case Opt_device: - device_name = match_strdup(&args[0]); - if (!device_name) { - error = -ENOMEM; - goto out; - } - error = btrfs_scan_one_device(device_name, - flags, holder, fs_devices); - kfree(device_name); - if (error) - goto out; - break; - default: - break; - } - } - -out: - kfree(orig); - return error; -} - -static struct dentry *get_default_root(struct super_block *sb, - u64 subvol_objectid) -{ - struct btrfs_fs_info *fs_info = btrfs_sb(sb); - struct btrfs_root *root = fs_info->tree_root; - struct btrfs_root *new_root; - struct btrfs_dir_item *di; - struct btrfs_path *path; - struct btrfs_key location; - struct inode *inode; - u64 dir_id; - int new = 0; - - /* - * We have a specific subvol we want to mount, just setup location and - * go look up the root. - */ - if (subvol_objectid) { - location.objectid = subvol_objectid; - location.type = BTRFS_ROOT_ITEM_KEY; - location.offset = (u64)-1; - goto find_root; - } - - path = btrfs_alloc_path(); - if (!path) - return ERR_PTR(-ENOMEM); - path->leave_spinning = 1; - - /* - * Find the "default" dir item which points to the root item that we - * will mount by default if we haven't been given a specific subvolume - * to mount. - */ - dir_id = btrfs_super_root_dir(fs_info->super_copy); - di = btrfs_lookup_dir_item(NULL, root, path, dir_id, "default", 7, 0); - if (IS_ERR(di)) { - btrfs_free_path(path); - return ERR_CAST(di); - } - if (!di) { - /* - * Ok the default dir item isn't there. This is weird since - * it's always been there, but don't freak out, just try and - * mount to root most subvolume. 
- */ - btrfs_free_path(path); - dir_id = BTRFS_FIRST_FREE_OBJECTID; - new_root = fs_info->fs_root; - goto setup_root; - } - - btrfs_dir_item_key_to_cpu(path->nodes[0], di, &location); - btrfs_free_path(path); - -find_root: - new_root = btrfs_read_fs_root_no_name(fs_info, &location); - if (IS_ERR(new_root)) - return ERR_CAST(new_root); - - if (btrfs_root_refs(&new_root->root_item) == 0) - return ERR_PTR(-ENOENT); - - dir_id = btrfs_root_dirid(&new_root->root_item); -setup_root: - location.objectid = dir_id; - location.type = BTRFS_INODE_ITEM_KEY; - location.offset = 0; - - inode = btrfs_iget(sb, &location, new_root, &new); - if (IS_ERR(inode)) - return ERR_CAST(inode); - - /* - * If we're just mounting the root most subvol put the inode and return - * a reference to the dentry. We will have already gotten a reference - * to the inode in btrfs_fill_super so we're good to go. - */ - if (!new && sb->s_root->d_inode == inode) { - iput(inode); - return dget(sb->s_root); - } - - return d_obtain_alias(inode); -} - -static int btrfs_fill_super(struct super_block *sb, - struct btrfs_fs_devices *fs_devices, - void *data, int silent) -{ - struct inode *inode; - struct btrfs_fs_info *fs_info = btrfs_sb(sb); - struct btrfs_key key; - int err; - - sb->s_maxbytes = MAX_LFS_FILESIZE; - sb->s_magic = BTRFS_SUPER_MAGIC; - sb->s_op = &btrfs_super_ops; - sb->s_d_op = &btrfs_dentry_operations; - sb->s_export_op = &btrfs_export_ops; - sb->s_xattr = btrfs_xattr_handlers; - sb->s_time_gran = 1; -#ifdef CONFIG_BTRFS_FS_POSIX_ACL - sb->s_flags |= MS_POSIXACL; -#endif - - err = open_ctree(sb, fs_devices, (char *)data); - if (err) { - printk("btrfs: open_ctree failed\n"); - return err; - } - - key.objectid = BTRFS_FIRST_FREE_OBJECTID; - key.type = BTRFS_INODE_ITEM_KEY; - key.offset = 0; - inode = btrfs_iget(sb, &key, fs_info->fs_root, NULL); - if (IS_ERR(inode)) { - err = PTR_ERR(inode); - goto fail_close; - } - - sb->s_root = d_make_root(inode); - if (!sb->s_root) { - err = -ENOMEM; - goto fail_close; - } - - save_mount_options(sb, data); - cleancache_init_fs(sb); - sb->s_flags |= MS_ACTIVE; - return 0; - -fail_close: - close_ctree(fs_info->tree_root); - return err; -} - -int btrfs_sync_fs(struct super_block *sb, int wait) -{ - struct btrfs_trans_handle *trans; - struct btrfs_fs_info *fs_info = btrfs_sb(sb); - struct btrfs_root *root = fs_info->tree_root; - int ret; - - trace_btrfs_sync_fs(wait); - - if (!wait) { - filemap_flush(fs_info->btree_inode->i_mapping); - return 0; - } - - btrfs_wait_ordered_extents(root, 0, 0); - - trans = btrfs_start_transaction(root, 0); - if (IS_ERR(trans)) - return PTR_ERR(trans); - ret = btrfs_commit_transaction(trans, root); - return ret; -} - -static int btrfs_show_options(struct seq_file *seq, struct dentry *dentry) -{ - struct btrfs_fs_info *info = btrfs_sb(dentry->d_sb); - struct btrfs_root *root = info->tree_root; - char *compress_type; - - if (btrfs_test_opt(root, DEGRADED)) - seq_puts(seq, ",degraded"); - if (btrfs_test_opt(root, NODATASUM)) - seq_puts(seq, ",nodatasum"); - if (btrfs_test_opt(root, NODATACOW)) - seq_puts(seq, ",nodatacow"); - if (btrfs_test_opt(root, NOBARRIER)) - seq_puts(seq, ",nobarrier"); - if (info->max_inline != 8192 * 1024) - seq_printf(seq, ",max_inline=%llu", - (unsigned long long)info->max_inline); - if (info->alloc_start != 0) - seq_printf(seq, ",alloc_start=%llu", - (unsigned long long)info->alloc_start); - if (info->thread_pool_size != min_t(unsigned long, - num_online_cpus() + 2, 8)) - seq_printf(seq, ",thread_pool=%d", info->thread_pool_size); - 
if (btrfs_test_opt(root, COMPRESS)) { - if (info->compress_type == BTRFS_COMPRESS_ZLIB) - compress_type = "zlib"; - else - compress_type = "lzo"; - if (btrfs_test_opt(root, FORCE_COMPRESS)) - seq_printf(seq, ",compress-force=%s", compress_type); - else - seq_printf(seq, ",compress=%s", compress_type); - } - if (btrfs_test_opt(root, NOSSD)) - seq_puts(seq, ",nossd"); - if (btrfs_test_opt(root, SSD_SPREAD)) - seq_puts(seq, ",ssd_spread"); - else if (btrfs_test_opt(root, SSD)) - seq_puts(seq, ",ssd"); - if (btrfs_test_opt(root, NOTREELOG)) - seq_puts(seq, ",notreelog"); - if (btrfs_test_opt(root, FLUSHONCOMMIT)) - seq_puts(seq, ",flushoncommit"); - if (btrfs_test_opt(root, DISCARD)) - seq_puts(seq, ",discard"); - if (!(root->fs_info->sb->s_flags & MS_POSIXACL)) - seq_puts(seq, ",noacl"); - if (btrfs_test_opt(root, SPACE_CACHE)) - seq_puts(seq, ",space_cache"); - else - seq_puts(seq, ",nospace_cache"); - if (btrfs_test_opt(root, CLEAR_CACHE)) - seq_puts(seq, ",clear_cache"); - if (btrfs_test_opt(root, USER_SUBVOL_RM_ALLOWED)) - seq_puts(seq, ",user_subvol_rm_allowed"); - if (btrfs_test_opt(root, ENOSPC_DEBUG)) - seq_puts(seq, ",enospc_debug"); - if (btrfs_test_opt(root, AUTO_DEFRAG)) - seq_puts(seq, ",autodefrag"); - if (btrfs_test_opt(root, INODE_MAP_CACHE)) - seq_puts(seq, ",inode_cache"); - if (btrfs_test_opt(root, SKIP_BALANCE)) - seq_puts(seq, ",skip_balance"); - if (btrfs_test_opt(root, PANIC_ON_FATAL_ERROR)) - seq_puts(seq, ",fatal_errors=panic"); - return 0; -} - -static int btrfs_test_super(struct super_block *s, void *data) -{ - struct btrfs_fs_info *p = data; - struct btrfs_fs_info *fs_info = btrfs_sb(s); - - return fs_info->fs_devices == p->fs_devices; -} - -static int btrfs_set_super(struct super_block *s, void *data) -{ - int err = set_anon_super(s, data); - if (!err) - s->s_fs_info = data; - return err; -} - -/* - * subvolumes are identified by ino 256 - */ -static inline int is_subvolume_inode(struct inode *inode) -{ - if (inode && inode->i_ino == BTRFS_FIRST_FREE_OBJECTID) - return 1; - return 0; -} - -/* - * This will strip out the subvol=%s argument for an argument string and add - * subvolid=0 to make sure we get the actual tree root for path walking to the - * subvol we want. - */ -static char *setup_root_args(char *args) -{ - unsigned copied = 0; - unsigned len = strlen(args) + 2; - char *pos; - char *ret; - - /* - * We need the same args as before, but minus - * - * subvol=a - * - * and add - * - * subvolid=0 - * - * which is a difference of 2 characters, so we allocate strlen(args) + - * 2 characters. - */ - ret = kzalloc(len * sizeof(char), GFP_NOFS); - if (!ret) - return NULL; - pos = strstr(args, "subvol="); - - /* This shouldn't happen, but just in case.. */ - if (!pos) { - kfree(ret); - return NULL; - } - - /* - * The subvol=<> arg is not at the front of the string, copy everybody - * up to that into ret. - */ - if (pos != args) { - *pos = '\0'; - strcpy(ret, args); - copied += strlen(args); - pos++; - } - - strncpy(ret + copied, "subvolid=0", len - copied); - - /* Length of subvolid=0 */ - copied += 10; - - /* - * If there is no , after the subvol= option then we know there's no - * other options and we can just return. 
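A simplified userspace analogue of the rewrite this function performs: drop the "subvol=<name>" token, splice in "subvolid=0", and keep everything else. The two extra bytes in the allocation come from "subvolid=0" being at most two characters longer than the shortest possible "subvol=a"; the sample option string is made up.

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

static char *rewrite_subvol(const char *args)
{
	size_t len = strlen(args) + 2;          /* worst-case growth */
	char *out = calloc(1, len + 1);         /* +1 for the NUL */
	const char *pos = strstr(args, "subvol=");
	const char *rest;

	if (!out)
		return NULL;
	if (!pos) {                             /* nothing to rewrite */
		free(out);
		return NULL;
	}
	memcpy(out, args, pos - args);          /* everything before subvol= */
	strcat(out, "subvolid=0");

	rest = strchr(pos, ',');                /* skip the old subvol value */
	if (rest)
		strcat(out, rest);
	return out;
}

int main(void)
{
	char *s = rewrite_subvol("noatime,subvol=home,compress=lzo");

	if (s) {
		printf("%s\n", s);   /* noatime,subvolid=0,compress=lzo */
		free(s);
	}
	return 0;
}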
- */ - pos = strchr(pos, ','); - if (!pos) - return ret; - - /* Copy the rest of the arguments into our buffer */ - strncpy(ret + copied, pos, len - copied); - copied += strlen(pos); - - return ret; -} - -static struct dentry *mount_subvol(const char *subvol_name, int flags, - const char *device_name, char *data) -{ - struct dentry *root; - struct vfsmount *mnt; - char *newargs; - - newargs = setup_root_args(data); - if (!newargs) - return ERR_PTR(-ENOMEM); - mnt = vfs_kern_mount(&btrfs_fs_type, flags, device_name, - newargs); - kfree(newargs); - if (IS_ERR(mnt)) - return ERR_CAST(mnt); - - root = mount_subtree(mnt, subvol_name); - - if (!IS_ERR(root) && !is_subvolume_inode(root->d_inode)) { - struct super_block *s = root->d_sb; - dput(root); - root = ERR_PTR(-EINVAL); - deactivate_locked_super(s); - printk(KERN_ERR "btrfs: '%s' is not a valid subvolume\n", - subvol_name); - } - - return root; -} - -/* - * Find a superblock for the given device / mount point. - * - * Note: This is based on get_sb_bdev from fs/super.c with a few additions - * for multiple device setup. Make sure to keep it in sync. - */ -static struct dentry *btrfs_mount(struct file_system_type *fs_type, int flags, - const char *device_name, void *data) -{ - struct block_device *bdev = NULL; - struct super_block *s; - struct dentry *root; - struct btrfs_fs_devices *fs_devices = NULL; - struct btrfs_fs_info *fs_info = NULL; - fmode_t mode = FMODE_READ; - char *subvol_name = NULL; - u64 subvol_objectid = 0; - u64 subvol_rootid = 0; - int error = 0; - - if (!(flags & MS_RDONLY)) - mode |= FMODE_WRITE; - - error = btrfs_parse_early_options(data, mode, fs_type, - &subvol_name, &subvol_objectid, - &subvol_rootid, &fs_devices); - if (error) { - kfree(subvol_name); - return ERR_PTR(error); - } - - if (subvol_name) { - root = mount_subvol(subvol_name, flags, device_name, data); - kfree(subvol_name); - return root; - } - - error = btrfs_scan_one_device(device_name, mode, fs_type, &fs_devices); - if (error) - return ERR_PTR(error); - - /* - * Setup a dummy root and fs_info for test/set super. This is because - * we don't actually fill this stuff out until open_ctree, but we need - * it for searching for existing supers, so this lets us do that and - * then open_ctree will properly initialize everything later. - */ - fs_info = kzalloc(sizeof(struct btrfs_fs_info), GFP_NOFS); - if (!fs_info) - return ERR_PTR(-ENOMEM); - - fs_info->fs_devices = fs_devices; - - fs_info->super_copy = kzalloc(BTRFS_SUPER_INFO_SIZE, GFP_NOFS); - fs_info->super_for_commit = kzalloc(BTRFS_SUPER_INFO_SIZE, GFP_NOFS); - if (!fs_info->super_copy || !fs_info->super_for_commit) { - error = -ENOMEM; - goto error_fs_info; - } - - error = btrfs_open_devices(fs_devices, mode, fs_type); - if (error) - goto error_fs_info; - - if (!(flags & MS_RDONLY) && fs_devices->rw_devices == 0) { - error = -EACCES; - goto error_close_devices; - } - - bdev = fs_devices->latest_bdev; - s = sget(fs_type, btrfs_test_super, btrfs_set_super, fs_info); - if (IS_ERR(s)) { - error = PTR_ERR(s); - goto error_close_devices; - } - - if (s->s_root) { - btrfs_close_devices(fs_devices); - free_fs_info(fs_info); - if ((flags ^ s->s_flags) & MS_RDONLY) - error = -EBUSY; - } else { - char b[BDEVNAME_SIZE]; - - s->s_flags = flags | MS_NOSEC; - strlcpy(s->s_id, bdevname(bdev, b), sizeof(s->s_id)); - btrfs_sb(s)->bdev_holder = fs_type; - error = btrfs_fill_super(s, fs_devices, data, - flags & MS_SILENT ? 1 : 0); - } - - root = !error ? 
get_default_root(s, subvol_objectid) : ERR_PTR(error); - if (IS_ERR(root)) - deactivate_locked_super(s); - - return root; - -error_close_devices: - btrfs_close_devices(fs_devices); -error_fs_info: - free_fs_info(fs_info); - return ERR_PTR(error); -} - -static int btrfs_remount(struct super_block *sb, int *flags, char *data) -{ - struct btrfs_fs_info *fs_info = btrfs_sb(sb); - struct btrfs_root *root = fs_info->tree_root; - unsigned old_flags = sb->s_flags; - unsigned long old_opts = fs_info->mount_opt; - unsigned long old_compress_type = fs_info->compress_type; - u64 old_max_inline = fs_info->max_inline; - u64 old_alloc_start = fs_info->alloc_start; - int old_thread_pool_size = fs_info->thread_pool_size; - unsigned int old_metadata_ratio = fs_info->metadata_ratio; - int ret; - - ret = btrfs_parse_options(root, data); - if (ret) { - ret = -EINVAL; - goto restore; - } - - if ((*flags & MS_RDONLY) == (sb->s_flags & MS_RDONLY)) - return 0; - - if (*flags & MS_RDONLY) { - sb->s_flags |= MS_RDONLY; - - ret = btrfs_commit_super(root); - if (ret) - goto restore; - } else { - if (fs_info->fs_devices->rw_devices == 0) { - ret = -EACCES; - goto restore; - } - - if (btrfs_super_log_root(fs_info->super_copy) != 0) { - ret = -EINVAL; - goto restore; - } - - ret = btrfs_cleanup_fs_roots(fs_info); - if (ret) - goto restore; - - /* recover relocation */ - ret = btrfs_recover_relocation(root); - if (ret) - goto restore; - - sb->s_flags &= ~MS_RDONLY; - } - - return 0; - -restore: - /* We've hit an error - don't reset MS_RDONLY */ - if (sb->s_flags & MS_RDONLY) - old_flags |= MS_RDONLY; - sb->s_flags = old_flags; - fs_info->mount_opt = old_opts; - fs_info->compress_type = old_compress_type; - fs_info->max_inline = old_max_inline; - fs_info->alloc_start = old_alloc_start; - fs_info->thread_pool_size = old_thread_pool_size; - fs_info->metadata_ratio = old_metadata_ratio; - return ret; -} - -/* Used to sort the devices by max_avail(descending sort) */ -static int btrfs_cmp_device_free_bytes(const void *dev_info1, - const void *dev_info2) -{ - if (((struct btrfs_device_info *)dev_info1)->max_avail > - ((struct btrfs_device_info *)dev_info2)->max_avail) - return -1; - else if (((struct btrfs_device_info *)dev_info1)->max_avail < - ((struct btrfs_device_info *)dev_info2)->max_avail) - return 1; - else - return 0; -} - -/* - * sort the devices by max_avail, in which max free extent size of each device - * is stored.(Descending Sort) - */ -static inline void btrfs_descending_sort_devices( - struct btrfs_device_info *devices, - size_t nr_devices) -{ - sort(devices, nr_devices, sizeof(struct btrfs_device_info), - btrfs_cmp_device_free_bytes, NULL); -} - -/* - * The helper to calc the free space on the devices that can be used to store - * file data. 
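The estimate works by sorting per-device free space in descending order and then repeatedly taking the smallest remaining device and counting its space once per stripe. A minimal userspace sketch of that greedy accounting, with made-up device sizes and a two-stripe (RAID1-like) profile; the kernel's skip_space reservation and stripe-length alignment are omitted here:

#include <stdio.h>
#include <stdint.h>
#include <stdlib.h>

static int cmp_desc(const void *a, const void *b)
{
	uint64_t x = *(const uint64_t *)a, y = *(const uint64_t *)b;

	return (x > y) ? -1 : (x < y) ? 1 : 0;
}

int main(void)
{
	uint64_t avail[] = { 80, 50, 30 };           /* GiB free per device */
	int nr = 3, min_stripes = 2, num_stripes = 2;
	uint64_t total = 0;                          /* raw bytes the profile can place */
	int i;

	qsort(avail, nr, sizeof(avail[0]), cmp_desc);

	i = nr - 1;                                  /* smallest device first */
	while (nr >= min_stripes) {
		if (num_stripes > nr)
			num_stripes = nr;
		if (avail[i]) {
			uint64_t chunk = avail[i];
			int j;

			total += chunk * num_stripes;
			/* that much is now spoken for on each stripe member */
			for (j = i + 1 - num_stripes; j <= i; j++)
				avail[j] -= chunk;
		}
		i--;
		nr--;
	}
	printf("raw space still allocatable for data: %llu GiB\n",
	       (unsigned long long)total);           /* 100 for this input */
	return 0;
}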
- */ -static int btrfs_calc_avail_data_space(struct btrfs_root *root, u64 *free_bytes) -{ - struct btrfs_fs_info *fs_info = root->fs_info; - struct btrfs_device_info *devices_info; - struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; - struct btrfs_device *device; - u64 skip_space; - u64 type; - u64 avail_space; - u64 used_space; - u64 min_stripe_size; - int min_stripes = 1, num_stripes = 1; - int i = 0, nr_devices; - int ret; - - nr_devices = fs_info->fs_devices->open_devices; - BUG_ON(!nr_devices); - - devices_info = kmalloc(sizeof(*devices_info) * nr_devices, - GFP_NOFS); - if (!devices_info) - return -ENOMEM; - - /* calc min stripe number for data space alloction */ - type = btrfs_get_alloc_profile(root, 1); - if (type & BTRFS_BLOCK_GROUP_RAID0) { - min_stripes = 2; - num_stripes = nr_devices; - } else if (type & BTRFS_BLOCK_GROUP_RAID1) { - min_stripes = 2; - num_stripes = 2; - } else if (type & BTRFS_BLOCK_GROUP_RAID10) { - min_stripes = 4; - num_stripes = 4; - } - - if (type & BTRFS_BLOCK_GROUP_DUP) - min_stripe_size = 2 * BTRFS_STRIPE_LEN; - else - min_stripe_size = BTRFS_STRIPE_LEN; - - list_for_each_entry(device, &fs_devices->devices, dev_list) { - if (!device->in_fs_metadata || !device->bdev) - continue; - - avail_space = device->total_bytes - device->bytes_used; - - /* align with stripe_len */ - do_div(avail_space, BTRFS_STRIPE_LEN); - avail_space *= BTRFS_STRIPE_LEN; - - /* - * In order to avoid overwritting the superblock on the drive, - * btrfs starts at an offset of at least 1MB when doing chunk - * allocation. - */ - skip_space = 1024 * 1024; - - /* user can set the offset in fs_info->alloc_start. */ - if (fs_info->alloc_start + BTRFS_STRIPE_LEN <= - device->total_bytes) - skip_space = max(fs_info->alloc_start, skip_space); - - /* - * btrfs can not use the free space in [0, skip_space - 1], - * we must subtract it from the total. In order to implement - * it, we account the used space in this range first. - */ - ret = btrfs_account_dev_extents_size(device, 0, skip_space - 1, - &used_space); - if (ret) { - kfree(devices_info); - return ret; - } - - /* calc the free space in [0, skip_space - 1] */ - skip_space -= used_space; - - /* - * we can use the free space in [0, skip_space - 1], subtract - * it from the total. 
- */ - if (avail_space && avail_space >= skip_space) - avail_space -= skip_space; - else - avail_space = 0; - - if (avail_space < min_stripe_size) - continue; - - devices_info[i].dev = device; - devices_info[i].max_avail = avail_space; - - i++; - } - - nr_devices = i; - - btrfs_descending_sort_devices(devices_info, nr_devices); - - i = nr_devices - 1; - avail_space = 0; - while (nr_devices >= min_stripes) { - if (num_stripes > nr_devices) - num_stripes = nr_devices; - - if (devices_info[i].max_avail >= min_stripe_size) { - int j; - u64 alloc_size; - - avail_space += devices_info[i].max_avail * num_stripes; - alloc_size = devices_info[i].max_avail; - for (j = i + 1 - num_stripes; j <= i; j++) - devices_info[j].max_avail -= alloc_size; - } - i--; - nr_devices--; - } - - kfree(devices_info); - *free_bytes = avail_space; - return 0; -} - -static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf) -{ - struct btrfs_fs_info *fs_info = btrfs_sb(dentry->d_sb); - struct btrfs_super_block *disk_super = fs_info->super_copy; - struct list_head *head = &fs_info->space_info; - struct btrfs_space_info *found; - u64 total_used = 0; - u64 total_free_data = 0; - int bits = dentry->d_sb->s_blocksize_bits; - __be32 *fsid = (__be32 *)fs_info->fsid; - int ret; - - /* holding chunk_muext to avoid allocating new chunks */ - mutex_lock(&fs_info->chunk_mutex); - rcu_read_lock(); - list_for_each_entry_rcu(found, head, list) { - if (found->flags & BTRFS_BLOCK_GROUP_DATA) { - total_free_data += found->disk_total - found->disk_used; - total_free_data -= - btrfs_account_ro_block_groups_free_space(found); - } - - total_used += found->disk_used; - } - rcu_read_unlock(); - - buf->f_namelen = BTRFS_NAME_LEN; - buf->f_blocks = btrfs_super_total_bytes(disk_super) >> bits; - buf->f_bfree = buf->f_blocks - (total_used >> bits); - buf->f_bsize = dentry->d_sb->s_blocksize; - buf->f_type = BTRFS_SUPER_MAGIC; - buf->f_bavail = total_free_data; - ret = btrfs_calc_avail_data_space(fs_info->tree_root, &total_free_data); - if (ret) { - mutex_unlock(&fs_info->chunk_mutex); - return ret; - } - buf->f_bavail += total_free_data; - buf->f_bavail = buf->f_bavail >> bits; - mutex_unlock(&fs_info->chunk_mutex); - - /* We treat it as constant endianness (it doesn't matter _which_) - because we want the fsid to come out the same whether mounted - on a big-endian or little-endian host */ - buf->f_fsid.val[0] = be32_to_cpu(fsid[0]) ^ be32_to_cpu(fsid[2]); - buf->f_fsid.val[1] = be32_to_cpu(fsid[1]) ^ be32_to_cpu(fsid[3]); - /* Mask in the root object ID too, to disambiguate subvols */ - buf->f_fsid.val[0] ^= BTRFS_I(dentry->d_inode)->root->objectid >> 32; - buf->f_fsid.val[1] ^= BTRFS_I(dentry->d_inode)->root->objectid; - - return 0; -} - -static void btrfs_kill_super(struct super_block *sb) -{ - struct btrfs_fs_info *fs_info = btrfs_sb(sb); - kill_anon_super(sb); - free_fs_info(fs_info); -} - -static struct file_system_type btrfs_fs_type = { - .owner = THIS_MODULE, - .name = "btrfs", - .mount = btrfs_mount, - .kill_sb = btrfs_kill_super, - .fs_flags = FS_REQUIRES_DEV, -}; - -/* - * used by btrfsctl to scan devices when no FS is mounted - */ -static long btrfs_control_ioctl(struct file *file, unsigned int cmd, - unsigned long arg) -{ - struct btrfs_ioctl_vol_args *vol; - struct btrfs_fs_devices *fs_devices; - int ret = -ENOTTY; - - if (!capable(CAP_SYS_ADMIN)) - return -EPERM; - - vol = memdup_user((void __user *)arg, sizeof(*vol)); - if (IS_ERR(vol)) - return PTR_ERR(vol); - - switch (cmd) { - case BTRFS_IOC_SCAN_DEV: - ret = 
btrfs_scan_one_device(vol->name, FMODE_READ, - &btrfs_fs_type, &fs_devices); - break; - } - - kfree(vol); - return ret; -} - -static int btrfs_freeze(struct super_block *sb) -{ - struct btrfs_fs_info *fs_info = btrfs_sb(sb); - mutex_lock(&fs_info->transaction_kthread_mutex); - mutex_lock(&fs_info->cleaner_mutex); - return 0; -} - -static int btrfs_unfreeze(struct super_block *sb) -{ - struct btrfs_fs_info *fs_info = btrfs_sb(sb); - mutex_unlock(&fs_info->cleaner_mutex); - mutex_unlock(&fs_info->transaction_kthread_mutex); - return 0; -} - -static void btrfs_fs_dirty_inode(struct inode *inode, int flags) -{ - int ret; - - ret = btrfs_dirty_inode(inode); - if (ret) - printk_ratelimited(KERN_ERR "btrfs: fail to dirty inode %Lu " - "error %d\n", btrfs_ino(inode), ret); -} - -static const struct super_operations btrfs_super_ops = { - .drop_inode = btrfs_drop_inode, - .evict_inode = btrfs_evict_inode, - .put_super = btrfs_put_super, - .sync_fs = btrfs_sync_fs, - .show_options = btrfs_show_options, - .write_inode = btrfs_write_inode, - .dirty_inode = btrfs_fs_dirty_inode, - .alloc_inode = btrfs_alloc_inode, - .destroy_inode = btrfs_destroy_inode, - .statfs = btrfs_statfs, - .remount_fs = btrfs_remount, - .freeze_fs = btrfs_freeze, - .unfreeze_fs = btrfs_unfreeze, -}; - -static const struct file_operations btrfs_ctl_fops = { - .unlocked_ioctl = btrfs_control_ioctl, - .compat_ioctl = btrfs_control_ioctl, - .owner = THIS_MODULE, - .llseek = noop_llseek, -}; - -static struct miscdevice btrfs_misc = { - .minor = BTRFS_MINOR, - .name = "btrfs-control", - .fops = &btrfs_ctl_fops -}; - -MODULE_ALIAS_MISCDEV(BTRFS_MINOR); -MODULE_ALIAS("devname:btrfs-control"); - -static int btrfs_interface_init(void) -{ - return misc_register(&btrfs_misc); -} - -static void btrfs_interface_exit(void) -{ - if (misc_deregister(&btrfs_misc) < 0) - printk(KERN_INFO "misc_deregister failed for control device"); -} - -static int __init init_btrfs_fs(void) -{ - int err; - - err = btrfs_init_sysfs(); - if (err) - return err; - - btrfs_init_compress(); - - err = btrfs_init_cachep(); - if (err) - goto free_compress; - - err = extent_io_init(); - if (err) - goto free_cachep; - - err = extent_map_init(); - if (err) - goto free_extent_io; - - err = btrfs_delayed_inode_init(); - if (err) - goto free_extent_map; - - err = btrfs_interface_init(); - if (err) - goto free_delayed_inode; - - err = register_filesystem(&btrfs_fs_type); - if (err) - goto unregister_ioctl; - - btrfs_init_lockdep(); - - printk(KERN_INFO "%s loaded\n", BTRFS_BUILD_VERSION); - return 0; - -unregister_ioctl: - btrfs_interface_exit(); -free_delayed_inode: - btrfs_delayed_inode_exit(); -free_extent_map: - extent_map_exit(); -free_extent_io: - extent_io_exit(); -free_cachep: - btrfs_destroy_cachep(); -free_compress: - btrfs_exit_compress(); - btrfs_exit_sysfs(); - return err; -} - -static void __exit exit_btrfs_fs(void) -{ - btrfs_destroy_cachep(); - btrfs_delayed_inode_exit(); - extent_map_exit(); - extent_io_exit(); - btrfs_interface_exit(); - unregister_filesystem(&btrfs_fs_type); - btrfs_exit_sysfs(); - btrfs_cleanup_fs_uuids(); - btrfs_exit_compress(); -} - -module_init(init_btrfs_fs) -module_exit(exit_btrfs_fs) - -MODULE_LICENSE("GPL"); diff --git a/ANDROID_3.4.5/fs/btrfs/sysfs.c b/ANDROID_3.4.5/fs/btrfs/sysfs.c deleted file mode 100644 index daac9ae6..00000000 --- a/ANDROID_3.4.5/fs/btrfs/sysfs.c +++ /dev/null @@ -1,46 +0,0 @@ -/* - * Copyright (C) 2007 Oracle. All rights reserved. 
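The module init sequence above uses the usual staged-init/goto-unwind shape: each stage that fails jumps to a label that tears down only the stages that already succeeded, in reverse order. A minimal sketch of that shape with made-up stage names:

#include <stdio.h>

static int stage_a_init(void) { return 0; }
static void stage_a_exit(void) { }
static int stage_b_init(void) { return 0; }
static void stage_b_exit(void) { }
static int stage_c_init(void) { return -1; }   /* pretend this stage fails */

static int demo_init(void)
{
	int err;

	err = stage_a_init();
	if (err)
		return err;

	err = stage_b_init();
	if (err)
		goto out_a;

	err = stage_c_init();
	if (err)
		goto out_b;

	return 0;

out_b:
	stage_b_exit();          /* unwind in reverse order of setup */
out_a:
	stage_a_exit();
	return err;
}

int main(void)
{
	printf("demo_init() = %d\n", demo_init());   /* -1: c failed, a and b undone */
	return 0;
}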
- * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public - * License v2 as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public - * License along with this program; if not, write to the - * Free Software Foundation, Inc., 59 Temple Place - Suite 330, - * Boston, MA 021110-1307, USA. - */ - -#include <linux/sched.h> -#include <linux/slab.h> -#include <linux/spinlock.h> -#include <linux/completion.h> -#include <linux/buffer_head.h> -#include <linux/module.h> -#include <linux/kobject.h> - -#include "ctree.h" -#include "disk-io.h" -#include "transaction.h" - -/* /sys/fs/btrfs/ entry */ -static struct kset *btrfs_kset; - -int btrfs_init_sysfs(void) -{ - btrfs_kset = kset_create_and_add("btrfs", NULL, fs_kobj); - if (!btrfs_kset) - return -ENOMEM; - return 0; -} - -void btrfs_exit_sysfs(void) -{ - kset_unregister(btrfs_kset); -} - diff --git a/ANDROID_3.4.5/fs/btrfs/transaction.c b/ANDROID_3.4.5/fs/btrfs/transaction.c deleted file mode 100644 index 36422254..00000000 --- a/ANDROID_3.4.5/fs/btrfs/transaction.c +++ /dev/null @@ -1,1539 +0,0 @@ -/* - * Copyright (C) 2007 Oracle. All rights reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public - * License v2 as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public - * License along with this program; if not, write to the - * Free Software Foundation, Inc., 59 Temple Place - Suite 330, - * Boston, MA 021110-1307, USA. - */ - -#include <linux/fs.h> -#include <linux/slab.h> -#include <linux/sched.h> -#include <linux/writeback.h> -#include <linux/pagemap.h> -#include <linux/blkdev.h> -#include "ctree.h" -#include "disk-io.h" -#include "transaction.h" -#include "locking.h" -#include "tree-log.h" -#include "inode-map.h" - -#define BTRFS_ROOT_TRANS_TAG 0 - -void put_transaction(struct btrfs_transaction *transaction) -{ - WARN_ON(atomic_read(&transaction->use_count) == 0); - if (atomic_dec_and_test(&transaction->use_count)) { - BUG_ON(!list_empty(&transaction->list)); - WARN_ON(transaction->delayed_refs.root.rb_node); - WARN_ON(!list_empty(&transaction->delayed_refs.seq_head)); - memset(transaction, 0, sizeof(*transaction)); - kmem_cache_free(btrfs_transaction_cachep, transaction); - } -} - -static noinline void switch_commit_root(struct btrfs_root *root) -{ - free_extent_buffer(root->commit_root); - root->commit_root = btrfs_root_node(root); -} - -/* - * either allocate a new transaction or hop into the existing one - */ -static noinline int join_transaction(struct btrfs_root *root, int nofail) -{ - struct btrfs_transaction *cur_trans; - - spin_lock(&root->fs_info->trans_lock); -loop: - /* The file system has been taken offline. No new transactions. 
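join_transaction() below drops trans_lock to allocate a new transaction, then retakes the lock and discards the allocation if another writer installed one in the meantime. A minimal pthread sketch of that allocate-unlocked, recheck-under-lock pattern; it simply adopts the racing winner's object, whereas the kernel code loops back and redoes its checks:

#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

struct txn { int id; };

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static struct txn *running;

static struct txn *join_or_create(void)
{
	struct txn *t;

	pthread_mutex_lock(&lock);
	if (running) {                    /* fast path: join the running one */
		t = running;
		pthread_mutex_unlock(&lock);
		return t;
	}
	pthread_mutex_unlock(&lock);

	t = calloc(1, sizeof(*t));        /* allocate without holding the lock */
	if (!t)
		return NULL;

	pthread_mutex_lock(&lock);
	if (running) {                    /* somebody beat us to it */
		free(t);
		t = running;
	} else {
		t->id = 1;
		running = t;
	}
	pthread_mutex_unlock(&lock);
	return t;
}

int main(void)
{
	struct txn *t = join_or_create();

	if (t)
		printf("joined txn %d\n", t->id);
	return 0;
}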
*/ - if (root->fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR) { - spin_unlock(&root->fs_info->trans_lock); - return -EROFS; - } - - if (root->fs_info->trans_no_join) { - if (!nofail) { - spin_unlock(&root->fs_info->trans_lock); - return -EBUSY; - } - } - - cur_trans = root->fs_info->running_transaction; - if (cur_trans) { - if (cur_trans->aborted) { - spin_unlock(&root->fs_info->trans_lock); - return cur_trans->aborted; - } - atomic_inc(&cur_trans->use_count); - atomic_inc(&cur_trans->num_writers); - cur_trans->num_joined++; - spin_unlock(&root->fs_info->trans_lock); - return 0; - } - spin_unlock(&root->fs_info->trans_lock); - - cur_trans = kmem_cache_alloc(btrfs_transaction_cachep, GFP_NOFS); - if (!cur_trans) - return -ENOMEM; - - spin_lock(&root->fs_info->trans_lock); - if (root->fs_info->running_transaction) { - /* - * someone started a transaction after we unlocked. Make sure - * to redo the trans_no_join checks above - */ - kmem_cache_free(btrfs_transaction_cachep, cur_trans); - cur_trans = root->fs_info->running_transaction; - goto loop; - } - - atomic_set(&cur_trans->num_writers, 1); - cur_trans->num_joined = 0; - init_waitqueue_head(&cur_trans->writer_wait); - init_waitqueue_head(&cur_trans->commit_wait); - cur_trans->in_commit = 0; - cur_trans->blocked = 0; - /* - * One for this trans handle, one so it will live on until we - * commit the transaction. - */ - atomic_set(&cur_trans->use_count, 2); - cur_trans->commit_done = 0; - cur_trans->start_time = get_seconds(); - - cur_trans->delayed_refs.root = RB_ROOT; - cur_trans->delayed_refs.num_entries = 0; - cur_trans->delayed_refs.num_heads_ready = 0; - cur_trans->delayed_refs.num_heads = 0; - cur_trans->delayed_refs.flushing = 0; - cur_trans->delayed_refs.run_delayed_start = 0; - cur_trans->delayed_refs.seq = 1; - init_waitqueue_head(&cur_trans->delayed_refs.seq_wait); - spin_lock_init(&cur_trans->commit_lock); - spin_lock_init(&cur_trans->delayed_refs.lock); - INIT_LIST_HEAD(&cur_trans->delayed_refs.seq_head); - - INIT_LIST_HEAD(&cur_trans->pending_snapshots); - list_add_tail(&cur_trans->list, &root->fs_info->trans_list); - extent_io_tree_init(&cur_trans->dirty_pages, - root->fs_info->btree_inode->i_mapping); - root->fs_info->generation++; - cur_trans->transid = root->fs_info->generation; - root->fs_info->running_transaction = cur_trans; - cur_trans->aborted = 0; - spin_unlock(&root->fs_info->trans_lock); - - return 0; -} - -/* - * this does all the record keeping required to make sure that a reference - * counted root is properly recorded in a given transaction. 
This is required - * to make sure the old root from before we joined the transaction is deleted - * when the transaction commits - */ -static int record_root_in_trans(struct btrfs_trans_handle *trans, - struct btrfs_root *root) -{ - if (root->ref_cows && root->last_trans < trans->transid) { - WARN_ON(root == root->fs_info->extent_root); - WARN_ON(root->commit_root != root->node); - - /* - * see below for in_trans_setup usage rules - * we have the reloc mutex held now, so there - * is only one writer in this function - */ - root->in_trans_setup = 1; - - /* make sure readers find in_trans_setup before - * they find our root->last_trans update - */ - smp_wmb(); - - spin_lock(&root->fs_info->fs_roots_radix_lock); - if (root->last_trans == trans->transid) { - spin_unlock(&root->fs_info->fs_roots_radix_lock); - return 0; - } - radix_tree_tag_set(&root->fs_info->fs_roots_radix, - (unsigned long)root->root_key.objectid, - BTRFS_ROOT_TRANS_TAG); - spin_unlock(&root->fs_info->fs_roots_radix_lock); - root->last_trans = trans->transid; - - /* this is pretty tricky. We don't want to - * take the relocation lock in btrfs_record_root_in_trans - * unless we're really doing the first setup for this root in - * this transaction. - * - * Normally we'd use root->last_trans as a flag to decide - * if we want to take the expensive mutex. - * - * But, we have to set root->last_trans before we - * init the relocation root, otherwise, we trip over warnings - * in ctree.c. The solution used here is to flag ourselves - * with root->in_trans_setup. When this is 1, we're still - * fixing up the reloc trees and everyone must wait. - * - * When this is zero, they can trust root->last_trans and fly - * through btrfs_record_root_in_trans without having to take the - * lock. smp_wmb() makes sure that all the writes above are - * done before we pop in the zero below - */ - btrfs_init_reloc_root(trans, root); - smp_wmb(); - root->in_trans_setup = 0; - } - return 0; -} - - -int btrfs_record_root_in_trans(struct btrfs_trans_handle *trans, - struct btrfs_root *root) -{ - if (!root->ref_cows) - return 0; - - /* - * see record_root_in_trans for comments about in_trans_setup usage - * and barriers - */ - smp_rmb(); - if (root->last_trans == trans->transid && - !root->in_trans_setup) - return 0; - - mutex_lock(&root->fs_info->reloc_mutex); - record_root_in_trans(trans, root); - mutex_unlock(&root->fs_info->reloc_mutex); - - return 0; -} - -/* wait for commit against the current transaction to become unblocked - * when this is done, it is safe to start a new transaction, but the current - * transaction might not be fully on disk. 
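The in_trans_setup handshake above is a publish/subscribe ordering: all setup stores must be visible before the flag is cleared, and a reader that sees the flag cleared must also see the setup. A minimal userspace sketch of that pairing using C11 release/acquire atomics in place of smp_wmb()/smp_rmb(); the variable names are illustrative:

#include <stdatomic.h>
#include <pthread.h>
#include <stdio.h>

static long last_trans;                 /* ordinary data being published */
static atomic_int in_setup = 1;         /* 1 while setup is still running */

static void *writer(void *arg)
{
	last_trans = 42;                                        /* do the setup */
	atomic_store_explicit(&in_setup, 0, memory_order_release);
	return NULL;
}

static void *reader(void *arg)
{
	while (atomic_load_explicit(&in_setup, memory_order_acquire))
		;                                               /* wait for publication */
	printf("saw last_trans = %ld\n", last_trans);           /* guaranteed 42 */
	return NULL;
}

int main(void)
{
	pthread_t w, r;

	pthread_create(&r, NULL, reader, NULL);
	pthread_create(&w, NULL, writer, NULL);
	pthread_join(w, NULL);
	pthread_join(r, NULL);
	return 0;
}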
- */ -static void wait_current_trans(struct btrfs_root *root) -{ - struct btrfs_transaction *cur_trans; - - spin_lock(&root->fs_info->trans_lock); - cur_trans = root->fs_info->running_transaction; - if (cur_trans && cur_trans->blocked) { - atomic_inc(&cur_trans->use_count); - spin_unlock(&root->fs_info->trans_lock); - - wait_event(root->fs_info->transaction_wait, - !cur_trans->blocked); - put_transaction(cur_trans); - } else { - spin_unlock(&root->fs_info->trans_lock); - } -} - -enum btrfs_trans_type { - TRANS_START, - TRANS_JOIN, - TRANS_USERSPACE, - TRANS_JOIN_NOLOCK, -}; - -static int may_wait_transaction(struct btrfs_root *root, int type) -{ - if (root->fs_info->log_root_recovering) - return 0; - - if (type == TRANS_USERSPACE) - return 1; - - if (type == TRANS_START && - !atomic_read(&root->fs_info->open_ioctl_trans)) - return 1; - - return 0; -} - -static struct btrfs_trans_handle *start_transaction(struct btrfs_root *root, - u64 num_items, int type) -{ - struct btrfs_trans_handle *h; - struct btrfs_transaction *cur_trans; - u64 num_bytes = 0; - int ret; - - if (root->fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR) - return ERR_PTR(-EROFS); - - if (current->journal_info) { - WARN_ON(type != TRANS_JOIN && type != TRANS_JOIN_NOLOCK); - h = current->journal_info; - h->use_count++; - h->orig_rsv = h->block_rsv; - h->block_rsv = NULL; - goto got_it; - } - - /* - * Do the reservation before we join the transaction so we can do all - * the appropriate flushing if need be. - */ - if (num_items > 0 && root != root->fs_info->chunk_root) { - num_bytes = btrfs_calc_trans_metadata_size(root, num_items); - ret = btrfs_block_rsv_add(root, - &root->fs_info->trans_block_rsv, - num_bytes); - if (ret) - return ERR_PTR(ret); - } -again: - h = kmem_cache_alloc(btrfs_trans_handle_cachep, GFP_NOFS); - if (!h) - return ERR_PTR(-ENOMEM); - - if (may_wait_transaction(root, type)) - wait_current_trans(root); - - do { - ret = join_transaction(root, type == TRANS_JOIN_NOLOCK); - if (ret == -EBUSY) - wait_current_trans(root); - } while (ret == -EBUSY); - - if (ret < 0) { - kmem_cache_free(btrfs_trans_handle_cachep, h); - return ERR_PTR(ret); - } - - cur_trans = root->fs_info->running_transaction; - - h->transid = cur_trans->transid; - h->transaction = cur_trans; - h->blocks_used = 0; - h->bytes_reserved = 0; - h->delayed_ref_updates = 0; - h->use_count = 1; - h->block_rsv = NULL; - h->orig_rsv = NULL; - h->aborted = 0; - - smp_mb(); - if (cur_trans->blocked && may_wait_transaction(root, type)) { - btrfs_commit_transaction(h, root); - goto again; - } - - if (num_bytes) { - trace_btrfs_space_reservation(root->fs_info, "transaction", - h->transid, num_bytes, 1); - h->block_rsv = &root->fs_info->trans_block_rsv; - h->bytes_reserved = num_bytes; - } - -got_it: - btrfs_record_root_in_trans(h, root); - - if (!current->journal_info && type != TRANS_USERSPACE) - current->journal_info = h; - return h; -} - -struct btrfs_trans_handle *btrfs_start_transaction(struct btrfs_root *root, - int num_items) -{ - return start_transaction(root, num_items, TRANS_START); -} -struct btrfs_trans_handle *btrfs_join_transaction(struct btrfs_root *root) -{ - return start_transaction(root, 0, TRANS_JOIN); -} - -struct btrfs_trans_handle *btrfs_join_transaction_nolock(struct btrfs_root *root) -{ - return start_transaction(root, 0, TRANS_JOIN_NOLOCK); -} - -struct btrfs_trans_handle *btrfs_start_ioctl_transaction(struct btrfs_root *root) -{ - return start_transaction(root, 0, TRANS_USERSPACE); -} - -/* wait for a transaction commit to be fully 
complete */ -static noinline void wait_for_commit(struct btrfs_root *root, - struct btrfs_transaction *commit) -{ - wait_event(commit->commit_wait, commit->commit_done); -} - -int btrfs_wait_for_commit(struct btrfs_root *root, u64 transid) -{ - struct btrfs_transaction *cur_trans = NULL, *t; - int ret; - - ret = 0; - if (transid) { - if (transid <= root->fs_info->last_trans_committed) - goto out; - - /* find specified transaction */ - spin_lock(&root->fs_info->trans_lock); - list_for_each_entry(t, &root->fs_info->trans_list, list) { - if (t->transid == transid) { - cur_trans = t; - atomic_inc(&cur_trans->use_count); - break; - } - if (t->transid > transid) - break; - } - spin_unlock(&root->fs_info->trans_lock); - ret = -EINVAL; - if (!cur_trans) - goto out; /* bad transid */ - } else { - /* find newest transaction that is committing | committed */ - spin_lock(&root->fs_info->trans_lock); - list_for_each_entry_reverse(t, &root->fs_info->trans_list, - list) { - if (t->in_commit) { - if (t->commit_done) - break; - cur_trans = t; - atomic_inc(&cur_trans->use_count); - break; - } - } - spin_unlock(&root->fs_info->trans_lock); - if (!cur_trans) - goto out; /* nothing committing|committed */ - } - - wait_for_commit(root, cur_trans); - - put_transaction(cur_trans); - ret = 0; -out: - return ret; -} - -void btrfs_throttle(struct btrfs_root *root) -{ - if (!atomic_read(&root->fs_info->open_ioctl_trans)) - wait_current_trans(root); -} - -static int should_end_transaction(struct btrfs_trans_handle *trans, - struct btrfs_root *root) -{ - int ret; - - ret = btrfs_block_rsv_check(root, &root->fs_info->global_block_rsv, 5); - return ret ? 1 : 0; -} - -int btrfs_should_end_transaction(struct btrfs_trans_handle *trans, - struct btrfs_root *root) -{ - struct btrfs_transaction *cur_trans = trans->transaction; - struct btrfs_block_rsv *rsv = trans->block_rsv; - int updates; - int err; - - smp_mb(); - if (cur_trans->blocked || cur_trans->delayed_refs.flushing) - return 1; - - /* - * We need to do this in case we're deleting csums so the global block - * rsv get's used instead of the csum block rsv. - */ - trans->block_rsv = NULL; - - updates = trans->delayed_ref_updates; - trans->delayed_ref_updates = 0; - if (updates) { - err = btrfs_run_delayed_refs(trans, root, updates); - if (err) /* Error code will also eval true */ - return err; - } - - trans->block_rsv = rsv; - - return should_end_transaction(trans, root); -} - -static int __btrfs_end_transaction(struct btrfs_trans_handle *trans, - struct btrfs_root *root, int throttle, int lock) -{ - struct btrfs_transaction *cur_trans = trans->transaction; - struct btrfs_fs_info *info = root->fs_info; - int count = 0; - int err = 0; - - if (--trans->use_count) { - trans->block_rsv = trans->orig_rsv; - return 0; - } - - btrfs_trans_release_metadata(trans, root); - trans->block_rsv = NULL; - while (count < 2) { - unsigned long cur = trans->delayed_ref_updates; - trans->delayed_ref_updates = 0; - if (cur && - trans->transaction->delayed_refs.num_heads_ready > 64) { - trans->delayed_ref_updates = 0; - btrfs_run_delayed_refs(trans, root, cur); - } else { - break; - } - count++; - } - - if (lock && !atomic_read(&root->fs_info->open_ioctl_trans) && - should_end_transaction(trans, root)) { - trans->transaction->blocked = 1; - smp_wmb(); - } - - if (lock && cur_trans->blocked && !cur_trans->in_commit) { - if (throttle) { - /* - * We may race with somebody else here so end up having - * to call end_transaction on ourselves again, so inc - * our use_count. 
- */ - trans->use_count++; - return btrfs_commit_transaction(trans, root); - } else { - wake_up_process(info->transaction_kthread); - } - } - - WARN_ON(cur_trans != info->running_transaction); - WARN_ON(atomic_read(&cur_trans->num_writers) < 1); - atomic_dec(&cur_trans->num_writers); - - smp_mb(); - if (waitqueue_active(&cur_trans->writer_wait)) - wake_up(&cur_trans->writer_wait); - put_transaction(cur_trans); - - if (current->journal_info == trans) - current->journal_info = NULL; - - if (throttle) - btrfs_run_delayed_iputs(root); - - if (trans->aborted || - root->fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR) { - err = -EIO; - } - - memset(trans, 0, sizeof(*trans)); - kmem_cache_free(btrfs_trans_handle_cachep, trans); - return err; -} - -int btrfs_end_transaction(struct btrfs_trans_handle *trans, - struct btrfs_root *root) -{ - int ret; - - ret = __btrfs_end_transaction(trans, root, 0, 1); - if (ret) - return ret; - return 0; -} - -int btrfs_end_transaction_throttle(struct btrfs_trans_handle *trans, - struct btrfs_root *root) -{ - int ret; - - ret = __btrfs_end_transaction(trans, root, 1, 1); - if (ret) - return ret; - return 0; -} - -int btrfs_end_transaction_nolock(struct btrfs_trans_handle *trans, - struct btrfs_root *root) -{ - int ret; - - ret = __btrfs_end_transaction(trans, root, 0, 0); - if (ret) - return ret; - return 0; -} - -int btrfs_end_transaction_dmeta(struct btrfs_trans_handle *trans, - struct btrfs_root *root) -{ - return __btrfs_end_transaction(trans, root, 1, 1); -} - -/* - * when btree blocks are allocated, they have some corresponding bits set for - * them in one of two extent_io trees. This is used to make sure all of - * those extents are sent to disk but does not wait on them - */ -int btrfs_write_marked_extents(struct btrfs_root *root, - struct extent_io_tree *dirty_pages, int mark) -{ - int err = 0; - int werr = 0; - struct address_space *mapping = root->fs_info->btree_inode->i_mapping; - u64 start = 0; - u64 end; - - while (!find_first_extent_bit(dirty_pages, start, &start, &end, - mark)) { - convert_extent_bit(dirty_pages, start, end, EXTENT_NEED_WAIT, mark, - GFP_NOFS); - err = filemap_fdatawrite_range(mapping, start, end); - if (err) - werr = err; - cond_resched(); - start = end + 1; - } - if (err) - werr = err; - return werr; -} - -/* - * when btree blocks are allocated, they have some corresponding bits set for - * them in one of two extent_io trees. This is used to make sure all of - * those extents are on disk for transaction or log commit. We wait - * on all the pages and clear them from the dirty pages state tree - */ -int btrfs_wait_marked_extents(struct btrfs_root *root, - struct extent_io_tree *dirty_pages, int mark) -{ - int err = 0; - int werr = 0; - struct address_space *mapping = root->fs_info->btree_inode->i_mapping; - u64 start = 0; - u64 end; - - while (!find_first_extent_bit(dirty_pages, start, &start, &end, - EXTENT_NEED_WAIT)) { - clear_extent_bits(dirty_pages, start, end, EXTENT_NEED_WAIT, GFP_NOFS); - err = filemap_fdatawait_range(mapping, start, end); - if (err) - werr = err; - cond_resched(); - start = end + 1; - } - if (err) - werr = err; - return werr; -} - -/* - * when btree blocks are allocated, they have some corresponding bits set for - * them in one of two extent_io trees. 
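/*
 * Illustrative sketch (editor's addition, not part of the deleted file):
 * the two helpers above are meant to be used as a pair, submitting
 * writeback for every extent carrying 'mark' first and only then
 * waiting, so the device sees one large batch instead of a write/wait
 * cycle per extent.  The combined helper defined a little further down
 * does exactly this; a caller flushing a transaction's dirty btree
 * pages would effectively do:
 */
static int example_flush_dirty_btree(struct btrfs_root *root,
				     struct extent_io_tree *dirty_pages)
{
	int ret = btrfs_write_marked_extents(root, dirty_pages, EXTENT_DIRTY);
	int ret2 = btrfs_wait_marked_extents(root, dirty_pages, EXTENT_DIRTY);

	return ret ? ret : ret2;
}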
This is used to make sure all of - * those extents are on disk for transaction or log commit - */ -int btrfs_write_and_wait_marked_extents(struct btrfs_root *root, - struct extent_io_tree *dirty_pages, int mark) -{ - int ret; - int ret2; - - ret = btrfs_write_marked_extents(root, dirty_pages, mark); - ret2 = btrfs_wait_marked_extents(root, dirty_pages, mark); - - if (ret) - return ret; - if (ret2) - return ret2; - return 0; -} - -int btrfs_write_and_wait_transaction(struct btrfs_trans_handle *trans, - struct btrfs_root *root) -{ - if (!trans || !trans->transaction) { - struct inode *btree_inode; - btree_inode = root->fs_info->btree_inode; - return filemap_write_and_wait(btree_inode->i_mapping); - } - return btrfs_write_and_wait_marked_extents(root, - &trans->transaction->dirty_pages, - EXTENT_DIRTY); -} - -/* - * this is used to update the root pointer in the tree of tree roots. - * - * But, in the case of the extent allocation tree, updating the root - * pointer may allocate blocks which may change the root of the extent - * allocation tree. - * - * So, this loops and repeats and makes sure the cowonly root didn't - * change while the root pointer was being updated in the metadata. - */ -static int update_cowonly_root(struct btrfs_trans_handle *trans, - struct btrfs_root *root) -{ - int ret; - u64 old_root_bytenr; - u64 old_root_used; - struct btrfs_root *tree_root = root->fs_info->tree_root; - - old_root_used = btrfs_root_used(&root->root_item); - btrfs_write_dirty_block_groups(trans, root); - - while (1) { - old_root_bytenr = btrfs_root_bytenr(&root->root_item); - if (old_root_bytenr == root->node->start && - old_root_used == btrfs_root_used(&root->root_item)) - break; - - btrfs_set_root_node(&root->root_item, root->node); - ret = btrfs_update_root(trans, tree_root, - &root->root_key, - &root->root_item); - if (ret) - return ret; - - old_root_used = btrfs_root_used(&root->root_item); - ret = btrfs_write_dirty_block_groups(trans, root); - if (ret) - return ret; - } - - if (root != root->fs_info->extent_root) - switch_commit_root(root); - - return 0; -} - -/* - * update all the cowonly tree roots on disk - * - * The error handling in this function may not be obvious. Any of the - * failures will cause the file system to go offline. We still need - * to clean up the delayed refs. - */ -static noinline int commit_cowonly_roots(struct btrfs_trans_handle *trans, - struct btrfs_root *root) -{ - struct btrfs_fs_info *fs_info = root->fs_info; - struct list_head *next; - struct extent_buffer *eb; - int ret; - - ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1); - if (ret) - return ret; - - eb = btrfs_lock_root_node(fs_info->tree_root); - ret = btrfs_cow_block(trans, fs_info->tree_root, eb, NULL, - 0, &eb); - btrfs_tree_unlock(eb); - free_extent_buffer(eb); - - if (ret) - return ret; - - ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1); - if (ret) - return ret; - - while (!list_empty(&fs_info->dirty_cowonly_roots)) { - next = fs_info->dirty_cowonly_roots.next; - list_del_init(next); - root = list_entry(next, struct btrfs_root, dirty_list); - - ret = update_cowonly_root(trans, root); - if (ret) - return ret; - } - - down_write(&fs_info->extent_commit_sem); - switch_commit_root(fs_info->extent_root); - up_write(&fs_info->extent_commit_sem); - - return 0; -} - -/* - * dead roots are old snapshots that need to be deleted. 
This allocates - * a dirty root struct and adds it into the list of dead roots that need to - * be deleted - */ -int btrfs_add_dead_root(struct btrfs_root *root) -{ - spin_lock(&root->fs_info->trans_lock); - list_add(&root->root_list, &root->fs_info->dead_roots); - spin_unlock(&root->fs_info->trans_lock); - return 0; -} - -/* - * update all the cowonly tree roots on disk - */ -static noinline int commit_fs_roots(struct btrfs_trans_handle *trans, - struct btrfs_root *root) -{ - struct btrfs_root *gang[8]; - struct btrfs_fs_info *fs_info = root->fs_info; - int i; - int ret; - int err = 0; - - spin_lock(&fs_info->fs_roots_radix_lock); - while (1) { - ret = radix_tree_gang_lookup_tag(&fs_info->fs_roots_radix, - (void **)gang, 0, - ARRAY_SIZE(gang), - BTRFS_ROOT_TRANS_TAG); - if (ret == 0) - break; - for (i = 0; i < ret; i++) { - root = gang[i]; - radix_tree_tag_clear(&fs_info->fs_roots_radix, - (unsigned long)root->root_key.objectid, - BTRFS_ROOT_TRANS_TAG); - spin_unlock(&fs_info->fs_roots_radix_lock); - - btrfs_free_log(trans, root); - btrfs_update_reloc_root(trans, root); - btrfs_orphan_commit_root(trans, root); - - btrfs_save_ino_cache(root, trans); - - /* see comments in should_cow_block() */ - root->force_cow = 0; - smp_wmb(); - - if (root->commit_root != root->node) { - mutex_lock(&root->fs_commit_mutex); - switch_commit_root(root); - btrfs_unpin_free_ino(root); - mutex_unlock(&root->fs_commit_mutex); - - btrfs_set_root_node(&root->root_item, - root->node); - } - - err = btrfs_update_root(trans, fs_info->tree_root, - &root->root_key, - &root->root_item); - spin_lock(&fs_info->fs_roots_radix_lock); - if (err) - break; - } - } - spin_unlock(&fs_info->fs_roots_radix_lock); - return err; -} - -/* - * defrag a given btree. If cacheonly == 1, this won't read from the disk, - * otherwise every leaf in the btree is read and defragged. - */ -int btrfs_defrag_root(struct btrfs_root *root, int cacheonly) -{ - struct btrfs_fs_info *info = root->fs_info; - struct btrfs_trans_handle *trans; - int ret; - unsigned long nr; - - if (xchg(&root->defrag_running, 1)) - return 0; - - while (1) { - trans = btrfs_start_transaction(root, 0); - if (IS_ERR(trans)) - return PTR_ERR(trans); - - ret = btrfs_defrag_leaves(trans, root, cacheonly); - - nr = trans->blocks_used; - btrfs_end_transaction(trans, root); - btrfs_btree_balance_dirty(info->tree_root, nr); - cond_resched(); - - if (btrfs_fs_closing(root->fs_info) || ret != -EAGAIN) - break; - } - root->defrag_running = 0; - return ret; -} - -/* - * new snapshots need to be created at a very specific time in the - * transaction commit. 
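/*
 * Illustrative sketch (editor's addition, not part of the deleted file):
 * btrfs_add_dead_root() above only queues a deleted snapshot/subvolume
 * root on fs_info->dead_roots; the actual drop happens later when
 * btrfs_clean_old_snapshots() (defined near the end of this file) walks
 * that list, typically from the cleaner thread.  The pairing, roughly:
 */
static void example_schedule_snapshot_delete(struct btrfs_root *victim)
{
	/* called while the deletion transaction is still open */
	btrfs_add_dead_root(victim);
}

static void example_cleaner_pass(struct btrfs_root *any_root)
{
	/* later, with no transaction held, reap everything queued above */
	btrfs_clean_old_snapshots(any_root);
}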
This does the actual creation - */ -static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, - struct btrfs_fs_info *fs_info, - struct btrfs_pending_snapshot *pending) -{ - struct btrfs_key key; - struct btrfs_root_item *new_root_item; - struct btrfs_root *tree_root = fs_info->tree_root; - struct btrfs_root *root = pending->root; - struct btrfs_root *parent_root; - struct btrfs_block_rsv *rsv; - struct inode *parent_inode; - struct dentry *parent; - struct dentry *dentry; - struct extent_buffer *tmp; - struct extent_buffer *old; - int ret; - u64 to_reserve = 0; - u64 index = 0; - u64 objectid; - u64 root_flags; - - rsv = trans->block_rsv; - - new_root_item = kmalloc(sizeof(*new_root_item), GFP_NOFS); - if (!new_root_item) { - ret = pending->error = -ENOMEM; - goto fail; - } - - ret = btrfs_find_free_objectid(tree_root, &objectid); - if (ret) { - pending->error = ret; - goto fail; - } - - btrfs_reloc_pre_snapshot(trans, pending, &to_reserve); - - if (to_reserve > 0) { - ret = btrfs_block_rsv_add_noflush(root, &pending->block_rsv, - to_reserve); - if (ret) { - pending->error = ret; - goto fail; - } - } - - key.objectid = objectid; - key.offset = (u64)-1; - key.type = BTRFS_ROOT_ITEM_KEY; - - trans->block_rsv = &pending->block_rsv; - - dentry = pending->dentry; - parent = dget_parent(dentry); - parent_inode = parent->d_inode; - parent_root = BTRFS_I(parent_inode)->root; - record_root_in_trans(trans, parent_root); - - /* - * insert the directory item - */ - ret = btrfs_set_inode_index(parent_inode, &index); - BUG_ON(ret); /* -ENOMEM */ - ret = btrfs_insert_dir_item(trans, parent_root, - dentry->d_name.name, dentry->d_name.len, - parent_inode, &key, - BTRFS_FT_DIR, index); - if (ret == -EEXIST) { - pending->error = -EEXIST; - dput(parent); - goto fail; - } else if (ret) { - goto abort_trans_dput; - } - - btrfs_i_size_write(parent_inode, parent_inode->i_size + - dentry->d_name.len * 2); - ret = btrfs_update_inode(trans, parent_root, parent_inode); - if (ret) - goto abort_trans_dput; - - /* - * pull in the delayed directory update - * and the delayed inode item - * otherwise we corrupt the FS during - * snapshot - */ - ret = btrfs_run_delayed_items(trans, root); - if (ret) { /* Transaction aborted */ - dput(parent); - goto fail; - } - - record_root_in_trans(trans, root); - btrfs_set_root_last_snapshot(&root->root_item, trans->transid); - memcpy(new_root_item, &root->root_item, sizeof(*new_root_item)); - btrfs_check_and_init_root_item(new_root_item); - - root_flags = btrfs_root_flags(new_root_item); - if (pending->readonly) - root_flags |= BTRFS_ROOT_SUBVOL_RDONLY; - else - root_flags &= ~BTRFS_ROOT_SUBVOL_RDONLY; - btrfs_set_root_flags(new_root_item, root_flags); - - old = btrfs_lock_root_node(root); - ret = btrfs_cow_block(trans, root, old, NULL, 0, &old); - if (ret) { - btrfs_tree_unlock(old); - free_extent_buffer(old); - goto abort_trans_dput; - } - - btrfs_set_lock_blocking(old); - - ret = btrfs_copy_root(trans, root, old, &tmp, objectid); - /* clean up in any case */ - btrfs_tree_unlock(old); - free_extent_buffer(old); - if (ret) - goto abort_trans_dput; - - /* see comments in should_cow_block() */ - root->force_cow = 1; - smp_wmb(); - - btrfs_set_root_node(new_root_item, tmp); - /* record when the snapshot was created in key.offset */ - key.offset = trans->transid; - ret = btrfs_insert_root(trans, tree_root, &key, new_root_item); - btrfs_tree_unlock(tmp); - free_extent_buffer(tmp); - if (ret) - goto abort_trans_dput; - - /* - * insert root back/forward references - */ - 
ret = btrfs_add_root_ref(trans, tree_root, objectid, - parent_root->root_key.objectid, - btrfs_ino(parent_inode), index, - dentry->d_name.name, dentry->d_name.len); - dput(parent); - if (ret) - goto fail; - - key.offset = (u64)-1; - pending->snap = btrfs_read_fs_root_no_name(root->fs_info, &key); - if (IS_ERR(pending->snap)) { - ret = PTR_ERR(pending->snap); - goto abort_trans; - } - - ret = btrfs_reloc_post_snapshot(trans, pending); - if (ret) - goto abort_trans; - ret = 0; -fail: - kfree(new_root_item); - trans->block_rsv = rsv; - btrfs_block_rsv_release(root, &pending->block_rsv, (u64)-1); - return ret; - -abort_trans_dput: - dput(parent); -abort_trans: - btrfs_abort_transaction(trans, root, ret); - goto fail; -} - -/* - * create all the snapshots we've scheduled for creation - */ -static noinline int create_pending_snapshots(struct btrfs_trans_handle *trans, - struct btrfs_fs_info *fs_info) -{ - struct btrfs_pending_snapshot *pending; - struct list_head *head = &trans->transaction->pending_snapshots; - - list_for_each_entry(pending, head, list) - create_pending_snapshot(trans, fs_info, pending); - return 0; -} - -static void update_super_roots(struct btrfs_root *root) -{ - struct btrfs_root_item *root_item; - struct btrfs_super_block *super; - - super = root->fs_info->super_copy; - - root_item = &root->fs_info->chunk_root->root_item; - super->chunk_root = root_item->bytenr; - super->chunk_root_generation = root_item->generation; - super->chunk_root_level = root_item->level; - - root_item = &root->fs_info->tree_root->root_item; - super->root = root_item->bytenr; - super->generation = root_item->generation; - super->root_level = root_item->level; - if (btrfs_test_opt(root, SPACE_CACHE)) - super->cache_generation = root_item->generation; -} - -int btrfs_transaction_in_commit(struct btrfs_fs_info *info) -{ - int ret = 0; - spin_lock(&info->trans_lock); - if (info->running_transaction) - ret = info->running_transaction->in_commit; - spin_unlock(&info->trans_lock); - return ret; -} - -int btrfs_transaction_blocked(struct btrfs_fs_info *info) -{ - int ret = 0; - spin_lock(&info->trans_lock); - if (info->running_transaction) - ret = info->running_transaction->blocked; - spin_unlock(&info->trans_lock); - return ret; -} - -/* - * wait for the current transaction commit to start and block subsequent - * transaction joins - */ -static void wait_current_trans_commit_start(struct btrfs_root *root, - struct btrfs_transaction *trans) -{ - wait_event(root->fs_info->transaction_blocked_wait, trans->in_commit); -} - -/* - * wait for the current transaction to start and then become unblocked. - * caller holds ref. - */ -static void wait_current_trans_commit_start_and_unblock(struct btrfs_root *root, - struct btrfs_transaction *trans) -{ - wait_event(root->fs_info->transaction_wait, - trans->commit_done || (trans->in_commit && !trans->blocked)); -} - -/* - * commit transactions asynchronously. once btrfs_commit_transaction_async - * returns, any subsequent transaction will not be allowed to join. 
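/*
 * Illustrative sketch (editor's addition, not part of the deleted file):
 * create_pending_snapshot() above only ever runs from inside the commit.
 * The requesting side (the snapshot ioctl, which is not part of this
 * file) roughly queues a struct btrfs_pending_snapshot on the running
 * transaction and then commits.  Hedged sketch, with error handling and
 * the block reservation setup for pending->block_rsv omitted:
 */
static int example_request_snapshot(struct btrfs_root *root,
				    struct dentry *dentry, bool readonly)
{
	struct btrfs_pending_snapshot *pending;
	struct btrfs_trans_handle *trans;
	int ret;

	pending = kzalloc(sizeof(*pending), GFP_NOFS);
	if (!pending)
		return -ENOMEM;

	pending->dentry = dentry;
	pending->root = root;
	pending->readonly = readonly;

	trans = btrfs_join_transaction(root);
	if (IS_ERR(trans)) {
		kfree(pending);
		return PTR_ERR(trans);
	}

	list_add(&pending->list, &trans->transaction->pending_snapshots);

	/* the snapshot is materialized during this commit */
	ret = btrfs_commit_transaction(trans, root);
	if (!ret)
		ret = pending->error;
	kfree(pending);
	return ret;
}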
- */ -struct btrfs_async_commit { - struct btrfs_trans_handle *newtrans; - struct btrfs_root *root; - struct delayed_work work; -}; - -static void do_async_commit(struct work_struct *work) -{ - struct btrfs_async_commit *ac = - container_of(work, struct btrfs_async_commit, work.work); - - btrfs_commit_transaction(ac->newtrans, ac->root); - kfree(ac); -} - -int btrfs_commit_transaction_async(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - int wait_for_unblock) -{ - struct btrfs_async_commit *ac; - struct btrfs_transaction *cur_trans; - - ac = kmalloc(sizeof(*ac), GFP_NOFS); - if (!ac) - return -ENOMEM; - - INIT_DELAYED_WORK(&ac->work, do_async_commit); - ac->root = root; - ac->newtrans = btrfs_join_transaction(root); - if (IS_ERR(ac->newtrans)) { - int err = PTR_ERR(ac->newtrans); - kfree(ac); - return err; - } - - /* take transaction reference */ - cur_trans = trans->transaction; - atomic_inc(&cur_trans->use_count); - - btrfs_end_transaction(trans, root); - schedule_delayed_work(&ac->work, 0); - - /* wait for transaction to start and unblock */ - if (wait_for_unblock) - wait_current_trans_commit_start_and_unblock(root, cur_trans); - else - wait_current_trans_commit_start(root, cur_trans); - - if (current->journal_info == trans) - current->journal_info = NULL; - - put_transaction(cur_trans); - return 0; -} - - -static void cleanup_transaction(struct btrfs_trans_handle *trans, - struct btrfs_root *root) -{ - struct btrfs_transaction *cur_trans = trans->transaction; - - WARN_ON(trans->use_count > 1); - - spin_lock(&root->fs_info->trans_lock); - list_del_init(&cur_trans->list); - spin_unlock(&root->fs_info->trans_lock); - - btrfs_cleanup_one_transaction(trans->transaction, root); - - put_transaction(cur_trans); - put_transaction(cur_trans); - - trace_btrfs_transaction_commit(root); - - btrfs_scrub_continue(root); - - if (current->journal_info == trans) - current->journal_info = NULL; - - kmem_cache_free(btrfs_trans_handle_cachep, trans); -} - -/* - * btrfs_transaction state sequence: - * in_commit = 0, blocked = 0 (initial) - * in_commit = 1, blocked = 1 - * blocked = 0 - * commit_done = 1 - */ -int btrfs_commit_transaction(struct btrfs_trans_handle *trans, - struct btrfs_root *root) -{ - unsigned long joined = 0; - struct btrfs_transaction *cur_trans = trans->transaction; - struct btrfs_transaction *prev_trans = NULL; - DEFINE_WAIT(wait); - int ret = -EIO; - int should_grow = 0; - unsigned long now = get_seconds(); - int flush_on_commit = btrfs_test_opt(root, FLUSHONCOMMIT); - - btrfs_run_ordered_operations(root, 0); - - btrfs_trans_release_metadata(trans, root); - trans->block_rsv = NULL; - - if (cur_trans->aborted) - goto cleanup_transaction; - - /* make a pass through all the delayed refs we have so far - * any runnings procs may add more while we are here - */ - ret = btrfs_run_delayed_refs(trans, root, 0); - if (ret) - goto cleanup_transaction; - - cur_trans = trans->transaction; - - /* - * set the flushing flag so procs in this transaction have to - * start sending their work down. 
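/*
 * Illustrative sketch (editor's addition, not part of the deleted file):
 * btrfs_commit_transaction_async() above hands the real commit to a
 * worker and returns once the commit has started (or, with
 * wait_for_unblock set, once new transactions may join again).  A
 * hypothetical caller that wants "commit started, don't wait for disk",
 * optionally followed by a later wait for the result to be durable:
 */
static int example_async_commit(struct btrfs_trans_handle *trans,
				struct btrfs_root *root)
{
	int ret;

	/* 1 == also wait until the new commit unblocks joins */
	ret = btrfs_commit_transaction_async(trans, root, 1);
	if (ret)
		return ret;

	/* optional: block until that commit is fully on disk */
	return btrfs_wait_for_commit(root, 0);
}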
- */ - cur_trans->delayed_refs.flushing = 1; - - ret = btrfs_run_delayed_refs(trans, root, 0); - if (ret) - goto cleanup_transaction; - - spin_lock(&cur_trans->commit_lock); - if (cur_trans->in_commit) { - spin_unlock(&cur_trans->commit_lock); - atomic_inc(&cur_trans->use_count); - ret = btrfs_end_transaction(trans, root); - - wait_for_commit(root, cur_trans); - - put_transaction(cur_trans); - - return ret; - } - - trans->transaction->in_commit = 1; - trans->transaction->blocked = 1; - spin_unlock(&cur_trans->commit_lock); - wake_up(&root->fs_info->transaction_blocked_wait); - - spin_lock(&root->fs_info->trans_lock); - if (cur_trans->list.prev != &root->fs_info->trans_list) { - prev_trans = list_entry(cur_trans->list.prev, - struct btrfs_transaction, list); - if (!prev_trans->commit_done) { - atomic_inc(&prev_trans->use_count); - spin_unlock(&root->fs_info->trans_lock); - - wait_for_commit(root, prev_trans); - - put_transaction(prev_trans); - } else { - spin_unlock(&root->fs_info->trans_lock); - } - } else { - spin_unlock(&root->fs_info->trans_lock); - } - - if (now < cur_trans->start_time || now - cur_trans->start_time < 1) - should_grow = 1; - - do { - int snap_pending = 0; - - joined = cur_trans->num_joined; - if (!list_empty(&trans->transaction->pending_snapshots)) - snap_pending = 1; - - WARN_ON(cur_trans != trans->transaction); - - if (flush_on_commit || snap_pending) { - btrfs_start_delalloc_inodes(root, 1); - btrfs_wait_ordered_extents(root, 0, 1); - } - - ret = btrfs_run_delayed_items(trans, root); - if (ret) - goto cleanup_transaction; - - /* - * rename don't use btrfs_join_transaction, so, once we - * set the transaction to blocked above, we aren't going - * to get any new ordered operations. We can safely run - * it here and no for sure that nothing new will be added - * to the list - */ - btrfs_run_ordered_operations(root, 1); - - prepare_to_wait(&cur_trans->writer_wait, &wait, - TASK_UNINTERRUPTIBLE); - - if (atomic_read(&cur_trans->num_writers) > 1) - schedule_timeout(MAX_SCHEDULE_TIMEOUT); - else if (should_grow) - schedule_timeout(1); - - finish_wait(&cur_trans->writer_wait, &wait); - } while (atomic_read(&cur_trans->num_writers) > 1 || - (should_grow && cur_trans->num_joined != joined)); - - /* - * Ok now we need to make sure to block out any other joins while we - * commit the transaction. We could have started a join before setting - * no_join so make sure to wait for num_writers to == 1 again. 
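/*
 * Illustrative sketch (editor's addition, not part of the deleted file):
 * the loop above uses the bare prepare_to_wait()/finish_wait() idiom
 * instead of wait_event(), because the committer wants to keep doing
 * useful work (flushing delalloc, running delayed items) between naps
 * while the writer count drains.  The idiom on its own looks like this:
 */
static void example_wait_until_last_writer(wait_queue_head_t *wq,
					   atomic_t *writers)
{
	DEFINE_WAIT(wait);

	while (atomic_read(writers) > 1) {
		prepare_to_wait(wq, &wait, TASK_UNINTERRUPTIBLE);
		/* recheck after queueing ourselves to avoid a lost wakeup */
		if (atomic_read(writers) > 1)
			schedule();
		finish_wait(wq, &wait);
	}
}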
- */ - spin_lock(&root->fs_info->trans_lock); - root->fs_info->trans_no_join = 1; - spin_unlock(&root->fs_info->trans_lock); - wait_event(cur_trans->writer_wait, - atomic_read(&cur_trans->num_writers) == 1); - - /* - * the reloc mutex makes sure that we stop - * the balancing code from coming in and moving - * extents around in the middle of the commit - */ - mutex_lock(&root->fs_info->reloc_mutex); - - ret = btrfs_run_delayed_items(trans, root); - if (ret) { - mutex_unlock(&root->fs_info->reloc_mutex); - goto cleanup_transaction; - } - - ret = create_pending_snapshots(trans, root->fs_info); - if (ret) { - mutex_unlock(&root->fs_info->reloc_mutex); - goto cleanup_transaction; - } - - ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1); - if (ret) { - mutex_unlock(&root->fs_info->reloc_mutex); - goto cleanup_transaction; - } - - /* - * make sure none of the code above managed to slip in a - * delayed item - */ - btrfs_assert_delayed_root_empty(root); - - WARN_ON(cur_trans != trans->transaction); - - btrfs_scrub_pause(root); - /* btrfs_commit_tree_roots is responsible for getting the - * various roots consistent with each other. Every pointer - * in the tree of tree roots has to point to the most up to date - * root for every subvolume and other tree. So, we have to keep - * the tree logging code from jumping in and changing any - * of the trees. - * - * At this point in the commit, there can't be any tree-log - * writers, but a little lower down we drop the trans mutex - * and let new people in. By holding the tree_log_mutex - * from now until after the super is written, we avoid races - * with the tree-log code. - */ - mutex_lock(&root->fs_info->tree_log_mutex); - - ret = commit_fs_roots(trans, root); - if (ret) { - mutex_unlock(&root->fs_info->tree_log_mutex); - mutex_unlock(&root->fs_info->reloc_mutex); - goto cleanup_transaction; - } - - /* commit_fs_roots gets rid of all the tree log roots, it is now - * safe to free the root of tree log roots - */ - btrfs_free_log_root_tree(trans, root->fs_info); - - ret = commit_cowonly_roots(trans, root); - if (ret) { - mutex_unlock(&root->fs_info->tree_log_mutex); - mutex_unlock(&root->fs_info->reloc_mutex); - goto cleanup_transaction; - } - - btrfs_prepare_extent_commit(trans, root); - - cur_trans = root->fs_info->running_transaction; - - btrfs_set_root_node(&root->fs_info->tree_root->root_item, - root->fs_info->tree_root->node); - switch_commit_root(root->fs_info->tree_root); - - btrfs_set_root_node(&root->fs_info->chunk_root->root_item, - root->fs_info->chunk_root->node); - switch_commit_root(root->fs_info->chunk_root); - - update_super_roots(root); - - if (!root->fs_info->log_root_recovering) { - btrfs_set_super_log_root(root->fs_info->super_copy, 0); - btrfs_set_super_log_root_level(root->fs_info->super_copy, 0); - } - - memcpy(root->fs_info->super_for_commit, root->fs_info->super_copy, - sizeof(*root->fs_info->super_copy)); - - trans->transaction->blocked = 0; - spin_lock(&root->fs_info->trans_lock); - root->fs_info->running_transaction = NULL; - root->fs_info->trans_no_join = 0; - spin_unlock(&root->fs_info->trans_lock); - mutex_unlock(&root->fs_info->reloc_mutex); - - wake_up(&root->fs_info->transaction_wait); - - ret = btrfs_write_and_wait_transaction(trans, root); - if (ret) { - btrfs_error(root->fs_info, ret, - "Error while writing out transaction."); - mutex_unlock(&root->fs_info->tree_log_mutex); - goto cleanup_transaction; - } - - ret = write_ctree_super(trans, root, 0); - if (ret) { - 
mutex_unlock(&root->fs_info->tree_log_mutex); - goto cleanup_transaction; - } - - /* - * the super is written, we can safely allow the tree-loggers - * to go about their business - */ - mutex_unlock(&root->fs_info->tree_log_mutex); - - btrfs_finish_extent_commit(trans, root); - - cur_trans->commit_done = 1; - - root->fs_info->last_trans_committed = cur_trans->transid; - - wake_up(&cur_trans->commit_wait); - - spin_lock(&root->fs_info->trans_lock); - list_del_init(&cur_trans->list); - spin_unlock(&root->fs_info->trans_lock); - - put_transaction(cur_trans); - put_transaction(cur_trans); - - trace_btrfs_transaction_commit(root); - - btrfs_scrub_continue(root); - - if (current->journal_info == trans) - current->journal_info = NULL; - - kmem_cache_free(btrfs_trans_handle_cachep, trans); - - if (current != root->fs_info->transaction_kthread) - btrfs_run_delayed_iputs(root); - - return ret; - -cleanup_transaction: - btrfs_printk(root->fs_info, "Skipping commit of aborted transaction.\n"); -// WARN_ON(1); - if (current->journal_info == trans) - current->journal_info = NULL; - cleanup_transaction(trans, root); - - return ret; -} - -/* - * interface function to delete all the snapshots we have scheduled for deletion - */ -int btrfs_clean_old_snapshots(struct btrfs_root *root) -{ - LIST_HEAD(list); - struct btrfs_fs_info *fs_info = root->fs_info; - - spin_lock(&fs_info->trans_lock); - list_splice_init(&fs_info->dead_roots, &list); - spin_unlock(&fs_info->trans_lock); - - while (!list_empty(&list)) { - int ret; - - root = list_entry(list.next, struct btrfs_root, root_list); - list_del(&root->root_list); - - btrfs_kill_all_delayed_nodes(root); - - if (btrfs_header_backref_rev(root->node) < - BTRFS_MIXED_BACKREF_REV) - ret = btrfs_drop_snapshot(root, NULL, 0, 0); - else - ret =btrfs_drop_snapshot(root, NULL, 1, 0); - BUG_ON(ret < 0); - } - return 0; -} diff --git a/ANDROID_3.4.5/fs/btrfs/transaction.h b/ANDROID_3.4.5/fs/btrfs/transaction.h deleted file mode 100644 index fe27379e..00000000 --- a/ANDROID_3.4.5/fs/btrfs/transaction.h +++ /dev/null @@ -1,120 +0,0 @@ -/* - * Copyright (C) 2007 Oracle. All rights reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public - * License v2 as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public - * License along with this program; if not, write to the - * Free Software Foundation, Inc., 59 Temple Place - Suite 330, - * Boston, MA 021110-1307, USA. 
- */ - -#ifndef __BTRFS_TRANSACTION__ -#define __BTRFS_TRANSACTION__ -#include "btrfs_inode.h" -#include "delayed-ref.h" - -struct btrfs_transaction { - u64 transid; - /* - * total writers in this transaction, it must be zero before the - * transaction can end - */ - atomic_t num_writers; - atomic_t use_count; - - unsigned long num_joined; - - spinlock_t commit_lock; - int in_commit; - int commit_done; - int blocked; - struct list_head list; - struct extent_io_tree dirty_pages; - unsigned long start_time; - wait_queue_head_t writer_wait; - wait_queue_head_t commit_wait; - struct list_head pending_snapshots; - struct btrfs_delayed_ref_root delayed_refs; - int aborted; -}; - -struct btrfs_trans_handle { - u64 transid; - u64 bytes_reserved; - unsigned long use_count; - unsigned long blocks_reserved; - unsigned long blocks_used; - unsigned long delayed_ref_updates; - struct btrfs_transaction *transaction; - struct btrfs_block_rsv *block_rsv; - struct btrfs_block_rsv *orig_rsv; - int aborted; -}; - -struct btrfs_pending_snapshot { - struct dentry *dentry; - struct btrfs_root *root; - struct btrfs_root *snap; - /* block reservation for the operation */ - struct btrfs_block_rsv block_rsv; - /* extra metadata reseration for relocation */ - int error; - bool readonly; - struct list_head list; -}; - -static inline void btrfs_set_inode_last_trans(struct btrfs_trans_handle *trans, - struct inode *inode) -{ - BTRFS_I(inode)->last_trans = trans->transaction->transid; - BTRFS_I(inode)->last_sub_trans = BTRFS_I(inode)->root->log_transid; -} - -int btrfs_end_transaction(struct btrfs_trans_handle *trans, - struct btrfs_root *root); -int btrfs_end_transaction_nolock(struct btrfs_trans_handle *trans, - struct btrfs_root *root); -struct btrfs_trans_handle *btrfs_start_transaction(struct btrfs_root *root, - int num_items); -struct btrfs_trans_handle *btrfs_join_transaction(struct btrfs_root *root); -struct btrfs_trans_handle *btrfs_join_transaction_nolock(struct btrfs_root *root); -struct btrfs_trans_handle *btrfs_start_ioctl_transaction(struct btrfs_root *root); -int btrfs_wait_for_commit(struct btrfs_root *root, u64 transid); -int btrfs_write_and_wait_transaction(struct btrfs_trans_handle *trans, - struct btrfs_root *root); - -int btrfs_add_dead_root(struct btrfs_root *root); -int btrfs_defrag_root(struct btrfs_root *root, int cacheonly); -int btrfs_clean_old_snapshots(struct btrfs_root *root); -int btrfs_commit_transaction(struct btrfs_trans_handle *trans, - struct btrfs_root *root); -int btrfs_commit_transaction_async(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - int wait_for_unblock); -int btrfs_end_transaction_throttle(struct btrfs_trans_handle *trans, - struct btrfs_root *root); -int btrfs_end_transaction_dmeta(struct btrfs_trans_handle *trans, - struct btrfs_root *root); -int btrfs_should_end_transaction(struct btrfs_trans_handle *trans, - struct btrfs_root *root); -void btrfs_throttle(struct btrfs_root *root); -int btrfs_record_root_in_trans(struct btrfs_trans_handle *trans, - struct btrfs_root *root); -int btrfs_write_and_wait_marked_extents(struct btrfs_root *root, - struct extent_io_tree *dirty_pages, int mark); -int btrfs_write_marked_extents(struct btrfs_root *root, - struct extent_io_tree *dirty_pages, int mark); -int btrfs_wait_marked_extents(struct btrfs_root *root, - struct extent_io_tree *dirty_pages, int mark); -int btrfs_transaction_blocked(struct btrfs_fs_info *info); -int btrfs_transaction_in_commit(struct btrfs_fs_info *info); -void put_transaction(struct 
btrfs_transaction *transaction); -#endif diff --git a/ANDROID_3.4.5/fs/btrfs/tree-defrag.c b/ANDROID_3.4.5/fs/btrfs/tree-defrag.c deleted file mode 100644 index 3b580ee8..00000000 --- a/ANDROID_3.4.5/fs/btrfs/tree-defrag.c +++ /dev/null @@ -1,145 +0,0 @@ -/* - * Copyright (C) 2007 Oracle. All rights reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public - * License v2 as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public - * License along with this program; if not, write to the - * Free Software Foundation, Inc., 59 Temple Place - Suite 330, - * Boston, MA 021110-1307, USA. - */ - -#include <linux/sched.h> -#include "ctree.h" -#include "disk-io.h" -#include "print-tree.h" -#include "transaction.h" -#include "locking.h" - -/* defrag all the leaves in a given btree. If cache_only == 1, don't read - * things from disk, otherwise read all the leaves and try to get key order to - * better reflect disk order - */ - -int btrfs_defrag_leaves(struct btrfs_trans_handle *trans, - struct btrfs_root *root, int cache_only) -{ - struct btrfs_path *path = NULL; - struct btrfs_key key; - int ret = 0; - int wret; - int level; - int is_extent = 0; - int next_key_ret = 0; - u64 last_ret = 0; - u64 min_trans = 0; - - if (cache_only) - goto out; - - if (root->fs_info->extent_root == root) { - /* - * there's recursion here right now in the tree locking, - * we can't defrag the extent root without deadlock - */ - goto out; - } - - if (root->ref_cows == 0 && !is_extent) - goto out; - - if (btrfs_test_opt(root, SSD)) - goto out; - - path = btrfs_alloc_path(); - if (!path) - return -ENOMEM; - - level = btrfs_header_level(root->node); - - if (level == 0) - goto out; - - if (root->defrag_progress.objectid == 0) { - struct extent_buffer *root_node; - u32 nritems; - - root_node = btrfs_lock_root_node(root); - btrfs_set_lock_blocking(root_node); - nritems = btrfs_header_nritems(root_node); - root->defrag_max.objectid = 0; - /* from above we know this is not a leaf */ - btrfs_node_key_to_cpu(root_node, &root->defrag_max, - nritems - 1); - btrfs_tree_unlock(root_node); - free_extent_buffer(root_node); - memset(&key, 0, sizeof(key)); - } else { - memcpy(&key, &root->defrag_progress, sizeof(key)); - } - - path->keep_locks = 1; - if (cache_only) - min_trans = root->defrag_trans_start; - - ret = btrfs_search_forward(root, &key, NULL, path, - cache_only, min_trans); - if (ret < 0) - goto out; - if (ret > 0) { - ret = 0; - goto out; - } - btrfs_release_path(path); - wret = btrfs_search_slot(trans, root, &key, path, 0, 1); - - if (wret < 0) { - ret = wret; - goto out; - } - if (!path->nodes[1]) { - ret = 0; - goto out; - } - path->slots[1] = btrfs_header_nritems(path->nodes[1]); - next_key_ret = btrfs_find_next_key(root, path, &key, 1, cache_only, - min_trans); - ret = btrfs_realloc_node(trans, root, - path->nodes[1], 0, - cache_only, &last_ret, - &root->defrag_progress); - if (ret) { - WARN_ON(ret == -EAGAIN); - goto out; - } - if (next_key_ret == 0) { - memcpy(&root->defrag_progress, &key, sizeof(key)); - ret = -EAGAIN; - } -out: - if (path) - btrfs_free_path(path); - if (ret == -EAGAIN) { - if (root->defrag_max.objectid > 
root->defrag_progress.objectid) - goto done; - if (root->defrag_max.type > root->defrag_progress.type) - goto done; - if (root->defrag_max.offset > root->defrag_progress.offset) - goto done; - ret = 0; - } -done: - if (ret != -EAGAIN) { - memset(&root->defrag_progress, 0, - sizeof(root->defrag_progress)); - root->defrag_trans_start = trans->transid; - } - return ret; -} diff --git a/ANDROID_3.4.5/fs/btrfs/tree-log.c b/ANDROID_3.4.5/fs/btrfs/tree-log.c deleted file mode 100644 index dce89da9..00000000 --- a/ANDROID_3.4.5/fs/btrfs/tree-log.c +++ /dev/null @@ -1,3398 +0,0 @@ -/* - * Copyright (C) 2008 Oracle. All rights reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public - * License v2 as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public - * License along with this program; if not, write to the - * Free Software Foundation, Inc., 59 Temple Place - Suite 330, - * Boston, MA 021110-1307, USA. - */ - -#include <linux/sched.h> -#include <linux/slab.h> -#include "ctree.h" -#include "transaction.h" -#include "disk-io.h" -#include "locking.h" -#include "print-tree.h" -#include "compat.h" -#include "tree-log.h" - -/* magic values for the inode_only field in btrfs_log_inode: - * - * LOG_INODE_ALL means to log everything - * LOG_INODE_EXISTS means to log just enough to recreate the inode - * during log replay - */ -#define LOG_INODE_ALL 0 -#define LOG_INODE_EXISTS 1 - -/* - * directory trouble cases - * - * 1) on rename or unlink, if the inode being unlinked isn't in the fsync - * log, we must force a full commit before doing an fsync of the directory - * where the unlink was done. - * ---> record transid of last unlink/rename per directory - * - * mkdir foo/some_dir - * normal commit - * rename foo/some_dir foo2/some_dir - * mkdir foo/some_dir - * fsync foo/some_dir/some_file - * - * The fsync above will unlink the original some_dir without recording - * it in its new location (foo2). After a crash, some_dir will be gone - * unless the fsync of some_file forces a full commit - * - * 2) we must log any new names for any file or dir that is in the fsync - * log. ---> check inode while renaming/linking. - * - * 2a) we must log any new names for any file or dir during rename - * when the directory they are being removed from was logged. - * ---> check inode and old parent dir during rename - * - * 2a is actually the more important variant. With the extra logging - * a crash might unlink the old name without recreating the new one - * - * 3) after a crash, we must go through any directories with a link count - * of zero and redo the rm -rf - * - * mkdir f1/foo - * normal commit - * rm -rf f1/foo - * fsync(f1) - * - * The directory f1 was fully removed from the FS, but fsync was never - * called on f1, only its parent dir. After a crash the rm -rf must - * be replayed. This must be able to recurse down the entire - * directory tree. The inode link count fixup code takes care of the - * ugly details. - */ - -/* - * stages for the tree walking. The first - * stage (0) is to only pin down the blocks we find - * the second stage (1) is to make sure that all the inodes - * we find in the log are created in the subvolume. 
- * - * The last stage is to deal with directories and links and extents - * and all the other fun semantics - */ -#define LOG_WALK_PIN_ONLY 0 -#define LOG_WALK_REPLAY_INODES 1 -#define LOG_WALK_REPLAY_ALL 2 - -static int btrfs_log_inode(struct btrfs_trans_handle *trans, - struct btrfs_root *root, struct inode *inode, - int inode_only); -static int link_to_fixup_dir(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct btrfs_path *path, u64 objectid); -static noinline int replay_dir_deletes(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct btrfs_root *log, - struct btrfs_path *path, - u64 dirid, int del_all); - -/* - * tree logging is a special write ahead log used to make sure that - * fsyncs and O_SYNCs can happen without doing full tree commits. - * - * Full tree commits are expensive because they require commonly - * modified blocks to be recowed, creating many dirty pages in the - * extent tree an 4x-6x higher write load than ext3. - * - * Instead of doing a tree commit on every fsync, we use the - * key ranges and transaction ids to find items for a given file or directory - * that have changed in this transaction. Those items are copied into - * a special tree (one per subvolume root), that tree is written to disk - * and then the fsync is considered complete. - * - * After a crash, items are copied out of the log-tree back into the - * subvolume tree. Any file data extents found are recorded in the extent - * allocation tree, and the log-tree freed. - * - * The log tree is read three times, once to pin down all the extents it is - * using in ram and once, once to create all the inodes logged in the tree - * and once to do all the other items. - */ - -/* - * start a sub transaction and setup the log tree - * this increments the log tree writer count to make the people - * syncing the tree wait for us to finish - */ -static int start_log_trans(struct btrfs_trans_handle *trans, - struct btrfs_root *root) -{ - int ret; - int err = 0; - - mutex_lock(&root->log_mutex); - if (root->log_root) { - if (!root->log_start_pid) { - root->log_start_pid = current->pid; - root->log_multiple_pids = false; - } else if (root->log_start_pid != current->pid) { - root->log_multiple_pids = true; - } - - root->log_batch++; - atomic_inc(&root->log_writers); - mutex_unlock(&root->log_mutex); - return 0; - } - root->log_multiple_pids = false; - root->log_start_pid = current->pid; - mutex_lock(&root->fs_info->tree_log_mutex); - if (!root->fs_info->log_root_tree) { - ret = btrfs_init_log_root_tree(trans, root->fs_info); - if (ret) - err = ret; - } - if (err == 0 && !root->log_root) { - ret = btrfs_add_log_tree(trans, root); - if (ret) - err = ret; - } - mutex_unlock(&root->fs_info->tree_log_mutex); - root->log_batch++; - atomic_inc(&root->log_writers); - mutex_unlock(&root->log_mutex); - return err; -} - -/* - * returns 0 if there was a log transaction running and we were able - * to join, or returns -ENOENT if there were not transactions - * in progress - */ -static int join_running_log_trans(struct btrfs_root *root) -{ - int ret = -ENOENT; - - smp_mb(); - if (!root->log_root) - return -ENOENT; - - mutex_lock(&root->log_mutex); - if (root->log_root) { - ret = 0; - atomic_inc(&root->log_writers); - } - mutex_unlock(&root->log_mutex); - return ret; -} - -/* - * This either makes the current running log transaction wait - * until you call btrfs_end_log_trans() or it makes any future - * log transactions wait until you call btrfs_end_log_trans() - */ -int 
btrfs_pin_log_trans(struct btrfs_root *root) -{ - int ret = -ENOENT; - - mutex_lock(&root->log_mutex); - atomic_inc(&root->log_writers); - mutex_unlock(&root->log_mutex); - return ret; -} - -/* - * indicate we're done making changes to the log tree - * and wake up anyone waiting to do a sync - */ -void btrfs_end_log_trans(struct btrfs_root *root) -{ - if (atomic_dec_and_test(&root->log_writers)) { - smp_mb(); - if (waitqueue_active(&root->log_writer_wait)) - wake_up(&root->log_writer_wait); - } -} - - -/* - * the walk control struct is used to pass state down the chain when - * processing the log tree. The stage field tells us which part - * of the log tree processing we are currently doing. The others - * are state fields used for that specific part - */ -struct walk_control { - /* should we free the extent on disk when done? This is used - * at transaction commit time while freeing a log tree - */ - int free; - - /* should we write out the extent buffer? This is used - * while flushing the log tree to disk during a sync - */ - int write; - - /* should we wait for the extent buffer io to finish? Also used - * while flushing the log tree to disk for a sync - */ - int wait; - - /* pin only walk, we record which extents on disk belong to the - * log trees - */ - int pin; - - /* what stage of the replay code we're currently in */ - int stage; - - /* the root we are currently replaying */ - struct btrfs_root *replay_dest; - - /* the trans handle for the current replay */ - struct btrfs_trans_handle *trans; - - /* the function that gets used to process blocks we find in the - * tree. Note the extent_buffer might not be up to date when it is - * passed in, and it must be checked or read if you need the data - * inside it - */ - int (*process_func)(struct btrfs_root *log, struct extent_buffer *eb, - struct walk_control *wc, u64 gen); -}; - -/* - * process_func used to pin down extents, write them or wait on them - */ -static int process_one_buffer(struct btrfs_root *log, - struct extent_buffer *eb, - struct walk_control *wc, u64 gen) -{ - if (wc->pin) - btrfs_pin_extent_for_log_replay(wc->trans, - log->fs_info->extent_root, - eb->start, eb->len); - - if (btrfs_buffer_uptodate(eb, gen, 0)) { - if (wc->write) - btrfs_write_tree_block(eb); - if (wc->wait) - btrfs_wait_tree_block_writeback(eb); - } - return 0; -} - -/* - * Item overwrite used by replay and tree logging. eb, slot and key all refer - * to the src data we are copying out. - * - * root is the tree we are copying into, and path is a scratch - * path for use in this function (it should be released on entry and - * will be released on exit). - * - * If the key is already in the destination tree the existing item is - * overwritten. If the existing item isn't big enough, it is extended. - * If it is too large, it is truncated. - * - * If the key isn't in the destination yet, a new item is inserted. 
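/*
 * Illustrative sketch (editor's addition, not part of the deleted file):
 * btrfs_pin_log_trans()/btrfs_end_log_trans() above bracket operations
 * that must not race with a log sync (rename uses this, for example):
 * the log_writers count they bump is what the log-sync path waits on
 * before writing the log tree.  Usage is a simple pin/unpin pair:
 */
static void example_pin_log_around_update(struct btrfs_root *root)
{
	btrfs_pin_log_trans(root);

	/* ... manipulate items that may also live in the log tree ... */

	btrfs_end_log_trans(root);
}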
- */ -static noinline int overwrite_item(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct btrfs_path *path, - struct extent_buffer *eb, int slot, - struct btrfs_key *key) -{ - int ret; - u32 item_size; - u64 saved_i_size = 0; - int save_old_i_size = 0; - unsigned long src_ptr; - unsigned long dst_ptr; - int overwrite_root = 0; - - if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) - overwrite_root = 1; - - item_size = btrfs_item_size_nr(eb, slot); - src_ptr = btrfs_item_ptr_offset(eb, slot); - - /* look for the key in the destination tree */ - ret = btrfs_search_slot(NULL, root, key, path, 0, 0); - if (ret == 0) { - char *src_copy; - char *dst_copy; - u32 dst_size = btrfs_item_size_nr(path->nodes[0], - path->slots[0]); - if (dst_size != item_size) - goto insert; - - if (item_size == 0) { - btrfs_release_path(path); - return 0; - } - dst_copy = kmalloc(item_size, GFP_NOFS); - src_copy = kmalloc(item_size, GFP_NOFS); - if (!dst_copy || !src_copy) { - btrfs_release_path(path); - kfree(dst_copy); - kfree(src_copy); - return -ENOMEM; - } - - read_extent_buffer(eb, src_copy, src_ptr, item_size); - - dst_ptr = btrfs_item_ptr_offset(path->nodes[0], path->slots[0]); - read_extent_buffer(path->nodes[0], dst_copy, dst_ptr, - item_size); - ret = memcmp(dst_copy, src_copy, item_size); - - kfree(dst_copy); - kfree(src_copy); - /* - * they have the same contents, just return, this saves - * us from cowing blocks in the destination tree and doing - * extra writes that may not have been done by a previous - * sync - */ - if (ret == 0) { - btrfs_release_path(path); - return 0; - } - - } -insert: - btrfs_release_path(path); - /* try to insert the key into the destination tree */ - ret = btrfs_insert_empty_item(trans, root, path, - key, item_size); - - /* make sure any existing item is the correct size */ - if (ret == -EEXIST) { - u32 found_size; - found_size = btrfs_item_size_nr(path->nodes[0], - path->slots[0]); - if (found_size > item_size) - btrfs_truncate_item(trans, root, path, item_size, 1); - else if (found_size < item_size) - btrfs_extend_item(trans, root, path, - item_size - found_size); - } else if (ret) { - return ret; - } - dst_ptr = btrfs_item_ptr_offset(path->nodes[0], - path->slots[0]); - - /* don't overwrite an existing inode if the generation number - * was logged as zero. This is done when the tree logging code - * is just logging an inode to make sure it exists after recovery. - * - * Also, don't overwrite i_size on directories during replay. 
- * log replay inserts and removes directory items based on the - * state of the tree found in the subvolume, and i_size is modified - * as it goes - */ - if (key->type == BTRFS_INODE_ITEM_KEY && ret == -EEXIST) { - struct btrfs_inode_item *src_item; - struct btrfs_inode_item *dst_item; - - src_item = (struct btrfs_inode_item *)src_ptr; - dst_item = (struct btrfs_inode_item *)dst_ptr; - - if (btrfs_inode_generation(eb, src_item) == 0) - goto no_copy; - - if (overwrite_root && - S_ISDIR(btrfs_inode_mode(eb, src_item)) && - S_ISDIR(btrfs_inode_mode(path->nodes[0], dst_item))) { - save_old_i_size = 1; - saved_i_size = btrfs_inode_size(path->nodes[0], - dst_item); - } - } - - copy_extent_buffer(path->nodes[0], eb, dst_ptr, - src_ptr, item_size); - - if (save_old_i_size) { - struct btrfs_inode_item *dst_item; - dst_item = (struct btrfs_inode_item *)dst_ptr; - btrfs_set_inode_size(path->nodes[0], dst_item, saved_i_size); - } - - /* make sure the generation is filled in */ - if (key->type == BTRFS_INODE_ITEM_KEY) { - struct btrfs_inode_item *dst_item; - dst_item = (struct btrfs_inode_item *)dst_ptr; - if (btrfs_inode_generation(path->nodes[0], dst_item) == 0) { - btrfs_set_inode_generation(path->nodes[0], dst_item, - trans->transid); - } - } -no_copy: - btrfs_mark_buffer_dirty(path->nodes[0]); - btrfs_release_path(path); - return 0; -} - -/* - * simple helper to read an inode off the disk from a given root - * This can only be called for subvolume roots and not for the log - */ -static noinline struct inode *read_one_inode(struct btrfs_root *root, - u64 objectid) -{ - struct btrfs_key key; - struct inode *inode; - - key.objectid = objectid; - key.type = BTRFS_INODE_ITEM_KEY; - key.offset = 0; - inode = btrfs_iget(root->fs_info->sb, &key, root, NULL); - if (IS_ERR(inode)) { - inode = NULL; - } else if (is_bad_inode(inode)) { - iput(inode); - inode = NULL; - } - return inode; -} - -/* replays a single extent in 'eb' at 'slot' with 'key' into the - * subvolume 'root'. path is released on entry and should be released - * on exit. - * - * extents in the log tree have not been allocated out of the extent - * tree yet. So, this completes the allocation, taking a reference - * as required if the extent already exists or creating a new extent - * if it isn't in the extent allocation tree yet. - * - * The extent is inserted into the file, dropping any existing extents - * from the file that overlap the new one. - */ -static noinline int replay_one_extent(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct btrfs_path *path, - struct extent_buffer *eb, int slot, - struct btrfs_key *key) -{ - int found_type; - u64 mask = root->sectorsize - 1; - u64 extent_end; - u64 alloc_hint; - u64 start = key->offset; - u64 saved_nbytes; - struct btrfs_file_extent_item *item; - struct inode *inode = NULL; - unsigned long size; - int ret = 0; - - item = btrfs_item_ptr(eb, slot, struct btrfs_file_extent_item); - found_type = btrfs_file_extent_type(eb, item); - - if (found_type == BTRFS_FILE_EXTENT_REG || - found_type == BTRFS_FILE_EXTENT_PREALLOC) - extent_end = start + btrfs_file_extent_num_bytes(eb, item); - else if (found_type == BTRFS_FILE_EXTENT_INLINE) { - size = btrfs_file_extent_inline_len(eb, item); - extent_end = (start + size + mask) & ~mask; - } else { - ret = 0; - goto out; - } - - inode = read_one_inode(root, key->objectid); - if (!inode) { - ret = -EIO; - goto out; - } - - /* - * first check to see if we already have this extent in the - * file. 
This must be done before the btrfs_drop_extents run - * so we don't try to drop this extent. - */ - ret = btrfs_lookup_file_extent(trans, root, path, btrfs_ino(inode), - start, 0); - - if (ret == 0 && - (found_type == BTRFS_FILE_EXTENT_REG || - found_type == BTRFS_FILE_EXTENT_PREALLOC)) { - struct btrfs_file_extent_item cmp1; - struct btrfs_file_extent_item cmp2; - struct btrfs_file_extent_item *existing; - struct extent_buffer *leaf; - - leaf = path->nodes[0]; - existing = btrfs_item_ptr(leaf, path->slots[0], - struct btrfs_file_extent_item); - - read_extent_buffer(eb, &cmp1, (unsigned long)item, - sizeof(cmp1)); - read_extent_buffer(leaf, &cmp2, (unsigned long)existing, - sizeof(cmp2)); - - /* - * we already have a pointer to this exact extent, - * we don't have to do anything - */ - if (memcmp(&cmp1, &cmp2, sizeof(cmp1)) == 0) { - btrfs_release_path(path); - goto out; - } - } - btrfs_release_path(path); - - saved_nbytes = inode_get_bytes(inode); - /* drop any overlapping extents */ - ret = btrfs_drop_extents(trans, inode, start, extent_end, - &alloc_hint, 1); - BUG_ON(ret); - - if (found_type == BTRFS_FILE_EXTENT_REG || - found_type == BTRFS_FILE_EXTENT_PREALLOC) { - u64 offset; - unsigned long dest_offset; - struct btrfs_key ins; - - ret = btrfs_insert_empty_item(trans, root, path, key, - sizeof(*item)); - BUG_ON(ret); - dest_offset = btrfs_item_ptr_offset(path->nodes[0], - path->slots[0]); - copy_extent_buffer(path->nodes[0], eb, dest_offset, - (unsigned long)item, sizeof(*item)); - - ins.objectid = btrfs_file_extent_disk_bytenr(eb, item); - ins.offset = btrfs_file_extent_disk_num_bytes(eb, item); - ins.type = BTRFS_EXTENT_ITEM_KEY; - offset = key->offset - btrfs_file_extent_offset(eb, item); - - if (ins.objectid > 0) { - u64 csum_start; - u64 csum_end; - LIST_HEAD(ordered_sums); - /* - * is this extent already allocated in the extent - * allocation tree? 
If so, just add a reference - */ - ret = btrfs_lookup_extent(root, ins.objectid, - ins.offset); - if (ret == 0) { - ret = btrfs_inc_extent_ref(trans, root, - ins.objectid, ins.offset, - 0, root->root_key.objectid, - key->objectid, offset, 0); - BUG_ON(ret); - } else { - /* - * insert the extent pointer in the extent - * allocation tree - */ - ret = btrfs_alloc_logged_file_extent(trans, - root, root->root_key.objectid, - key->objectid, offset, &ins); - BUG_ON(ret); - } - btrfs_release_path(path); - - if (btrfs_file_extent_compression(eb, item)) { - csum_start = ins.objectid; - csum_end = csum_start + ins.offset; - } else { - csum_start = ins.objectid + - btrfs_file_extent_offset(eb, item); - csum_end = csum_start + - btrfs_file_extent_num_bytes(eb, item); - } - - ret = btrfs_lookup_csums_range(root->log_root, - csum_start, csum_end - 1, - &ordered_sums, 0); - BUG_ON(ret); - while (!list_empty(&ordered_sums)) { - struct btrfs_ordered_sum *sums; - sums = list_entry(ordered_sums.next, - struct btrfs_ordered_sum, - list); - ret = btrfs_csum_file_blocks(trans, - root->fs_info->csum_root, - sums); - BUG_ON(ret); - list_del(&sums->list); - kfree(sums); - } - } else { - btrfs_release_path(path); - } - } else if (found_type == BTRFS_FILE_EXTENT_INLINE) { - /* inline extents are easy, we just overwrite them */ - ret = overwrite_item(trans, root, path, eb, slot, key); - BUG_ON(ret); - } - - inode_set_bytes(inode, saved_nbytes); - btrfs_update_inode(trans, root, inode); -out: - if (inode) - iput(inode); - return ret; -} - -/* - * when cleaning up conflicts between the directory names in the - * subvolume, directory names in the log and directory names in the - * inode back references, we may have to unlink inodes from directories. - * - * This is a helper function to do the unlink of a specific directory - * item - */ -static noinline int drop_one_dir_item(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct btrfs_path *path, - struct inode *dir, - struct btrfs_dir_item *di) -{ - struct inode *inode; - char *name; - int name_len; - struct extent_buffer *leaf; - struct btrfs_key location; - int ret; - - leaf = path->nodes[0]; - - btrfs_dir_item_key_to_cpu(leaf, di, &location); - name_len = btrfs_dir_name_len(leaf, di); - name = kmalloc(name_len, GFP_NOFS); - if (!name) - return -ENOMEM; - - read_extent_buffer(leaf, name, (unsigned long)(di + 1), name_len); - btrfs_release_path(path); - - inode = read_one_inode(root, location.objectid); - if (!inode) { - kfree(name); - return -EIO; - } - - ret = link_to_fixup_dir(trans, root, path, location.objectid); - BUG_ON(ret); - - ret = btrfs_unlink_inode(trans, root, dir, inode, name, name_len); - BUG_ON(ret); - kfree(name); - - iput(inode); - - btrfs_run_delayed_items(trans, root); - return ret; -} - -/* - * helper function to see if a given name and sequence number found - * in an inode back reference are already in a directory and correctly - * point to this inode - */ -static noinline int inode_in_dir(struct btrfs_root *root, - struct btrfs_path *path, - u64 dirid, u64 objectid, u64 index, - const char *name, int name_len) -{ - struct btrfs_dir_item *di; - struct btrfs_key location; - int match = 0; - - di = btrfs_lookup_dir_index_item(NULL, root, path, dirid, - index, name, name_len, 0); - if (di && !IS_ERR(di)) { - btrfs_dir_item_key_to_cpu(path->nodes[0], di, &location); - if (location.objectid != objectid) - goto out; - } else - goto out; - btrfs_release_path(path); - - di = btrfs_lookup_dir_item(NULL, root, path, dirid, name, name_len, 
0); - if (di && !IS_ERR(di)) { - btrfs_dir_item_key_to_cpu(path->nodes[0], di, &location); - if (location.objectid != objectid) - goto out; - } else - goto out; - match = 1; -out: - btrfs_release_path(path); - return match; -} - -/* - * helper function to check a log tree for a named back reference in - * an inode. This is used to decide if a back reference that is - * found in the subvolume conflicts with what we find in the log. - * - * inode backreferences may have multiple refs in a single item, - * during replay we process one reference at a time, and we don't - * want to delete valid links to a file from the subvolume if that - * link is also in the log. - */ -static noinline int backref_in_log(struct btrfs_root *log, - struct btrfs_key *key, - char *name, int namelen) -{ - struct btrfs_path *path; - struct btrfs_inode_ref *ref; - unsigned long ptr; - unsigned long ptr_end; - unsigned long name_ptr; - int found_name_len; - int item_size; - int ret; - int match = 0; - - path = btrfs_alloc_path(); - if (!path) - return -ENOMEM; - - ret = btrfs_search_slot(NULL, log, key, path, 0, 0); - if (ret != 0) - goto out; - - item_size = btrfs_item_size_nr(path->nodes[0], path->slots[0]); - ptr = btrfs_item_ptr_offset(path->nodes[0], path->slots[0]); - ptr_end = ptr + item_size; - while (ptr < ptr_end) { - ref = (struct btrfs_inode_ref *)ptr; - found_name_len = btrfs_inode_ref_name_len(path->nodes[0], ref); - if (found_name_len == namelen) { - name_ptr = (unsigned long)(ref + 1); - ret = memcmp_extent_buffer(path->nodes[0], name, - name_ptr, namelen); - if (ret == 0) { - match = 1; - goto out; - } - } - ptr = (unsigned long)(ref + 1) + found_name_len; - } -out: - btrfs_free_path(path); - return match; -} - - -/* - * replay one inode back reference item found in the log tree. - * eb, slot and key refer to the buffer and key found in the log tree. - * root is the destination we are replaying into, and path is for temp - * use by this function. (it should be released on return). - */ -static noinline int add_inode_ref(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct btrfs_root *log, - struct btrfs_path *path, - struct extent_buffer *eb, int slot, - struct btrfs_key *key) -{ - struct btrfs_inode_ref *ref; - struct btrfs_dir_item *di; - struct inode *dir; - struct inode *inode; - unsigned long ref_ptr; - unsigned long ref_end; - char *name; - int namelen; - int ret; - int search_done = 0; - - /* - * it is possible that we didn't log all the parent directories - * for a given inode. If we don't find the dir, just don't - * copy the back ref in. The link count fixup code will take - * care of the rest - */ - dir = read_one_inode(root, key->offset); - if (!dir) - return -ENOENT; - - inode = read_one_inode(root, key->objectid); - if (!inode) { - iput(dir); - return -EIO; - } - - ref_ptr = btrfs_item_ptr_offset(eb, slot); - ref_end = ref_ptr + btrfs_item_size_nr(eb, slot); - -again: - ref = (struct btrfs_inode_ref *)ref_ptr; - - namelen = btrfs_inode_ref_name_len(eb, ref); - name = kmalloc(namelen, GFP_NOFS); - BUG_ON(!name); - - read_extent_buffer(eb, name, (unsigned long)(ref + 1), namelen); - - /* if we already have a perfect match, we're done */ - if (inode_in_dir(root, path, btrfs_ino(dir), btrfs_ino(inode), - btrfs_inode_ref_index(eb, ref), - name, namelen)) { - goto out; - } - - /* - * look for a conflicting back reference in the metadata. - * if we find one we have to unlink that name of the file - * before we add our new link. 
Later on, we overwrite any - * existing back reference, and we don't want to create - * dangling pointers in the directory. - */ - - if (search_done) - goto insert; - - ret = btrfs_search_slot(NULL, root, key, path, 0, 0); - if (ret == 0) { - char *victim_name; - int victim_name_len; - struct btrfs_inode_ref *victim_ref; - unsigned long ptr; - unsigned long ptr_end; - struct extent_buffer *leaf = path->nodes[0]; - - /* are we trying to overwrite a back ref for the root directory - * if so, just jump out, we're done - */ - if (key->objectid == key->offset) - goto out_nowrite; - - /* check all the names in this back reference to see - * if they are in the log. if so, we allow them to stay - * otherwise they must be unlinked as a conflict - */ - ptr = btrfs_item_ptr_offset(leaf, path->slots[0]); - ptr_end = ptr + btrfs_item_size_nr(leaf, path->slots[0]); - while (ptr < ptr_end) { - victim_ref = (struct btrfs_inode_ref *)ptr; - victim_name_len = btrfs_inode_ref_name_len(leaf, - victim_ref); - victim_name = kmalloc(victim_name_len, GFP_NOFS); - BUG_ON(!victim_name); - - read_extent_buffer(leaf, victim_name, - (unsigned long)(victim_ref + 1), - victim_name_len); - - if (!backref_in_log(log, key, victim_name, - victim_name_len)) { - btrfs_inc_nlink(inode); - btrfs_release_path(path); - - ret = btrfs_unlink_inode(trans, root, dir, - inode, victim_name, - victim_name_len); - btrfs_run_delayed_items(trans, root); - } - kfree(victim_name); - ptr = (unsigned long)(victim_ref + 1) + victim_name_len; - } - BUG_ON(ret); - - /* - * NOTE: we have searched root tree and checked the - * coresponding ref, it does not need to check again. - */ - search_done = 1; - } - btrfs_release_path(path); - - /* look for a conflicting sequence number */ - di = btrfs_lookup_dir_index_item(trans, root, path, btrfs_ino(dir), - btrfs_inode_ref_index(eb, ref), - name, namelen, 0); - if (di && !IS_ERR(di)) { - ret = drop_one_dir_item(trans, root, path, dir, di); - BUG_ON(ret); - } - btrfs_release_path(path); - - /* look for a conflicing name */ - di = btrfs_lookup_dir_item(trans, root, path, btrfs_ino(dir), - name, namelen, 0); - if (di && !IS_ERR(di)) { - ret = drop_one_dir_item(trans, root, path, dir, di); - BUG_ON(ret); - } - btrfs_release_path(path); - -insert: - /* insert our name */ - ret = btrfs_add_link(trans, dir, inode, name, namelen, 0, - btrfs_inode_ref_index(eb, ref)); - BUG_ON(ret); - - btrfs_update_inode(trans, root, inode); - -out: - ref_ptr = (unsigned long)(ref + 1) + namelen; - kfree(name); - if (ref_ptr < ref_end) - goto again; - - /* finally write the back reference in the inode */ - ret = overwrite_item(trans, root, path, eb, slot, key); - BUG_ON(ret); - -out_nowrite: - btrfs_release_path(path); - iput(dir); - iput(inode); - return 0; -} - -static int insert_orphan_item(struct btrfs_trans_handle *trans, - struct btrfs_root *root, u64 offset) -{ - int ret; - ret = btrfs_find_orphan_item(root, offset); - if (ret > 0) - ret = btrfs_insert_orphan_item(trans, root, offset); - return ret; -} - - -/* - * There are a few corners where the link count of the file can't - * be properly maintained during replay. So, instead of adding - * lots of complexity to the log code, we just scan the backrefs - * for any file that has been through replay. - * - * The scan will update the link count on the inode to reflect the - * number of back refs found. If it goes down to zero, the iput - * will free the inode. 
- */
-static noinline int fixup_inode_link_count(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
- struct inode *inode)
-{
- struct btrfs_path *path;
- int ret;
- struct btrfs_key key;
- u64 nlink = 0;
- unsigned long ptr;
- unsigned long ptr_end;
- int name_len;
- u64 ino = btrfs_ino(inode);
-
- key.objectid = ino;
- key.type = BTRFS_INODE_REF_KEY;
- key.offset = (u64)-1;
-
- path = btrfs_alloc_path();
- if (!path)
- return -ENOMEM;
-
- while (1) {
- ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
- if (ret < 0)
- break;
- if (ret > 0) {
- if (path->slots[0] == 0)
- break;
- path->slots[0]--;
- }
- btrfs_item_key_to_cpu(path->nodes[0], &key,
- path->slots[0]);
- if (key.objectid != ino ||
- key.type != BTRFS_INODE_REF_KEY)
- break;
- ptr = btrfs_item_ptr_offset(path->nodes[0], path->slots[0]);
- ptr_end = ptr + btrfs_item_size_nr(path->nodes[0],
- path->slots[0]);
- while (ptr < ptr_end) {
- struct btrfs_inode_ref *ref;
-
- ref = (struct btrfs_inode_ref *)ptr;
- name_len = btrfs_inode_ref_name_len(path->nodes[0],
- ref);
- ptr = (unsigned long)(ref + 1) + name_len;
- nlink++;
- }
-
- if (key.offset == 0)
- break;
- key.offset--;
- btrfs_release_path(path);
- }
- btrfs_release_path(path);
- if (nlink != inode->i_nlink) {
- set_nlink(inode, nlink);
- btrfs_update_inode(trans, root, inode);
- }
- BTRFS_I(inode)->index_cnt = (u64)-1;
-
- if (inode->i_nlink == 0) {
- if (S_ISDIR(inode->i_mode)) {
- ret = replay_dir_deletes(trans, root, NULL, path,
- ino, 1);
- BUG_ON(ret);
- }
- ret = insert_orphan_item(trans, root, ino);
- BUG_ON(ret);
- }
- btrfs_free_path(path);
-
- return 0;
-}
-
-static noinline int fixup_inode_link_counts(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
- struct btrfs_path *path)
-{
- int ret;
- struct btrfs_key key;
- struct inode *inode;
-
- key.objectid = BTRFS_TREE_LOG_FIXUP_OBJECTID;
- key.type = BTRFS_ORPHAN_ITEM_KEY;
- key.offset = (u64)-1;
- while (1) {
- ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
- if (ret < 0)
- break;
-
- if (ret == 1) {
- if (path->slots[0] == 0)
- break;
- path->slots[0]--;
- }
-
- btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
- if (key.objectid != BTRFS_TREE_LOG_FIXUP_OBJECTID ||
- key.type != BTRFS_ORPHAN_ITEM_KEY)
- break;
-
- ret = btrfs_del_item(trans, root, path);
- if (ret)
- goto out;
-
- btrfs_release_path(path);
- inode = read_one_inode(root, key.offset);
- if (!inode)
- return -EIO;
-
- ret = fixup_inode_link_count(trans, root, inode);
- BUG_ON(ret);
-
- iput(inode);
-
- /*
- * fixup on a directory may create new entries,
- * make sure we always look for the highest possible
- * offset
- */
- key.offset = (u64)-1;
- }
- ret = 0;
-out:
- btrfs_release_path(path);
- return ret;
-}
-
-
-/*
- * record a given inode in the fixup dir so we can check its link
- * count when replay is done.
The link count is incremented here - * so the inode won't go away until we check it - */ -static noinline int link_to_fixup_dir(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct btrfs_path *path, - u64 objectid) -{ - struct btrfs_key key; - int ret = 0; - struct inode *inode; - - inode = read_one_inode(root, objectid); - if (!inode) - return -EIO; - - key.objectid = BTRFS_TREE_LOG_FIXUP_OBJECTID; - btrfs_set_key_type(&key, BTRFS_ORPHAN_ITEM_KEY); - key.offset = objectid; - - ret = btrfs_insert_empty_item(trans, root, path, &key, 0); - - btrfs_release_path(path); - if (ret == 0) { - btrfs_inc_nlink(inode); - btrfs_update_inode(trans, root, inode); - } else if (ret == -EEXIST) { - ret = 0; - } else { - BUG(); - } - iput(inode); - - return ret; -} - -/* - * when replaying the log for a directory, we only insert names - * for inodes that actually exist. This means an fsync on a directory - * does not implicitly fsync all the new files in it - */ -static noinline int insert_one_name(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct btrfs_path *path, - u64 dirid, u64 index, - char *name, int name_len, u8 type, - struct btrfs_key *location) -{ - struct inode *inode; - struct inode *dir; - int ret; - - inode = read_one_inode(root, location->objectid); - if (!inode) - return -ENOENT; - - dir = read_one_inode(root, dirid); - if (!dir) { - iput(inode); - return -EIO; - } - ret = btrfs_add_link(trans, dir, inode, name, name_len, 1, index); - - /* FIXME, put inode into FIXUP list */ - - iput(inode); - iput(dir); - return ret; -} - -/* - * take a single entry in a log directory item and replay it into - * the subvolume. - * - * if a conflicting item exists in the subdirectory already, - * the inode it points to is unlinked and put into the link count - * fix up tree. - * - * If a name from the log points to a file or directory that does - * not exist in the FS, it is skipped. fsyncs on directories - * do not force down inodes inside that directory, just changes to the - * names or unlinks in a directory. 
- */ -static noinline int replay_one_name(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct btrfs_path *path, - struct extent_buffer *eb, - struct btrfs_dir_item *di, - struct btrfs_key *key) -{ - char *name; - int name_len; - struct btrfs_dir_item *dst_di; - struct btrfs_key found_key; - struct btrfs_key log_key; - struct inode *dir; - u8 log_type; - int exists; - int ret; - - dir = read_one_inode(root, key->objectid); - if (!dir) - return -EIO; - - name_len = btrfs_dir_name_len(eb, di); - name = kmalloc(name_len, GFP_NOFS); - if (!name) - return -ENOMEM; - - log_type = btrfs_dir_type(eb, di); - read_extent_buffer(eb, name, (unsigned long)(di + 1), - name_len); - - btrfs_dir_item_key_to_cpu(eb, di, &log_key); - exists = btrfs_lookup_inode(trans, root, path, &log_key, 0); - if (exists == 0) - exists = 1; - else - exists = 0; - btrfs_release_path(path); - - if (key->type == BTRFS_DIR_ITEM_KEY) { - dst_di = btrfs_lookup_dir_item(trans, root, path, key->objectid, - name, name_len, 1); - } else if (key->type == BTRFS_DIR_INDEX_KEY) { - dst_di = btrfs_lookup_dir_index_item(trans, root, path, - key->objectid, - key->offset, name, - name_len, 1); - } else { - BUG(); - } - if (IS_ERR_OR_NULL(dst_di)) { - /* we need a sequence number to insert, so we only - * do inserts for the BTRFS_DIR_INDEX_KEY types - */ - if (key->type != BTRFS_DIR_INDEX_KEY) - goto out; - goto insert; - } - - btrfs_dir_item_key_to_cpu(path->nodes[0], dst_di, &found_key); - /* the existing item matches the logged item */ - if (found_key.objectid == log_key.objectid && - found_key.type == log_key.type && - found_key.offset == log_key.offset && - btrfs_dir_type(path->nodes[0], dst_di) == log_type) { - goto out; - } - - /* - * don't drop the conflicting directory entry if the inode - * for the new entry doesn't exist - */ - if (!exists) - goto out; - - ret = drop_one_dir_item(trans, root, path, dir, dst_di); - BUG_ON(ret); - - if (key->type == BTRFS_DIR_INDEX_KEY) - goto insert; -out: - btrfs_release_path(path); - kfree(name); - iput(dir); - return 0; - -insert: - btrfs_release_path(path); - ret = insert_one_name(trans, root, path, key->objectid, key->offset, - name, name_len, log_type, &log_key); - - BUG_ON(ret && ret != -ENOENT); - goto out; -} - -/* - * find all the names in a directory item and reconcile them into - * the subvolume. Only BTRFS_DIR_ITEM_KEY types will have more than - * one name in a directory item, but the same code gets used for - * both directory index types - */ -static noinline int replay_one_dir_item(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct btrfs_path *path, - struct extent_buffer *eb, int slot, - struct btrfs_key *key) -{ - int ret; - u32 item_size = btrfs_item_size_nr(eb, slot); - struct btrfs_dir_item *di; - int name_len; - unsigned long ptr; - unsigned long ptr_end; - - ptr = btrfs_item_ptr_offset(eb, slot); - ptr_end = ptr + item_size; - while (ptr < ptr_end) { - di = (struct btrfs_dir_item *)ptr; - if (verify_dir_item(root, eb, di)) - return -EIO; - name_len = btrfs_dir_name_len(eb, di); - ret = replay_one_name(trans, root, path, eb, di, key); - BUG_ON(ret); - ptr = (unsigned long)(di + 1); - ptr += name_len; - } - return 0; -} - -/* - * directory replay has two parts. There are the standard directory - * items in the log copied from the subvolume, and range items - * created in the log while the subvolume was logged. - * - * The range items tell us which parts of the key space the log - * is authoritative for. 
During replay, if a key in the subvolume - * directory is in a logged range item, but not actually in the log - * that means it was deleted from the directory before the fsync - * and should be removed. - */ -static noinline int find_dir_range(struct btrfs_root *root, - struct btrfs_path *path, - u64 dirid, int key_type, - u64 *start_ret, u64 *end_ret) -{ - struct btrfs_key key; - u64 found_end; - struct btrfs_dir_log_item *item; - int ret; - int nritems; - - if (*start_ret == (u64)-1) - return 1; - - key.objectid = dirid; - key.type = key_type; - key.offset = *start_ret; - - ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); - if (ret < 0) - goto out; - if (ret > 0) { - if (path->slots[0] == 0) - goto out; - path->slots[0]--; - } - if (ret != 0) - btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); - - if (key.type != key_type || key.objectid != dirid) { - ret = 1; - goto next; - } - item = btrfs_item_ptr(path->nodes[0], path->slots[0], - struct btrfs_dir_log_item); - found_end = btrfs_dir_log_end(path->nodes[0], item); - - if (*start_ret >= key.offset && *start_ret <= found_end) { - ret = 0; - *start_ret = key.offset; - *end_ret = found_end; - goto out; - } - ret = 1; -next: - /* check the next slot in the tree to see if it is a valid item */ - nritems = btrfs_header_nritems(path->nodes[0]); - if (path->slots[0] >= nritems) { - ret = btrfs_next_leaf(root, path); - if (ret) - goto out; - } else { - path->slots[0]++; - } - - btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); - - if (key.type != key_type || key.objectid != dirid) { - ret = 1; - goto out; - } - item = btrfs_item_ptr(path->nodes[0], path->slots[0], - struct btrfs_dir_log_item); - found_end = btrfs_dir_log_end(path->nodes[0], item); - *start_ret = key.offset; - *end_ret = found_end; - ret = 0; -out: - btrfs_release_path(path); - return ret; -} - -/* - * this looks for a given directory item in the log. 
If the directory - * item is not in the log, the item is removed and the inode it points - * to is unlinked - */ -static noinline int check_item_in_log(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct btrfs_root *log, - struct btrfs_path *path, - struct btrfs_path *log_path, - struct inode *dir, - struct btrfs_key *dir_key) -{ - int ret; - struct extent_buffer *eb; - int slot; - u32 item_size; - struct btrfs_dir_item *di; - struct btrfs_dir_item *log_di; - int name_len; - unsigned long ptr; - unsigned long ptr_end; - char *name; - struct inode *inode; - struct btrfs_key location; - -again: - eb = path->nodes[0]; - slot = path->slots[0]; - item_size = btrfs_item_size_nr(eb, slot); - ptr = btrfs_item_ptr_offset(eb, slot); - ptr_end = ptr + item_size; - while (ptr < ptr_end) { - di = (struct btrfs_dir_item *)ptr; - if (verify_dir_item(root, eb, di)) { - ret = -EIO; - goto out; - } - - name_len = btrfs_dir_name_len(eb, di); - name = kmalloc(name_len, GFP_NOFS); - if (!name) { - ret = -ENOMEM; - goto out; - } - read_extent_buffer(eb, name, (unsigned long)(di + 1), - name_len); - log_di = NULL; - if (log && dir_key->type == BTRFS_DIR_ITEM_KEY) { - log_di = btrfs_lookup_dir_item(trans, log, log_path, - dir_key->objectid, - name, name_len, 0); - } else if (log && dir_key->type == BTRFS_DIR_INDEX_KEY) { - log_di = btrfs_lookup_dir_index_item(trans, log, - log_path, - dir_key->objectid, - dir_key->offset, - name, name_len, 0); - } - if (IS_ERR_OR_NULL(log_di)) { - btrfs_dir_item_key_to_cpu(eb, di, &location); - btrfs_release_path(path); - btrfs_release_path(log_path); - inode = read_one_inode(root, location.objectid); - if (!inode) { - kfree(name); - return -EIO; - } - - ret = link_to_fixup_dir(trans, root, - path, location.objectid); - BUG_ON(ret); - btrfs_inc_nlink(inode); - ret = btrfs_unlink_inode(trans, root, dir, inode, - name, name_len); - BUG_ON(ret); - - btrfs_run_delayed_items(trans, root); - - kfree(name); - iput(inode); - - /* there might still be more names under this key - * check and repeat if required - */ - ret = btrfs_search_slot(NULL, root, dir_key, path, - 0, 0); - if (ret == 0) - goto again; - ret = 0; - goto out; - } - btrfs_release_path(log_path); - kfree(name); - - ptr = (unsigned long)(di + 1); - ptr += name_len; - } - ret = 0; -out: - btrfs_release_path(path); - btrfs_release_path(log_path); - return ret; -} - -/* - * deletion replay happens before we copy any new directory items - * out of the log or out of backreferences from inodes. It - * scans the log to find ranges of keys that log is authoritative for, - * and then scans the directory to find items in those ranges that are - * not present in the log. - * - * Anything we don't find in the log is unlinked and removed from the - * directory. 
- */ -static noinline int replay_dir_deletes(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct btrfs_root *log, - struct btrfs_path *path, - u64 dirid, int del_all) -{ - u64 range_start; - u64 range_end; - int key_type = BTRFS_DIR_LOG_ITEM_KEY; - int ret = 0; - struct btrfs_key dir_key; - struct btrfs_key found_key; - struct btrfs_path *log_path; - struct inode *dir; - - dir_key.objectid = dirid; - dir_key.type = BTRFS_DIR_ITEM_KEY; - log_path = btrfs_alloc_path(); - if (!log_path) - return -ENOMEM; - - dir = read_one_inode(root, dirid); - /* it isn't an error if the inode isn't there, that can happen - * because we replay the deletes before we copy in the inode item - * from the log - */ - if (!dir) { - btrfs_free_path(log_path); - return 0; - } -again: - range_start = 0; - range_end = 0; - while (1) { - if (del_all) - range_end = (u64)-1; - else { - ret = find_dir_range(log, path, dirid, key_type, - &range_start, &range_end); - if (ret != 0) - break; - } - - dir_key.offset = range_start; - while (1) { - int nritems; - ret = btrfs_search_slot(NULL, root, &dir_key, path, - 0, 0); - if (ret < 0) - goto out; - - nritems = btrfs_header_nritems(path->nodes[0]); - if (path->slots[0] >= nritems) { - ret = btrfs_next_leaf(root, path); - if (ret) - break; - } - btrfs_item_key_to_cpu(path->nodes[0], &found_key, - path->slots[0]); - if (found_key.objectid != dirid || - found_key.type != dir_key.type) - goto next_type; - - if (found_key.offset > range_end) - break; - - ret = check_item_in_log(trans, root, log, path, - log_path, dir, - &found_key); - BUG_ON(ret); - if (found_key.offset == (u64)-1) - break; - dir_key.offset = found_key.offset + 1; - } - btrfs_release_path(path); - if (range_end == (u64)-1) - break; - range_start = range_end + 1; - } - -next_type: - ret = 0; - if (key_type == BTRFS_DIR_LOG_ITEM_KEY) { - key_type = BTRFS_DIR_LOG_INDEX_KEY; - dir_key.type = BTRFS_DIR_INDEX_KEY; - btrfs_release_path(path); - goto again; - } -out: - btrfs_release_path(path); - btrfs_free_path(log_path); - iput(dir); - return ret; -} - -/* - * the process_func used to replay items from the log tree. This - * gets called in two different stages. The first stage just looks - * for inodes and makes sure they are all copied into the subvolume. - * - * The second stage copies all the other item types from the log into - * the subvolume. The two stage approach is slower, but gets rid of - * lots of complexity around inodes referencing other inodes that exist - * only in the log (references come from either directory items or inode - * back refs). 
- */ -static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb, - struct walk_control *wc, u64 gen) -{ - int nritems; - struct btrfs_path *path; - struct btrfs_root *root = wc->replay_dest; - struct btrfs_key key; - int level; - int i; - int ret; - - btrfs_read_buffer(eb, gen); - - level = btrfs_header_level(eb); - - if (level != 0) - return 0; - - path = btrfs_alloc_path(); - if (!path) - return -ENOMEM; - - nritems = btrfs_header_nritems(eb); - for (i = 0; i < nritems; i++) { - btrfs_item_key_to_cpu(eb, &key, i); - - /* inode keys are done during the first stage */ - if (key.type == BTRFS_INODE_ITEM_KEY && - wc->stage == LOG_WALK_REPLAY_INODES) { - struct btrfs_inode_item *inode_item; - u32 mode; - - inode_item = btrfs_item_ptr(eb, i, - struct btrfs_inode_item); - mode = btrfs_inode_mode(eb, inode_item); - if (S_ISDIR(mode)) { - ret = replay_dir_deletes(wc->trans, - root, log, path, key.objectid, 0); - BUG_ON(ret); - } - ret = overwrite_item(wc->trans, root, path, - eb, i, &key); - BUG_ON(ret); - - /* for regular files, make sure corresponding - * orhpan item exist. extents past the new EOF - * will be truncated later by orphan cleanup. - */ - if (S_ISREG(mode)) { - ret = insert_orphan_item(wc->trans, root, - key.objectid); - BUG_ON(ret); - } - - ret = link_to_fixup_dir(wc->trans, root, - path, key.objectid); - BUG_ON(ret); - } - if (wc->stage < LOG_WALK_REPLAY_ALL) - continue; - - /* these keys are simply copied */ - if (key.type == BTRFS_XATTR_ITEM_KEY) { - ret = overwrite_item(wc->trans, root, path, - eb, i, &key); - BUG_ON(ret); - } else if (key.type == BTRFS_INODE_REF_KEY) { - ret = add_inode_ref(wc->trans, root, log, path, - eb, i, &key); - BUG_ON(ret && ret != -ENOENT); - } else if (key.type == BTRFS_EXTENT_DATA_KEY) { - ret = replay_one_extent(wc->trans, root, path, - eb, i, &key); - BUG_ON(ret); - } else if (key.type == BTRFS_DIR_ITEM_KEY || - key.type == BTRFS_DIR_INDEX_KEY) { - ret = replay_one_dir_item(wc->trans, root, path, - eb, i, &key); - BUG_ON(ret); - } - } - btrfs_free_path(path); - return 0; -} - -static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct btrfs_path *path, int *level, - struct walk_control *wc) -{ - u64 root_owner; - u64 bytenr; - u64 ptr_gen; - struct extent_buffer *next; - struct extent_buffer *cur; - struct extent_buffer *parent; - u32 blocksize; - int ret = 0; - - WARN_ON(*level < 0); - WARN_ON(*level >= BTRFS_MAX_LEVEL); - - while (*level > 0) { - WARN_ON(*level < 0); - WARN_ON(*level >= BTRFS_MAX_LEVEL); - cur = path->nodes[*level]; - - if (btrfs_header_level(cur) != *level) - WARN_ON(1); - - if (path->slots[*level] >= - btrfs_header_nritems(cur)) - break; - - bytenr = btrfs_node_blockptr(cur, path->slots[*level]); - ptr_gen = btrfs_node_ptr_generation(cur, path->slots[*level]); - blocksize = btrfs_level_size(root, *level - 1); - - parent = path->nodes[*level]; - root_owner = btrfs_header_owner(parent); - - next = btrfs_find_create_tree_block(root, bytenr, blocksize); - if (!next) - return -ENOMEM; - - if (*level == 1) { - ret = wc->process_func(root, next, wc, ptr_gen); - if (ret) - return ret; - - path->slots[*level]++; - if (wc->free) { - btrfs_read_buffer(next, ptr_gen); - - btrfs_tree_lock(next); - btrfs_set_lock_blocking(next); - clean_tree_block(trans, root, next); - btrfs_wait_tree_block_writeback(next); - btrfs_tree_unlock(next); - - WARN_ON(root_owner != - BTRFS_TREE_LOG_OBJECTID); - ret = btrfs_free_and_pin_reserved_extent(root, - bytenr, blocksize); - BUG_ON(ret); 
/* -ENOMEM or logic errors */ - } - free_extent_buffer(next); - continue; - } - btrfs_read_buffer(next, ptr_gen); - - WARN_ON(*level <= 0); - if (path->nodes[*level-1]) - free_extent_buffer(path->nodes[*level-1]); - path->nodes[*level-1] = next; - *level = btrfs_header_level(next); - path->slots[*level] = 0; - cond_resched(); - } - WARN_ON(*level < 0); - WARN_ON(*level >= BTRFS_MAX_LEVEL); - - path->slots[*level] = btrfs_header_nritems(path->nodes[*level]); - - cond_resched(); - return 0; -} - -static noinline int walk_up_log_tree(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct btrfs_path *path, int *level, - struct walk_control *wc) -{ - u64 root_owner; - int i; - int slot; - int ret; - - for (i = *level; i < BTRFS_MAX_LEVEL - 1 && path->nodes[i]; i++) { - slot = path->slots[i]; - if (slot + 1 < btrfs_header_nritems(path->nodes[i])) { - path->slots[i]++; - *level = i; - WARN_ON(*level == 0); - return 0; - } else { - struct extent_buffer *parent; - if (path->nodes[*level] == root->node) - parent = path->nodes[*level]; - else - parent = path->nodes[*level + 1]; - - root_owner = btrfs_header_owner(parent); - ret = wc->process_func(root, path->nodes[*level], wc, - btrfs_header_generation(path->nodes[*level])); - if (ret) - return ret; - - if (wc->free) { - struct extent_buffer *next; - - next = path->nodes[*level]; - - btrfs_tree_lock(next); - btrfs_set_lock_blocking(next); - clean_tree_block(trans, root, next); - btrfs_wait_tree_block_writeback(next); - btrfs_tree_unlock(next); - - WARN_ON(root_owner != BTRFS_TREE_LOG_OBJECTID); - ret = btrfs_free_and_pin_reserved_extent(root, - path->nodes[*level]->start, - path->nodes[*level]->len); - BUG_ON(ret); - } - free_extent_buffer(path->nodes[*level]); - path->nodes[*level] = NULL; - *level = i + 1; - } - } - return 1; -} - -/* - * drop the reference count on the tree rooted at 'snap'. This traverses - * the tree freeing any blocks that have a ref count of zero after being - * decremented. - */ -static int walk_log_tree(struct btrfs_trans_handle *trans, - struct btrfs_root *log, struct walk_control *wc) -{ - int ret = 0; - int wret; - int level; - struct btrfs_path *path; - int i; - int orig_level; - - path = btrfs_alloc_path(); - if (!path) - return -ENOMEM; - - level = btrfs_header_level(log->node); - orig_level = level; - path->nodes[level] = log->node; - extent_buffer_get(log->node); - path->slots[level] = 0; - - while (1) { - wret = walk_down_log_tree(trans, log, path, &level, wc); - if (wret > 0) - break; - if (wret < 0) { - ret = wret; - goto out; - } - - wret = walk_up_log_tree(trans, log, path, &level, wc); - if (wret > 0) - break; - if (wret < 0) { - ret = wret; - goto out; - } - } - - /* was the root node processed? 
if not, catch it here */ - if (path->nodes[orig_level]) { - ret = wc->process_func(log, path->nodes[orig_level], wc, - btrfs_header_generation(path->nodes[orig_level])); - if (ret) - goto out; - if (wc->free) { - struct extent_buffer *next; - - next = path->nodes[orig_level]; - - btrfs_tree_lock(next); - btrfs_set_lock_blocking(next); - clean_tree_block(trans, log, next); - btrfs_wait_tree_block_writeback(next); - btrfs_tree_unlock(next); - - WARN_ON(log->root_key.objectid != - BTRFS_TREE_LOG_OBJECTID); - ret = btrfs_free_and_pin_reserved_extent(log, next->start, - next->len); - BUG_ON(ret); /* -ENOMEM or logic errors */ - } - } - -out: - for (i = 0; i <= orig_level; i++) { - if (path->nodes[i]) { - free_extent_buffer(path->nodes[i]); - path->nodes[i] = NULL; - } - } - btrfs_free_path(path); - return ret; -} - -/* - * helper function to update the item for a given subvolumes log root - * in the tree of log roots - */ -static int update_log_root(struct btrfs_trans_handle *trans, - struct btrfs_root *log) -{ - int ret; - - if (log->log_transid == 1) { - /* insert root item on the first sync */ - ret = btrfs_insert_root(trans, log->fs_info->log_root_tree, - &log->root_key, &log->root_item); - } else { - ret = btrfs_update_root(trans, log->fs_info->log_root_tree, - &log->root_key, &log->root_item); - } - return ret; -} - -static int wait_log_commit(struct btrfs_trans_handle *trans, - struct btrfs_root *root, unsigned long transid) -{ - DEFINE_WAIT(wait); - int index = transid % 2; - - /* - * we only allow two pending log transactions at a time, - * so we know that if ours is more than 2 older than the - * current transaction, we're done - */ - do { - prepare_to_wait(&root->log_commit_wait[index], - &wait, TASK_UNINTERRUPTIBLE); - mutex_unlock(&root->log_mutex); - - if (root->fs_info->last_trans_log_full_commit != - trans->transid && root->log_transid < transid + 2 && - atomic_read(&root->log_commit[index])) - schedule(); - - finish_wait(&root->log_commit_wait[index], &wait); - mutex_lock(&root->log_mutex); - } while (root->fs_info->last_trans_log_full_commit != - trans->transid && root->log_transid < transid + 2 && - atomic_read(&root->log_commit[index])); - return 0; -} - -static void wait_for_writer(struct btrfs_trans_handle *trans, - struct btrfs_root *root) -{ - DEFINE_WAIT(wait); - while (root->fs_info->last_trans_log_full_commit != - trans->transid && atomic_read(&root->log_writers)) { - prepare_to_wait(&root->log_writer_wait, - &wait, TASK_UNINTERRUPTIBLE); - mutex_unlock(&root->log_mutex); - if (root->fs_info->last_trans_log_full_commit != - trans->transid && atomic_read(&root->log_writers)) - schedule(); - mutex_lock(&root->log_mutex); - finish_wait(&root->log_writer_wait, &wait); - } -} - -/* - * btrfs_sync_log does sends a given tree log down to the disk and - * updates the super blocks to record it. When this call is done, - * you know that any inodes previously logged are safely on disk only - * if it returns 0. - * - * Any other return value means you need to call btrfs_commit_transaction. - * Some of the edge cases for fsyncing directories that have had unlinks - * or renames done in the past mean that sometimes the only safe - * fsync is to commit the whole FS. When btrfs_sync_log returns -EAGAIN, - * that has happened. 
- */ -int btrfs_sync_log(struct btrfs_trans_handle *trans, - struct btrfs_root *root) -{ - int index1; - int index2; - int mark; - int ret; - struct btrfs_root *log = root->log_root; - struct btrfs_root *log_root_tree = root->fs_info->log_root_tree; - unsigned long log_transid = 0; - - mutex_lock(&root->log_mutex); - index1 = root->log_transid % 2; - if (atomic_read(&root->log_commit[index1])) { - wait_log_commit(trans, root, root->log_transid); - mutex_unlock(&root->log_mutex); - return 0; - } - atomic_set(&root->log_commit[index1], 1); - - /* wait for previous tree log sync to complete */ - if (atomic_read(&root->log_commit[(index1 + 1) % 2])) - wait_log_commit(trans, root, root->log_transid - 1); - while (1) { - unsigned long batch = root->log_batch; - /* when we're on an ssd, just kick the log commit out */ - if (!btrfs_test_opt(root, SSD) && root->log_multiple_pids) { - mutex_unlock(&root->log_mutex); - schedule_timeout_uninterruptible(1); - mutex_lock(&root->log_mutex); - } - wait_for_writer(trans, root); - if (batch == root->log_batch) - break; - } - - /* bail out if we need to do a full commit */ - if (root->fs_info->last_trans_log_full_commit == trans->transid) { - ret = -EAGAIN; - mutex_unlock(&root->log_mutex); - goto out; - } - - log_transid = root->log_transid; - if (log_transid % 2 == 0) - mark = EXTENT_DIRTY; - else - mark = EXTENT_NEW; - - /* we start IO on all the marked extents here, but we don't actually - * wait for them until later. - */ - ret = btrfs_write_marked_extents(log, &log->dirty_log_pages, mark); - if (ret) { - btrfs_abort_transaction(trans, root, ret); - mutex_unlock(&root->log_mutex); - goto out; - } - - btrfs_set_root_node(&log->root_item, log->node); - - root->log_batch = 0; - root->log_transid++; - log->log_transid = root->log_transid; - root->log_start_pid = 0; - smp_mb(); - /* - * IO has been started, blocks of the log tree have WRITTEN flag set - * in their headers. new modifications of the log will be written to - * new positions. so it's safe to allow log writers to go in. 
- */
- mutex_unlock(&root->log_mutex);
-
- mutex_lock(&log_root_tree->log_mutex);
- log_root_tree->log_batch++;
- atomic_inc(&log_root_tree->log_writers);
- mutex_unlock(&log_root_tree->log_mutex);
-
- ret = update_log_root(trans, log);
-
- mutex_lock(&log_root_tree->log_mutex);
- if (atomic_dec_and_test(&log_root_tree->log_writers)) {
- smp_mb();
- if (waitqueue_active(&log_root_tree->log_writer_wait))
- wake_up(&log_root_tree->log_writer_wait);
- }
-
- if (ret) {
- if (ret != -ENOSPC) {
- btrfs_abort_transaction(trans, root, ret);
- mutex_unlock(&log_root_tree->log_mutex);
- goto out;
- }
- root->fs_info->last_trans_log_full_commit = trans->transid;
- btrfs_wait_marked_extents(log, &log->dirty_log_pages, mark);
- mutex_unlock(&log_root_tree->log_mutex);
- ret = -EAGAIN;
- goto out;
- }
-
- index2 = log_root_tree->log_transid % 2;
- if (atomic_read(&log_root_tree->log_commit[index2])) {
- btrfs_wait_marked_extents(log, &log->dirty_log_pages, mark);
- wait_log_commit(trans, log_root_tree,
- log_root_tree->log_transid);
- mutex_unlock(&log_root_tree->log_mutex);
- ret = 0;
- goto out;
- }
- atomic_set(&log_root_tree->log_commit[index2], 1);
-
- if (atomic_read(&log_root_tree->log_commit[(index2 + 1) % 2])) {
- wait_log_commit(trans, log_root_tree,
- log_root_tree->log_transid - 1);
- }
-
- wait_for_writer(trans, log_root_tree);
-
- /*
- * now that we've moved on to the tree of log tree roots,
- * check the full commit flag again
- */
- if (root->fs_info->last_trans_log_full_commit == trans->transid) {
- btrfs_wait_marked_extents(log, &log->dirty_log_pages, mark);
- mutex_unlock(&log_root_tree->log_mutex);
- ret = -EAGAIN;
- goto out_wake_log_root;
- }
-
- ret = btrfs_write_and_wait_marked_extents(log_root_tree,
- &log_root_tree->dirty_log_pages,
- EXTENT_DIRTY | EXTENT_NEW);
- if (ret) {
- btrfs_abort_transaction(trans, root, ret);
- mutex_unlock(&log_root_tree->log_mutex);
- goto out_wake_log_root;
- }
- btrfs_wait_marked_extents(log, &log->dirty_log_pages, mark);
-
- btrfs_set_super_log_root(root->fs_info->super_for_commit,
- log_root_tree->node->start);
- btrfs_set_super_log_root_level(root->fs_info->super_for_commit,
- btrfs_header_level(log_root_tree->node));
-
- log_root_tree->log_batch = 0;
- log_root_tree->log_transid++;
- smp_mb();
-
- mutex_unlock(&log_root_tree->log_mutex);
-
- /*
- * nobody else is going to jump in and write the ctree
- * super here because the log_commit atomic below is protecting
- * us. We must be called with a transaction handle pinning
- * the running transaction open, so a full commit can't hop
- * in and cause problems either.
- */ - btrfs_scrub_pause_super(root); - write_ctree_super(trans, root->fs_info->tree_root, 1); - btrfs_scrub_continue_super(root); - ret = 0; - - mutex_lock(&root->log_mutex); - if (root->last_log_commit < log_transid) - root->last_log_commit = log_transid; - mutex_unlock(&root->log_mutex); - -out_wake_log_root: - atomic_set(&log_root_tree->log_commit[index2], 0); - smp_mb(); - if (waitqueue_active(&log_root_tree->log_commit_wait[index2])) - wake_up(&log_root_tree->log_commit_wait[index2]); -out: - atomic_set(&root->log_commit[index1], 0); - smp_mb(); - if (waitqueue_active(&root->log_commit_wait[index1])) - wake_up(&root->log_commit_wait[index1]); - return ret; -} - -static void free_log_tree(struct btrfs_trans_handle *trans, - struct btrfs_root *log) -{ - int ret; - u64 start; - u64 end; - struct walk_control wc = { - .free = 1, - .process_func = process_one_buffer - }; - - ret = walk_log_tree(trans, log, &wc); - BUG_ON(ret); - - while (1) { - ret = find_first_extent_bit(&log->dirty_log_pages, - 0, &start, &end, EXTENT_DIRTY | EXTENT_NEW); - if (ret) - break; - - clear_extent_bits(&log->dirty_log_pages, start, end, - EXTENT_DIRTY | EXTENT_NEW, GFP_NOFS); - } - - free_extent_buffer(log->node); - kfree(log); -} - -/* - * free all the extents used by the tree log. This should be called - * at commit time of the full transaction - */ -int btrfs_free_log(struct btrfs_trans_handle *trans, struct btrfs_root *root) -{ - if (root->log_root) { - free_log_tree(trans, root->log_root); - root->log_root = NULL; - } - return 0; -} - -int btrfs_free_log_root_tree(struct btrfs_trans_handle *trans, - struct btrfs_fs_info *fs_info) -{ - if (fs_info->log_root_tree) { - free_log_tree(trans, fs_info->log_root_tree); - fs_info->log_root_tree = NULL; - } - return 0; -} - -/* - * If both a file and directory are logged, and unlinks or renames are - * mixed in, we have a few interesting corners: - * - * create file X in dir Y - * link file X to X.link in dir Y - * fsync file X - * unlink file X but leave X.link - * fsync dir Y - * - * After a crash we would expect only X.link to exist. But file X - * didn't get fsync'd again so the log has back refs for X and X.link. - * - * We solve this by removing directory entries and inode backrefs from the - * log when a file that was logged in the current transaction is - * unlinked. Any later fsync will include the updated log entries, and - * we'll be able to reconstruct the proper directory items from backrefs. - * - * This optimizations allows us to avoid relogging the entire inode - * or the entire directory. 
- */ -int btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - const char *name, int name_len, - struct inode *dir, u64 index) -{ - struct btrfs_root *log; - struct btrfs_dir_item *di; - struct btrfs_path *path; - int ret; - int err = 0; - int bytes_del = 0; - u64 dir_ino = btrfs_ino(dir); - - if (BTRFS_I(dir)->logged_trans < trans->transid) - return 0; - - ret = join_running_log_trans(root); - if (ret) - return 0; - - mutex_lock(&BTRFS_I(dir)->log_mutex); - - log = root->log_root; - path = btrfs_alloc_path(); - if (!path) { - err = -ENOMEM; - goto out_unlock; - } - - di = btrfs_lookup_dir_item(trans, log, path, dir_ino, - name, name_len, -1); - if (IS_ERR(di)) { - err = PTR_ERR(di); - goto fail; - } - if (di) { - ret = btrfs_delete_one_dir_name(trans, log, path, di); - bytes_del += name_len; - BUG_ON(ret); - } - btrfs_release_path(path); - di = btrfs_lookup_dir_index_item(trans, log, path, dir_ino, - index, name, name_len, -1); - if (IS_ERR(di)) { - err = PTR_ERR(di); - goto fail; - } - if (di) { - ret = btrfs_delete_one_dir_name(trans, log, path, di); - bytes_del += name_len; - BUG_ON(ret); - } - - /* update the directory size in the log to reflect the names - * we have removed - */ - if (bytes_del) { - struct btrfs_key key; - - key.objectid = dir_ino; - key.offset = 0; - key.type = BTRFS_INODE_ITEM_KEY; - btrfs_release_path(path); - - ret = btrfs_search_slot(trans, log, &key, path, 0, 1); - if (ret < 0) { - err = ret; - goto fail; - } - if (ret == 0) { - struct btrfs_inode_item *item; - u64 i_size; - - item = btrfs_item_ptr(path->nodes[0], path->slots[0], - struct btrfs_inode_item); - i_size = btrfs_inode_size(path->nodes[0], item); - if (i_size > bytes_del) - i_size -= bytes_del; - else - i_size = 0; - btrfs_set_inode_size(path->nodes[0], item, i_size); - btrfs_mark_buffer_dirty(path->nodes[0]); - } else - ret = 0; - btrfs_release_path(path); - } -fail: - btrfs_free_path(path); -out_unlock: - mutex_unlock(&BTRFS_I(dir)->log_mutex); - if (ret == -ENOSPC) { - root->fs_info->last_trans_log_full_commit = trans->transid; - ret = 0; - } else if (ret < 0) - btrfs_abort_transaction(trans, root, ret); - - btrfs_end_log_trans(root); - - return err; -} - -/* see comments for btrfs_del_dir_entries_in_log */ -int btrfs_del_inode_ref_in_log(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - const char *name, int name_len, - struct inode *inode, u64 dirid) -{ - struct btrfs_root *log; - u64 index; - int ret; - - if (BTRFS_I(inode)->logged_trans < trans->transid) - return 0; - - ret = join_running_log_trans(root); - if (ret) - return 0; - log = root->log_root; - mutex_lock(&BTRFS_I(inode)->log_mutex); - - ret = btrfs_del_inode_ref(trans, log, name, name_len, btrfs_ino(inode), - dirid, &index); - mutex_unlock(&BTRFS_I(inode)->log_mutex); - if (ret == -ENOSPC) { - root->fs_info->last_trans_log_full_commit = trans->transid; - ret = 0; - } else if (ret < 0 && ret != -ENOENT) - btrfs_abort_transaction(trans, root, ret); - btrfs_end_log_trans(root); - - return ret; -} - -/* - * creates a range item in the log for 'dirid'. first_offset and - * last_offset tell us which parts of the key space the log should - * be considered authoritative for. 
- */ -static noinline int insert_dir_log_key(struct btrfs_trans_handle *trans, - struct btrfs_root *log, - struct btrfs_path *path, - int key_type, u64 dirid, - u64 first_offset, u64 last_offset) -{ - int ret; - struct btrfs_key key; - struct btrfs_dir_log_item *item; - - key.objectid = dirid; - key.offset = first_offset; - if (key_type == BTRFS_DIR_ITEM_KEY) - key.type = BTRFS_DIR_LOG_ITEM_KEY; - else - key.type = BTRFS_DIR_LOG_INDEX_KEY; - ret = btrfs_insert_empty_item(trans, log, path, &key, sizeof(*item)); - if (ret) - return ret; - - item = btrfs_item_ptr(path->nodes[0], path->slots[0], - struct btrfs_dir_log_item); - btrfs_set_dir_log_end(path->nodes[0], item, last_offset); - btrfs_mark_buffer_dirty(path->nodes[0]); - btrfs_release_path(path); - return 0; -} - -/* - * log all the items included in the current transaction for a given - * directory. This also creates the range items in the log tree required - * to replay anything deleted before the fsync - */ -static noinline int log_dir_items(struct btrfs_trans_handle *trans, - struct btrfs_root *root, struct inode *inode, - struct btrfs_path *path, - struct btrfs_path *dst_path, int key_type, - u64 min_offset, u64 *last_offset_ret) -{ - struct btrfs_key min_key; - struct btrfs_key max_key; - struct btrfs_root *log = root->log_root; - struct extent_buffer *src; - int err = 0; - int ret; - int i; - int nritems; - u64 first_offset = min_offset; - u64 last_offset = (u64)-1; - u64 ino = btrfs_ino(inode); - - log = root->log_root; - max_key.objectid = ino; - max_key.offset = (u64)-1; - max_key.type = key_type; - - min_key.objectid = ino; - min_key.type = key_type; - min_key.offset = min_offset; - - path->keep_locks = 1; - - ret = btrfs_search_forward(root, &min_key, &max_key, - path, 0, trans->transid); - - /* - * we didn't find anything from this transaction, see if there - * is anything at all - */ - if (ret != 0 || min_key.objectid != ino || min_key.type != key_type) { - min_key.objectid = ino; - min_key.type = key_type; - min_key.offset = (u64)-1; - btrfs_release_path(path); - ret = btrfs_search_slot(NULL, root, &min_key, path, 0, 0); - if (ret < 0) { - btrfs_release_path(path); - return ret; - } - ret = btrfs_previous_item(root, path, ino, key_type); - - /* if ret == 0 there are items for this type, - * create a range to tell us the last key of this type. - * otherwise, there are no items in this directory after - * *min_offset, and we create a range to indicate that. 
- */ - if (ret == 0) { - struct btrfs_key tmp; - btrfs_item_key_to_cpu(path->nodes[0], &tmp, - path->slots[0]); - if (key_type == tmp.type) - first_offset = max(min_offset, tmp.offset) + 1; - } - goto done; - } - - /* go backward to find any previous key */ - ret = btrfs_previous_item(root, path, ino, key_type); - if (ret == 0) { - struct btrfs_key tmp; - btrfs_item_key_to_cpu(path->nodes[0], &tmp, path->slots[0]); - if (key_type == tmp.type) { - first_offset = tmp.offset; - ret = overwrite_item(trans, log, dst_path, - path->nodes[0], path->slots[0], - &tmp); - if (ret) { - err = ret; - goto done; - } - } - } - btrfs_release_path(path); - - /* find the first key from this transaction again */ - ret = btrfs_search_slot(NULL, root, &min_key, path, 0, 0); - if (ret != 0) { - WARN_ON(1); - goto done; - } - - /* - * we have a block from this transaction, log every item in it - * from our directory - */ - while (1) { - struct btrfs_key tmp; - src = path->nodes[0]; - nritems = btrfs_header_nritems(src); - for (i = path->slots[0]; i < nritems; i++) { - btrfs_item_key_to_cpu(src, &min_key, i); - - if (min_key.objectid != ino || min_key.type != key_type) - goto done; - ret = overwrite_item(trans, log, dst_path, src, i, - &min_key); - if (ret) { - err = ret; - goto done; - } - } - path->slots[0] = nritems; - - /* - * look ahead to the next item and see if it is also - * from this directory and from this transaction - */ - ret = btrfs_next_leaf(root, path); - if (ret == 1) { - last_offset = (u64)-1; - goto done; - } - btrfs_item_key_to_cpu(path->nodes[0], &tmp, path->slots[0]); - if (tmp.objectid != ino || tmp.type != key_type) { - last_offset = (u64)-1; - goto done; - } - if (btrfs_header_generation(path->nodes[0]) != trans->transid) { - ret = overwrite_item(trans, log, dst_path, - path->nodes[0], path->slots[0], - &tmp); - if (ret) - err = ret; - else - last_offset = tmp.offset; - goto done; - } - } -done: - btrfs_release_path(path); - btrfs_release_path(dst_path); - - if (err == 0) { - *last_offset_ret = last_offset; - /* - * insert the log range keys to indicate where the log - * is valid - */ - ret = insert_dir_log_key(trans, log, path, key_type, - ino, first_offset, last_offset); - if (ret) - err = ret; - } - return err; -} - -/* - * logging directories is very similar to logging inodes, We find all the items - * from the current transaction and write them to the log. - * - * The recovery code scans the directory in the subvolume, and if it finds a - * key in the range logged that is not present in the log tree, then it means - * that dir entry was unlinked during the transaction. - * - * In order for that scan to work, we must include one key smaller than - * the smallest logged by this transaction and one key larger than the largest - * key logged by this transaction. - */ -static noinline int log_directory_changes(struct btrfs_trans_handle *trans, - struct btrfs_root *root, struct inode *inode, - struct btrfs_path *path, - struct btrfs_path *dst_path) -{ - u64 min_key; - u64 max_key; - int ret; - int key_type = BTRFS_DIR_ITEM_KEY; - -again: - min_key = 0; - max_key = 0; - while (1) { - ret = log_dir_items(trans, root, inode, path, - dst_path, key_type, min_key, - &max_key); - if (ret) - return ret; - if (max_key == (u64)-1) - break; - min_key = max_key + 1; - } - - if (key_type == BTRFS_DIR_ITEM_KEY) { - key_type = BTRFS_DIR_INDEX_KEY; - goto again; - } - return 0; -} - -/* - * a helper function to drop items from the log before we relog an - * inode. 
max_key_type indicates the highest item type to remove. - * This cannot be run for file data extents because it does not - * free the extents they point to. - */ -static int drop_objectid_items(struct btrfs_trans_handle *trans, - struct btrfs_root *log, - struct btrfs_path *path, - u64 objectid, int max_key_type) -{ - int ret; - struct btrfs_key key; - struct btrfs_key found_key; - - key.objectid = objectid; - key.type = max_key_type; - key.offset = (u64)-1; - - while (1) { - ret = btrfs_search_slot(trans, log, &key, path, -1, 1); - BUG_ON(ret == 0); - if (ret < 0) - break; - - if (path->slots[0] == 0) - break; - - path->slots[0]--; - btrfs_item_key_to_cpu(path->nodes[0], &found_key, - path->slots[0]); - - if (found_key.objectid != objectid) - break; - - ret = btrfs_del_item(trans, log, path); - if (ret) - break; - btrfs_release_path(path); - } - btrfs_release_path(path); - return ret; -} - -static noinline int copy_items(struct btrfs_trans_handle *trans, - struct btrfs_root *log, - struct btrfs_path *dst_path, - struct extent_buffer *src, - int start_slot, int nr, int inode_only) -{ - unsigned long src_offset; - unsigned long dst_offset; - struct btrfs_file_extent_item *extent; - struct btrfs_inode_item *inode_item; - int ret; - struct btrfs_key *ins_keys; - u32 *ins_sizes; - char *ins_data; - int i; - struct list_head ordered_sums; - - INIT_LIST_HEAD(&ordered_sums); - - ins_data = kmalloc(nr * sizeof(struct btrfs_key) + - nr * sizeof(u32), GFP_NOFS); - if (!ins_data) - return -ENOMEM; - - ins_sizes = (u32 *)ins_data; - ins_keys = (struct btrfs_key *)(ins_data + nr * sizeof(u32)); - - for (i = 0; i < nr; i++) { - ins_sizes[i] = btrfs_item_size_nr(src, i + start_slot); - btrfs_item_key_to_cpu(src, ins_keys + i, i + start_slot); - } - ret = btrfs_insert_empty_items(trans, log, dst_path, - ins_keys, ins_sizes, nr); - if (ret) { - kfree(ins_data); - return ret; - } - - for (i = 0; i < nr; i++, dst_path->slots[0]++) { - dst_offset = btrfs_item_ptr_offset(dst_path->nodes[0], - dst_path->slots[0]); - - src_offset = btrfs_item_ptr_offset(src, start_slot + i); - - copy_extent_buffer(dst_path->nodes[0], src, dst_offset, - src_offset, ins_sizes[i]); - - if (inode_only == LOG_INODE_EXISTS && - ins_keys[i].type == BTRFS_INODE_ITEM_KEY) { - inode_item = btrfs_item_ptr(dst_path->nodes[0], - dst_path->slots[0], - struct btrfs_inode_item); - btrfs_set_inode_size(dst_path->nodes[0], inode_item, 0); - - /* set the generation to zero so the recover code - * can tell the difference between an logging - * just to say 'this inode exists' and a logging - * to say 'update this inode with these values' - */ - btrfs_set_inode_generation(dst_path->nodes[0], - inode_item, 0); - } - /* take a reference on file data extents so that truncates - * or deletes of this inode don't have to relog the inode - * again - */ - if (btrfs_key_type(ins_keys + i) == BTRFS_EXTENT_DATA_KEY) { - int found_type; - extent = btrfs_item_ptr(src, start_slot + i, - struct btrfs_file_extent_item); - - if (btrfs_file_extent_generation(src, extent) < trans->transid) - continue; - - found_type = btrfs_file_extent_type(src, extent); - if (found_type == BTRFS_FILE_EXTENT_REG || - found_type == BTRFS_FILE_EXTENT_PREALLOC) { - u64 ds, dl, cs, cl; - ds = btrfs_file_extent_disk_bytenr(src, - extent); - /* ds == 0 is a hole */ - if (ds == 0) - continue; - - dl = btrfs_file_extent_disk_num_bytes(src, - extent); - cs = btrfs_file_extent_offset(src, extent); - cl = btrfs_file_extent_num_bytes(src, - extent); - if (btrfs_file_extent_compression(src, - 
extent)) { - cs = 0; - cl = dl; - } - - ret = btrfs_lookup_csums_range( - log->fs_info->csum_root, - ds + cs, ds + cs + cl - 1, - &ordered_sums, 0); - BUG_ON(ret); - } - } - } - - btrfs_mark_buffer_dirty(dst_path->nodes[0]); - btrfs_release_path(dst_path); - kfree(ins_data); - - /* - * we have to do this after the loop above to avoid changing the - * log tree while trying to change the log tree. - */ - ret = 0; - while (!list_empty(&ordered_sums)) { - struct btrfs_ordered_sum *sums = list_entry(ordered_sums.next, - struct btrfs_ordered_sum, - list); - if (!ret) - ret = btrfs_csum_file_blocks(trans, log, sums); - list_del(&sums->list); - kfree(sums); - } - return ret; -} - -/* log a single inode in the tree log. - * At least one parent directory for this inode must exist in the tree - * or be logged already. - * - * Any items from this inode changed by the current transaction are copied - * to the log tree. An extra reference is taken on any extents in this - * file, allowing us to avoid a whole pile of corner cases around logging - * blocks that have been removed from the tree. - * - * See LOG_INODE_ALL and related defines for a description of what inode_only - * does. - * - * This handles both files and directories. - */ -static int btrfs_log_inode(struct btrfs_trans_handle *trans, - struct btrfs_root *root, struct inode *inode, - int inode_only) -{ - struct btrfs_path *path; - struct btrfs_path *dst_path; - struct btrfs_key min_key; - struct btrfs_key max_key; - struct btrfs_root *log = root->log_root; - struct extent_buffer *src = NULL; - int err = 0; - int ret; - int nritems; - int ins_start_slot = 0; - int ins_nr; - u64 ino = btrfs_ino(inode); - - log = root->log_root; - - path = btrfs_alloc_path(); - if (!path) - return -ENOMEM; - dst_path = btrfs_alloc_path(); - if (!dst_path) { - btrfs_free_path(path); - return -ENOMEM; - } - - min_key.objectid = ino; - min_key.type = BTRFS_INODE_ITEM_KEY; - min_key.offset = 0; - - max_key.objectid = ino; - - /* today the code can only do partial logging of directories */ - if (!S_ISDIR(inode->i_mode)) - inode_only = LOG_INODE_ALL; - - if (inode_only == LOG_INODE_EXISTS || S_ISDIR(inode->i_mode)) - max_key.type = BTRFS_XATTR_ITEM_KEY; - else - max_key.type = (u8)-1; - max_key.offset = (u64)-1; - - ret = btrfs_commit_inode_delayed_items(trans, inode); - if (ret) { - btrfs_free_path(path); - btrfs_free_path(dst_path); - return ret; - } - - mutex_lock(&BTRFS_I(inode)->log_mutex); - - /* - * a brute force approach to making sure we get the most uptodate - * copies of everything. 
- */ - if (S_ISDIR(inode->i_mode)) { - int max_key_type = BTRFS_DIR_LOG_INDEX_KEY; - - if (inode_only == LOG_INODE_EXISTS) - max_key_type = BTRFS_XATTR_ITEM_KEY; - ret = drop_objectid_items(trans, log, path, ino, max_key_type); - } else { - ret = btrfs_truncate_inode_items(trans, log, inode, 0, 0); - } - if (ret) { - err = ret; - goto out_unlock; - } - path->keep_locks = 1; - - while (1) { - ins_nr = 0; - ret = btrfs_search_forward(root, &min_key, &max_key, - path, 0, trans->transid); - if (ret != 0) - break; -again: - /* note, ins_nr might be > 0 here, cleanup outside the loop */ - if (min_key.objectid != ino) - break; - if (min_key.type > max_key.type) - break; - - src = path->nodes[0]; - if (ins_nr && ins_start_slot + ins_nr == path->slots[0]) { - ins_nr++; - goto next_slot; - } else if (!ins_nr) { - ins_start_slot = path->slots[0]; - ins_nr = 1; - goto next_slot; - } - - ret = copy_items(trans, log, dst_path, src, ins_start_slot, - ins_nr, inode_only); - if (ret) { - err = ret; - goto out_unlock; - } - ins_nr = 1; - ins_start_slot = path->slots[0]; -next_slot: - - nritems = btrfs_header_nritems(path->nodes[0]); - path->slots[0]++; - if (path->slots[0] < nritems) { - btrfs_item_key_to_cpu(path->nodes[0], &min_key, - path->slots[0]); - goto again; - } - if (ins_nr) { - ret = copy_items(trans, log, dst_path, src, - ins_start_slot, - ins_nr, inode_only); - if (ret) { - err = ret; - goto out_unlock; - } - ins_nr = 0; - } - btrfs_release_path(path); - - if (min_key.offset < (u64)-1) - min_key.offset++; - else if (min_key.type < (u8)-1) - min_key.type++; - else if (min_key.objectid < (u64)-1) - min_key.objectid++; - else - break; - } - if (ins_nr) { - ret = copy_items(trans, log, dst_path, src, - ins_start_slot, - ins_nr, inode_only); - if (ret) { - err = ret; - goto out_unlock; - } - ins_nr = 0; - } - WARN_ON(ins_nr); - if (inode_only == LOG_INODE_ALL && S_ISDIR(inode->i_mode)) { - btrfs_release_path(path); - btrfs_release_path(dst_path); - ret = log_directory_changes(trans, root, inode, path, dst_path); - if (ret) { - err = ret; - goto out_unlock; - } - } - BTRFS_I(inode)->logged_trans = trans->transid; -out_unlock: - mutex_unlock(&BTRFS_I(inode)->log_mutex); - - btrfs_free_path(path); - btrfs_free_path(dst_path); - return err; -} - -/* - * follow the dentry parent pointers up the chain and see if any - * of the directories in it require a full commit before they can - * be logged. Returns zero if nothing special needs to be done or 1 if - * a full commit is required. - */ -static noinline int check_parent_dirs_for_sync(struct btrfs_trans_handle *trans, - struct inode *inode, - struct dentry *parent, - struct super_block *sb, - u64 last_committed) -{ - int ret = 0; - struct btrfs_root *root; - struct dentry *old_parent = NULL; - - /* - * for regular files, if its inode is already on disk, we don't - * have to worry about the parents at all. This is because - * we can use the last_unlink_trans field to record renames - * and other fun in this file. 
- */ - if (S_ISREG(inode->i_mode) && - BTRFS_I(inode)->generation <= last_committed && - BTRFS_I(inode)->last_unlink_trans <= last_committed) - goto out; - - if (!S_ISDIR(inode->i_mode)) { - if (!parent || !parent->d_inode || sb != parent->d_inode->i_sb) - goto out; - inode = parent->d_inode; - } - - while (1) { - BTRFS_I(inode)->logged_trans = trans->transid; - smp_mb(); - - if (BTRFS_I(inode)->last_unlink_trans > last_committed) { - root = BTRFS_I(inode)->root; - - /* - * make sure any commits to the log are forced - * to be full commits - */ - root->fs_info->last_trans_log_full_commit = - trans->transid; - ret = 1; - break; - } - - if (!parent || !parent->d_inode || sb != parent->d_inode->i_sb) - break; - - if (IS_ROOT(parent)) - break; - - parent = dget_parent(parent); - dput(old_parent); - old_parent = parent; - inode = parent->d_inode; - - } - dput(old_parent); -out: - return ret; -} - -static int inode_in_log(struct btrfs_trans_handle *trans, - struct inode *inode) -{ - struct btrfs_root *root = BTRFS_I(inode)->root; - int ret = 0; - - mutex_lock(&root->log_mutex); - if (BTRFS_I(inode)->logged_trans == trans->transid && - BTRFS_I(inode)->last_sub_trans <= root->last_log_commit) - ret = 1; - mutex_unlock(&root->log_mutex); - return ret; -} - - -/* - * helper function around btrfs_log_inode to make sure newly created - * parent directories also end up in the log. A minimal inode and backref - * only logging is done of any parent directories that are older than - * the last committed transaction - */ -int btrfs_log_inode_parent(struct btrfs_trans_handle *trans, - struct btrfs_root *root, struct inode *inode, - struct dentry *parent, int exists_only) -{ - int inode_only = exists_only ? LOG_INODE_EXISTS : LOG_INODE_ALL; - struct super_block *sb; - struct dentry *old_parent = NULL; - int ret = 0; - u64 last_committed = root->fs_info->last_trans_committed; - - sb = inode->i_sb; - - if (btrfs_test_opt(root, NOTREELOG)) { - ret = 1; - goto end_no_trans; - } - - if (root->fs_info->last_trans_log_full_commit > - root->fs_info->last_trans_committed) { - ret = 1; - goto end_no_trans; - } - - if (root != BTRFS_I(inode)->root || - btrfs_root_refs(&root->root_item) == 0) { - ret = 1; - goto end_no_trans; - } - - ret = check_parent_dirs_for_sync(trans, inode, parent, - sb, last_committed); - if (ret) - goto end_no_trans; - - if (inode_in_log(trans, inode)) { - ret = BTRFS_NO_LOG_SYNC; - goto end_no_trans; - } - - ret = start_log_trans(trans, root); - if (ret) - goto end_trans; - - ret = btrfs_log_inode(trans, root, inode, inode_only); - if (ret) - goto end_trans; - - /* - * for regular files, if its inode is already on disk, we don't - * have to worry about the parents at all. This is because - * we can use the last_unlink_trans field to record renames - * and other fun in this file. 
- */ - if (S_ISREG(inode->i_mode) && - BTRFS_I(inode)->generation <= last_committed && - BTRFS_I(inode)->last_unlink_trans <= last_committed) { - ret = 0; - goto end_trans; - } - - inode_only = LOG_INODE_EXISTS; - while (1) { - if (!parent || !parent->d_inode || sb != parent->d_inode->i_sb) - break; - - inode = parent->d_inode; - if (root != BTRFS_I(inode)->root) - break; - - if (BTRFS_I(inode)->generation > - root->fs_info->last_trans_committed) { - ret = btrfs_log_inode(trans, root, inode, inode_only); - if (ret) - goto end_trans; - } - if (IS_ROOT(parent)) - break; - - parent = dget_parent(parent); - dput(old_parent); - old_parent = parent; - } - ret = 0; -end_trans: - dput(old_parent); - if (ret < 0) { - BUG_ON(ret != -ENOSPC); - root->fs_info->last_trans_log_full_commit = trans->transid; - ret = 1; - } - btrfs_end_log_trans(root); -end_no_trans: - return ret; -} - -/* - * it is not safe to log dentry if the chunk root has added new - * chunks. This returns 0 if the dentry was logged, and 1 otherwise. - * If this returns 1, you must commit the transaction to safely get your - * data on disk. - */ -int btrfs_log_dentry_safe(struct btrfs_trans_handle *trans, - struct btrfs_root *root, struct dentry *dentry) -{ - struct dentry *parent = dget_parent(dentry); - int ret; - - ret = btrfs_log_inode_parent(trans, root, dentry->d_inode, parent, 0); - dput(parent); - - return ret; -} - -/* - * should be called during mount to recover any replay any log trees - * from the FS - */ -int btrfs_recover_log_trees(struct btrfs_root *log_root_tree) -{ - int ret; - struct btrfs_path *path; - struct btrfs_trans_handle *trans; - struct btrfs_key key; - struct btrfs_key found_key; - struct btrfs_key tmp_key; - struct btrfs_root *log; - struct btrfs_fs_info *fs_info = log_root_tree->fs_info; - struct walk_control wc = { - .process_func = process_one_buffer, - .stage = 0, - }; - - path = btrfs_alloc_path(); - if (!path) - return -ENOMEM; - - fs_info->log_root_recovering = 1; - - trans = btrfs_start_transaction(fs_info->tree_root, 0); - if (IS_ERR(trans)) { - ret = PTR_ERR(trans); - goto error; - } - - wc.trans = trans; - wc.pin = 1; - - ret = walk_log_tree(trans, log_root_tree, &wc); - if (ret) { - btrfs_error(fs_info, ret, "Failed to pin buffers while " - "recovering log root tree."); - goto error; - } - -again: - key.objectid = BTRFS_TREE_LOG_OBJECTID; - key.offset = (u64)-1; - btrfs_set_key_type(&key, BTRFS_ROOT_ITEM_KEY); - - while (1) { - ret = btrfs_search_slot(NULL, log_root_tree, &key, path, 0, 0); - - if (ret < 0) { - btrfs_error(fs_info, ret, - "Couldn't find tree log root."); - goto error; - } - if (ret > 0) { - if (path->slots[0] == 0) - break; - path->slots[0]--; - } - btrfs_item_key_to_cpu(path->nodes[0], &found_key, - path->slots[0]); - btrfs_release_path(path); - if (found_key.objectid != BTRFS_TREE_LOG_OBJECTID) - break; - - log = btrfs_read_fs_root_no_radix(log_root_tree, - &found_key); - if (IS_ERR(log)) { - ret = PTR_ERR(log); - btrfs_error(fs_info, ret, - "Couldn't read tree log root."); - goto error; - } - - tmp_key.objectid = found_key.offset; - tmp_key.type = BTRFS_ROOT_ITEM_KEY; - tmp_key.offset = (u64)-1; - - wc.replay_dest = btrfs_read_fs_root_no_name(fs_info, &tmp_key); - if (IS_ERR(wc.replay_dest)) { - ret = PTR_ERR(wc.replay_dest); - btrfs_error(fs_info, ret, "Couldn't read target root " - "for tree log recovery."); - goto error; - } - - wc.replay_dest->log_root = log; - btrfs_record_root_in_trans(trans, wc.replay_dest); - ret = walk_log_tree(trans, log, &wc); - BUG_ON(ret); - - 
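		/*
		 * Recovery is staged: the walk above is repeated for every log
		 * root on each pass of the loop that follows.  The first pass
		 * runs with wc.pin set so the referenced extents are pinned,
		 * the second switches process_func to replay_one_buffer() for
		 * LOG_WALK_REPLAY_INODES, and the stage is then bumped until
		 * LOG_WALK_REPLAY_ALL, after which the inode link counts are
		 * fixed up and the final transaction commit unpins the blocks.
		 */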
if (wc.stage == LOG_WALK_REPLAY_ALL) { - ret = fixup_inode_link_counts(trans, wc.replay_dest, - path); - BUG_ON(ret); - } - - key.offset = found_key.offset - 1; - wc.replay_dest->log_root = NULL; - free_extent_buffer(log->node); - free_extent_buffer(log->commit_root); - kfree(log); - - if (found_key.offset == 0) - break; - } - btrfs_release_path(path); - - /* step one is to pin it all, step two is to replay just inodes */ - if (wc.pin) { - wc.pin = 0; - wc.process_func = replay_one_buffer; - wc.stage = LOG_WALK_REPLAY_INODES; - goto again; - } - /* step three is to replay everything */ - if (wc.stage < LOG_WALK_REPLAY_ALL) { - wc.stage++; - goto again; - } - - btrfs_free_path(path); - - free_extent_buffer(log_root_tree->node); - log_root_tree->log_root = NULL; - fs_info->log_root_recovering = 0; - - /* step 4: commit the transaction, which also unpins the blocks */ - btrfs_commit_transaction(trans, fs_info->tree_root); - - kfree(log_root_tree); - return 0; - -error: - btrfs_free_path(path); - return ret; -} - -/* - * there are some corner cases where we want to force a full - * commit instead of allowing a directory to be logged. - * - * They revolve around files that were unlinked from the directory, and - * this function updates the parent directory so that a full commit is - * properly done if it is fsync'd later after the unlinks are done. - */ -void btrfs_record_unlink_dir(struct btrfs_trans_handle *trans, - struct inode *dir, struct inode *inode, - int for_rename) -{ - /* - * when we're logging a file, if it hasn't been renamed - * or unlinked, and its inode is fully committed on disk, - * we don't have to worry about walking up the directory chain - * to log its parents. - * - * So, we use the last_unlink_trans field to put this transid - * into the file. When the file is logged we check it and - * don't log the parents if the file is fully on disk. - */ - if (S_ISREG(inode->i_mode)) - BTRFS_I(inode)->last_unlink_trans = trans->transid; - - /* - * if this directory was already logged any new - * names for this file/dir will get recorded - */ - smp_mb(); - if (BTRFS_I(dir)->logged_trans == trans->transid) - return; - - /* - * if the inode we're about to unlink was logged, - * the log will be properly updated for any new names - */ - if (BTRFS_I(inode)->logged_trans == trans->transid) - return; - - /* - * when renaming files across directories, if the directory - * we're unlinking from gets fsync'd later on, there's - * no way to find the destination directory later and fsync it - * properly. So, we have to be conservative and force commits - * so the new name gets discovered. - */ - if (for_rename) - goto record; - - /* we can safely do the unlink without any special recording */ - return; - -record: - BTRFS_I(dir)->last_unlink_trans = trans->transid; -} - -/* - * Call this after adding a new name for a file and it will properly - * update the log to reflect the new name. - * - * It will return zero if all goes well, and it will return 1 if a - * full transaction commit is required. 
- */ -int btrfs_log_new_name(struct btrfs_trans_handle *trans, - struct inode *inode, struct inode *old_dir, - struct dentry *parent) -{ - struct btrfs_root * root = BTRFS_I(inode)->root; - - /* - * this will force the logging code to walk the dentry chain - * up for the file - */ - if (S_ISREG(inode->i_mode)) - BTRFS_I(inode)->last_unlink_trans = trans->transid; - - /* - * if this inode hasn't been logged and directory we're renaming it - * from hasn't been logged, we don't need to log it - */ - if (BTRFS_I(inode)->logged_trans <= - root->fs_info->last_trans_committed && - (!old_dir || BTRFS_I(old_dir)->logged_trans <= - root->fs_info->last_trans_committed)) - return 0; - - return btrfs_log_inode_parent(trans, root, inode, parent, 1); -} - diff --git a/ANDROID_3.4.5/fs/btrfs/tree-log.h b/ANDROID_3.4.5/fs/btrfs/tree-log.h deleted file mode 100644 index 862ac813..00000000 --- a/ANDROID_3.4.5/fs/btrfs/tree-log.h +++ /dev/null @@ -1,52 +0,0 @@ -/* - * Copyright (C) 2008 Oracle. All rights reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public - * License v2 as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public - * License along with this program; if not, write to the - * Free Software Foundation, Inc., 59 Temple Place - Suite 330, - * Boston, MA 021110-1307, USA. - */ - -#ifndef __TREE_LOG_ -#define __TREE_LOG_ - -/* return value for btrfs_log_dentry_safe that means we don't need to log it at all */ -#define BTRFS_NO_LOG_SYNC 256 - -int btrfs_sync_log(struct btrfs_trans_handle *trans, - struct btrfs_root *root); -int btrfs_free_log(struct btrfs_trans_handle *trans, struct btrfs_root *root); -int btrfs_free_log_root_tree(struct btrfs_trans_handle *trans, - struct btrfs_fs_info *fs_info); -int btrfs_recover_log_trees(struct btrfs_root *tree_root); -int btrfs_log_dentry_safe(struct btrfs_trans_handle *trans, - struct btrfs_root *root, struct dentry *dentry); -int btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - const char *name, int name_len, - struct inode *dir, u64 index); -int btrfs_del_inode_ref_in_log(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - const char *name, int name_len, - struct inode *inode, u64 dirid); -void btrfs_end_log_trans(struct btrfs_root *root); -int btrfs_pin_log_trans(struct btrfs_root *root); -int btrfs_log_inode_parent(struct btrfs_trans_handle *trans, - struct btrfs_root *root, struct inode *inode, - struct dentry *parent, int exists_only); -void btrfs_record_unlink_dir(struct btrfs_trans_handle *trans, - struct inode *dir, struct inode *inode, - int for_rename); -int btrfs_log_new_name(struct btrfs_trans_handle *trans, - struct inode *inode, struct inode *old_dir, - struct dentry *parent); -#endif diff --git a/ANDROID_3.4.5/fs/btrfs/ulist.c b/ANDROID_3.4.5/fs/btrfs/ulist.c deleted file mode 100644 index 12f5147b..00000000 --- a/ANDROID_3.4.5/fs/btrfs/ulist.c +++ /dev/null @@ -1,220 +0,0 @@ -/* - * Copyright (C) 2011 STRATO AG - * written by Arne Jansen <sensille@gmx.net> - * Distributed under the GNU GPL license version 2. 
- */ - -#include <linux/slab.h> -#include <linux/module.h> -#include "ulist.h" - -/* - * ulist is a generic data structure to hold a collection of unique u64 - * values. The only operations it supports are adding to the list and - * enumerating it. - * It is possible to store an auxiliary value along with the key. - * - * The implementation is preliminary and can probably be sped up - * significantly. A first step would be to store the values in an rbtree - * as soon as ULIST_SIZE is exceeded. - * - * A sample usage for ulists is the enumeration of directed graphs without - * visiting a node twice. The pseudo-code could look like this: - * - * ulist = ulist_alloc(); - * ulist_add(ulist, root); - * elem = NULL; - * - * while ((elem = ulist_next(ulist, elem))) { - * for (all child nodes n in elem) - * ulist_add(ulist, n); - * do something useful with the node; - * } - * ulist_free(ulist); - * - * This assumes the graph nodes are addressable by u64. This stems from the - * usage for tree enumeration in btrfs, where the logical addresses are - * 64 bit. - * - * It is also useful for tree enumeration which could be done elegantly - * recursively, but is not possible due to kernel stack limitations. The - * loop would be similar to the above. - */ - -/** - * ulist_init - freshly initialize a ulist - * @ulist: the ulist to initialize - * - * Note: don't use this function to init an already used ulist, use - * ulist_reinit instead. - */ -void ulist_init(struct ulist *ulist) -{ - ulist->nnodes = 0; - ulist->nodes = ulist->int_nodes; - ulist->nodes_alloced = ULIST_SIZE; -} -EXPORT_SYMBOL(ulist_init); - -/** - * ulist_fini - free up additionally allocated memory for the ulist - * @ulist: the ulist from which to free the additional memory - * - * This is useful in cases where the base 'struct ulist' has been statically - * allocated. - */ -void ulist_fini(struct ulist *ulist) -{ - /* - * The first ULIST_SIZE elements are stored inline in struct ulist. - * Only if more elements are allocated they need to be freed. - */ - if (ulist->nodes_alloced > ULIST_SIZE) - kfree(ulist->nodes); - ulist->nodes_alloced = 0; /* in case ulist_fini is called twice */ -} -EXPORT_SYMBOL(ulist_fini); - -/** - * ulist_reinit - prepare a ulist for reuse - * @ulist: ulist to be reused - * - * Free up all additional memory allocated for the list elements and reinit - * the ulist. - */ -void ulist_reinit(struct ulist *ulist) -{ - ulist_fini(ulist); - ulist_init(ulist); -} -EXPORT_SYMBOL(ulist_reinit); - -/** - * ulist_alloc - dynamically allocate a ulist - * @gfp_mask: allocation flags to use for the base allocation - * - * The allocated ulist will be returned in an initialized state. - */ -struct ulist *ulist_alloc(unsigned long gfp_mask) -{ - struct ulist *ulist = kmalloc(sizeof(*ulist), gfp_mask); - - if (!ulist) - return NULL; - - ulist_init(ulist); - - return ulist; -} -EXPORT_SYMBOL(ulist_alloc); - -/** - * ulist_free - free dynamically allocated ulist - * @ulist: ulist to free - * - * It is not necessary to call ulist_fini before. - */ -void ulist_free(struct ulist *ulist) -{ - if (!ulist) - return; - ulist_fini(ulist); - kfree(ulist); -} -EXPORT_SYMBOL(ulist_free); - -/** - * ulist_add - add an element to the ulist - * @ulist: ulist to add the element to - * @val: value to add to ulist - * @aux: auxiliary value to store along with val - * @gfp_mask: flags to use for allocation - * - * Note: locking must be provided by the caller. In case of rwlocks write - * locking is needed - * - * Add an element to a ulist. 
The @val will only be added if it doesn't - * already exist. If it is added, the auxiliary value @aux is stored along with - * it. In case @val already exists in the ulist, @aux is ignored, even if - * it differs from the already stored value. - * - * ulist_add returns 0 if @val already exists in ulist and 1 if @val has been - * inserted. - * In case of allocation failure -ENOMEM is returned and the ulist stays - * unaltered. - */ -int ulist_add(struct ulist *ulist, u64 val, unsigned long aux, - unsigned long gfp_mask) -{ - int i; - - for (i = 0; i < ulist->nnodes; ++i) { - if (ulist->nodes[i].val == val) - return 0; - } - - if (ulist->nnodes >= ulist->nodes_alloced) { - u64 new_alloced = ulist->nodes_alloced + 128; - struct ulist_node *new_nodes; - void *old = NULL; - - /* - * if nodes_alloced == ULIST_SIZE no memory has been allocated - * yet, so pass NULL to krealloc - */ - if (ulist->nodes_alloced > ULIST_SIZE) - old = ulist->nodes; - - new_nodes = krealloc(old, sizeof(*new_nodes) * new_alloced, - gfp_mask); - if (!new_nodes) - return -ENOMEM; - - if (!old) - memcpy(new_nodes, ulist->int_nodes, - sizeof(ulist->int_nodes)); - - ulist->nodes = new_nodes; - ulist->nodes_alloced = new_alloced; - } - ulist->nodes[ulist->nnodes].val = val; - ulist->nodes[ulist->nnodes].aux = aux; - ++ulist->nnodes; - - return 1; -} -EXPORT_SYMBOL(ulist_add); - -/** - * ulist_next - iterate ulist - * @ulist: ulist to iterate - * @prev: previously returned element or %NULL to start iteration - * - * Note: locking must be provided by the caller. In case of rwlocks only read - * locking is needed - * - * This function is used to iterate an ulist. The iteration is started with - * @prev = %NULL. It returns the next element from the ulist or %NULL when the - * end is reached. No guarantee is made with respect to the order in which - * the elements are returned. They might neither be returned in order of - * addition nor in ascending order. - * It is allowed to call ulist_add during an enumeration. Newly added items - * are guaranteed to show up in the running enumeration. - */ -struct ulist_node *ulist_next(struct ulist *ulist, struct ulist_node *prev) -{ - int next; - - if (ulist->nnodes == 0) - return NULL; - - if (!prev) - return &ulist->nodes[0]; - - next = (prev - ulist->nodes) + 1; - if (next < 0 || next >= ulist->nnodes) - return NULL; - - return &ulist->nodes[next]; -} -EXPORT_SYMBOL(ulist_next); diff --git a/ANDROID_3.4.5/fs/btrfs/ulist.h b/ANDROID_3.4.5/fs/btrfs/ulist.h deleted file mode 100644 index 2e25dec5..00000000 --- a/ANDROID_3.4.5/fs/btrfs/ulist.h +++ /dev/null @@ -1,68 +0,0 @@ -/* - * Copyright (C) 2011 STRATO AG - * written by Arne Jansen <sensille@gmx.net> - * Distributed under the GNU GPL license version 2. - * - */ - -#ifndef __ULIST__ -#define __ULIST__ - -/* - * ulist is a generic data structure to hold a collection of unique u64 - * values. The only operations it supports is adding to the list and - * enumerating it. - * It is possible to store an auxiliary value along with the key. - * - * The implementation is preliminary and can probably be sped up - * significantly. A first step would be to store the values in an rbtree - * as soon as ULIST_SIZE is exceeded. 
- */ - -/* - * number of elements statically allocated inside struct ulist - */ -#define ULIST_SIZE 16 - -/* - * element of the list - */ -struct ulist_node { - u64 val; /* value to store */ - unsigned long aux; /* auxiliary value saved along with the val */ -}; - -struct ulist { - /* - * number of elements stored in list - */ - unsigned long nnodes; - - /* - * number of nodes we already have room for - */ - unsigned long nodes_alloced; - - /* - * pointer to the array storing the elements. The first ULIST_SIZE - * elements are stored inline. In this case it points to int_nodes. - * After exceeding ULIST_SIZE, dynamic memory is allocated. - */ - struct ulist_node *nodes; - - /* - * inline storage space for the first ULIST_SIZE entries - */ - struct ulist_node int_nodes[ULIST_SIZE]; -}; - -void ulist_init(struct ulist *ulist); -void ulist_fini(struct ulist *ulist); -void ulist_reinit(struct ulist *ulist); -struct ulist *ulist_alloc(unsigned long gfp_mask); -void ulist_free(struct ulist *ulist); -int ulist_add(struct ulist *ulist, u64 val, unsigned long aux, - unsigned long gfp_mask); -struct ulist_node *ulist_next(struct ulist *ulist, struct ulist_node *prev); - -#endif diff --git a/ANDROID_3.4.5/fs/btrfs/version.h b/ANDROID_3.4.5/fs/btrfs/version.h deleted file mode 100644 index 9bf3946d..00000000 --- a/ANDROID_3.4.5/fs/btrfs/version.h +++ /dev/null @@ -1,4 +0,0 @@ -#ifndef __BTRFS_VERSION_H -#define __BTRFS_VERSION_H -#define BTRFS_BUILD_VERSION "Btrfs" -#endif diff --git a/ANDROID_3.4.5/fs/btrfs/volumes.c b/ANDROID_3.4.5/fs/btrfs/volumes.c deleted file mode 100644 index 1411b995..00000000 --- a/ANDROID_3.4.5/fs/btrfs/volumes.c +++ /dev/null @@ -1,4585 +0,0 @@ -/* - * Copyright (C) 2007 Oracle. All rights reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public - * License v2 as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public - * License along with this program; if not, write to the - * Free Software Foundation, Inc., 59 Temple Place - Suite 330, - * Boston, MA 021110-1307, USA. 
- */ -#include <linux/sched.h> -#include <linux/bio.h> -#include <linux/slab.h> -#include <linux/buffer_head.h> -#include <linux/blkdev.h> -#include <linux/random.h> -#include <linux/iocontext.h> -#include <linux/capability.h> -#include <linux/kthread.h> -#include <asm/div64.h> -#include "compat.h" -#include "ctree.h" -#include "extent_map.h" -#include "disk-io.h" -#include "transaction.h" -#include "print-tree.h" -#include "volumes.h" -#include "async-thread.h" -#include "check-integrity.h" - -static int init_first_rw_device(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct btrfs_device *device); -static int btrfs_relocate_sys_chunks(struct btrfs_root *root); - -static DEFINE_MUTEX(uuid_mutex); -static LIST_HEAD(fs_uuids); - -static void lock_chunks(struct btrfs_root *root) -{ - mutex_lock(&root->fs_info->chunk_mutex); -} - -static void unlock_chunks(struct btrfs_root *root) -{ - mutex_unlock(&root->fs_info->chunk_mutex); -} - -static void free_fs_devices(struct btrfs_fs_devices *fs_devices) -{ - struct btrfs_device *device; - WARN_ON(fs_devices->opened); - while (!list_empty(&fs_devices->devices)) { - device = list_entry(fs_devices->devices.next, - struct btrfs_device, dev_list); - list_del(&device->dev_list); - kfree(device->name); - kfree(device); - } - kfree(fs_devices); -} - -void btrfs_cleanup_fs_uuids(void) -{ - struct btrfs_fs_devices *fs_devices; - - while (!list_empty(&fs_uuids)) { - fs_devices = list_entry(fs_uuids.next, - struct btrfs_fs_devices, list); - list_del(&fs_devices->list); - free_fs_devices(fs_devices); - } -} - -static noinline struct btrfs_device *__find_device(struct list_head *head, - u64 devid, u8 *uuid) -{ - struct btrfs_device *dev; - - list_for_each_entry(dev, head, dev_list) { - if (dev->devid == devid && - (!uuid || !memcmp(dev->uuid, uuid, BTRFS_UUID_SIZE))) { - return dev; - } - } - return NULL; -} - -static noinline struct btrfs_fs_devices *find_fsid(u8 *fsid) -{ - struct btrfs_fs_devices *fs_devices; - - list_for_each_entry(fs_devices, &fs_uuids, list) { - if (memcmp(fsid, fs_devices->fsid, BTRFS_FSID_SIZE) == 0) - return fs_devices; - } - return NULL; -} - -static void requeue_list(struct btrfs_pending_bios *pending_bios, - struct bio *head, struct bio *tail) -{ - - struct bio *old_head; - - old_head = pending_bios->head; - pending_bios->head = head; - if (pending_bios->tail) - tail->bi_next = old_head; - else - pending_bios->tail = tail; -} - -/* - * we try to collect pending bios for a device so we don't get a large - * number of procs sending bios down to the same device. This greatly - * improves the schedulers ability to collect and merge the bios. - * - * But, it also turns into a long list of bios to process and that is sure - * to eventually make the worker thread block. The solution here is to - * make some progress and then put this work struct back at the end of - * the list if the block device is congested. This way, multiple devices - * can make progress from a single worker thread. - */ -static noinline void run_scheduled_bios(struct btrfs_device *device) -{ - struct bio *pending; - struct backing_dev_info *bdi; - struct btrfs_fs_info *fs_info; - struct btrfs_pending_bios *pending_bios; - struct bio *tail; - struct bio *cur; - int again = 0; - unsigned long num_run; - unsigned long batch_run = 0; - unsigned long limit; - unsigned long last_waited = 0; - int force_reg = 0; - int sync_pending = 0; - struct blk_plug plug; - - /* - * this function runs all the bios we've collected for - * a particular device. 
We don't want to wander off to - * another device without first sending all of these down. - * So, setup a plug here and finish it off before we return - */ - blk_start_plug(&plug); - - bdi = blk_get_backing_dev_info(device->bdev); - fs_info = device->dev_root->fs_info; - limit = btrfs_async_submit_limit(fs_info); - limit = limit * 2 / 3; - -loop: - spin_lock(&device->io_lock); - -loop_lock: - num_run = 0; - - /* take all the bios off the list at once and process them - * later on (without the lock held). But, remember the - * tail and other pointers so the bios can be properly reinserted - * into the list if we hit congestion - */ - if (!force_reg && device->pending_sync_bios.head) { - pending_bios = &device->pending_sync_bios; - force_reg = 1; - } else { - pending_bios = &device->pending_bios; - force_reg = 0; - } - - pending = pending_bios->head; - tail = pending_bios->tail; - WARN_ON(pending && !tail); - - /* - * if pending was null this time around, no bios need processing - * at all and we can stop. Otherwise it'll loop back up again - * and do an additional check so no bios are missed. - * - * device->running_pending is used to synchronize with the - * schedule_bio code. - */ - if (device->pending_sync_bios.head == NULL && - device->pending_bios.head == NULL) { - again = 0; - device->running_pending = 0; - } else { - again = 1; - device->running_pending = 1; - } - - pending_bios->head = NULL; - pending_bios->tail = NULL; - - spin_unlock(&device->io_lock); - - while (pending) { - - rmb(); - /* we want to work on both lists, but do more bios on the - * sync list than the regular list - */ - if ((num_run > 32 && - pending_bios != &device->pending_sync_bios && - device->pending_sync_bios.head) || - (num_run > 64 && pending_bios == &device->pending_sync_bios && - device->pending_bios.head)) { - spin_lock(&device->io_lock); - requeue_list(pending_bios, pending, tail); - goto loop_lock; - } - - cur = pending; - pending = pending->bi_next; - cur->bi_next = NULL; - atomic_dec(&fs_info->nr_async_bios); - - if (atomic_read(&fs_info->nr_async_bios) < limit && - waitqueue_active(&fs_info->async_submit_wait)) - wake_up(&fs_info->async_submit_wait); - - BUG_ON(atomic_read(&cur->bi_cnt) == 0); - - /* - * if we're doing the sync list, record that our - * plug has some sync requests on it - * - * If we're doing the regular list and there are - * sync requests sitting around, unplug before - * we add more - */ - if (pending_bios == &device->pending_sync_bios) { - sync_pending = 1; - } else if (sync_pending) { - blk_finish_plug(&plug); - blk_start_plug(&plug); - sync_pending = 0; - } - - btrfsic_submit_bio(cur->bi_rw, cur); - num_run++; - batch_run++; - if (need_resched()) - cond_resched(); - - /* - * we made progress, there is more work to do and the bdi - * is now congested. Back off and let other work structs - * run instead - */ - if (pending && bdi_write_congested(bdi) && batch_run > 8 && - fs_info->fs_devices->open_devices > 1) { - struct io_context *ioc; - - ioc = current->io_context; - - /* - * the main goal here is that we don't want to - * block if we're going to be able to submit - * more requests without blocking. - * - * This code does two great things, it pokes into - * the elevator code from a filesystem _and_ - * it makes assumptions about how batching works. - */ - if (ioc && ioc->nr_batch_requests > 0 && - time_before(jiffies, ioc->last_waited + HZ/50UL) && - (last_waited == 0 || - ioc->last_waited == last_waited)) { - /* - * we want to go through our batch of - * requests and stop. 
So, we copy out - * the ioc->last_waited time and test - * against it before looping - */ - last_waited = ioc->last_waited; - if (need_resched()) - cond_resched(); - continue; - } - spin_lock(&device->io_lock); - requeue_list(pending_bios, pending, tail); - device->running_pending = 1; - - spin_unlock(&device->io_lock); - btrfs_requeue_work(&device->work); - goto done; - } - /* unplug every 64 requests just for good measure */ - if (batch_run % 64 == 0) { - blk_finish_plug(&plug); - blk_start_plug(&plug); - sync_pending = 0; - } - } - - cond_resched(); - if (again) - goto loop; - - spin_lock(&device->io_lock); - if (device->pending_bios.head || device->pending_sync_bios.head) - goto loop_lock; - spin_unlock(&device->io_lock); - -done: - blk_finish_plug(&plug); -} - -static void pending_bios_fn(struct btrfs_work *work) -{ - struct btrfs_device *device; - - device = container_of(work, struct btrfs_device, work); - run_scheduled_bios(device); -} - -static noinline int device_list_add(const char *path, - struct btrfs_super_block *disk_super, - u64 devid, struct btrfs_fs_devices **fs_devices_ret) -{ - struct btrfs_device *device; - struct btrfs_fs_devices *fs_devices; - u64 found_transid = btrfs_super_generation(disk_super); - char *name; - - fs_devices = find_fsid(disk_super->fsid); - if (!fs_devices) { - fs_devices = kzalloc(sizeof(*fs_devices), GFP_NOFS); - if (!fs_devices) - return -ENOMEM; - INIT_LIST_HEAD(&fs_devices->devices); - INIT_LIST_HEAD(&fs_devices->alloc_list); - list_add(&fs_devices->list, &fs_uuids); - memcpy(fs_devices->fsid, disk_super->fsid, BTRFS_FSID_SIZE); - fs_devices->latest_devid = devid; - fs_devices->latest_trans = found_transid; - mutex_init(&fs_devices->device_list_mutex); - device = NULL; - } else { - device = __find_device(&fs_devices->devices, devid, - disk_super->dev_item.uuid); - } - if (!device) { - if (fs_devices->opened) - return -EBUSY; - - device = kzalloc(sizeof(*device), GFP_NOFS); - if (!device) { - /* we can safely leave the fs_devices entry around */ - return -ENOMEM; - } - device->devid = devid; - device->work.func = pending_bios_fn; - memcpy(device->uuid, disk_super->dev_item.uuid, - BTRFS_UUID_SIZE); - spin_lock_init(&device->io_lock); - device->name = kstrdup(path, GFP_NOFS); - if (!device->name) { - kfree(device); - return -ENOMEM; - } - INIT_LIST_HEAD(&device->dev_alloc_list); - - /* init readahead state */ - spin_lock_init(&device->reada_lock); - device->reada_curr_zone = NULL; - atomic_set(&device->reada_in_flight, 0); - device->reada_next = 0; - INIT_RADIX_TREE(&device->reada_zones, GFP_NOFS & ~__GFP_WAIT); - INIT_RADIX_TREE(&device->reada_extents, GFP_NOFS & ~__GFP_WAIT); - - mutex_lock(&fs_devices->device_list_mutex); - list_add_rcu(&device->dev_list, &fs_devices->devices); - mutex_unlock(&fs_devices->device_list_mutex); - - device->fs_devices = fs_devices; - fs_devices->num_devices++; - } else if (!device->name || strcmp(device->name, path)) { - name = kstrdup(path, GFP_NOFS); - if (!name) - return -ENOMEM; - kfree(device->name); - device->name = name; - if (device->missing) { - fs_devices->missing_devices--; - device->missing = 0; - } - } - - if (found_transid > fs_devices->latest_trans) { - fs_devices->latest_devid = devid; - fs_devices->latest_trans = found_transid; - } - *fs_devices_ret = fs_devices; - return 0; -} - -static struct btrfs_fs_devices *clone_fs_devices(struct btrfs_fs_devices *orig) -{ - struct btrfs_fs_devices *fs_devices; - struct btrfs_device *device; - struct btrfs_device *orig_dev; - - fs_devices = 
kzalloc(sizeof(*fs_devices), GFP_NOFS); - if (!fs_devices) - return ERR_PTR(-ENOMEM); - - INIT_LIST_HEAD(&fs_devices->devices); - INIT_LIST_HEAD(&fs_devices->alloc_list); - INIT_LIST_HEAD(&fs_devices->list); - mutex_init(&fs_devices->device_list_mutex); - fs_devices->latest_devid = orig->latest_devid; - fs_devices->latest_trans = orig->latest_trans; - memcpy(fs_devices->fsid, orig->fsid, sizeof(fs_devices->fsid)); - - /* We have held the volume lock, it is safe to get the devices. */ - list_for_each_entry(orig_dev, &orig->devices, dev_list) { - device = kzalloc(sizeof(*device), GFP_NOFS); - if (!device) - goto error; - - device->name = kstrdup(orig_dev->name, GFP_NOFS); - if (!device->name) { - kfree(device); - goto error; - } - - device->devid = orig_dev->devid; - device->work.func = pending_bios_fn; - memcpy(device->uuid, orig_dev->uuid, sizeof(device->uuid)); - spin_lock_init(&device->io_lock); - INIT_LIST_HEAD(&device->dev_list); - INIT_LIST_HEAD(&device->dev_alloc_list); - - list_add(&device->dev_list, &fs_devices->devices); - device->fs_devices = fs_devices; - fs_devices->num_devices++; - } - return fs_devices; -error: - free_fs_devices(fs_devices); - return ERR_PTR(-ENOMEM); -} - -void btrfs_close_extra_devices(struct btrfs_fs_devices *fs_devices) -{ - struct btrfs_device *device, *next; - - struct block_device *latest_bdev = NULL; - u64 latest_devid = 0; - u64 latest_transid = 0; - - mutex_lock(&uuid_mutex); -again: - /* This is the initialized path, it is safe to release the devices. */ - list_for_each_entry_safe(device, next, &fs_devices->devices, dev_list) { - if (device->in_fs_metadata) { - if (!latest_transid || - device->generation > latest_transid) { - latest_devid = device->devid; - latest_transid = device->generation; - latest_bdev = device->bdev; - } - continue; - } - - if (device->bdev) { - blkdev_put(device->bdev, device->mode); - device->bdev = NULL; - fs_devices->open_devices--; - } - if (device->writeable) { - list_del_init(&device->dev_alloc_list); - device->writeable = 0; - fs_devices->rw_devices--; - } - list_del_init(&device->dev_list); - fs_devices->num_devices--; - kfree(device->name); - kfree(device); - } - - if (fs_devices->seed) { - fs_devices = fs_devices->seed; - goto again; - } - - fs_devices->latest_bdev = latest_bdev; - fs_devices->latest_devid = latest_devid; - fs_devices->latest_trans = latest_transid; - - mutex_unlock(&uuid_mutex); -} - -static void __free_device(struct work_struct *work) -{ - struct btrfs_device *device; - - device = container_of(work, struct btrfs_device, rcu_work); - - if (device->bdev) - blkdev_put(device->bdev, device->mode); - - kfree(device->name); - kfree(device); -} - -static void free_device(struct rcu_head *head) -{ - struct btrfs_device *device; - - device = container_of(head, struct btrfs_device, rcu); - - INIT_WORK(&device->rcu_work, __free_device); - schedule_work(&device->rcu_work); -} - -static int __btrfs_close_devices(struct btrfs_fs_devices *fs_devices) -{ - struct btrfs_device *device; - - if (--fs_devices->opened > 0) - return 0; - - mutex_lock(&fs_devices->device_list_mutex); - list_for_each_entry(device, &fs_devices->devices, dev_list) { - struct btrfs_device *new_device; - - if (device->bdev) - fs_devices->open_devices--; - - if (device->writeable) { - list_del_init(&device->dev_alloc_list); - fs_devices->rw_devices--; - } - - if (device->can_discard) - fs_devices->num_can_discard--; - - new_device = kmalloc(sizeof(*new_device), GFP_NOFS); - BUG_ON(!new_device); /* -ENOMEM */ - memcpy(new_device, device, 
sizeof(*new_device)); - new_device->name = kstrdup(device->name, GFP_NOFS); - BUG_ON(device->name && !new_device->name); /* -ENOMEM */ - new_device->bdev = NULL; - new_device->writeable = 0; - new_device->in_fs_metadata = 0; - new_device->can_discard = 0; - list_replace_rcu(&device->dev_list, &new_device->dev_list); - - call_rcu(&device->rcu, free_device); - } - mutex_unlock(&fs_devices->device_list_mutex); - - WARN_ON(fs_devices->open_devices); - WARN_ON(fs_devices->rw_devices); - fs_devices->opened = 0; - fs_devices->seeding = 0; - - return 0; -} - -int btrfs_close_devices(struct btrfs_fs_devices *fs_devices) -{ - struct btrfs_fs_devices *seed_devices = NULL; - int ret; - - mutex_lock(&uuid_mutex); - ret = __btrfs_close_devices(fs_devices); - if (!fs_devices->opened) { - seed_devices = fs_devices->seed; - fs_devices->seed = NULL; - } - mutex_unlock(&uuid_mutex); - - while (seed_devices) { - fs_devices = seed_devices; - seed_devices = fs_devices->seed; - __btrfs_close_devices(fs_devices); - free_fs_devices(fs_devices); - } - return ret; -} - -static int __btrfs_open_devices(struct btrfs_fs_devices *fs_devices, - fmode_t flags, void *holder) -{ - struct request_queue *q; - struct block_device *bdev; - struct list_head *head = &fs_devices->devices; - struct btrfs_device *device; - struct block_device *latest_bdev = NULL; - struct buffer_head *bh; - struct btrfs_super_block *disk_super; - u64 latest_devid = 0; - u64 latest_transid = 0; - u64 devid; - int seeding = 1; - int ret = 0; - - flags |= FMODE_EXCL; - - list_for_each_entry(device, head, dev_list) { - if (device->bdev) - continue; - if (!device->name) - continue; - - bdev = blkdev_get_by_path(device->name, flags, holder); - if (IS_ERR(bdev)) { - printk(KERN_INFO "open %s failed\n", device->name); - goto error; - } - filemap_write_and_wait(bdev->bd_inode->i_mapping); - invalidate_bdev(bdev); - set_blocksize(bdev, 4096); - - bh = btrfs_read_dev_super(bdev); - if (!bh) - goto error_close; - - disk_super = (struct btrfs_super_block *)bh->b_data; - devid = btrfs_stack_device_id(&disk_super->dev_item); - if (devid != device->devid) - goto error_brelse; - - if (memcmp(device->uuid, disk_super->dev_item.uuid, - BTRFS_UUID_SIZE)) - goto error_brelse; - - device->generation = btrfs_super_generation(disk_super); - if (!latest_transid || device->generation > latest_transid) { - latest_devid = devid; - latest_transid = device->generation; - latest_bdev = bdev; - } - - if (btrfs_super_flags(disk_super) & BTRFS_SUPER_FLAG_SEEDING) { - device->writeable = 0; - } else { - device->writeable = !bdev_read_only(bdev); - seeding = 0; - } - - q = bdev_get_queue(bdev); - if (blk_queue_discard(q)) { - device->can_discard = 1; - fs_devices->num_can_discard++; - } - - device->bdev = bdev; - device->in_fs_metadata = 0; - device->mode = flags; - - if (!blk_queue_nonrot(bdev_get_queue(bdev))) - fs_devices->rotating = 1; - - fs_devices->open_devices++; - if (device->writeable) { - fs_devices->rw_devices++; - list_add(&device->dev_alloc_list, - &fs_devices->alloc_list); - } - brelse(bh); - continue; - -error_brelse: - brelse(bh); -error_close: - blkdev_put(bdev, flags); -error: - continue; - } - if (fs_devices->open_devices == 0) { - ret = -EINVAL; - goto out; - } - fs_devices->seeding = seeding; - fs_devices->opened = 1; - fs_devices->latest_bdev = latest_bdev; - fs_devices->latest_devid = latest_devid; - fs_devices->latest_trans = latest_transid; - fs_devices->total_rw_bytes = 0; -out: - return ret; -} - -int btrfs_open_devices(struct btrfs_fs_devices *fs_devices, 
- fmode_t flags, void *holder) -{ - int ret; - - mutex_lock(&uuid_mutex); - if (fs_devices->opened) { - fs_devices->opened++; - ret = 0; - } else { - ret = __btrfs_open_devices(fs_devices, flags, holder); - } - mutex_unlock(&uuid_mutex); - return ret; -} - -int btrfs_scan_one_device(const char *path, fmode_t flags, void *holder, - struct btrfs_fs_devices **fs_devices_ret) -{ - struct btrfs_super_block *disk_super; - struct block_device *bdev; - struct buffer_head *bh; - int ret; - u64 devid; - u64 transid; - - flags |= FMODE_EXCL; - bdev = blkdev_get_by_path(path, flags, holder); - - if (IS_ERR(bdev)) { - ret = PTR_ERR(bdev); - goto error; - } - - mutex_lock(&uuid_mutex); - ret = set_blocksize(bdev, 4096); - if (ret) - goto error_close; - bh = btrfs_read_dev_super(bdev); - if (!bh) { - ret = -EINVAL; - goto error_close; - } - disk_super = (struct btrfs_super_block *)bh->b_data; - devid = btrfs_stack_device_id(&disk_super->dev_item); - transid = btrfs_super_generation(disk_super); - if (disk_super->label[0]) - printk(KERN_INFO "device label %s ", disk_super->label); - else - printk(KERN_INFO "device fsid %pU ", disk_super->fsid); - printk(KERN_CONT "devid %llu transid %llu %s\n", - (unsigned long long)devid, (unsigned long long)transid, path); - ret = device_list_add(path, disk_super, devid, fs_devices_ret); - - brelse(bh); -error_close: - mutex_unlock(&uuid_mutex); - blkdev_put(bdev, flags); -error: - return ret; -} - -/* helper to account the used device space in the range */ -int btrfs_account_dev_extents_size(struct btrfs_device *device, u64 start, - u64 end, u64 *length) -{ - struct btrfs_key key; - struct btrfs_root *root = device->dev_root; - struct btrfs_dev_extent *dev_extent; - struct btrfs_path *path; - u64 extent_end; - int ret; - int slot; - struct extent_buffer *l; - - *length = 0; - - if (start >= device->total_bytes) - return 0; - - path = btrfs_alloc_path(); - if (!path) - return -ENOMEM; - path->reada = 2; - - key.objectid = device->devid; - key.offset = start; - key.type = BTRFS_DEV_EXTENT_KEY; - - ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); - if (ret < 0) - goto out; - if (ret > 0) { - ret = btrfs_previous_item(root, path, key.objectid, key.type); - if (ret < 0) - goto out; - } - - while (1) { - l = path->nodes[0]; - slot = path->slots[0]; - if (slot >= btrfs_header_nritems(l)) { - ret = btrfs_next_leaf(root, path); - if (ret == 0) - continue; - if (ret < 0) - goto out; - - break; - } - btrfs_item_key_to_cpu(l, &key, slot); - - if (key.objectid < device->devid) - goto next; - - if (key.objectid > device->devid) - break; - - if (btrfs_key_type(&key) != BTRFS_DEV_EXTENT_KEY) - goto next; - - dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent); - extent_end = key.offset + btrfs_dev_extent_length(l, - dev_extent); - if (key.offset <= start && extent_end > end) { - *length = end - start + 1; - break; - } else if (key.offset <= start && extent_end > start) - *length += extent_end - start; - else if (key.offset > start && extent_end <= end) - *length += extent_end - key.offset; - else if (key.offset > start && key.offset <= end) { - *length += end - key.offset + 1; - break; - } else if (key.offset > end) - break; - -next: - path->slots[0]++; - } - ret = 0; -out: - btrfs_free_path(path); - return ret; -} - -/* - * find_free_dev_extent - find free space in the specified device - * @device: the device which we search the free space in - * @num_bytes: the size of the free space that we need - * @start: store the start of the free space. 
- * @len: the size of the free space. that we find, or the size of the max - * free space if we don't find suitable free space - * - * this uses a pretty simple search, the expectation is that it is - * called very infrequently and that a given device has a small number - * of extents - * - * @start is used to store the start of the free space if we find. But if we - * don't find suitable free space, it will be used to store the start position - * of the max free space. - * - * @len is used to store the size of the free space that we find. - * But if we don't find suitable free space, it is used to store the size of - * the max free space. - */ -int find_free_dev_extent(struct btrfs_device *device, u64 num_bytes, - u64 *start, u64 *len) -{ - struct btrfs_key key; - struct btrfs_root *root = device->dev_root; - struct btrfs_dev_extent *dev_extent; - struct btrfs_path *path; - u64 hole_size; - u64 max_hole_start; - u64 max_hole_size; - u64 extent_end; - u64 search_start; - u64 search_end = device->total_bytes; - int ret; - int slot; - struct extent_buffer *l; - - /* FIXME use last free of some kind */ - - /* we don't want to overwrite the superblock on the drive, - * so we make sure to start at an offset of at least 1MB - */ - search_start = max(root->fs_info->alloc_start, 1024ull * 1024); - - max_hole_start = search_start; - max_hole_size = 0; - hole_size = 0; - - if (search_start >= search_end) { - ret = -ENOSPC; - goto error; - } - - path = btrfs_alloc_path(); - if (!path) { - ret = -ENOMEM; - goto error; - } - path->reada = 2; - - key.objectid = device->devid; - key.offset = search_start; - key.type = BTRFS_DEV_EXTENT_KEY; - - ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); - if (ret < 0) - goto out; - if (ret > 0) { - ret = btrfs_previous_item(root, path, key.objectid, key.type); - if (ret < 0) - goto out; - } - - while (1) { - l = path->nodes[0]; - slot = path->slots[0]; - if (slot >= btrfs_header_nritems(l)) { - ret = btrfs_next_leaf(root, path); - if (ret == 0) - continue; - if (ret < 0) - goto out; - - break; - } - btrfs_item_key_to_cpu(l, &key, slot); - - if (key.objectid < device->devid) - goto next; - - if (key.objectid > device->devid) - break; - - if (btrfs_key_type(&key) != BTRFS_DEV_EXTENT_KEY) - goto next; - - if (key.offset > search_start) { - hole_size = key.offset - search_start; - - if (hole_size > max_hole_size) { - max_hole_start = search_start; - max_hole_size = hole_size; - } - - /* - * If this free space is greater than which we need, - * it must be the max free space that we have found - * until now, so max_hole_start must point to the start - * of this free space and the length of this free space - * is stored in max_hole_size. Thus, we return - * max_hole_start and max_hole_size and go back to the - * caller. - */ - if (hole_size >= num_bytes) { - ret = 0; - goto out; - } - } - - dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent); - extent_end = key.offset + btrfs_dev_extent_length(l, - dev_extent); - if (extent_end > search_start) - search_start = extent_end; -next: - path->slots[0]++; - cond_resched(); - } - - /* - * At this point, search_start should be the end of - * allocated dev extents, and when shrinking the device, - * search_end may be smaller than search_start. - */ - if (search_end > search_start) - hole_size = search_end - search_start; - - if (hole_size > max_hole_size) { - max_hole_start = search_start; - max_hole_size = hole_size; - } - - /* See above. 
*/ - if (hole_size < num_bytes) - ret = -ENOSPC; - else - ret = 0; - -out: - btrfs_free_path(path); -error: - *start = max_hole_start; - if (len) - *len = max_hole_size; - return ret; -} - -static int btrfs_free_dev_extent(struct btrfs_trans_handle *trans, - struct btrfs_device *device, - u64 start) -{ - int ret; - struct btrfs_path *path; - struct btrfs_root *root = device->dev_root; - struct btrfs_key key; - struct btrfs_key found_key; - struct extent_buffer *leaf = NULL; - struct btrfs_dev_extent *extent = NULL; - - path = btrfs_alloc_path(); - if (!path) - return -ENOMEM; - - key.objectid = device->devid; - key.offset = start; - key.type = BTRFS_DEV_EXTENT_KEY; -again: - ret = btrfs_search_slot(trans, root, &key, path, -1, 1); - if (ret > 0) { - ret = btrfs_previous_item(root, path, key.objectid, - BTRFS_DEV_EXTENT_KEY); - if (ret) - goto out; - leaf = path->nodes[0]; - btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); - extent = btrfs_item_ptr(leaf, path->slots[0], - struct btrfs_dev_extent); - BUG_ON(found_key.offset > start || found_key.offset + - btrfs_dev_extent_length(leaf, extent) < start); - key = found_key; - btrfs_release_path(path); - goto again; - } else if (ret == 0) { - leaf = path->nodes[0]; - extent = btrfs_item_ptr(leaf, path->slots[0], - struct btrfs_dev_extent); - } else { - btrfs_error(root->fs_info, ret, "Slot search failed"); - goto out; - } - - if (device->bytes_used > 0) { - u64 len = btrfs_dev_extent_length(leaf, extent); - device->bytes_used -= len; - spin_lock(&root->fs_info->free_chunk_lock); - root->fs_info->free_chunk_space += len; - spin_unlock(&root->fs_info->free_chunk_lock); - } - ret = btrfs_del_item(trans, root, path); - if (ret) { - btrfs_error(root->fs_info, ret, - "Failed to remove dev extent item"); - } -out: - btrfs_free_path(path); - return ret; -} - -int btrfs_alloc_dev_extent(struct btrfs_trans_handle *trans, - struct btrfs_device *device, - u64 chunk_tree, u64 chunk_objectid, - u64 chunk_offset, u64 start, u64 num_bytes) -{ - int ret; - struct btrfs_path *path; - struct btrfs_root *root = device->dev_root; - struct btrfs_dev_extent *extent; - struct extent_buffer *leaf; - struct btrfs_key key; - - WARN_ON(!device->in_fs_metadata); - path = btrfs_alloc_path(); - if (!path) - return -ENOMEM; - - key.objectid = device->devid; - key.offset = start; - key.type = BTRFS_DEV_EXTENT_KEY; - ret = btrfs_insert_empty_item(trans, root, path, &key, - sizeof(*extent)); - if (ret) - goto out; - - leaf = path->nodes[0]; - extent = btrfs_item_ptr(leaf, path->slots[0], - struct btrfs_dev_extent); - btrfs_set_dev_extent_chunk_tree(leaf, extent, chunk_tree); - btrfs_set_dev_extent_chunk_objectid(leaf, extent, chunk_objectid); - btrfs_set_dev_extent_chunk_offset(leaf, extent, chunk_offset); - - write_extent_buffer(leaf, root->fs_info->chunk_tree_uuid, - (unsigned long)btrfs_dev_extent_chunk_tree_uuid(extent), - BTRFS_UUID_SIZE); - - btrfs_set_dev_extent_length(leaf, extent, num_bytes); - btrfs_mark_buffer_dirty(leaf); -out: - btrfs_free_path(path); - return ret; -} - -static noinline int find_next_chunk(struct btrfs_root *root, - u64 objectid, u64 *offset) -{ - struct btrfs_path *path; - int ret; - struct btrfs_key key; - struct btrfs_chunk *chunk; - struct btrfs_key found_key; - - path = btrfs_alloc_path(); - if (!path) - return -ENOMEM; - - key.objectid = objectid; - key.offset = (u64)-1; - key.type = BTRFS_CHUNK_ITEM_KEY; - - ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); - if (ret < 0) - goto error; - - BUG_ON(ret == 0); /* Corruption */ - - 
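	/*
	 * The search key above used offset (u64)-1, so an exact match is
	 * impossible and the slot lands just past the last chunk item for
	 * @objectid.  Stepping back with btrfs_previous_item() below finds
	 * the highest existing chunk item; if it still belongs to @objectid,
	 * the next free offset is its key offset plus its chunk length,
	 * otherwise no chunks exist yet and the offset starts at 0.
	 */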
ret = btrfs_previous_item(root, path, 0, BTRFS_CHUNK_ITEM_KEY); - if (ret) { - *offset = 0; - } else { - btrfs_item_key_to_cpu(path->nodes[0], &found_key, - path->slots[0]); - if (found_key.objectid != objectid) - *offset = 0; - else { - chunk = btrfs_item_ptr(path->nodes[0], path->slots[0], - struct btrfs_chunk); - *offset = found_key.offset + - btrfs_chunk_length(path->nodes[0], chunk); - } - } - ret = 0; -error: - btrfs_free_path(path); - return ret; -} - -static noinline int find_next_devid(struct btrfs_root *root, u64 *objectid) -{ - int ret; - struct btrfs_key key; - struct btrfs_key found_key; - struct btrfs_path *path; - - root = root->fs_info->chunk_root; - - path = btrfs_alloc_path(); - if (!path) - return -ENOMEM; - - key.objectid = BTRFS_DEV_ITEMS_OBJECTID; - key.type = BTRFS_DEV_ITEM_KEY; - key.offset = (u64)-1; - - ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); - if (ret < 0) - goto error; - - BUG_ON(ret == 0); /* Corruption */ - - ret = btrfs_previous_item(root, path, BTRFS_DEV_ITEMS_OBJECTID, - BTRFS_DEV_ITEM_KEY); - if (ret) { - *objectid = 1; - } else { - btrfs_item_key_to_cpu(path->nodes[0], &found_key, - path->slots[0]); - *objectid = found_key.offset + 1; - } - ret = 0; -error: - btrfs_free_path(path); - return ret; -} - -/* - * the device information is stored in the chunk root - * the btrfs_device struct should be fully filled in - */ -int btrfs_add_device(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct btrfs_device *device) -{ - int ret; - struct btrfs_path *path; - struct btrfs_dev_item *dev_item; - struct extent_buffer *leaf; - struct btrfs_key key; - unsigned long ptr; - - root = root->fs_info->chunk_root; - - path = btrfs_alloc_path(); - if (!path) - return -ENOMEM; - - key.objectid = BTRFS_DEV_ITEMS_OBJECTID; - key.type = BTRFS_DEV_ITEM_KEY; - key.offset = device->devid; - - ret = btrfs_insert_empty_item(trans, root, path, &key, - sizeof(*dev_item)); - if (ret) - goto out; - - leaf = path->nodes[0]; - dev_item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dev_item); - - btrfs_set_device_id(leaf, dev_item, device->devid); - btrfs_set_device_generation(leaf, dev_item, 0); - btrfs_set_device_type(leaf, dev_item, device->type); - btrfs_set_device_io_align(leaf, dev_item, device->io_align); - btrfs_set_device_io_width(leaf, dev_item, device->io_width); - btrfs_set_device_sector_size(leaf, dev_item, device->sector_size); - btrfs_set_device_total_bytes(leaf, dev_item, device->total_bytes); - btrfs_set_device_bytes_used(leaf, dev_item, device->bytes_used); - btrfs_set_device_group(leaf, dev_item, 0); - btrfs_set_device_seek_speed(leaf, dev_item, 0); - btrfs_set_device_bandwidth(leaf, dev_item, 0); - btrfs_set_device_start_offset(leaf, dev_item, 0); - - ptr = (unsigned long)btrfs_device_uuid(dev_item); - write_extent_buffer(leaf, device->uuid, ptr, BTRFS_UUID_SIZE); - ptr = (unsigned long)btrfs_device_fsid(dev_item); - write_extent_buffer(leaf, root->fs_info->fsid, ptr, BTRFS_UUID_SIZE); - btrfs_mark_buffer_dirty(leaf); - - ret = 0; -out: - btrfs_free_path(path); - return ret; -} - -static int btrfs_rm_dev_item(struct btrfs_root *root, - struct btrfs_device *device) -{ - int ret; - struct btrfs_path *path; - struct btrfs_key key; - struct btrfs_trans_handle *trans; - - root = root->fs_info->chunk_root; - - path = btrfs_alloc_path(); - if (!path) - return -ENOMEM; - - trans = btrfs_start_transaction(root, 0); - if (IS_ERR(trans)) { - btrfs_free_path(path); - return PTR_ERR(trans); - } - key.objectid = BTRFS_DEV_ITEMS_OBJECTID; - 
key.type = BTRFS_DEV_ITEM_KEY; - key.offset = device->devid; - lock_chunks(root); - - ret = btrfs_search_slot(trans, root, &key, path, -1, 1); - if (ret < 0) - goto out; - - if (ret > 0) { - ret = -ENOENT; - goto out; - } - - ret = btrfs_del_item(trans, root, path); - if (ret) - goto out; -out: - btrfs_free_path(path); - unlock_chunks(root); - btrfs_commit_transaction(trans, root); - return ret; -} - -int btrfs_rm_device(struct btrfs_root *root, char *device_path) -{ - struct btrfs_device *device; - struct btrfs_device *next_device; - struct block_device *bdev; - struct buffer_head *bh = NULL; - struct btrfs_super_block *disk_super; - struct btrfs_fs_devices *cur_devices; - u64 all_avail; - u64 devid; - u64 num_devices; - u8 *dev_uuid; - int ret = 0; - bool clear_super = false; - - mutex_lock(&uuid_mutex); - - all_avail = root->fs_info->avail_data_alloc_bits | - root->fs_info->avail_system_alloc_bits | - root->fs_info->avail_metadata_alloc_bits; - - if ((all_avail & BTRFS_BLOCK_GROUP_RAID10) && - root->fs_info->fs_devices->num_devices <= 4) { - printk(KERN_ERR "btrfs: unable to go below four devices " - "on raid10\n"); - ret = -EINVAL; - goto out; - } - - if ((all_avail & BTRFS_BLOCK_GROUP_RAID1) && - root->fs_info->fs_devices->num_devices <= 2) { - printk(KERN_ERR "btrfs: unable to go below two " - "devices on raid1\n"); - ret = -EINVAL; - goto out; - } - - if (strcmp(device_path, "missing") == 0) { - struct list_head *devices; - struct btrfs_device *tmp; - - device = NULL; - devices = &root->fs_info->fs_devices->devices; - /* - * It is safe to read the devices since the volume_mutex - * is held. - */ - list_for_each_entry(tmp, devices, dev_list) { - if (tmp->in_fs_metadata && !tmp->bdev) { - device = tmp; - break; - } - } - bdev = NULL; - bh = NULL; - disk_super = NULL; - if (!device) { - printk(KERN_ERR "btrfs: no missing devices found to " - "remove\n"); - goto out; - } - } else { - bdev = blkdev_get_by_path(device_path, FMODE_READ | FMODE_EXCL, - root->fs_info->bdev_holder); - if (IS_ERR(bdev)) { - ret = PTR_ERR(bdev); - goto out; - } - - set_blocksize(bdev, 4096); - invalidate_bdev(bdev); - bh = btrfs_read_dev_super(bdev); - if (!bh) { - ret = -EINVAL; - goto error_close; - } - disk_super = (struct btrfs_super_block *)bh->b_data; - devid = btrfs_stack_device_id(&disk_super->dev_item); - dev_uuid = disk_super->dev_item.uuid; - device = btrfs_find_device(root, devid, dev_uuid, - disk_super->fsid); - if (!device) { - ret = -ENOENT; - goto error_brelse; - } - } - - if (device->writeable && root->fs_info->fs_devices->rw_devices == 1) { - printk(KERN_ERR "btrfs: unable to remove the only writeable " - "device\n"); - ret = -EINVAL; - goto error_brelse; - } - - if (device->writeable) { - lock_chunks(root); - list_del_init(&device->dev_alloc_list); - unlock_chunks(root); - root->fs_info->fs_devices->rw_devices--; - clear_super = true; - } - - ret = btrfs_shrink_device(device, 0); - if (ret) - goto error_undo; - - ret = btrfs_rm_dev_item(root->fs_info->chunk_root, device); - if (ret) - goto error_undo; - - spin_lock(&root->fs_info->free_chunk_lock); - root->fs_info->free_chunk_space = device->total_bytes - - device->bytes_used; - spin_unlock(&root->fs_info->free_chunk_lock); - - device->in_fs_metadata = 0; - btrfs_scrub_cancel_dev(root, device); - - /* - * the device list mutex makes sure that we don't change - * the device list while someone else is writing out all - * the device supers. 
- */ - - cur_devices = device->fs_devices; - mutex_lock(&root->fs_info->fs_devices->device_list_mutex); - list_del_rcu(&device->dev_list); - - device->fs_devices->num_devices--; - - if (device->missing) - root->fs_info->fs_devices->missing_devices--; - - next_device = list_entry(root->fs_info->fs_devices->devices.next, - struct btrfs_device, dev_list); - if (device->bdev == root->fs_info->sb->s_bdev) - root->fs_info->sb->s_bdev = next_device->bdev; - if (device->bdev == root->fs_info->fs_devices->latest_bdev) - root->fs_info->fs_devices->latest_bdev = next_device->bdev; - - if (device->bdev) - device->fs_devices->open_devices--; - - call_rcu(&device->rcu, free_device); - mutex_unlock(&root->fs_info->fs_devices->device_list_mutex); - - num_devices = btrfs_super_num_devices(root->fs_info->super_copy) - 1; - btrfs_set_super_num_devices(root->fs_info->super_copy, num_devices); - - if (cur_devices->open_devices == 0) { - struct btrfs_fs_devices *fs_devices; - fs_devices = root->fs_info->fs_devices; - while (fs_devices) { - if (fs_devices->seed == cur_devices) - break; - fs_devices = fs_devices->seed; - } - fs_devices->seed = cur_devices->seed; - cur_devices->seed = NULL; - lock_chunks(root); - __btrfs_close_devices(cur_devices); - unlock_chunks(root); - free_fs_devices(cur_devices); - } - - /* - * at this point, the device is zero sized. We want to - * remove it from the devices list and zero out the old super - */ - if (clear_super) { - /* make sure this device isn't detected as part of - * the FS anymore - */ - memset(&disk_super->magic, 0, sizeof(disk_super->magic)); - set_buffer_dirty(bh); - sync_dirty_buffer(bh); - } - - ret = 0; - -error_brelse: - brelse(bh); -error_close: - if (bdev) - blkdev_put(bdev, FMODE_READ | FMODE_EXCL); -out: - mutex_unlock(&uuid_mutex); - return ret; -error_undo: - if (device->writeable) { - lock_chunks(root); - list_add(&device->dev_alloc_list, - &root->fs_info->fs_devices->alloc_list); - unlock_chunks(root); - root->fs_info->fs_devices->rw_devices++; - } - goto error_brelse; -} - -/* - * does all the dirty work required for changing file system's UUID. 
- */ -static int btrfs_prepare_sprout(struct btrfs_root *root) -{ - struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices; - struct btrfs_fs_devices *old_devices; - struct btrfs_fs_devices *seed_devices; - struct btrfs_super_block *disk_super = root->fs_info->super_copy; - struct btrfs_device *device; - u64 super_flags; - - BUG_ON(!mutex_is_locked(&uuid_mutex)); - if (!fs_devices->seeding) - return -EINVAL; - - seed_devices = kzalloc(sizeof(*fs_devices), GFP_NOFS); - if (!seed_devices) - return -ENOMEM; - - old_devices = clone_fs_devices(fs_devices); - if (IS_ERR(old_devices)) { - kfree(seed_devices); - return PTR_ERR(old_devices); - } - - list_add(&old_devices->list, &fs_uuids); - - memcpy(seed_devices, fs_devices, sizeof(*seed_devices)); - seed_devices->opened = 1; - INIT_LIST_HEAD(&seed_devices->devices); - INIT_LIST_HEAD(&seed_devices->alloc_list); - mutex_init(&seed_devices->device_list_mutex); - - mutex_lock(&root->fs_info->fs_devices->device_list_mutex); - list_splice_init_rcu(&fs_devices->devices, &seed_devices->devices, - synchronize_rcu); - mutex_unlock(&root->fs_info->fs_devices->device_list_mutex); - - list_splice_init(&fs_devices->alloc_list, &seed_devices->alloc_list); - list_for_each_entry(device, &seed_devices->devices, dev_list) { - device->fs_devices = seed_devices; - } - - fs_devices->seeding = 0; - fs_devices->num_devices = 0; - fs_devices->open_devices = 0; - fs_devices->seed = seed_devices; - - generate_random_uuid(fs_devices->fsid); - memcpy(root->fs_info->fsid, fs_devices->fsid, BTRFS_FSID_SIZE); - memcpy(disk_super->fsid, fs_devices->fsid, BTRFS_FSID_SIZE); - super_flags = btrfs_super_flags(disk_super) & - ~BTRFS_SUPER_FLAG_SEEDING; - btrfs_set_super_flags(disk_super, super_flags); - - return 0; -} - -/* - * store the expected generation for seed devices in device items. 
- */ -static int btrfs_finish_sprout(struct btrfs_trans_handle *trans, - struct btrfs_root *root) -{ - struct btrfs_path *path; - struct extent_buffer *leaf; - struct btrfs_dev_item *dev_item; - struct btrfs_device *device; - struct btrfs_key key; - u8 fs_uuid[BTRFS_UUID_SIZE]; - u8 dev_uuid[BTRFS_UUID_SIZE]; - u64 devid; - int ret; - - path = btrfs_alloc_path(); - if (!path) - return -ENOMEM; - - root = root->fs_info->chunk_root; - key.objectid = BTRFS_DEV_ITEMS_OBJECTID; - key.offset = 0; - key.type = BTRFS_DEV_ITEM_KEY; - - while (1) { - ret = btrfs_search_slot(trans, root, &key, path, 0, 1); - if (ret < 0) - goto error; - - leaf = path->nodes[0]; -next_slot: - if (path->slots[0] >= btrfs_header_nritems(leaf)) { - ret = btrfs_next_leaf(root, path); - if (ret > 0) - break; - if (ret < 0) - goto error; - leaf = path->nodes[0]; - btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); - btrfs_release_path(path); - continue; - } - - btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); - if (key.objectid != BTRFS_DEV_ITEMS_OBJECTID || - key.type != BTRFS_DEV_ITEM_KEY) - break; - - dev_item = btrfs_item_ptr(leaf, path->slots[0], - struct btrfs_dev_item); - devid = btrfs_device_id(leaf, dev_item); - read_extent_buffer(leaf, dev_uuid, - (unsigned long)btrfs_device_uuid(dev_item), - BTRFS_UUID_SIZE); - read_extent_buffer(leaf, fs_uuid, - (unsigned long)btrfs_device_fsid(dev_item), - BTRFS_UUID_SIZE); - device = btrfs_find_device(root, devid, dev_uuid, fs_uuid); - BUG_ON(!device); /* Logic error */ - - if (device->fs_devices->seeding) { - btrfs_set_device_generation(leaf, dev_item, - device->generation); - btrfs_mark_buffer_dirty(leaf); - } - - path->slots[0]++; - goto next_slot; - } - ret = 0; -error: - btrfs_free_path(path); - return ret; -} - -int btrfs_init_new_device(struct btrfs_root *root, char *device_path) -{ - struct request_queue *q; - struct btrfs_trans_handle *trans; - struct btrfs_device *device; - struct block_device *bdev; - struct list_head *devices; - struct super_block *sb = root->fs_info->sb; - u64 total_bytes; - int seeding_dev = 0; - int ret = 0; - - if ((sb->s_flags & MS_RDONLY) && !root->fs_info->fs_devices->seeding) - return -EINVAL; - - bdev = blkdev_get_by_path(device_path, FMODE_WRITE | FMODE_EXCL, - root->fs_info->bdev_holder); - if (IS_ERR(bdev)) - return PTR_ERR(bdev); - - if (root->fs_info->fs_devices->seeding) { - seeding_dev = 1; - down_write(&sb->s_umount); - mutex_lock(&uuid_mutex); - } - - filemap_write_and_wait(bdev->bd_inode->i_mapping); - - devices = &root->fs_info->fs_devices->devices; - /* - * we have the volume lock, so we don't need the extra - * device list mutex while reading the list here. 
- */ - list_for_each_entry(device, devices, dev_list) { - if (device->bdev == bdev) { - ret = -EEXIST; - goto error; - } - } - - device = kzalloc(sizeof(*device), GFP_NOFS); - if (!device) { - /* we can safely leave the fs_devices entry around */ - ret = -ENOMEM; - goto error; - } - - device->name = kstrdup(device_path, GFP_NOFS); - if (!device->name) { - kfree(device); - ret = -ENOMEM; - goto error; - } - - ret = find_next_devid(root, &device->devid); - if (ret) { - kfree(device->name); - kfree(device); - goto error; - } - - trans = btrfs_start_transaction(root, 0); - if (IS_ERR(trans)) { - kfree(device->name); - kfree(device); - ret = PTR_ERR(trans); - goto error; - } - - lock_chunks(root); - - q = bdev_get_queue(bdev); - if (blk_queue_discard(q)) - device->can_discard = 1; - device->writeable = 1; - device->work.func = pending_bios_fn; - generate_random_uuid(device->uuid); - spin_lock_init(&device->io_lock); - device->generation = trans->transid; - device->io_width = root->sectorsize; - device->io_align = root->sectorsize; - device->sector_size = root->sectorsize; - device->total_bytes = i_size_read(bdev->bd_inode); - device->disk_total_bytes = device->total_bytes; - device->dev_root = root->fs_info->dev_root; - device->bdev = bdev; - device->in_fs_metadata = 1; - device->mode = FMODE_EXCL; - set_blocksize(device->bdev, 4096); - - if (seeding_dev) { - sb->s_flags &= ~MS_RDONLY; - ret = btrfs_prepare_sprout(root); - BUG_ON(ret); /* -ENOMEM */ - } - - device->fs_devices = root->fs_info->fs_devices; - - /* - * we don't want write_supers to jump in here with our device - * half setup - */ - mutex_lock(&root->fs_info->fs_devices->device_list_mutex); - list_add_rcu(&device->dev_list, &root->fs_info->fs_devices->devices); - list_add(&device->dev_alloc_list, - &root->fs_info->fs_devices->alloc_list); - root->fs_info->fs_devices->num_devices++; - root->fs_info->fs_devices->open_devices++; - root->fs_info->fs_devices->rw_devices++; - if (device->can_discard) - root->fs_info->fs_devices->num_can_discard++; - root->fs_info->fs_devices->total_rw_bytes += device->total_bytes; - - spin_lock(&root->fs_info->free_chunk_lock); - root->fs_info->free_chunk_space += device->total_bytes; - spin_unlock(&root->fs_info->free_chunk_lock); - - if (!blk_queue_nonrot(bdev_get_queue(bdev))) - root->fs_info->fs_devices->rotating = 1; - - total_bytes = btrfs_super_total_bytes(root->fs_info->super_copy); - btrfs_set_super_total_bytes(root->fs_info->super_copy, - total_bytes + device->total_bytes); - - total_bytes = btrfs_super_num_devices(root->fs_info->super_copy); - btrfs_set_super_num_devices(root->fs_info->super_copy, - total_bytes + 1); - mutex_unlock(&root->fs_info->fs_devices->device_list_mutex); - - if (seeding_dev) { - ret = init_first_rw_device(trans, root, device); - if (ret) - goto error_trans; - ret = btrfs_finish_sprout(trans, root); - if (ret) - goto error_trans; - } else { - ret = btrfs_add_device(trans, root, device); - if (ret) - goto error_trans; - } - - /* - * we've got more storage, clear any full flags on the space - * infos - */ - btrfs_clear_space_info_full(root->fs_info); - - unlock_chunks(root); - ret = btrfs_commit_transaction(trans, root); - - if (seeding_dev) { - mutex_unlock(&uuid_mutex); - up_write(&sb->s_umount); - - if (ret) /* transaction commit */ - return ret; - - ret = btrfs_relocate_sys_chunks(root); - if (ret < 0) - btrfs_error(root->fs_info, ret, - "Failed to relocate sys chunks after " - "device initialization. 
This can be fixed " - "using the \"btrfs balance\" command."); - } - - return ret; - -error_trans: - unlock_chunks(root); - btrfs_abort_transaction(trans, root, ret); - btrfs_end_transaction(trans, root); - kfree(device->name); - kfree(device); -error: - blkdev_put(bdev, FMODE_EXCL); - if (seeding_dev) { - mutex_unlock(&uuid_mutex); - up_write(&sb->s_umount); - } - return ret; -} - -static noinline int btrfs_update_device(struct btrfs_trans_handle *trans, - struct btrfs_device *device) -{ - int ret; - struct btrfs_path *path; - struct btrfs_root *root; - struct btrfs_dev_item *dev_item; - struct extent_buffer *leaf; - struct btrfs_key key; - - root = device->dev_root->fs_info->chunk_root; - - path = btrfs_alloc_path(); - if (!path) - return -ENOMEM; - - key.objectid = BTRFS_DEV_ITEMS_OBJECTID; - key.type = BTRFS_DEV_ITEM_KEY; - key.offset = device->devid; - - ret = btrfs_search_slot(trans, root, &key, path, 0, 1); - if (ret < 0) - goto out; - - if (ret > 0) { - ret = -ENOENT; - goto out; - } - - leaf = path->nodes[0]; - dev_item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dev_item); - - btrfs_set_device_id(leaf, dev_item, device->devid); - btrfs_set_device_type(leaf, dev_item, device->type); - btrfs_set_device_io_align(leaf, dev_item, device->io_align); - btrfs_set_device_io_width(leaf, dev_item, device->io_width); - btrfs_set_device_sector_size(leaf, dev_item, device->sector_size); - btrfs_set_device_total_bytes(leaf, dev_item, device->disk_total_bytes); - btrfs_set_device_bytes_used(leaf, dev_item, device->bytes_used); - btrfs_mark_buffer_dirty(leaf); - -out: - btrfs_free_path(path); - return ret; -} - -static int __btrfs_grow_device(struct btrfs_trans_handle *trans, - struct btrfs_device *device, u64 new_size) -{ - struct btrfs_super_block *super_copy = - device->dev_root->fs_info->super_copy; - u64 old_total = btrfs_super_total_bytes(super_copy); - u64 diff = new_size - device->total_bytes; - - if (!device->writeable) - return -EACCES; - if (new_size <= device->total_bytes) - return -EINVAL; - - btrfs_set_super_total_bytes(super_copy, old_total + diff); - device->fs_devices->total_rw_bytes += diff; - - device->total_bytes = new_size; - device->disk_total_bytes = new_size; - btrfs_clear_space_info_full(device->dev_root->fs_info); - - return btrfs_update_device(trans, device); -} - -int btrfs_grow_device(struct btrfs_trans_handle *trans, - struct btrfs_device *device, u64 new_size) -{ - int ret; - lock_chunks(device->dev_root); - ret = __btrfs_grow_device(trans, device, new_size); - unlock_chunks(device->dev_root); - return ret; -} - -static int btrfs_free_chunk(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - u64 chunk_tree, u64 chunk_objectid, - u64 chunk_offset) -{ - int ret; - struct btrfs_path *path; - struct btrfs_key key; - - root = root->fs_info->chunk_root; - path = btrfs_alloc_path(); - if (!path) - return -ENOMEM; - - key.objectid = chunk_objectid; - key.offset = chunk_offset; - key.type = BTRFS_CHUNK_ITEM_KEY; - - ret = btrfs_search_slot(trans, root, &key, path, -1, 1); - if (ret < 0) - goto out; - else if (ret > 0) { /* Logic error or corruption */ - btrfs_error(root->fs_info, -ENOENT, - "Failed lookup while freeing chunk."); - ret = -ENOENT; - goto out; - } - - ret = btrfs_del_item(trans, root, path); - if (ret < 0) - btrfs_error(root->fs_info, ret, - "Failed to delete chunk item."); -out: - btrfs_free_path(path); - return ret; -} - -static int btrfs_del_sys_chunk(struct btrfs_root *root, u64 chunk_objectid, u64 - chunk_offset) -{ - struct 
btrfs_super_block *super_copy = root->fs_info->super_copy; - struct btrfs_disk_key *disk_key; - struct btrfs_chunk *chunk; - u8 *ptr; - int ret = 0; - u32 num_stripes; - u32 array_size; - u32 len = 0; - u32 cur; - struct btrfs_key key; - - array_size = btrfs_super_sys_array_size(super_copy); - - ptr = super_copy->sys_chunk_array; - cur = 0; - - while (cur < array_size) { - disk_key = (struct btrfs_disk_key *)ptr; - btrfs_disk_key_to_cpu(&key, disk_key); - - len = sizeof(*disk_key); - - if (key.type == BTRFS_CHUNK_ITEM_KEY) { - chunk = (struct btrfs_chunk *)(ptr + len); - num_stripes = btrfs_stack_chunk_num_stripes(chunk); - len += btrfs_chunk_item_size(num_stripes); - } else { - ret = -EIO; - break; - } - if (key.objectid == chunk_objectid && - key.offset == chunk_offset) { - memmove(ptr, ptr + len, array_size - (cur + len)); - array_size -= len; - btrfs_set_super_sys_array_size(super_copy, array_size); - } else { - ptr += len; - cur += len; - } - } - return ret; -} - -static int btrfs_relocate_chunk(struct btrfs_root *root, - u64 chunk_tree, u64 chunk_objectid, - u64 chunk_offset) -{ - struct extent_map_tree *em_tree; - struct btrfs_root *extent_root; - struct btrfs_trans_handle *trans; - struct extent_map *em; - struct map_lookup *map; - int ret; - int i; - - root = root->fs_info->chunk_root; - extent_root = root->fs_info->extent_root; - em_tree = &root->fs_info->mapping_tree.map_tree; - - ret = btrfs_can_relocate(extent_root, chunk_offset); - if (ret) - return -ENOSPC; - - /* step one, relocate all the extents inside this chunk */ - ret = btrfs_relocate_block_group(extent_root, chunk_offset); - if (ret) - return ret; - - trans = btrfs_start_transaction(root, 0); - BUG_ON(IS_ERR(trans)); - - lock_chunks(root); - - /* - * step two, delete the device extents and the - * chunk tree entries - */ - read_lock(&em_tree->lock); - em = lookup_extent_mapping(em_tree, chunk_offset, 1); - read_unlock(&em_tree->lock); - - BUG_ON(!em || em->start > chunk_offset || - em->start + em->len < chunk_offset); - map = (struct map_lookup *)em->bdev; - - for (i = 0; i < map->num_stripes; i++) { - ret = btrfs_free_dev_extent(trans, map->stripes[i].dev, - map->stripes[i].physical); - BUG_ON(ret); - - if (map->stripes[i].dev) { - ret = btrfs_update_device(trans, map->stripes[i].dev); - BUG_ON(ret); - } - } - ret = btrfs_free_chunk(trans, root, chunk_tree, chunk_objectid, - chunk_offset); - - BUG_ON(ret); - - trace_btrfs_chunk_free(root, map, chunk_offset, em->len); - - if (map->type & BTRFS_BLOCK_GROUP_SYSTEM) { - ret = btrfs_del_sys_chunk(root, chunk_objectid, chunk_offset); - BUG_ON(ret); - } - - ret = btrfs_remove_block_group(trans, extent_root, chunk_offset); - BUG_ON(ret); - - write_lock(&em_tree->lock); - remove_extent_mapping(em_tree, em); - write_unlock(&em_tree->lock); - - kfree(map); - em->bdev = NULL; - - /* once for the tree */ - free_extent_map(em); - /* once for us */ - free_extent_map(em); - - unlock_chunks(root); - btrfs_end_transaction(trans, root); - return 0; -} - -static int btrfs_relocate_sys_chunks(struct btrfs_root *root) -{ - struct btrfs_root *chunk_root = root->fs_info->chunk_root; - struct btrfs_path *path; - struct extent_buffer *leaf; - struct btrfs_chunk *chunk; - struct btrfs_key key; - struct btrfs_key found_key; - u64 chunk_tree = chunk_root->root_key.objectid; - u64 chunk_type; - bool retried = false; - int failed = 0; - int ret; - - path = btrfs_alloc_path(); - if (!path) - return -ENOMEM; - -again: - key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID; - key.offset = (u64)-1; - 
key.type = BTRFS_CHUNK_ITEM_KEY; - - while (1) { - ret = btrfs_search_slot(NULL, chunk_root, &key, path, 0, 0); - if (ret < 0) - goto error; - BUG_ON(ret == 0); /* Corruption */ - - ret = btrfs_previous_item(chunk_root, path, key.objectid, - key.type); - if (ret < 0) - goto error; - if (ret > 0) - break; - - leaf = path->nodes[0]; - btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); - - chunk = btrfs_item_ptr(leaf, path->slots[0], - struct btrfs_chunk); - chunk_type = btrfs_chunk_type(leaf, chunk); - btrfs_release_path(path); - - if (chunk_type & BTRFS_BLOCK_GROUP_SYSTEM) { - ret = btrfs_relocate_chunk(chunk_root, chunk_tree, - found_key.objectid, - found_key.offset); - if (ret == -ENOSPC) - failed++; - else if (ret) - BUG(); - } - - if (found_key.offset == 0) - break; - key.offset = found_key.offset - 1; - } - ret = 0; - if (failed && !retried) { - failed = 0; - retried = true; - goto again; - } else if (failed && retried) { - WARN_ON(1); - ret = -ENOSPC; - } -error: - btrfs_free_path(path); - return ret; -} - -static int insert_balance_item(struct btrfs_root *root, - struct btrfs_balance_control *bctl) -{ - struct btrfs_trans_handle *trans; - struct btrfs_balance_item *item; - struct btrfs_disk_balance_args disk_bargs; - struct btrfs_path *path; - struct extent_buffer *leaf; - struct btrfs_key key; - int ret, err; - - path = btrfs_alloc_path(); - if (!path) - return -ENOMEM; - - trans = btrfs_start_transaction(root, 0); - if (IS_ERR(trans)) { - btrfs_free_path(path); - return PTR_ERR(trans); - } - - key.objectid = BTRFS_BALANCE_OBJECTID; - key.type = BTRFS_BALANCE_ITEM_KEY; - key.offset = 0; - - ret = btrfs_insert_empty_item(trans, root, path, &key, - sizeof(*item)); - if (ret) - goto out; - - leaf = path->nodes[0]; - item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_balance_item); - - memset_extent_buffer(leaf, 0, (unsigned long)item, sizeof(*item)); - - btrfs_cpu_balance_args_to_disk(&disk_bargs, &bctl->data); - btrfs_set_balance_data(leaf, item, &disk_bargs); - btrfs_cpu_balance_args_to_disk(&disk_bargs, &bctl->meta); - btrfs_set_balance_meta(leaf, item, &disk_bargs); - btrfs_cpu_balance_args_to_disk(&disk_bargs, &bctl->sys); - btrfs_set_balance_sys(leaf, item, &disk_bargs); - - btrfs_set_balance_flags(leaf, item, bctl->flags); - - btrfs_mark_buffer_dirty(leaf); -out: - btrfs_free_path(path); - err = btrfs_commit_transaction(trans, root); - if (err && !ret) - ret = err; - return ret; -} - -static int del_balance_item(struct btrfs_root *root) -{ - struct btrfs_trans_handle *trans; - struct btrfs_path *path; - struct btrfs_key key; - int ret, err; - - path = btrfs_alloc_path(); - if (!path) - return -ENOMEM; - - trans = btrfs_start_transaction(root, 0); - if (IS_ERR(trans)) { - btrfs_free_path(path); - return PTR_ERR(trans); - } - - key.objectid = BTRFS_BALANCE_OBJECTID; - key.type = BTRFS_BALANCE_ITEM_KEY; - key.offset = 0; - - ret = btrfs_search_slot(trans, root, &key, path, -1, 1); - if (ret < 0) - goto out; - if (ret > 0) { - ret = -ENOENT; - goto out; - } - - ret = btrfs_del_item(trans, root, path); -out: - btrfs_free_path(path); - err = btrfs_commit_transaction(trans, root); - if (err && !ret) - ret = err; - return ret; -} - -/* - * This is a heuristic used to reduce the number of chunks balanced on - * resume after balance was interrupted. - */ -static void update_balance_args(struct btrfs_balance_control *bctl) -{ - /* - * Turn on soft mode for chunk types that were being converted. 
- */ - if (bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT) - bctl->data.flags |= BTRFS_BALANCE_ARGS_SOFT; - if (bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) - bctl->sys.flags |= BTRFS_BALANCE_ARGS_SOFT; - if (bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) - bctl->meta.flags |= BTRFS_BALANCE_ARGS_SOFT; - - /* - * Turn on the usage filter if it is not already used. The idea is - * that chunks that we have already balanced should be - * reasonably full. Don't do it for chunks that are being - * converted - that will keep us from relocating unconverted - * (albeit full) chunks. - */ - if (!(bctl->data.flags & BTRFS_BALANCE_ARGS_USAGE) && - !(bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT)) { - bctl->data.flags |= BTRFS_BALANCE_ARGS_USAGE; - bctl->data.usage = 90; - } - if (!(bctl->sys.flags & BTRFS_BALANCE_ARGS_USAGE) && - !(bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT)) { - bctl->sys.flags |= BTRFS_BALANCE_ARGS_USAGE; - bctl->sys.usage = 90; - } - if (!(bctl->meta.flags & BTRFS_BALANCE_ARGS_USAGE) && - !(bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT)) { - bctl->meta.flags |= BTRFS_BALANCE_ARGS_USAGE; - bctl->meta.usage = 90; - } -} - -/* - * Should be called with both balance and volume mutexes held to - * serialize other volume operations (add_dev/rm_dev/resize) with - * restriper. Same goes for unset_balance_control. - */ -static void set_balance_control(struct btrfs_balance_control *bctl) -{ - struct btrfs_fs_info *fs_info = bctl->fs_info; - - BUG_ON(fs_info->balance_ctl); - - spin_lock(&fs_info->balance_lock); - fs_info->balance_ctl = bctl; - spin_unlock(&fs_info->balance_lock); -} - -static void unset_balance_control(struct btrfs_fs_info *fs_info) -{ - struct btrfs_balance_control *bctl = fs_info->balance_ctl; - - BUG_ON(!fs_info->balance_ctl); - - spin_lock(&fs_info->balance_lock); - fs_info->balance_ctl = NULL; - spin_unlock(&fs_info->balance_lock); - - kfree(bctl); -} - -/* - * Balance filters. Return 1 if chunk should be filtered out - * (should not be balanced).
- */ -static int chunk_profiles_filter(u64 chunk_type, - struct btrfs_balance_args *bargs) -{ - chunk_type = chunk_to_extended(chunk_type) & - BTRFS_EXTENDED_PROFILE_MASK; - - if (bargs->profiles & chunk_type) - return 0; - - return 1; -} - -static u64 div_factor_fine(u64 num, int factor) -{ - if (factor <= 0) - return 0; - if (factor >= 100) - return num; - - num *= factor; - do_div(num, 100); - return num; -} - -static int chunk_usage_filter(struct btrfs_fs_info *fs_info, u64 chunk_offset, - struct btrfs_balance_args *bargs) -{ - struct btrfs_block_group_cache *cache; - u64 chunk_used, user_thresh; - int ret = 1; - - cache = btrfs_lookup_block_group(fs_info, chunk_offset); - chunk_used = btrfs_block_group_used(&cache->item); - - user_thresh = div_factor_fine(cache->key.offset, bargs->usage); - if (chunk_used < user_thresh) - ret = 0; - - btrfs_put_block_group(cache); - return ret; -} - -static int chunk_devid_filter(struct extent_buffer *leaf, - struct btrfs_chunk *chunk, - struct btrfs_balance_args *bargs) -{ - struct btrfs_stripe *stripe; - int num_stripes = btrfs_chunk_num_stripes(leaf, chunk); - int i; - - for (i = 0; i < num_stripes; i++) { - stripe = btrfs_stripe_nr(chunk, i); - if (btrfs_stripe_devid(leaf, stripe) == bargs->devid) - return 0; - } - - return 1; -} - -/* [pstart, pend) */ -static int chunk_drange_filter(struct extent_buffer *leaf, - struct btrfs_chunk *chunk, - u64 chunk_offset, - struct btrfs_balance_args *bargs) -{ - struct btrfs_stripe *stripe; - int num_stripes = btrfs_chunk_num_stripes(leaf, chunk); - u64 stripe_offset; - u64 stripe_length; - int factor; - int i; - - if (!(bargs->flags & BTRFS_BALANCE_ARGS_DEVID)) - return 0; - - if (btrfs_chunk_type(leaf, chunk) & (BTRFS_BLOCK_GROUP_DUP | - BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID10)) - factor = 2; - else - factor = 1; - factor = num_stripes / factor; - - for (i = 0; i < num_stripes; i++) { - stripe = btrfs_stripe_nr(chunk, i); - if (btrfs_stripe_devid(leaf, stripe) != bargs->devid) - continue; - - stripe_offset = btrfs_stripe_offset(leaf, stripe); - stripe_length = btrfs_chunk_length(leaf, chunk); - do_div(stripe_length, factor); - - if (stripe_offset < bargs->pend && - stripe_offset + stripe_length > bargs->pstart) - return 0; - } - - return 1; -} - -/* [vstart, vend) */ -static int chunk_vrange_filter(struct extent_buffer *leaf, - struct btrfs_chunk *chunk, - u64 chunk_offset, - struct btrfs_balance_args *bargs) -{ - if (chunk_offset < bargs->vend && - chunk_offset + btrfs_chunk_length(leaf, chunk) > bargs->vstart) - /* at least part of the chunk is inside this vrange */ - return 0; - - return 1; -} - -static int chunk_soft_convert_filter(u64 chunk_type, - struct btrfs_balance_args *bargs) -{ - if (!(bargs->flags & BTRFS_BALANCE_ARGS_CONVERT)) - return 0; - - chunk_type = chunk_to_extended(chunk_type) & - BTRFS_EXTENDED_PROFILE_MASK; - - if (bargs->target == chunk_type) - return 1; - - return 0; -} - -static int should_balance_chunk(struct btrfs_root *root, - struct extent_buffer *leaf, - struct btrfs_chunk *chunk, u64 chunk_offset) -{ - struct btrfs_balance_control *bctl = root->fs_info->balance_ctl; - struct btrfs_balance_args *bargs = NULL; - u64 chunk_type = btrfs_chunk_type(leaf, chunk); - - /* type filter */ - if (!((chunk_type & BTRFS_BLOCK_GROUP_TYPE_MASK) & - (bctl->flags & BTRFS_BALANCE_TYPE_MASK))) { - return 0; - } - - if (chunk_type & BTRFS_BLOCK_GROUP_DATA) - bargs = &bctl->data; - else if (chunk_type & BTRFS_BLOCK_GROUP_SYSTEM) - bargs = &bctl->sys; - else if (chunk_type & 
BTRFS_BLOCK_GROUP_METADATA) - bargs = &bctl->meta; - - /* profiles filter */ - if ((bargs->flags & BTRFS_BALANCE_ARGS_PROFILES) && - chunk_profiles_filter(chunk_type, bargs)) { - return 0; - } - - /* usage filter */ - if ((bargs->flags & BTRFS_BALANCE_ARGS_USAGE) && - chunk_usage_filter(bctl->fs_info, chunk_offset, bargs)) { - return 0; - } - - /* devid filter */ - if ((bargs->flags & BTRFS_BALANCE_ARGS_DEVID) && - chunk_devid_filter(leaf, chunk, bargs)) { - return 0; - } - - /* drange filter, makes sense only with devid filter */ - if ((bargs->flags & BTRFS_BALANCE_ARGS_DRANGE) && - chunk_drange_filter(leaf, chunk, chunk_offset, bargs)) { - return 0; - } - - /* vrange filter */ - if ((bargs->flags & BTRFS_BALANCE_ARGS_VRANGE) && - chunk_vrange_filter(leaf, chunk, chunk_offset, bargs)) { - return 0; - } - - /* soft profile changing mode */ - if ((bargs->flags & BTRFS_BALANCE_ARGS_SOFT) && - chunk_soft_convert_filter(chunk_type, bargs)) { - return 0; - } - - return 1; -} - -static u64 div_factor(u64 num, int factor) -{ - if (factor == 10) - return num; - num *= factor; - do_div(num, 10); - return num; -} - -static int __btrfs_balance(struct btrfs_fs_info *fs_info) -{ - struct btrfs_balance_control *bctl = fs_info->balance_ctl; - struct btrfs_root *chunk_root = fs_info->chunk_root; - struct btrfs_root *dev_root = fs_info->dev_root; - struct list_head *devices; - struct btrfs_device *device; - u64 old_size; - u64 size_to_free; - struct btrfs_chunk *chunk; - struct btrfs_path *path; - struct btrfs_key key; - struct btrfs_key found_key; - struct btrfs_trans_handle *trans; - struct extent_buffer *leaf; - int slot; - int ret; - int enospc_errors = 0; - bool counting = true; - - /* step one make some room on all the devices */ - devices = &fs_info->fs_devices->devices; - list_for_each_entry(device, devices, dev_list) { - old_size = device->total_bytes; - size_to_free = div_factor(old_size, 1); - size_to_free = min(size_to_free, (u64)1 * 1024 * 1024); - if (!device->writeable || - device->total_bytes - device->bytes_used > size_to_free) - continue; - - ret = btrfs_shrink_device(device, old_size - size_to_free); - if (ret == -ENOSPC) - break; - BUG_ON(ret); - - trans = btrfs_start_transaction(dev_root, 0); - BUG_ON(IS_ERR(trans)); - - ret = btrfs_grow_device(trans, device, old_size); - BUG_ON(ret); - - btrfs_end_transaction(trans, dev_root); - } - - /* step two, relocate all the chunks */ - path = btrfs_alloc_path(); - if (!path) { - ret = -ENOMEM; - goto error; - } - - /* zero out stat counters */ - spin_lock(&fs_info->balance_lock); - memset(&bctl->stat, 0, sizeof(bctl->stat)); - spin_unlock(&fs_info->balance_lock); -again: - key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID; - key.offset = (u64)-1; - key.type = BTRFS_CHUNK_ITEM_KEY; - - while (1) { - if ((!counting && atomic_read(&fs_info->balance_pause_req)) || - atomic_read(&fs_info->balance_cancel_req)) { - ret = -ECANCELED; - goto error; - } - - ret = btrfs_search_slot(NULL, chunk_root, &key, path, 0, 0); - if (ret < 0) - goto error; - - /* - * this shouldn't happen, it means the last relocate - * failed - */ - if (ret == 0) - BUG(); /* FIXME break ? 
*/ - - ret = btrfs_previous_item(chunk_root, path, 0, - BTRFS_CHUNK_ITEM_KEY); - if (ret) { - ret = 0; - break; - } - - leaf = path->nodes[0]; - slot = path->slots[0]; - btrfs_item_key_to_cpu(leaf, &found_key, slot); - - if (found_key.objectid != key.objectid) - break; - - /* chunk zero is special */ - if (found_key.offset == 0) - break; - - chunk = btrfs_item_ptr(leaf, slot, struct btrfs_chunk); - - if (!counting) { - spin_lock(&fs_info->balance_lock); - bctl->stat.considered++; - spin_unlock(&fs_info->balance_lock); - } - - ret = should_balance_chunk(chunk_root, leaf, chunk, - found_key.offset); - btrfs_release_path(path); - if (!ret) - goto loop; - - if (counting) { - spin_lock(&fs_info->balance_lock); - bctl->stat.expected++; - spin_unlock(&fs_info->balance_lock); - goto loop; - } - - ret = btrfs_relocate_chunk(chunk_root, - chunk_root->root_key.objectid, - found_key.objectid, - found_key.offset); - if (ret && ret != -ENOSPC) - goto error; - if (ret == -ENOSPC) { - enospc_errors++; - } else { - spin_lock(&fs_info->balance_lock); - bctl->stat.completed++; - spin_unlock(&fs_info->balance_lock); - } -loop: - key.offset = found_key.offset - 1; - } - - if (counting) { - btrfs_release_path(path); - counting = false; - goto again; - } -error: - btrfs_free_path(path); - if (enospc_errors) { - printk(KERN_INFO "btrfs: %d enospc errors during balance\n", - enospc_errors); - if (!ret) - ret = -ENOSPC; - } - - return ret; -} - -/** - * alloc_profile_is_valid - see if a given profile is valid and reduced - * @flags: profile to validate - * @extended: if true @flags is treated as an extended profile - */ -static int alloc_profile_is_valid(u64 flags, int extended) -{ - u64 mask = (extended ? BTRFS_EXTENDED_PROFILE_MASK : - BTRFS_BLOCK_GROUP_PROFILE_MASK); - - flags &= ~BTRFS_BLOCK_GROUP_TYPE_MASK; - - /* 1) check that all other bits are zeroed */ - if (flags & ~mask) - return 0; - - /* 2) see if profile is reduced */ - if (flags == 0) - return !extended; /* "0" is valid for usual profiles */ - - /* true if exactly one bit set */ - return (flags & (flags - 1)) == 0; -} - -static inline int balance_need_close(struct btrfs_fs_info *fs_info) -{ - /* cancel requested || normal exit path */ - return atomic_read(&fs_info->balance_cancel_req) || - (atomic_read(&fs_info->balance_pause_req) == 0 && - atomic_read(&fs_info->balance_cancel_req) == 0); -} - -static void __cancel_balance(struct btrfs_fs_info *fs_info) -{ - int ret; - - unset_balance_control(fs_info); - ret = del_balance_item(fs_info->tree_root); - BUG_ON(ret); -} - -void update_ioctl_balance_args(struct btrfs_fs_info *fs_info, int lock, - struct btrfs_ioctl_balance_args *bargs); - -/* - * Should be called with both balance and volume mutexes held - */ -int btrfs_balance(struct btrfs_balance_control *bctl, - struct btrfs_ioctl_balance_args *bargs) -{ - struct btrfs_fs_info *fs_info = bctl->fs_info; - u64 allowed; - int mixed = 0; - int ret; - - if (btrfs_fs_closing(fs_info) || - atomic_read(&fs_info->balance_pause_req) || - atomic_read(&fs_info->balance_cancel_req)) { - ret = -EINVAL; - goto out; - } - - allowed = btrfs_super_incompat_flags(fs_info->super_copy); - if (allowed & BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS) - mixed = 1; - - /* - * In case of mixed groups both data and meta should be picked, - * and identical options should be given for both of them. 
- */ - allowed = BTRFS_BALANCE_DATA | BTRFS_BALANCE_METADATA; - if (mixed && (bctl->flags & allowed)) { - if (!(bctl->flags & BTRFS_BALANCE_DATA) || - !(bctl->flags & BTRFS_BALANCE_METADATA) || - memcmp(&bctl->data, &bctl->meta, sizeof(bctl->data))) { - printk(KERN_ERR "btrfs: with mixed groups data and " - "metadata balance options must be the same\n"); - ret = -EINVAL; - goto out; - } - } - - allowed = BTRFS_AVAIL_ALLOC_BIT_SINGLE; - if (fs_info->fs_devices->num_devices == 1) - allowed |= BTRFS_BLOCK_GROUP_DUP; - else if (fs_info->fs_devices->num_devices < 4) - allowed |= (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID1); - else - allowed |= (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID1 | - BTRFS_BLOCK_GROUP_RAID10); - - if ((bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT) && - (!alloc_profile_is_valid(bctl->data.target, 1) || - (bctl->data.target & ~allowed))) { - printk(KERN_ERR "btrfs: unable to start balance with target " - "data profile %llu\n", - (unsigned long long)bctl->data.target); - ret = -EINVAL; - goto out; - } - if ((bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) && - (!alloc_profile_is_valid(bctl->meta.target, 1) || - (bctl->meta.target & ~allowed))) { - printk(KERN_ERR "btrfs: unable to start balance with target " - "metadata profile %llu\n", - (unsigned long long)bctl->meta.target); - ret = -EINVAL; - goto out; - } - if ((bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) && - (!alloc_profile_is_valid(bctl->sys.target, 1) || - (bctl->sys.target & ~allowed))) { - printk(KERN_ERR "btrfs: unable to start balance with target " - "system profile %llu\n", - (unsigned long long)bctl->sys.target); - ret = -EINVAL; - goto out; - } - - /* allow dup'ed data chunks only in mixed mode */ - if (!mixed && (bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT) && - (bctl->data.target & BTRFS_BLOCK_GROUP_DUP)) { - printk(KERN_ERR "btrfs: dup for data is not allowed\n"); - ret = -EINVAL; - goto out; - } - - /* allow to reduce meta or sys integrity only if force set */ - allowed = BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID1 | - BTRFS_BLOCK_GROUP_RAID10; - if (((bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) && - (fs_info->avail_system_alloc_bits & allowed) && - !(bctl->sys.target & allowed)) || - ((bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) && - (fs_info->avail_metadata_alloc_bits & allowed) && - !(bctl->meta.target & allowed))) { - if (bctl->flags & BTRFS_BALANCE_FORCE) { - printk(KERN_INFO "btrfs: force reducing metadata " - "integrity\n"); - } else { - printk(KERN_ERR "btrfs: balance will reduce metadata " - "integrity, use force if you want this\n"); - ret = -EINVAL; - goto out; - } - } - - ret = insert_balance_item(fs_info->tree_root, bctl); - if (ret && ret != -EEXIST) - goto out; - - if (!(bctl->flags & BTRFS_BALANCE_RESUME)) { - BUG_ON(ret == -EEXIST); - set_balance_control(bctl); - } else { - BUG_ON(ret != -EEXIST); - spin_lock(&fs_info->balance_lock); - update_balance_args(bctl); - spin_unlock(&fs_info->balance_lock); - } - - atomic_inc(&fs_info->balance_running); - mutex_unlock(&fs_info->balance_mutex); - - ret = __btrfs_balance(fs_info); - - mutex_lock(&fs_info->balance_mutex); - atomic_dec(&fs_info->balance_running); - - if (bargs) { - memset(bargs, 0, sizeof(*bargs)); - update_ioctl_balance_args(fs_info, 0, bargs); - } - - if ((ret && ret != -ECANCELED && ret != -ENOSPC) || - balance_need_close(fs_info)) { - __cancel_balance(fs_info); - } - - wake_up(&fs_info->balance_wait_q); - - return ret; -out: - if (bctl->flags & BTRFS_BALANCE_RESUME) - __cancel_balance(fs_info); - else 
- kfree(bctl); - return ret; -} - -static int balance_kthread(void *data) -{ - struct btrfs_balance_control *bctl = - (struct btrfs_balance_control *)data; - struct btrfs_fs_info *fs_info = bctl->fs_info; - int ret = 0; - - mutex_lock(&fs_info->volume_mutex); - mutex_lock(&fs_info->balance_mutex); - - set_balance_control(bctl); - - if (btrfs_test_opt(fs_info->tree_root, SKIP_BALANCE)) { - printk(KERN_INFO "btrfs: force skipping balance\n"); - } else { - printk(KERN_INFO "btrfs: continuing balance\n"); - ret = btrfs_balance(bctl, NULL); - } - - mutex_unlock(&fs_info->balance_mutex); - mutex_unlock(&fs_info->volume_mutex); - return ret; -} - -int btrfs_recover_balance(struct btrfs_root *tree_root) -{ - struct task_struct *tsk; - struct btrfs_balance_control *bctl; - struct btrfs_balance_item *item; - struct btrfs_disk_balance_args disk_bargs; - struct btrfs_path *path; - struct extent_buffer *leaf; - struct btrfs_key key; - int ret; - - path = btrfs_alloc_path(); - if (!path) - return -ENOMEM; - - bctl = kzalloc(sizeof(*bctl), GFP_NOFS); - if (!bctl) { - ret = -ENOMEM; - goto out; - } - - key.objectid = BTRFS_BALANCE_OBJECTID; - key.type = BTRFS_BALANCE_ITEM_KEY; - key.offset = 0; - - ret = btrfs_search_slot(NULL, tree_root, &key, path, 0, 0); - if (ret < 0) - goto out_bctl; - if (ret > 0) { /* ret = -ENOENT; */ - ret = 0; - goto out_bctl; - } - - leaf = path->nodes[0]; - item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_balance_item); - - bctl->fs_info = tree_root->fs_info; - bctl->flags = btrfs_balance_flags(leaf, item) | BTRFS_BALANCE_RESUME; - - btrfs_balance_data(leaf, item, &disk_bargs); - btrfs_disk_balance_args_to_cpu(&bctl->data, &disk_bargs); - btrfs_balance_meta(leaf, item, &disk_bargs); - btrfs_disk_balance_args_to_cpu(&bctl->meta, &disk_bargs); - btrfs_balance_sys(leaf, item, &disk_bargs); - btrfs_disk_balance_args_to_cpu(&bctl->sys, &disk_bargs); - - tsk = kthread_run(balance_kthread, bctl, "btrfs-balance"); - if (IS_ERR(tsk)) - ret = PTR_ERR(tsk); - else - goto out; - -out_bctl: - kfree(bctl); -out: - btrfs_free_path(path); - return ret; -} - -int btrfs_pause_balance(struct btrfs_fs_info *fs_info) -{ - int ret = 0; - - mutex_lock(&fs_info->balance_mutex); - if (!fs_info->balance_ctl) { - mutex_unlock(&fs_info->balance_mutex); - return -ENOTCONN; - } - - if (atomic_read(&fs_info->balance_running)) { - atomic_inc(&fs_info->balance_pause_req); - mutex_unlock(&fs_info->balance_mutex); - - wait_event(fs_info->balance_wait_q, - atomic_read(&fs_info->balance_running) == 0); - - mutex_lock(&fs_info->balance_mutex); - /* we are good with balance_ctl ripped off from under us */ - BUG_ON(atomic_read(&fs_info->balance_running)); - atomic_dec(&fs_info->balance_pause_req); - } else { - ret = -ENOTCONN; - } - - mutex_unlock(&fs_info->balance_mutex); - return ret; -} - -int btrfs_cancel_balance(struct btrfs_fs_info *fs_info) -{ - mutex_lock(&fs_info->balance_mutex); - if (!fs_info->balance_ctl) { - mutex_unlock(&fs_info->balance_mutex); - return -ENOTCONN; - } - - atomic_inc(&fs_info->balance_cancel_req); - /* - * if we are running just wait and return, balance item is - * deleted in btrfs_balance in this case - */ - if (atomic_read(&fs_info->balance_running)) { - mutex_unlock(&fs_info->balance_mutex); - wait_event(fs_info->balance_wait_q, - atomic_read(&fs_info->balance_running) == 0); - mutex_lock(&fs_info->balance_mutex); - } else { - /* __cancel_balance needs volume_mutex */ - mutex_unlock(&fs_info->balance_mutex); - mutex_lock(&fs_info->volume_mutex); - 
mutex_lock(&fs_info->balance_mutex); - - if (fs_info->balance_ctl) - __cancel_balance(fs_info); - - mutex_unlock(&fs_info->volume_mutex); - } - - BUG_ON(fs_info->balance_ctl || atomic_read(&fs_info->balance_running)); - atomic_dec(&fs_info->balance_cancel_req); - mutex_unlock(&fs_info->balance_mutex); - return 0; -} - -/* - * shrinking a device means finding all of the device extents past - * the new size, and then following the back refs to the chunks. - * The chunk relocation code actually frees the device extent - */ -int btrfs_shrink_device(struct btrfs_device *device, u64 new_size) -{ - struct btrfs_trans_handle *trans; - struct btrfs_root *root = device->dev_root; - struct btrfs_dev_extent *dev_extent = NULL; - struct btrfs_path *path; - u64 length; - u64 chunk_tree; - u64 chunk_objectid; - u64 chunk_offset; - int ret; - int slot; - int failed = 0; - bool retried = false; - struct extent_buffer *l; - struct btrfs_key key; - struct btrfs_super_block *super_copy = root->fs_info->super_copy; - u64 old_total = btrfs_super_total_bytes(super_copy); - u64 old_size = device->total_bytes; - u64 diff = device->total_bytes - new_size; - - if (new_size >= device->total_bytes) - return -EINVAL; - - path = btrfs_alloc_path(); - if (!path) - return -ENOMEM; - - path->reada = 2; - - lock_chunks(root); - - device->total_bytes = new_size; - if (device->writeable) { - device->fs_devices->total_rw_bytes -= diff; - spin_lock(&root->fs_info->free_chunk_lock); - root->fs_info->free_chunk_space -= diff; - spin_unlock(&root->fs_info->free_chunk_lock); - } - unlock_chunks(root); - -again: - key.objectid = device->devid; - key.offset = (u64)-1; - key.type = BTRFS_DEV_EXTENT_KEY; - - do { - ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); - if (ret < 0) - goto done; - - ret = btrfs_previous_item(root, path, 0, key.type); - if (ret < 0) - goto done; - if (ret) { - ret = 0; - btrfs_release_path(path); - break; - } - - l = path->nodes[0]; - slot = path->slots[0]; - btrfs_item_key_to_cpu(l, &key, path->slots[0]); - - if (key.objectid != device->devid) { - btrfs_release_path(path); - break; - } - - dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent); - length = btrfs_dev_extent_length(l, dev_extent); - - if (key.offset + length <= new_size) { - btrfs_release_path(path); - break; - } - - chunk_tree = btrfs_dev_extent_chunk_tree(l, dev_extent); - chunk_objectid = btrfs_dev_extent_chunk_objectid(l, dev_extent); - chunk_offset = btrfs_dev_extent_chunk_offset(l, dev_extent); - btrfs_release_path(path); - - ret = btrfs_relocate_chunk(root, chunk_tree, chunk_objectid, - chunk_offset); - if (ret && ret != -ENOSPC) - goto done; - if (ret == -ENOSPC) - failed++; - } while (key.offset-- > 0); - - if (failed && !retried) { - failed = 0; - retried = true; - goto again; - } else if (failed && retried) { - ret = -ENOSPC; - lock_chunks(root); - - device->total_bytes = old_size; - if (device->writeable) - device->fs_devices->total_rw_bytes += diff; - spin_lock(&root->fs_info->free_chunk_lock); - root->fs_info->free_chunk_space += diff; - spin_unlock(&root->fs_info->free_chunk_lock); - unlock_chunks(root); - goto done; - } - - /* Shrinking succeeded, else we would be at "done". */ - trans = btrfs_start_transaction(root, 0); - if (IS_ERR(trans)) { - ret = PTR_ERR(trans); - goto done; - } - - lock_chunks(root); - - device->disk_total_bytes = new_size; - /* Now btrfs_update_device() will change the on-disk size. 
*/ - ret = btrfs_update_device(trans, device); - if (ret) { - unlock_chunks(root); - btrfs_end_transaction(trans, root); - goto done; - } - WARN_ON(diff > old_total); - btrfs_set_super_total_bytes(super_copy, old_total - diff); - unlock_chunks(root); - btrfs_end_transaction(trans, root); -done: - btrfs_free_path(path); - return ret; -} - -static int btrfs_add_system_chunk(struct btrfs_root *root, - struct btrfs_key *key, - struct btrfs_chunk *chunk, int item_size) -{ - struct btrfs_super_block *super_copy = root->fs_info->super_copy; - struct btrfs_disk_key disk_key; - u32 array_size; - u8 *ptr; - - array_size = btrfs_super_sys_array_size(super_copy); - if (array_size + item_size > BTRFS_SYSTEM_CHUNK_ARRAY_SIZE) - return -EFBIG; - - ptr = super_copy->sys_chunk_array + array_size; - btrfs_cpu_key_to_disk(&disk_key, key); - memcpy(ptr, &disk_key, sizeof(disk_key)); - ptr += sizeof(disk_key); - memcpy(ptr, chunk, item_size); - item_size += sizeof(disk_key); - btrfs_set_super_sys_array_size(super_copy, array_size + item_size); - return 0; -} - -/* - * sort the devices in descending order by max_avail, total_avail - */ -static int btrfs_cmp_device_info(const void *a, const void *b) -{ - const struct btrfs_device_info *di_a = a; - const struct btrfs_device_info *di_b = b; - - if (di_a->max_avail > di_b->max_avail) - return -1; - if (di_a->max_avail < di_b->max_avail) - return 1; - if (di_a->total_avail > di_b->total_avail) - return -1; - if (di_a->total_avail < di_b->total_avail) - return 1; - return 0; -} - -static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans, - struct btrfs_root *extent_root, - struct map_lookup **map_ret, - u64 *num_bytes_out, u64 *stripe_size_out, - u64 start, u64 type) -{ - struct btrfs_fs_info *info = extent_root->fs_info; - struct btrfs_fs_devices *fs_devices = info->fs_devices; - struct list_head *cur; - struct map_lookup *map = NULL; - struct extent_map_tree *em_tree; - struct extent_map *em; - struct btrfs_device_info *devices_info = NULL; - u64 total_avail; - int num_stripes; /* total number of stripes to allocate */ - int sub_stripes; /* sub_stripes info for map */ - int dev_stripes; /* stripes per dev */ - int devs_max; /* max devs to use */ - int devs_min; /* min devs needed */ - int devs_increment; /* ndevs has to be a multiple of this */ - int ncopies; /* how many copies of the data we keep */ - int ret; - u64 max_stripe_size; - u64 max_chunk_size; - u64 stripe_size; - u64 num_bytes; - int ndevs; - int i; - int j; - - BUG_ON(!alloc_profile_is_valid(type, 0)); - - if (list_empty(&fs_devices->alloc_list)) - return -ENOSPC; - - sub_stripes = 1; - dev_stripes = 1; - devs_increment = 1; - ncopies = 1; - devs_max = 0; /* 0 == as many as possible */ - devs_min = 1; - - /* - * define the properties of each RAID type.
- * FIXME: move this to a global table and use it in all RAID - * calculation code - */ - if (type & (BTRFS_BLOCK_GROUP_DUP)) { - dev_stripes = 2; - ncopies = 2; - devs_max = 1; - } else if (type & (BTRFS_BLOCK_GROUP_RAID0)) { - devs_min = 2; - } else if (type & (BTRFS_BLOCK_GROUP_RAID1)) { - devs_increment = 2; - ncopies = 2; - devs_max = 2; - devs_min = 2; - } else if (type & (BTRFS_BLOCK_GROUP_RAID10)) { - sub_stripes = 2; - devs_increment = 2; - ncopies = 2; - devs_min = 4; - } else { - devs_max = 1; - } - - if (type & BTRFS_BLOCK_GROUP_DATA) { - max_stripe_size = 1024 * 1024 * 1024; - max_chunk_size = 10 * max_stripe_size; - } else if (type & BTRFS_BLOCK_GROUP_METADATA) { - /* for larger filesystems, use larger metadata chunks */ - if (fs_devices->total_rw_bytes > 50ULL * 1024 * 1024 * 1024) - max_stripe_size = 1024 * 1024 * 1024; - else - max_stripe_size = 256 * 1024 * 1024; - max_chunk_size = max_stripe_size; - } else if (type & BTRFS_BLOCK_GROUP_SYSTEM) { - max_stripe_size = 32 * 1024 * 1024; - max_chunk_size = 2 * max_stripe_size; - } else { - printk(KERN_ERR "btrfs: invalid chunk type 0x%llx requested\n", - type); - BUG_ON(1); - } - - /* we don't want a chunk larger than 10% of writeable space */ - max_chunk_size = min(div_factor(fs_devices->total_rw_bytes, 1), - max_chunk_size); - - devices_info = kzalloc(sizeof(*devices_info) * fs_devices->rw_devices, - GFP_NOFS); - if (!devices_info) - return -ENOMEM; - - cur = fs_devices->alloc_list.next; - - /* - * in the first pass through the devices list, we gather information - * about the available holes on each device. - */ - ndevs = 0; - while (cur != &fs_devices->alloc_list) { - struct btrfs_device *device; - u64 max_avail; - u64 dev_offset; - - device = list_entry(cur, struct btrfs_device, dev_alloc_list); - - cur = cur->next; - - if (!device->writeable) { - printk(KERN_ERR - "btrfs: read-only device in alloc_list\n"); - WARN_ON(1); - continue; - } - - if (!device->in_fs_metadata) - continue; - - if (device->total_bytes > device->bytes_used) - total_avail = device->total_bytes - device->bytes_used; - else - total_avail = 0; - - /* If there is no space on this device, skip it. */ - if (total_avail == 0) - continue; - - ret = find_free_dev_extent(device, - max_stripe_size * dev_stripes, - &dev_offset, &max_avail); - if (ret && ret != -ENOSPC) - goto error; - - if (ret == 0) - max_avail = max_stripe_size * dev_stripes; - - if (max_avail < BTRFS_STRIPE_LEN * dev_stripes) - continue; - - devices_info[ndevs].dev_offset = dev_offset; - devices_info[ndevs].max_avail = max_avail; - devices_info[ndevs].total_avail = total_avail; - devices_info[ndevs].dev = device; - ++ndevs; - } - - /* - * now sort the devices by hole size / available space - */ - sort(devices_info, ndevs, sizeof(struct btrfs_device_info), - btrfs_cmp_device_info, NULL); - - /* round down to number of usable stripes */ - ndevs -= ndevs % devs_increment; - - if (ndevs < devs_increment * sub_stripes || ndevs < devs_min) { - ret = -ENOSPC; - goto error; - } - - if (devs_max && ndevs > devs_max) - ndevs = devs_max; - /* - * the primary goal is to maximize the number of stripes, so use as many - * devices as possible, even if the stripes are not maximum sized. 
- */ - stripe_size = devices_info[ndevs-1].max_avail; - num_stripes = ndevs * dev_stripes; - - if (stripe_size * ndevs > max_chunk_size * ncopies) { - stripe_size = max_chunk_size * ncopies; - do_div(stripe_size, ndevs); - } - - do_div(stripe_size, dev_stripes); - - /* align to BTRFS_STRIPE_LEN */ - do_div(stripe_size, BTRFS_STRIPE_LEN); - stripe_size *= BTRFS_STRIPE_LEN; - - map = kmalloc(map_lookup_size(num_stripes), GFP_NOFS); - if (!map) { - ret = -ENOMEM; - goto error; - } - map->num_stripes = num_stripes; - - for (i = 0; i < ndevs; ++i) { - for (j = 0; j < dev_stripes; ++j) { - int s = i * dev_stripes + j; - map->stripes[s].dev = devices_info[i].dev; - map->stripes[s].physical = devices_info[i].dev_offset + - j * stripe_size; - } - } - map->sector_size = extent_root->sectorsize; - map->stripe_len = BTRFS_STRIPE_LEN; - map->io_align = BTRFS_STRIPE_LEN; - map->io_width = BTRFS_STRIPE_LEN; - map->type = type; - map->sub_stripes = sub_stripes; - - *map_ret = map; - num_bytes = stripe_size * (num_stripes / ncopies); - - *stripe_size_out = stripe_size; - *num_bytes_out = num_bytes; - - trace_btrfs_chunk_alloc(info->chunk_root, map, start, num_bytes); - - em = alloc_extent_map(); - if (!em) { - ret = -ENOMEM; - goto error; - } - em->bdev = (struct block_device *)map; - em->start = start; - em->len = num_bytes; - em->block_start = 0; - em->block_len = em->len; - - em_tree = &extent_root->fs_info->mapping_tree.map_tree; - write_lock(&em_tree->lock); - ret = add_extent_mapping(em_tree, em); - write_unlock(&em_tree->lock); - free_extent_map(em); - if (ret) - goto error; - - ret = btrfs_make_block_group(trans, extent_root, 0, type, - BTRFS_FIRST_CHUNK_TREE_OBJECTID, - start, num_bytes); - if (ret) - goto error; - - for (i = 0; i < map->num_stripes; ++i) { - struct btrfs_device *device; - u64 dev_offset; - - device = map->stripes[i].dev; - dev_offset = map->stripes[i].physical; - - ret = btrfs_alloc_dev_extent(trans, device, - info->chunk_root->root_key.objectid, - BTRFS_FIRST_CHUNK_TREE_OBJECTID, - start, dev_offset, stripe_size); - if (ret) { - btrfs_abort_transaction(trans, extent_root, ret); - goto error; - } - } - - kfree(devices_info); - return 0; - -error: - kfree(map); - kfree(devices_info); - return ret; -} - -static int __finish_chunk_alloc(struct btrfs_trans_handle *trans, - struct btrfs_root *extent_root, - struct map_lookup *map, u64 chunk_offset, - u64 chunk_size, u64 stripe_size) -{ - u64 dev_offset; - struct btrfs_key key; - struct btrfs_root *chunk_root = extent_root->fs_info->chunk_root; - struct btrfs_device *device; - struct btrfs_chunk *chunk; - struct btrfs_stripe *stripe; - size_t item_size = btrfs_chunk_item_size(map->num_stripes); - int index = 0; - int ret; - - chunk = kzalloc(item_size, GFP_NOFS); - if (!chunk) - return -ENOMEM; - - index = 0; - while (index < map->num_stripes) { - device = map->stripes[index].dev; - device->bytes_used += stripe_size; - ret = btrfs_update_device(trans, device); - if (ret) - goto out_free; - index++; - } - - spin_lock(&extent_root->fs_info->free_chunk_lock); - extent_root->fs_info->free_chunk_space -= (stripe_size * - map->num_stripes); - spin_unlock(&extent_root->fs_info->free_chunk_lock); - - index = 0; - stripe = &chunk->stripe; - while (index < map->num_stripes) { - device = map->stripes[index].dev; - dev_offset = map->stripes[index].physical; - - btrfs_set_stack_stripe_devid(stripe, device->devid); - btrfs_set_stack_stripe_offset(stripe, dev_offset); - memcpy(stripe->dev_uuid, device->uuid, BTRFS_UUID_SIZE); - stripe++; - index++; - 
} - - btrfs_set_stack_chunk_length(chunk, chunk_size); - btrfs_set_stack_chunk_owner(chunk, extent_root->root_key.objectid); - btrfs_set_stack_chunk_stripe_len(chunk, map->stripe_len); - btrfs_set_stack_chunk_type(chunk, map->type); - btrfs_set_stack_chunk_num_stripes(chunk, map->num_stripes); - btrfs_set_stack_chunk_io_align(chunk, map->stripe_len); - btrfs_set_stack_chunk_io_width(chunk, map->stripe_len); - btrfs_set_stack_chunk_sector_size(chunk, extent_root->sectorsize); - btrfs_set_stack_chunk_sub_stripes(chunk, map->sub_stripes); - - key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID; - key.type = BTRFS_CHUNK_ITEM_KEY; - key.offset = chunk_offset; - - ret = btrfs_insert_item(trans, chunk_root, &key, chunk, item_size); - - if (ret == 0 && map->type & BTRFS_BLOCK_GROUP_SYSTEM) { - /* - * TODO: Cleanup of inserted chunk root in case of - * failure. - */ - ret = btrfs_add_system_chunk(chunk_root, &key, chunk, - item_size); - } - -out_free: - kfree(chunk); - return ret; -} - -/* - * Chunk allocation falls into two parts. The first part does the work - * that makes the newly allocated chunk usable, but does not do any operation - * that modifies the chunk tree. The second part does the work that - * requires modifying the chunk tree. This division is important for the - * bootstrap process of adding storage to a seed btrfs. - */ -int btrfs_alloc_chunk(struct btrfs_trans_handle *trans, - struct btrfs_root *extent_root, u64 type) -{ - u64 chunk_offset; - u64 chunk_size; - u64 stripe_size; - struct map_lookup *map; - struct btrfs_root *chunk_root = extent_root->fs_info->chunk_root; - int ret; - - ret = find_next_chunk(chunk_root, BTRFS_FIRST_CHUNK_TREE_OBJECTID, - &chunk_offset); - if (ret) - return ret; - - ret = __btrfs_alloc_chunk(trans, extent_root, &map, &chunk_size, - &stripe_size, chunk_offset, type); - if (ret) - return ret; - - ret = __finish_chunk_alloc(trans, extent_root, map, chunk_offset, - chunk_size, stripe_size); - if (ret) - return ret; - return 0; -} - -static noinline int init_first_rw_device(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct btrfs_device *device) -{ - u64 chunk_offset; - u64 sys_chunk_offset; - u64 chunk_size; - u64 sys_chunk_size; - u64 stripe_size; - u64 sys_stripe_size; - u64 alloc_profile; - struct map_lookup *map; - struct map_lookup *sys_map; - struct btrfs_fs_info *fs_info = root->fs_info; - struct btrfs_root *extent_root = fs_info->extent_root; - int ret; - - ret = find_next_chunk(fs_info->chunk_root, - BTRFS_FIRST_CHUNK_TREE_OBJECTID, &chunk_offset); - if (ret) - return ret; - - alloc_profile = BTRFS_BLOCK_GROUP_METADATA | - fs_info->avail_metadata_alloc_bits; - alloc_profile = btrfs_reduce_alloc_profile(root, alloc_profile); - - ret = __btrfs_alloc_chunk(trans, extent_root, &map, &chunk_size, - &stripe_size, chunk_offset, alloc_profile); - if (ret) - return ret; - - sys_chunk_offset = chunk_offset + chunk_size; - - alloc_profile = BTRFS_BLOCK_GROUP_SYSTEM | - fs_info->avail_system_alloc_bits; - alloc_profile = btrfs_reduce_alloc_profile(root, alloc_profile); - - ret = __btrfs_alloc_chunk(trans, extent_root, &sys_map, - &sys_chunk_size, &sys_stripe_size, - sys_chunk_offset, alloc_profile); - if (ret) - goto abort; - - ret = btrfs_add_device(trans, fs_info->chunk_root, device); - if (ret) - goto abort; - - /* - * Modifying the chunk tree needs to allocate new blocks from both - * the system block group and the metadata block group, so we can only - * do operations that require modifying the chunk tree after both - * block groups were created.
- */ - ret = __finish_chunk_alloc(trans, extent_root, map, chunk_offset, - chunk_size, stripe_size); - if (ret) - goto abort; - - ret = __finish_chunk_alloc(trans, extent_root, sys_map, - sys_chunk_offset, sys_chunk_size, - sys_stripe_size); - if (ret) - goto abort; - - return 0; - -abort: - btrfs_abort_transaction(trans, root, ret); - return ret; -} - -int btrfs_chunk_readonly(struct btrfs_root *root, u64 chunk_offset) -{ - struct extent_map *em; - struct map_lookup *map; - struct btrfs_mapping_tree *map_tree = &root->fs_info->mapping_tree; - int readonly = 0; - int i; - - read_lock(&map_tree->map_tree.lock); - em = lookup_extent_mapping(&map_tree->map_tree, chunk_offset, 1); - read_unlock(&map_tree->map_tree.lock); - if (!em) - return 1; - - if (btrfs_test_opt(root, DEGRADED)) { - free_extent_map(em); - return 0; - } - - map = (struct map_lookup *)em->bdev; - for (i = 0; i < map->num_stripes; i++) { - if (!map->stripes[i].dev->writeable) { - readonly = 1; - break; - } - } - free_extent_map(em); - return readonly; -} - -void btrfs_mapping_init(struct btrfs_mapping_tree *tree) -{ - extent_map_tree_init(&tree->map_tree); -} - -void btrfs_mapping_tree_free(struct btrfs_mapping_tree *tree) -{ - struct extent_map *em; - - while (1) { - write_lock(&tree->map_tree.lock); - em = lookup_extent_mapping(&tree->map_tree, 0, (u64)-1); - if (em) - remove_extent_mapping(&tree->map_tree, em); - write_unlock(&tree->map_tree.lock); - if (!em) - break; - kfree(em->bdev); - /* once for us */ - free_extent_map(em); - /* once for the tree */ - free_extent_map(em); - } -} - -int btrfs_num_copies(struct btrfs_mapping_tree *map_tree, u64 logical, u64 len) -{ - struct extent_map *em; - struct map_lookup *map; - struct extent_map_tree *em_tree = &map_tree->map_tree; - int ret; - - read_lock(&em_tree->lock); - em = lookup_extent_mapping(em_tree, logical, len); - read_unlock(&em_tree->lock); - BUG_ON(!em); - - BUG_ON(em->start > logical || em->start + em->len < logical); - map = (struct map_lookup *)em->bdev; - if (map->type & (BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID1)) - ret = map->num_stripes; - else if (map->type & BTRFS_BLOCK_GROUP_RAID10) - ret = map->sub_stripes; - else - ret = 1; - free_extent_map(em); - return ret; -} - -static int find_live_mirror(struct map_lookup *map, int first, int num, - int optimal) -{ - int i; - if (map->stripes[optimal].dev->bdev) - return optimal; - for (i = first; i < first + num; i++) { - if (map->stripes[i].dev->bdev) - return i; - } - /* we couldn't find one that doesn't fail. 
Just return something - * and the io error handling code will clean up eventually - */ - return optimal; -} - -static int __btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw, - u64 logical, u64 *length, - struct btrfs_bio **bbio_ret, - int mirror_num) -{ - struct extent_map *em; - struct map_lookup *map; - struct extent_map_tree *em_tree = &map_tree->map_tree; - u64 offset; - u64 stripe_offset; - u64 stripe_end_offset; - u64 stripe_nr; - u64 stripe_nr_orig; - u64 stripe_nr_end; - int stripe_index; - int i; - int ret = 0; - int num_stripes; - int max_errors = 0; - struct btrfs_bio *bbio = NULL; - - read_lock(&em_tree->lock); - em = lookup_extent_mapping(em_tree, logical, *length); - read_unlock(&em_tree->lock); - - if (!em) { - printk(KERN_CRIT "unable to find logical %llu len %llu\n", - (unsigned long long)logical, - (unsigned long long)*length); - BUG(); - } - - BUG_ON(em->start > logical || em->start + em->len < logical); - map = (struct map_lookup *)em->bdev; - offset = logical - em->start; - - if (mirror_num > map->num_stripes) - mirror_num = 0; - - stripe_nr = offset; - /* - * stripe_nr counts the total number of stripes we have to stride - * to get to this block - */ - do_div(stripe_nr, map->stripe_len); - - stripe_offset = stripe_nr * map->stripe_len; - BUG_ON(offset < stripe_offset); - - /* stripe_offset is the offset of this block in its stripe*/ - stripe_offset = offset - stripe_offset; - - if (rw & REQ_DISCARD) - *length = min_t(u64, em->len - offset, *length); - else if (map->type & BTRFS_BLOCK_GROUP_PROFILE_MASK) { - /* we limit the length of each bio to what fits in a stripe */ - *length = min_t(u64, em->len - offset, - map->stripe_len - stripe_offset); - } else { - *length = em->len - offset; - } - - if (!bbio_ret) - goto out; - - num_stripes = 1; - stripe_index = 0; - stripe_nr_orig = stripe_nr; - stripe_nr_end = (offset + *length + map->stripe_len - 1) & - (~(map->stripe_len - 1)); - do_div(stripe_nr_end, map->stripe_len); - stripe_end_offset = stripe_nr_end * map->stripe_len - - (offset + *length); - if (map->type & BTRFS_BLOCK_GROUP_RAID0) { - if (rw & REQ_DISCARD) - num_stripes = min_t(u64, map->num_stripes, - stripe_nr_end - stripe_nr_orig); - stripe_index = do_div(stripe_nr, map->num_stripes); - } else if (map->type & BTRFS_BLOCK_GROUP_RAID1) { - if (rw & (REQ_WRITE | REQ_DISCARD)) - num_stripes = map->num_stripes; - else if (mirror_num) - stripe_index = mirror_num - 1; - else { - stripe_index = find_live_mirror(map, 0, - map->num_stripes, - current->pid % map->num_stripes); - mirror_num = stripe_index + 1; - } - - } else if (map->type & BTRFS_BLOCK_GROUP_DUP) { - if (rw & (REQ_WRITE | REQ_DISCARD)) { - num_stripes = map->num_stripes; - } else if (mirror_num) { - stripe_index = mirror_num - 1; - } else { - mirror_num = 1; - } - - } else if (map->type & BTRFS_BLOCK_GROUP_RAID10) { - int factor = map->num_stripes / map->sub_stripes; - - stripe_index = do_div(stripe_nr, factor); - stripe_index *= map->sub_stripes; - - if (rw & REQ_WRITE) - num_stripes = map->sub_stripes; - else if (rw & REQ_DISCARD) - num_stripes = min_t(u64, map->sub_stripes * - (stripe_nr_end - stripe_nr_orig), - map->num_stripes); - else if (mirror_num) - stripe_index += mirror_num - 1; - else { - int old_stripe_index = stripe_index; - stripe_index = find_live_mirror(map, stripe_index, - map->sub_stripes, stripe_index + - current->pid % map->sub_stripes); - mirror_num = stripe_index - old_stripe_index + 1; - } - } else { - /* - * after this do_div call, stripe_nr is the number of stripes - * 
on this device we have to walk to find the data, and - * stripe_index is the number of our device in the stripe array - */ - stripe_index = do_div(stripe_nr, map->num_stripes); - mirror_num = stripe_index + 1; - } - BUG_ON(stripe_index >= map->num_stripes); - - bbio = kzalloc(btrfs_bio_size(num_stripes), GFP_NOFS); - if (!bbio) { - ret = -ENOMEM; - goto out; - } - atomic_set(&bbio->error, 0); - - if (rw & REQ_DISCARD) { - int factor = 0; - int sub_stripes = 0; - u64 stripes_per_dev = 0; - u32 remaining_stripes = 0; - u32 last_stripe = 0; - - if (map->type & - (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID10)) { - if (map->type & BTRFS_BLOCK_GROUP_RAID0) - sub_stripes = 1; - else - sub_stripes = map->sub_stripes; - - factor = map->num_stripes / sub_stripes; - stripes_per_dev = div_u64_rem(stripe_nr_end - - stripe_nr_orig, - factor, - &remaining_stripes); - div_u64_rem(stripe_nr_end - 1, factor, &last_stripe); - last_stripe *= sub_stripes; - } - - for (i = 0; i < num_stripes; i++) { - bbio->stripes[i].physical = - map->stripes[stripe_index].physical + - stripe_offset + stripe_nr * map->stripe_len; - bbio->stripes[i].dev = map->stripes[stripe_index].dev; - - if (map->type & (BTRFS_BLOCK_GROUP_RAID0 | - BTRFS_BLOCK_GROUP_RAID10)) { - bbio->stripes[i].length = stripes_per_dev * - map->stripe_len; - - if (i / sub_stripes < remaining_stripes) - bbio->stripes[i].length += - map->stripe_len; - - /* - * Special for the first stripe and - * the last stripe: - * - * |-------|...|-------| - * |----------| - * off end_off - */ - if (i < sub_stripes) - bbio->stripes[i].length -= - stripe_offset; - - if (stripe_index >= last_stripe && - stripe_index <= (last_stripe + - sub_stripes - 1)) - bbio->stripes[i].length -= - stripe_end_offset; - - if (i == sub_stripes - 1) - stripe_offset = 0; - } else - bbio->stripes[i].length = *length; - - stripe_index++; - if (stripe_index == map->num_stripes) { - /* This could only happen for RAID0/10 */ - stripe_index = 0; - stripe_nr++; - } - } - } else { - for (i = 0; i < num_stripes; i++) { - bbio->stripes[i].physical = - map->stripes[stripe_index].physical + - stripe_offset + - stripe_nr * map->stripe_len; - bbio->stripes[i].dev = - map->stripes[stripe_index].dev; - stripe_index++; - } - } - - if (rw & REQ_WRITE) { - if (map->type & (BTRFS_BLOCK_GROUP_RAID1 | - BTRFS_BLOCK_GROUP_RAID10 | - BTRFS_BLOCK_GROUP_DUP)) { - max_errors = 1; - } - } - - *bbio_ret = bbio; - bbio->num_stripes = num_stripes; - bbio->max_errors = max_errors; - bbio->mirror_num = mirror_num; -out: - free_extent_map(em); - return ret; -} - -int btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw, - u64 logical, u64 *length, - struct btrfs_bio **bbio_ret, int mirror_num) -{ - return __btrfs_map_block(map_tree, rw, logical, length, bbio_ret, - mirror_num); -} - -int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree, - u64 chunk_start, u64 physical, u64 devid, - u64 **logical, int *naddrs, int *stripe_len) -{ - struct extent_map_tree *em_tree = &map_tree->map_tree; - struct extent_map *em; - struct map_lookup *map; - u64 *buf; - u64 bytenr; - u64 length; - u64 stripe_nr; - int i, j, nr = 0; - - read_lock(&em_tree->lock); - em = lookup_extent_mapping(em_tree, chunk_start, 1); - read_unlock(&em_tree->lock); - - BUG_ON(!em || em->start != chunk_start); - map = (struct map_lookup *)em->bdev; - - length = em->len; - if (map->type & BTRFS_BLOCK_GROUP_RAID10) - do_div(length, map->num_stripes / map->sub_stripes); - else if (map->type & BTRFS_BLOCK_GROUP_RAID0) - do_div(length, map->num_stripes); - 
- buf = kzalloc(sizeof(u64) * map->num_stripes, GFP_NOFS); - BUG_ON(!buf); /* -ENOMEM */ - - for (i = 0; i < map->num_stripes; i++) { - if (devid && map->stripes[i].dev->devid != devid) - continue; - if (map->stripes[i].physical > physical || - map->stripes[i].physical + length <= physical) - continue; - - stripe_nr = physical - map->stripes[i].physical; - do_div(stripe_nr, map->stripe_len); - - if (map->type & BTRFS_BLOCK_GROUP_RAID10) { - stripe_nr = stripe_nr * map->num_stripes + i; - do_div(stripe_nr, map->sub_stripes); - } else if (map->type & BTRFS_BLOCK_GROUP_RAID0) { - stripe_nr = stripe_nr * map->num_stripes + i; - } - bytenr = chunk_start + stripe_nr * map->stripe_len; - WARN_ON(nr >= map->num_stripes); - for (j = 0; j < nr; j++) { - if (buf[j] == bytenr) - break; - } - if (j == nr) { - WARN_ON(nr >= map->num_stripes); - buf[nr++] = bytenr; - } - } - - *logical = buf; - *naddrs = nr; - *stripe_len = map->stripe_len; - - free_extent_map(em); - return 0; -} - -static void btrfs_end_bio(struct bio *bio, int err) -{ - struct btrfs_bio *bbio = bio->bi_private; - int is_orig_bio = 0; - - if (err) - atomic_inc(&bbio->error); - - if (bio == bbio->orig_bio) - is_orig_bio = 1; - - if (atomic_dec_and_test(&bbio->stripes_pending)) { - if (!is_orig_bio) { - bio_put(bio); - bio = bbio->orig_bio; - } - bio->bi_private = bbio->private; - bio->bi_end_io = bbio->end_io; - bio->bi_bdev = (struct block_device *) - (unsigned long)bbio->mirror_num; - /* only send an error to the higher layers if it is - * beyond the tolerance of the multi-bio - */ - if (atomic_read(&bbio->error) > bbio->max_errors) { - err = -EIO; - } else { - /* - * this bio is actually up to date, we didn't - * go over the max number of errors - */ - set_bit(BIO_UPTODATE, &bio->bi_flags); - err = 0; - } - kfree(bbio); - - bio_endio(bio, err); - } else if (!is_orig_bio) { - bio_put(bio); - } -} - -struct async_sched { - struct bio *bio; - int rw; - struct btrfs_fs_info *info; - struct btrfs_work work; -}; - -/* - * see run_scheduled_bios for a description of why bios are collected for - * async submit. - * - * This will add one bio to the pending list for a device and make sure - * the work struct is scheduled. - */ -static noinline void schedule_bio(struct btrfs_root *root, - struct btrfs_device *device, - int rw, struct bio *bio) -{ - int should_queue = 1; - struct btrfs_pending_bios *pending_bios; - - /* don't bother with additional async steps for reads, right now */ - if (!(rw & REQ_WRITE)) { - bio_get(bio); - btrfsic_submit_bio(rw, bio); - bio_put(bio); - return; - } - - /* - * nr_async_bios allows us to reliably return congestion to the - * higher layers. 
Otherwise, the async bio makes it appear we have - * made progress against dirty pages when we've really just put it - * on a queue for later - */ - atomic_inc(&root->fs_info->nr_async_bios); - WARN_ON(bio->bi_next); - bio->bi_next = NULL; - bio->bi_rw |= rw; - - spin_lock(&device->io_lock); - if (bio->bi_rw & REQ_SYNC) - pending_bios = &device->pending_sync_bios; - else - pending_bios = &device->pending_bios; - - if (pending_bios->tail) - pending_bios->tail->bi_next = bio; - - pending_bios->tail = bio; - if (!pending_bios->head) - pending_bios->head = bio; - if (device->running_pending) - should_queue = 0; - - spin_unlock(&device->io_lock); - - if (should_queue) - btrfs_queue_worker(&root->fs_info->submit_workers, - &device->work); -} - -int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio, - int mirror_num, int async_submit) -{ - struct btrfs_mapping_tree *map_tree; - struct btrfs_device *dev; - struct bio *first_bio = bio; - u64 logical = (u64)bio->bi_sector << 9; - u64 length = 0; - u64 map_length; - int ret; - int dev_nr = 0; - int total_devs = 1; - struct btrfs_bio *bbio = NULL; - - length = bio->bi_size; - map_tree = &root->fs_info->mapping_tree; - map_length = length; - - ret = btrfs_map_block(map_tree, rw, logical, &map_length, &bbio, - mirror_num); - if (ret) /* -ENOMEM */ - return ret; - - total_devs = bbio->num_stripes; - if (map_length < length) { - printk(KERN_CRIT "mapping failed logical %llu bio len %llu " - "len %llu\n", (unsigned long long)logical, - (unsigned long long)length, - (unsigned long long)map_length); - BUG(); - } - - bbio->orig_bio = first_bio; - bbio->private = first_bio->bi_private; - bbio->end_io = first_bio->bi_end_io; - atomic_set(&bbio->stripes_pending, bbio->num_stripes); - - while (dev_nr < total_devs) { - if (dev_nr < total_devs - 1) { - bio = bio_clone(first_bio, GFP_NOFS); - BUG_ON(!bio); /* -ENOMEM */ - } else { - bio = first_bio; - } - bio->bi_private = bbio; - bio->bi_end_io = btrfs_end_bio; - bio->bi_sector = bbio->stripes[dev_nr].physical >> 9; - dev = bbio->stripes[dev_nr].dev; - if (dev && dev->bdev && (rw != WRITE || dev->writeable)) { - pr_debug("btrfs_map_bio: rw %d, secor=%llu, dev=%lu " - "(%s id %llu), size=%u\n", rw, - (u64)bio->bi_sector, (u_long)dev->bdev->bd_dev, - dev->name, dev->devid, bio->bi_size); - bio->bi_bdev = dev->bdev; - if (async_submit) - schedule_bio(root, dev, rw, bio); - else - btrfsic_submit_bio(rw, bio); - } else { - bio->bi_bdev = root->fs_info->fs_devices->latest_bdev; - bio->bi_sector = logical >> 9; - bio_endio(bio, -EIO); - } - dev_nr++; - } - return 0; -} - -struct btrfs_device *btrfs_find_device(struct btrfs_root *root, u64 devid, - u8 *uuid, u8 *fsid) -{ - struct btrfs_device *device; - struct btrfs_fs_devices *cur_devices; - - cur_devices = root->fs_info->fs_devices; - while (cur_devices) { - if (!fsid || - !memcmp(cur_devices->fsid, fsid, BTRFS_UUID_SIZE)) { - device = __find_device(&cur_devices->devices, - devid, uuid); - if (device) - return device; - } - cur_devices = cur_devices->seed; - } - return NULL; -} - -static struct btrfs_device *add_missing_dev(struct btrfs_root *root, - u64 devid, u8 *dev_uuid) -{ - struct btrfs_device *device; - struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices; - - device = kzalloc(sizeof(*device), GFP_NOFS); - if (!device) - return NULL; - list_add(&device->dev_list, - &fs_devices->devices); - device->dev_root = root->fs_info->dev_root; - device->devid = devid; - device->work.func = pending_bios_fn; - device->fs_devices = fs_devices; - 
device->missing = 1; - fs_devices->num_devices++; - fs_devices->missing_devices++; - spin_lock_init(&device->io_lock); - INIT_LIST_HEAD(&device->dev_alloc_list); - memcpy(device->uuid, dev_uuid, BTRFS_UUID_SIZE); - return device; -} - -static int read_one_chunk(struct btrfs_root *root, struct btrfs_key *key, - struct extent_buffer *leaf, - struct btrfs_chunk *chunk) -{ - struct btrfs_mapping_tree *map_tree = &root->fs_info->mapping_tree; - struct map_lookup *map; - struct extent_map *em; - u64 logical; - u64 length; - u64 devid; - u8 uuid[BTRFS_UUID_SIZE]; - int num_stripes; - int ret; - int i; - - logical = key->offset; - length = btrfs_chunk_length(leaf, chunk); - - read_lock(&map_tree->map_tree.lock); - em = lookup_extent_mapping(&map_tree->map_tree, logical, 1); - read_unlock(&map_tree->map_tree.lock); - - /* already mapped? */ - if (em && em->start <= logical && em->start + em->len > logical) { - free_extent_map(em); - return 0; - } else if (em) { - free_extent_map(em); - } - - em = alloc_extent_map(); - if (!em) - return -ENOMEM; - num_stripes = btrfs_chunk_num_stripes(leaf, chunk); - map = kmalloc(map_lookup_size(num_stripes), GFP_NOFS); - if (!map) { - free_extent_map(em); - return -ENOMEM; - } - - em->bdev = (struct block_device *)map; - em->start = logical; - em->len = length; - em->block_start = 0; - em->block_len = em->len; - - map->num_stripes = num_stripes; - map->io_width = btrfs_chunk_io_width(leaf, chunk); - map->io_align = btrfs_chunk_io_align(leaf, chunk); - map->sector_size = btrfs_chunk_sector_size(leaf, chunk); - map->stripe_len = btrfs_chunk_stripe_len(leaf, chunk); - map->type = btrfs_chunk_type(leaf, chunk); - map->sub_stripes = btrfs_chunk_sub_stripes(leaf, chunk); - for (i = 0; i < num_stripes; i++) { - map->stripes[i].physical = - btrfs_stripe_offset_nr(leaf, chunk, i); - devid = btrfs_stripe_devid_nr(leaf, chunk, i); - read_extent_buffer(leaf, uuid, (unsigned long) - btrfs_stripe_dev_uuid_nr(chunk, i), - BTRFS_UUID_SIZE); - map->stripes[i].dev = btrfs_find_device(root, devid, uuid, - NULL); - if (!map->stripes[i].dev && !btrfs_test_opt(root, DEGRADED)) { - kfree(map); - free_extent_map(em); - return -EIO; - } - if (!map->stripes[i].dev) { - map->stripes[i].dev = - add_missing_dev(root, devid, uuid); - if (!map->stripes[i].dev) { - kfree(map); - free_extent_map(em); - return -EIO; - } - } - map->stripes[i].dev->in_fs_metadata = 1; - } - - write_lock(&map_tree->map_tree.lock); - ret = add_extent_mapping(&map_tree->map_tree, em); - write_unlock(&map_tree->map_tree.lock); - BUG_ON(ret); /* Tree corruption */ - free_extent_map(em); - - return 0; -} - -static void fill_device_from_item(struct extent_buffer *leaf, - struct btrfs_dev_item *dev_item, - struct btrfs_device *device) -{ - unsigned long ptr; - - device->devid = btrfs_device_id(leaf, dev_item); - device->disk_total_bytes = btrfs_device_total_bytes(leaf, dev_item); - device->total_bytes = device->disk_total_bytes; - device->bytes_used = btrfs_device_bytes_used(leaf, dev_item); - device->type = btrfs_device_type(leaf, dev_item); - device->io_align = btrfs_device_io_align(leaf, dev_item); - device->io_width = btrfs_device_io_width(leaf, dev_item); - device->sector_size = btrfs_device_sector_size(leaf, dev_item); - - ptr = (unsigned long)btrfs_device_uuid(dev_item); - read_extent_buffer(leaf, device->uuid, ptr, BTRFS_UUID_SIZE); -} - -static int open_seed_devices(struct btrfs_root *root, u8 *fsid) -{ - struct btrfs_fs_devices *fs_devices; - int ret; - - BUG_ON(!mutex_is_locked(&uuid_mutex)); - - fs_devices = 
root->fs_info->fs_devices->seed; - while (fs_devices) { - if (!memcmp(fs_devices->fsid, fsid, BTRFS_UUID_SIZE)) { - ret = 0; - goto out; - } - fs_devices = fs_devices->seed; - } - - fs_devices = find_fsid(fsid); - if (!fs_devices) { - ret = -ENOENT; - goto out; - } - - fs_devices = clone_fs_devices(fs_devices); - if (IS_ERR(fs_devices)) { - ret = PTR_ERR(fs_devices); - goto out; - } - - ret = __btrfs_open_devices(fs_devices, FMODE_READ, - root->fs_info->bdev_holder); - if (ret) { - free_fs_devices(fs_devices); - goto out; - } - - if (!fs_devices->seeding) { - __btrfs_close_devices(fs_devices); - free_fs_devices(fs_devices); - ret = -EINVAL; - goto out; - } - - fs_devices->seed = root->fs_info->fs_devices->seed; - root->fs_info->fs_devices->seed = fs_devices; -out: - return ret; -} - -static int read_one_dev(struct btrfs_root *root, - struct extent_buffer *leaf, - struct btrfs_dev_item *dev_item) -{ - struct btrfs_device *device; - u64 devid; - int ret; - u8 fs_uuid[BTRFS_UUID_SIZE]; - u8 dev_uuid[BTRFS_UUID_SIZE]; - - devid = btrfs_device_id(leaf, dev_item); - read_extent_buffer(leaf, dev_uuid, - (unsigned long)btrfs_device_uuid(dev_item), - BTRFS_UUID_SIZE); - read_extent_buffer(leaf, fs_uuid, - (unsigned long)btrfs_device_fsid(dev_item), - BTRFS_UUID_SIZE); - - if (memcmp(fs_uuid, root->fs_info->fsid, BTRFS_UUID_SIZE)) { - ret = open_seed_devices(root, fs_uuid); - if (ret && !btrfs_test_opt(root, DEGRADED)) - return ret; - } - - device = btrfs_find_device(root, devid, dev_uuid, fs_uuid); - if (!device || !device->bdev) { - if (!btrfs_test_opt(root, DEGRADED)) - return -EIO; - - if (!device) { - printk(KERN_WARNING "warning devid %llu missing\n", - (unsigned long long)devid); - device = add_missing_dev(root, devid, dev_uuid); - if (!device) - return -ENOMEM; - } else if (!device->missing) { - /* - * this happens when a device that was properly setup - * in the device info lists suddenly goes bad. - * device->bdev is NULL, and so we have to set - * device->missing to one here - */ - root->fs_info->fs_devices->missing_devices++; - device->missing = 1; - } - } - - if (device->fs_devices != root->fs_info->fs_devices) { - BUG_ON(device->writeable); - if (device->generation != - btrfs_device_generation(leaf, dev_item)) - return -EINVAL; - } - - fill_device_from_item(leaf, dev_item, device); - device->dev_root = root->fs_info->dev_root; - device->in_fs_metadata = 1; - if (device->writeable) { - device->fs_devices->total_rw_bytes += device->total_bytes; - spin_lock(&root->fs_info->free_chunk_lock); - root->fs_info->free_chunk_space += device->total_bytes - - device->bytes_used; - spin_unlock(&root->fs_info->free_chunk_lock); - } - ret = 0; - return ret; -} - -int btrfs_read_sys_array(struct btrfs_root *root) -{ - struct btrfs_super_block *super_copy = root->fs_info->super_copy; - struct extent_buffer *sb; - struct btrfs_disk_key *disk_key; - struct btrfs_chunk *chunk; - u8 *ptr; - unsigned long sb_ptr; - int ret = 0; - u32 num_stripes; - u32 array_size; - u32 len = 0; - u32 cur; - struct btrfs_key key; - - sb = btrfs_find_create_tree_block(root, BTRFS_SUPER_INFO_OFFSET, - BTRFS_SUPER_INFO_SIZE); - if (!sb) - return -ENOMEM; - btrfs_set_buffer_uptodate(sb); - btrfs_set_buffer_lockdep_class(root->root_key.objectid, sb, 0); - /* - * The sb extent buffer is artifical and just used to read the system array. 
- * btrfs_set_buffer_uptodate() call does not properly mark all it's - * pages up-to-date when the page is larger: extent does not cover the - * whole page and consequently check_page_uptodate does not find all - * the page's extents up-to-date (the hole beyond sb), - * write_extent_buffer then triggers a WARN_ON. - * - * Regular short extents go through mark_extent_buffer_dirty/writeback cycle, - * but sb spans only this function. Add an explicit SetPageUptodate call - * to silence the warning eg. on PowerPC 64. - */ - if (PAGE_CACHE_SIZE > BTRFS_SUPER_INFO_SIZE) - SetPageUptodate(sb->pages[0]); - - write_extent_buffer(sb, super_copy, 0, BTRFS_SUPER_INFO_SIZE); - array_size = btrfs_super_sys_array_size(super_copy); - - ptr = super_copy->sys_chunk_array; - sb_ptr = offsetof(struct btrfs_super_block, sys_chunk_array); - cur = 0; - - while (cur < array_size) { - disk_key = (struct btrfs_disk_key *)ptr; - btrfs_disk_key_to_cpu(&key, disk_key); - - len = sizeof(*disk_key); ptr += len; - sb_ptr += len; - cur += len; - - if (key.type == BTRFS_CHUNK_ITEM_KEY) { - chunk = (struct btrfs_chunk *)sb_ptr; - ret = read_one_chunk(root, &key, sb, chunk); - if (ret) - break; - num_stripes = btrfs_chunk_num_stripes(sb, chunk); - len = btrfs_chunk_item_size(num_stripes); - } else { - ret = -EIO; - break; - } - ptr += len; - sb_ptr += len; - cur += len; - } - free_extent_buffer(sb); - return ret; -} - -int btrfs_read_chunk_tree(struct btrfs_root *root) -{ - struct btrfs_path *path; - struct extent_buffer *leaf; - struct btrfs_key key; - struct btrfs_key found_key; - int ret; - int slot; - - root = root->fs_info->chunk_root; - - path = btrfs_alloc_path(); - if (!path) - return -ENOMEM; - - mutex_lock(&uuid_mutex); - lock_chunks(root); - - /* first we search for all of the device items, and then we - * read in all of the chunk items. This way we can create chunk - * mappings that reference all of the devices that are afound - */ - key.objectid = BTRFS_DEV_ITEMS_OBJECTID; - key.offset = 0; - key.type = 0; -again: - ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); - if (ret < 0) - goto error; - while (1) { - leaf = path->nodes[0]; - slot = path->slots[0]; - if (slot >= btrfs_header_nritems(leaf)) { - ret = btrfs_next_leaf(root, path); - if (ret == 0) - continue; - if (ret < 0) - goto error; - break; - } - btrfs_item_key_to_cpu(leaf, &found_key, slot); - if (key.objectid == BTRFS_DEV_ITEMS_OBJECTID) { - if (found_key.objectid != BTRFS_DEV_ITEMS_OBJECTID) - break; - if (found_key.type == BTRFS_DEV_ITEM_KEY) { - struct btrfs_dev_item *dev_item; - dev_item = btrfs_item_ptr(leaf, slot, - struct btrfs_dev_item); - ret = read_one_dev(root, leaf, dev_item); - if (ret) - goto error; - } - } else if (found_key.type == BTRFS_CHUNK_ITEM_KEY) { - struct btrfs_chunk *chunk; - chunk = btrfs_item_ptr(leaf, slot, struct btrfs_chunk); - ret = read_one_chunk(root, &found_key, leaf, chunk); - if (ret) - goto error; - } - path->slots[0]++; - } - if (key.objectid == BTRFS_DEV_ITEMS_OBJECTID) { - key.objectid = 0; - btrfs_release_path(path); - goto again; - } - ret = 0; -error: - unlock_chunks(root); - mutex_unlock(&uuid_mutex); - - btrfs_free_path(path); - return ret; -} diff --git a/ANDROID_3.4.5/fs/btrfs/volumes.h b/ANDROID_3.4.5/fs/btrfs/volumes.h deleted file mode 100644 index bb6b03f9..00000000 --- a/ANDROID_3.4.5/fs/btrfs/volumes.h +++ /dev/null @@ -1,284 +0,0 @@ -/* - * Copyright (C) 2007 Oracle. All rights reserved. 
- * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public - * License v2 as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public - * License along with this program; if not, write to the - * Free Software Foundation, Inc., 59 Temple Place - Suite 330, - * Boston, MA 021110-1307, USA. - */ - -#ifndef __BTRFS_VOLUMES_ -#define __BTRFS_VOLUMES_ - -#include <linux/bio.h> -#include <linux/sort.h> -#include "async-thread.h" - -#define BTRFS_STRIPE_LEN (64 * 1024) - -struct buffer_head; -struct btrfs_pending_bios { - struct bio *head; - struct bio *tail; -}; - -struct btrfs_device { - struct list_head dev_list; - struct list_head dev_alloc_list; - struct btrfs_fs_devices *fs_devices; - struct btrfs_root *dev_root; - - /* regular prio bios */ - struct btrfs_pending_bios pending_bios; - /* WRITE_SYNC bios */ - struct btrfs_pending_bios pending_sync_bios; - - int running_pending; - u64 generation; - - int writeable; - int in_fs_metadata; - int missing; - int can_discard; - - spinlock_t io_lock; - - struct block_device *bdev; - - /* the mode sent to blkdev_get */ - fmode_t mode; - - char *name; - - /* the internal btrfs device id */ - u64 devid; - - /* size of the device */ - u64 total_bytes; - - /* size of the disk */ - u64 disk_total_bytes; - - /* bytes used */ - u64 bytes_used; - - /* optimal io alignment for this device */ - u32 io_align; - - /* optimal io width for this device */ - u32 io_width; - - /* minimal io size for this device */ - u32 sector_size; - - /* type and info about this device */ - u64 type; - - /* physical drive uuid (or lvm uuid) */ - u8 uuid[BTRFS_UUID_SIZE]; - - /* per-device scrub information */ - struct scrub_dev *scrub_device; - - struct btrfs_work work; - struct rcu_head rcu; - struct work_struct rcu_work; - - /* readahead state */ - spinlock_t reada_lock; - atomic_t reada_in_flight; - u64 reada_next; - struct reada_zone *reada_curr_zone; - struct radix_tree_root reada_zones; - struct radix_tree_root reada_extents; - - /* for sending down flush barriers */ - struct bio *flush_bio; - struct completion flush_wait; - int nobarriers; - -}; - -struct btrfs_fs_devices { - u8 fsid[BTRFS_FSID_SIZE]; /* FS specific uuid */ - - /* the device with this id has the most recent copy of the super */ - u64 latest_devid; - u64 latest_trans; - u64 num_devices; - u64 open_devices; - u64 rw_devices; - u64 missing_devices; - u64 total_rw_bytes; - u64 num_can_discard; - struct block_device *latest_bdev; - - /* all of the devices in the FS, protected by a mutex - * so we can safely walk it to write out the supers without - * worrying about add/remove by the multi-device code - */ - struct mutex device_list_mutex; - struct list_head devices; - - /* devices not currently being allocated */ - struct list_head alloc_list; - struct list_head list; - - struct btrfs_fs_devices *seed; - int seeding; - - int opened; - - /* set when we find or add a device that doesn't have the - * nonrot flag set - */ - int rotating; -}; - -struct btrfs_bio_stripe { - struct btrfs_device *dev; - u64 physical; - u64 length; /* only used for discard mappings */ -}; - -struct btrfs_bio; -typedef void (btrfs_bio_end_io_t) (struct btrfs_bio *bio, int 
err); - -struct btrfs_bio { - atomic_t stripes_pending; - bio_end_io_t *end_io; - struct bio *orig_bio; - void *private; - atomic_t error; - int max_errors; - int num_stripes; - int mirror_num; - struct btrfs_bio_stripe stripes[]; -}; - -struct btrfs_device_info { - struct btrfs_device *dev; - u64 dev_offset; - u64 max_avail; - u64 total_avail; -}; - -struct map_lookup { - u64 type; - int io_align; - int io_width; - int stripe_len; - int sector_size; - int num_stripes; - int sub_stripes; - struct btrfs_bio_stripe stripes[]; -}; - -#define map_lookup_size(n) (sizeof(struct map_lookup) + \ - (sizeof(struct btrfs_bio_stripe) * (n))) - -/* - * Restriper's general type filter - */ -#define BTRFS_BALANCE_DATA (1ULL << 0) -#define BTRFS_BALANCE_SYSTEM (1ULL << 1) -#define BTRFS_BALANCE_METADATA (1ULL << 2) - -#define BTRFS_BALANCE_TYPE_MASK (BTRFS_BALANCE_DATA | \ - BTRFS_BALANCE_SYSTEM | \ - BTRFS_BALANCE_METADATA) - -#define BTRFS_BALANCE_FORCE (1ULL << 3) -#define BTRFS_BALANCE_RESUME (1ULL << 4) - -/* - * Balance filters - */ -#define BTRFS_BALANCE_ARGS_PROFILES (1ULL << 0) -#define BTRFS_BALANCE_ARGS_USAGE (1ULL << 1) -#define BTRFS_BALANCE_ARGS_DEVID (1ULL << 2) -#define BTRFS_BALANCE_ARGS_DRANGE (1ULL << 3) -#define BTRFS_BALANCE_ARGS_VRANGE (1ULL << 4) - -/* - * Profile changing flags. When SOFT is set we won't relocate chunk if - * it already has the target profile (even though it may be - * half-filled). - */ -#define BTRFS_BALANCE_ARGS_CONVERT (1ULL << 8) -#define BTRFS_BALANCE_ARGS_SOFT (1ULL << 9) - -struct btrfs_balance_args; -struct btrfs_balance_progress; -struct btrfs_balance_control { - struct btrfs_fs_info *fs_info; - - struct btrfs_balance_args data; - struct btrfs_balance_args meta; - struct btrfs_balance_args sys; - - u64 flags; - - struct btrfs_balance_progress stat; -}; - -int btrfs_account_dev_extents_size(struct btrfs_device *device, u64 start, - u64 end, u64 *length); - -#define btrfs_bio_size(n) (sizeof(struct btrfs_bio) + \ - (sizeof(struct btrfs_bio_stripe) * (n))) - -int btrfs_alloc_dev_extent(struct btrfs_trans_handle *trans, - struct btrfs_device *device, - u64 chunk_tree, u64 chunk_objectid, - u64 chunk_offset, u64 start, u64 num_bytes); -int btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw, - u64 logical, u64 *length, - struct btrfs_bio **bbio_ret, int mirror_num); -int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree, - u64 chunk_start, u64 physical, u64 devid, - u64 **logical, int *naddrs, int *stripe_len); -int btrfs_read_sys_array(struct btrfs_root *root); -int btrfs_read_chunk_tree(struct btrfs_root *root); -int btrfs_alloc_chunk(struct btrfs_trans_handle *trans, - struct btrfs_root *extent_root, u64 type); -void btrfs_mapping_init(struct btrfs_mapping_tree *tree); -void btrfs_mapping_tree_free(struct btrfs_mapping_tree *tree); -int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio, - int mirror_num, int async_submit); -int btrfs_open_devices(struct btrfs_fs_devices *fs_devices, - fmode_t flags, void *holder); -int btrfs_scan_one_device(const char *path, fmode_t flags, void *holder, - struct btrfs_fs_devices **fs_devices_ret); -int btrfs_close_devices(struct btrfs_fs_devices *fs_devices); -void btrfs_close_extra_devices(struct btrfs_fs_devices *fs_devices); -int btrfs_add_device(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct btrfs_device *device); -int btrfs_rm_device(struct btrfs_root *root, char *device_path); -void btrfs_cleanup_fs_uuids(void); -int btrfs_num_copies(struct btrfs_mapping_tree *map_tree, 
u64 logical, u64 len); -int btrfs_grow_device(struct btrfs_trans_handle *trans, - struct btrfs_device *device, u64 new_size); -struct btrfs_device *btrfs_find_device(struct btrfs_root *root, u64 devid, - u8 *uuid, u8 *fsid); -int btrfs_shrink_device(struct btrfs_device *device, u64 new_size); -int btrfs_init_new_device(struct btrfs_root *root, char *path); -int btrfs_balance(struct btrfs_balance_control *bctl, - struct btrfs_ioctl_balance_args *bargs); -int btrfs_recover_balance(struct btrfs_root *tree_root); -int btrfs_pause_balance(struct btrfs_fs_info *fs_info); -int btrfs_cancel_balance(struct btrfs_fs_info *fs_info); -int btrfs_chunk_readonly(struct btrfs_root *root, u64 chunk_offset); -int find_free_dev_extent(struct btrfs_device *device, u64 num_bytes, - u64 *start, u64 *max_avail); -#endif diff --git a/ANDROID_3.4.5/fs/btrfs/xattr.c b/ANDROID_3.4.5/fs/btrfs/xattr.c deleted file mode 100644 index e7a56590..00000000 --- a/ANDROID_3.4.5/fs/btrfs/xattr.c +++ /dev/null @@ -1,429 +0,0 @@ -/* - * Copyright (C) 2007 Red Hat. All rights reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public - * License v2 as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public - * License along with this program; if not, write to the - * Free Software Foundation, Inc., 59 Temple Place - Suite 330, - * Boston, MA 021110-1307, USA. - */ - -#include <linux/init.h> -#include <linux/fs.h> -#include <linux/slab.h> -#include <linux/rwsem.h> -#include <linux/xattr.h> -#include <linux/security.h> -#include "ctree.h" -#include "btrfs_inode.h" -#include "transaction.h" -#include "xattr.h" -#include "disk-io.h" - - -ssize_t __btrfs_getxattr(struct inode *inode, const char *name, - void *buffer, size_t size) -{ - struct btrfs_dir_item *di; - struct btrfs_root *root = BTRFS_I(inode)->root; - struct btrfs_path *path; - struct extent_buffer *leaf; - int ret = 0; - unsigned long data_ptr; - - path = btrfs_alloc_path(); - if (!path) - return -ENOMEM; - - /* lookup the xattr by name */ - di = btrfs_lookup_xattr(NULL, root, path, btrfs_ino(inode), name, - strlen(name), 0); - if (!di) { - ret = -ENODATA; - goto out; - } else if (IS_ERR(di)) { - ret = PTR_ERR(di); - goto out; - } - - leaf = path->nodes[0]; - /* if size is 0, that means we want the size of the attr */ - if (!size) { - ret = btrfs_dir_data_len(leaf, di); - goto out; - } - - /* now get the data out of our dir_item */ - if (btrfs_dir_data_len(leaf, di) > size) { - ret = -ERANGE; - goto out; - } - - /* - * The way things are packed into the leaf is like this - * |struct btrfs_dir_item|name|data| - * where name is the xattr name, so security.foo, and data is the - * content of the xattr. 
data_ptr points to the location in memory - * where the data starts in the in memory leaf - */ - data_ptr = (unsigned long)((char *)(di + 1) + - btrfs_dir_name_len(leaf, di)); - read_extent_buffer(leaf, buffer, data_ptr, - btrfs_dir_data_len(leaf, di)); - ret = btrfs_dir_data_len(leaf, di); - -out: - btrfs_free_path(path); - return ret; -} - -static int do_setxattr(struct btrfs_trans_handle *trans, - struct inode *inode, const char *name, - const void *value, size_t size, int flags) -{ - struct btrfs_dir_item *di; - struct btrfs_root *root = BTRFS_I(inode)->root; - struct btrfs_path *path; - size_t name_len = strlen(name); - int ret = 0; - - if (name_len + size > BTRFS_MAX_XATTR_SIZE(root)) - return -ENOSPC; - - path = btrfs_alloc_path(); - if (!path) - return -ENOMEM; - - if (flags & XATTR_REPLACE) { - di = btrfs_lookup_xattr(trans, root, path, btrfs_ino(inode), name, - name_len, -1); - if (IS_ERR(di)) { - ret = PTR_ERR(di); - goto out; - } else if (!di) { - ret = -ENODATA; - goto out; - } - ret = btrfs_delete_one_dir_name(trans, root, path, di); - if (ret) - goto out; - btrfs_release_path(path); - - /* - * remove the attribute - */ - if (!value) - goto out; - } - -again: - ret = btrfs_insert_xattr_item(trans, root, path, btrfs_ino(inode), - name, name_len, value, size); - /* - * If we're setting an xattr to a new value but the new value is say - * exactly BTRFS_MAX_XATTR_SIZE, we could end up with EOVERFLOW getting - * back from split_leaf. This is because it thinks we'll be extending - * the existing item size, but we're asking for enough space to add the - * item itself. So if we get EOVERFLOW just set ret to EEXIST and let - * the rest of the function figure it out. - */ - if (ret == -EOVERFLOW) - ret = -EEXIST; - - if (ret == -EEXIST) { - if (flags & XATTR_CREATE) - goto out; - /* - * We can't use the path we already have since we won't have the - * proper locking for a delete, so release the path and - * re-lookup to delete the thing. - */ - btrfs_release_path(path); - di = btrfs_lookup_xattr(trans, root, path, btrfs_ino(inode), - name, name_len, -1); - if (IS_ERR(di)) { - ret = PTR_ERR(di); - goto out; - } else if (!di) { - /* Shouldn't happen but just in case... */ - btrfs_release_path(path); - goto again; - } - - ret = btrfs_delete_one_dir_name(trans, root, path, di); - if (ret) - goto out; - - /* - * We have a value to set, so go back and try to insert it now. 
- */ - if (value) { - btrfs_release_path(path); - goto again; - } - } -out: - btrfs_free_path(path); - return ret; -} - -/* - * @value: "" makes the attribute to empty, NULL removes it - */ -int __btrfs_setxattr(struct btrfs_trans_handle *trans, - struct inode *inode, const char *name, - const void *value, size_t size, int flags) -{ - struct btrfs_root *root = BTRFS_I(inode)->root; - int ret; - - if (trans) - return do_setxattr(trans, inode, name, value, size, flags); - - trans = btrfs_start_transaction(root, 2); - if (IS_ERR(trans)) - return PTR_ERR(trans); - - ret = do_setxattr(trans, inode, name, value, size, flags); - if (ret) - goto out; - - inode->i_ctime = CURRENT_TIME; - ret = btrfs_update_inode(trans, root, inode); - BUG_ON(ret); -out: - btrfs_end_transaction(trans, root); - return ret; -} - -ssize_t btrfs_listxattr(struct dentry *dentry, char *buffer, size_t size) -{ - struct btrfs_key key, found_key; - struct inode *inode = dentry->d_inode; - struct btrfs_root *root = BTRFS_I(inode)->root; - struct btrfs_path *path; - struct extent_buffer *leaf; - struct btrfs_dir_item *di; - int ret = 0, slot; - size_t total_size = 0, size_left = size; - unsigned long name_ptr; - size_t name_len; - - /* - * ok we want all objects associated with this id. - * NOTE: we set key.offset = 0; because we want to start with the - * first xattr that we find and walk forward - */ - key.objectid = btrfs_ino(inode); - btrfs_set_key_type(&key, BTRFS_XATTR_ITEM_KEY); - key.offset = 0; - - path = btrfs_alloc_path(); - if (!path) - return -ENOMEM; - path->reada = 2; - - /* search for our xattrs */ - ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); - if (ret < 0) - goto err; - - while (1) { - leaf = path->nodes[0]; - slot = path->slots[0]; - - /* this is where we start walking through the path */ - if (slot >= btrfs_header_nritems(leaf)) { - /* - * if we've reached the last slot in this leaf we need - * to go to the next leaf and reset everything - */ - ret = btrfs_next_leaf(root, path); - if (ret < 0) - goto err; - else if (ret > 0) - break; - continue; - } - - btrfs_item_key_to_cpu(leaf, &found_key, slot); - - /* check to make sure this item is what we want */ - if (found_key.objectid != key.objectid) - break; - if (btrfs_key_type(&found_key) != BTRFS_XATTR_ITEM_KEY) - break; - - di = btrfs_item_ptr(leaf, slot, struct btrfs_dir_item); - if (verify_dir_item(root, leaf, di)) - continue; - - name_len = btrfs_dir_name_len(leaf, di); - total_size += name_len + 1; - - /* we are just looking for how big our buffer needs to be */ - if (!size) - goto next; - - if (!buffer || (name_len + 1) > size_left) { - ret = -ERANGE; - goto err; - } - - name_ptr = (unsigned long)(di + 1); - read_extent_buffer(leaf, buffer, name_ptr, name_len); - buffer[name_len] = '\0'; - - size_left -= name_len + 1; - buffer += name_len + 1; -next: - path->slots[0]++; - } - ret = total_size; - -err: - btrfs_free_path(path); - - return ret; -} - -/* - * List of handlers for synthetic system.* attributes. All real ondisk - * attributes are handled directly. - */ -const struct xattr_handler *btrfs_xattr_handlers[] = { -#ifdef CONFIG_BTRFS_FS_POSIX_ACL - &btrfs_xattr_acl_access_handler, - &btrfs_xattr_acl_default_handler, -#endif - NULL, -}; - -/* - * Check if the attribute is in a supported namespace. - * - * This applied after the check for the synthetic attributes in the system - * namespace. 
- */ -static bool btrfs_is_valid_xattr(const char *name) -{ - return !strncmp(name, XATTR_SECURITY_PREFIX, - XATTR_SECURITY_PREFIX_LEN) || - !strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN) || - !strncmp(name, XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN) || - !strncmp(name, XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN); -} - -ssize_t btrfs_getxattr(struct dentry *dentry, const char *name, - void *buffer, size_t size) -{ - /* - * If this is a request for a synthetic attribute in the system.* - * namespace use the generic infrastructure to resolve a handler - * for it via sb->s_xattr. - */ - if (!strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN)) - return generic_getxattr(dentry, name, buffer, size); - - if (!btrfs_is_valid_xattr(name)) - return -EOPNOTSUPP; - return __btrfs_getxattr(dentry->d_inode, name, buffer, size); -} - -int btrfs_setxattr(struct dentry *dentry, const char *name, const void *value, - size_t size, int flags) -{ - struct btrfs_root *root = BTRFS_I(dentry->d_inode)->root; - - /* - * The permission on security.* and system.* is not checked - * in permission(). - */ - if (btrfs_root_readonly(root)) - return -EROFS; - - /* - * If this is a request for a synthetic attribute in the system.* - * namespace use the generic infrastructure to resolve a handler - * for it via sb->s_xattr. - */ - if (!strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN)) - return generic_setxattr(dentry, name, value, size, flags); - - if (!btrfs_is_valid_xattr(name)) - return -EOPNOTSUPP; - - if (size == 0) - value = ""; /* empty EA, do not remove */ - - return __btrfs_setxattr(NULL, dentry->d_inode, name, value, size, - flags); -} - -int btrfs_removexattr(struct dentry *dentry, const char *name) -{ - struct btrfs_root *root = BTRFS_I(dentry->d_inode)->root; - - /* - * The permission on security.* and system.* is not checked - * in permission(). - */ - if (btrfs_root_readonly(root)) - return -EROFS; - - /* - * If this is a request for a synthetic attribute in the system.* - * namespace use the generic infrastructure to resolve a handler - * for it via sb->s_xattr. - */ - if (!strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN)) - return generic_removexattr(dentry, name); - - if (!btrfs_is_valid_xattr(name)) - return -EOPNOTSUPP; - - return __btrfs_setxattr(NULL, dentry->d_inode, name, NULL, 0, - XATTR_REPLACE); -} - -int btrfs_initxattrs(struct inode *inode, const struct xattr *xattr_array, - void *fs_info) -{ - const struct xattr *xattr; - struct btrfs_trans_handle *trans = fs_info; - char *name; - int err = 0; - - for (xattr = xattr_array; xattr->name != NULL; xattr++) { - name = kmalloc(XATTR_SECURITY_PREFIX_LEN + - strlen(xattr->name) + 1, GFP_NOFS); - if (!name) { - err = -ENOMEM; - break; - } - strcpy(name, XATTR_SECURITY_PREFIX); - strcpy(name + XATTR_SECURITY_PREFIX_LEN, xattr->name); - err = __btrfs_setxattr(trans, inode, name, - xattr->value, xattr->value_len, 0); - kfree(name); - if (err < 0) - break; - } - return err; -} - -int btrfs_xattr_security_init(struct btrfs_trans_handle *trans, - struct inode *inode, struct inode *dir, - const struct qstr *qstr) -{ - return security_inode_init_security(inode, dir, qstr, - &btrfs_initxattrs, trans); -} diff --git a/ANDROID_3.4.5/fs/btrfs/xattr.h b/ANDROID_3.4.5/fs/btrfs/xattr.h deleted file mode 100644 index b3cc8039..00000000 --- a/ANDROID_3.4.5/fs/btrfs/xattr.h +++ /dev/null @@ -1,43 +0,0 @@ -/* - * Copyright (C) 2007 Red Hat. All rights reserved. 
- * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public - * License v2 as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public - * License along with this program; if not, write to the - * Free Software Foundation, Inc., 59 Temple Place - Suite 330, - * Boston, MA 021110-1307, USA. - */ - -#ifndef __XATTR__ -#define __XATTR__ - -#include <linux/xattr.h> - -extern const struct xattr_handler btrfs_xattr_acl_access_handler; -extern const struct xattr_handler btrfs_xattr_acl_default_handler; -extern const struct xattr_handler *btrfs_xattr_handlers[]; - -extern ssize_t __btrfs_getxattr(struct inode *inode, const char *name, - void *buffer, size_t size); -extern int __btrfs_setxattr(struct btrfs_trans_handle *trans, - struct inode *inode, const char *name, - const void *value, size_t size, int flags); -extern ssize_t btrfs_getxattr(struct dentry *dentry, const char *name, - void *buffer, size_t size); -extern int btrfs_setxattr(struct dentry *dentry, const char *name, - const void *value, size_t size, int flags); -extern int btrfs_removexattr(struct dentry *dentry, const char *name); - -extern int btrfs_xattr_security_init(struct btrfs_trans_handle *trans, - struct inode *inode, struct inode *dir, - const struct qstr *qstr); - -#endif /* __XATTR__ */ diff --git a/ANDROID_3.4.5/fs/btrfs/zlib.c b/ANDROID_3.4.5/fs/btrfs/zlib.c deleted file mode 100644 index 92c20654..00000000 --- a/ANDROID_3.4.5/fs/btrfs/zlib.c +++ /dev/null @@ -1,399 +0,0 @@ -/* - * Copyright (C) 2008 Oracle. All rights reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public - * License v2 as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public - * License along with this program; if not, write to the - * Free Software Foundation, Inc., 59 Temple Place - Suite 330, - * Boston, MA 021110-1307, USA. - * - * Based on jffs2 zlib code: - * Copyright © 2001-2007 Red Hat, Inc. 
- * Created by David Woodhouse <dwmw2@infradead.org> - */ - -#include <linux/kernel.h> -#include <linux/slab.h> -#include <linux/zlib.h> -#include <linux/zutil.h> -#include <linux/vmalloc.h> -#include <linux/init.h> -#include <linux/err.h> -#include <linux/sched.h> -#include <linux/pagemap.h> -#include <linux/bio.h> -#include "compression.h" - -struct workspace { - z_stream inf_strm; - z_stream def_strm; - char *buf; - struct list_head list; -}; - -static void zlib_free_workspace(struct list_head *ws) -{ - struct workspace *workspace = list_entry(ws, struct workspace, list); - - vfree(workspace->def_strm.workspace); - vfree(workspace->inf_strm.workspace); - kfree(workspace->buf); - kfree(workspace); -} - -static struct list_head *zlib_alloc_workspace(void) -{ - struct workspace *workspace; - - workspace = kzalloc(sizeof(*workspace), GFP_NOFS); - if (!workspace) - return ERR_PTR(-ENOMEM); - - workspace->def_strm.workspace = vmalloc(zlib_deflate_workspacesize( - MAX_WBITS, MAX_MEM_LEVEL)); - workspace->inf_strm.workspace = vmalloc(zlib_inflate_workspacesize()); - workspace->buf = kmalloc(PAGE_CACHE_SIZE, GFP_NOFS); - if (!workspace->def_strm.workspace || - !workspace->inf_strm.workspace || !workspace->buf) - goto fail; - - INIT_LIST_HEAD(&workspace->list); - - return &workspace->list; -fail: - zlib_free_workspace(&workspace->list); - return ERR_PTR(-ENOMEM); -} - -static int zlib_compress_pages(struct list_head *ws, - struct address_space *mapping, - u64 start, unsigned long len, - struct page **pages, - unsigned long nr_dest_pages, - unsigned long *out_pages, - unsigned long *total_in, - unsigned long *total_out, - unsigned long max_out) -{ - struct workspace *workspace = list_entry(ws, struct workspace, list); - int ret; - char *data_in; - char *cpage_out; - int nr_pages = 0; - struct page *in_page = NULL; - struct page *out_page = NULL; - unsigned long bytes_left; - - *out_pages = 0; - *total_out = 0; - *total_in = 0; - - if (Z_OK != zlib_deflateInit(&workspace->def_strm, 3)) { - printk(KERN_WARNING "deflateInit failed\n"); - ret = -1; - goto out; - } - - workspace->def_strm.total_in = 0; - workspace->def_strm.total_out = 0; - - in_page = find_get_page(mapping, start >> PAGE_CACHE_SHIFT); - data_in = kmap(in_page); - - out_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM); - if (out_page == NULL) { - ret = -1; - goto out; - } - cpage_out = kmap(out_page); - pages[0] = out_page; - nr_pages = 1; - - workspace->def_strm.next_in = data_in; - workspace->def_strm.next_out = cpage_out; - workspace->def_strm.avail_out = PAGE_CACHE_SIZE; - workspace->def_strm.avail_in = min(len, PAGE_CACHE_SIZE); - - while (workspace->def_strm.total_in < len) { - ret = zlib_deflate(&workspace->def_strm, Z_SYNC_FLUSH); - if (ret != Z_OK) { - printk(KERN_DEBUG "btrfs deflate in loop returned %d\n", - ret); - zlib_deflateEnd(&workspace->def_strm); - ret = -1; - goto out; - } - - /* we're making it bigger, give up */ - if (workspace->def_strm.total_in > 8192 && - workspace->def_strm.total_in < - workspace->def_strm.total_out) { - ret = -1; - goto out; - } - /* we need another page for writing out. 
Test this - * before the total_in so we will pull in a new page for - * the stream end if required - */ - if (workspace->def_strm.avail_out == 0) { - kunmap(out_page); - if (nr_pages == nr_dest_pages) { - out_page = NULL; - ret = -1; - goto out; - } - out_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM); - if (out_page == NULL) { - ret = -1; - goto out; - } - cpage_out = kmap(out_page); - pages[nr_pages] = out_page; - nr_pages++; - workspace->def_strm.avail_out = PAGE_CACHE_SIZE; - workspace->def_strm.next_out = cpage_out; - } - /* we're all done */ - if (workspace->def_strm.total_in >= len) - break; - - /* we've read in a full page, get a new one */ - if (workspace->def_strm.avail_in == 0) { - if (workspace->def_strm.total_out > max_out) - break; - - bytes_left = len - workspace->def_strm.total_in; - kunmap(in_page); - page_cache_release(in_page); - - start += PAGE_CACHE_SIZE; - in_page = find_get_page(mapping, - start >> PAGE_CACHE_SHIFT); - data_in = kmap(in_page); - workspace->def_strm.avail_in = min(bytes_left, - PAGE_CACHE_SIZE); - workspace->def_strm.next_in = data_in; - } - } - workspace->def_strm.avail_in = 0; - ret = zlib_deflate(&workspace->def_strm, Z_FINISH); - zlib_deflateEnd(&workspace->def_strm); - - if (ret != Z_STREAM_END) { - ret = -1; - goto out; - } - - if (workspace->def_strm.total_out >= workspace->def_strm.total_in) { - ret = -1; - goto out; - } - - ret = 0; - *total_out = workspace->def_strm.total_out; - *total_in = workspace->def_strm.total_in; -out: - *out_pages = nr_pages; - if (out_page) - kunmap(out_page); - - if (in_page) { - kunmap(in_page); - page_cache_release(in_page); - } - return ret; -} - -static int zlib_decompress_biovec(struct list_head *ws, struct page **pages_in, - u64 disk_start, - struct bio_vec *bvec, - int vcnt, - size_t srclen) -{ - struct workspace *workspace = list_entry(ws, struct workspace, list); - int ret = 0, ret2; - int wbits = MAX_WBITS; - char *data_in; - size_t total_out = 0; - unsigned long page_in_index = 0; - unsigned long page_out_index = 0; - unsigned long total_pages_in = (srclen + PAGE_CACHE_SIZE - 1) / - PAGE_CACHE_SIZE; - unsigned long buf_start; - unsigned long pg_offset; - - data_in = kmap(pages_in[page_in_index]); - workspace->inf_strm.next_in = data_in; - workspace->inf_strm.avail_in = min_t(size_t, srclen, PAGE_CACHE_SIZE); - workspace->inf_strm.total_in = 0; - - workspace->inf_strm.total_out = 0; - workspace->inf_strm.next_out = workspace->buf; - workspace->inf_strm.avail_out = PAGE_CACHE_SIZE; - pg_offset = 0; - - /* If it's deflate, and it's got no preset dictionary, then - we can tell zlib to skip the adler32 check. 
*/ - if (srclen > 2 && !(data_in[1] & PRESET_DICT) && - ((data_in[0] & 0x0f) == Z_DEFLATED) && - !(((data_in[0]<<8) + data_in[1]) % 31)) { - - wbits = -((data_in[0] >> 4) + 8); - workspace->inf_strm.next_in += 2; - workspace->inf_strm.avail_in -= 2; - } - - if (Z_OK != zlib_inflateInit2(&workspace->inf_strm, wbits)) { - printk(KERN_WARNING "inflateInit failed\n"); - return -1; - } - while (workspace->inf_strm.total_in < srclen) { - ret = zlib_inflate(&workspace->inf_strm, Z_NO_FLUSH); - if (ret != Z_OK && ret != Z_STREAM_END) - break; - - buf_start = total_out; - total_out = workspace->inf_strm.total_out; - - /* we didn't make progress in this inflate call, we're done */ - if (buf_start == total_out) - break; - - ret2 = btrfs_decompress_buf2page(workspace->buf, buf_start, - total_out, disk_start, - bvec, vcnt, - &page_out_index, &pg_offset); - if (ret2 == 0) { - ret = 0; - goto done; - } - - workspace->inf_strm.next_out = workspace->buf; - workspace->inf_strm.avail_out = PAGE_CACHE_SIZE; - - if (workspace->inf_strm.avail_in == 0) { - unsigned long tmp; - kunmap(pages_in[page_in_index]); - page_in_index++; - if (page_in_index >= total_pages_in) { - data_in = NULL; - break; - } - data_in = kmap(pages_in[page_in_index]); - workspace->inf_strm.next_in = data_in; - tmp = srclen - workspace->inf_strm.total_in; - workspace->inf_strm.avail_in = min(tmp, - PAGE_CACHE_SIZE); - } - } - if (ret != Z_STREAM_END) - ret = -1; - else - ret = 0; -done: - zlib_inflateEnd(&workspace->inf_strm); - if (data_in) - kunmap(pages_in[page_in_index]); - return ret; -} - -static int zlib_decompress(struct list_head *ws, unsigned char *data_in, - struct page *dest_page, - unsigned long start_byte, - size_t srclen, size_t destlen) -{ - struct workspace *workspace = list_entry(ws, struct workspace, list); - int ret = 0; - int wbits = MAX_WBITS; - unsigned long bytes_left = destlen; - unsigned long total_out = 0; - char *kaddr; - - workspace->inf_strm.next_in = data_in; - workspace->inf_strm.avail_in = srclen; - workspace->inf_strm.total_in = 0; - - workspace->inf_strm.next_out = workspace->buf; - workspace->inf_strm.avail_out = PAGE_CACHE_SIZE; - workspace->inf_strm.total_out = 0; - /* If it's deflate, and it's got no preset dictionary, then - we can tell zlib to skip the adler32 check. 
*/ - if (srclen > 2 && !(data_in[1] & PRESET_DICT) && - ((data_in[0] & 0x0f) == Z_DEFLATED) && - !(((data_in[0]<<8) + data_in[1]) % 31)) { - - wbits = -((data_in[0] >> 4) + 8); - workspace->inf_strm.next_in += 2; - workspace->inf_strm.avail_in -= 2; - } - - if (Z_OK != zlib_inflateInit2(&workspace->inf_strm, wbits)) { - printk(KERN_WARNING "inflateInit failed\n"); - return -1; - } - - while (bytes_left > 0) { - unsigned long buf_start; - unsigned long buf_offset; - unsigned long bytes; - unsigned long pg_offset = 0; - - ret = zlib_inflate(&workspace->inf_strm, Z_NO_FLUSH); - if (ret != Z_OK && ret != Z_STREAM_END) - break; - - buf_start = total_out; - total_out = workspace->inf_strm.total_out; - - if (total_out == buf_start) { - ret = -1; - break; - } - - if (total_out <= start_byte) - goto next; - - if (total_out > start_byte && buf_start < start_byte) - buf_offset = start_byte - buf_start; - else - buf_offset = 0; - - bytes = min(PAGE_CACHE_SIZE - pg_offset, - PAGE_CACHE_SIZE - buf_offset); - bytes = min(bytes, bytes_left); - - kaddr = kmap_atomic(dest_page); - memcpy(kaddr + pg_offset, workspace->buf + buf_offset, bytes); - kunmap_atomic(kaddr); - - pg_offset += bytes; - bytes_left -= bytes; -next: - workspace->inf_strm.next_out = workspace->buf; - workspace->inf_strm.avail_out = PAGE_CACHE_SIZE; - } - - if (ret != Z_STREAM_END && bytes_left != 0) - ret = -1; - else - ret = 0; - - zlib_inflateEnd(&workspace->inf_strm); - return ret; -} - -struct btrfs_compress_op btrfs_zlib_compress = { - .alloc_workspace = zlib_alloc_workspace, - .free_workspace = zlib_free_workspace, - .compress_pages = zlib_compress_pages, - .decompress_biovec = zlib_decompress_biovec, - .decompress = zlib_decompress, -}; |
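
The stripe arithmetic in the deleted __btrfs_map_block() above (offset / stripe_len gives the stripe number, the do_div remainder picks the device, and what is left of stripe_nr counts full rotations of the array) is easier to follow outside the kernel. The following stand-alone sketch repeats that arithmetic for a hypothetical 4-stripe RAID0 chunk; the chunk geometry and physical offsets are invented for illustration and are not taken from the sources above.

    /* User-space sketch, not part of the deleted kernel code: mirrors the
     * RAID0 mapping math of __btrfs_map_block() with plain 64-bit division
     * instead of do_div().  All values below are hypothetical. */
    #include <stdio.h>
    #include <stdint.h>

    #define STRIPE_LEN (64 * 1024)            /* matches BTRFS_STRIPE_LEN */

    int main(void)
    {
        /* hypothetical RAID0 chunk: 4 stripes, chunk starts at logical 1 GiB */
        uint64_t chunk_start = 1ULL << 30;
        uint64_t stripe_phys[4] = { 0x100000, 0x200000, 0x300000, 0x400000 };
        int num_stripes = 4;

        uint64_t logical = chunk_start + 300 * 1024;   /* byte we want to map */
        uint64_t offset = logical - chunk_start;

        uint64_t stripe_nr = offset / STRIPE_LEN;      /* stripes to stride over */
        uint64_t stripe_offset = offset - stripe_nr * STRIPE_LEN; /* offset inside that stripe */

        int stripe_index = (int)(stripe_nr % num_stripes); /* which device in the array */
        stripe_nr /= num_stripes;                          /* full rotations of the array */

        uint64_t physical = stripe_phys[stripe_index] +
                            stripe_offset + stripe_nr * STRIPE_LEN;

        printf("logical %llu -> device %d, physical %llu\n",
               (unsigned long long)logical, stripe_index,
               (unsigned long long)physical);
        return 0;
    }

RAID10 only adds one step on top of this: the array is first split into num_stripes / sub_stripes groups, the group index is scaled by sub_stripes, and any of the sub_stripes copies inside that group can satisfy a read.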
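
The comment in the deleted __btrfs_getxattr() describes the on-leaf packing |struct btrfs_dir_item|name|data| and derives data_ptr from it. A minimal user-space sketch of the same pointer arithmetic follows; the header struct here is a stand-in for illustration, not the real struct btrfs_dir_item.

    /* Sketch of the xattr item layout: header, then name bytes, then value
     * bytes, so the value starts name_len bytes past the end of the header. */
    #include <stdio.h>
    #include <stdlib.h>
    #include <string.h>
    #include <stdint.h>

    struct fake_dir_item {             /* stand-in for struct btrfs_dir_item */
        uint16_t name_len;
        uint16_t data_len;
        /* name bytes, then value bytes, follow the header directly */
    };

    int main(void)
    {
        const char *name = "security.selinux";
        const char *value = "context";
        size_t name_len = strlen(name), data_len = strlen(value);

        /* pack the item the way the leaf stores it: header, name, value */
        struct fake_dir_item *di = malloc(sizeof(*di) + name_len + data_len);
        if (!di)
            return 1;
        di->name_len = (uint16_t)name_len;
        di->data_len = (uint16_t)data_len;
        memcpy(di + 1, name, name_len);
        memcpy((char *)(di + 1) + name_len, value, data_len);

        /* same arithmetic as the deleted code: the value follows the name */
        const char *data_ptr = (const char *)(di + 1) + di->name_len;
        printf("value = %.*s\n", (int)di->data_len, data_ptr);

        free(di);
        return 0;
    }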
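
Both decompression paths in the deleted zlib.c test the first two stream bytes and, when they form a plain zlib header with no preset dictionary, hand zlib_inflateInit2() negative window bits so the adler32 trailer is skipped. A small sketch of that header test, assuming only the standard zlib constants Z_DEFLATED and PRESET_DICT; it is illustrative, not the kernel code.

    /* Stand-alone sketch of the zlib CMF/FLG header check used in zlib.c. */
    #include <stdio.h>

    #define Z_DEFLATED   8
    #define PRESET_DICT  0x20

    static int raw_window_bits(const unsigned char *hdr, int *wbits)
    {
        if (!(hdr[1] & PRESET_DICT) &&             /* no preset dictionary */
            (hdr[0] & 0x0f) == Z_DEFLATED &&       /* compression method 8 */
            ((hdr[0] << 8) + hdr[1]) % 31 == 0) {  /* FCHECK: CMF*256+FLG divisible by 31 */
            *wbits = -((hdr[0] >> 4) + 8);         /* negative value => raw deflate, no adler32 */
            return 1;                              /* caller must also skip the 2 header bytes */
        }
        return 0;
    }

    int main(void)
    {
        unsigned char hdr[2] = { 0x78, 0x9c };     /* common zlib header: 32K window */
        int wbits;

        if (raw_window_bits(hdr, &wbits))
            printf("raw inflate, windowBits = %d\n", wbits);   /* prints -15 */
        else
            printf("keep the zlib wrapper\n");
        return 0;
    }

With the common 0x78 0x9c header this yields windowBits of -15, i.e. raw deflate over a 32 KiB window, and the caller then advances next_in and shrinks avail_in by two bytes exactly as the deleted code does.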