From 871480933a1c28f8a9fed4c4d34d06c439a7a422 Mon Sep 17 00:00:00 2001 From: Srikant Patnaik Date: Sun, 11 Jan 2015 12:28:04 +0530 Subject: Moved, renamed, and deleted files The original directory structure was scattered and unorganized. Changes are basically to make it look like kernel structure. --- fs/ext4/Kconfig | 85 + fs/ext4/Makefile | 14 + fs/ext4/acl.c | 439 ++++ fs/ext4/acl.h | 77 + fs/ext4/balloc.c | 766 +++++++ fs/ext4/bitmap.c | 31 + fs/ext4/block_validity.c | 268 +++ fs/ext4/dir.c | 667 ++++++ fs/ext4/ext4.h | 2372 ++++++++++++++++++++++ fs/ext4/ext4_extents.h | 296 +++ fs/ext4/ext4_jbd2.c | 154 ++ fs/ext4/ext4_jbd2.h | 399 ++++ fs/ext4/extents.c | 4866 ++++++++++++++++++++++++++++++++++++++++++++ fs/ext4/file.c | 262 +++ fs/ext4/fsync.c | 271 +++ fs/ext4/hash.c | 208 ++ fs/ext4/ialloc.c | 1161 +++++++++++ fs/ext4/indirect.c | 1502 ++++++++++++++ fs/ext4/inode.c | 4676 ++++++++++++++++++++++++++++++++++++++++++ fs/ext4/ioctl.c | 509 +++++ fs/ext4/mballoc.c | 5047 ++++++++++++++++++++++++++++++++++++++++++++++ fs/ext4/mballoc.h | 222 ++ fs/ext4/migrate.c | 604 ++++++ fs/ext4/mmp.c | 353 ++++ fs/ext4/move_extent.c | 1423 +++++++++++++ fs/ext4/namei.c | 2607 ++++++++++++++++++++++++ fs/ext4/page-io.c | 433 ++++ fs/ext4/resize.c | 1689 ++++++++++++++++ fs/ext4/super.c | 4980 +++++++++++++++++++++++++++++++++++++++++++++ fs/ext4/symlink.c | 56 + fs/ext4/truncate.h | 43 + fs/ext4/xattr.c | 1608 +++++++++++++++ fs/ext4/xattr.h | 155 ++ fs/ext4/xattr_security.c | 82 + fs/ext4/xattr_trusted.c | 58 + fs/ext4/xattr_user.c | 61 + 36 files changed, 38444 insertions(+) create mode 100644 fs/ext4/Kconfig create mode 100644 fs/ext4/Makefile create mode 100644 fs/ext4/acl.c create mode 100644 fs/ext4/acl.h create mode 100644 fs/ext4/balloc.c create mode 100644 fs/ext4/bitmap.c create mode 100644 fs/ext4/block_validity.c create mode 100644 fs/ext4/dir.c create mode 100644 fs/ext4/ext4.h create mode 100644 fs/ext4/ext4_extents.h create mode 100644 fs/ext4/ext4_jbd2.c 
create mode 100644 fs/ext4/ext4_jbd2.h create mode 100644 fs/ext4/extents.c create mode 100644 fs/ext4/file.c create mode 100644 fs/ext4/fsync.c create mode 100644 fs/ext4/hash.c create mode 100644 fs/ext4/ialloc.c create mode 100644 fs/ext4/indirect.c create mode 100644 fs/ext4/inode.c create mode 100644 fs/ext4/ioctl.c create mode 100644 fs/ext4/mballoc.c create mode 100644 fs/ext4/mballoc.h create mode 100644 fs/ext4/migrate.c create mode 100644 fs/ext4/mmp.c create mode 100644 fs/ext4/move_extent.c create mode 100644 fs/ext4/namei.c create mode 100644 fs/ext4/page-io.c create mode 100644 fs/ext4/resize.c create mode 100644 fs/ext4/super.c create mode 100644 fs/ext4/symlink.c create mode 100644 fs/ext4/truncate.h create mode 100644 fs/ext4/xattr.c create mode 100644 fs/ext4/xattr.h create mode 100644 fs/ext4/xattr_security.c create mode 100644 fs/ext4/xattr_trusted.c create mode 100644 fs/ext4/xattr_user.c (limited to 'fs/ext4') diff --git a/fs/ext4/Kconfig b/fs/ext4/Kconfig new file mode 100644 index 00000000..9ed1bb1f --- /dev/null +++ b/fs/ext4/Kconfig @@ -0,0 +1,85 @@ +config EXT4_FS + tristate "The Extended 4 (ext4) filesystem" + select JBD2 + select CRC16 + help + This is the next generation of the ext3 filesystem. + + Unlike the change from ext2 filesystem to ext3 filesystem, + the on-disk format of ext4 is not forwards compatible with + ext3; it is based on extent maps and it supports 48-bit + physical block numbers. The ext4 filesystem also supports delayed + allocation, persistent preallocation, high resolution time stamps, + and a number of other features to improve performance and speed + up fsck time. For more information, please see the web pages at + http://ext4.wiki.kernel.org. 
+ + The ext4 filesystem will support mounting an ext3 + filesystem; while there will be some performance gains from + the delayed allocation and inode table readahead, the best + performance gains will require enabling ext4 features in the + filesystem, or formatting a new filesystem as an ext4 + filesystem initially. + + To compile this file system support as a module, choose M here. The + module will be called ext4. + + If unsure, say N. + +config EXT4_USE_FOR_EXT23 + bool "Use ext4 for ext2/ext3 file systems" + depends on EXT4_FS + depends on EXT3_FS=n || EXT2_FS=n + default y + help + Allow the ext4 file system driver code to be used for ext2 or + ext3 file system mounts. This allows users to reduce their + compiled kernel size by using one file system driver for + ext2, ext3, and ext4 file systems. + +config EXT4_FS_XATTR + bool "Ext4 extended attributes" + depends on EXT4_FS + default y + help + Extended attributes are name:value pairs associated with inodes by + the kernel or by users (see the attr(5) manual page, or visit + <http://acl.bestbits.at/> for details). + + If unsure, say N. + + You need this for POSIX ACL support on ext4. + +config EXT4_FS_POSIX_ACL + bool "Ext4 POSIX Access Control Lists" + depends on EXT4_FS_XATTR + select FS_POSIX_ACL + help + POSIX Access Control Lists (ACLs) support permissions for users and + groups beyond the owner/group/world scheme. + + To learn more about Access Control Lists, visit the POSIX ACLs for + Linux website <http://acl.bestbits.at/>. + + If you don't know what Access Control Lists are, say N + +config EXT4_FS_SECURITY + bool "Ext4 Security Labels" + depends on EXT4_FS_XATTR + help + Security labels support alternative access control models + implemented by security modules like SELinux. This option + enables an extended attribute handler for file security + labels in the ext4 filesystem. + + If you are not using a security module that requires using + extended attributes for file security labels, say N. 
+ +config EXT4_DEBUG + bool "EXT4 debugging support" + depends on EXT4_FS + help + Enables run-time debugging support for the ext4 filesystem. + + If you select Y here, then you will be able to turn on debugging + with a command such as "echo 1 > /sys/kernel/debug/ext4/mballoc-debug" diff --git a/fs/ext4/Makefile b/fs/ext4/Makefile new file mode 100644 index 00000000..56fd8f86 --- /dev/null +++ b/fs/ext4/Makefile @@ -0,0 +1,14 @@ +# +# Makefile for the linux ext4-filesystem routines. +# + +obj-$(CONFIG_EXT4_FS) += ext4.o + +ext4-y := balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o page-io.o \ + ioctl.o namei.o super.o symlink.o hash.o resize.o extents.o \ + ext4_jbd2.o migrate.o mballoc.o block_validity.o move_extent.o \ + mmp.o indirect.o + +ext4-$(CONFIG_EXT4_FS_XATTR) += xattr.o xattr_user.o xattr_trusted.o +ext4-$(CONFIG_EXT4_FS_POSIX_ACL) += acl.o +ext4-$(CONFIG_EXT4_FS_SECURITY) += xattr_security.o diff --git a/fs/ext4/acl.c b/fs/ext4/acl.c new file mode 100644 index 00000000..a5c29bb3 --- /dev/null +++ b/fs/ext4/acl.c @@ -0,0 +1,439 @@ +/* + * linux/fs/ext4/acl.c + * + * Copyright (C) 2001-2003 Andreas Gruenbacher, + */ + +#include +#include +#include +#include +#include +#include "ext4_jbd2.h" +#include "ext4.h" +#include "xattr.h" +#include "acl.h" + +/* + * Convert from filesystem to in-memory representation. 
+ *
+ * @value: raw xattr payload (an ext4_acl_header followed by packed entries)
+ * @size:  length of @value in bytes
+ *
+ * Returns NULL when @value is NULL or the payload holds zero entries, a
+ * newly allocated posix_acl on success, ERR_PTR(-ENOMEM) when the
+ * allocation fails, and ERR_PTR(-EINVAL) for a malformed payload (short
+ * header, wrong version, bad length, unknown tag, or entries that do not
+ * end exactly at @value + @size).
+ */
+static struct posix_acl *
+ext4_acl_from_disk(const void *value, size_t size)
+{
+	const char *end = (char *)value + size;
+	int n, count;
+	struct posix_acl *acl;
+
+	if (!value)
+		return NULL;
+	if (size < sizeof(ext4_acl_header))
+		 return ERR_PTR(-EINVAL);
+	if (((ext4_acl_header *)value)->a_version !=
+	    cpu_to_le32(EXT4_ACL_VERSION))
+		return ERR_PTR(-EINVAL);
+	/* step over the header; @value now walks the entry array */
+	value = (char *)value + sizeof(ext4_acl_header);
+	/* the entry count is derived from the total size, header included */
+	count = ext4_acl_count(size);
+	if (count < 0)
+		return ERR_PTR(-EINVAL);
+	if (count == 0)
+		return NULL;
+	acl = posix_acl_alloc(count, GFP_NOFS);
+	if (!acl)
+		return ERR_PTR(-ENOMEM);
+	for (n = 0; n < count; n++) {
+		ext4_acl_entry *entry =
+			(ext4_acl_entry *)value;
+		/* every entry carries at least the short (tag + perm) form */
+		if ((char *)value + sizeof(ext4_acl_entry_short) > end)
+			goto fail;
+		acl->a_entries[n].e_tag  = le16_to_cpu(entry->e_tag);
+		acl->a_entries[n].e_perm = le16_to_cpu(entry->e_perm);
+
+		switch (acl->a_entries[n].e_tag) {
+		/* short form: no id stored on disk */
+		case ACL_USER_OBJ:
+		case ACL_GROUP_OBJ:
+		case ACL_MASK:
+		case ACL_OTHER:
+			value = (char *)value +
+				sizeof(ext4_acl_entry_short);
+			acl->a_entries[n].e_id = ACL_UNDEFINED_ID;
+			break;
+
+		/* long form: tag + perm + 32-bit id */
+		case ACL_USER:
+		case ACL_GROUP:
+			value = (char *)value + sizeof(ext4_acl_entry);
+			if ((char *)value > end)
+				goto fail;
+			acl->a_entries[n].e_id =
+				le32_to_cpu(entry->e_id);
+			break;
+
+		default:
+			goto fail;
+		}
+	}
+	/* the entries must consume the payload exactly */
+	if (value != end)
+		goto fail;
+	return acl;
+
+fail:
+	posix_acl_release(acl);
+	return ERR_PTR(-EINVAL);
+}
+
+/*
+ * Convert from in-memory to filesystem representation.
+ *
+ * On return *size holds ext4_acl_size(acl->a_count); it is written before
+ * the allocation, so it is set even when the function fails.  The buffer
+ * is kmalloc()ed for the worst case (every entry in the long form) and is
+ * released with kfree() by the caller.
+ *
+ * Returns the buffer, ERR_PTR(-ENOMEM) if the allocation fails, or
+ * ERR_PTR(-EINVAL) if an entry carries an unknown tag.
+ */
+static void *
+ext4_acl_to_disk(const struct posix_acl *acl, size_t *size)
+{
+	ext4_acl_header *ext_acl;
+	char *e;
+	size_t n;
+
+	*size = ext4_acl_size(acl->a_count);
+	ext_acl = kmalloc(sizeof(ext4_acl_header) + acl->a_count *
+			sizeof(ext4_acl_entry), GFP_NOFS);
+	if (!ext_acl)
+		return ERR_PTR(-ENOMEM);
+	ext_acl->a_version = cpu_to_le32(EXT4_ACL_VERSION);
+	/* e is the write cursor, starting right behind the header */
+	e = (char *)ext_acl + sizeof(ext4_acl_header);
+	for (n = 0; n < acl->a_count; n++) {
+		ext4_acl_entry *entry = (ext4_acl_entry *)e;
+		entry->e_tag  = cpu_to_le16(acl->a_entries[n].e_tag);
+		entry->e_perm = cpu_to_le16(acl->a_entries[n].e_perm);
+		switch (acl->a_entries[n].e_tag) {
+		/* long form: the id is stored as well */
+		case ACL_USER:
+		case ACL_GROUP:
+			entry->e_id = cpu_to_le32(acl->a_entries[n].e_id);
+			e += sizeof(ext4_acl_entry);
+			break;
+
+		/* short form: only tag and perm were written above */
+		case ACL_USER_OBJ:
+		case ACL_GROUP_OBJ:
+		case ACL_MASK:
+		case ACL_OTHER:
+			e += sizeof(ext4_acl_entry_short);
+			break;
+
+		default:
+			goto fail;
+		}
+	}
+	return (char *)ext_acl;
+
+fail:
+	kfree(ext_acl);
+	return ERR_PTR(-EINVAL);
+}
+
+/*
+ * Inode operation get_posix_acl().
+ *
+ * Returns NULL when POSIX ACLs are disabled by mount option or the inode
+ * has no ACL of the requested type, the cached or freshly decoded ACL
+ * otherwise, or an ERR_PTR on failure.  Every non-error result, including
+ * "no ACL", is stored in the inode's ACL cache.
+ *
+ * inode->i_mutex: don't care
+ */
+struct posix_acl *
+ext4_get_acl(struct inode *inode, int type)
+{
+	int name_index;
+	char *value = NULL;
+	struct posix_acl *acl;
+	int retval;
+
+	if (!test_opt(inode->i_sb, POSIX_ACL))
+		return NULL;
+
+	acl = get_cached_acl(inode, type);
+	if (acl != ACL_NOT_CACHED)
+		return acl;
+
+	switch (type) {
+	case ACL_TYPE_ACCESS:
+		name_index = EXT4_XATTR_INDEX_POSIX_ACL_ACCESS;
+		break;
+	case ACL_TYPE_DEFAULT:
+		name_index = EXT4_XATTR_INDEX_POSIX_ACL_DEFAULT;
+		break;
+	default:
+		BUG();
+	}
+	/* first call with a NULL buffer only asks for the xattr size */
+	retval = ext4_xattr_get(inode, name_index, "", NULL, 0);
+	if (retval > 0) {
+		value = kmalloc(retval, GFP_NOFS);
+		if (!value)
+			return ERR_PTR(-ENOMEM);
+		retval = ext4_xattr_get(inode, name_index, "", value, retval);
+	}
+	if (retval > 0)
+		acl = ext4_acl_from_disk(value, retval);
+	else if (retval == -ENODATA || retval == -ENOSYS)
+		acl = NULL;
+	else
+		acl = ERR_PTR(retval);
+	kfree(value);
+
+	if (!IS_ERR(acl))
+		set_cached_acl(inode, type, acl);
+
+	return acl;
+}
+
+/*
+ * Set the access or default ACL of an inode.
+ *
+ * Symlinks are rejected with -EOPNOTSUPP.  A default ACL on a
+ * non-directory yields -EACCES (or 0 when @acl is NULL); an unknown @type
+ * yields -EINVAL.  On success the ACL cache is updated to @acl.
+ *
+ * inode->i_mutex: down unless called from ext4_new_inode
+ */
+static int
+ext4_set_acl(handle_t *handle, struct inode *inode, int type,
+	     struct posix_acl *acl)
+{
+	int name_index;
+	void *value = NULL;
+	size_t size = 0;
+	int error;
+
+	if (S_ISLNK(inode->i_mode))
+		return -EOPNOTSUPP;
+
+	switch (type) {
+	case ACL_TYPE_ACCESS:
+		name_index = EXT4_XATTR_INDEX_POSIX_ACL_ACCESS;
+		if (acl) {
+			error = posix_acl_equiv_mode(acl, &inode->i_mode);
+			if (error < 0)
+				return error;
+			else {
+				/*
+				 * Note that i_mode and i_ctime are updated
+				 * here, before the xattr itself is written.
+				 */
+				inode->i_ctime = ext4_current_time(inode);
+				ext4_mark_inode_dirty(handle, inode);
+				/*
+				 * A result of 0 drops the ACL: a NULL value
+				 * is handed to ext4_xattr_set_handle() below
+				 * (presumably removing the xattr -- confirm
+				 * against xattr.c).
+				 */
+				if (error == 0)
+					acl = NULL;
+			}
+		}
+		break;
+
+	case ACL_TYPE_DEFAULT:
+		name_index = EXT4_XATTR_INDEX_POSIX_ACL_DEFAULT;
+		if (!S_ISDIR(inode->i_mode))
+			return acl ? -EACCES : 0;
+		break;
+
+	default:
+		return -EINVAL;
+	}
+	if (acl) {
+		value = ext4_acl_to_disk(acl, &size);
+		if (IS_ERR(value))
+			return (int)PTR_ERR(value);
+	}
+
+	error = ext4_xattr_set_handle(handle, inode, name_index, "",
+				      value, size, 0);
+
+	kfree(value);
+	if (!error)
+		set_cached_acl(inode, type, acl);
+
+	return error;
+}
+
+/*
+ * Initialize the ACLs of a new inode. Called from ext4_new_inode.
+ *
+ * Without a default ACL on @dir the umask is applied to inode->i_mode;
+ * with one, it is inherited (as the default ACL of a new directory, and
+ * via posix_acl_create() as the access ACL).
+ *
+ * dir->i_mutex: down
+ * inode->i_mutex: up (access to inode is still exclusive)
+ */
+int
+ext4_init_acl(handle_t *handle, struct inode *inode, struct inode *dir)
+{
+	struct posix_acl *acl = NULL;
+	int error = 0;
+
+	if (!S_ISLNK(inode->i_mode)) {
+		if (test_opt(dir->i_sb, POSIX_ACL)) {
+			acl = ext4_get_acl(dir, ACL_TYPE_DEFAULT);
+			if (IS_ERR(acl))
+				return PTR_ERR(acl);
+		}
+		if (!acl)
+			inode->i_mode &= ~current_umask();
+	}
+	if (test_opt(inode->i_sb, POSIX_ACL) && acl) {
+		if (S_ISDIR(inode->i_mode)) {
+			error = ext4_set_acl(handle, inode,
+					     ACL_TYPE_DEFAULT, acl);
+			if (error)
+				goto cleanup;
+		}
+		error = posix_acl_create(&acl, GFP_NOFS, &inode->i_mode);
+		/*
+		 * NOTE(review): this early return skips posix_acl_release();
+		 * presumably posix_acl_create() already dropped the ACL on
+		 * failure -- confirm against fs/posix_acl.c.
+		 */
+		if (error < 0)
+			return error;
+
+		if (error > 0) {
+			/* This is an extended ACL */
+			error = ext4_set_acl(handle, inode, ACL_TYPE_ACCESS, acl);
+		}
+	}
+cleanup:
+	posix_acl_release(acl);
+	return error;
+}
+
+/*
+ * Does chmod for an inode that may have an Access Control List. The
+ * inode->i_mode field must be updated to the desired value by the caller
+ * before calling this function.
+ * Returns 0 on success, or a negative error number.
+ *
+ * We change the ACL rather than storing some ACL entries in the file
+ * mode permission bits (which would be more efficient), because that
+ * would break once additional permissions (like ACL_APPEND, ACL_DELETE
+ * for directories) are added.  There are no more bits available in the
+ * file mode.
+ *
+ * Symlinks get -EOPNOTSUPP.  With ACLs disabled by mount option, or with
+ * no access ACL on the inode, there is nothing to do and 0 is returned.
+ * The xattr update is retried through ext4_should_retry_alloc() when it
+ * fails with -ENOSPC.
+ *
+ * inode->i_mutex: down
+ */
+int
+ext4_acl_chmod(struct inode *inode)
+{
+	struct posix_acl *acl;
+	handle_t *handle;
+	int retries = 0;
+	int error;
+
+
+	if (S_ISLNK(inode->i_mode))
+		return -EOPNOTSUPP;
+	if (!test_opt(inode->i_sb, POSIX_ACL))
+		return 0;
+	acl = ext4_get_acl(inode, ACL_TYPE_ACCESS);
+	/* PTR_ERR(NULL) is 0, so "no ACL" falls out as success here */
+	if (IS_ERR(acl) || !acl)
+		return PTR_ERR(acl);
+	error = posix_acl_chmod(&acl, GFP_KERNEL, inode->i_mode);
+	/*
+	 * NOTE(review): returns without posix_acl_release(); presumably
+	 * posix_acl_chmod() drops the ACL itself on failure -- confirm.
+	 */
+	if (error)
+		return error;
+retry:
+	handle = ext4_journal_start(inode,
+			EXT4_DATA_TRANS_BLOCKS(inode->i_sb));
+	if (IS_ERR(handle)) {
+		error = PTR_ERR(handle);
+		ext4_std_error(inode->i_sb, error);
+		goto out;
+	}
+	error = ext4_set_acl(handle, inode, ACL_TYPE_ACCESS, acl);
+	ext4_journal_stop(handle);
+	if (error == -ENOSPC &&
+	    ext4_should_retry_alloc(inode->i_sb, &retries))
+		goto retry;
+out:
+	posix_acl_release(acl);
+	return error;
+}
+
+/*
+ * Extended attribute handlers
+ */
+
+/*
+ * List callback for the access ACL: copies the attribute name into @list
+ * when a buffer is supplied and large enough, and returns the number of
+ * bytes the name needs (sizeof() of the name macro) either way.  Returns
+ * 0 when ACLs are disabled by mount option.
+ */
+static size_t
+ext4_xattr_list_acl_access(struct dentry *dentry, char *list, size_t list_len,
+			   const char *name, size_t name_len, int type)
+{
+	const size_t size = sizeof(POSIX_ACL_XATTR_ACCESS);
+
+	if (!test_opt(dentry->d_sb, POSIX_ACL))
+		return 0;
+	if (list && size <= list_len)
+		memcpy(list, POSIX_ACL_XATTR_ACCESS, size);
+	return size;
+}
+
+/* Same as ext4_xattr_list_acl_access(), for the default ACL name. */
+static size_t
+ext4_xattr_list_acl_default(struct dentry *dentry, char *list, size_t list_len,
+			    const char *name, size_t name_len, int type)
+{
+	const size_t size = sizeof(POSIX_ACL_XATTR_DEFAULT);
+
+	if (!test_opt(dentry->d_sb, POSIX_ACL))
+		return 0;
+	if (list && size <= list_len)
+		memcpy(list, POSIX_ACL_XATTR_DEFAULT, size);
+	return size;
+}
+
+/*
+ * Get callback shared by both ACL handlers; @type selects access or
+ * default.  Only the empty name suffix is accepted (-EINVAL otherwise).
+ * Returns -EOPNOTSUPP when ACLs are disabled, -ENODATA when the inode has
+ * no such ACL, otherwise the result of posix_acl_to_xattr().
+ */
+static int
+ext4_xattr_get_acl(struct dentry *dentry, const char *name, void *buffer,
+		   size_t size, int type)
+{
+	struct posix_acl *acl;
+	int error;
+
+	if (strcmp(name, "") != 0)
+		return -EINVAL;
+	if (!test_opt(dentry->d_sb, POSIX_ACL))
+		return -EOPNOTSUPP;
+
+	acl = ext4_get_acl(dentry->d_inode, type);
+	if (IS_ERR(acl))
+		return PTR_ERR(acl);
+	if (acl == NULL)
+		
return -ENODATA;
+	error = posix_acl_to_xattr(acl, buffer, size);
+	posix_acl_release(acl);
+
+	return error;
+}
+
+/*
+ * Set callback shared by both ACL handlers; @type selects access or
+ * default.  A NULL @value passes a NULL ACL down to ext4_set_acl().
+ *
+ * Returns -EINVAL for a non-empty name suffix, -EOPNOTSUPP when ACLs are
+ * disabled by mount option, -EPERM when the caller is neither owner nor
+ * capable, the error from decoding or validating @value, or the result
+ * of ext4_set_acl().  The update is retried through
+ * ext4_should_retry_alloc() when it fails with -ENOSPC.
+ */
+static int
+ext4_xattr_set_acl(struct dentry *dentry, const char *name, const void *value,
+		   size_t size, int flags, int type)
+{
+	struct inode *inode = dentry->d_inode;
+	handle_t *handle;
+	struct posix_acl *acl;
+	int error, retries = 0;
+
+	if (strcmp(name, "") != 0)
+		return -EINVAL;
+	if (!test_opt(inode->i_sb, POSIX_ACL))
+		return -EOPNOTSUPP;
+	if (!inode_owner_or_capable(inode))
+		return -EPERM;
+
+	if (value) {
+		acl = posix_acl_from_xattr(value, size);
+		if (IS_ERR(acl))
+			return PTR_ERR(acl);
+		else if (acl) {
+			error = posix_acl_valid(acl);
+			if (error)
+				goto release_and_out;
+		}
+	} else
+		acl = NULL;
+
+retry:
+	handle = ext4_journal_start(inode, EXT4_DATA_TRANS_BLOCKS(inode->i_sb));
+	if (IS_ERR(handle)) {
+		/*
+		 * Leave through the common exit so the ACL decoded above is
+		 * released; returning directly here would leak it.
+		 */
+		error = PTR_ERR(handle);
+		goto release_and_out;
+	}
+	error = ext4_set_acl(handle, inode, type, acl);
+	ext4_journal_stop(handle);
+	if (error == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries))
+		goto retry;
+
+release_and_out:
+	posix_acl_release(acl);
+	return error;
+}
+
+const struct xattr_handler ext4_xattr_acl_access_handler = {
+	.prefix	= POSIX_ACL_XATTR_ACCESS,
+	.flags	= ACL_TYPE_ACCESS,
+	.list	= ext4_xattr_list_acl_access,
+	.get	= ext4_xattr_get_acl,
+	.set	= ext4_xattr_set_acl,
+};
+
+const struct xattr_handler ext4_xattr_acl_default_handler = {
+	.prefix	= POSIX_ACL_XATTR_DEFAULT,
+	.flags	= ACL_TYPE_DEFAULT,
+	.list	= ext4_xattr_list_acl_default,
+	.get	= ext4_xattr_get_acl,
+	.set	= ext4_xattr_set_acl,
+};
diff --git a/fs/ext4/acl.h b/fs/ext4/acl.h
new file mode 100644
index 00000000..18cb39ed
--- /dev/null
+++ b/fs/ext4/acl.h
@@ -0,0 +1,77 @@
+/*
+  File: fs/ext4/acl.h
+
+  (C) 2001 Andreas Gruenbacher,
+*/
+
+#include
+
+#define EXT4_ACL_VERSION	0x0001
+
+/* On-disk long entry: used for tags that carry an id. */
+typedef struct {
+	__le16		e_tag;
+	__le16		e_perm;
+	__le32		e_id;
+} ext4_acl_entry;
+
+/* On-disk short entry: tag and permissions only. */
+typedef struct {
+	__le16		e_tag;
+	__le16		e_perm;
+} ext4_acl_entry_short;
+
+typedef struct
{
+	__le32		a_version;
+} ext4_acl_header;
+
+/*
+ * On-disk size in bytes of an ACL with @count entries: the header, up to
+ * four entries in the short form, and every further entry in the long
+ * form.
+ */
+static inline size_t ext4_acl_size(int count)
+{
+	if (count <= 4) {
+		return sizeof(ext4_acl_header) +
+		       count * sizeof(ext4_acl_entry_short);
+	} else {
+		return sizeof(ext4_acl_header) +
+		       4 * sizeof(ext4_acl_entry_short) +
+		       (count - 4) * sizeof(ext4_acl_entry);
+	}
+}
+
+/*
+ * Inverse of ext4_acl_size(): number of entries in an on-disk ACL of
+ * @size bytes (header included), or -1 if @size does not correspond to a
+ * whole number of entries.  @size is unsigned and the header size is
+ * subtracted unconditionally, so callers must ensure
+ * @size >= sizeof(ext4_acl_header) first.
+ */
+static inline int ext4_acl_count(size_t size)
+{
+	ssize_t s;
+	size -= sizeof(ext4_acl_header);
+	/* s: bytes left after four short entries (negative if fewer) */
+	s = size - 4 * sizeof(ext4_acl_entry_short);
+	if (s < 0) {
+		if (size % sizeof(ext4_acl_entry_short))
+			return -1;
+		return size / sizeof(ext4_acl_entry_short);
+	} else {
+		if (s % sizeof(ext4_acl_entry))
+			return -1;
+		return s / sizeof(ext4_acl_entry) + 4;
+	}
+}
+
+#ifdef CONFIG_EXT4_FS_POSIX_ACL
+
+/* acl.c */
+struct posix_acl *ext4_get_acl(struct inode *inode, int type);
+extern int ext4_acl_chmod(struct inode *);
+extern int ext4_init_acl(handle_t *, struct inode *, struct inode *);
+
+#else  /* CONFIG_EXT4_FS_POSIX_ACL */
+#include
+#define ext4_get_acl NULL
+
+/* Without ACL support chmod has nothing to propagate. */
+static inline int
+ext4_acl_chmod(struct inode *inode)
+{
+	return 0;
+}
+
+/* Without ACL support a new inode has no ACL to inherit. */
+static inline int
+ext4_init_acl(handle_t *handle, struct inode *inode, struct inode *dir)
+{
+	return 0;
+}
+#endif  /* CONFIG_EXT4_FS_POSIX_ACL */
+
diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c
new file mode 100644
index 00000000..8da837be
--- /dev/null
+++ b/fs/ext4/balloc.c
@@ -0,0 +1,766 @@
+/*
+ *  linux/fs/ext4/balloc.c
+ *
+ * Copyright (C) 1992, 1993, 1994, 1995
+ * Remy Card (card@masi.ibp.fr)
+ * Laboratoire MASI - Institut Blaise Pascal
+ * Universite Pierre et Marie Curie (Paris VI)
+ *
+ *  Enhanced block allocation by Stephen Tweedie (sct@redhat.com), 1993
+ *  Big-endian to little-endian byte-swapping/bitmaps by
+ *        David S.
Miller (davem@caip.rutgers.edu), 1995
+ */
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include "ext4.h"
+#include "ext4_jbd2.h"
+#include "mballoc.h"
+
+#include
+
+static unsigned ext4_num_base_meta_clusters(struct super_block *sb,
+					    ext4_group_t block_group);
+/*
+ * balloc.c contains the blocks allocation and deallocation routines
+ */
+
+/*
+ * Calculate the block group number and offset into the block/cluster
+ * allocation bitmap, given a block number
+ *
+ * Either output pointer may be NULL when the caller does not need that
+ * value.
+ */
+void ext4_get_group_no_and_offset(struct super_block *sb, ext4_fsblk_t blocknr,
+		ext4_group_t *blockgrpp, ext4_grpblk_t *offsetp)
+{
+	struct ext4_super_block *es = EXT4_SB(sb)->s_es;
+	ext4_grpblk_t offset;
+
+	blocknr = blocknr - le32_to_cpu(es->s_first_data_block);
+	/*
+	 * do_div() is relied on to leave the quotient (the group number)
+	 * in blocknr and to yield the remainder, which is then shifted
+	 * down by s_cluster_bits to give a cluster offset.
+	 */
+	offset = do_div(blocknr, EXT4_BLOCKS_PER_GROUP(sb)) >>
+		EXT4_SB(sb)->s_cluster_bits;
+	if (offsetp)
+		*offsetp = offset;
+	if (blockgrpp)
+		*blockgrpp = blocknr;
+
+}
+
+/* Return 1 if @block lies inside @block_group, 0 otherwise. */
+static int ext4_block_in_group(struct super_block *sb, ext4_fsblk_t block,
+			ext4_group_t block_group)
+{
+	ext4_group_t actual_group;
+	ext4_get_group_no_and_offset(sb, block, &actual_group, NULL);
+	if (actual_group == block_group)
+		return 1;
+	return 0;
+}
+
+/* Return the number of clusters used for file system metadata; this
+ * represents the overhead needed by the file system.
+ *
+ * The count starts from ext4_num_base_meta_clusters() and adds one
+ * cluster for each bitmap or inode-table cluster of @gdp that lies in
+ * @block_group and is not already covered.
+ */
+unsigned ext4_num_overhead_clusters(struct super_block *sb,
+				    ext4_group_t block_group,
+				    struct ext4_group_desc *gdp)
+{
+	unsigned num_clusters;
+	/* -1 means "nothing to account separately" for that structure */
+	int block_cluster = -1, inode_cluster = -1, itbl_cluster = -1, i, c;
+	ext4_fsblk_t start = ext4_group_first_block_no(sb, block_group);
+	ext4_fsblk_t itbl_blk;
+	struct ext4_sb_info *sbi = EXT4_SB(sb);
+
+	/* This is the number of clusters used by the superblock,
+	 * block group descriptors, and reserved block group
+	 * descriptor blocks */
+	num_clusters = ext4_num_base_meta_clusters(sb, block_group);
+
+	/*
+	 * For the allocation bitmaps and inode table, we first need
+	 * to check to see if the block is in the block group.  If it
+	 * is, then check to see if the cluster is already accounted
+	 * for in the clusters used for the base metadata cluster, or
+	 * if we can increment the base metadata cluster to include
+	 * that block.  Otherwise, we will have to track the cluster
+	 * used for the allocation bitmap or inode table explicitly.
+	 * Normally all of these blocks are contiguous, so the special
+	 * case handling shouldn't be necessary except for *very*
+	 * unusual file system layouts.
+	 */
+	if (ext4_block_in_group(sb, ext4_block_bitmap(sb, gdp), block_group)) {
+		block_cluster = EXT4_B2C(sbi,
+					 ext4_block_bitmap(sb, gdp) - start);
+		if (block_cluster < num_clusters)
+			block_cluster = -1;
+		else if (block_cluster == num_clusters) {
+			num_clusters++;
+			block_cluster = -1;
+		}
+	}
+
+	if (ext4_block_in_group(sb, ext4_inode_bitmap(sb, gdp), block_group)) {
+		inode_cluster = EXT4_B2C(sbi,
+					 ext4_inode_bitmap(sb, gdp) - start);
+		if (inode_cluster < num_clusters)
+			inode_cluster = -1;
+		else if (inode_cluster == num_clusters) {
+			num_clusters++;
+			inode_cluster = -1;
+		}
+	}
+
+	itbl_blk = ext4_inode_table(sb, gdp);
+	for (i = 0; i < sbi->s_itb_per_group; i++) {
+		if (ext4_block_in_group(sb, itbl_blk + i, block_group)) {
+			c = EXT4_B2C(sbi, itbl_blk + i - start);
+			/* skip clusters that were counted already */
+			if ((c < num_clusters) || (c == inode_cluster) ||
+			    (c == block_cluster) || (c == itbl_cluster))
+				continue;
+			if (c == num_clusters) {
+				num_clusters++;
+				continue;
+			}
+			/* detached cluster: count it and remember it */
+			num_clusters++;
+			itbl_cluster = c;
+		}
+	}
+
+	/* bitmaps that sit in their own, detached clusters */
+	if (block_cluster != -1)
+		num_clusters++;
+	if (inode_cluster != -1)
+		num_clusters++;
+
+	return num_clusters;
+}
+
+/*
+ * Number of clusters in @block_group; the last group is sized from the
+ * superblock's total block count instead of EXT4_BLOCKS_PER_GROUP().
+ */
+static unsigned int num_clusters_in_group(struct super_block *sb,
+					  ext4_group_t block_group)
+{
+	unsigned int blocks;
+
+	if (block_group == ext4_get_groups_count(sb) - 1) {
+		/*
+		 * Even though mke2fs always initializes the first and
+		 * last group, just in case some other tool was used,
+		 * we need to make sure we calculate the right free
+		 * blocks.
+		 */
+		blocks = ext4_blocks_count(EXT4_SB(sb)->s_es) -
+			ext4_group_first_block_no(sb, block_group);
+	} else
+		blocks = EXT4_BLOCKS_PER_GROUP(sb);
+	return EXT4_NUM_B2C(EXT4_SB(sb), blocks);
+}
+
+/*
+ * Initializes an uninitialized block bitmap
+ *
+ * @bh must be locked by the caller (asserted below).  With a bad group
+ * descriptor checksum the group's free counters are zeroed and the whole
+ * bitmap is set to "in use"; otherwise only the metadata clusters and the
+ * padding past the end of the group are marked.
+ */
+void ext4_init_block_bitmap(struct super_block *sb, struct buffer_head *bh,
+			    ext4_group_t block_group,
+			    struct ext4_group_desc *gdp)
+{
+	unsigned int bit, bit_max;
+	struct ext4_sb_info *sbi = EXT4_SB(sb);
+	ext4_fsblk_t start, tmp;
+	int flex_bg = 0;
+
+	J_ASSERT_BH(bh, buffer_locked(bh));
+
+	/* If checksum is bad mark all blocks used to prevent allocation
+	 * essentially implementing a per-group read-only flag. */
+	if (!ext4_group_desc_csum_verify(sbi, block_group, gdp)) {
+		ext4_error(sb, "Checksum bad for group %u", block_group);
+		ext4_free_group_clusters_set(sb, gdp, 0);
+		ext4_free_inodes_set(sb, gdp, 0);
+		ext4_itable_unused_set(sb, gdp, 0);
+		memset(bh->b_data, 0xff, sb->s_blocksize);
+		return;
+	}
+	memset(bh->b_data, 0, sb->s_blocksize);
+
+	/* base metadata occupies the first bit_max clusters of the group */
+	bit_max = ext4_num_base_meta_clusters(sb, block_group);
+	for (bit = 0; bit < bit_max; bit++)
+		ext4_set_bit(bit, bh->b_data);
+
+	start = ext4_group_first_block_no(sb, block_group);
+
+	if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_FLEX_BG))
+		flex_bg = 1;
+
+	/*
+	 * Set bits for block and inode bitmaps, and inode table.  Without
+	 * flex_bg the blocks are marked unconditionally; with it, only
+	 * those that fall inside this group.
+	 */
+	tmp = ext4_block_bitmap(sb, gdp);
+	if (!flex_bg || ext4_block_in_group(sb, tmp, block_group))
+		ext4_set_bit(EXT4_B2C(sbi, tmp - start), bh->b_data);
+
+	tmp = ext4_inode_bitmap(sb, gdp);
+	if (!flex_bg || ext4_block_in_group(sb, tmp, block_group))
+		ext4_set_bit(EXT4_B2C(sbi, tmp - start), bh->b_data);
+
+	tmp = ext4_inode_table(sb, gdp);
+	for (; tmp < ext4_inode_table(sb, gdp) +
+		     sbi->s_itb_per_group; tmp++) {
+		if (!flex_bg || ext4_block_in_group(sb, tmp, block_group))
+			ext4_set_bit(EXT4_B2C(sbi, tmp - start), bh->b_data);
+	}
+
+	/*
+	 * Also if the number of blocks within the group is less than
+	 * the blocksize * 8 ( which is the size of bitmap ), set rest
+	 * of the block bitmap to 1
+	 */
+	ext4_mark_bitmap_end(num_clusters_in_group(sb, block_group),
+			     sb->s_blocksize * 8, bh->b_data);
+}
+
+/* Return the number of free clusters in a block group.  It is used when
+ * the block bitmap is uninitialized, so we can't just count the bits
+ * in the bitmap.  The result is the group's cluster count minus its
+ * metadata overhead. */
+unsigned ext4_free_clusters_after_init(struct super_block *sb,
+				       ext4_group_t block_group,
+				       struct ext4_group_desc *gdp)
+{
+	return num_clusters_in_group(sb, block_group) -
+		ext4_num_overhead_clusters(sb, block_group, gdp);
+}
+
+/*
+ * The free blocks are managed by bitmaps.  A file system contains several
+ * blocks groups.  Each group contains 1 bitmap block for blocks, 1 bitmap
+ * block for inodes, N blocks for the inode table and data blocks.
+ *
+ * The file system contains group descriptors which are located after the
+ * super block.  Each descriptor contains the number of the bitmap block and
+ * the free blocks count in the block.  The descriptors are loaded in memory
+ * when a file system is mounted (see ext4_fill_super).
+ */
+
+/**
+ * ext4_get_group_desc() -- load group descriptor from disk
+ * @sb:			super block
+ * @block_group:	given block group
+ * @bh:			pointer to the buffer head to store the block
+ *			group descriptor
+ *
+ * Returns a pointer into the in-memory descriptor buffers, or NULL (after
+ * ext4_error()) if @block_group is out of range or its descriptor block
+ * is not loaded.  @bh may be NULL; when it is not, it receives the
+ * buffer_head that holds the descriptor.
+ */
+struct ext4_group_desc * ext4_get_group_desc(struct super_block *sb,
+					     ext4_group_t block_group,
+					     struct buffer_head **bh)
+{
+	unsigned int group_desc;
+	unsigned int offset;
+	ext4_group_t ngroups = ext4_get_groups_count(sb);
+	struct ext4_group_desc *desc;
+	struct ext4_sb_info *sbi = EXT4_SB(sb);
+
+	if (block_group >= ngroups) {
+		ext4_error(sb, "block_group >= groups_count - block_group = %u,"
+			   " groups_count = %u", block_group, ngroups);
+
+		return NULL;
+	}
+
+	/* group_desc: index of the descriptor block; offset: slot in it */
+	group_desc = block_group >> EXT4_DESC_PER_BLOCK_BITS(sb);
+	offset = block_group & (EXT4_DESC_PER_BLOCK(sb) - 1);
+	if (!sbi->s_group_desc[group_desc]) {
+		ext4_error(sb, "Group descriptor not loaded - "
+			   "block_group = %u, group_desc = %u, desc = %u",
+			   block_group, group_desc, offset);
+		return NULL;
+	}
+
+	desc = (struct ext4_group_desc *)(
+		(__u8 *)sbi->s_group_desc[group_desc]->b_data +
+		offset * EXT4_DESC_SIZE(sb));
+	if (bh)
+		*bh = sbi->s_group_desc[group_desc];
+	return desc;
+}
+
+/*
+ * Check that the block bitmap in @bh marks the group's own block bitmap,
+ * inode bitmap and inode table blocks as in use.  Returns 1 if it does
+ * (or if FLEX_BG is enabled, in which case nothing is checked), 0 after
+ * reporting through ext4_error() otherwise.
+ */
+static int ext4_valid_block_bitmap(struct super_block *sb,
+					struct ext4_group_desc *desc,
+					unsigned int block_group,
+					struct buffer_head *bh)
+{
+	ext4_grpblk_t offset;
+	ext4_grpblk_t next_zero_bit;
+	ext4_fsblk_t bitmap_blk;
+	ext4_fsblk_t group_first_block;
+
+	if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_FLEX_BG)) {
+		/* with FLEX_BG, the inode/block bitmaps and itable
+		 * blocks may not be in the group at all
+		 * so the bitmap validation will be skipped for those groups
+		 * or it has to also read the block group where the bitmaps
+		 * are located to verify they are set.
+		 */
+		return 1;
+	}
+	group_first_block = ext4_group_first_block_no(sb, block_group);
+
+	/* check whether block bitmap block number is set */
+	bitmap_blk = ext4_block_bitmap(sb, desc);
+	offset = bitmap_blk - group_first_block;
+	if (!ext4_test_bit(offset, bh->b_data))
+		/* bad block bitmap */
+		goto err_out;
+
+	/* check whether the inode bitmap block number is set */
+	bitmap_blk = ext4_inode_bitmap(sb, desc);
+	offset = bitmap_blk - group_first_block;
+	if (!ext4_test_bit(offset, bh->b_data))
+		/* bad block bitmap */
+		goto err_out;
+
+	/* check whether the inode table block number is set */
+	bitmap_blk = ext4_inode_table(sb, desc);
+	offset = bitmap_blk - group_first_block;
+	next_zero_bit = ext4_find_next_zero_bit(bh->b_data,
+				offset + EXT4_SB(sb)->s_itb_per_group,
+				offset);
+	if (next_zero_bit >= offset + EXT4_SB(sb)->s_itb_per_group)
+		/* good bitmap for inode tables */
+		return 1;
+
+err_out:
+	/* bitmap_blk still names the block whose check failed */
+	ext4_error(sb, "Invalid block bitmap - block_group = %d, block = %llu",
+			block_group, bitmap_blk);
+	return 0;
+}
+/**
+ * ext4_read_block_bitmap()
+ * @sb:			super block
+ * @block_group:	given block group
+ *
+ * Read the bitmap for a given block_group,and validate the
+ * bits for block/inode/inode tables are set in the bitmaps
+ *
+ * Return buffer_head on success or NULL in case of failure.
+ *
+ * This _nowait variant does not wait for I/O: when the bitmap has to be
+ * read from disk the buffer is returned with the read submitted and the
+ * buffer_new flag set, and ext4_wait_block_bitmap() completes the job.
+ * NULL is returned when the group descriptor or the buffer cannot be
+ * obtained.
+ */
+struct buffer_head *
+ext4_read_block_bitmap_nowait(struct super_block *sb, ext4_group_t block_group)
+{
+	struct ext4_group_desc *desc;
+	struct buffer_head *bh;
+	ext4_fsblk_t bitmap_blk;
+
+	desc = ext4_get_group_desc(sb, block_group, NULL);
+	if (!desc)
+		return NULL;
+	bitmap_blk = ext4_block_bitmap(sb, desc);
+	bh = sb_getblk(sb, bitmap_blk);
+	if (unlikely(!bh)) {
+		ext4_error(sb, "Cannot get buffer for block bitmap - "
+			   "block_group = %u, block_bitmap = %llu",
+			   block_group, bitmap_blk);
+		return NULL;
+	}
+
+	/* unlocked fast path, re-checked below under the buffer lock */
+	if (bitmap_uptodate(bh))
+		return bh;
+
+	lock_buffer(bh);
+	if (bitmap_uptodate(bh)) {
+		unlock_buffer(bh);
+		return bh;
+	}
+	ext4_lock_group(sb, block_group);
+	if (desc->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) {
+		/* never written to disk: build the bitmap in memory */
+		ext4_init_block_bitmap(sb, bh, block_group, desc);
+		set_bitmap_uptodate(bh);
+		set_buffer_uptodate(bh);
+		ext4_unlock_group(sb, block_group);
+		unlock_buffer(bh);
+		return bh;
+	}
+	ext4_unlock_group(sb, block_group);
+	if (buffer_uptodate(bh)) {
+		/*
+		 * if not uninit if bh is uptodate,
+		 * bitmap is also uptodate
+		 */
+		set_bitmap_uptodate(bh);
+		unlock_buffer(bh);
+		return bh;
+	}
+	/*
+	 * submit the buffer_head for reading
+	 *
+	 * The buffer is still locked here; buffer_new is the flag that
+	 * ext4_wait_block_bitmap() tests to know a read is pending.
+	 */
+	set_buffer_new(bh);
+	trace_ext4_read_block_bitmap_load(sb, block_group);
+	bh->b_end_io = ext4_end_bitmap_read;
+	get_bh(bh);
+	submit_bh(READ, bh);
+	return bh;
+}
+
+/*
+ * Returns 0 on success, 1 on error
+ *
+ * Completes a read started by ext4_read_block_bitmap_nowait().  @bh must
+ * not be NULL: it is dereferenced by buffer_new() right away.  A buffer
+ * without buffer_new set needs no waiting and yields 0 immediately.
+ */
+int ext4_wait_block_bitmap(struct super_block *sb, ext4_group_t block_group,
+			   struct buffer_head *bh)
+{
+	struct ext4_group_desc *desc;
+
+	if (!buffer_new(bh))
+		return 0;
+	desc = ext4_get_group_desc(sb, block_group, NULL);
+	if (!desc)
+		return 1;
+	wait_on_buffer(bh);
+	if (!buffer_uptodate(bh)) {
+		ext4_error(sb, "Cannot read block bitmap - "
+			   "block_group = %u, block_bitmap = %llu",
+			   block_group, (unsigned long long) bh->b_blocknr);
+		return 1;
+	}
+	clear_buffer_new(bh);
+	/* Panic or remount fs read-only if block bitmap is invalid */
+	ext4_valid_block_bitmap(sb, desc,
block_group, bh);
+	return 0;
+}
+
+/*
+ * Synchronous wrapper: start the bitmap read and wait for it.
+ *
+ * Returns the buffer_head on success, or NULL if the buffer could not be
+ * obtained or the read failed (in which case the reference is dropped).
+ */
+struct buffer_head *
+ext4_read_block_bitmap(struct super_block *sb, ext4_group_t block_group)
+{
+	struct buffer_head *bh;
+
+	bh = ext4_read_block_bitmap_nowait(sb, block_group);
+	/*
+	 * The nowait helper returns NULL on failure, and
+	 * ext4_wait_block_bitmap() dereferences its buffer unconditionally,
+	 * so bail out before handing it a NULL pointer.
+	 */
+	if (!bh)
+		return NULL;
+	if (ext4_wait_block_bitmap(sb, block_group, bh)) {
+		put_bh(bh);
+		return NULL;
+	}
+	return bh;
+}
+
+/**
+ * ext4_has_free_clusters()
+ * @sbi:	in-core super block structure.
+ * @nclusters:	number of needed clusters
+ * @flags:	flags from ext4_mb_new_blocks()
+ *
+ * Check if filesystem has nclusters free & available for allocation.
+ * On success return 1, return 0 on failure.
+ */
+static int ext4_has_free_clusters(struct ext4_sb_info *sbi,
+				  s64 nclusters, unsigned int flags)
+{
+	s64 free_clusters, dirty_clusters, root_clusters;
+	struct percpu_counter *fcc = &sbi->s_freeclusters_counter;
+	struct percpu_counter *dcc = &sbi->s_dirtyclusters_counter;
+
+	/* cheap, approximate per-cpu reads first */
+	free_clusters  = percpu_counter_read_positive(fcc);
+	dirty_clusters = percpu_counter_read_positive(dcc);
+	root_clusters = EXT4_B2C(sbi, ext4_r_blocks_count(sbi->s_es));
+
+	if (free_clusters - (nclusters + root_clusters + dirty_clusters) <
+					EXT4_FREECLUSTERS_WATERMARK) {
+		/*
+		 * Close to the limit: take the exact sums.  Both counters
+		 * are kept in clusters (the fast path above reads them
+		 * unconverted), so the sum must stay in clusters as well;
+		 * converting it to blocks would overstate the free space
+		 * whenever a cluster spans more than one block.
+		 */
+		free_clusters  = percpu_counter_sum_positive(fcc);
+		dirty_clusters = percpu_counter_sum_positive(dcc);
+	}
+	/* Check whether we have space after accounting for current
+	 * dirty clusters & root reserved clusters.
+	 */
+	if (free_clusters >= ((root_clusters + nclusters) + dirty_clusters))
+		return 1;
+
+	/* Hm, nope.  Are (enough) root reserved clusters available?
*/
+	if (sbi->s_resuid == current_fsuid() ||
+	    ((sbi->s_resgid != 0) && in_group_p(sbi->s_resgid)) ||
+	    capable(CAP_SYS_RESOURCE) ||
+	    (flags & EXT4_MB_USE_ROOT_BLOCKS)) {
+		/* Privileged request: root_clusters is left out of the test. */
+		if (free_clusters >= (nclusters + dirty_clusters))
+			return 1;
+	}
+
+	return 0;
+}
+/*
+ * ext4_claim_free_clusters() - account clusters for a pending allocation
+ * @sbi:	in-core super block structure
+ * @nclusters:	number of clusters to claim
+ * @flags:	passed through to ext4_has_free_clusters()
+ *
+ * When ext4_has_free_clusters() reports enough room, nclusters is added
+ * to s_dirtyclusters_counter and 0 is returned; otherwise -ENOSPC.
+ */
+int ext4_claim_free_clusters(struct ext4_sb_info *sbi,
+			     s64 nclusters, unsigned int flags)
+{
+	if (ext4_has_free_clusters(sbi, nclusters, flags)) {
+		percpu_counter_add(&sbi->s_dirtyclusters_counter, nclusters);
+		return 0;
+	} else
+		return -ENOSPC;
+}
+
+/**
+ * ext4_should_retry_alloc()
+ * @sb:			super block
+ * @retries:		number of attempts that have been made
+ *
+ * ext4_should_retry_alloc() is called when ENOSPC is returned, and if
+ * it is profitable to retry the operation, this function will wait
+ * for the current or committing transaction to complete, and then
+ * return TRUE.
+ *
+ * Returns 0 (do not retry) when not even one cluster is available, when
+ * *retries was already greater than 3 on entry, or when the filesystem
+ * has no journal.  *retries is incremented only on calls where the
+ * free-cluster test passed.  Otherwise the return value is whatever
+ * jbd2_journal_force_commit_nested() returns.
+ */
+int ext4_should_retry_alloc(struct super_block *sb, int *retries)
+{
+	if (!ext4_has_free_clusters(EXT4_SB(sb), 1, 0) ||
+	    (*retries)++ > 3 ||
+	    !EXT4_SB(sb)->s_journal)
+		return 0;
+
+	jbd_debug(1, "%s: retrying operation after ENOSPC\n", sb->s_id);
+
+	return jbd2_journal_force_commit_nested(EXT4_SB(sb)->s_journal);
+}
+
+/*
+ * ext4_new_meta_blocks() -- allocate block for meta data (indexing) blocks
+ *
+ * @handle:             handle to this transaction
+ * @inode:              file inode
+ * @goal:               given target block(filesystem wide)
+ * @flags:              allocation flags, copied into the request
+ * @count:		pointer to total number of clusters needed;
+ *			may be NULL, in which case one is requested
+ * @errp:               error code
+ *
+ * Return 1st allocated block number on success, *count stores total account
+ * error stores in errp pointer
+ */
+ext4_fsblk_t ext4_new_meta_blocks(handle_t *handle, struct inode *inode,
+				  ext4_fsblk_t goal, unsigned int flags,
+				  unsigned long *count, int *errp)
+{
+	struct ext4_allocation_request ar;
+	ext4_fsblk_t ret;
+
+	memset(&ar, 0, sizeof(ar));
+	/* Fill with neighbour allocated blocks */
+	ar.inode = inode;
+	ar.goal = goal;
+	ar.len = count ?
*count : 1;
+	ar.flags = flags;
+
+	ret = ext4_mb_new_blocks(handle, &ar, errp);
+	if (count)
+		*count = ar.len;	/* report the length left in ar.len */
+	/*
+	 * Account for the allocated meta blocks.  We will never
+	 * fail EDQUOT for metadata, but we do account for it.
+	 * This is done only when the allocation succeeded and the inode
+	 * is in EXT4_STATE_DELALLOC_RESERVED state.
+	 */
+	if (!(*errp) &&
+	    ext4_test_inode_state(inode, EXT4_STATE_DELALLOC_RESERVED)) {
+		spin_lock(&EXT4_I(inode)->i_block_reservation_lock);
+		EXT4_I(inode)->i_allocated_meta_blocks += ar.len;
+		spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
+		dquot_alloc_block_nofail(inode,
+				EXT4_C2B(EXT4_SB(inode->i_sb), ar.len));
+	}
+	return ret;
+}
+
+/**
+ * ext4_count_free_clusters() -- count filesystem free clusters
+ * @sb:		superblock
+ *
+ * Adds up the number of free clusters from each block group.
+ * Groups whose descriptor cannot be obtained are skipped.
+ *
+ * Without EXT4FS_DEBUG the sum of the per-group descriptor counts is
+ * returned.  With EXT4FS_DEBUG every block bitmap is read and counted
+ * with ext4_count_free() as well, both totals are printed, and the
+ * bitmap-derived total is returned instead.
+ */
+ext4_fsblk_t ext4_count_free_clusters(struct super_block *sb)
+{
+	ext4_fsblk_t desc_count;
+	struct ext4_group_desc *gdp;
+	ext4_group_t i;
+	ext4_group_t ngroups = ext4_get_groups_count(sb);
+#ifdef EXT4FS_DEBUG
+	struct ext4_super_block *es;
+	ext4_fsblk_t bitmap_count;
+	unsigned int x;
+	struct buffer_head *bitmap_bh = NULL;
+
+	es = EXT4_SB(sb)->s_es;
+	desc_count = 0;
+	bitmap_count = 0;
+	gdp = NULL;
+
+	for (i = 0; i < ngroups; i++) {
+		gdp = ext4_get_group_desc(sb, i, NULL);
+		if (!gdp)
+			continue;
+		desc_count += ext4_free_group_clusters(sb, gdp);
+		brelse(bitmap_bh);	/* release the previous bitmap */
+		bitmap_bh = ext4_read_block_bitmap(sb, i);
+		if (bitmap_bh == NULL)
+			continue;
+
+		x = ext4_count_free(bitmap_bh, sb->s_blocksize);
+		printk(KERN_DEBUG "group %u: stored = %d, counted = %u\n",
+			i, ext4_free_group_clusters(sb, gdp), x);
+		bitmap_count += x;
+	}
+	brelse(bitmap_bh);
+	printk(KERN_DEBUG "ext4_count_free_clusters: stored = %llu"
+	       ", computed = %llu, %llu\n",
+	       EXT4_B2C(EXT4_SB(sb), ext4_free_blocks_count(es)),
+	       desc_count, bitmap_count);
+	return bitmap_count;
+#else
+	desc_count = 0;
+	for (i = 0; i < ngroups; i++) {
+		gdp = ext4_get_group_desc(sb, i, NULL);
+		if (!gdp)
+			continue;
+		desc_count += ext4_free_group_clusters(sb, gdp);
+	}
+
+
return desc_count; +#endif +} + +static inline int test_root(ext4_group_t a, int b) +{ + int num = b; + + while (a > num) + num *= b; + return num == a; +} + +static int ext4_group_sparse(ext4_group_t group) +{ + if (group <= 1) + return 1; + if (!(group & 1)) + return 0; + return (test_root(group, 7) || test_root(group, 5) || + test_root(group, 3)); +} + +/** + * ext4_bg_has_super - number of blocks used by the superblock in group + * @sb: superblock for filesystem + * @group: group number to check + * + * Return the number of blocks used by the superblock (primary or backup) + * in this group. Currently this will be only 0 or 1. + */ +int ext4_bg_has_super(struct super_block *sb, ext4_group_t group) +{ + if (EXT4_HAS_RO_COMPAT_FEATURE(sb, + EXT4_FEATURE_RO_COMPAT_SPARSE_SUPER) && + !ext4_group_sparse(group)) + return 0; + return 1; +} + +static unsigned long ext4_bg_num_gdb_meta(struct super_block *sb, + ext4_group_t group) +{ + unsigned long metagroup = group / EXT4_DESC_PER_BLOCK(sb); + ext4_group_t first = metagroup * EXT4_DESC_PER_BLOCK(sb); + ext4_group_t last = first + EXT4_DESC_PER_BLOCK(sb) - 1; + + if (group == first || group == first + 1 || group == last) + return 1; + return 0; +} + +static unsigned long ext4_bg_num_gdb_nometa(struct super_block *sb, + ext4_group_t group) +{ + if (!ext4_bg_has_super(sb, group)) + return 0; + + if (EXT4_HAS_INCOMPAT_FEATURE(sb,EXT4_FEATURE_INCOMPAT_META_BG)) + return le32_to_cpu(EXT4_SB(sb)->s_es->s_first_meta_bg); + else + return EXT4_SB(sb)->s_gdb_count; +} + +/** + * ext4_bg_num_gdb - number of blocks used by the group table in group + * @sb: superblock for filesystem + * @group: group number to check + * + * Return the number of blocks used by the group descriptor table + * (primary or backup) in this group. In the future there may be a + * different number of descriptor blocks in each group. 
+ */
+unsigned long ext4_bg_num_gdb(struct super_block *sb, ext4_group_t group)
+{
+	unsigned long first_meta_bg =
+			le32_to_cpu(EXT4_SB(sb)->s_es->s_first_meta_bg);
+	unsigned long metagroup = group / EXT4_DESC_PER_BLOCK(sb);
+	/* No META_BG, or a meta group below s_first_meta_bg: classic layout. */
+	if (!EXT4_HAS_INCOMPAT_FEATURE(sb,EXT4_FEATURE_INCOMPAT_META_BG) ||
+			metagroup < first_meta_bg)
+		return ext4_bg_num_gdb_nometa(sb, group);
+
+	return ext4_bg_num_gdb_meta(sb,group);
+
+}
+
+/*
+ * This function returns the number of file system metadata clusters at
+ * the beginning of a block group, including the reserved gdt blocks.
+ *
+ * The count starts from ext4_bg_has_super().  For groups laid out the
+ * classic way (no META_BG, or group below s_first_meta_bg *
+ * s_desc_per_block) the descriptor blocks and s_reserved_gdt_blocks are
+ * added only when the group holds a superblock copy.  For META_BG groups
+ * only ext4_bg_num_gdb() is added.  The block total is converted with
+ * EXT4_NUM_B2C() before being returned.
+ */
+static unsigned ext4_num_base_meta_clusters(struct super_block *sb,
+				     ext4_group_t block_group)
+{
+	struct ext4_sb_info *sbi = EXT4_SB(sb);
+	unsigned num;
+
+	/* Check for superblock and gdt backups in this group */
+	num = ext4_bg_has_super(sb, block_group);
+
+	if (!EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_META_BG) ||
+	    block_group < le32_to_cpu(sbi->s_es->s_first_meta_bg) *
+			  sbi->s_desc_per_block) {
+		if (num) {
+			num += ext4_bg_num_gdb(sb, block_group);
+			num += le16_to_cpu(sbi->s_es->s_reserved_gdt_blocks);
+		}
+	} else { /* For META_BG_BLOCK_GROUPS */
+		num += ext4_bg_num_gdb(sb, block_group);
+	}
+	return EXT4_NUM_B2C(sbi, num);
+}
+/**
+ *	ext4_inode_to_goal_block - return a hint for block allocation
+ *	@inode: inode for block allocation
+ *
+ *	Return the ideal location to start allocating blocks for a
+ *	newly created inode.
+ *
+ *	The result is the first block of the chosen group, plus a
+ *	pid-derived offset unless the DELALLOC mount option is set.
+ */
+ext4_fsblk_t ext4_inode_to_goal_block(struct inode *inode)
+{
+	struct ext4_inode_info *ei = EXT4_I(inode);
+	ext4_group_t block_group;
+	ext4_grpblk_t colour;
+	int flex_size = ext4_flex_bg_size(EXT4_SB(inode->i_sb));
+	ext4_fsblk_t bg_start;
+	ext4_fsblk_t last_block;
+
+	block_group = ei->i_block_group;
+	if (flex_size >= EXT4_FLEX_SIZE_DIR_ALLOC_SCHEME) {
+		/*
+		 * If there are at least EXT4_FLEX_SIZE_DIR_ALLOC_SCHEME
+		 * block groups per flexgroup, reserve the first block
+		 * group for directories and special files.
Regular
+		 * files will start at the second block group.  This
+		 * tends to speed up directory access and improves
+		 * fsck times.
+		 */
+		block_group &= ~(flex_size-1);	/* first group of the flexgroup */
+		if (S_ISREG(inode->i_mode))
+			block_group++;
+	}
+	bg_start = ext4_group_first_block_no(inode->i_sb, block_group);
+	last_block = ext4_blocks_count(EXT4_SB(inode->i_sb)->s_es) - 1;
+
+	/*
+	 * If we are doing delayed allocation, we don't need take
+	 * colour into account.
+	 */
+	if (test_opt(inode->i_sb, DELALLOC))
+		return bg_start;
+	/* Otherwise offset the goal by one of 16 pid-selected slots. */
+	if (bg_start + EXT4_BLOCKS_PER_GROUP(inode->i_sb) <= last_block)
+		colour = (current->pid % 16) *
+			(EXT4_BLOCKS_PER_GROUP(inode->i_sb) / 16);
+	else	/* short last group: slots sized from the remaining blocks */
+		colour = (current->pid % 16) * ((last_block - bg_start) / 16);
+	return bg_start + colour;
+}
+
diff --git a/fs/ext4/bitmap.c b/fs/ext4/bitmap.c
new file mode 100644
index 00000000..fa3af81a
--- /dev/null
+++ b/fs/ext4/bitmap.c
@@ -0,0 +1,31 @@
+/*
+ *  linux/fs/ext4/bitmap.c
+ *
+ * Copyright (C) 1992, 1993, 1994, 1995
+ * Remy Card (card@masi.ibp.fr)
+ * Laboratoire MASI - Institut Blaise Pascal
+ * Universite Pierre et Marie Curie (Paris VI)
+ */
+
+#include
+#include
+#include "ext4.h"
+
+#ifdef EXT4FS_DEBUG
+/* nibblemap[v] is the number of zero bits in the 4-bit value v. */
+static const int nibblemap[] = {4, 3, 3, 2, 3, 2, 2, 1, 3, 2, 2, 1, 2, 1, 1, 0};
+/* Count zero bits in the first numchars bytes of map; 0 if map is NULL. */
+unsigned int ext4_count_free(struct buffer_head *map, unsigned int numchars)
+{
+	unsigned int i, sum = 0;
+
+	if (!map)
+		return 0;
+	for (i = 0; i < numchars; i++)	/* low nibble, then high nibble */
+		sum += nibblemap[map->b_data[i] & 0xf] +
+			nibblemap[(map->b_data[i] >> 4) & 0xf];
+	return sum;
+}
+
+#endif  /*  EXT4FS_DEBUG  */
+
diff --git a/fs/ext4/block_validity.c b/fs/ext4/block_validity.c
new file mode 100644
index 00000000..3f11656b
--- /dev/null
+++ b/fs/ext4/block_validity.c
@@ -0,0 +1,268 @@
+/*
+ *  linux/fs/ext4/block_validity.c
+ *
+ * Copyright (C) 2009
+ * Theodore Ts'o (tytso@mit.edu)
+ *
+ * Track which blocks in the filesystem are metadata blocks that
+ * should never be used as data blocks by files or directories.
+ */
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include "ext4.h"
+/* One contiguous run of metadata blocks: [start_blk, start_blk + count). */
+struct ext4_system_zone {
+	struct rb_node	node;		/* link in sbi->system_blks */
+	ext4_fsblk_t	start_blk;	/* first block of the run */
+	unsigned int	count;		/* number of blocks in the run */
+};
+/* Slab cache that backs every struct ext4_system_zone allocation. */
+static struct kmem_cache *ext4_system_zone_cachep;
+/* Create the zone slab cache; returns 0, or -ENOMEM on failure. */
+int __init ext4_init_system_zone(void)
+{
+	ext4_system_zone_cachep = KMEM_CACHE(ext4_system_zone, 0);
+	if (ext4_system_zone_cachep == NULL)
+		return -ENOMEM;
+	return 0;
+}
+/* Destroy the slab cache created by ext4_init_system_zone(). */
+void ext4_exit_system_zone(void)
+{
+	kmem_cache_destroy(ext4_system_zone_cachep);
+}
+/* Returns 1 when entry2 starts exactly where entry1 ends, else 0. */
+static inline int can_merge(struct ext4_system_zone *entry1,
+		     struct ext4_system_zone *entry2)
+{
+	if ((entry1->start_blk + entry1->count) == entry2->start_blk)
+		return 1;
+	return 0;
+}
+
+/*
+ * Mark a range of blocks as belonging to the "system zone" --- that
+ * is, filesystem metadata blocks which should never be used by
+ * inodes.
+ *
+ * The tree is searched by start_blk.  If start_blk falls inside an
+ * existing zone, that zone is grown when the new range ends beyond it
+ * and no node is allocated.  Otherwise a new node is allocated from
+ * ext4_system_zone_cachep and inserted.  In both cases the resulting
+ * zone is then merged with a directly adjacent left and right neighbour.
+ *
+ * Returns 0 on success, -ENOMEM if the node cannot be allocated.
+ */
+static int add_system_zone(struct ext4_sb_info *sbi,
+			   ext4_fsblk_t start_blk,
+			   unsigned int count)
+{
+	struct ext4_system_zone *new_entry = NULL, *entry;
+	struct rb_node **n = &sbi->system_blks.rb_node, *node;
+	struct rb_node *parent = NULL, *new_node = NULL;
+
+	while (*n) {
+		parent = *n;
+		entry = rb_entry(parent, struct ext4_system_zone, node);
+		if (start_blk < entry->start_blk)
+			n = &(*n)->rb_left;
+		else if (start_blk >= (entry->start_blk + entry->count))
+			n = &(*n)->rb_right;
+		else {	/* start_blk lies inside this zone: reuse it */
+			if (start_blk + count > (entry->start_blk +
+						 entry->count))
+				entry->count = (start_blk + count -
+						entry->start_blk);
+			new_node = *n;
+			new_entry = rb_entry(new_node, struct ext4_system_zone,
+					     node);
+			break;
+		}
+	}
+
+	if (!new_entry) {
+		new_entry = kmem_cache_alloc(ext4_system_zone_cachep,
+					     GFP_KERNEL);
+		if (!new_entry)
+			return -ENOMEM;
+		new_entry->start_blk = start_blk;
+		new_entry->count = count;
+		new_node = &new_entry->node;
+
+		rb_link_node(new_node, parent, n);
+		rb_insert_color(new_node, &sbi->system_blks);
+	}
+
+	/* Can we merge to the left?
*/
+	node = rb_prev(new_node);
+	if (node) {
+		entry = rb_entry(node, struct ext4_system_zone, node);
+		if (can_merge(entry, new_entry)) {
+			/* absorb the left neighbour, then free it */
+			new_entry->start_blk = entry->start_blk;
+			new_entry->count += entry->count;
+			rb_erase(node, &sbi->system_blks);
+			kmem_cache_free(ext4_system_zone_cachep, entry);
+		}
+	}
+
+	/* Can we merge to the right? */
+	node = rb_next(new_node);
+	if (node) {
+		entry = rb_entry(node, struct ext4_system_zone, node);
+		if (can_merge(new_entry, entry)) {
+			new_entry->count += entry->count;
+			rb_erase(node, &sbi->system_blks);
+			kmem_cache_free(ext4_system_zone_cachep, entry);
+		}
+	}
+	return 0;
+}
+
+/* Print every zone of sbi->system_blks as first-last, in tree order. */
+static void debug_print_tree(struct ext4_sb_info *sbi)
+{
+	struct rb_node *node;
+	struct ext4_system_zone *entry;
+	int first = 1;
+
+	printk(KERN_INFO "System zones: ");
+	node = rb_first(&sbi->system_blks);
+	while (node) {
+		entry = rb_entry(node, struct ext4_system_zone, node);
+		printk("%s%llu-%llu", first ? "" : ", ",
+		       entry->start_blk, entry->start_blk + entry->count - 1);
+		first = 0;
+		node = rb_next(node);
+	}
+	printk("\n");
+}
+
+/*
+ * ext4_setup_system_zone() - build the tree of metadata block ranges
+ * @sb:	super block
+ *
+ * Without the BLOCK_VALIDITY mount option any existing tree is released
+ * and 0 is returned.  With it, an already populated tree is left alone.
+ * Otherwise each group contributes its block bitmap, inode bitmap and
+ * inode table, and selected groups that hold a superblock copy also
+ * contribute the superblock plus descriptor blocks.
+ *
+ * Returns 0 on success or the error from add_system_zone().
+ */
+int ext4_setup_system_zone(struct super_block *sb)
+{
+	ext4_group_t ngroups = ext4_get_groups_count(sb);
+	struct ext4_sb_info *sbi = EXT4_SB(sb);
+	struct ext4_group_desc *gdp;
+	ext4_group_t i;
+	int flex_size = ext4_flex_bg_size(sbi);
+	int ret;
+
+	if (!test_opt(sb, BLOCK_VALIDITY)) {
+		if (EXT4_SB(sb)->system_blks.rb_node)
+			ext4_release_system_zone(sb);
+		return 0;
+	}
+	if (EXT4_SB(sb)->system_blks.rb_node)
+		return 0;
+
+	for (i=0; i < ngroups; i++) {
+		if (ext4_bg_has_super(sb, i) &&
+		    ((i < 5) || ((i % flex_size) == 0))) {
+			/*
+			 * A failed insert here would silently leave the
+			 * superblock and descriptor blocks unprotected,
+			 * so propagate it like the calls below do.
+			 */
+			ret = add_system_zone(sbi,
+					ext4_group_first_block_no(sb, i),
+					ext4_bg_num_gdb(sb, i) + 1);
+			if (ret)
+				return ret;
+		}
+		gdp = ext4_get_group_desc(sb, i, NULL);
+		ret = add_system_zone(sbi, ext4_block_bitmap(sb, gdp), 1);
+		if (ret)
+			return ret;
+		ret = add_system_zone(sbi, ext4_inode_bitmap(sb, gdp), 1);
+		if (ret)
+			return ret;
+		ret = add_system_zone(sbi, ext4_inode_table(sb, gdp),
sbi->s_itb_per_group);
+		if (ret)
+			return ret;
+	}
+	/* Dump the finished tree when the DEBUG mount option is set. */
+	if (test_opt(sb, DEBUG))
+		debug_print_tree(EXT4_SB(sb));
+	return 0;
+}
+
+/*
+ * Called when the filesystem is unmounted
+ *
+ * Frees every zone without recursion: the walk descends to a leaf,
+ * frees it, clears the link in its parent and continues from the
+ * parent.  The tree is left as RB_ROOT.
+ */
+void ext4_release_system_zone(struct super_block *sb)
+{
+	struct rb_node	*n = EXT4_SB(sb)->system_blks.rb_node;
+	struct rb_node	*parent;
+	struct ext4_system_zone	*entry;
+
+	while (n) {
+		/* Do the node's children first */
+		if (n->rb_left) {
+			n = n->rb_left;
+			continue;
+		}
+		if (n->rb_right) {
+			n = n->rb_right;
+			continue;
+		}
+		/*
+		 * The node has no children; free it, and then zero
+		 * out parent's link to it.  Finally go to the
+		 * beginning of the loop and try to free the parent
+		 * node.
+		 */
+		parent = rb_parent(n);
+		entry = rb_entry(n, struct ext4_system_zone, node);
+		kmem_cache_free(ext4_system_zone_cachep, entry);
+		if (!parent)
+			EXT4_SB(sb)->system_blks = RB_ROOT;
+		else if (parent->rb_left == n)
+			parent->rb_left = NULL;
+		else if (parent->rb_right == n)
+			parent->rb_right = NULL;
+		n = parent;
+	}
+	EXT4_SB(sb)->system_blks = RB_ROOT;
+}
+
+/*
+ * Returns 1 if the passed-in block region (start_blk,
+ * start_blk+count) is valid; 0 if some part of the block region
+ * overlaps with filesystem metadata blocks.
+ */
+int ext4_data_block_valid(struct ext4_sb_info *sbi, ext4_fsblk_t start_blk,
+			  unsigned int count)
+{
+	struct ext4_system_zone *entry;
+	struct rb_node *n = sbi->system_blks.rb_node;
+	/* Range must lie above s_first_data_block, not wrap, and end in-fs. */
+	if ((start_blk <= le32_to_cpu(sbi->s_es->s_first_data_block)) ||
+	    (start_blk + count < start_blk) ||
+	    (start_blk + count > ext4_blocks_count(sbi->s_es))) {
+		sbi->s_es->s_last_error_block = cpu_to_le64(start_blk);
+		return 0;
+	}
+	while (n) {	/* binary search for a zone overlapping the range */
+		entry = rb_entry(n, struct ext4_system_zone, node);
+		if (start_blk + count - 1 < entry->start_blk)
+			n = n->rb_left;
+		else if (start_blk >= (entry->start_blk + entry->count))
+			n = n->rb_right;
+		else {	/* overlap: record the offending start block */
+			sbi->s_es->s_last_error_block = cpu_to_le64(start_blk);
+			return 0;
+		}
+	}
+	return 1;
+}
+/*
+ * ext4_check_blockref() - validate an array of 32-bit block references
+ * @function:	caller name, forwarded to ext4_error_inode()
+ * @line:	caller line, forwarded to ext4_error_inode()
+ * @inode:	inode that owns the references
+ * @p:		first little-endian block number to check
+ * @max:	number of entries at @p
+ *
+ * Zero entries are skipped.  Returns 0 when every non-zero entry passes
+ * ext4_data_block_valid(); on the first one that does not, the block is
+ * stored in s_last_error_block, ext4_error_inode() is called and -EIO
+ * is returned.
+ */
+int ext4_check_blockref(const char *function, unsigned int line,
+			struct inode *inode, __le32 *p, unsigned int max)
+{
+	struct ext4_super_block *es = EXT4_SB(inode->i_sb)->s_es;
+	__le32 *bref = p;
+	unsigned int blk;
+
+	while (bref < p+max) {
+		blk = le32_to_cpu(*bref++);
+		if (blk &&
+		    unlikely(!ext4_data_block_valid(EXT4_SB(inode->i_sb),
+						    blk, 1))) {
+			es->s_last_error_block = cpu_to_le64(blk);
+			ext4_error_inode(inode, function, line, blk,
+					 "invalid block");
+			return -EIO;
+		}
+	}
+	return 0;
+}
+
diff --git a/fs/ext4/dir.c b/fs/ext4/dir.c
new file mode 100644
index 00000000..b8678620
--- /dev/null
+++ b/fs/ext4/dir.c
@@ -0,0 +1,667 @@
+/*
+ *  linux/fs/ext4/dir.c
+ *
+ * Copyright (C) 1992, 1993, 1994, 1995
+ * Remy Card (card@masi.ibp.fr)
+ * Laboratoire MASI - Institut Blaise Pascal
+ * Universite Pierre et Marie Curie (Paris VI)
+ *
+ *  from
+ *
+ *  linux/fs/minix/dir.c
+ *
+ *  Copyright (C) 1991, 1992  Linus Torvalds
+ *
+ *  ext4 directory handling functions
+ *
+ *  Big-endian to little-endian byte-swapping/bitmaps by
+ *        David S.
Miller (davem@caip.rutgers.edu), 1995
+ *
+ * Hash Tree Directory indexing (c) 2001  Daniel Phillips
+ *
+ */
+
+#include
+#include
+#include
+#include
+#include
+#include "ext4.h"
+/* Indexed by a directory entry file type below EXT4_FT_MAX; yields DT_*. */
+static unsigned char ext4_filetype_table[] = {
+	DT_UNKNOWN, DT_REG, DT_DIR, DT_CHR, DT_BLK, DT_FIFO, DT_SOCK, DT_LNK
+};
+
+static int ext4_dx_readdir(struct file *filp,
+			   void *dirent, filldir_t filldir);
+/*
+ * Translate a directory entry file type into a DT_* value.  DT_UNKNOWN
+ * is returned when the FILETYPE feature is off or when filetype is not
+ * below EXT4_FT_MAX.
+ */
+static unsigned char get_dtype(struct super_block *sb, int filetype)
+{
+	if (!EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_FILETYPE) ||
+	    (filetype >= EXT4_FT_MAX))
+		return DT_UNKNOWN;
+
+	return (ext4_filetype_table[filetype]);
+}
+
+/**
+ * Check if the given dir-inode refers to an htree-indexed directory
+ * (or a directory which could potentially get converted to use htree
+ * indexing).
+ *
+ * The DIR_INDEX feature must be set, and the inode must either carry
+ * EXT4_INODE_INDEX or have a size of exactly one block
+ * (i_size >> s_blocksize_bits == 1).
+ *
+ * Return 1 if it is a dx dir, 0 if not
+ */
+static int is_dx_dir(struct inode *inode)
+{
+	struct super_block *sb = inode->i_sb;
+
+	if (EXT4_HAS_COMPAT_FEATURE(inode->i_sb,
+		     EXT4_FEATURE_COMPAT_DIR_INDEX) &&
+	    ((ext4_test_inode_flag(inode, EXT4_INODE_INDEX)) ||
+	     ((inode->i_size >> sb->s_blocksize_bits) == 1)))
+		return 1;
+
+	return 0;
+}
+
+/*
+ * Return 0 if the directory entry is OK, and 1 if there is a problem
+ *
+ * Note: this is the opposite of what ext2 and ext3 historically returned...
+ */
+int __ext4_check_dir_entry(const char *function, unsigned int line,
+			   struct inode *dir, struct file *filp,
+			   struct ext4_dir_entry_2 *de,
+			   struct buffer_head *bh,
+			   unsigned int offset)
+{
+	const char *error_msg = NULL;
+	const int rlen = ext4_rec_len_from_disk(de->rec_len,
+						dir->i_sb->s_blocksize);
+	/* Checks run in order; the first one that fails picks the message. */
+	if (unlikely(rlen < EXT4_DIR_REC_LEN(1)))
+		error_msg = "rec_len is smaller than minimal";
+	else if (unlikely(rlen % 4 != 0))
+		error_msg = "rec_len % 4 != 0";
+	else if (unlikely(rlen < EXT4_DIR_REC_LEN(de->name_len)))
+		error_msg = "rec_len is too small for name_len";
+	else if (unlikely(((char *) de - bh->b_data) + rlen >
+			  dir->i_sb->s_blocksize))
+		error_msg = "directory entry across blocks";
+	else if (unlikely(le32_to_cpu(de->inode) >
+			le32_to_cpu(EXT4_SB(dir->i_sb)->s_es->s_inodes_count)))
+		error_msg = "inode out of bounds";
+	else
+		return 0;
+	/* Report through the file when one is given, else through the inode. */
+	if (filp)
+		ext4_error_file(filp, function, line, bh->b_blocknr,
+				"bad entry in directory: %s - offset=%u(%u), "
+				"inode=%u, rec_len=%d, name_len=%d",
+				error_msg, (unsigned) (offset % bh->b_size),
+				offset, le32_to_cpu(de->inode),
+				rlen, de->name_len);
+	else
+		ext4_error_inode(dir, function, line, bh->b_blocknr,
+				"bad entry in directory: %s - offset=%u(%u), "
+				"inode=%u, rec_len=%d, name_len=%d",
+				error_msg, (unsigned) (offset % bh->b_size),
+				offset, le32_to_cpu(de->inode),
+				rlen, de->name_len);
+
+	return 1;
+}
+/*
+ * readdir for ext4 directories.  A dx directory is first handed to
+ * ext4_dx_readdir(); only when that returns ERR_BAD_DX_DIR does the
+ * code clear EXT4_INODE_INDEX and fall through to the linear scan.
+ */
+static int ext4_readdir(struct file *filp,
+			 void *dirent, filldir_t filldir)
+{
+	int error = 0;
+	unsigned int offset;
+	int i, stored;
+	struct ext4_dir_entry_2 *de;
+	int err;
+	struct inode *inode = filp->f_path.dentry->d_inode;
+	struct super_block *sb = inode->i_sb;
+	int ret = 0;
+	int dir_has_error = 0;
+
+	if (is_dx_dir(inode)) {
+		err = ext4_dx_readdir(filp, dirent, filldir);
+		if (err != ERR_BAD_DX_DIR) {
+			ret = err;
+			goto out;
+		}
+		/*
+		 * We don't set the inode dirty flag since it's not
+		 * critical that it get flushed back to the disk.
+		 */
+		ext4_clear_inode_flag(filp->f_path.dentry->d_inode,
+				      EXT4_INODE_INDEX);
+	}
+	stored = 0;
+	offset = filp->f_pos & (sb->s_blocksize - 1);
+	/* One directory block per pass, until an entry has been stored. */
+	while (!error && !stored && filp->f_pos < inode->i_size) {
+		struct ext4_map_blocks map;
+		struct buffer_head *bh = NULL;
+		/* Map the logical block that contains f_pos. */
+		map.m_lblk = filp->f_pos >> EXT4_BLOCK_SIZE_BITS(sb);
+		map.m_len = 1;
+		err = ext4_map_blocks(NULL, inode, &map, 0);
+		if (err > 0) {
+			pgoff_t index = map.m_pblk >>
+					(PAGE_CACHE_SHIFT - inode->i_blkbits);
+			if (!ra_has_index(&filp->f_ra, index))
+				page_cache_sync_readahead(
+					sb->s_bdev->bd_inode->i_mapping,
+					&filp->f_ra, filp,
+					index, 1);
+			filp->f_ra.prev_pos = (loff_t)index << PAGE_CACHE_SHIFT;
+			bh = ext4_bread(NULL, inode, map.m_lblk, 0, &err);
+		}
+
+		/*
+		 * We ignore I/O errors on directories so users have a chance
+		 * of recovering data when there's a bad sector
+		 *
+		 * A missing block is reported once per call and then
+		 * skipped by moving f_pos to the start of the next block.
+		 */
+		if (!bh) {
+			if (!dir_has_error) {
+				EXT4_ERROR_FILE(filp, 0,
+						"directory contains a "
+						"hole at offset %llu",
+					   (unsigned long long) filp->f_pos);
+				dir_has_error = 1;
+			}
+			/* corrupt size?  Maybe no more blocks to read */
+			if (filp->f_pos > inode->i_blocks << 9)
+				break;
+			filp->f_pos += sb->s_blocksize - offset;
+			continue;
+		}
+
+revalidate:
+		/* If the dir block has changed since the last call to
+		 * readdir(2), then we might be pointing to an invalid
+		 * dirent right now.  Scan from the start of the block
+		 * to make sure. */
+		if (filp->f_version != inode->i_version) {
+			for (i = 0; i < sb->s_blocksize && i < offset; ) {
+				de = (struct ext4_dir_entry_2 *)
+					(bh->b_data + i);
+				/* It's too expensive to do a full
+				 * dirent test each time round this
+				 * loop, but we do have to test at
+				 * least that it is non-zero.  A
+				 * failure will be detected in the
+				 * dirent test below.
*/
+				if (ext4_rec_len_from_disk(de->rec_len,
+					sb->s_blocksize) < EXT4_DIR_REC_LEN(1))
+					break;
+				i += ext4_rec_len_from_disk(de->rec_len,
+							    sb->s_blocksize);
+			}
+			offset = i;	/* first entry boundary at or past offset */
+			filp->f_pos = (filp->f_pos & ~(sb->s_blocksize - 1))
+				| offset;
+			filp->f_version = inode->i_version;
+		}
+		/* Emit entries from offset up to the end of this block. */
+		while (!error && filp->f_pos < inode->i_size
+		       && offset < sb->s_blocksize) {
+			de = (struct ext4_dir_entry_2 *) (bh->b_data + offset);
+			if (ext4_check_dir_entry(inode, filp, de,
+						 bh, offset)) {
+				/*
+				 * On error, skip the f_pos to the next block
+				 */
+				filp->f_pos = (filp->f_pos |
+						(sb->s_blocksize - 1)) + 1;
+				brelse(bh);
+				ret = stored;
+				goto out;
+			}
+			offset += ext4_rec_len_from_disk(de->rec_len,
+					sb->s_blocksize);
+			if (le32_to_cpu(de->inode)) {
+				/* We might block in the next section
+				 * if the data destination is
+				 * currently swapped out.  So, use a
+				 * version stamp to detect whether or
+				 * not the directory has been modified
+				 * during the copy operation.
+				 */
+				u64 version = filp->f_version;
+
+				error = filldir(dirent, de->name,
+						de->name_len,
+						filp->f_pos,
+						le32_to_cpu(de->inode),
+						get_dtype(sb, de->file_type));
+				if (error)
+					break;
+				if (version != filp->f_version)
+					goto revalidate;
+				stored++;
+			}
+			filp->f_pos += ext4_rec_len_from_disk(de->rec_len,
+						sb->s_blocksize);
+		}
+		offset = 0;	/* the next block is scanned from its start */
+		brelse(bh);
+	}
+out:
+	return ret;
+}
+/* Nonzero for a compat task (CONFIG_COMPAT) or when BITS_PER_LONG is 32. */
+static inline int is_32bit_api(void)
+{
+#ifdef CONFIG_COMPAT
+	return is_compat_task();
+#else
+	return (BITS_PER_LONG == 32);
+#endif
+}
+
+/*
+ * These functions convert from the major/minor hash to an f_pos
+ * value for dx directories
+ *
+ * Upper layer (for example NFS) should specify FMODE_32BITHASH or
+ * FMODE_64BITHASH explicitly. On the other hand, we allow ext4 to be mounted
+ * directly on both 32-bit and 64-bit nodes, under such case, neither
+ * FMODE_32BITHASH nor FMODE_64BITHASH is specified.
+ */
+static inline loff_t hash2pos(struct file *filp, __u32 major, __u32 minor)
+{
+	if ((filp->f_mode & FMODE_32BITHASH) ||
+	    (!(filp->f_mode & FMODE_64BITHASH) && is_32bit_api()))
+		return major >> 1;	/* 32-bit mode: minor is not encoded */
+	else
+		return ((__u64)(major >> 1) << 32) | (__u64)minor;
+}
+/*
+ * Inverse of hash2pos() for the major hash.  hash2pos() shifts the
+ * lowest bit of major out, so that bit comes back as zero.
+ */
+static inline __u32 pos2maj_hash(struct file *filp, loff_t pos)
+{
+	if ((filp->f_mode & FMODE_32BITHASH) ||
+	    (!(filp->f_mode & FMODE_64BITHASH) && is_32bit_api()))
+		return (pos << 1) & 0xffffffff;
+	else
+		return ((pos >> 32) << 1) & 0xffffffff;
+}
+/* Recover the minor hash from f_pos; always 0 in 32-bit hash mode. */
+static inline __u32 pos2min_hash(struct file *filp, loff_t pos)
+{
+	if ((filp->f_mode & FMODE_32BITHASH) ||
+	    (!(filp->f_mode & FMODE_64BITHASH) && is_32bit_api()))
+		return 0;
+	else
+		return pos & 0xffffffff;
+}
+
+/*
+ * Return 32- or 64-bit end-of-file for dx directories
+ *
+ * The choice uses the same mode test as hash2pos().
+ */
+static inline loff_t ext4_get_htree_eof(struct file *filp)
+{
+	if ((filp->f_mode & FMODE_32BITHASH) ||
+	    (!(filp->f_mode & FMODE_64BITHASH) && is_32bit_api()))
+		return EXT4_HTREE_EOF_32BIT;
+	else
+		return EXT4_HTREE_EOF_64BIT;
+}
+
+
+/*
+ * ext4_dir_llseek() based on generic_file_llseek() to handle both
+ * non-htree and htree directories, where the "offset" is in terms
+ * of the filename hash value instead of the byte offset.
+ *
+ * NOTE: offsets obtained *before* ext4_set_inode_flag(dir, EXT4_INODE_INDEX)
+ *       will be invalid once the directory was converted into a dx directory
+ *
+ * Runs under inode->i_mutex.  Returns the new position, or -EINVAL when
+ * the resulting offset is negative or beyond the limit (s_maxbytes for
+ * linear directories, ext4_get_htree_eof() for dx directories) or when
+ * SEEK_END is used with a positive offset.
+ */
+loff_t ext4_dir_llseek(struct file *file, loff_t offset, int origin)
+{
+	struct inode *inode = file->f_mapping->host;
+	loff_t ret = -EINVAL;
+	int dx_dir = is_dx_dir(inode);
+
+	mutex_lock(&inode->i_mutex);
+
+	/* NOTE: relative offsets with dx directories might not work
+	 *       as expected, as it is difficult to figure out the
+	 *       correct offset between dx hashes */
+
+	switch (origin) {
+	case SEEK_END:
+		if (unlikely(offset > 0))
+			goto out_err; /* not supported for directories */
+
+		/* so only negative offsets are left, does that have a
+		 * meaning for directories at all?
*/
+		if (dx_dir)
+			offset += ext4_get_htree_eof(file);
+		else
+			offset += inode->i_size;
+		break;
+	case SEEK_CUR:
+		/*
+		 * Here we special-case the lseek(fd, 0, SEEK_CUR)
+		 * position-querying operation.  Avoid rewriting the "same"
+		 * f_pos value back to the file because a concurrent read(),
+		 * write() or lseek() might have altered it
+		 */
+		if (offset == 0) {
+			offset = file->f_pos;
+			goto out_ok;
+		}
+
+		offset += file->f_pos;
+		break;
+	}
+
+	if (unlikely(offset < 0))
+		goto out_err;
+
+	if (!dx_dir) {
+		if (offset > inode->i_sb->s_maxbytes)
+			goto out_err;
+	} else if (offset > ext4_get_htree_eof(file))
+		goto out_err;
+
+	/* Special lock needed here? */
+	if (offset != file->f_pos) {
+		file->f_pos = offset;
+		file->f_version = 0;	/* position moved: reset the stamp */
+	}
+
+out_ok:
+	ret = offset;
+out_err:
+	mutex_unlock(&inode->i_mutex);
+
+	return ret;
+}
+
+/*
+ * This structure holds the nodes of the red-black tree used to store
+ * the directory entry in hash order.
+ */
+struct fname {
+	__u32		hash;		/* major hash, first sort key */
+	__u32		minor_hash;	/* minor hash, second sort key */
+	struct rb_node	rb_hash;	/* link in the per-file rb tree */
+	struct fname	*next;		/* entries sharing both hashes */
+	__u32		inode;		/* inode number, CPU byte order */
+	__u8		name_len;
+	__u8		file_type;
+	char		name[0];	/* name_len bytes plus a NUL */
+};
+
+/*
+ * This function implements a non-recursive way of freeing all of the
+ * nodes in the red-black tree.
+ */
+static void free_rb_tree_fname(struct rb_root *root)
+{
+	struct rb_node	*n = root->rb_node;
+	struct rb_node	*parent;
+	struct fname	*fname;
+
+	while (n) {
+		/* Do the node's children first */
+		if (n->rb_left) {
+			n = n->rb_left;
+			continue;
+		}
+		if (n->rb_right) {
+			n = n->rb_right;
+			continue;
+		}
+		/*
+		 * The node has no children; free it, and then zero
+		 * out parent's link to it.  Finally go to the
+		 * beginning of the loop and try to free the parent
+		 * node.
+		 */
+		parent = rb_parent(n);
+		fname = rb_entry(n, struct fname, rb_hash);
+		while (fname) {	/* free the node and its collision chain */
+			struct fname *old = fname;
+			fname = fname->next;
+			kfree(old);
+		}
+		if (!parent)
+			*root = RB_ROOT;
+		else if (parent->rb_left == n)
+			parent->rb_left = NULL;
+		else if (parent->rb_right == n)
+			parent->rb_right = NULL;
+		n = parent;
+	}
+}
+
+/*
+ * Allocate the zeroed per-file readdir state and seed its current
+ * major and minor hash from pos.  Returns NULL if the allocation fails.
+ */
+static struct dir_private_info *ext4_htree_create_dir_info(struct file *filp,
+							   loff_t pos)
+{
+	struct dir_private_info *p;
+
+	p = kzalloc(sizeof(struct dir_private_info), GFP_KERNEL);
+	if (!p)
+		return NULL;
+	p->curr_hash = pos2maj_hash(filp, pos);
+	p->curr_minor_hash = pos2min_hash(filp, pos);
+	return p;
+}
+/* Free every cached name in p->root, then p itself. */
+void ext4_htree_free_dir_info(struct dir_private_info *p)
+{
+	free_rb_tree_fname(&p->root);
+	kfree(p);
+}
+
+/*
+ * Given a directory entry, enter it into the fname rb tree.
+ *
+ * The tree is the one in dir_file->private_data and is ordered by hash,
+ * then minor_hash.  An entry whose two hashes match an existing node is
+ * chained behind that node through ->next instead of being inserted.
+ * The name is copied and NUL-terminated.
+ *
+ * Returns 0 on success, -ENOMEM if the new node cannot be allocated.
+ */
+int ext4_htree_store_dirent(struct file *dir_file, __u32 hash,
+			     __u32 minor_hash,
+			     struct ext4_dir_entry_2 *dirent)
+{
+	struct rb_node **p, *parent = NULL;
+	struct fname *fname, *new_fn;
+	struct dir_private_info *info;
+	int len;
+
+	info = dir_file->private_data;
+	p = &info->root.rb_node;
+
+	/* Create and allocate the fname structure */
+	len = sizeof(struct fname) + dirent->name_len + 1;
+	new_fn = kzalloc(len, GFP_KERNEL);
+	if (!new_fn)
+		return -ENOMEM;
+	new_fn->hash = hash;
+	new_fn->minor_hash = minor_hash;
+	new_fn->inode = le32_to_cpu(dirent->inode);
+	new_fn->name_len = dirent->name_len;
+	new_fn->file_type = dirent->file_type;
+	memcpy(new_fn->name, dirent->name, dirent->name_len);
+	new_fn->name[dirent->name_len] = 0;
+
+	while (*p) {
+		parent = *p;
+		fname = rb_entry(parent, struct fname, rb_hash);
+
+		/*
+		 * If the hash and minor hash match up, then we put
+		 * them on a linked list.  This rarely happens...
+		 */
+		if ((new_fn->hash == fname->hash) &&
+		    (new_fn->minor_hash == fname->minor_hash)) {
+			new_fn->next = fname->next;
+			fname->next = new_fn;
+			return 0;
+		}
+
+		if (new_fn->hash < fname->hash)
+			p = &(*p)->rb_left;
+		else if (new_fn->hash > fname->hash)
+			p = &(*p)->rb_right;
+		else if (new_fn->minor_hash < fname->minor_hash)
+			p = &(*p)->rb_left;
+		else /* if (new_fn->minor_hash > fname->minor_hash) */
+			p = &(*p)->rb_right;
+	}
+
+	rb_link_node(&new_fn->rb_hash, parent, p);
+	rb_insert_color(&new_fn->rb_hash, &info->root);
+	return 0;
+}
+
+
+
+/*
+ * This is a helper function for ext4_dx_readdir.  It calls filldir
+ * for all entries on the fname linked list.  (Normally there is only
+ * one entry on the linked list, unless there are 62 bit hash collisions.)
+ *
+ * Every entry on the list is reported with the same position, derived
+ * from the hashes of the first one.  When filldir fails, f_pos is set
+ * to that position and the failing entry is remembered in
+ * info->extra_fname before the error is returned.  Returns 0 otherwise,
+ * including when fname is NULL (which is logged).
+ */
+static int call_filldir(struct file *filp, void *dirent,
+			filldir_t filldir, struct fname *fname)
+{
+	struct dir_private_info *info = filp->private_data;
+	loff_t	curr_pos;
+	struct inode *inode = filp->f_path.dentry->d_inode;
+	struct super_block *sb;
+	int error;
+
+	sb = inode->i_sb;
+
+	if (!fname) {
+		ext4_msg(sb, KERN_ERR, "%s:%d: inode #%lu: comm %s: "
+			 "called with null fname?!?", __func__, __LINE__,
+			 inode->i_ino, current->comm);
+		return 0;
+	}
+	curr_pos = hash2pos(filp, fname->hash, fname->minor_hash);
+	while (fname) {
+		error = filldir(dirent, fname->name,
+				fname->name_len, curr_pos,
+				fname->inode,
+				get_dtype(sb, fname->file_type));
+		if (error) {
+			filp->f_pos = curr_pos;
+			info->extra_fname = fname;
+			return error;
+		}
+		fname = fname->next;
+	}
+	return 0;
+}
+/*
+ * readdir for dx directories.  The per-file state in private_data is
+ * created on first use; -ENOMEM is returned if that fails.  Nothing is
+ * emitted once f_pos equals ext4_get_htree_eof().
+ */
+static int ext4_dx_readdir(struct file *filp,
+			 void *dirent, filldir_t filldir)
+{
+	struct dir_private_info *info = filp->private_data;
+	struct inode *inode = filp->f_path.dentry->d_inode;
+	struct fname *fname;
+	int	ret;
+
+	if (!info) {
+		info = ext4_htree_create_dir_info(filp, filp->f_pos);
+		if (!info)
+			return -ENOMEM;
+		filp->private_data = info;
+	}
+
+	if (filp->f_pos == ext4_get_htree_eof(filp))
+		return 0;
/* EOF */
+
+	/* Some one has messed with f_pos; reset the world */
+	if (info->last_pos != filp->f_pos) {
+		free_rb_tree_fname(&info->root);
+		info->curr_node = NULL;
+		info->extra_fname = NULL;
+		info->curr_hash = pos2maj_hash(filp, filp->f_pos);
+		info->curr_minor_hash = pos2min_hash(filp, filp->f_pos);
+	}
+
+	/*
+	 * If there are any leftover names on the hash collision
+	 * chain, return them first.
+	 */
+	if (info->extra_fname) {
+		if (call_filldir(filp, dirent, filldir, info->extra_fname))
+			goto finished;
+		info->extra_fname = NULL;
+		goto next_node;
+	} else if (!info->curr_node)
+		info->curr_node = rb_first(&info->root);
+
+	while (1) {
+		/*
+		 * Fill the rbtree if we have no more entries,
+		 * or the inode has changed since we last read in the
+		 * cached entries.
+		 *
+		 * A negative result from ext4_htree_fill_tree() is
+		 * returned as is; zero marks the end of the directory.
+		 */
+		if ((!info->curr_node) ||
+		    (filp->f_version != inode->i_version)) {
+			info->curr_node = NULL;
+			free_rb_tree_fname(&info->root);
+			filp->f_version = inode->i_version;
+			ret = ext4_htree_fill_tree(filp, info->curr_hash,
+						   info->curr_minor_hash,
+						   &info->next_hash);
+			if (ret < 0)
+				return ret;
+			if (ret == 0) {
+				filp->f_pos = ext4_get_htree_eof(filp);
+				break;
+			}
+			info->curr_node = rb_first(&info->root);
+		}
+
+		fname = rb_entry(info->curr_node, struct fname, rb_hash);
+		info->curr_hash = fname->hash;
+		info->curr_minor_hash = fname->minor_hash;
+		if (call_filldir(filp, dirent, filldir, fname))
+			break;
+	next_node:
+		info->curr_node = rb_next(info->curr_node);
+		if (info->curr_node) {
+			fname = rb_entry(info->curr_node, struct fname,
+					 rb_hash);
+			info->curr_hash = fname->hash;
+			info->curr_minor_hash = fname->minor_hash;
+		} else {	/* cached tree exhausted */
+			if (info->next_hash == ~0) {
+				filp->f_pos = ext4_get_htree_eof(filp);
+				break;
+			}
+			info->curr_hash = info->next_hash;
+			info->curr_minor_hash = 0;
+		}
+	}
+finished:
+	info->last_pos = filp->f_pos;	/* lets the next call detect a seek */
+	return 0;
+}
+/* Release hook: frees the readdir state in private_data, if any. */
+static int ext4_release_dir(struct inode *inode, struct file *filp)
+{
+	if (filp->private_data)
+
ext4_htree_free_dir_info(filp->private_data); + + return 0; +} + +const struct file_operations ext4_dir_operations = { + .llseek = ext4_dir_llseek, + .read = generic_read_dir, + .readdir = ext4_readdir, + .unlocked_ioctl = ext4_ioctl, +#ifdef CONFIG_COMPAT + .compat_ioctl = ext4_compat_ioctl, +#endif + .fsync = ext4_sync_file, + .release = ext4_release_dir, +}; diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h new file mode 100644 index 00000000..0e01e90a --- /dev/null +++ b/fs/ext4/ext4.h @@ -0,0 +1,2372 @@ +/* + * ext4.h + * + * Copyright (C) 1992, 1993, 1994, 1995 + * Remy Card (card@masi.ibp.fr) + * Laboratoire MASI - Institut Blaise Pascal + * Universite Pierre et Marie Curie (Paris VI) + * + * from + * + * linux/include/linux/minix_fs.h + * + * Copyright (C) 1991, 1992 Linus Torvalds + */ + +#ifndef _EXT4_H +#define _EXT4_H + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#ifdef __KERNEL__ +#include +#endif + +/* + * The fourth extended filesystem constants/structures + */ + +/* + * Define EXT4FS_DEBUG to produce debug messages + */ +#undef EXT4FS_DEBUG + +/* + * Debug code + */ +#ifdef EXT4FS_DEBUG +#define ext4_debug(f, a...) \ + do { \ + printk(KERN_DEBUG "EXT4-fs DEBUG (%s, %d): %s:", \ + __FILE__, __LINE__, __func__); \ + printk(KERN_DEBUG f, ## a); \ + } while (0) +#else +#define ext4_debug(fmt, ...) no_printk(fmt, ##__VA_ARGS__) +#endif + +#define EXT4_ERROR_INODE(inode, fmt, a...) \ + ext4_error_inode((inode), __func__, __LINE__, 0, (fmt), ## a) + +#define EXT4_ERROR_INODE_BLOCK(inode, block, fmt, a...) \ + ext4_error_inode((inode), __func__, __LINE__, (block), (fmt), ## a) + +#define EXT4_ERROR_FILE(file, block, fmt, a...) 
/*
 * Describes one block allocation request: the inode and logical position
 * being allocated for, the nearest already-allocated neighbours on either
 * side (logical and physical), a physical goal, and EXT4_MB_HINT_* flags.
 */
struct ext4_allocation_request {
	/* target inode for block we're allocating */
	struct inode *inode;
	/* how many blocks we want to allocate */
	unsigned int len;
	/* logical block in target inode */
	ext4_lblk_t logical;
	/* the closest logical allocated block to the left */
	ext4_lblk_t lleft;
	/* the closest logical allocated block to the right */
	ext4_lblk_t lright;
	/* physical target block (a hint) */
	ext4_fsblk_t goal;
	/* physical block for the closest logical allocated block to the left */
	ext4_fsblk_t pleft;
	/* physical block for the closest logical allocated block to the right */
	ext4_fsblk_t pright;
	/* flags; see the EXT4_MB_HINT_* definitions */
	unsigned int flags;
};
/*
 * Logical to physical block mapping record used with ext4_map_blocks():
 * it carries the request in and the result back out.
 */
struct ext4_map_blocks {
	ext4_fsblk_t m_pblk;	/* filesystem-wide (physical) block number */
	ext4_lblk_t m_lblk;	/* logical block number within the file */
	unsigned int m_len;	/* NOTE(review): presumably length in blocks - confirm */
	unsigned int m_flags;	/* NOTE(review): presumably EXT4_MAP_* bits - confirm */
};

/*
 * For delayed allocation tracking
 */
struct mpage_da_data {
	struct inode *inode;
	sector_t b_blocknr;		/* start block number of extent */
	size_t b_size;			/* size of extent */
	unsigned long b_state;		/* state of the extent */
	unsigned long first_page, next_page;	/* extent of pages */
	struct writeback_control *wbc;
	int io_done;
	int pages_written;
	int retval;
};
/*
 * For converting uninitialized extents on a work queue.
 *
 * 'page' is only used from the writepage() path; 'pages' is only used for
 * buffered writes; they are used to keep page references until conversion
 * takes place.  For AIO/DIO, neither field is filled in.
 *
 * NOTE(review): the member holding the EXT4_IO_END_* bits is named 'flag'
 * (singular), although the comment on those bit definitions refers to
 * ext4_io_end->flags.
 */
typedef struct ext4_io_end {
	struct list_head	list;		/* per-file finished IO list */
	struct inode		*inode;		/* file being written to */
	unsigned int		flag;		/* unwritten or not */
	struct page		*page;		/* for writepage() path */
	loff_t			offset;		/* offset in the file */
	ssize_t			size;		/* size of the extent */
	struct work_struct	work;		/* data work queue */
	struct kiocb		*iocb;		/* iocb struct for AIO */
	int			result;		/* error value for AIO */
	int			num_io_pages;   /* for writepages() */
	struct ext4_io_page	*pages[MAX_IO_PAGES]; /* for writepages() */
} ext4_io_end_t;

/*
 * State carried while submitting I/O: the operation, the bio, the io_end
 * and io_page in use, and the next block number.
 * NOTE(review): field meanings are inferred from names and types only -
 * confirm against the users of this struct.
 */
struct ext4_io_submit {
	int			io_op;
	struct bio		*io_bio;
	ext4_io_end_t		*io_end;
	struct ext4_io_page	*io_page;
	sector_t		io_next_block;
};
/*
 * Structure of a blocks group descriptor
 *
 * The members up to and including bg_checksum add up to 32 bytes
 * (EXT4_MIN_DESC_SIZE); the whole structure is 64 bytes
 * (EXT4_MIN_DESC_SIZE_64BIT).  The *_hi members carry the most
 * significant bits of the matching *_lo members.  Member order and
 * width must not be changed.
 */
struct ext4_group_desc
{
	__le32	bg_block_bitmap_lo;	/* Blocks bitmap block */
	__le32	bg_inode_bitmap_lo;	/* Inodes bitmap block */
	__le32	bg_inode_table_lo;	/* Inodes table block */
	__le16	bg_free_blocks_count_lo;/* Free blocks count */
	__le16	bg_free_inodes_count_lo;/* Free inodes count */
	__le16	bg_used_dirs_count_lo;	/* Directories count */
	__le16	bg_flags;		/* EXT4_BG_flags (INODE_UNINIT, etc) */
	__u32	bg_reserved[2];		/* Likely block/inode bitmap checksum */
	__le16  bg_itable_unused_lo;	/* Unused inodes count */
	__le16  bg_checksum;		/* crc16(sb_uuid+group+desc) */
	/* ---- end of the first 32 bytes ---- */
	__le32	bg_block_bitmap_hi;	/* Blocks bitmap block MSB */
	__le32	bg_inode_bitmap_hi;	/* Inodes bitmap block MSB */
	__le32	bg_inode_table_hi;	/* Inodes table block MSB */
	__le16	bg_free_blocks_count_hi;/* Free blocks count MSB */
	__le16	bg_free_inodes_count_hi;/* Free inodes count MSB */
	__le16	bg_used_dirs_count_hi;	/* Directories count MSB */
	__le16  bg_itable_unused_hi;    /* Unused inodes count MSB */
	__u32	bg_reserved2[3];	/* pads the structure to 64 bytes */
};
do not dump file */ +#define EXT4_NOATIME_FL 0x00000080 /* do not update atime */ +/* Reserved for compression usage... */ +#define EXT4_DIRTY_FL 0x00000100 +#define EXT4_COMPRBLK_FL 0x00000200 /* One or more compressed clusters */ +#define EXT4_NOCOMPR_FL 0x00000400 /* Don't compress */ +#define EXT4_ECOMPR_FL 0x00000800 /* Compression error */ +/* End compression flags --- maybe not all used */ +#define EXT4_INDEX_FL 0x00001000 /* hash-indexed directory */ +#define EXT4_IMAGIC_FL 0x00002000 /* AFS directory */ +#define EXT4_JOURNAL_DATA_FL 0x00004000 /* file data should be journaled */ +#define EXT4_NOTAIL_FL 0x00008000 /* file tail should not be merged */ +#define EXT4_DIRSYNC_FL 0x00010000 /* dirsync behaviour (directories only) */ +#define EXT4_TOPDIR_FL 0x00020000 /* Top of directory hierarchies*/ +#define EXT4_HUGE_FILE_FL 0x00040000 /* Set to each huge file */ +#define EXT4_EXTENTS_FL 0x00080000 /* Inode uses extents */ +#define EXT4_EA_INODE_FL 0x00200000 /* Inode used for large EA */ +#define EXT4_EOFBLOCKS_FL 0x00400000 /* Blocks allocated beyond EOF */ +#define EXT4_RESERVED_FL 0x80000000 /* reserved for ext4 lib */ + +#define EXT4_FL_USER_VISIBLE 0x004BDFFF /* User visible flags */ +#define EXT4_FL_USER_MODIFIABLE 0x004B80FF /* User modifiable flags */ + +/* Flags that should be inherited by new inodes from their parent. */ +#define EXT4_FL_INHERITED (EXT4_SECRM_FL | EXT4_UNRM_FL | EXT4_COMPR_FL |\ + EXT4_SYNC_FL | EXT4_NODUMP_FL | EXT4_NOATIME_FL |\ + EXT4_NOCOMPR_FL | EXT4_JOURNAL_DATA_FL |\ + EXT4_NOTAIL_FL | EXT4_DIRSYNC_FL) + +/* Flags that are appropriate for regular files (all but dir-specific ones). */ +#define EXT4_REG_FLMASK (~(EXT4_DIRSYNC_FL | EXT4_TOPDIR_FL)) + +/* Flags that are appropriate for non-directories/regular files. */ +#define EXT4_OTHER_FLMASK (EXT4_NODUMP_FL | EXT4_NOATIME_FL) + +/* Mask out flags that are inappropriate for the given type of inode. 
*/ +static inline __u32 ext4_mask_flags(umode_t mode, __u32 flags) +{ + if (S_ISDIR(mode)) + return flags; + else if (S_ISREG(mode)) + return flags & EXT4_REG_FLMASK; + else + return flags & EXT4_OTHER_FLMASK; +} + +/* + * Inode flags used for atomic set/get + */ +enum { + EXT4_INODE_SECRM = 0, /* Secure deletion */ + EXT4_INODE_UNRM = 1, /* Undelete */ + EXT4_INODE_COMPR = 2, /* Compress file */ + EXT4_INODE_SYNC = 3, /* Synchronous updates */ + EXT4_INODE_IMMUTABLE = 4, /* Immutable file */ + EXT4_INODE_APPEND = 5, /* writes to file may only append */ + EXT4_INODE_NODUMP = 6, /* do not dump file */ + EXT4_INODE_NOATIME = 7, /* do not update atime */ +/* Reserved for compression usage... */ + EXT4_INODE_DIRTY = 8, + EXT4_INODE_COMPRBLK = 9, /* One or more compressed clusters */ + EXT4_INODE_NOCOMPR = 10, /* Don't compress */ + EXT4_INODE_ECOMPR = 11, /* Compression error */ +/* End compression flags --- maybe not all used */ + EXT4_INODE_INDEX = 12, /* hash-indexed directory */ + EXT4_INODE_IMAGIC = 13, /* AFS directory */ + EXT4_INODE_JOURNAL_DATA = 14, /* file data should be journaled */ + EXT4_INODE_NOTAIL = 15, /* file tail should not be merged */ + EXT4_INODE_DIRSYNC = 16, /* dirsync behaviour (directories only) */ + EXT4_INODE_TOPDIR = 17, /* Top of directory hierarchies*/ + EXT4_INODE_HUGE_FILE = 18, /* Set to each huge file */ + EXT4_INODE_EXTENTS = 19, /* Inode uses extents */ + EXT4_INODE_EA_INODE = 21, /* Inode used for large EA */ + EXT4_INODE_EOFBLOCKS = 22, /* Blocks allocated beyond EOF */ + EXT4_INODE_RESERVED = 31, /* reserved for ext4 lib */ +}; + +#define TEST_FLAG_VALUE(FLAG) (EXT4_##FLAG##_FL == (1 << EXT4_INODE_##FLAG)) +#define CHECK_FLAG_VALUE(FLAG) if (!TEST_FLAG_VALUE(FLAG)) { \ + printk(KERN_EMERG "EXT4 flag fail: " #FLAG ": %d %d\n", \ + EXT4_##FLAG##_FL, EXT4_INODE_##FLAG); BUG_ON(1); } + +/* + * Since it's pretty easy to mix up bit numbers and hex values, and we + * can't do a compile-time test for ENUM values, we use a run-time + * 
test to make sure that EXT4_XXX_FL is consistent with respect to + * EXT4_INODE_XXX. If all is well the printk and BUG_ON will all drop + * out so it won't cost any extra space in the compiled kernel image. + * But it's important that these values are the same, since we are + * using EXT4_INODE_XXX to test for the flag values, but EXT4_XX_FL + * must be consistent with the values of FS_XXX_FL defined in + * include/linux/fs.h and the on-disk values found in ext2, ext3, and + * ext4 filesystems, and of course the values defined in e2fsprogs. + * + * It's not paranoia if the Murphy's Law really *is* out to get you. :-) + */ +static inline void ext4_check_flag_values(void) +{ + CHECK_FLAG_VALUE(SECRM); + CHECK_FLAG_VALUE(UNRM); + CHECK_FLAG_VALUE(COMPR); + CHECK_FLAG_VALUE(SYNC); + CHECK_FLAG_VALUE(IMMUTABLE); + CHECK_FLAG_VALUE(APPEND); + CHECK_FLAG_VALUE(NODUMP); + CHECK_FLAG_VALUE(NOATIME); + CHECK_FLAG_VALUE(DIRTY); + CHECK_FLAG_VALUE(COMPRBLK); + CHECK_FLAG_VALUE(NOCOMPR); + CHECK_FLAG_VALUE(ECOMPR); + CHECK_FLAG_VALUE(INDEX); + CHECK_FLAG_VALUE(IMAGIC); + CHECK_FLAG_VALUE(JOURNAL_DATA); + CHECK_FLAG_VALUE(NOTAIL); + CHECK_FLAG_VALUE(DIRSYNC); + CHECK_FLAG_VALUE(TOPDIR); + CHECK_FLAG_VALUE(HUGE_FILE); + CHECK_FLAG_VALUE(EXTENTS); + CHECK_FLAG_VALUE(EA_INODE); + CHECK_FLAG_VALUE(EOFBLOCKS); + CHECK_FLAG_VALUE(RESERVED); +} + +/* Used to pass group descriptor data when online resize is done */ +struct ext4_new_group_input { + __u32 group; /* Group number for this data */ + __u64 block_bitmap; /* Absolute block number of block bitmap */ + __u64 inode_bitmap; /* Absolute block number of inode bitmap */ + __u64 inode_table; /* Absolute block number of inode table start */ + __u32 blocks_count; /* Total number of blocks in this group */ + __u16 reserved_blocks; /* Number of reserved blocks in this group */ + __u16 unused; +}; + +#if defined(__KERNEL__) && defined(CONFIG_COMPAT) +struct compat_ext4_new_group_input { + u32 group; + compat_u64 block_bitmap; + compat_u64 
/*
 * Flags used by ext4_map_blocks()
 *
 * The plain values are single bits; the names defined as an OR of other
 * names are convenience combinations.
 */
	/* Allocate any needed blocks and/or convert an uninitialized
	   extent to be an initialized extent */
#define EXT4_GET_BLOCKS_CREATE			0x0001
	/* Request the creation of an uninitialized extent */
#define EXT4_GET_BLOCKS_UNINIT_EXT		0x0002
#define EXT4_GET_BLOCKS_CREATE_UNINIT_EXT	(EXT4_GET_BLOCKS_UNINIT_EXT|\
						 EXT4_GET_BLOCKS_CREATE)
	/* Caller is from the delayed allocation writeout path,
	   so set the magic i_delalloc_reserve_flag after taking the
	   inode allocation semaphore */
#define EXT4_GET_BLOCKS_DELALLOC_RESERVE	0x0004
	/* Caller is from the direct IO path: request creation of
	   uninitialized extents if not allocated, and split the
	   uninitialized extent if blocks have been preallocated already */
#define EXT4_GET_BLOCKS_PRE_IO			0x0008
#define EXT4_GET_BLOCKS_CONVERT			0x0010
#define EXT4_GET_BLOCKS_IO_CREATE_EXT		(EXT4_GET_BLOCKS_PRE_IO|\
					 EXT4_GET_BLOCKS_CREATE_UNINIT_EXT)
	/* Convert extent to initialized after IO complete */
#define EXT4_GET_BLOCKS_IO_CONVERT_EXT		(EXT4_GET_BLOCKS_CONVERT|\
					 EXT4_GET_BLOCKS_CREATE_UNINIT_EXT)
	/* Punch out blocks of an extent */
#define EXT4_GET_BLOCKS_PUNCH_OUT_EXT		0x0020
	/* Don't normalize allocation size (used for fallocate) */
#define EXT4_GET_BLOCKS_NO_NORMALIZE		0x0040
	/* Request will not result in inode size update (used for fallocate) */
#define EXT4_GET_BLOCKS_KEEP_SIZE		0x0080
Max physical block we can address w/o extents */ +#define EXT4_MAX_BLOCK_FILE_PHYS 0xFFFFFFFF + +/* + * Structure of an inode on the disk + */ +struct ext4_inode { + __le16 i_mode; /* File mode */ + __le16 i_uid; /* Low 16 bits of Owner Uid */ + __le32 i_size_lo; /* Size in bytes */ + __le32 i_atime; /* Access time */ + __le32 i_ctime; /* Inode Change time */ + __le32 i_mtime; /* Modification time */ + __le32 i_dtime; /* Deletion Time */ + __le16 i_gid; /* Low 16 bits of Group Id */ + __le16 i_links_count; /* Links count */ + __le32 i_blocks_lo; /* Blocks count */ + __le32 i_flags; /* File flags */ + union { + struct { + __le32 l_i_version; + } linux1; + struct { + __u32 h_i_translator; + } hurd1; + struct { + __u32 m_i_reserved1; + } masix1; + } osd1; /* OS dependent 1 */ + __le32 i_block[EXT4_N_BLOCKS];/* Pointers to blocks */ + __le32 i_generation; /* File version (for NFS) */ + __le32 i_file_acl_lo; /* File ACL */ + __le32 i_size_high; + __le32 i_obso_faddr; /* Obsoleted fragment address */ + union { + struct { + __le16 l_i_blocks_high; /* were l_i_reserved1 */ + __le16 l_i_file_acl_high; + __le16 l_i_uid_high; /* these 2 fields */ + __le16 l_i_gid_high; /* were reserved2[0] */ + __u32 l_i_reserved2; + } linux2; + struct { + __le16 h_i_reserved1; /* Obsoleted fragment number/size which are removed in ext4 */ + __u16 h_i_mode_high; + __u16 h_i_uid_high; + __u16 h_i_gid_high; + __u32 h_i_author; + } hurd2; + struct { + __le16 h_i_reserved1; /* Obsoleted fragment number/size which are removed in ext4 */ + __le16 m_i_file_acl_high; + __u32 m_i_reserved2[2]; + } masix2; + } osd2; /* OS dependent 2 */ + __le16 i_extra_isize; + __le16 i_pad1; + __le32 i_ctime_extra; /* extra Change time (nsec << 2 | epoch) */ + __le32 i_mtime_extra; /* extra Modification time(nsec << 2 | epoch) */ + __le32 i_atime_extra; /* extra Access time (nsec << 2 | epoch) */ + __le32 i_crtime; /* File Creation time */ + __le32 i_crtime_extra; /* extra FileCreationtime (nsec << 2 | epoch) */ + 
__le32 i_version_hi; /* high 32 bits for 64-bit version */ +}; + +struct move_extent { + __u32 reserved; /* should be zero */ + __u32 donor_fd; /* donor file descriptor */ + __u64 orig_start; /* logical start offset in block for orig */ + __u64 donor_start; /* logical start offset in block for donor */ + __u64 len; /* block length to be moved */ + __u64 moved_len; /* moved block length */ +}; + +#define EXT4_EPOCH_BITS 2 +#define EXT4_EPOCH_MASK ((1 << EXT4_EPOCH_BITS) - 1) +#define EXT4_NSEC_MASK (~0UL << EXT4_EPOCH_BITS) + +/* + * Extended fields will fit into an inode if the filesystem was formatted + * with large inodes (-I 256 or larger) and there are not currently any EAs + * consuming all of the available space. For new inodes we always reserve + * enough space for the kernel's known extended fields, but for inodes + * created with an old kernel this might not have been the case. None of + * the extended inode fields is critical for correct filesystem operation. + * This macro checks if a certain field fits in the inode. Note that + * inode-size = GOOD_OLD_INODE_SIZE + i_extra_isize + */ +#define EXT4_FITS_IN_INODE(ext4_inode, einode, field) \ + ((offsetof(typeof(*ext4_inode), field) + \ + sizeof((ext4_inode)->field)) \ + <= (EXT4_GOOD_OLD_INODE_SIZE + \ + (einode)->i_extra_isize)) \ + +static inline __le32 ext4_encode_extra_time(struct timespec *time) +{ + return cpu_to_le32((sizeof(time->tv_sec) > 4 ? 
+ (time->tv_sec >> 32) & EXT4_EPOCH_MASK : 0) | + ((time->tv_nsec << EXT4_EPOCH_BITS) & EXT4_NSEC_MASK)); +} + +static inline void ext4_decode_extra_time(struct timespec *time, __le32 extra) +{ + if (sizeof(time->tv_sec) > 4) + time->tv_sec |= (__u64)(le32_to_cpu(extra) & EXT4_EPOCH_MASK) + << 32; + time->tv_nsec = (le32_to_cpu(extra) & EXT4_NSEC_MASK) >> EXT4_EPOCH_BITS; +} + +#define EXT4_INODE_SET_XTIME(xtime, inode, raw_inode) \ +do { \ + (raw_inode)->xtime = cpu_to_le32((inode)->xtime.tv_sec); \ + if (EXT4_FITS_IN_INODE(raw_inode, EXT4_I(inode), xtime ## _extra)) \ + (raw_inode)->xtime ## _extra = \ + ext4_encode_extra_time(&(inode)->xtime); \ +} while (0) + +#define EXT4_EINODE_SET_XTIME(xtime, einode, raw_inode) \ +do { \ + if (EXT4_FITS_IN_INODE(raw_inode, einode, xtime)) \ + (raw_inode)->xtime = cpu_to_le32((einode)->xtime.tv_sec); \ + if (EXT4_FITS_IN_INODE(raw_inode, einode, xtime ## _extra)) \ + (raw_inode)->xtime ## _extra = \ + ext4_encode_extra_time(&(einode)->xtime); \ +} while (0) + +#define EXT4_INODE_GET_XTIME(xtime, inode, raw_inode) \ +do { \ + (inode)->xtime.tv_sec = (signed)le32_to_cpu((raw_inode)->xtime); \ + if (EXT4_FITS_IN_INODE(raw_inode, EXT4_I(inode), xtime ## _extra)) \ + ext4_decode_extra_time(&(inode)->xtime, \ + raw_inode->xtime ## _extra); \ + else \ + (inode)->xtime.tv_nsec = 0; \ +} while (0) + +#define EXT4_EINODE_GET_XTIME(xtime, einode, raw_inode) \ +do { \ + if (EXT4_FITS_IN_INODE(raw_inode, einode, xtime)) \ + (einode)->xtime.tv_sec = \ + (signed)le32_to_cpu((raw_inode)->xtime); \ + if (EXT4_FITS_IN_INODE(raw_inode, einode, xtime ## _extra)) \ + ext4_decode_extra_time(&(einode)->xtime, \ + raw_inode->xtime ## _extra); \ + else \ + (einode)->xtime.tv_nsec = 0; \ +} while (0) + +#define i_disk_version osd1.linux1.l_i_version + +#if defined(__KERNEL__) || defined(__linux__) +#define i_reserved1 osd1.linux1.l_i_reserved1 +#define i_file_acl_high osd2.linux2.l_i_file_acl_high +#define i_blocks_high osd2.linux2.l_i_blocks_high 
/*
 * fourth extended file system inode data in memory
 *
 * The generic VFS inode is embedded as 'vfs_inode'; the remaining
 * members are ext4-private state.  Several members only exist under
 * particular configurations (see the #if/#ifdef blocks below).
 */
struct ext4_inode_info {
	__le32	i_data[15];	/* unconverted; 15 == EXT4_N_BLOCKS */
	__u32	i_dtime;
	ext4_fsblk_t	i_file_acl;

	/*
	 * i_block_group is the number of the block group which contains
	 * this file's inode.  Constant across the lifetime of the inode,
	 * it is used for making block allocation decisions - we try to
	 * place a file's data blocks near its inode block, and new inodes
	 * near to their parent directory's inode.
	 */
	ext4_group_t	i_block_group;
	ext4_lblk_t	i_dir_start_lookup;
#if (BITS_PER_LONG < 64)
	/* Only present when unsigned long is narrower than 64 bits. */
	unsigned long	i_state_flags;		/* Dynamic state flags */
#endif
	unsigned long	i_flags;

#ifdef CONFIG_EXT4_FS_XATTR
	/*
	 * Extended attributes can be read independently of the main file
	 * data. Taking i_mutex even when reading would cause contention
	 * between readers of EAs and writers of regular file data, so
	 * instead we synchronize on xattr_sem when reading or changing
	 * EAs.
	 */
	struct rw_semaphore xattr_sem;
#endif

	struct list_head i_orphan;	/* unlinked but open inodes */

	/*
	 * i_disksize keeps track of what the inode size is ON DISK, not
	 * in memory.  During truncate, i_size is set to the new size by
	 * the VFS prior to calling ext4_truncate(), but the filesystem won't
	 * set i_disksize to 0 until the truncate is actually under way.
	 *
	 * The intent is that i_disksize always represents the blocks which
	 * are used by this file.  This allows recovery to restart truncate
	 * on orphans if we crash during truncate.  We actually write i_disksize
	 * into the on-disk inode when writing inodes out, instead of i_size.
	 *
	 * The only time when i_disksize and i_size may be different is when
	 * a truncate is in progress.  The only things which change i_disksize
	 * are ext4_get_block (growth) and ext4_truncate (shrinkth).
	 */
	loff_t	i_disksize;

	/*
	 * i_data_sem is for serialising ext4_truncate() against
	 * ext4_getblock().  In the 2.4 ext2 design, great chunks of inode's
	 * data tree are chopped off during truncate. We can't do that in
	 * ext4 because whenever we perform intermediate commits during
	 * truncate, the inode and all the metadata blocks *must* be in a
	 * consistent state which allows truncation of the orphans to restart
	 * during recovery.  Hence we must fix the get_block-vs-truncate race
	 * by other means, so we have i_data_sem.
	 */
	struct rw_semaphore i_data_sem;
	struct inode vfs_inode;		/* the embedded generic inode */
	struct jbd2_inode *jinode;

	struct ext4_ext_cache i_cached_extent;
	/*
	 * File creation time. Its function is same as that of
	 * struct timespec i_{a,c,m}time in the generic inode.
	 */
	struct timespec i_crtime;

	/* mballoc */
	struct list_head i_prealloc_list;
	spinlock_t i_prealloc_lock;

	/* ialloc */
	ext4_group_t	i_last_alloc_group;

	/* allocation reservation info for delalloc */
	/* In case of bigalloc, these refer to clusters rather than blocks */
	unsigned int i_reserved_data_blocks;
	unsigned int i_reserved_meta_blocks;
	unsigned int i_allocated_meta_blocks;
	ext4_lblk_t i_da_metadata_calc_last_lblock;
	int i_da_metadata_calc_len;

	/* on-disk additional length */
	__u16 i_extra_isize;

#ifdef CONFIG_QUOTA
	/* quota space reservation, managed internally by quota code */
	qsize_t i_reserved_quota;
#endif

	/* completed IOs that might need unwritten extents handling */
	struct list_head i_completed_io_list;
	spinlock_t i_completed_io_lock;
	atomic_t i_ioend_count;	/* Number of outstanding io_end structs */
	/* current io_end structure for async DIO write */
	ext4_io_end_t *cur_aio_dio;
	atomic_t i_aiodio_unwritten; /* Nr. of inflight conversions pending */

	spinlock_t i_block_reservation_lock;

	/*
	 * Transactions that contain inode's metadata needed to complete
	 * fsync and fdatasync, respectively.
	 */
	tid_t i_sync_tid;
	tid_t i_datasync_tid;
};
filesystem flags + */ +#define EXT2_FLAGS_SIGNED_HASH 0x0001 /* Signed dirhash in use */ +#define EXT2_FLAGS_UNSIGNED_HASH 0x0002 /* Unsigned dirhash in use */ +#define EXT2_FLAGS_TEST_FILESYS 0x0004 /* to test development code */ + +/* + * Mount flags + */ +#define EXT4_MOUNT_GRPID 0x00004 /* Create files with directory's group */ +#define EXT4_MOUNT_DEBUG 0x00008 /* Some debugging messages */ +#define EXT4_MOUNT_ERRORS_CONT 0x00010 /* Continue on errors */ +#define EXT4_MOUNT_ERRORS_RO 0x00020 /* Remount fs ro on errors */ +#define EXT4_MOUNT_ERRORS_PANIC 0x00040 /* Panic on errors */ +#define EXT4_MOUNT_ERRORS_MASK 0x00070 +#define EXT4_MOUNT_MINIX_DF 0x00080 /* Mimics the Minix statfs */ +#define EXT4_MOUNT_NOLOAD 0x00100 /* Don't use existing journal*/ +#define EXT4_MOUNT_DATA_FLAGS 0x00C00 /* Mode for data writes: */ +#define EXT4_MOUNT_JOURNAL_DATA 0x00400 /* Write data to journal */ +#define EXT4_MOUNT_ORDERED_DATA 0x00800 /* Flush data before commit */ +#define EXT4_MOUNT_WRITEBACK_DATA 0x00C00 /* No data ordering */ +#define EXT4_MOUNT_UPDATE_JOURNAL 0x01000 /* Update the journal format */ +#define EXT4_MOUNT_NO_UID32 0x02000 /* Disable 32-bit UIDs */ +#define EXT4_MOUNT_XATTR_USER 0x04000 /* Extended user attributes */ +#define EXT4_MOUNT_POSIX_ACL 0x08000 /* POSIX Access Control Lists */ +#define EXT4_MOUNT_NO_AUTO_DA_ALLOC 0x10000 /* No auto delalloc mapping */ +#define EXT4_MOUNT_BARRIER 0x20000 /* Use block barriers */ +#define EXT4_MOUNT_QUOTA 0x80000 /* Some quota option set */ +#define EXT4_MOUNT_USRQUOTA 0x100000 /* "old" user quota */ +#define EXT4_MOUNT_GRPQUOTA 0x200000 /* "old" group quota */ +#define EXT4_MOUNT_DIOREAD_NOLOCK 0x400000 /* Enable support for dio read nolocking */ +#define EXT4_MOUNT_JOURNAL_CHECKSUM 0x800000 /* Journal checksums */ +#define EXT4_MOUNT_JOURNAL_ASYNC_COMMIT 0x1000000 /* Journal Async Commit */ +#define EXT4_MOUNT_MBLK_IO_SUBMIT 0x4000000 /* multi-block io submits */ +#define EXT4_MOUNT_DELALLOC 0x8000000 /* 
Delalloc support */ +#define EXT4_MOUNT_DATA_ERR_ABORT 0x10000000 /* Abort on file data write */ +#define EXT4_MOUNT_BLOCK_VALIDITY 0x20000000 /* Block validity checking */ +#define EXT4_MOUNT_DISCARD 0x40000000 /* Issue DISCARD requests */ +#define EXT4_MOUNT_INIT_INODE_TABLE 0x80000000 /* Initialize uninitialized itables */ + +#define EXT4_MOUNT2_EXPLICIT_DELALLOC 0x00000001 /* User explicitly + specified delalloc */ + +#define clear_opt(sb, opt) EXT4_SB(sb)->s_mount_opt &= \ + ~EXT4_MOUNT_##opt +#define set_opt(sb, opt) EXT4_SB(sb)->s_mount_opt |= \ + EXT4_MOUNT_##opt +#define test_opt(sb, opt) (EXT4_SB(sb)->s_mount_opt & \ + EXT4_MOUNT_##opt) + +#define clear_opt2(sb, opt) EXT4_SB(sb)->s_mount_opt2 &= \ + ~EXT4_MOUNT2_##opt +#define set_opt2(sb, opt) EXT4_SB(sb)->s_mount_opt2 |= \ + EXT4_MOUNT2_##opt +#define test_opt2(sb, opt) (EXT4_SB(sb)->s_mount_opt2 & \ + EXT4_MOUNT2_##opt) + +#define ext4_test_and_set_bit __test_and_set_bit_le +#define ext4_set_bit __set_bit_le +#define ext4_set_bit_atomic ext2_set_bit_atomic +#define ext4_test_and_clear_bit __test_and_clear_bit_le +#define ext4_clear_bit __clear_bit_le +#define ext4_clear_bit_atomic ext2_clear_bit_atomic +#define ext4_test_bit test_bit_le +#define ext4_find_next_zero_bit find_next_zero_bit_le +#define ext4_find_next_bit find_next_bit_le + +extern void ext4_set_bits(void *bm, int cur, int len); + +/* + * Maximal mount counts between two filesystem checks + */ +#define EXT4_DFL_MAX_MNT_COUNT 20 /* Allow 20 mounts */ +#define EXT4_DFL_CHECKINTERVAL 0 /* Don't use interval check */ + +/* + * Behaviour when detecting errors + */ +#define EXT4_ERRORS_CONTINUE 1 /* Continue execution */ +#define EXT4_ERRORS_RO 2 /* Remount fs read-only */ +#define EXT4_ERRORS_PANIC 3 /* Panic */ +#define EXT4_ERRORS_DEFAULT EXT4_ERRORS_CONTINUE + +/* + * Structure of the super block + */ +struct ext4_super_block { +/*00*/ __le32 s_inodes_count; /* Inodes count */ + __le32 s_blocks_count_lo; /* Blocks count */ + __le32 
s_r_blocks_count_lo; /* Reserved blocks count */ + __le32 s_free_blocks_count_lo; /* Free blocks count */ +/*10*/ __le32 s_free_inodes_count; /* Free inodes count */ + __le32 s_first_data_block; /* First Data Block */ + __le32 s_log_block_size; /* Block size */ + __le32 s_log_cluster_size; /* Allocation cluster size */ +/*20*/ __le32 s_blocks_per_group; /* # Blocks per group */ + __le32 s_clusters_per_group; /* # Clusters per group */ + __le32 s_inodes_per_group; /* # Inodes per group */ + __le32 s_mtime; /* Mount time */ +/*30*/ __le32 s_wtime; /* Write time */ + __le16 s_mnt_count; /* Mount count */ + __le16 s_max_mnt_count; /* Maximal mount count */ + __le16 s_magic; /* Magic signature */ + __le16 s_state; /* File system state */ + __le16 s_errors; /* Behaviour when detecting errors */ + __le16 s_minor_rev_level; /* minor revision level */ +/*40*/ __le32 s_lastcheck; /* time of last check */ + __le32 s_checkinterval; /* max. time between checks */ + __le32 s_creator_os; /* OS */ + __le32 s_rev_level; /* Revision level */ +/*50*/ __le16 s_def_resuid; /* Default uid for reserved blocks */ + __le16 s_def_resgid; /* Default gid for reserved blocks */ + /* + * These fields are for EXT4_DYNAMIC_REV superblocks only. + * + * Note: the difference between the compatible feature set and + * the incompatible feature set is that if there is a bit set + * in the incompatible feature set that the kernel doesn't + * know about, it should refuse to mount the filesystem. + * + * e2fsck's requirements are more strict; if it doesn't know + * about a feature in either the compatible or incompatible + * feature set, it must abort and not try to meddle with + * things it doesn't understand... 
+ */ + __le32 s_first_ino; /* First non-reserved inode */ + __le16 s_inode_size; /* size of inode structure */ + __le16 s_block_group_nr; /* block group # of this superblock */ + __le32 s_feature_compat; /* compatible feature set */ +/*60*/ __le32 s_feature_incompat; /* incompatible feature set */ + __le32 s_feature_ro_compat; /* readonly-compatible feature set */ +/*68*/ __u8 s_uuid[16]; /* 128-bit uuid for volume */ +/*78*/ char s_volume_name[16]; /* volume name */ +/*88*/ char s_last_mounted[64]; /* directory where last mounted */ +/*C8*/ __le32 s_algorithm_usage_bitmap; /* For compression */ + /* + * Performance hints. Directory preallocation should only + * happen if the EXT4_FEATURE_COMPAT_DIR_PREALLOC flag is on. + */ + __u8 s_prealloc_blocks; /* Nr of blocks to try to preallocate*/ + __u8 s_prealloc_dir_blocks; /* Nr to preallocate for dirs */ + __le16 s_reserved_gdt_blocks; /* Per group desc for online growth */ + /* + * Journaling support valid if EXT4_FEATURE_COMPAT_HAS_JOURNAL set. 
+ */ +/*D0*/ __u8 s_journal_uuid[16]; /* uuid of journal superblock */ +/*E0*/ __le32 s_journal_inum; /* inode number of journal file */ + __le32 s_journal_dev; /* device number of journal file */ + __le32 s_last_orphan; /* start of list of inodes to delete */ + __le32 s_hash_seed[4]; /* HTREE hash seed */ + __u8 s_def_hash_version; /* Default hash version to use */ + __u8 s_jnl_backup_type; + __le16 s_desc_size; /* size of group descriptor */ +/*100*/ __le32 s_default_mount_opts; + __le32 s_first_meta_bg; /* First metablock block group */ + __le32 s_mkfs_time; /* When the filesystem was created */ + __le32 s_jnl_blocks[17]; /* Backup of the journal inode */ + /* 64bit support valid if EXT4_FEATURE_INCOMPAT_64BIT */ +/*150*/ __le32 s_blocks_count_hi; /* Blocks count */ + __le32 s_r_blocks_count_hi; /* Reserved blocks count */ + __le32 s_free_blocks_count_hi; /* Free blocks count */ + __le16 s_min_extra_isize; /* All inodes have at least # bytes */ + __le16 s_want_extra_isize; /* New inodes should reserve # bytes */ + __le32 s_flags; /* Miscellaneous flags */ + __le16 s_raid_stride; /* RAID stride */ + __le16 s_mmp_update_interval; /* # seconds to wait in MMP checking */ + __le64 s_mmp_block; /* Block for multi-mount protection */ + __le32 s_raid_stripe_width; /* blocks on all data disks (N*stride)*/ + __u8 s_log_groups_per_flex; /* FLEX_BG group size */ + __u8 s_reserved_char_pad; + __le16 s_reserved_pad; + __le64 s_kbytes_written; /* nr of lifetime kilobytes written */ + __le32 s_snapshot_inum; /* Inode number of active snapshot */ + __le32 s_snapshot_id; /* sequential ID of active snapshot */ + __le64 s_snapshot_r_blocks_count; /* reserved blocks for active + snapshot's future use */ + __le32 s_snapshot_list; /* inode number of the head of the + on-disk snapshot list */ +#define EXT4_S_ERR_START offsetof(struct ext4_super_block, s_error_count) + __le32 s_error_count; /* number of fs errors */ + __le32 s_first_error_time; /* first time an error happened */ + __le32
s_first_error_ino; /* inode involved in first error */ + __le64 s_first_error_block; /* block involved of first error */ + __u8 s_first_error_func[32]; /* function where the error happened */ + __le32 s_first_error_line; /* line number where error happened */ + __le32 s_last_error_time; /* most recent time of an error */ + __le32 s_last_error_ino; /* inode involved in last error */ + __le32 s_last_error_line; /* line number where error happened */ + __le64 s_last_error_block; /* block involved of last error */ + __u8 s_last_error_func[32]; /* function where the error happened */ +#define EXT4_S_ERR_END offsetof(struct ext4_super_block, s_mount_opts) + __u8 s_mount_opts[64]; + __le32 s_usr_quota_inum; /* inode for tracking user quota */ + __le32 s_grp_quota_inum; /* inode for tracking group quota */ + __le32 s_overhead_clusters; /* overhead blocks/clusters in fs */ + __le32 s_reserved[109]; /* Padding to the end of the block */ +}; + +#define EXT4_S_ERR_LEN (EXT4_S_ERR_END - EXT4_S_ERR_START) + +#ifdef __KERNEL__ + +/* + * run-time mount flags + */ +#define EXT4_MF_MNTDIR_SAMPLED 0x0001 +#define EXT4_MF_FS_ABORTED 0x0002 /* Fatal error detected */ + +/* + * fourth extended-fs super-block data in memory + */ +struct ext4_sb_info { + unsigned long s_desc_size; /* Size of a group descriptor in bytes */ + unsigned long s_inodes_per_block;/* Number of inodes per block */ + unsigned long s_blocks_per_group;/* Number of blocks in a group */ + unsigned long s_clusters_per_group; /* Number of clusters in a group */ + unsigned long s_inodes_per_group;/* Number of inodes in a group */ + unsigned long s_itb_per_group; /* Number of inode table blocks per group */ + unsigned long s_gdb_count; /* Number of group descriptor blocks */ + unsigned long s_desc_per_block; /* Number of group descriptors per block */ + ext4_group_t s_groups_count; /* Number of groups in the fs */ + ext4_group_t s_blockfile_groups;/* Groups acceptable for non-extent files */ + unsigned long 
s_overhead_last; /* Last calculated overhead */ + unsigned long s_blocks_last; /* Last seen block count */ + unsigned int s_cluster_ratio; /* Number of blocks per cluster */ + unsigned int s_cluster_bits; /* log2 of s_cluster_ratio */ + loff_t s_bitmap_maxbytes; /* max bytes for bitmap files */ + struct buffer_head * s_sbh; /* Buffer containing the super block */ + struct ext4_super_block *s_es; /* Pointer to the super block in the buffer */ + struct buffer_head **s_group_desc; + unsigned int s_mount_opt; + unsigned int s_mount_opt2; + unsigned int s_mount_flags; + unsigned int s_def_mount_opt; + ext4_fsblk_t s_sb_block; + uid_t s_resuid; + gid_t s_resgid; + unsigned short s_mount_state; + unsigned short s_pad; + int s_addr_per_block_bits; + int s_desc_per_block_bits; + int s_inode_size; + int s_first_ino; + unsigned int s_inode_readahead_blks; + unsigned int s_inode_goal; + spinlock_t s_next_gen_lock; + u32 s_next_generation; + u32 s_hash_seed[4]; + int s_def_hash_version; + int s_hash_unsigned; /* 3 if hash should be unsigned, 0 if not */ + struct percpu_counter s_freeclusters_counter; + struct percpu_counter s_freeinodes_counter; + struct percpu_counter s_dirs_counter; + struct percpu_counter s_dirtyclusters_counter; + struct blockgroup_lock *s_blockgroup_lock; + struct proc_dir_entry *s_proc; + struct kobject s_kobj; + struct completion s_kobj_unregister; + + /* Journaling */ + struct journal_s *s_journal; + struct list_head s_orphan; + struct mutex s_orphan_lock; + unsigned long s_resize_flags; /* Flags indicating if there + is a resizer */ + unsigned long s_commit_interval; + u32 s_max_batch_time; + u32 s_min_batch_time; + struct block_device *journal_bdev; +#ifdef CONFIG_QUOTA + char *s_qf_names[MAXQUOTAS]; /* Names of quota files with journalled quota */ + int s_jquota_fmt; /* Format of quota to use */ +#endif + unsigned int s_want_extra_isize; /* New inodes should reserve # bytes */ + struct rb_root system_blks; + +#ifdef EXTENTS_STATS + /* ext4 extents
stats */ + unsigned long s_ext_min; + unsigned long s_ext_max; + unsigned long s_depth_max; + spinlock_t s_ext_stats_lock; + unsigned long s_ext_blocks; + unsigned long s_ext_extents; +#endif + + /* for buddy allocator */ + struct ext4_group_info ***s_group_info; + struct inode *s_buddy_cache; + spinlock_t s_md_lock; + unsigned short *s_mb_offsets; + unsigned int *s_mb_maxs; + + /* tunables */ + unsigned long s_stripe; + unsigned int s_mb_stream_request; + unsigned int s_mb_max_to_scan; + unsigned int s_mb_min_to_scan; + unsigned int s_mb_stats; + unsigned int s_mb_order2_reqs; + unsigned int s_mb_group_prealloc; + unsigned int s_max_writeback_mb_bump; + /* where last allocation was done - for stream allocation */ + unsigned long s_mb_last_group; + unsigned long s_mb_last_start; + + /* stats for buddy allocator */ + atomic_t s_bal_reqs; /* number of reqs with len > 1 */ + atomic_t s_bal_success; /* we found long enough chunks */ + atomic_t s_bal_allocated; /* in blocks */ + atomic_t s_bal_ex_scanned; /* total extents scanned */ + atomic_t s_bal_goals; /* goal hits */ + atomic_t s_bal_breaks; /* too long searches */ + atomic_t s_bal_2orders; /* 2^order hits */ + spinlock_t s_bal_lock; + unsigned long s_mb_buddies_generated; + unsigned long long s_mb_generation_time; + atomic_t s_mb_lost_chunks; + atomic_t s_mb_preallocated; + atomic_t s_mb_discarded; + atomic_t s_lock_busy; + + /* locality groups */ + struct ext4_locality_group __percpu *s_locality_groups; + + /* for write statistics */ + unsigned long s_sectors_written_start; + u64 s_kbytes_written; + + unsigned int s_log_groups_per_flex; + struct flex_groups *s_flex_groups; + + /* workqueue for dio unwritten */ + struct workqueue_struct *dio_unwritten_wq; + + /* timer for periodic error stats printing */ + struct timer_list s_err_report; + + /* Lazy inode table initialization info */ + struct ext4_li_request *s_li_request; + /* Wait multiplier for lazy initialization thread */ + unsigned int s_li_wait_mult; + + /* 
Kernel thread for multiple mount protection */ + struct task_struct *s_mmp_tsk; + + /* record the last minlen when FITRIM is called. */ + atomic_t s_last_trim_minblks; +}; + +static inline struct ext4_sb_info *EXT4_SB(struct super_block *sb) +{ + return sb->s_fs_info; +} +static inline struct ext4_inode_info *EXT4_I(struct inode *inode) +{ + return container_of(inode, struct ext4_inode_info, vfs_inode); +} + +static inline struct timespec ext4_current_time(struct inode *inode) +{ + return (inode->i_sb->s_time_gran < NSEC_PER_SEC) ? + current_fs_time(inode->i_sb) : CURRENT_TIME_SEC; +} + +static inline int ext4_valid_inum(struct super_block *sb, unsigned long ino) +{ + return ino == EXT4_ROOT_INO || + ino == EXT4_JOURNAL_INO || + ino == EXT4_RESIZE_INO || + (ino >= EXT4_FIRST_INO(sb) && + ino <= le32_to_cpu(EXT4_SB(sb)->s_es->s_inodes_count)); +} + +static inline void ext4_set_io_unwritten_flag(struct inode *inode, + struct ext4_io_end *io_end) +{ + if (!(io_end->flag & EXT4_IO_END_UNWRITTEN)) { + io_end->flag |= EXT4_IO_END_UNWRITTEN; + atomic_inc(&EXT4_I(inode)->i_aiodio_unwritten); + } +} + +/* + * Inode dynamic state flags + */ +enum { + EXT4_STATE_JDATA, /* journaled data exists */ + EXT4_STATE_NEW, /* inode is newly created */ + EXT4_STATE_XATTR, /* has in-inode xattrs */ + EXT4_STATE_NO_EXPAND, /* No space for expansion */ + EXT4_STATE_DA_ALLOC_CLOSE, /* Alloc DA blks on close */ + EXT4_STATE_EXT_MIGRATE, /* Inode is migrating */ + EXT4_STATE_DIO_UNWRITTEN, /* need convert on dio done*/ + EXT4_STATE_NEWENTRY, /* File just added to dir */ + EXT4_STATE_DELALLOC_RESERVED, /* blks already reserved for delalloc */ +}; + +#define EXT4_INODE_BIT_FNS(name, field, offset) \ +static inline int ext4_test_inode_##name(struct inode *inode, int bit) \ +{ \ + return test_bit(bit + (offset), &EXT4_I(inode)->i_##field); \ +} \ +static inline void ext4_set_inode_##name(struct inode *inode, int bit) \ +{ \ + set_bit(bit + (offset), &EXT4_I(inode)->i_##field); \ +} \ +static 
inline void ext4_clear_inode_##name(struct inode *inode, int bit) \ +{ \ + clear_bit(bit + (offset), &EXT4_I(inode)->i_##field); \ +} + +EXT4_INODE_BIT_FNS(flag, flags, 0) +#if (BITS_PER_LONG < 64) +EXT4_INODE_BIT_FNS(state, state_flags, 0) + +static inline void ext4_clear_state_flags(struct ext4_inode_info *ei) +{ + (ei)->i_state_flags = 0; +} +#else +EXT4_INODE_BIT_FNS(state, flags, 32) + +static inline void ext4_clear_state_flags(struct ext4_inode_info *ei) +{ + /* We depend on the fact that callers will set i_flags */ +} +#endif +#else +/* Assume that user mode programs are passing in an ext4fs superblock, not + * a kernel struct super_block. This will allow us to call the feature-test + * macros from user land. */ +#define EXT4_SB(sb) (sb) +#endif + +#define NEXT_ORPHAN(inode) EXT4_I(inode)->i_dtime + +/* + * Codes for operating systems + */ +#define EXT4_OS_LINUX 0 +#define EXT4_OS_HURD 1 +#define EXT4_OS_MASIX 2 +#define EXT4_OS_FREEBSD 3 +#define EXT4_OS_LITES 4 + +/* + * Revision levels + */ +#define EXT4_GOOD_OLD_REV 0 /* The good old (original) format */ +#define EXT4_DYNAMIC_REV 1 /* V2 format w/ dynamic inode sizes */ + +#define EXT4_CURRENT_REV EXT4_GOOD_OLD_REV +#define EXT4_MAX_SUPP_REV EXT4_DYNAMIC_REV + +#define EXT4_GOOD_OLD_INODE_SIZE 128 + +/* + * Feature set definitions + */ + +#define EXT4_HAS_COMPAT_FEATURE(sb,mask) \ + ((EXT4_SB(sb)->s_es->s_feature_compat & cpu_to_le32(mask)) != 0) +#define EXT4_HAS_RO_COMPAT_FEATURE(sb,mask) \ + ((EXT4_SB(sb)->s_es->s_feature_ro_compat & cpu_to_le32(mask)) != 0) +#define EXT4_HAS_INCOMPAT_FEATURE(sb,mask) \ + ((EXT4_SB(sb)->s_es->s_feature_incompat & cpu_to_le32(mask)) != 0) +#define EXT4_SET_COMPAT_FEATURE(sb,mask) \ + EXT4_SB(sb)->s_es->s_feature_compat |= cpu_to_le32(mask) +#define EXT4_SET_RO_COMPAT_FEATURE(sb,mask) \ + EXT4_SB(sb)->s_es->s_feature_ro_compat |= cpu_to_le32(mask) +#define EXT4_SET_INCOMPAT_FEATURE(sb,mask) \ + EXT4_SB(sb)->s_es->s_feature_incompat |= cpu_to_le32(mask) +#define 
EXT4_CLEAR_COMPAT_FEATURE(sb,mask) \ + EXT4_SB(sb)->s_es->s_feature_compat &= ~cpu_to_le32(mask) +#define EXT4_CLEAR_RO_COMPAT_FEATURE(sb,mask) \ + EXT4_SB(sb)->s_es->s_feature_ro_compat &= ~cpu_to_le32(mask) +#define EXT4_CLEAR_INCOMPAT_FEATURE(sb,mask) \ + EXT4_SB(sb)->s_es->s_feature_incompat &= ~cpu_to_le32(mask) + +#define EXT4_FEATURE_COMPAT_DIR_PREALLOC 0x0001 +#define EXT4_FEATURE_COMPAT_IMAGIC_INODES 0x0002 +#define EXT4_FEATURE_COMPAT_HAS_JOURNAL 0x0004 +#define EXT4_FEATURE_COMPAT_EXT_ATTR 0x0008 +#define EXT4_FEATURE_COMPAT_RESIZE_INODE 0x0010 +#define EXT4_FEATURE_COMPAT_DIR_INDEX 0x0020 + +#define EXT4_FEATURE_RO_COMPAT_SPARSE_SUPER 0x0001 +#define EXT4_FEATURE_RO_COMPAT_LARGE_FILE 0x0002 +#define EXT4_FEATURE_RO_COMPAT_BTREE_DIR 0x0004 +#define EXT4_FEATURE_RO_COMPAT_HUGE_FILE 0x0008 +#define EXT4_FEATURE_RO_COMPAT_GDT_CSUM 0x0010 +#define EXT4_FEATURE_RO_COMPAT_DIR_NLINK 0x0020 +#define EXT4_FEATURE_RO_COMPAT_EXTRA_ISIZE 0x0040 +#define EXT4_FEATURE_RO_COMPAT_QUOTA 0x0100 +#define EXT4_FEATURE_RO_COMPAT_BIGALLOC 0x0200 +#define EXT4_FEATURE_RO_COMPAT_METADATA_CSUM 0x0400 + +#define EXT4_FEATURE_INCOMPAT_COMPRESSION 0x0001 +#define EXT4_FEATURE_INCOMPAT_FILETYPE 0x0002 +#define EXT4_FEATURE_INCOMPAT_RECOVER 0x0004 /* Needs recovery */ +#define EXT4_FEATURE_INCOMPAT_JOURNAL_DEV 0x0008 /* Journal device */ +#define EXT4_FEATURE_INCOMPAT_META_BG 0x0010 +#define EXT4_FEATURE_INCOMPAT_EXTENTS 0x0040 /* extents support */ +#define EXT4_FEATURE_INCOMPAT_64BIT 0x0080 +#define EXT4_FEATURE_INCOMPAT_MMP 0x0100 +#define EXT4_FEATURE_INCOMPAT_FLEX_BG 0x0200 +#define EXT4_FEATURE_INCOMPAT_EA_INODE 0x0400 /* EA in inode */ +#define EXT4_FEATURE_INCOMPAT_DIRDATA 0x1000 /* data in dirent */ +#define EXT4_FEATURE_INCOMPAT_BG_USE_META_CSUM 0x2000 /* use crc32c for bg */ +#define EXT4_FEATURE_INCOMPAT_LARGEDIR 0x4000 /* >2GB or 3-lvl htree */ +#define EXT4_FEATURE_INCOMPAT_INLINEDATA 0x8000 /* data in inode */ + +#define EXT2_FEATURE_COMPAT_SUPP 
EXT4_FEATURE_COMPAT_EXT_ATTR +#define EXT2_FEATURE_INCOMPAT_SUPP (EXT4_FEATURE_INCOMPAT_FILETYPE| \ + EXT4_FEATURE_INCOMPAT_META_BG) +#define EXT2_FEATURE_RO_COMPAT_SUPP (EXT4_FEATURE_RO_COMPAT_SPARSE_SUPER| \ + EXT4_FEATURE_RO_COMPAT_LARGE_FILE| \ + EXT4_FEATURE_RO_COMPAT_BTREE_DIR) + +#define EXT3_FEATURE_COMPAT_SUPP EXT4_FEATURE_COMPAT_EXT_ATTR +#define EXT3_FEATURE_INCOMPAT_SUPP (EXT4_FEATURE_INCOMPAT_FILETYPE| \ + EXT4_FEATURE_INCOMPAT_RECOVER| \ + EXT4_FEATURE_INCOMPAT_META_BG) +#define EXT3_FEATURE_RO_COMPAT_SUPP (EXT4_FEATURE_RO_COMPAT_SPARSE_SUPER| \ + EXT4_FEATURE_RO_COMPAT_LARGE_FILE| \ + EXT4_FEATURE_RO_COMPAT_BTREE_DIR) + +#define EXT4_FEATURE_COMPAT_SUPP EXT4_FEATURE_COMPAT_EXT_ATTR /* same bit as the EXT3_ compat set */ +#define EXT4_FEATURE_INCOMPAT_SUPP (EXT4_FEATURE_INCOMPAT_FILETYPE| \ + EXT4_FEATURE_INCOMPAT_RECOVER| \ + EXT4_FEATURE_INCOMPAT_META_BG| \ + EXT4_FEATURE_INCOMPAT_EXTENTS| \ + EXT4_FEATURE_INCOMPAT_64BIT| \ + EXT4_FEATURE_INCOMPAT_FLEX_BG| \ + EXT4_FEATURE_INCOMPAT_MMP) +#define EXT4_FEATURE_RO_COMPAT_SUPP (EXT4_FEATURE_RO_COMPAT_SPARSE_SUPER| \ + EXT4_FEATURE_RO_COMPAT_LARGE_FILE| \ + EXT4_FEATURE_RO_COMPAT_GDT_CSUM| \ + EXT4_FEATURE_RO_COMPAT_DIR_NLINK | \ + EXT4_FEATURE_RO_COMPAT_EXTRA_ISIZE | \ + EXT4_FEATURE_RO_COMPAT_BTREE_DIR |\ + EXT4_FEATURE_RO_COMPAT_HUGE_FILE |\ + EXT4_FEATURE_RO_COMPAT_BIGALLOC) + +/* + * Default values for user and/or group using reserved blocks + */ +#define EXT4_DEF_RESUID 0 +#define EXT4_DEF_RESGID 0 + +#define EXT4_DEF_INODE_READAHEAD_BLKS 32 + +/* + * Default mount options + */ +#define EXT4_DEFM_DEBUG 0x0001 +#define EXT4_DEFM_BSDGROUPS 0x0002 +#define EXT4_DEFM_XATTR_USER 0x0004 +#define EXT4_DEFM_ACL 0x0008 +#define EXT4_DEFM_UID16 0x0010 +#define EXT4_DEFM_JMODE 0x0060 +#define EXT4_DEFM_JMODE_DATA 0x0020 +#define EXT4_DEFM_JMODE_ORDERED 0x0040 +#define EXT4_DEFM_JMODE_WBACK 0x0060 +#define EXT4_DEFM_NOBARRIER 0x0100 +#define EXT4_DEFM_BLOCK_VALIDITY 0x0200 +#define EXT4_DEFM_DISCARD 0x0400 +#define EXT4_DEFM_NODELALLOC 0x0800 + +/*
+ * Default journal batch times + */ +#define EXT4_DEF_MIN_BATCH_TIME 0 +#define EXT4_DEF_MAX_BATCH_TIME 15000 /* 15ms */ + +/* + * Minimum number of groups in a flexgroup before we separate out + * directories into the first block group of a flexgroup + */ +#define EXT4_FLEX_SIZE_DIR_ALLOC_SCHEME 4 + +/* + * Structure of a directory entry + */ +#define EXT4_NAME_LEN 255 + +struct ext4_dir_entry { + __le32 inode; /* Inode number */ + __le16 rec_len; /* Directory entry length */ + __le16 name_len; /* Name length */ + char name[EXT4_NAME_LEN]; /* File name */ +}; + +/* + * The new version of the directory entry. Since EXT4 structures are + * stored in intel byte order, and the name_len field could never be + * bigger than 255 chars, it's safe to reclaim the extra byte for the + * file_type field. + */ +struct ext4_dir_entry_2 { + __le32 inode; /* Inode number */ + __le16 rec_len; /* Directory entry length */ + __u8 name_len; /* Name length */ + __u8 file_type; + char name[EXT4_NAME_LEN]; /* File name */ +}; + +/* + * Ext4 directory file types. Only the low 3 bits are used. The + * other bits are reserved for now. + */ +#define EXT4_FT_UNKNOWN 0 +#define EXT4_FT_REG_FILE 1 +#define EXT4_FT_DIR 2 +#define EXT4_FT_CHRDEV 3 +#define EXT4_FT_BLKDEV 4 +#define EXT4_FT_FIFO 5 +#define EXT4_FT_SOCK 6 +#define EXT4_FT_SYMLINK 7 + +#define EXT4_FT_MAX 8 + +/* + * EXT4_DIR_PAD defines the directory entries boundaries + * + * NOTE: It must be a multiple of 4 + */ +#define EXT4_DIR_PAD 4 +#define EXT4_DIR_ROUND (EXT4_DIR_PAD - 1) +#define EXT4_DIR_REC_LEN(name_len) (((name_len) + 8 + EXT4_DIR_ROUND) & \ + ~EXT4_DIR_ROUND) +#define EXT4_MAX_REC_LEN ((1<<16)-1) + +/* + * If we ever get support for fs block sizes > page_size, we'll need + * to remove the #if statements in the next two functions... 
+ */ +static inline unsigned int +ext4_rec_len_from_disk(__le16 dlen, unsigned blocksize) +{ + unsigned len = le16_to_cpu(dlen); + +#if (PAGE_CACHE_SIZE >= 65536) + if (len == EXT4_MAX_REC_LEN || len == 0) + return blocksize; + return (len & 65532) | ((len & 3) << 16); +#else + return len; +#endif +} + +static inline __le16 ext4_rec_len_to_disk(unsigned len, unsigned blocksize) +{ + if ((len > blocksize) || (blocksize > (1 << 18)) || (len & 3)) + BUG(); +#if (PAGE_CACHE_SIZE >= 65536) + if (len < 65536) + return cpu_to_le16(len); + if (len == blocksize) { + if (blocksize == 65536) + return cpu_to_le16(EXT4_MAX_REC_LEN); + else + return cpu_to_le16(0); + } + return cpu_to_le16((len & 65532) | ((len >> 16) & 3)); +#else + return cpu_to_le16(len); +#endif +} + +/* + * Hash Tree Directory indexing + * (c) Daniel Phillips, 2001 + */ + +#define is_dx(dir) (EXT4_HAS_COMPAT_FEATURE((dir)->i_sb, \ + EXT4_FEATURE_COMPAT_DIR_INDEX) && \ + ext4_test_inode_flag((dir), EXT4_INODE_INDEX)) +#define EXT4_DIR_LINK_MAX(dir) (!is_dx(dir) && (dir)->i_nlink >= EXT4_LINK_MAX) +#define EXT4_DIR_LINK_EMPTY(dir) ((dir)->i_nlink == 2 || (dir)->i_nlink == 1) + +/* Legal values for the dx_root hash_version field: */ + +#define DX_HASH_LEGACY 0 +#define DX_HASH_HALF_MD4 1 +#define DX_HASH_TEA 2 +#define DX_HASH_LEGACY_UNSIGNED 3 +#define DX_HASH_HALF_MD4_UNSIGNED 4 +#define DX_HASH_TEA_UNSIGNED 5 + +#ifdef __KERNEL__ + +/* hash info structure used by the directory hash */ +struct dx_hash_info +{ + u32 hash; + u32 minor_hash; + int hash_version; + u32 *seed; +}; + + +/* 32 and 64 bit signed EOF for dx directories */ +#define EXT4_HTREE_EOF_32BIT ((1UL << (32 - 1)) - 1) +#define EXT4_HTREE_EOF_64BIT ((1ULL << (64 - 1)) - 1) + + +/* + * Control parameters used by ext4_htree_next_block + */ +#define HASH_NB_ALWAYS 1 + + +/* + * Describe an inode's exact location on disk and in memory + */ +struct ext4_iloc +{ + struct buffer_head *bh; + unsigned long offset; + ext4_group_t block_group; +}; + +static
inline struct ext4_inode *ext4_raw_inode(struct ext4_iloc *iloc) +{ + return (struct ext4_inode *) (iloc->bh->b_data + iloc->offset); +} + +/* + * This structure is stuffed into the struct file's private_data field + * for directories. It is where we put information so that we can do + * readdir operations in hash tree order. + */ +struct dir_private_info { + struct rb_root root; + struct rb_node *curr_node; + struct fname *extra_fname; + loff_t last_pos; + __u32 curr_hash; + __u32 curr_minor_hash; + __u32 next_hash; +}; + +/* calculate the first block number of the group */ +static inline ext4_fsblk_t +ext4_group_first_block_no(struct super_block *sb, ext4_group_t group_no) +{ + return group_no * (ext4_fsblk_t)EXT4_BLOCKS_PER_GROUP(sb) + + le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block); +} + +/* + * Special error return code only used by dx_probe() and its callers. + */ +#define ERR_BAD_DX_DIR -75000 + +void ext4_get_group_no_and_offset(struct super_block *sb, ext4_fsblk_t blocknr, + ext4_group_t *blockgrpp, ext4_grpblk_t *offsetp); + +/* + * Timeout and state flag for lazy initialization inode thread. + */ +#define EXT4_DEF_LI_WAIT_MULT 10 +#define EXT4_DEF_LI_MAX_START_DELAY 5 +#define EXT4_LAZYINIT_QUIT 0x0001 +#define EXT4_LAZYINIT_RUNNING 0x0002 + +/* + * Lazy inode table initialization info + */ +struct ext4_lazy_init { + unsigned long li_state; + struct list_head li_request_list; + struct mutex li_list_mtx; +}; + +struct ext4_li_request { + struct super_block *lr_super; + struct ext4_sb_info *lr_sbi; + ext4_group_t lr_next_group; + struct list_head lr_request; + unsigned long lr_next_sched; + unsigned long lr_timeout; +}; + +struct ext4_features { + struct kobject f_kobj; + struct completion f_kobj_unregister; +}; + +/* + * This structure will be used for multiple mount protection. It will be + * written into the block number saved in the s_mmp_block field in the + * superblock. 
Programs that check MMP should assume that if + * SEQ_FSCK (or any unknown code above SEQ_MAX) is present then it is NOT safe + * to use the filesystem, regardless of how old the timestamp is. + */ +#define EXT4_MMP_MAGIC 0x004D4D50U /* ASCII for MMP */ +#define EXT4_MMP_SEQ_CLEAN 0xFF4D4D50U /* mmp_seq value for clean unmount */ +#define EXT4_MMP_SEQ_FSCK 0xE24D4D50U /* mmp_seq value when being fscked */ +#define EXT4_MMP_SEQ_MAX 0xE24D4D4FU /* maximum valid mmp_seq value */ + +struct mmp_struct { + __le32 mmp_magic; /* Magic number for MMP */ + __le32 mmp_seq; /* Sequence no. updated periodically */ + + /* + * mmp_time, mmp_nodename & mmp_bdevname are only used for information + * purposes and do not affect the correctness of the algorithm + */ + __le64 mmp_time; /* Time last updated */ + char mmp_nodename[64]; /* Node which last updated MMP block */ + char mmp_bdevname[32]; /* Bdev which last updated MMP block */ + + /* + * mmp_check_interval is used to verify if the MMP block has been + * updated on the block device. The value is updated based on the + * maximum time to write the MMP block during an update cycle. + */ + __le16 mmp_check_interval; + + __le16 mmp_pad1; + __le32 mmp_pad2[227]; +}; + +/* arguments passed to the mmp thread */ +struct mmpd_data { + struct buffer_head *bh; /* bh from initial read_mmp_block() */ + struct super_block *sb; /* super block of the fs */ +}; + +/* + * Check interval multiplier + * The MMP block is written every update interval and initially checked every + * update interval x the multiplier (the value is then adapted based on the + * write latency). The reason is that writes can be delayed under load and we + * don't want readers to incorrectly assume that the filesystem is no longer + * in use. + */ +#define EXT4_MMP_CHECK_MULT 2UL + +/* + * Minimum interval for MMP checking in seconds. + */ +#define EXT4_MMP_MIN_CHECK_INTERVAL 5UL + +/* + * Maximum interval for MMP checking in seconds. 
+ */ +#define EXT4_MMP_MAX_CHECK_INTERVAL 300UL + +/* + * Function prototypes + */ + +/* + * Ok, these declarations are also in <linux/kernel.h> but none of the + * ext4 source programs needs to include it so they are duplicated here. + */ +# define NORET_TYPE /**/ +# define ATTRIB_NORET __attribute__((noreturn)) +# define NORET_AND noreturn, + +/* bitmap.c */ +extern unsigned int ext4_count_free(struct buffer_head *, unsigned); + +/* balloc.c */ +extern unsigned int ext4_block_group(struct super_block *sb, + ext4_fsblk_t blocknr); +extern ext4_grpblk_t ext4_block_group_offset(struct super_block *sb, + ext4_fsblk_t blocknr); +extern int ext4_bg_has_super(struct super_block *sb, ext4_group_t group); +extern unsigned long ext4_bg_num_gdb(struct super_block *sb, + ext4_group_t group); +extern ext4_fsblk_t ext4_new_meta_blocks(handle_t *handle, struct inode *inode, + ext4_fsblk_t goal, + unsigned int flags, + unsigned long *count, + int *errp); +extern int ext4_claim_free_clusters(struct ext4_sb_info *sbi, + s64 nclusters, unsigned int flags); +extern ext4_fsblk_t ext4_count_free_clusters(struct super_block *); +extern void ext4_check_blocks_bitmap(struct super_block *); +extern struct ext4_group_desc * ext4_get_group_desc(struct super_block * sb, + ext4_group_t block_group, + struct buffer_head ** bh); +extern int ext4_should_retry_alloc(struct super_block *sb, int *retries); + +extern struct buffer_head *ext4_read_block_bitmap_nowait(struct super_block *sb, + ext4_group_t block_group); +extern int ext4_wait_block_bitmap(struct super_block *sb, + ext4_group_t block_group, + struct buffer_head *bh); +extern struct buffer_head *ext4_read_block_bitmap(struct super_block *sb, + ext4_group_t block_group); +extern void ext4_init_block_bitmap(struct super_block *sb, + struct buffer_head *bh, + ext4_group_t group, + struct ext4_group_desc *desc); +extern unsigned ext4_free_clusters_after_init(struct super_block *sb, + ext4_group_t block_group, + struct ext4_group_desc *gdp); +extern unsigned
ext4_num_overhead_clusters(struct super_block *sb, + ext4_group_t block_group, + struct ext4_group_desc *gdp); +ext4_fsblk_t ext4_inode_to_goal_block(struct inode *); + +/* dir.c */ +extern int __ext4_check_dir_entry(const char *, unsigned int, struct inode *, + struct file *, + struct ext4_dir_entry_2 *, + struct buffer_head *, unsigned int); +#define ext4_check_dir_entry(dir, filp, de, bh, offset) \ + unlikely(__ext4_check_dir_entry(__func__, __LINE__, (dir), (filp), \ + (de), (bh), (offset))) +extern int ext4_htree_store_dirent(struct file *dir_file, __u32 hash, + __u32 minor_hash, + struct ext4_dir_entry_2 *dirent); +extern void ext4_htree_free_dir_info(struct dir_private_info *p); + +/* fsync.c */ +extern int ext4_sync_file(struct file *, loff_t, loff_t, int); +extern int ext4_flush_completed_IO(struct inode *); + +/* hash.c */ +extern int ext4fs_dirhash(const char *name, int len, struct + dx_hash_info *hinfo); + +/* ialloc.c */ +extern struct inode *ext4_new_inode(handle_t *, struct inode *, umode_t, + const struct qstr *qstr, __u32 goal, + uid_t *owner); +extern void ext4_free_inode(handle_t *, struct inode *); +extern struct inode * ext4_orphan_get(struct super_block *, unsigned long); +extern unsigned long ext4_count_free_inodes(struct super_block *); +extern unsigned long ext4_count_dirs(struct super_block *); +extern void ext4_check_inodes_bitmap(struct super_block *); +extern void ext4_mark_bitmap_end(int start_bit, int end_bit, char *bitmap); +extern int ext4_init_inode_table(struct super_block *sb, + ext4_group_t group, int barrier); +extern void ext4_end_bitmap_read(struct buffer_head *bh, int uptodate); + +/* mballoc.c */ +extern long ext4_mb_stats; +extern long ext4_mb_max_to_scan; +extern int ext4_mb_init(struct super_block *, int); +extern int ext4_mb_release(struct super_block *); +extern ext4_fsblk_t ext4_mb_new_blocks(handle_t *, + struct ext4_allocation_request *, int *); +extern int ext4_mb_reserve_blocks(struct super_block *, int); +extern 
void ext4_discard_preallocations(struct inode *); +extern int __init ext4_init_mballoc(void); +extern void ext4_exit_mballoc(void); +extern void ext4_free_blocks(handle_t *handle, struct inode *inode, + struct buffer_head *bh, ext4_fsblk_t block, + unsigned long count, int flags); +extern int ext4_mb_add_groupinfo(struct super_block *sb, + ext4_group_t i, struct ext4_group_desc *desc); +extern int ext4_group_add_blocks(handle_t *handle, struct super_block *sb, + ext4_fsblk_t block, unsigned long count); +extern int ext4_trim_fs(struct super_block *, struct fstrim_range *); + +/* inode.c */ +struct buffer_head *ext4_getblk(handle_t *, struct inode *, + ext4_lblk_t, int, int *); +struct buffer_head *ext4_bread(handle_t *, struct inode *, + ext4_lblk_t, int, int *); +int ext4_get_block(struct inode *inode, sector_t iblock, + struct buffer_head *bh_result, int create); + +extern struct inode *ext4_iget(struct super_block *, unsigned long); +extern int ext4_write_inode(struct inode *, struct writeback_control *); +extern int ext4_setattr(struct dentry *, struct iattr *); +extern int ext4_getattr(struct vfsmount *mnt, struct dentry *dentry, + struct kstat *stat); +extern void ext4_evict_inode(struct inode *); +extern void ext4_clear_inode(struct inode *); +extern int ext4_sync_inode(handle_t *, struct inode *); +extern void ext4_dirty_inode(struct inode *, int); +extern int ext4_change_inode_journal_flag(struct inode *, int); +extern int ext4_get_inode_loc(struct inode *, struct ext4_iloc *); +extern int ext4_can_truncate(struct inode *inode); +extern void ext4_truncate(struct inode *); +extern int ext4_punch_hole(struct file *file, loff_t offset, loff_t length); +extern int ext4_truncate_restart_trans(handle_t *, struct inode *, int nblocks); +extern void ext4_set_inode_flags(struct inode *); +extern void ext4_get_inode_flags(struct ext4_inode_info *); +extern int ext4_alloc_da_blocks(struct inode *inode); +extern void ext4_set_aops(struct inode *inode); +extern int 
ext4_writepage_trans_blocks(struct inode *); +extern int ext4_chunk_trans_blocks(struct inode *, int nrblocks); +extern int ext4_discard_partial_page_buffers(handle_t *handle, + struct address_space *mapping, loff_t from, + loff_t length, int flags); +extern int ext4_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf); +extern qsize_t *ext4_get_reserved_space(struct inode *inode); +extern void ext4_da_update_reserve_space(struct inode *inode, + int used, int quota_claim); + +/* indirect.c */ +extern int ext4_ind_map_blocks(handle_t *handle, struct inode *inode, + struct ext4_map_blocks *map, int flags); +extern ssize_t ext4_ind_direct_IO(int rw, struct kiocb *iocb, + const struct iovec *iov, loff_t offset, + unsigned long nr_segs); +extern int ext4_ind_calc_metadata_amount(struct inode *inode, sector_t lblock); +extern int ext4_ind_trans_blocks(struct inode *inode, int nrblocks, int chunk); +extern void ext4_ind_truncate(struct inode *inode); + +/* ioctl.c */ +extern long ext4_ioctl(struct file *, unsigned int, unsigned long); +extern long ext4_compat_ioctl(struct file *, unsigned int, unsigned long); + +/* migrate.c */ +extern int ext4_ext_migrate(struct inode *); + +/* namei.c */ +extern int ext4_orphan_add(handle_t *, struct inode *); +extern int ext4_orphan_del(handle_t *, struct inode *); +extern int ext4_htree_fill_tree(struct file *dir_file, __u32 start_hash, + __u32 start_minor_hash, __u32 *next_hash); + +/* resize.c */ +extern int ext4_group_add(struct super_block *sb, + struct ext4_new_group_data *input); +extern int ext4_group_extend(struct super_block *sb, + struct ext4_super_block *es, + ext4_fsblk_t n_blocks_count); +extern int ext4_resize_fs(struct super_block *sb, ext4_fsblk_t n_blocks_count); + +/* super.c */ +extern void *ext4_kvmalloc(size_t size, gfp_t flags); +extern void *ext4_kvzalloc(size_t size, gfp_t flags); +extern void ext4_kvfree(void *ptr); +extern __printf(4, 5) +void __ext4_error(struct super_block *, const char *, 
unsigned int, + const char *, ...); +#define ext4_error(sb, message...) __ext4_error(sb, __func__, \ + __LINE__, ## message) +extern __printf(5, 6) +void ext4_error_inode(struct inode *, const char *, unsigned int, ext4_fsblk_t, + const char *, ...); +extern __printf(5, 6) +void ext4_error_file(struct file *, const char *, unsigned int, ext4_fsblk_t, + const char *, ...); +extern void __ext4_std_error(struct super_block *, const char *, + unsigned int, int); +extern __printf(4, 5) +void __ext4_abort(struct super_block *, const char *, unsigned int, + const char *, ...); +#define ext4_abort(sb, message...) __ext4_abort(sb, __func__, \ + __LINE__, ## message) +extern __printf(4, 5) +void __ext4_warning(struct super_block *, const char *, unsigned int, + const char *, ...); +#define ext4_warning(sb, message...) __ext4_warning(sb, __func__, \ + __LINE__, ## message) +extern __printf(3, 4) +void ext4_msg(struct super_block *, const char *, const char *, ...); +extern void __dump_mmp_msg(struct super_block *, struct mmp_struct *mmp, + const char *, unsigned int, const char *); +#define dump_mmp_msg(sb, mmp, msg) __dump_mmp_msg(sb, mmp, __func__, \ + __LINE__, msg) +extern __printf(7, 8) +void __ext4_grp_locked_error(const char *, unsigned int, + struct super_block *, ext4_group_t, + unsigned long, ext4_fsblk_t, + const char *, ...); +#define ext4_grp_locked_error(sb, grp, message...) 
\ + __ext4_grp_locked_error(__func__, __LINE__, (sb), (grp), ## message) +extern void ext4_update_dynamic_rev(struct super_block *sb); +extern int ext4_update_compat_feature(handle_t *handle, struct super_block *sb, + __u32 compat); +extern int ext4_update_rocompat_feature(handle_t *handle, + struct super_block *sb, __u32 rocompat); +extern int ext4_update_incompat_feature(handle_t *handle, + struct super_block *sb, __u32 incompat); +extern ext4_fsblk_t ext4_block_bitmap(struct super_block *sb, + struct ext4_group_desc *bg); +extern ext4_fsblk_t ext4_inode_bitmap(struct super_block *sb, + struct ext4_group_desc *bg); +extern ext4_fsblk_t ext4_inode_table(struct super_block *sb, + struct ext4_group_desc *bg); +extern __u32 ext4_free_group_clusters(struct super_block *sb, + struct ext4_group_desc *bg); +extern __u32 ext4_free_inodes_count(struct super_block *sb, + struct ext4_group_desc *bg); +extern __u32 ext4_used_dirs_count(struct super_block *sb, + struct ext4_group_desc *bg); +extern __u32 ext4_itable_unused_count(struct super_block *sb, + struct ext4_group_desc *bg); +extern void ext4_block_bitmap_set(struct super_block *sb, + struct ext4_group_desc *bg, ext4_fsblk_t blk); +extern void ext4_inode_bitmap_set(struct super_block *sb, + struct ext4_group_desc *bg, ext4_fsblk_t blk); +extern void ext4_inode_table_set(struct super_block *sb, + struct ext4_group_desc *bg, ext4_fsblk_t blk); +extern void ext4_free_group_clusters_set(struct super_block *sb, + struct ext4_group_desc *bg, + __u32 count); +extern void ext4_free_inodes_set(struct super_block *sb, + struct ext4_group_desc *bg, __u32 count); +extern void ext4_used_dirs_set(struct super_block *sb, + struct ext4_group_desc *bg, __u32 count); +extern void ext4_itable_unused_set(struct super_block *sb, + struct ext4_group_desc *bg, __u32 count); +extern __le16 ext4_group_desc_csum(struct ext4_sb_info *sbi, __u32 group, + struct ext4_group_desc *gdp); +extern int ext4_group_desc_csum_verify(struct ext4_sb_info 
*sbi, __u32 group, + struct ext4_group_desc *gdp); + +static inline ext4_fsblk_t ext4_blocks_count(struct ext4_super_block *es) +{ + return ((ext4_fsblk_t)le32_to_cpu(es->s_blocks_count_hi) << 32) | + le32_to_cpu(es->s_blocks_count_lo); +} + +static inline ext4_fsblk_t ext4_r_blocks_count(struct ext4_super_block *es) +{ + return ((ext4_fsblk_t)le32_to_cpu(es->s_r_blocks_count_hi) << 32) | + le32_to_cpu(es->s_r_blocks_count_lo); +} + +static inline ext4_fsblk_t ext4_free_blocks_count(struct ext4_super_block *es) +{ + return ((ext4_fsblk_t)le32_to_cpu(es->s_free_blocks_count_hi) << 32) | + le32_to_cpu(es->s_free_blocks_count_lo); +} + +static inline void ext4_blocks_count_set(struct ext4_super_block *es, + ext4_fsblk_t blk) +{ + es->s_blocks_count_lo = cpu_to_le32((u32)blk); + es->s_blocks_count_hi = cpu_to_le32(blk >> 32); +} + +static inline void ext4_free_blocks_count_set(struct ext4_super_block *es, + ext4_fsblk_t blk) +{ + es->s_free_blocks_count_lo = cpu_to_le32((u32)blk); + es->s_free_blocks_count_hi = cpu_to_le32(blk >> 32); +} + +static inline void ext4_r_blocks_count_set(struct ext4_super_block *es, + ext4_fsblk_t blk) +{ + es->s_r_blocks_count_lo = cpu_to_le32((u32)blk); + es->s_r_blocks_count_hi = cpu_to_le32(blk >> 32); +} + +static inline loff_t ext4_isize(struct ext4_inode *raw_inode) +{ + if (S_ISREG(le16_to_cpu(raw_inode->i_mode))) + return ((loff_t)le32_to_cpu(raw_inode->i_size_high) << 32) | + le32_to_cpu(raw_inode->i_size_lo); + else + return (loff_t) le32_to_cpu(raw_inode->i_size_lo); +} + +static inline void ext4_isize_set(struct ext4_inode *raw_inode, loff_t i_size) +{ + raw_inode->i_size_lo = cpu_to_le32(i_size); + raw_inode->i_size_high = cpu_to_le32(i_size >> 32); +} + +static inline +struct ext4_group_info *ext4_get_group_info(struct super_block *sb, + ext4_group_t group) +{ + struct ext4_group_info ***grp_info; + long indexv, indexh; + grp_info = EXT4_SB(sb)->s_group_info; + indexv = group >> (EXT4_DESC_PER_BLOCK_BITS(sb)); + indexh = 
group & ((EXT4_DESC_PER_BLOCK(sb)) - 1); + return grp_info[indexv][indexh]; +} + +/* + * Reading s_groups_count requires using smp_rmb() afterwards. See + * the locking protocol documented in the comments of ext4_group_add() + * in resize.c + */ +static inline ext4_group_t ext4_get_groups_count(struct super_block *sb) +{ + ext4_group_t ngroups = EXT4_SB(sb)->s_groups_count; + + smp_rmb(); + return ngroups; +} + +static inline ext4_group_t ext4_flex_group(struct ext4_sb_info *sbi, + ext4_group_t block_group) +{ + return block_group >> sbi->s_log_groups_per_flex; +} + +static inline unsigned int ext4_flex_bg_size(struct ext4_sb_info *sbi) +{ + return 1U << sbi->s_log_groups_per_flex; /* unsigned: a shift into bit 31 stays defined */ +} + +#define ext4_std_error(sb, errno) \ +do { \ + if ((errno)) \ + __ext4_std_error((sb), __func__, __LINE__, (errno)); \ +} while (0) + +#ifdef CONFIG_SMP +/* Each CPU can accumulate percpu_counter_batch clusters in their local + * counters. So we need to make sure we have free clusters more + * than percpu_counter_batch * nr_cpu_ids. Also add a window of 4 times. 
+ */ +#define EXT4_FREECLUSTERS_WATERMARK (4 * (percpu_counter_batch * nr_cpu_ids)) +#else +#define EXT4_FREECLUSTERS_WATERMARK 0 +#endif + +static inline void ext4_update_i_disksize(struct inode *inode, loff_t newsize) +{ + /* Grow-only update: i_disksize is raised to newsize, never lowered, + * with i_data_sem held for write. XXX: replace with spinlock if seen contended -bzzz + */ + down_write(&EXT4_I(inode)->i_data_sem); + if (newsize > EXT4_I(inode)->i_disksize) + EXT4_I(inode)->i_disksize = newsize; + up_write(&EXT4_I(inode)->i_data_sem); + return ; +} + +struct ext4_group_info { + unsigned long bb_state; + struct rb_root bb_free_root; + ext4_grpblk_t bb_first_free; /* first free block */ + ext4_grpblk_t bb_free; /* total free blocks */ + ext4_grpblk_t bb_fragments; /* nr of freespace fragments */ + ext4_grpblk_t bb_largest_free_order;/* order of largest frag in BG */ + struct list_head bb_prealloc_list; +#ifdef DOUBLE_CHECK + void *bb_bitmap; +#endif + struct rw_semaphore alloc_sem; + ext4_grpblk_t bb_counters[]; /* Nr of free power-of-two-block + * regions, index is order. + * bb_counters[3] = 5 means + * 5 free 8-block regions. */ +}; + +#define EXT4_GROUP_INFO_NEED_INIT_BIT 0 +#define EXT4_GROUP_INFO_WAS_TRIMMED_BIT 1 + +#define EXT4_MB_GRP_NEED_INIT(grp) \ + (test_bit(EXT4_GROUP_INFO_NEED_INIT_BIT, &((grp)->bb_state))) + +#define EXT4_MB_GRP_WAS_TRIMMED(grp) \ + (test_bit(EXT4_GROUP_INFO_WAS_TRIMMED_BIT, &((grp)->bb_state))) +#define EXT4_MB_GRP_SET_TRIMMED(grp) \ + (set_bit(EXT4_GROUP_INFO_WAS_TRIMMED_BIT, &((grp)->bb_state))) +#define EXT4_MB_GRP_CLEAR_TRIMMED(grp) \ + (clear_bit(EXT4_GROUP_INFO_WAS_TRIMMED_BIT, &((grp)->bb_state))) + +#define EXT4_MAX_CONTENTION 8 +#define EXT4_CONTENTION_THRESHOLD 2 + +static inline spinlock_t *ext4_group_lock_ptr(struct super_block *sb, + ext4_group_t group) +{ + return bgl_lock_ptr(EXT4_SB(sb)->s_blockgroup_lock, group); +} + +/* + * Returns true if the filesystem is busy enough that attempts to + * access the block group locks have run into contention. 
+ */ +static inline int ext4_fs_is_busy(struct ext4_sb_info *sbi) +{ + return (atomic_read(&sbi->s_lock_busy) > EXT4_CONTENTION_THRESHOLD); +} + +static inline void ext4_lock_group(struct super_block *sb, ext4_group_t group) +{ + spinlock_t *lock = ext4_group_lock_ptr(sb, group); + if (spin_trylock(lock)) + /* + * We're able to grab the lock right away, so drop the + * lock contention counter. + */ + atomic_add_unless(&EXT4_SB(sb)->s_lock_busy, -1, 0); + else { + /* + * The lock is busy, so bump the contention counter, + * and then wait on the spin lock. + */ + atomic_add_unless(&EXT4_SB(sb)->s_lock_busy, 1, + EXT4_MAX_CONTENTION); + spin_lock(lock); + } +} + +static inline void ext4_unlock_group(struct super_block *sb, + ext4_group_t group) +{ + spin_unlock(ext4_group_lock_ptr(sb, group)); +} + +static inline void ext4_mark_super_dirty(struct super_block *sb) +{ + if (EXT4_SB(sb)->s_journal == NULL) + sb->s_dirt =1; +} + +/* + * Block validity checking + */ +#define ext4_check_indirect_blockref(inode, bh) \ + ext4_check_blockref(__func__, __LINE__, inode, \ + (__le32 *)(bh)->b_data, \ + EXT4_ADDR_PER_BLOCK((inode)->i_sb)) + +#define ext4_ind_check_inode(inode) \ + ext4_check_blockref(__func__, __LINE__, inode, \ + EXT4_I(inode)->i_data, \ + EXT4_NDIR_BLOCKS) + +/* + * Inodes and files operations + */ + +/* dir.c */ +extern const struct file_operations ext4_dir_operations; + +/* file.c */ +extern const struct inode_operations ext4_file_inode_operations; +extern const struct file_operations ext4_file_operations; +extern loff_t ext4_llseek(struct file *file, loff_t offset, int origin); + +/* namei.c */ +extern const struct inode_operations ext4_dir_inode_operations; +extern const struct inode_operations ext4_special_inode_operations; +extern struct dentry *ext4_get_parent(struct dentry *child); + +/* symlink.c */ +extern const struct inode_operations ext4_symlink_inode_operations; +extern const struct inode_operations ext4_fast_symlink_inode_operations; + +/* 
block_validity */ +extern void ext4_release_system_zone(struct super_block *sb); +extern int ext4_setup_system_zone(struct super_block *sb); +extern int __init ext4_init_system_zone(void); +extern void ext4_exit_system_zone(void); +extern int ext4_data_block_valid(struct ext4_sb_info *sbi, + ext4_fsblk_t start_blk, + unsigned int count); +extern int ext4_check_blockref(const char *, unsigned int, + struct inode *, __le32 *, unsigned int); + +/* extents.c */ +extern int ext4_ext_tree_init(handle_t *handle, struct inode *); +extern int ext4_ext_writepage_trans_blocks(struct inode *, int); +extern int ext4_ext_index_trans_blocks(struct inode *inode, int nrblocks, + int chunk); +extern int ext4_ext_map_blocks(handle_t *handle, struct inode *inode, + struct ext4_map_blocks *map, int flags); +extern void ext4_ext_truncate(struct inode *); +extern int ext4_ext_punch_hole(struct file *file, loff_t offset, + loff_t length); +extern void ext4_ext_init(struct super_block *); +extern void ext4_ext_release(struct super_block *); +extern long ext4_fallocate(struct file *file, int mode, loff_t offset, + loff_t len); +extern int ext4_convert_unwritten_extents(struct inode *inode, loff_t offset, + ssize_t len); +extern int ext4_map_blocks(handle_t *handle, struct inode *inode, + struct ext4_map_blocks *map, int flags); +extern int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, + __u64 start, __u64 len); +/* move_extent.c */ +extern int ext4_move_extents(struct file *o_filp, struct file *d_filp, + __u64 start_orig, __u64 start_donor, + __u64 len, __u64 *moved_len); + +/* page-io.c */ +extern int __init ext4_init_pageio(void); +extern void ext4_exit_pageio(void); +extern void ext4_ioend_wait(struct inode *); +extern void ext4_free_io_end(ext4_io_end_t *io); +extern ext4_io_end_t *ext4_init_io_end(struct inode *inode, gfp_t flags); +extern int ext4_end_io_nolock(ext4_io_end_t *io); +extern void ext4_io_submit(struct ext4_io_submit *io); +extern int 
ext4_bio_write_page(struct ext4_io_submit *io, + struct page *page, + int len, + struct writeback_control *wbc); + +/* mmp.c */ +extern int ext4_multi_mount_protect(struct super_block *, ext4_fsblk_t); + +/* ext4-private buffer_head state bits, numbered from BH_JBDPrivateStart */ +enum ext4_state_bits { + BH_Uninit /* blocks are allocated but uninitialized on disk */ + = BH_JBDPrivateStart, + BH_AllocFromCluster, /* allocated blocks were part of already + * allocated cluster. Note that this flag will + * never, ever appear in a buffer_head's state + * flag. See EXT4_MAP_FROM_CLUSTER to see where + * this is used. */ + BH_Da_Mapped, /* Delayed allocated block that now has a mapping. This + * flag is set when ext4_map_blocks is called on a + * delayed allocated block to get its real mapping. */ +}; + +BUFFER_FNS(Uninit, uninit) +TAS_BUFFER_FNS(Uninit, uninit) +BUFFER_FNS(Da_Mapped, da_mapped) + +/* + * Add new method to test whether block and inode bitmaps are properly + * initialized. With uninit_bg reading the block from disk is not enough + * to mark the bitmap uptodate. 
We need to also zero-out the bitmap + */ +#define BH_BITMAP_UPTODATE BH_JBDPrivateStart + +static inline int bitmap_uptodate(struct buffer_head *bh) +{ + return (buffer_uptodate(bh) && + test_bit(BH_BITMAP_UPTODATE, &(bh)->b_state)); +} +static inline void set_bitmap_uptodate(struct buffer_head *bh) +{ + set_bit(BH_BITMAP_UPTODATE, &(bh)->b_state); +} + +#define in_range(b, first, len) ((b) >= (first) && (b) <= (first) + (len) - 1) + +/* For ioend & aio unwritten conversion wait queues */ +#define EXT4_WQ_HASH_SZ 37 +#define ext4_ioend_wq(v) (&ext4__ioend_wq[((unsigned long)(v)) %\ + EXT4_WQ_HASH_SZ]) +#define ext4_aio_mutex(v) (&ext4__aio_mutex[((unsigned long)(v)) %\ + EXT4_WQ_HASH_SZ]) +extern wait_queue_head_t ext4__ioend_wq[EXT4_WQ_HASH_SZ]; +extern struct mutex ext4__aio_mutex[EXT4_WQ_HASH_SZ]; + +#define EXT4_RESIZING 0 +extern int ext4_resize_begin(struct super_block *sb); +extern void ext4_resize_end(struct super_block *sb); + +#endif /* __KERNEL__ */ + +#include "ext4_extents.h" + +#endif /* _EXT4_H */ diff --git a/fs/ext4/ext4_extents.h b/fs/ext4/ext4_extents.h new file mode 100644 index 00000000..0f58b86e --- /dev/null +++ b/fs/ext4/ext4_extents.h @@ -0,0 +1,296 @@ +/* + * Copyright (c) 2003-2006, Cluster File Systems, Inc, info@clusterfs.com + * Written by Alex Tomas + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#ifndef _EXT4_EXTENTS +#define _EXT4_EXTENTS + +#include "ext4.h" + +/* + * With AGGRESSIVE_TEST defined, the capacity of index/leaf blocks + * becomes very small, so index split, in-depth growing and + * other hard changes happen much more often. + * This is for debug purposes only. + */ +#define AGGRESSIVE_TEST_ + +/* + * With EXTENTS_STATS defined, the number of blocks and extents + * are collected in the truncate path. They'll be shown at + * umount time. + */ +#define EXTENTS_STATS__ + +/* + * If CHECK_BINSEARCH is defined, then the results of the binary search + * will also be checked by linear search. + */ +#define CHECK_BINSEARCH__ + +/* + * Turn on EXT_DEBUG to get lots of info about extents operations. + */ +#define EXT_DEBUG__ +#ifdef EXT_DEBUG +#define ext_debug(fmt, ...) printk(fmt, ##__VA_ARGS__) +#else +#define ext_debug(fmt, ...) no_printk(fmt, ##__VA_ARGS__) +#endif + +/* + * If EXT_STATS is defined then stats numbers are collected. + * These numbers will be displayed at umount time. + */ +#define EXT_STATS_ + + +/* + * ext4_inode has i_block array (60 bytes total). + * The first 12 bytes store ext4_extent_header; + * the remainder stores an array of ext4_extent. + */ + +/* + * This is the extent on-disk structure. + * It's used at the bottom of the tree. + */ +struct ext4_extent { + __le32 ee_block; /* first logical block extent covers */ + __le16 ee_len; /* number of blocks covered by extent */ + __le16 ee_start_hi; /* high 16 bits of physical block */ + __le32 ee_start_lo; /* low 32 bits of physical block */ +}; + +/* + * This is the index on-disk structure. + * It's used at all the levels except the bottom. 
+ */ +struct ext4_extent_idx { + __le32 ei_block; /* index covers logical blocks from 'block' */ + __le32 ei_leaf_lo; /* pointer to the physical block of the next * + * level. leaf or next index could be there */ + __le16 ei_leaf_hi; /* high 16 bits of physical block */ + __u16 ei_unused; +}; + +/* + * Each block (leaves and indexes), even inode-stored has header. + */ +struct ext4_extent_header { + __le16 eh_magic; /* probably will support different formats */ + __le16 eh_entries; /* number of valid entries */ + __le16 eh_max; /* capacity of store in entries */ + __le16 eh_depth; /* has tree real underlying blocks? */ + __le32 eh_generation; /* generation of the tree */ +}; + +#define EXT4_EXT_MAGIC cpu_to_le16(0xf30a) + +/* + * Array of ext4_ext_path contains path to some extent. + * Creation/lookup routines use it for traversal/splitting/etc. + * Truncate uses it to simulate recursive walking. + */ +struct ext4_ext_path { + ext4_fsblk_t p_block; + __u16 p_depth; + struct ext4_extent *p_ext; + struct ext4_extent_idx *p_idx; + struct ext4_extent_header *p_hdr; + struct buffer_head *p_bh; +}; + +/* + * structure for external API + */ + +/* + * to be called by ext4_ext_walk_space() + * negative retcode - error + * positive retcode - signal for ext4_ext_walk_space(), see below + * callback must return valid extent (passed or newly created) + */ +typedef int (*ext_prepare_callback)(struct inode *, ext4_lblk_t, + struct ext4_ext_cache *, + struct ext4_extent *, void *); + +#define EXT_CONTINUE 0 +#define EXT_BREAK 1 +#define EXT_REPEAT 2 + +/* + * Maximum number of logical blocks in a file; ext4_extent's ee_block is + * __le32. + */ +#define EXT_MAX_BLOCKS 0xffffffff + +/* + * EXT_INIT_MAX_LEN is the maximum number of blocks we can have in an + * initialized extent. This is 2^15 and not (2^16 - 1), since we use the + * MSB of ee_len field in the extent datastructure to signify if this + * particular extent is an initialized extent or an uninitialized (i.e. 
+ * preallocated). + * EXT_UNINIT_MAX_LEN is the maximum number of blocks we can have in an + * uninitialized extent. + * If ee_len is <= 0x8000, it is an initialized extent. Otherwise, it is an + * uninitialized one. In other words, if MSB of ee_len is set, it is an + * uninitialized extent with only one special scenario when ee_len = 0x8000. + * In this case we can not have an uninitialized extent of zero length and + * thus we make it as a special case of initialized extent with 0x8000 length. + * This way we get better extent-to-group alignment for initialized extents. + * Hence, the maximum number of blocks we can have in an *initialized* + * extent is 2^15 (32768) and in an *uninitialized* extent is 2^15-1 (32767). + */ +#define EXT_INIT_MAX_LEN (1UL << 15) +#define EXT_UNINIT_MAX_LEN (EXT_INIT_MAX_LEN - 1) + + +#define EXT_FIRST_EXTENT(__hdr__) \ + ((struct ext4_extent *) (((char *) (__hdr__)) + \ + sizeof(struct ext4_extent_header))) +#define EXT_FIRST_INDEX(__hdr__) \ + ((struct ext4_extent_idx *) (((char *) (__hdr__)) + \ + sizeof(struct ext4_extent_header))) +#define EXT_HAS_FREE_INDEX(__path__) \ + (le16_to_cpu((__path__)->p_hdr->eh_entries) \ + < le16_to_cpu((__path__)->p_hdr->eh_max)) +#define EXT_LAST_EXTENT(__hdr__) \ + (EXT_FIRST_EXTENT((__hdr__)) + le16_to_cpu((__hdr__)->eh_entries) - 1) +#define EXT_LAST_INDEX(__hdr__) \ + (EXT_FIRST_INDEX((__hdr__)) + le16_to_cpu((__hdr__)->eh_entries) - 1) +#define EXT_MAX_EXTENT(__hdr__) \ + (EXT_FIRST_EXTENT((__hdr__)) + le16_to_cpu((__hdr__)->eh_max) - 1) +#define EXT_MAX_INDEX(__hdr__) \ + (EXT_FIRST_INDEX((__hdr__)) + le16_to_cpu((__hdr__)->eh_max) - 1) + +static inline struct ext4_extent_header *ext_inode_hdr(struct inode *inode) +{ + return (struct ext4_extent_header *) EXT4_I(inode)->i_data; +} + +static inline struct ext4_extent_header *ext_block_hdr(struct buffer_head *bh) +{ + return (struct ext4_extent_header *) bh->b_data; +} + +static inline unsigned short ext_depth(struct inode *inode) +{ + 
return le16_to_cpu(ext_inode_hdr(inode)->eh_depth); +} + +static inline void +ext4_ext_invalidate_cache(struct inode *inode) +{ + EXT4_I(inode)->i_cached_extent.ec_len = 0; +} + +static inline void ext4_ext_mark_uninitialized(struct ext4_extent *ext) +{ + /* We can not have an uninitialized extent of zero length! */ + BUG_ON((le16_to_cpu(ext->ee_len) & ~EXT_INIT_MAX_LEN) == 0); + ext->ee_len |= cpu_to_le16(EXT_INIT_MAX_LEN); +} + +static inline int ext4_ext_is_uninitialized(struct ext4_extent *ext) +{ + /* Extent with ee_len of 0x8000 is treated as an initialized extent */ + return (le16_to_cpu(ext->ee_len) > EXT_INIT_MAX_LEN); +} + +static inline int ext4_ext_get_actual_len(struct ext4_extent *ext) +{ + return (le16_to_cpu(ext->ee_len) <= EXT_INIT_MAX_LEN ? + le16_to_cpu(ext->ee_len) : + (le16_to_cpu(ext->ee_len) - EXT_INIT_MAX_LEN)); +} + +static inline void ext4_ext_mark_initialized(struct ext4_extent *ext) +{ + ext->ee_len = cpu_to_le16(ext4_ext_get_actual_len(ext)); +} + +/* + * ext4_ext_pblock: + * combine low and high parts of physical block number into ext4_fsblk_t + */ +static inline ext4_fsblk_t ext4_ext_pblock(struct ext4_extent *ex) +{ + ext4_fsblk_t block; + + block = le32_to_cpu(ex->ee_start_lo); + block |= ((ext4_fsblk_t) le16_to_cpu(ex->ee_start_hi) << 31) << 1; + return block; +} + +/* + * ext4_idx_pblock: + * combine low and high parts of a leaf physical block number into ext4_fsblk_t + */ +static inline ext4_fsblk_t ext4_idx_pblock(struct ext4_extent_idx *ix) +{ + ext4_fsblk_t block; + + block = le32_to_cpu(ix->ei_leaf_lo); + block |= ((ext4_fsblk_t) le16_to_cpu(ix->ei_leaf_hi) << 31) << 1; + return block; +} + +/* + * ext4_ext_store_pblock: + * stores a large physical block number into an extent struct, + * breaking it into parts + */ +static inline void ext4_ext_store_pblock(struct ext4_extent *ex, + ext4_fsblk_t pb) +{ + ex->ee_start_lo = cpu_to_le32((unsigned long) (pb & 0xffffffff)); + ex->ee_start_hi = cpu_to_le16((unsigned long) ((pb >> 
31) >> 1) & + 0xffff); +} + +/* + * ext4_idx_store_pblock: + * stores a large physical block number into an index struct, + * breaking it into parts + */ +static inline void ext4_idx_store_pblock(struct ext4_extent_idx *ix, + ext4_fsblk_t pb) +{ + ix->ei_leaf_lo = cpu_to_le32((unsigned long) (pb & 0xffffffff)); + ix->ei_leaf_hi = cpu_to_le16((unsigned long) ((pb >> 31) >> 1) & + 0xffff); +} + +extern int ext4_ext_calc_metadata_amount(struct inode *inode, + ext4_lblk_t lblocks); +extern int ext4_extent_tree_init(handle_t *, struct inode *); +extern int ext4_ext_calc_credits_for_single_extent(struct inode *inode, + int num, + struct ext4_ext_path *path); +extern int ext4_can_extents_be_merged(struct inode *inode, + struct ext4_extent *ex1, + struct ext4_extent *ex2); +extern int ext4_ext_insert_extent(handle_t *, struct inode *, struct ext4_ext_path *, struct ext4_extent *, int); +extern struct ext4_ext_path *ext4_ext_find_extent(struct inode *, ext4_lblk_t, + struct ext4_ext_path *); +extern void ext4_ext_drop_refs(struct ext4_ext_path *); +extern int ext4_ext_check_inode(struct inode *inode); +extern int ext4_find_delalloc_cluster(struct inode *inode, ext4_lblk_t lblk, + int search_hint_reverse); +#endif /* _EXT4_EXTENTS */ + diff --git a/fs/ext4/ext4_jbd2.c b/fs/ext4/ext4_jbd2.c new file mode 100644 index 00000000..aca17901 --- /dev/null +++ b/fs/ext4/ext4_jbd2.c @@ -0,0 +1,154 @@ +/* + * Interface between ext4 and JBD2 + */ + +#include "ext4_jbd2.h" + +#include <trace/events/ext4.h> + +int __ext4_journal_get_write_access(const char *where, unsigned int line, + handle_t *handle, struct buffer_head *bh) +{ + int err = 0; + + if (ext4_handle_valid(handle)) { + err = jbd2_journal_get_write_access(handle, bh); + if (err) + ext4_journal_abort_handle(where, line, __func__, bh, + handle, err); + } + return err; +} + +/* + * The ext4 forget function must perform a revoke if we are freeing data + * which has been journaled. Metadata (eg. indirect blocks) must be + * revoked in all cases. 
+ * + * "bh" may be NULL: a metadata block may have been freed from memory + * but there may still be a record of it in the journal, and that record + * still needs to be revoked. + * + * If the handle isn't valid we're not journaling, but we still need to + * call into ext4_journal_revoke() to put the buffer head. + */ +int __ext4_forget(const char *where, unsigned int line, handle_t *handle, + int is_metadata, struct inode *inode, + struct buffer_head *bh, ext4_fsblk_t blocknr) +{ + int err; + + might_sleep(); + + trace_ext4_forget(inode, is_metadata, blocknr); + BUFFER_TRACE(bh, "enter"); + + jbd_debug(4, "forgetting bh %p: is_metadata = %d, mode %o, " + "data mode %x\n", + bh, is_metadata, inode->i_mode, + test_opt(inode->i_sb, DATA_FLAGS)); + + /* In the no journal case, we can just do a bforget and return */ + if (!ext4_handle_valid(handle)) { + bforget(bh); + return 0; + } + + /* Never use the revoke function if we are doing full data + * journaling: there is no need to, and a V1 superblock won't + * support it. Otherwise, only skip the revoke on un-journaled + * data blocks. 
*/ + + if (test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA || + (!is_metadata && !ext4_should_journal_data(inode))) { + if (bh) { + BUFFER_TRACE(bh, "call jbd2_journal_forget"); + err = jbd2_journal_forget(handle, bh); + if (err) + ext4_journal_abort_handle(where, line, __func__, + bh, handle, err); + return err; + } + return 0; + } + + /* + * data!=journal && (is_metadata || should_journal_data(inode)) + */ + BUFFER_TRACE(bh, "call jbd2_journal_revoke"); + err = jbd2_journal_revoke(handle, blocknr, bh); + if (err) { + ext4_journal_abort_handle(where, line, __func__, + bh, handle, err); + __ext4_abort(inode->i_sb, where, line, + "error %d when attempting revoke", err); + } + BUFFER_TRACE(bh, "exit"); + return err; +} + +int __ext4_journal_get_create_access(const char *where, unsigned int line, + handle_t *handle, struct buffer_head *bh) +{ + int err = 0; + + if (ext4_handle_valid(handle)) { + err = jbd2_journal_get_create_access(handle, bh); + if (err) + ext4_journal_abort_handle(where, line, __func__, + bh, handle, err); + } + return err; +} +/* Dirty bh through the journal when the handle is valid; otherwise mark it dirty directly, sync it if the inode needs sync, and return -EIO if it is not uptodate afterwards. */ +int __ext4_handle_dirty_metadata(const char *where, unsigned int line, + handle_t *handle, struct inode *inode, + struct buffer_head *bh) +{ + int err = 0; + + if (ext4_handle_valid(handle)) { + err = jbd2_journal_dirty_metadata(handle, bh); + if (err) { + /* Errors can only happen if there is a bug. NOTE(review): this path records h_err and stops the handle itself, while the sibling helpers call ext4_journal_abort_handle(); confirm callers do not stop the handle again. */ + handle->h_err = err; + __ext4_journal_stop(where, line, handle); + } + } else { + if (inode) + mark_buffer_dirty_inode(bh, inode); + else + mark_buffer_dirty(bh); + if (inode && inode_needs_sync(inode)) { + sync_dirty_buffer(bh); + if (buffer_req(bh) && !buffer_uptodate(bh)) { + struct ext4_super_block *es; + + es = EXT4_SB(inode->i_sb)->s_es; + es->s_last_error_block = + cpu_to_le64(bh->b_blocknr); + ext4_error_inode(inode, where, line, + bh->b_blocknr, + "IO error syncing itable block"); + err = -EIO; + } + } + } + return err; +} + +int __ext4_handle_dirty_super(const char *where, unsigned int line, + 
handle_t *handle, struct super_block *sb) +{ + struct buffer_head *bh = EXT4_SB(sb)->s_sbh; + int err = 0; + + if (ext4_handle_valid(handle)) { + err = jbd2_journal_dirty_metadata(handle, bh); + if (err) + ext4_journal_abort_handle(where, line, __func__, + bh, handle, err); + } else + sb->s_dirt = 1; + return err; +} diff --git a/fs/ext4/ext4_jbd2.h b/fs/ext4/ext4_jbd2.h new file mode 100644 index 00000000..83b20fcf --- /dev/null +++ b/fs/ext4/ext4_jbd2.h @@ -0,0 +1,399 @@ +/* + * ext4_jbd2.h + * + * Written by Stephen C. Tweedie , 1999 + * + * Copyright 1998--1999 Red Hat corp --- All Rights Reserved + * + * This file is part of the Linux kernel and is made available under + * the terms of the GNU General Public License, version 2, or at your + * option, any later version, incorporated herein by reference. + * + * Ext4-specific journaling extensions. + */ + +#ifndef _EXT4_JBD2_H +#define _EXT4_JBD2_H + +#include +#include +#include "ext4.h" + +#define EXT4_JOURNAL(inode) (EXT4_SB((inode)->i_sb)->s_journal) + +/* Define the number of blocks we need to account to a transaction to + * modify one block of data. + * + * We may have to touch one inode, one bitmap buffer, up to three + * indirection blocks, the group and superblock summaries, and the data + * block to complete the transaction. + * + * For extents-enabled fs we may have to allocate and modify up to + * 5 levels of tree + root which are stored in the inode. */ + +#define EXT4_SINGLEDATA_TRANS_BLOCKS(sb) \ + (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_EXTENTS) \ + ? 27U : 8U) + +/* Extended attribute operations touch at most two data buffers, + * two bitmap buffers, and two group summaries, in addition to the inode + * and the superblock, which are already accounted for. */ + +#define EXT4_XATTR_TRANS_BLOCKS 6U + +/* Define the minimum size for a transaction which modifies data. 
This + * needs to take into account the fact that we may end up modifying two + * quota files too (one for the group, one for the user quota). The + * superblock only gets updated once, of course, so don't bother + * counting that again for the quota updates. */ + +#define EXT4_DATA_TRANS_BLOCKS(sb) (EXT4_SINGLEDATA_TRANS_BLOCKS(sb) + \ + EXT4_XATTR_TRANS_BLOCKS - 2 + \ + EXT4_MAXQUOTAS_TRANS_BLOCKS(sb)) + +/* + * Define the number of metadata blocks we need to account to modify data. + * + * This include super block, inode block, quota blocks and xattr blocks + */ +#define EXT4_META_TRANS_BLOCKS(sb) (EXT4_XATTR_TRANS_BLOCKS + \ + EXT4_MAXQUOTAS_TRANS_BLOCKS(sb)) + +/* Delete operations potentially hit one directory's namespace plus an + * entire inode, plus arbitrary amounts of bitmap/indirection data. Be + * generous. We can grow the delete transaction later if necessary. */ + +#define EXT4_DELETE_TRANS_BLOCKS(sb) (2 * EXT4_DATA_TRANS_BLOCKS(sb) + 64) + +/* Define an arbitrary limit for the amount of data we will anticipate + * writing to any given transaction. For unbounded transactions such as + * write(2) and truncate(2) we can write more than this, but we always + * start off at the maximum transaction size and grow the transaction + * optimistically as we go. */ + +#define EXT4_MAX_TRANS_DATA 64U + +/* We break up a large truncate or write transaction once the handle's + * buffer credits gets this low, we need either to extend the + * transaction or to start a new one. Reserve enough space here for + * inode, bitmap, superblock, group and indirection updates for at least + * one block, plus two quota updates. Quota allocations are not + * needed. */ + +#define EXT4_RESERVE_TRANS_BLOCKS 12U + +#define EXT4_INDEX_EXTRA_TRANS_BLOCKS 8 + +#ifdef CONFIG_QUOTA +/* Amount of blocks needed for quota update - we know that the structure was + * allocated so we need to update only data block */ +#define EXT4_QUOTA_TRANS_BLOCKS(sb) (test_opt(sb, QUOTA) ? 
1 : 0) +/* Amount of blocks needed for quota insert/delete - we do some block writes + * but inode, sb and group updates are done only once */ +#define EXT4_QUOTA_INIT_BLOCKS(sb) (test_opt(sb, QUOTA) ? (DQUOT_INIT_ALLOC*\ + (EXT4_SINGLEDATA_TRANS_BLOCKS(sb)-3)+3+DQUOT_INIT_REWRITE) : 0) + +#define EXT4_QUOTA_DEL_BLOCKS(sb) (test_opt(sb, QUOTA) ? (DQUOT_DEL_ALLOC*\ + (EXT4_SINGLEDATA_TRANS_BLOCKS(sb)-3)+3+DQUOT_DEL_REWRITE) : 0) +#else +#define EXT4_QUOTA_TRANS_BLOCKS(sb) 0 +#define EXT4_QUOTA_INIT_BLOCKS(sb) 0 +#define EXT4_QUOTA_DEL_BLOCKS(sb) 0 +#endif +#define EXT4_MAXQUOTAS_TRANS_BLOCKS(sb) (MAXQUOTAS*EXT4_QUOTA_TRANS_BLOCKS(sb)) +#define EXT4_MAXQUOTAS_INIT_BLOCKS(sb) (MAXQUOTAS*EXT4_QUOTA_INIT_BLOCKS(sb)) +#define EXT4_MAXQUOTAS_DEL_BLOCKS(sb) (MAXQUOTAS*EXT4_QUOTA_DEL_BLOCKS(sb)) + +/** + * struct ext4_journal_cb_entry - Base structure for callback information. + * + * This struct is a 'seed' structure for use with your own callback + * structs. If you are using callbacks you must allocate one of these + * or another struct of your own definition which has this struct + * as its first element and pass it to ext4_journal_callback_add(). 
+ */ +struct ext4_journal_cb_entry { + /* list information for other callbacks attached to the same handle */ + struct list_head jce_list; + + /* Function to call with this callback structure */ + void (*jce_func)(struct super_block *sb, + struct ext4_journal_cb_entry *jce, int error); + + /* user data goes here */ +}; + +/** + * ext4_journal_callback_add: add a function to call after transaction commit + * @handle: active journal transaction handle to register callback on + * @func: callback function to call after the transaction has committed: + * @sb: superblock of current filesystem for transaction + * @jce: returned journal callback data + * @rc: journal state at commit (0 = transaction committed properly) + * @jce: journal callback data (internal and function private data struct) + * + * The registered function will be called in the context of the journal thread + * after the transaction for which the handle was created has completed. + * + * No locks are held when the callback function is called, so it is safe to + * call blocking functions from within the callback, but the callback should + * not block or run for too long, or the filesystem will be blocked waiting for + * the next transaction to commit. No journaling functions can be used, or + * there is a risk of deadlock. + * + * There is no guaranteed calling order of multiple registered callbacks on + * the same transaction. 
+ */ +static inline void ext4_journal_callback_add(handle_t *handle, + void (*func)(struct super_block *sb, + struct ext4_journal_cb_entry *jce, + int rc), + struct ext4_journal_cb_entry *jce) +{ + struct ext4_sb_info *sbi = + EXT4_SB(handle->h_transaction->t_journal->j_private); + + /* Add the jce to transaction's private list */ + jce->jce_func = func; + spin_lock(&sbi->s_md_lock); + list_add_tail(&jce->jce_list, &handle->h_transaction->t_private_list); + spin_unlock(&sbi->s_md_lock); +} + +/** + * ext4_journal_callback_del: delete a registered callback + * @handle: active journal transaction handle on which callback was registered + * @jce: registered journal callback entry to unregister + */ +static inline void ext4_journal_callback_del(handle_t *handle, + struct ext4_journal_cb_entry *jce) +{ + struct ext4_sb_info *sbi = + EXT4_SB(handle->h_transaction->t_journal->j_private); + + spin_lock(&sbi->s_md_lock); + list_del_init(&jce->jce_list); + spin_unlock(&sbi->s_md_lock); +} + +int +ext4_mark_iloc_dirty(handle_t *handle, + struct inode *inode, + struct ext4_iloc *iloc); + +/* + * On success, We end up with an outstanding reference count against + * iloc->bh. This _must_ be cleaned up later. + */ + +int ext4_reserve_inode_write(handle_t *handle, struct inode *inode, + struct ext4_iloc *iloc); + +int ext4_mark_inode_dirty(handle_t *handle, struct inode *inode); + +/* + * Wrapper functions with which ext4 calls into JBD. 
+ */ +void ext4_journal_abort_handle(const char *caller, unsigned int line, + const char *err_fn, + struct buffer_head *bh, handle_t *handle, int err); + +int __ext4_journal_get_write_access(const char *where, unsigned int line, + handle_t *handle, struct buffer_head *bh); + +int __ext4_forget(const char *where, unsigned int line, handle_t *handle, + int is_metadata, struct inode *inode, + struct buffer_head *bh, ext4_fsblk_t blocknr); + +int __ext4_journal_get_create_access(const char *where, unsigned int line, + handle_t *handle, struct buffer_head *bh); + +int __ext4_handle_dirty_metadata(const char *where, unsigned int line, + handle_t *handle, struct inode *inode, + struct buffer_head *bh); + +int __ext4_handle_dirty_super(const char *where, unsigned int line, + handle_t *handle, struct super_block *sb); + +#define ext4_journal_get_write_access(handle, bh) \ + __ext4_journal_get_write_access(__func__, __LINE__, (handle), (bh)) +#define ext4_forget(handle, is_metadata, inode, bh, block_nr) \ + __ext4_forget(__func__, __LINE__, (handle), (is_metadata), (inode), \ + (bh), (block_nr)) +#define ext4_journal_get_create_access(handle, bh) \ + __ext4_journal_get_create_access(__func__, __LINE__, (handle), (bh)) +#define ext4_handle_dirty_metadata(handle, inode, bh) \ + __ext4_handle_dirty_metadata(__func__, __LINE__, (handle), (inode), \ + (bh)) +#define ext4_handle_dirty_super(handle, sb) \ + __ext4_handle_dirty_super(__func__, __LINE__, (handle), (sb)) + +handle_t *ext4_journal_start_sb(struct super_block *sb, int nblocks); +int __ext4_journal_stop(const char *where, unsigned int line, handle_t *handle); + +#define EXT4_NOJOURNAL_MAX_REF_COUNT ((unsigned long) 4096) + +/* Note: Do not use this for NULL handles. This is only to determine if + * a properly allocated handle is using a journal or not. 
*/ +static inline int ext4_handle_valid(handle_t *handle) +{ + if ((unsigned long)handle < EXT4_NOJOURNAL_MAX_REF_COUNT) + return 0; + return 1; +} + +static inline void ext4_handle_sync(handle_t *handle) +{ + if (ext4_handle_valid(handle)) + handle->h_sync = 1; +} + +static inline void ext4_handle_release_buffer(handle_t *handle, + struct buffer_head *bh) +{ + if (ext4_handle_valid(handle)) + jbd2_journal_release_buffer(handle, bh); +} + +static inline int ext4_handle_is_aborted(handle_t *handle) +{ + if (ext4_handle_valid(handle)) + return is_handle_aborted(handle); + return 0; +} + +static inline int ext4_handle_has_enough_credits(handle_t *handle, int needed) +{ + if (ext4_handle_valid(handle) && handle->h_buffer_credits < needed) + return 0; + return 1; +} + +static inline handle_t *ext4_journal_start(struct inode *inode, int nblocks) +{ + return ext4_journal_start_sb(inode->i_sb, nblocks); +} + +#define ext4_journal_stop(handle) \ + __ext4_journal_stop(__func__, __LINE__, (handle)) + +static inline handle_t *ext4_journal_current_handle(void) +{ + return journal_current_handle(); +} + +static inline int ext4_journal_extend(handle_t *handle, int nblocks) +{ + if (ext4_handle_valid(handle)) + return jbd2_journal_extend(handle, nblocks); + return 0; +} + +static inline int ext4_journal_restart(handle_t *handle, int nblocks) +{ + if (ext4_handle_valid(handle)) + return jbd2_journal_restart(handle, nblocks); + return 0; +} + +static inline int ext4_journal_blocks_per_page(struct inode *inode) +{ + if (EXT4_JOURNAL(inode) != NULL) + return jbd2_journal_blocks_per_page(inode); + return 0; +} + +static inline int ext4_journal_force_commit(journal_t *journal) +{ + if (journal) + return jbd2_journal_force_commit(journal); + return 0; +} + +static inline int ext4_jbd2_file_inode(handle_t *handle, struct inode *inode) +{ + if (ext4_handle_valid(handle)) + return jbd2_journal_file_inode(handle, EXT4_I(inode)->jinode); + return 0; +} + +static inline void 
ext4_update_inode_fsync_trans(handle_t *handle, + struct inode *inode, + int datasync) +{ + struct ext4_inode_info *ei = EXT4_I(inode); + + if (ext4_handle_valid(handle)) { + ei->i_sync_tid = handle->h_transaction->t_tid; + if (datasync) + ei->i_datasync_tid = handle->h_transaction->t_tid; + } +} + +/* super.c */ +int ext4_force_commit(struct super_block *sb); + +/* + * Ext4 inode journal modes + */ +#define EXT4_INODE_JOURNAL_DATA_MODE 0x01 /* journal data mode */ +#define EXT4_INODE_ORDERED_DATA_MODE 0x02 /* ordered data mode */ +#define EXT4_INODE_WRITEBACK_DATA_MODE 0x04 /* writeback data mode */ + +static inline int ext4_inode_journal_mode(struct inode *inode) +{ + if (EXT4_JOURNAL(inode) == NULL) + return EXT4_INODE_WRITEBACK_DATA_MODE; /* writeback */ + /* We do not support data journalling with delayed allocation */ + if (!S_ISREG(inode->i_mode) || + test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA) + return EXT4_INODE_JOURNAL_DATA_MODE; /* journal data */ + if (ext4_test_inode_flag(inode, EXT4_INODE_JOURNAL_DATA) && + !test_opt(inode->i_sb, DELALLOC)) + return EXT4_INODE_JOURNAL_DATA_MODE; /* journal data */ + if (test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_ORDERED_DATA) + return EXT4_INODE_ORDERED_DATA_MODE; /* ordered */ + if (test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_WRITEBACK_DATA) + return EXT4_INODE_WRITEBACK_DATA_MODE; /* writeback */ + else + BUG(); +} + +static inline int ext4_should_journal_data(struct inode *inode) +{ + return ext4_inode_journal_mode(inode) & EXT4_INODE_JOURNAL_DATA_MODE; +} + +static inline int ext4_should_order_data(struct inode *inode) +{ + return ext4_inode_journal_mode(inode) & EXT4_INODE_ORDERED_DATA_MODE; +} + +static inline int ext4_should_writeback_data(struct inode *inode) +{ + return ext4_inode_journal_mode(inode) & EXT4_INODE_WRITEBACK_DATA_MODE; +} + +/* + * This function controls whether or not we should try to go down the + * dioread_nolock code paths, which makes it safe to avoid taking + * 
i_mutex for direct I/O reads. This only works for extent-based + * files, and it doesn't work if data journaling is enabled, since the + * dioread_nolock code uses b_private to pass information back to the + * I/O completion handler, and this conflicts with the jbd's use of + * b_private. + */ +static inline int ext4_should_dioread_nolock(struct inode *inode) +{ + if (!test_opt(inode->i_sb, DIOREAD_NOLOCK)) + return 0; + if (!S_ISREG(inode->i_mode)) + return 0; + if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) + return 0; + if (ext4_should_journal_data(inode)) + return 0; + return 1; +} + +#endif /* _EXT4_JBD2_H */ diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c new file mode 100644 index 00000000..abcdeab6 --- /dev/null +++ b/fs/ext4/extents.c @@ -0,0 +1,4866 @@ +/* + * Copyright (c) 2003-2006, Cluster File Systems, Inc, info@clusterfs.com + * Written by Alex Tomas + * + * Architecture independence: + * Copyright (c) 2005, Bull S.A. + * Written by Pierre Peiffer + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public Licens + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111- + */ + +/* + * Extents support for EXT4 + * + * TODO: + * - ext4*_error() should be used in some situations + * - analyze all BUG()/BUG_ON(), use -EIO where appropriate + * - smart tree reduction + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "ext4_jbd2.h" + +#include + +/* + * used by extent splitting. + */ +#define EXT4_EXT_MAY_ZEROOUT 0x1 /* safe to zeroout if split fails \ + due to ENOSPC */ +#define EXT4_EXT_MARK_UNINIT1 0x2 /* mark first half uninitialized */ +#define EXT4_EXT_MARK_UNINIT2 0x4 /* mark second half uninitialized */ + +static int ext4_split_extent(handle_t *handle, + struct inode *inode, + struct ext4_ext_path *path, + struct ext4_map_blocks *map, + int split_flag, + int flags); + +static int ext4_split_extent_at(handle_t *handle, + struct inode *inode, + struct ext4_ext_path *path, + ext4_lblk_t split, + int split_flag, + int flags); + +static int ext4_ext_truncate_extend_restart(handle_t *handle, + struct inode *inode, + int needed) +{ + int err; + + if (!ext4_handle_valid(handle)) + return 0; + if (handle->h_buffer_credits > needed) + return 0; + err = ext4_journal_extend(handle, needed); + if (err <= 0) + return err; + err = ext4_truncate_restart_trans(handle, inode, needed); + if (err == 0) + err = -EAGAIN; + + return err; +} + +/* + * could return: + * - EROFS + * - ENOMEM + */ +static int ext4_ext_get_access(handle_t *handle, struct inode *inode, + struct ext4_ext_path *path) +{ + if (path->p_bh) { + /* path points to block */ + return ext4_journal_get_write_access(handle, path->p_bh); + } + /* path points to leaf/index in inode body */ + /* we use in-core data, no need to protect them */ + return 0; +} + +/* + * could return: + * - EROFS + * - ENOMEM + * - 
EIO + */ +#define ext4_ext_dirty(handle, inode, path) \ + __ext4_ext_dirty(__func__, __LINE__, (handle), (inode), (path)) +static int __ext4_ext_dirty(const char *where, unsigned int line, + handle_t *handle, struct inode *inode, + struct ext4_ext_path *path) +{ + int err; + if (path->p_bh) { + /* path points to block */ + err = __ext4_handle_dirty_metadata(where, line, handle, + inode, path->p_bh); + } else { + /* path points to leaf/index in inode body */ + err = ext4_mark_inode_dirty(handle, inode); + } + return err; +} + +static ext4_fsblk_t ext4_ext_find_goal(struct inode *inode, + struct ext4_ext_path *path, + ext4_lblk_t block) +{ + if (path) { + int depth = path->p_depth; + struct ext4_extent *ex; + + /* + * Try to predict block placement assuming that we are + * filling in a file which will eventually be + * non-sparse --- i.e., in the case of libbfd writing + * an ELF object sections out-of-order but in a way + * that eventually results in a contiguous object or + * executable file, or some database extending a table + * space file. However, this is actually somewhat + * non-ideal if we are writing a sparse file such as + * qemu or KVM writing a raw image file that is going + * to stay fairly sparse, since it will end up + * fragmenting the file system's free space. Maybe we + * should have some heuristics or some way to allow + * userspace to pass a hint to the file system, + * especially if the latter case turns out to be + * common. + */ + ex = path[depth].p_ext; + if (ex) { + ext4_fsblk_t ext_pblk = ext4_ext_pblock(ex); + ext4_lblk_t ext_block = le32_to_cpu(ex->ee_block); + + if (block > ext_block) + return ext_pblk + (block - ext_block); + else + return ext_pblk - (ext_block - block); + } + + /* it looks like index is empty; + * try to find starting block from index itself */ + if (path[depth].p_bh) + return path[depth].p_bh->b_blocknr; + } + + /* OK. 
use inode's group */ + return ext4_inode_to_goal_block(inode); +} + +/* + * Allocation for a meta data block + */ +static ext4_fsblk_t +ext4_ext_new_meta_block(handle_t *handle, struct inode *inode, + struct ext4_ext_path *path, + struct ext4_extent *ex, int *err, unsigned int flags) +{ + ext4_fsblk_t goal, newblock; + + goal = ext4_ext_find_goal(inode, path, le32_to_cpu(ex->ee_block)); + newblock = ext4_new_meta_blocks(handle, inode, goal, flags, + NULL, err); + return newblock; +} + +static inline int ext4_ext_space_block(struct inode *inode, int check) +{ + int size; + + size = (inode->i_sb->s_blocksize - sizeof(struct ext4_extent_header)) + / sizeof(struct ext4_extent); +#ifdef AGGRESSIVE_TEST + if (!check && size > 6) + size = 6; +#endif + return size; +} + +static inline int ext4_ext_space_block_idx(struct inode *inode, int check) +{ + int size; + + size = (inode->i_sb->s_blocksize - sizeof(struct ext4_extent_header)) + / sizeof(struct ext4_extent_idx); +#ifdef AGGRESSIVE_TEST + if (!check && size > 5) + size = 5; +#endif + return size; +} + +static inline int ext4_ext_space_root(struct inode *inode, int check) +{ + int size; + + size = sizeof(EXT4_I(inode)->i_data); + size -= sizeof(struct ext4_extent_header); + size /= sizeof(struct ext4_extent); +#ifdef AGGRESSIVE_TEST + if (!check && size > 3) + size = 3; +#endif + return size; +} + +static inline int ext4_ext_space_root_idx(struct inode *inode, int check) +{ + int size; + + size = sizeof(EXT4_I(inode)->i_data); + size -= sizeof(struct ext4_extent_header); + size /= sizeof(struct ext4_extent_idx); +#ifdef AGGRESSIVE_TEST + if (!check && size > 4) + size = 4; +#endif + return size; +} + +/* + * Calculate the number of metadata blocks needed + * to allocate @blocks + * Worse case is one block per extent + */ +int ext4_ext_calc_metadata_amount(struct inode *inode, ext4_lblk_t lblock) +{ + struct ext4_inode_info *ei = EXT4_I(inode); + int idxs; + + idxs = ((inode->i_sb->s_blocksize - sizeof(struct 
ext4_extent_header)) + / sizeof(struct ext4_extent_idx)); + + /* + * If the new delayed allocation block is contiguous with the + * previous da block, it can share index blocks with the + * previous block, so we only need to allocate a new index + * block every idxs leaf blocks. At idxs**2 blocks, we need + * an additional index block, and at idxs**3 blocks, yet + * another index block. + */ + if (ei->i_da_metadata_calc_len && + ei->i_da_metadata_calc_last_lblock+1 == lblock) { + int num = 0; + + if ((ei->i_da_metadata_calc_len % idxs) == 0) + num++; + if ((ei->i_da_metadata_calc_len % (idxs*idxs)) == 0) + num++; + if ((ei->i_da_metadata_calc_len % (idxs*idxs*idxs)) == 0) { + num++; + ei->i_da_metadata_calc_len = 0; + } else + ei->i_da_metadata_calc_len++; + ei->i_da_metadata_calc_last_lblock++; + return num; + } + + /* + * In the worst case we need a new set of index blocks at + * every level of the inode's extent tree. + */ + ei->i_da_metadata_calc_len = 1; + ei->i_da_metadata_calc_last_lblock = lblock; + return ext_depth(inode) + 1; +} + +static int +ext4_ext_max_entries(struct inode *inode, int depth) +{ + int max; + + if (depth == ext_depth(inode)) { + if (depth == 0) + max = ext4_ext_space_root(inode, 1); + else + max = ext4_ext_space_root_idx(inode, 1); + } else { + if (depth == 0) + max = ext4_ext_space_block(inode, 1); + else + max = ext4_ext_space_block_idx(inode, 1); + } + + return max; +} + +static int ext4_valid_extent(struct inode *inode, struct ext4_extent *ext) +{ + ext4_fsblk_t block = ext4_ext_pblock(ext); + int len = ext4_ext_get_actual_len(ext); + + if (len == 0) + return 0; + return ext4_data_block_valid(EXT4_SB(inode->i_sb), block, len); +} + +static int ext4_valid_extent_idx(struct inode *inode, + struct ext4_extent_idx *ext_idx) +{ + ext4_fsblk_t block = ext4_idx_pblock(ext_idx); + + return ext4_data_block_valid(EXT4_SB(inode->i_sb), block, 1); +} + +static int ext4_valid_extent_entries(struct inode *inode, + struct ext4_extent_header *eh, 
+ int depth) +{ + unsigned short entries; + if (eh->eh_entries == 0) + return 1; + + entries = le16_to_cpu(eh->eh_entries); + + if (depth == 0) { + /* leaf entries */ + struct ext4_extent *ext = EXT_FIRST_EXTENT(eh); + while (entries) { + if (!ext4_valid_extent(inode, ext)) + return 0; + ext++; + entries--; + } + } else { + struct ext4_extent_idx *ext_idx = EXT_FIRST_INDEX(eh); + while (entries) { + if (!ext4_valid_extent_idx(inode, ext_idx)) + return 0; + ext_idx++; + entries--; + } + } + return 1; +} + +static int __ext4_ext_check(const char *function, unsigned int line, + struct inode *inode, struct ext4_extent_header *eh, + int depth) +{ + const char *error_msg; + int max = 0; + + if (unlikely(eh->eh_magic != EXT4_EXT_MAGIC)) { + error_msg = "invalid magic"; + goto corrupted; + } + if (unlikely(le16_to_cpu(eh->eh_depth) != depth)) { + error_msg = "unexpected eh_depth"; + goto corrupted; + } + if (unlikely(eh->eh_max == 0)) { + error_msg = "invalid eh_max"; + goto corrupted; + } + max = ext4_ext_max_entries(inode, depth); + if (unlikely(le16_to_cpu(eh->eh_max) > max)) { + error_msg = "too large eh_max"; + goto corrupted; + } + if (unlikely(le16_to_cpu(eh->eh_entries) > le16_to_cpu(eh->eh_max))) { + error_msg = "invalid eh_entries"; + goto corrupted; + } + if (!ext4_valid_extent_entries(inode, eh, depth)) { + error_msg = "invalid extent entries"; + goto corrupted; + } + return 0; + +corrupted: + ext4_error_inode(inode, function, line, 0, + "bad header/extent: %s - magic %x, " + "entries %u, max %u(%u), depth %u(%u)", + error_msg, le16_to_cpu(eh->eh_magic), + le16_to_cpu(eh->eh_entries), le16_to_cpu(eh->eh_max), + max, le16_to_cpu(eh->eh_depth), depth); + + return -EIO; +} + +#define ext4_ext_check(inode, eh, depth) \ + __ext4_ext_check(__func__, __LINE__, inode, eh, depth) + +int ext4_ext_check_inode(struct inode *inode) +{ + return ext4_ext_check(inode, ext_inode_hdr(inode), ext_depth(inode)); +} + +#ifdef EXT_DEBUG +static void ext4_ext_show_path(struct inode 
*inode, struct ext4_ext_path *path) +{ + int k, l = path->p_depth; + + ext_debug("path:"); + for (k = 0; k <= l; k++, path++) { + if (path->p_idx) { + ext_debug(" %d->%llu", le32_to_cpu(path->p_idx->ei_block), + ext4_idx_pblock(path->p_idx)); + } else if (path->p_ext) { + ext_debug(" %d:[%d]%d:%llu ", + le32_to_cpu(path->p_ext->ee_block), + ext4_ext_is_uninitialized(path->p_ext), + ext4_ext_get_actual_len(path->p_ext), + ext4_ext_pblock(path->p_ext)); + } else + ext_debug(" []"); + } + ext_debug("\n"); +} + +static void ext4_ext_show_leaf(struct inode *inode, struct ext4_ext_path *path) +{ + int depth = ext_depth(inode); + struct ext4_extent_header *eh; + struct ext4_extent *ex; + int i; + + if (!path) + return; + + eh = path[depth].p_hdr; + ex = EXT_FIRST_EXTENT(eh); + + ext_debug("Displaying leaf extents for inode %lu\n", inode->i_ino); + + for (i = 0; i < le16_to_cpu(eh->eh_entries); i++, ex++) { + ext_debug("%d:[%d]%d:%llu ", le32_to_cpu(ex->ee_block), + ext4_ext_is_uninitialized(ex), + ext4_ext_get_actual_len(ex), ext4_ext_pblock(ex)); + } + ext_debug("\n"); +} + +static void ext4_ext_show_move(struct inode *inode, struct ext4_ext_path *path, + ext4_fsblk_t newblock, int level) +{ + int depth = ext_depth(inode); + struct ext4_extent *ex; + + if (depth != level) { + struct ext4_extent_idx *idx; + idx = path[level].p_idx; + while (idx <= EXT_MAX_INDEX(path[level].p_hdr)) { + ext_debug("%d: move %d:%llu in new index %llu\n", level, + le32_to_cpu(idx->ei_block), + ext4_idx_pblock(idx), + newblock); + idx++; + } + + return; + } + + ex = path[depth].p_ext; + while (ex <= EXT_MAX_EXTENT(path[depth].p_hdr)) { + ext_debug("move %d:%llu:[%d]%d in new leaf %llu\n", + le32_to_cpu(ex->ee_block), + ext4_ext_pblock(ex), + ext4_ext_is_uninitialized(ex), + ext4_ext_get_actual_len(ex), + newblock); + ex++; + } +} + +#else +#define ext4_ext_show_path(inode, path) +#define ext4_ext_show_leaf(inode, path) +#define ext4_ext_show_move(inode, path, newblock, level) +#endif + +void 
ext4_ext_drop_refs(struct ext4_ext_path *path) +{ + int depth = path->p_depth; + int i; + + for (i = 0; i <= depth; i++, path++) + if (path->p_bh) { + brelse(path->p_bh); + path->p_bh = NULL; + } +} + +/* + * ext4_ext_binsearch_idx: + * binary search for the closest index of the given block + * the header must be checked before calling this + */ +static void +ext4_ext_binsearch_idx(struct inode *inode, + struct ext4_ext_path *path, ext4_lblk_t block) +{ + struct ext4_extent_header *eh = path->p_hdr; + struct ext4_extent_idx *r, *l, *m; + + + ext_debug("binsearch for %u(idx): ", block); + + l = EXT_FIRST_INDEX(eh) + 1; + r = EXT_LAST_INDEX(eh); + while (l <= r) { + m = l + (r - l) / 2; + if (block < le32_to_cpu(m->ei_block)) + r = m - 1; + else + l = m + 1; + ext_debug("%p(%u):%p(%u):%p(%u) ", l, le32_to_cpu(l->ei_block), + m, le32_to_cpu(m->ei_block), + r, le32_to_cpu(r->ei_block)); + } + + path->p_idx = l - 1; + ext_debug(" -> %d->%lld ", le32_to_cpu(path->p_idx->ei_block), + ext4_idx_pblock(path->p_idx)); + +#ifdef CHECK_BINSEARCH + { + struct ext4_extent_idx *chix, *ix; + int k; + + chix = ix = EXT_FIRST_INDEX(eh); + for (k = 0; k < le16_to_cpu(eh->eh_entries); k++, ix++) { + if (k != 0 && + le32_to_cpu(ix->ei_block) <= le32_to_cpu(ix[-1].ei_block)) { + printk(KERN_DEBUG "k=%d, ix=0x%p, " + "first=0x%p\n", k, + ix, EXT_FIRST_INDEX(eh)); + printk(KERN_DEBUG "%u <= %u\n", + le32_to_cpu(ix->ei_block), + le32_to_cpu(ix[-1].ei_block)); + } + BUG_ON(k && le32_to_cpu(ix->ei_block) + <= le32_to_cpu(ix[-1].ei_block)); + if (block < le32_to_cpu(ix->ei_block)) + break; + chix = ix; + } + BUG_ON(chix != path->p_idx); + } +#endif + +} + +/* + * ext4_ext_binsearch: + * binary search for closest extent of the given block + * the header must be checked before calling this + */ +static void +ext4_ext_binsearch(struct inode *inode, + struct ext4_ext_path *path, ext4_lblk_t block) +{ + struct ext4_extent_header *eh = path->p_hdr; + struct ext4_extent *r, *l, *m; + + if 
(eh->eh_entries == 0) { + /* + * this leaf is empty: + * we get such a leaf in split/add case + */ + return; + } + + ext_debug("binsearch for %u: ", block); + + l = EXT_FIRST_EXTENT(eh) + 1; + r = EXT_LAST_EXTENT(eh); + + while (l <= r) { + m = l + (r - l) / 2; + if (block < le32_to_cpu(m->ee_block)) + r = m - 1; + else + l = m + 1; + ext_debug("%p(%u):%p(%u):%p(%u) ", l, le32_to_cpu(l->ee_block), + m, le32_to_cpu(m->ee_block), + r, le32_to_cpu(r->ee_block)); + } + + path->p_ext = l - 1; + ext_debug(" -> %d:%llu:[%d]%d ", + le32_to_cpu(path->p_ext->ee_block), + ext4_ext_pblock(path->p_ext), + ext4_ext_is_uninitialized(path->p_ext), + ext4_ext_get_actual_len(path->p_ext)); + +#ifdef CHECK_BINSEARCH + { + struct ext4_extent *chex, *ex; + int k; + + chex = ex = EXT_FIRST_EXTENT(eh); + for (k = 0; k < le16_to_cpu(eh->eh_entries); k++, ex++) { + BUG_ON(k && le32_to_cpu(ex->ee_block) + <= le32_to_cpu(ex[-1].ee_block)); + if (block < le32_to_cpu(ex->ee_block)) + break; + chex = ex; + } + BUG_ON(chex != path->p_ext); + } +#endif + +} + +int ext4_ext_tree_init(handle_t *handle, struct inode *inode) +{ + struct ext4_extent_header *eh; + + eh = ext_inode_hdr(inode); + eh->eh_depth = 0; + eh->eh_entries = 0; + eh->eh_magic = EXT4_EXT_MAGIC; + eh->eh_max = cpu_to_le16(ext4_ext_space_root(inode, 0)); + ext4_mark_inode_dirty(handle, inode); + ext4_ext_invalidate_cache(inode); + return 0; +} + +struct ext4_ext_path * +ext4_ext_find_extent(struct inode *inode, ext4_lblk_t block, + struct ext4_ext_path *path) +{ + struct ext4_extent_header *eh; + struct buffer_head *bh; + short int depth, i, ppos = 0, alloc = 0; + + eh = ext_inode_hdr(inode); + depth = ext_depth(inode); + + /* account possible depth increase */ + if (!path) { + path = kzalloc(sizeof(struct ext4_ext_path) * (depth + 2), + GFP_NOFS); + if (!path) + return ERR_PTR(-ENOMEM); + alloc = 1; + } + path[0].p_hdr = eh; + path[0].p_bh = NULL; + + i = depth; + /* walk through the tree */ + while (i) { + int need_to_validate = 
0; + + ext_debug("depth %d: num %d, max %d\n", + ppos, le16_to_cpu(eh->eh_entries), le16_to_cpu(eh->eh_max)); + + ext4_ext_binsearch_idx(inode, path + ppos, block); + path[ppos].p_block = ext4_idx_pblock(path[ppos].p_idx); + path[ppos].p_depth = i; + path[ppos].p_ext = NULL; + + bh = sb_getblk(inode->i_sb, path[ppos].p_block); + if (unlikely(!bh)) + goto err; + if (!bh_uptodate_or_lock(bh)) { + trace_ext4_ext_load_extent(inode, block, + path[ppos].p_block); + if (bh_submit_read(bh) < 0) { + put_bh(bh); + goto err; + } + /* validate the extent entries */ + need_to_validate = 1; + } + eh = ext_block_hdr(bh); + ppos++; + if (unlikely(ppos > depth)) { + put_bh(bh); + EXT4_ERROR_INODE(inode, + "ppos %d > depth %d", ppos, depth); + goto err; + } + path[ppos].p_bh = bh; + path[ppos].p_hdr = eh; + i--; + + if (need_to_validate && ext4_ext_check(inode, eh, i)) + goto err; + } + + path[ppos].p_depth = i; + path[ppos].p_ext = NULL; + path[ppos].p_idx = NULL; + + /* find extent */ + ext4_ext_binsearch(inode, path + ppos, block); + /* if not an empty leaf */ + if (path[ppos].p_ext) + path[ppos].p_block = ext4_ext_pblock(path[ppos].p_ext); + + ext4_ext_show_path(inode, path); + + return path; + +err: + ext4_ext_drop_refs(path); + if (alloc) + kfree(path); + return ERR_PTR(-EIO); +} + +/* + * ext4_ext_insert_index: + * insert new index [@logical;@ptr] into the block at @curp; + * check where to insert: before @curp or after @curp + */ +static int ext4_ext_insert_index(handle_t *handle, struct inode *inode, + struct ext4_ext_path *curp, + int logical, ext4_fsblk_t ptr) +{ + struct ext4_extent_idx *ix; + int len, err; + + err = ext4_ext_get_access(handle, inode, curp); + if (err) + return err; + + if (unlikely(logical == le32_to_cpu(curp->p_idx->ei_block))) { + EXT4_ERROR_INODE(inode, + "logical %d == ei_block %d!", + logical, le32_to_cpu(curp->p_idx->ei_block)); + return -EIO; + } + + if (unlikely(le16_to_cpu(curp->p_hdr->eh_entries) + >= le16_to_cpu(curp->p_hdr->eh_max))) { + 
EXT4_ERROR_INODE(inode, + "eh_entries %d >= eh_max %d!", + le16_to_cpu(curp->p_hdr->eh_entries), + le16_to_cpu(curp->p_hdr->eh_max)); + return -EIO; + } + + if (logical > le32_to_cpu(curp->p_idx->ei_block)) { + /* insert after */ + ext_debug("insert new index %d after: %llu\n", logical, ptr); + ix = curp->p_idx + 1; + } else { + /* insert before */ + ext_debug("insert new index %d before: %llu\n", logical, ptr); + ix = curp->p_idx; + } + + len = EXT_LAST_INDEX(curp->p_hdr) - ix + 1; + BUG_ON(len < 0); + if (len > 0) { + ext_debug("insert new index %d: " + "move %d indices from 0x%p to 0x%p\n", + logical, len, ix, ix + 1); + memmove(ix + 1, ix, len * sizeof(struct ext4_extent_idx)); + } + + if (unlikely(ix > EXT_MAX_INDEX(curp->p_hdr))) { + EXT4_ERROR_INODE(inode, "ix > EXT_MAX_INDEX!"); + return -EIO; + } + + ix->ei_block = cpu_to_le32(logical); + ext4_idx_store_pblock(ix, ptr); + le16_add_cpu(&curp->p_hdr->eh_entries, 1); + + if (unlikely(ix > EXT_LAST_INDEX(curp->p_hdr))) { + EXT4_ERROR_INODE(inode, "ix > EXT_LAST_INDEX!"); + return -EIO; + } + + err = ext4_ext_dirty(handle, inode, curp); + ext4_std_error(inode->i_sb, err); + + return err; +} + +/* + * ext4_ext_split: + * inserts new subtree into the path, using free index entry + * at depth @at: + * - allocates all needed blocks (new leaf and all intermediate index blocks) + * - makes decision where to split + * - moves remaining extents and index entries (right to the split point) + * into the newly allocated blocks + * - initializes subtree + */ +static int ext4_ext_split(handle_t *handle, struct inode *inode, + unsigned int flags, + struct ext4_ext_path *path, + struct ext4_extent *newext, int at) +{ + struct buffer_head *bh = NULL; + int depth = ext_depth(inode); + struct ext4_extent_header *neh; + struct ext4_extent_idx *fidx; + int i = at, k, m, a; + ext4_fsblk_t newblock, oldblock; + __le32 border; + ext4_fsblk_t *ablocks = NULL; /* array of allocated blocks */ + int err = 0; + + /* make decision: where 
to split? */ + /* FIXME: now decision is simplest: at current extent */ + + /* if current leaf will be split, then we should use + * border from split point */ + if (unlikely(path[depth].p_ext > EXT_MAX_EXTENT(path[depth].p_hdr))) { + EXT4_ERROR_INODE(inode, "p_ext > EXT_MAX_EXTENT!"); + return -EIO; + } + if (path[depth].p_ext != EXT_MAX_EXTENT(path[depth].p_hdr)) { + border = path[depth].p_ext[1].ee_block; + ext_debug("leaf will be split." + " next leaf starts at %d\n", + le32_to_cpu(border)); + } else { + border = newext->ee_block; + ext_debug("leaf will be added." + " next leaf starts at %d\n", + le32_to_cpu(border)); + } + + /* + * If error occurs, then we break processing + * and mark filesystem read-only. index won't + * be inserted and tree will be in consistent + * state. Next mount will repair buffers too. + */ + + /* + * Get array to track all allocated blocks. + * We need this to handle errors and free blocks + * upon them. + */ + ablocks = kzalloc(sizeof(ext4_fsblk_t) * depth, GFP_NOFS); + if (!ablocks) + return -ENOMEM; + + /* allocate all needed blocks */ + ext_debug("allocate %d blocks for indexes/leaf\n", depth - at); + for (a = 0; a < depth - at; a++) { + newblock = ext4_ext_new_meta_block(handle, inode, path, + newext, &err, flags); + if (newblock == 0) + goto cleanup; + ablocks[a] = newblock; + } + + /* initialize new leaf */ + newblock = ablocks[--a]; + if (unlikely(newblock == 0)) { + EXT4_ERROR_INODE(inode, "newblock == 0!"); + err = -EIO; + goto cleanup; + } + bh = sb_getblk(inode->i_sb, newblock); + if (!bh) { + err = -EIO; + goto cleanup; + } + lock_buffer(bh); + + err = ext4_journal_get_create_access(handle, bh); + if (err) + goto cleanup; + + neh = ext_block_hdr(bh); + neh->eh_entries = 0; + neh->eh_max = cpu_to_le16(ext4_ext_space_block(inode, 0)); + neh->eh_magic = EXT4_EXT_MAGIC; + neh->eh_depth = 0; + + /* move remainder of path[depth] to the new leaf */ + if (unlikely(path[depth].p_hdr->eh_entries != + path[depth].p_hdr->eh_max)) { 
+ EXT4_ERROR_INODE(inode, "eh_entries %d != eh_max %d!", + path[depth].p_hdr->eh_entries, + path[depth].p_hdr->eh_max); + err = -EIO; + goto cleanup; + } + /* start copy from next extent */ + m = EXT_MAX_EXTENT(path[depth].p_hdr) - path[depth].p_ext++; + ext4_ext_show_move(inode, path, newblock, depth); + if (m) { + struct ext4_extent *ex; + ex = EXT_FIRST_EXTENT(neh); + memmove(ex, path[depth].p_ext, sizeof(struct ext4_extent) * m); + le16_add_cpu(&neh->eh_entries, m); + } + + set_buffer_uptodate(bh); + unlock_buffer(bh); + + err = ext4_handle_dirty_metadata(handle, inode, bh); + if (err) + goto cleanup; + brelse(bh); + bh = NULL; + + /* correct old leaf */ + if (m) { + err = ext4_ext_get_access(handle, inode, path + depth); + if (err) + goto cleanup; + le16_add_cpu(&path[depth].p_hdr->eh_entries, -m); + err = ext4_ext_dirty(handle, inode, path + depth); + if (err) + goto cleanup; + + } + + /* create intermediate indexes */ + k = depth - at - 1; + if (unlikely(k < 0)) { + EXT4_ERROR_INODE(inode, "k %d < 0!", k); + err = -EIO; + goto cleanup; + } + if (k) + ext_debug("create %d intermediate indices\n", k); + /* insert new index into current index block */ + /* current depth stored in i var */ + i = depth - 1; + while (k--) { + oldblock = newblock; + newblock = ablocks[--a]; + bh = sb_getblk(inode->i_sb, newblock); + if (!bh) { + err = -EIO; + goto cleanup; + } + lock_buffer(bh); + + err = ext4_journal_get_create_access(handle, bh); + if (err) + goto cleanup; + + neh = ext_block_hdr(bh); + neh->eh_entries = cpu_to_le16(1); + neh->eh_magic = EXT4_EXT_MAGIC; + neh->eh_max = cpu_to_le16(ext4_ext_space_block_idx(inode, 0)); + neh->eh_depth = cpu_to_le16(depth - i); + fidx = EXT_FIRST_INDEX(neh); + fidx->ei_block = border; + ext4_idx_store_pblock(fidx, oldblock); + + ext_debug("int.index at %d (block %llu): %u -> %llu\n", + i, newblock, le32_to_cpu(border), oldblock); + + /* move remainder of path[i] to the new index block */ + if (unlikely(EXT_MAX_INDEX(path[i].p_hdr) 
!= + EXT_LAST_INDEX(path[i].p_hdr))) { + EXT4_ERROR_INODE(inode, + "EXT_MAX_INDEX != EXT_LAST_INDEX ee_block %d!", + le32_to_cpu(path[i].p_ext->ee_block)); + err = -EIO; + goto cleanup; + } + /* start copy indexes */ + m = EXT_MAX_INDEX(path[i].p_hdr) - path[i].p_idx++; + ext_debug("cur 0x%p, last 0x%p\n", path[i].p_idx, + EXT_MAX_INDEX(path[i].p_hdr)); + ext4_ext_show_move(inode, path, newblock, i); + if (m) { + memmove(++fidx, path[i].p_idx, + sizeof(struct ext4_extent_idx) * m); + le16_add_cpu(&neh->eh_entries, m); + } + set_buffer_uptodate(bh); + unlock_buffer(bh); + + err = ext4_handle_dirty_metadata(handle, inode, bh); + if (err) + goto cleanup; + brelse(bh); + bh = NULL; + + /* correct old index */ + if (m) { + err = ext4_ext_get_access(handle, inode, path + i); + if (err) + goto cleanup; + le16_add_cpu(&path[i].p_hdr->eh_entries, -m); + err = ext4_ext_dirty(handle, inode, path + i); + if (err) + goto cleanup; + } + + i--; + } + + /* insert new index */ + err = ext4_ext_insert_index(handle, inode, path + at, + le32_to_cpu(border), newblock); + +cleanup: + if (bh) { + if (buffer_locked(bh)) + unlock_buffer(bh); + brelse(bh); + } + + if (err) { + /* free all allocated blocks in error case */ + for (i = 0; i < depth; i++) { + if (!ablocks[i]) + continue; + ext4_free_blocks(handle, inode, NULL, ablocks[i], 1, + EXT4_FREE_BLOCKS_METADATA); + } + } + kfree(ablocks); + + return err; +} + +/* + * ext4_ext_grow_indepth: + * implements tree growing procedure: + * - allocates new block + * - moves top-level data (index block or leaf) into the new block + * - initializes new top-level, creating index that points to the + * just created block + */ +static int ext4_ext_grow_indepth(handle_t *handle, struct inode *inode, + unsigned int flags, + struct ext4_extent *newext) +{ + struct ext4_extent_header *neh; + struct buffer_head *bh; + ext4_fsblk_t newblock; + int err = 0; + + newblock = ext4_ext_new_meta_block(handle, inode, NULL, + newext, &err, flags); + if (newblock == 
0) + return err; + + bh = sb_getblk(inode->i_sb, newblock); + if (!bh) { + err = -EIO; + ext4_std_error(inode->i_sb, err); + return err; + } + lock_buffer(bh); + + err = ext4_journal_get_create_access(handle, bh); + if (err) { + unlock_buffer(bh); + goto out; + } + + /* move top-level index/leaf into new block */ + memmove(bh->b_data, EXT4_I(inode)->i_data, + sizeof(EXT4_I(inode)->i_data)); + + /* set size of new block */ + neh = ext_block_hdr(bh); + /* old root could have indexes or leaves + * so calculate e_max right way */ + if (ext_depth(inode)) + neh->eh_max = cpu_to_le16(ext4_ext_space_block_idx(inode, 0)); + else + neh->eh_max = cpu_to_le16(ext4_ext_space_block(inode, 0)); + neh->eh_magic = EXT4_EXT_MAGIC; + set_buffer_uptodate(bh); + unlock_buffer(bh); + + err = ext4_handle_dirty_metadata(handle, inode, bh); + if (err) + goto out; + + /* Update top-level index: num,max,pointer */ + neh = ext_inode_hdr(inode); + neh->eh_entries = cpu_to_le16(1); + ext4_idx_store_pblock(EXT_FIRST_INDEX(neh), newblock); + if (neh->eh_depth == 0) { + /* Root extent block becomes index block */ + neh->eh_max = cpu_to_le16(ext4_ext_space_root_idx(inode, 0)); + EXT_FIRST_INDEX(neh)->ei_block = + EXT_FIRST_EXTENT(neh)->ee_block; + } + ext_debug("new root: num %d(%d), lblock %d, ptr %llu\n", + le16_to_cpu(neh->eh_entries), le16_to_cpu(neh->eh_max), + le32_to_cpu(EXT_FIRST_INDEX(neh)->ei_block), + ext4_idx_pblock(EXT_FIRST_INDEX(neh))); + + neh->eh_depth = cpu_to_le16(le16_to_cpu(neh->eh_depth) + 1); + ext4_mark_inode_dirty(handle, inode); +out: + brelse(bh); + + return err; +} + +/* + * ext4_ext_create_new_leaf: + * finds empty index and adds new leaf. + * if no free index is found, then it requests in-depth growing. 
+ */ +static int ext4_ext_create_new_leaf(handle_t *handle, struct inode *inode, + unsigned int flags, + struct ext4_ext_path *path, + struct ext4_extent *newext) +{ + struct ext4_ext_path *curp; + int depth, i, err = 0; + +repeat: + i = depth = ext_depth(inode); + + /* walk up to the tree and look for free index entry */ + curp = path + depth; + while (i > 0 && !EXT_HAS_FREE_INDEX(curp)) { + i--; + curp--; + } + + /* we use already allocated block for index block, + * so subsequent data blocks should be contiguous */ + if (EXT_HAS_FREE_INDEX(curp)) { + /* if we found index with free entry, then use that + * entry: create all needed subtree and add new leaf */ + err = ext4_ext_split(handle, inode, flags, path, newext, i); + if (err) + goto out; + + /* refill path */ + ext4_ext_drop_refs(path); + path = ext4_ext_find_extent(inode, + (ext4_lblk_t)le32_to_cpu(newext->ee_block), + path); + if (IS_ERR(path)) + err = PTR_ERR(path); + } else { + /* tree is full, time to grow in depth */ + err = ext4_ext_grow_indepth(handle, inode, flags, newext); + if (err) + goto out; + + /* refill path */ + ext4_ext_drop_refs(path); + path = ext4_ext_find_extent(inode, + (ext4_lblk_t)le32_to_cpu(newext->ee_block), + path); + if (IS_ERR(path)) { + err = PTR_ERR(path); + goto out; + } + + /* + * only first (depth 0 -> 1) produces free space; + * in all other cases we have to split the grown tree + */ + depth = ext_depth(inode); + if (path[depth].p_hdr->eh_entries == path[depth].p_hdr->eh_max) { + /* now we need to split */ + goto repeat; + } + } + +out: + return err; +} + +/* + * search the closest allocated block to the left for *logical + * and returns it at @logical + it's physical address at @phys + * if *logical is the smallest allocated block, the function + * returns 0 at @phys + * return value contains 0 (success) or error code + */ +static int ext4_ext_search_left(struct inode *inode, + struct ext4_ext_path *path, + ext4_lblk_t *logical, ext4_fsblk_t *phys) +{ + struct 
ext4_extent_idx *ix; + struct ext4_extent *ex; + int depth, ee_len; + + if (unlikely(path == NULL)) { + EXT4_ERROR_INODE(inode, "path == NULL *logical %d!", *logical); + return -EIO; + } + depth = path->p_depth; + *phys = 0; + + if (depth == 0 && path->p_ext == NULL) + return 0; + + /* usually extent in the path covers blocks smaller + * then *logical, but it can be that extent is the + * first one in the file */ + + ex = path[depth].p_ext; + ee_len = ext4_ext_get_actual_len(ex); + if (*logical < le32_to_cpu(ex->ee_block)) { + if (unlikely(EXT_FIRST_EXTENT(path[depth].p_hdr) != ex)) { + EXT4_ERROR_INODE(inode, + "EXT_FIRST_EXTENT != ex *logical %d ee_block %d!", + *logical, le32_to_cpu(ex->ee_block)); + return -EIO; + } + while (--depth >= 0) { + ix = path[depth].p_idx; + if (unlikely(ix != EXT_FIRST_INDEX(path[depth].p_hdr))) { + EXT4_ERROR_INODE(inode, + "ix (%d) != EXT_FIRST_INDEX (%d) (depth %d)!", + ix != NULL ? le32_to_cpu(ix->ei_block) : 0, + EXT_FIRST_INDEX(path[depth].p_hdr) != NULL ? 
+ le32_to_cpu(EXT_FIRST_INDEX(path[depth].p_hdr)->ei_block) : 0, + depth); + return -EIO; + } + } + return 0; + } + + if (unlikely(*logical < (le32_to_cpu(ex->ee_block) + ee_len))) { + EXT4_ERROR_INODE(inode, + "logical %d < ee_block %d + ee_len %d!", + *logical, le32_to_cpu(ex->ee_block), ee_len); + return -EIO; + } + + *logical = le32_to_cpu(ex->ee_block) + ee_len - 1; + *phys = ext4_ext_pblock(ex) + ee_len - 1; + return 0; +} + +/* + * search the closest allocated block to the right for *logical + * and returns it at @logical + it's physical address at @phys + * if *logical is the largest allocated block, the function + * returns 0 at @phys + * return value contains 0 (success) or error code + */ +static int ext4_ext_search_right(struct inode *inode, + struct ext4_ext_path *path, + ext4_lblk_t *logical, ext4_fsblk_t *phys, + struct ext4_extent **ret_ex) +{ + struct buffer_head *bh = NULL; + struct ext4_extent_header *eh; + struct ext4_extent_idx *ix; + struct ext4_extent *ex; + ext4_fsblk_t block; + int depth; /* Note, NOT eh_depth; depth from top of tree */ + int ee_len; + + if (unlikely(path == NULL)) { + EXT4_ERROR_INODE(inode, "path == NULL *logical %d!", *logical); + return -EIO; + } + depth = path->p_depth; + *phys = 0; + + if (depth == 0 && path->p_ext == NULL) + return 0; + + /* usually extent in the path covers blocks smaller + * then *logical, but it can be that extent is the + * first one in the file */ + + ex = path[depth].p_ext; + ee_len = ext4_ext_get_actual_len(ex); + if (*logical < le32_to_cpu(ex->ee_block)) { + if (unlikely(EXT_FIRST_EXTENT(path[depth].p_hdr) != ex)) { + EXT4_ERROR_INODE(inode, + "first_extent(path[%d].p_hdr) != ex", + depth); + return -EIO; + } + while (--depth >= 0) { + ix = path[depth].p_idx; + if (unlikely(ix != EXT_FIRST_INDEX(path[depth].p_hdr))) { + EXT4_ERROR_INODE(inode, + "ix != EXT_FIRST_INDEX *logical %d!", + *logical); + return -EIO; + } + } + goto found_extent; + } + + if (unlikely(*logical < 
(le32_to_cpu(ex->ee_block) + ee_len))) { + EXT4_ERROR_INODE(inode, + "logical %d < ee_block %d + ee_len %d!", + *logical, le32_to_cpu(ex->ee_block), ee_len); + return -EIO; + } + + if (ex != EXT_LAST_EXTENT(path[depth].p_hdr)) { + /* next allocated block in this leaf */ + ex++; + goto found_extent; + } + + /* go up and search for index to the right */ + while (--depth >= 0) { + ix = path[depth].p_idx; + if (ix != EXT_LAST_INDEX(path[depth].p_hdr)) + goto got_index; + } + + /* we've gone up to the root and found no index to the right */ + return 0; + +got_index: + /* we've found index to the right, let's + * follow it and find the closest allocated + * block to the right */ + ix++; + block = ext4_idx_pblock(ix); + while (++depth < path->p_depth) { + bh = sb_bread(inode->i_sb, block); + if (bh == NULL) + return -EIO; + eh = ext_block_hdr(bh); + /* subtract from p_depth to get proper eh_depth */ + if (ext4_ext_check(inode, eh, path->p_depth - depth)) { + put_bh(bh); + return -EIO; + } + ix = EXT_FIRST_INDEX(eh); + block = ext4_idx_pblock(ix); + put_bh(bh); + } + + bh = sb_bread(inode->i_sb, block); + if (bh == NULL) + return -EIO; + eh = ext_block_hdr(bh); + if (ext4_ext_check(inode, eh, path->p_depth - depth)) { + put_bh(bh); + return -EIO; + } + ex = EXT_FIRST_EXTENT(eh); +found_extent: + *logical = le32_to_cpu(ex->ee_block); + *phys = ext4_ext_pblock(ex); + *ret_ex = ex; + if (bh) + put_bh(bh); + return 0; +} + +/* + * ext4_ext_next_allocated_block: + * returns allocated block in subsequent extent or EXT_MAX_BLOCKS. + * NOTE: it considers block number from index entry as + * allocated block. Thus, index entries have to be consistent + * with leaves. 
+ */ +static ext4_lblk_t +ext4_ext_next_allocated_block(struct ext4_ext_path *path) +{ + int depth; + + BUG_ON(path == NULL); + depth = path->p_depth; + + if (depth == 0 && path->p_ext == NULL) + return EXT_MAX_BLOCKS; + + while (depth >= 0) { + if (depth == path->p_depth) { + /* leaf */ + if (path[depth].p_ext && + path[depth].p_ext != + EXT_LAST_EXTENT(path[depth].p_hdr)) + return le32_to_cpu(path[depth].p_ext[1].ee_block); + } else { + /* index */ + if (path[depth].p_idx != + EXT_LAST_INDEX(path[depth].p_hdr)) + return le32_to_cpu(path[depth].p_idx[1].ei_block); + } + depth--; + } + + return EXT_MAX_BLOCKS; +} + +/* + * ext4_ext_next_leaf_block: + * returns first allocated block from next leaf or EXT_MAX_BLOCKS + */ +static ext4_lblk_t ext4_ext_next_leaf_block(struct ext4_ext_path *path) +{ + int depth; + + BUG_ON(path == NULL); + depth = path->p_depth; + + /* zero-tree has no leaf blocks at all */ + if (depth == 0) + return EXT_MAX_BLOCKS; + + /* go to index block */ + depth--; + + while (depth >= 0) { + if (path[depth].p_idx != + EXT_LAST_INDEX(path[depth].p_hdr)) + return (ext4_lblk_t) + le32_to_cpu(path[depth].p_idx[1].ei_block); + depth--; + } + + return EXT_MAX_BLOCKS; +} + +/* + * ext4_ext_correct_indexes: + * if leaf gets modified and modified extent is first in the leaf, + * then we have to correct all indexes above. + * TODO: do we need to correct tree in all cases? 
+ */ +static int ext4_ext_correct_indexes(handle_t *handle, struct inode *inode, + struct ext4_ext_path *path) +{ + struct ext4_extent_header *eh; + int depth = ext_depth(inode); + struct ext4_extent *ex; + __le32 border; + int k, err = 0; + + eh = path[depth].p_hdr; + ex = path[depth].p_ext; + + if (unlikely(ex == NULL || eh == NULL)) { + EXT4_ERROR_INODE(inode, + "ex %p == NULL or eh %p == NULL", ex, eh); + return -EIO; + } + + if (depth == 0) { + /* there is no tree at all */ + return 0; + } + + if (ex != EXT_FIRST_EXTENT(eh)) { + /* we correct tree if first leaf got modified only */ + return 0; + } + + /* + * TODO: we need correction if border is smaller than current one + */ + k = depth - 1; + border = path[depth].p_ext->ee_block; + err = ext4_ext_get_access(handle, inode, path + k); + if (err) + return err; + path[k].p_idx->ei_block = border; + err = ext4_ext_dirty(handle, inode, path + k); + if (err) + return err; + + while (k--) { + /* change all left-side indexes */ + if (path[k+1].p_idx != EXT_FIRST_INDEX(path[k+1].p_hdr)) + break; + err = ext4_ext_get_access(handle, inode, path + k); + if (err) + break; + path[k].p_idx->ei_block = border; + err = ext4_ext_dirty(handle, inode, path + k); + if (err) + break; + } + + return err; +} + +int +ext4_can_extents_be_merged(struct inode *inode, struct ext4_extent *ex1, + struct ext4_extent *ex2) +{ + unsigned short ext1_ee_len, ext2_ee_len, max_len; + + /* + * Make sure that either both extents are uninitialized, or + * both are _not_. 
+ */ + if (ext4_ext_is_uninitialized(ex1) ^ ext4_ext_is_uninitialized(ex2)) + return 0; + + if (ext4_ext_is_uninitialized(ex1)) + max_len = EXT_UNINIT_MAX_LEN; + else + max_len = EXT_INIT_MAX_LEN; + + ext1_ee_len = ext4_ext_get_actual_len(ex1); + ext2_ee_len = ext4_ext_get_actual_len(ex2); + + if (le32_to_cpu(ex1->ee_block) + ext1_ee_len != + le32_to_cpu(ex2->ee_block)) + return 0; + + /* + * To allow future support for preallocated extents to be added + * as an RO_COMPAT feature, refuse to merge to extents if + * this can result in the top bit of ee_len being set. + */ + if (ext1_ee_len + ext2_ee_len > max_len) + return 0; +#ifdef AGGRESSIVE_TEST + if (ext1_ee_len >= 4) + return 0; +#endif + + if (ext4_ext_pblock(ex1) + ext1_ee_len == ext4_ext_pblock(ex2)) + return 1; + return 0; +} + +/* + * This function tries to merge the "ex" extent to the next extent in the tree. + * It always tries to merge towards right. If you want to merge towards + * left, pass "ex - 1" as argument instead of "ex". + * Returns 0 if the extents (ex and ex+1) were _not_ merged and returns + * 1 if they got merged. + */ +static int ext4_ext_try_to_merge_right(struct inode *inode, + struct ext4_ext_path *path, + struct ext4_extent *ex) +{ + struct ext4_extent_header *eh; + unsigned int depth, len; + int merge_done = 0; + int uninitialized = 0; + + depth = ext_depth(inode); + BUG_ON(path[depth].p_hdr == NULL); + eh = path[depth].p_hdr; + + while (ex < EXT_LAST_EXTENT(eh)) { + if (!ext4_can_extents_be_merged(inode, ex, ex + 1)) + break; + /* merge with next extent! 
*/ + if (ext4_ext_is_uninitialized(ex)) + uninitialized = 1; + ex->ee_len = cpu_to_le16(ext4_ext_get_actual_len(ex) + + ext4_ext_get_actual_len(ex + 1)); + if (uninitialized) + ext4_ext_mark_uninitialized(ex); + + if (ex + 1 < EXT_LAST_EXTENT(eh)) { + len = (EXT_LAST_EXTENT(eh) - ex - 1) + * sizeof(struct ext4_extent); + memmove(ex + 1, ex + 2, len); + } + le16_add_cpu(&eh->eh_entries, -1); + merge_done = 1; + WARN_ON(eh->eh_entries == 0); + if (!eh->eh_entries) + EXT4_ERROR_INODE(inode, "eh->eh_entries = 0!"); + } + + return merge_done; +} + +/* + * This function tries to merge the @ex extent to neighbours in the tree. + * return 1 if merge left else 0. + */ +static int ext4_ext_try_to_merge(struct inode *inode, + struct ext4_ext_path *path, + struct ext4_extent *ex) { + struct ext4_extent_header *eh; + unsigned int depth; + int merge_done = 0; + int ret = 0; + + depth = ext_depth(inode); + BUG_ON(path[depth].p_hdr == NULL); + eh = path[depth].p_hdr; + + if (ex > EXT_FIRST_EXTENT(eh)) + merge_done = ext4_ext_try_to_merge_right(inode, path, ex - 1); + + if (!merge_done) + ret = ext4_ext_try_to_merge_right(inode, path, ex); + + return ret; +} + +/* + * check if a portion of the "newext" extent overlaps with an + * existing extent. + * + * If there is an overlap discovered, it updates the length of the newext + * such that there will be no overlap, and then returns 1. + * If there is no overlap found, it returns 0. 
+ */ +static unsigned int ext4_ext_check_overlap(struct ext4_sb_info *sbi, + struct inode *inode, + struct ext4_extent *newext, + struct ext4_ext_path *path) +{ + ext4_lblk_t b1, b2; + unsigned int depth, len1; + unsigned int ret = 0; + + b1 = le32_to_cpu(newext->ee_block); + len1 = ext4_ext_get_actual_len(newext); + depth = ext_depth(inode); + if (!path[depth].p_ext) + goto out; + b2 = le32_to_cpu(path[depth].p_ext->ee_block); + b2 &= ~(sbi->s_cluster_ratio - 1); + + /* + * get the next allocated block if the extent in the path + * is before the requested block(s) + */ + if (b2 < b1) { + b2 = ext4_ext_next_allocated_block(path); + if (b2 == EXT_MAX_BLOCKS) + goto out; + b2 &= ~(sbi->s_cluster_ratio - 1); + } + + /* check for wrap through zero on extent logical start block*/ + if (b1 + len1 < b1) { + len1 = EXT_MAX_BLOCKS - b1; + newext->ee_len = cpu_to_le16(len1); + ret = 1; + } + + /* check for overlap */ + if (b1 + len1 > b2) { + newext->ee_len = cpu_to_le16(b2 - b1); + ret = 1; + } +out: + return ret; +} + +/* + * ext4_ext_insert_extent: + * tries to merge requsted extent into the existing extent or + * inserts requested extent as new one into the tree, + * creating new leaf in the no-space case. 
+ */ +int ext4_ext_insert_extent(handle_t *handle, struct inode *inode, + struct ext4_ext_path *path, + struct ext4_extent *newext, int flag) +{ + struct ext4_extent_header *eh; + struct ext4_extent *ex, *fex; + struct ext4_extent *nearex; /* nearest extent */ + struct ext4_ext_path *npath = NULL; + int depth, len, err; + ext4_lblk_t next; + unsigned uninitialized = 0; + int flags = 0; + + if (unlikely(ext4_ext_get_actual_len(newext) == 0)) { + EXT4_ERROR_INODE(inode, "ext4_ext_get_actual_len(newext) == 0"); + return -EIO; + } + depth = ext_depth(inode); + ex = path[depth].p_ext; + if (unlikely(path[depth].p_hdr == NULL)) { + EXT4_ERROR_INODE(inode, "path[%d].p_hdr == NULL", depth); + return -EIO; + } + + /* try to insert block into found extent and return */ + if (ex && !(flag & EXT4_GET_BLOCKS_PRE_IO) + && ext4_can_extents_be_merged(inode, ex, newext)) { + ext_debug("append [%d]%d block to %u:[%d]%d (from %llu)\n", + ext4_ext_is_uninitialized(newext), + ext4_ext_get_actual_len(newext), + le32_to_cpu(ex->ee_block), + ext4_ext_is_uninitialized(ex), + ext4_ext_get_actual_len(ex), + ext4_ext_pblock(ex)); + err = ext4_ext_get_access(handle, inode, path + depth); + if (err) + return err; + + /* + * ext4_can_extents_be_merged should have checked that either + * both extents are uninitialized, or both aren't. Thus we + * need to check only one of them here. + */ + if (ext4_ext_is_uninitialized(ex)) + uninitialized = 1; + ex->ee_len = cpu_to_le16(ext4_ext_get_actual_len(ex) + + ext4_ext_get_actual_len(newext)); + if (uninitialized) + ext4_ext_mark_uninitialized(ex); + eh = path[depth].p_hdr; + nearex = ex; + goto merge; + } + + depth = ext_depth(inode); + eh = path[depth].p_hdr; + if (le16_to_cpu(eh->eh_entries) < le16_to_cpu(eh->eh_max)) + goto has_space; + + /* probably next leaf has space for us? 
*/ + fex = EXT_LAST_EXTENT(eh); + next = EXT_MAX_BLOCKS; + if (le32_to_cpu(newext->ee_block) > le32_to_cpu(fex->ee_block)) + next = ext4_ext_next_leaf_block(path); + if (next != EXT_MAX_BLOCKS) { + ext_debug("next leaf block - %u\n", next); + BUG_ON(npath != NULL); + npath = ext4_ext_find_extent(inode, next, NULL); + if (IS_ERR(npath)) + return PTR_ERR(npath); + BUG_ON(npath->p_depth != path->p_depth); + eh = npath[depth].p_hdr; + if (le16_to_cpu(eh->eh_entries) < le16_to_cpu(eh->eh_max)) { + ext_debug("next leaf isn't full(%d)\n", + le16_to_cpu(eh->eh_entries)); + path = npath; + goto has_space; + } + ext_debug("next leaf has no free space(%d,%d)\n", + le16_to_cpu(eh->eh_entries), le16_to_cpu(eh->eh_max)); + } + + /* + * There is no free space in the found leaf. + * We're gonna add a new leaf in the tree. + */ + if (flag & EXT4_GET_BLOCKS_PUNCH_OUT_EXT) + flags = EXT4_MB_USE_ROOT_BLOCKS; + err = ext4_ext_create_new_leaf(handle, inode, flags, path, newext); + if (err) + goto cleanup; + depth = ext_depth(inode); + eh = path[depth].p_hdr; + +has_space: + nearex = path[depth].p_ext; + + err = ext4_ext_get_access(handle, inode, path + depth); + if (err) + goto cleanup; + + if (!nearex) { + /* there is no extent in this leaf, create first one */ + ext_debug("first extent in the leaf: %u:%llu:[%d]%d\n", + le32_to_cpu(newext->ee_block), + ext4_ext_pblock(newext), + ext4_ext_is_uninitialized(newext), + ext4_ext_get_actual_len(newext)); + nearex = EXT_FIRST_EXTENT(eh); + } else { + if (le32_to_cpu(newext->ee_block) + > le32_to_cpu(nearex->ee_block)) { + /* Insert after */ + ext_debug("insert %u:%llu:[%d]%d before: " + "nearest %p\n", + le32_to_cpu(newext->ee_block), + ext4_ext_pblock(newext), + ext4_ext_is_uninitialized(newext), + ext4_ext_get_actual_len(newext), + nearex); + nearex++; + } else { + /* Insert before */ + BUG_ON(newext->ee_block == nearex->ee_block); + ext_debug("insert %u:%llu:[%d]%d after: " + "nearest %p\n", + le32_to_cpu(newext->ee_block), + 
ext4_ext_pblock(newext), + ext4_ext_is_uninitialized(newext), + ext4_ext_get_actual_len(newext), + nearex); + } + len = EXT_LAST_EXTENT(eh) - nearex + 1; + if (len > 0) { + ext_debug("insert %u:%llu:[%d]%d: " + "move %d extents from 0x%p to 0x%p\n", + le32_to_cpu(newext->ee_block), + ext4_ext_pblock(newext), + ext4_ext_is_uninitialized(newext), + ext4_ext_get_actual_len(newext), + len, nearex, nearex + 1); + memmove(nearex + 1, nearex, + len * sizeof(struct ext4_extent)); + } + } + + le16_add_cpu(&eh->eh_entries, 1); + path[depth].p_ext = nearex; + nearex->ee_block = newext->ee_block; + ext4_ext_store_pblock(nearex, ext4_ext_pblock(newext)); + nearex->ee_len = newext->ee_len; + +merge: + /* try to merge extents to the right */ + if (!(flag & EXT4_GET_BLOCKS_PRE_IO)) + ext4_ext_try_to_merge(inode, path, nearex); + + /* try to merge extents to the left */ + + /* time to correct all indexes above */ + err = ext4_ext_correct_indexes(handle, inode, path); + if (err) + goto cleanup; + + err = ext4_ext_dirty(handle, inode, path + depth); + +cleanup: + if (npath) { + ext4_ext_drop_refs(npath); + kfree(npath); + } + ext4_ext_invalidate_cache(inode); + return err; +} + +static int ext4_ext_walk_space(struct inode *inode, ext4_lblk_t block, + ext4_lblk_t num, ext_prepare_callback func, + void *cbdata) +{ + struct ext4_ext_path *path = NULL; + struct ext4_ext_cache cbex; + struct ext4_extent *ex; + ext4_lblk_t next, start = 0, end = 0; + ext4_lblk_t last = block + num; + int depth, exists, err = 0; + + BUG_ON(func == NULL); + BUG_ON(inode == NULL); + + while (block < last && block != EXT_MAX_BLOCKS) { + num = last - block; + /* find extent for this block */ + down_read(&EXT4_I(inode)->i_data_sem); + path = ext4_ext_find_extent(inode, block, path); + up_read(&EXT4_I(inode)->i_data_sem); + if (IS_ERR(path)) { + err = PTR_ERR(path); + path = NULL; + break; + } + + depth = ext_depth(inode); + if (unlikely(path[depth].p_hdr == NULL)) { + EXT4_ERROR_INODE(inode, "path[%d].p_hdr == 
NULL", depth); + err = -EIO; + break; + } + ex = path[depth].p_ext; + next = ext4_ext_next_allocated_block(path); + + exists = 0; + if (!ex) { + /* there is no extent yet, so try to allocate + * all requested space */ + start = block; + end = block + num; + } else if (le32_to_cpu(ex->ee_block) > block) { + /* need to allocate space before found extent */ + start = block; + end = le32_to_cpu(ex->ee_block); + if (block + num < end) + end = block + num; + } else if (block >= le32_to_cpu(ex->ee_block) + + ext4_ext_get_actual_len(ex)) { + /* need to allocate space after found extent */ + start = block; + end = block + num; + if (end >= next) + end = next; + } else if (block >= le32_to_cpu(ex->ee_block)) { + /* + * some part of requested space is covered + * by found extent + */ + start = block; + end = le32_to_cpu(ex->ee_block) + + ext4_ext_get_actual_len(ex); + if (block + num < end) + end = block + num; + exists = 1; + } else { + BUG(); + } + BUG_ON(end <= start); + + if (!exists) { + cbex.ec_block = start; + cbex.ec_len = end - start; + cbex.ec_start = 0; + } else { + cbex.ec_block = le32_to_cpu(ex->ee_block); + cbex.ec_len = ext4_ext_get_actual_len(ex); + cbex.ec_start = ext4_ext_pblock(ex); + } + + if (unlikely(cbex.ec_len == 0)) { + EXT4_ERROR_INODE(inode, "cbex.ec_len == 0"); + err = -EIO; + break; + } + err = func(inode, next, &cbex, ex, cbdata); + ext4_ext_drop_refs(path); + + if (err < 0) + break; + + if (err == EXT_REPEAT) + continue; + else if (err == EXT_BREAK) { + err = 0; + break; + } + + if (ext_depth(inode) != depth) { + /* depth was changed. 
we have to realloc path */ + kfree(path); + path = NULL; + } + + block = cbex.ec_block + cbex.ec_len; + } + + if (path) { + ext4_ext_drop_refs(path); + kfree(path); + } + + return err; +} + +static void +ext4_ext_put_in_cache(struct inode *inode, ext4_lblk_t block, + __u32 len, ext4_fsblk_t start) +{ + struct ext4_ext_cache *cex; + BUG_ON(len == 0); + spin_lock(&EXT4_I(inode)->i_block_reservation_lock); + trace_ext4_ext_put_in_cache(inode, block, len, start); + cex = &EXT4_I(inode)->i_cached_extent; + cex->ec_block = block; + cex->ec_len = len; + cex->ec_start = start; + spin_unlock(&EXT4_I(inode)->i_block_reservation_lock); +} + +/* + * ext4_ext_put_gap_in_cache: + * calculate boundaries of the gap that the requested block fits into + * and cache this gap + */ +static void +ext4_ext_put_gap_in_cache(struct inode *inode, struct ext4_ext_path *path, + ext4_lblk_t block) +{ + int depth = ext_depth(inode); + unsigned long len; + ext4_lblk_t lblock; + struct ext4_extent *ex; + + ex = path[depth].p_ext; + if (ex == NULL) { + /* there is no extent yet, so gap is [0;-] */ + lblock = 0; + len = EXT_MAX_BLOCKS; + ext_debug("cache gap(whole file):"); + } else if (block < le32_to_cpu(ex->ee_block)) { + lblock = block; + len = le32_to_cpu(ex->ee_block) - block; + ext_debug("cache gap(before): %u [%u:%u]", + block, + le32_to_cpu(ex->ee_block), + ext4_ext_get_actual_len(ex)); + } else if (block >= le32_to_cpu(ex->ee_block) + + ext4_ext_get_actual_len(ex)) { + ext4_lblk_t next; + lblock = le32_to_cpu(ex->ee_block) + + ext4_ext_get_actual_len(ex); + + next = ext4_ext_next_allocated_block(path); + ext_debug("cache gap(after): [%u:%u] %u", + le32_to_cpu(ex->ee_block), + ext4_ext_get_actual_len(ex), + block); + BUG_ON(next == lblock); + len = next - lblock; + } else { + lblock = len = 0; + BUG(); + } + + ext_debug(" -> %u:%lu\n", lblock, len); + ext4_ext_put_in_cache(inode, lblock, len, 0); +} + +/* + * ext4_ext_check_cache() + * Checks to see if the given block is in the cache. 
+ * If it is, the cached extent is stored in the given + * cache extent pointer. If the cached extent is a hole, + * this routine should be used instead of + * ext4_ext_in_cache if the calling function needs to + * know the size of the hole. + * + * @inode: The files inode + * @block: The block to look for in the cache + * @ex: Pointer where the cached extent will be stored + * if it contains block + * + * Return 0 if cache is invalid; 1 if the cache is valid + */ +static int ext4_ext_check_cache(struct inode *inode, ext4_lblk_t block, + struct ext4_ext_cache *ex){ + struct ext4_ext_cache *cex; + struct ext4_sb_info *sbi; + int ret = 0; + + /* + * We borrow i_block_reservation_lock to protect i_cached_extent + */ + spin_lock(&EXT4_I(inode)->i_block_reservation_lock); + cex = &EXT4_I(inode)->i_cached_extent; + sbi = EXT4_SB(inode->i_sb); + + /* has cache valid data? */ + if (cex->ec_len == 0) + goto errout; + + if (in_range(block, cex->ec_block, cex->ec_len)) { + memcpy(ex, cex, sizeof(struct ext4_ext_cache)); + ext_debug("%u cached by %u:%u:%llu\n", + block, + cex->ec_block, cex->ec_len, cex->ec_start); + ret = 1; + } +errout: + trace_ext4_ext_in_cache(inode, block, ret); + spin_unlock(&EXT4_I(inode)->i_block_reservation_lock); + return ret; +} + +/* + * ext4_ext_in_cache() + * Checks to see if the given block is in the cache. + * If it is, the cached extent is stored in the given + * extent pointer. 
+ * + * @inode: The files inode + * @block: The block to look for in the cache + * @ex: Pointer where the cached extent will be stored + * if it contains block + * + * Return 0 if cache is invalid; 1 if the cache is valid + */ +static int +ext4_ext_in_cache(struct inode *inode, ext4_lblk_t block, + struct ext4_extent *ex) +{ + struct ext4_ext_cache cex; + int ret = 0; + + if (ext4_ext_check_cache(inode, block, &cex)) { + ex->ee_block = cpu_to_le32(cex.ec_block); + ext4_ext_store_pblock(ex, cex.ec_start); + ex->ee_len = cpu_to_le16(cex.ec_len); + ret = 1; + } + + return ret; +} + + +/* + * ext4_ext_rm_idx: + * removes index from the index block. + */ +static int ext4_ext_rm_idx(handle_t *handle, struct inode *inode, + struct ext4_ext_path *path) +{ + int err; + ext4_fsblk_t leaf; + + /* free index block */ + path--; + leaf = ext4_idx_pblock(path->p_idx); + if (unlikely(path->p_hdr->eh_entries == 0)) { + EXT4_ERROR_INODE(inode, "path->p_hdr->eh_entries == 0"); + return -EIO; + } + err = ext4_ext_get_access(handle, inode, path); + if (err) + return err; + + if (path->p_idx != EXT_LAST_INDEX(path->p_hdr)) { + int len = EXT_LAST_INDEX(path->p_hdr) - path->p_idx; + len *= sizeof(struct ext4_extent_idx); + memmove(path->p_idx, path->p_idx + 1, len); + } + + le16_add_cpu(&path->p_hdr->eh_entries, -1); + err = ext4_ext_dirty(handle, inode, path); + if (err) + return err; + ext_debug("index is empty, remove it, free block %llu\n", leaf); + trace_ext4_ext_rm_idx(inode, leaf); + + ext4_free_blocks(handle, inode, NULL, leaf, 1, + EXT4_FREE_BLOCKS_METADATA | EXT4_FREE_BLOCKS_FORGET); + return err; +} + +/* + * ext4_ext_calc_credits_for_single_extent: + * This routine returns max. credits that needed to insert an extent + * to the extent tree. + * When pass the actual path, the caller should calculate credits + * under i_data_sem. 
+ */ +int ext4_ext_calc_credits_for_single_extent(struct inode *inode, int nrblocks, + struct ext4_ext_path *path) +{ + if (path) { + int depth = ext_depth(inode); + int ret = 0; + + /* probably there is space in leaf? */ + if (le16_to_cpu(path[depth].p_hdr->eh_entries) + < le16_to_cpu(path[depth].p_hdr->eh_max)) { + + /* + * There are some space in the leaf tree, no + * need to account for leaf block credit + * + * bitmaps and block group descriptor blocks + * and other metadata blocks still need to be + * accounted. + */ + /* 1 bitmap, 1 block group descriptor */ + ret = 2 + EXT4_META_TRANS_BLOCKS(inode->i_sb); + return ret; + } + } + + return ext4_chunk_trans_blocks(inode, nrblocks); +} + +/* + * How many index/leaf blocks need to change/allocate to modify nrblocks? + * + * if nrblocks are fit in a single extent (chunk flag is 1), then + * in the worse case, each tree level index/leaf need to be changed + * if the tree split due to insert a new extent, then the old tree + * index/leaf need to be updated too + * + * If the nrblocks are discontiguous, they could cause + * the whole tree split more than once, but this is really rare. + */ +int ext4_ext_index_trans_blocks(struct inode *inode, int nrblocks, int chunk) +{ + int index; + int depth = ext_depth(inode); + + if (chunk) + index = depth * 2; + else + index = depth * 3; + + return index; +} + +static int ext4_remove_blocks(handle_t *handle, struct inode *inode, + struct ext4_extent *ex, + ext4_fsblk_t *partial_cluster, + ext4_lblk_t from, ext4_lblk_t to) +{ + struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); + unsigned short ee_len = ext4_ext_get_actual_len(ex); + ext4_fsblk_t pblk; + int flags = EXT4_FREE_BLOCKS_FORGET; + + if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode)) + flags |= EXT4_FREE_BLOCKS_METADATA; + /* + * For bigalloc file systems, we never free a partial cluster + * at the beginning of the extent. 
Instead, we make a note + * that we tried freeing the cluster, and check to see if we + * need to free it on a subsequent call to ext4_remove_blocks, + * or at the end of the ext4_truncate() operation. + */ + flags |= EXT4_FREE_BLOCKS_NOFREE_FIRST_CLUSTER; + + trace_ext4_remove_blocks(inode, ex, from, to, *partial_cluster); + /* + * If we have a partial cluster, and it's different from the + * cluster of the last block, we need to explicitly free the + * partial cluster here. + */ + pblk = ext4_ext_pblock(ex) + ee_len - 1; + if (*partial_cluster && (EXT4_B2C(sbi, pblk) != *partial_cluster)) { + ext4_free_blocks(handle, inode, NULL, + EXT4_C2B(sbi, *partial_cluster), + sbi->s_cluster_ratio, flags); + *partial_cluster = 0; + } + +#ifdef EXTENTS_STATS + { + struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); + spin_lock(&sbi->s_ext_stats_lock); + sbi->s_ext_blocks += ee_len; + sbi->s_ext_extents++; + if (ee_len < sbi->s_ext_min) + sbi->s_ext_min = ee_len; + if (ee_len > sbi->s_ext_max) + sbi->s_ext_max = ee_len; + if (ext_depth(inode) > sbi->s_depth_max) + sbi->s_depth_max = ext_depth(inode); + spin_unlock(&sbi->s_ext_stats_lock); + } +#endif + if (from >= le32_to_cpu(ex->ee_block) + && to == le32_to_cpu(ex->ee_block) + ee_len - 1) { + /* tail removal */ + ext4_lblk_t num; + + num = le32_to_cpu(ex->ee_block) + ee_len - from; + pblk = ext4_ext_pblock(ex) + ee_len - num; + ext_debug("free last %u blocks starting %llu\n", num, pblk); + ext4_free_blocks(handle, inode, NULL, pblk, num, flags); + /* + * If the block range to be freed didn't start at the + * beginning of a cluster, and we removed the entire + * extent, save the partial cluster here, since we + * might need to delete if we determine that the + * truncate operation has removed all of the blocks in + * the cluster. 
+ */ + if (pblk & (sbi->s_cluster_ratio - 1) && + (ee_len == num)) + *partial_cluster = EXT4_B2C(sbi, pblk); + else + *partial_cluster = 0; + } else if (from == le32_to_cpu(ex->ee_block) + && to <= le32_to_cpu(ex->ee_block) + ee_len - 1) { + /* head removal */ + ext4_lblk_t num; + ext4_fsblk_t start; + + num = to - from; + start = ext4_ext_pblock(ex); + + ext_debug("free first %u blocks starting %llu\n", num, start); + ext4_free_blocks(handle, inode, NULL, start, num, flags); + + } else { + printk(KERN_INFO "strange request: removal(2) " + "%u-%u from %u:%u\n", + from, to, le32_to_cpu(ex->ee_block), ee_len); + } + return 0; +} + + +/* + * ext4_ext_rm_leaf() Removes the extents associated with the + * blocks appearing between "start" and "end", and splits the extents + * if "start" and "end" appear in the same extent + * + * @handle: The journal handle + * @inode: The files inode + * @path: The path to the leaf + * @start: The first block to remove + * @end: The last block to remove + */ +static int +ext4_ext_rm_leaf(handle_t *handle, struct inode *inode, + struct ext4_ext_path *path, ext4_fsblk_t *partial_cluster, + ext4_lblk_t start, ext4_lblk_t end) +{ + struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); + int err = 0, correct_index = 0; + int depth = ext_depth(inode), credits; + struct ext4_extent_header *eh; + ext4_lblk_t a, b; + unsigned num; + ext4_lblk_t ex_ee_block; + unsigned short ex_ee_len; + unsigned uninitialized = 0; + struct ext4_extent *ex; + + /* the header must be checked already in ext4_ext_remove_space() */ + ext_debug("truncate since %u in leaf to %u\n", start, end); + if (!path[depth].p_hdr) + path[depth].p_hdr = ext_block_hdr(path[depth].p_bh); + eh = path[depth].p_hdr; + if (unlikely(path[depth].p_hdr == NULL)) { + EXT4_ERROR_INODE(inode, "path[%d].p_hdr == NULL", depth); + return -EIO; + } + /* find where to start removing */ + ex = EXT_LAST_EXTENT(eh); + + ex_ee_block = le32_to_cpu(ex->ee_block); + ex_ee_len = ext4_ext_get_actual_len(ex); + 
+ trace_ext4_ext_rm_leaf(inode, start, ex, *partial_cluster); + + while (ex >= EXT_FIRST_EXTENT(eh) && + ex_ee_block + ex_ee_len > start) { + + if (ext4_ext_is_uninitialized(ex)) + uninitialized = 1; + else + uninitialized = 0; + + ext_debug("remove ext %u:[%d]%d\n", ex_ee_block, + uninitialized, ex_ee_len); + path[depth].p_ext = ex; + + a = ex_ee_block > start ? ex_ee_block : start; + b = ex_ee_block+ex_ee_len - 1 < end ? + ex_ee_block+ex_ee_len - 1 : end; + + ext_debug(" border %u:%u\n", a, b); + + /* If this extent is beyond the end of the hole, skip it */ + if (end < ex_ee_block) { + ex--; + ex_ee_block = le32_to_cpu(ex->ee_block); + ex_ee_len = ext4_ext_get_actual_len(ex); + continue; + } else if (b != ex_ee_block + ex_ee_len - 1) { + EXT4_ERROR_INODE(inode, + "can not handle truncate %u:%u " + "on extent %u:%u", + start, end, ex_ee_block, + ex_ee_block + ex_ee_len - 1); + err = -EIO; + goto out; + } else if (a != ex_ee_block) { + /* remove tail of the extent */ + num = a - ex_ee_block; + } else { + /* remove whole extent: excellent! */ + num = 0; + } + /* + * 3 for leaf, sb, and inode plus 2 (bmap and group + * descriptor) for each block group; assume two block + * groups plus ex_ee_len/blocks_per_block_group for + * the worst case + */ + credits = 7 + 2*(ex_ee_len/EXT4_BLOCKS_PER_GROUP(inode->i_sb)); + if (ex == EXT_FIRST_EXTENT(eh)) { + correct_index = 1; + credits += (ext_depth(inode)) + 1; + } + credits += EXT4_MAXQUOTAS_TRANS_BLOCKS(inode->i_sb); + + err = ext4_ext_truncate_extend_restart(handle, inode, credits); + if (err) + goto out; + + err = ext4_ext_get_access(handle, inode, path + depth); + if (err) + goto out; + + err = ext4_remove_blocks(handle, inode, ex, partial_cluster, + a, b); + if (err) + goto out; + + if (num == 0) + /* this extent is removed; mark slot entirely unused */ + ext4_ext_store_pblock(ex, 0); + + ex->ee_len = cpu_to_le16(num); + /* + * Do not mark uninitialized if all the blocks in the + * extent have been removed. 
+ */ + if (uninitialized && num) + ext4_ext_mark_uninitialized(ex); + /* + * If the extent was completely released, + * we need to remove it from the leaf + */ + if (num == 0) { + if (end != EXT_MAX_BLOCKS - 1) { + /* + * For hole punching, we need to scoot all the + * extents up when an extent is removed so that + * we dont have blank extents in the middle + */ + memmove(ex, ex+1, (EXT_LAST_EXTENT(eh) - ex) * + sizeof(struct ext4_extent)); + + /* Now get rid of the one at the end */ + memset(EXT_LAST_EXTENT(eh), 0, + sizeof(struct ext4_extent)); + } + le16_add_cpu(&eh->eh_entries, -1); + } else + *partial_cluster = 0; + + err = ext4_ext_dirty(handle, inode, path + depth); + if (err) + goto out; + + ext_debug("new extent: %u:%u:%llu\n", ex_ee_block, num, + ext4_ext_pblock(ex)); + ex--; + ex_ee_block = le32_to_cpu(ex->ee_block); + ex_ee_len = ext4_ext_get_actual_len(ex); + } + + if (correct_index && eh->eh_entries) + err = ext4_ext_correct_indexes(handle, inode, path); + + /* + * If there is still a entry in the leaf node, check to see if + * it references the partial cluster. This is the only place + * where it could; if it doesn't, we can free the cluster. 
+ */ + if (*partial_cluster && ex >= EXT_FIRST_EXTENT(eh) && + (EXT4_B2C(sbi, ext4_ext_pblock(ex) + ex_ee_len - 1) != + *partial_cluster)) { + int flags = EXT4_FREE_BLOCKS_FORGET; + + if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode)) + flags |= EXT4_FREE_BLOCKS_METADATA; + + ext4_free_blocks(handle, inode, NULL, + EXT4_C2B(sbi, *partial_cluster), + sbi->s_cluster_ratio, flags); + *partial_cluster = 0; + } + + /* if this leaf is free, then we should + * remove it from index block above */ + if (err == 0 && eh->eh_entries == 0 && path[depth].p_bh != NULL) + err = ext4_ext_rm_idx(handle, inode, path + depth); + +out: + return err; +} + +/* + * ext4_ext_more_to_rm: + * returns 1 if current index has to be freed (even partial) + */ +static int +ext4_ext_more_to_rm(struct ext4_ext_path *path) +{ + BUG_ON(path->p_idx == NULL); + + if (path->p_idx < EXT_FIRST_INDEX(path->p_hdr)) + return 0; + + /* + * if truncate on deeper level happened, it wasn't partial, + * so we have to consider current index for truncation + */ + if (le16_to_cpu(path->p_hdr->eh_entries) == path->p_block) + return 0; + return 1; +} + +static int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start, + ext4_lblk_t end) +{ + struct super_block *sb = inode->i_sb; + int depth = ext_depth(inode); + struct ext4_ext_path *path; + ext4_fsblk_t partial_cluster = 0; + handle_t *handle; + int i, err; + + ext_debug("truncate since %u to %u\n", start, end); + + /* probably first extent we're gonna free will be last in block */ + handle = ext4_journal_start(inode, depth + 1); + if (IS_ERR(handle)) + return PTR_ERR(handle); + +again: + ext4_ext_invalidate_cache(inode); + + trace_ext4_ext_remove_space(inode, start, depth); + + /* + * Check if we are removing extents inside the extent tree. 
If that + * is the case, we are going to punch a hole inside the extent tree + * so we have to check whether we need to split the extent covering + * the last block to remove so we can easily remove the part of it + * in ext4_ext_rm_leaf(). + */ + if (end < EXT_MAX_BLOCKS - 1) { + struct ext4_extent *ex; + ext4_lblk_t ee_block; + + /* find extent for this block */ + path = ext4_ext_find_extent(inode, end, NULL); + if (IS_ERR(path)) { + ext4_journal_stop(handle); + return PTR_ERR(path); + } + depth = ext_depth(inode); + ex = path[depth].p_ext; + if (!ex) + goto cont; + + ee_block = le32_to_cpu(ex->ee_block); + + /* + * See if the last block is inside the extent, if so split + * the extent at 'end' block so we can easily remove the + * tail of the first part of the split extent in + * ext4_ext_rm_leaf(). + */ + if (end >= ee_block && + end < ee_block + ext4_ext_get_actual_len(ex) - 1) { + int split_flag = 0; + + if (ext4_ext_is_uninitialized(ex)) + split_flag = EXT4_EXT_MARK_UNINIT1 | + EXT4_EXT_MARK_UNINIT2; + + /* + * Split the extent in two so that 'end' is the last + * block in the first new extent + */ + err = ext4_split_extent_at(handle, inode, path, + end + 1, split_flag, + EXT4_GET_BLOCKS_PRE_IO | + EXT4_GET_BLOCKS_PUNCH_OUT_EXT); + + if (err < 0) + goto out; + } + ext4_ext_drop_refs(path); + kfree(path); + } +cont: + + /* + * We start scanning from right side, freeing all the blocks + * after i_size and walking into the tree depth-wise. 
+ */ + depth = ext_depth(inode); + path = kzalloc(sizeof(struct ext4_ext_path) * (depth + 1), GFP_NOFS); + if (path == NULL) { + ext4_journal_stop(handle); + return -ENOMEM; + } + path[0].p_depth = depth; + path[0].p_hdr = ext_inode_hdr(inode); + + if (ext4_ext_check(inode, path[0].p_hdr, depth)) { + err = -EIO; + goto out; + } + i = err = 0; + + while (i >= 0 && err == 0) { + if (i == depth) { + /* this is leaf block */ + err = ext4_ext_rm_leaf(handle, inode, path, + &partial_cluster, start, + end); + /* root level has p_bh == NULL, brelse() eats this */ + brelse(path[i].p_bh); + path[i].p_bh = NULL; + i--; + continue; + } + + /* this is index block */ + if (!path[i].p_hdr) { + ext_debug("initialize header\n"); + path[i].p_hdr = ext_block_hdr(path[i].p_bh); + } + + if (!path[i].p_idx) { + /* this level hasn't been touched yet */ + path[i].p_idx = EXT_LAST_INDEX(path[i].p_hdr); + path[i].p_block = le16_to_cpu(path[i].p_hdr->eh_entries)+1; + ext_debug("init index ptr: hdr 0x%p, num %d\n", + path[i].p_hdr, + le16_to_cpu(path[i].p_hdr->eh_entries)); + } else { + /* we were already here, see at next index */ + path[i].p_idx--; + } + + ext_debug("level %d - index, first 0x%p, cur 0x%p\n", + i, EXT_FIRST_INDEX(path[i].p_hdr), + path[i].p_idx); + if (ext4_ext_more_to_rm(path + i)) { + struct buffer_head *bh; + /* go to the next level */ + ext_debug("move to level %d (block %llu)\n", + i + 1, ext4_idx_pblock(path[i].p_idx)); + memset(path + i + 1, 0, sizeof(*path)); + bh = sb_bread(sb, ext4_idx_pblock(path[i].p_idx)); + if (!bh) { + /* should we reset i_size? 
*/ + err = -EIO; + break; + } + if (WARN_ON(i + 1 > depth)) { + err = -EIO; + break; + } + if (ext4_ext_check(inode, ext_block_hdr(bh), + depth - i - 1)) { + err = -EIO; + break; + } + path[i + 1].p_bh = bh; + + /* save actual number of indexes since this + * number is changed at the next iteration */ + path[i].p_block = le16_to_cpu(path[i].p_hdr->eh_entries); + i++; + } else { + /* we finished processing this index, go up */ + if (path[i].p_hdr->eh_entries == 0 && i > 0) { + /* index is empty, remove it; + * handle must be already prepared by the + * truncatei_leaf() */ + err = ext4_ext_rm_idx(handle, inode, path + i); + } + /* root level has p_bh == NULL, brelse() eats this */ + brelse(path[i].p_bh); + path[i].p_bh = NULL; + i--; + ext_debug("return to level %d\n", i); + } + } + + trace_ext4_ext_remove_space_done(inode, start, depth, partial_cluster, + path->p_hdr->eh_entries); + + /* If we still have something in the partial cluster and we have removed + * even the first extent, then we should free the blocks in the partial + * cluster as well. 
*/ + if (partial_cluster && path->p_hdr->eh_entries == 0) { + int flags = EXT4_FREE_BLOCKS_FORGET; + + if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode)) + flags |= EXT4_FREE_BLOCKS_METADATA; + + ext4_free_blocks(handle, inode, NULL, + EXT4_C2B(EXT4_SB(sb), partial_cluster), + EXT4_SB(sb)->s_cluster_ratio, flags); + partial_cluster = 0; + } + + /* TODO: flexible tree reduction should be here */ + if (path->p_hdr->eh_entries == 0) { + /* + * truncate to zero freed all the tree, + * so we need to correct eh_depth + */ + err = ext4_ext_get_access(handle, inode, path); + if (err == 0) { + ext_inode_hdr(inode)->eh_depth = 0; + ext_inode_hdr(inode)->eh_max = + cpu_to_le16(ext4_ext_space_root(inode, 0)); + err = ext4_ext_dirty(handle, inode, path); + } + } +out: + ext4_ext_drop_refs(path); + kfree(path); + if (err == -EAGAIN) + goto again; + ext4_journal_stop(handle); + + return err; +} + +/* + * called at mount time + */ +void ext4_ext_init(struct super_block *sb) +{ + /* + * possible initialization would be here + */ + + if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_EXTENTS)) { +#if defined(AGGRESSIVE_TEST) || defined(CHECK_BINSEARCH) || defined(EXTENTS_STATS) + printk(KERN_INFO "EXT4-fs: file extents enabled" +#ifdef AGGRESSIVE_TEST + ", aggressive tests" +#endif +#ifdef CHECK_BINSEARCH + ", check binsearch" +#endif +#ifdef EXTENTS_STATS + ", stats" +#endif + "\n"); +#endif +#ifdef EXTENTS_STATS + spin_lock_init(&EXT4_SB(sb)->s_ext_stats_lock); + EXT4_SB(sb)->s_ext_min = 1 << 30; + EXT4_SB(sb)->s_ext_max = 0; +#endif + } +} + +/* + * called at umount time + */ +void ext4_ext_release(struct super_block *sb) +{ + if (!EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_EXTENTS)) + return; + +#ifdef EXTENTS_STATS + if (EXT4_SB(sb)->s_ext_blocks && EXT4_SB(sb)->s_ext_extents) { + struct ext4_sb_info *sbi = EXT4_SB(sb); + printk(KERN_ERR "EXT4-fs: %lu blocks in %lu extents (%lu ave)\n", + sbi->s_ext_blocks, sbi->s_ext_extents, + sbi->s_ext_blocks / 
sbi->s_ext_extents); + printk(KERN_ERR "EXT4-fs: extents: %lu min, %lu max, max depth %lu\n", + sbi->s_ext_min, sbi->s_ext_max, sbi->s_depth_max); + } +#endif +} + +/* FIXME!! we need to try to merge to left or right after zero-out */ +static int ext4_ext_zeroout(struct inode *inode, struct ext4_extent *ex) +{ + ext4_fsblk_t ee_pblock; + unsigned int ee_len; + int ret; + + ee_len = ext4_ext_get_actual_len(ex); + ee_pblock = ext4_ext_pblock(ex); + + ret = sb_issue_zeroout(inode->i_sb, ee_pblock, ee_len, GFP_NOFS); + if (ret > 0) + ret = 0; + + return ret; +} + +/* + * ext4_split_extent_at() splits an extent at given block. + * + * @handle: the journal handle + * @inode: the file inode + * @path: the path to the extent + * @split: the logical block where the extent is splitted. + * @split_flags: indicates if the extent could be zeroout if split fails, and + * the states(init or uninit) of new extents. + * @flags: flags used to insert new extent to extent tree. + * + * + * Splits extent [a, b] into two extents [a, @split) and [@split, b], states + * of which are deterimined by split_flag. + * + * There are two cases: + * a> the extent are splitted into two extent. + * b> split is not needed, and just mark the extent. + * + * return 0 on success. 
+ */ +static int ext4_split_extent_at(handle_t *handle, + struct inode *inode, + struct ext4_ext_path *path, + ext4_lblk_t split, + int split_flag, + int flags) +{ + ext4_fsblk_t newblock; + ext4_lblk_t ee_block; + struct ext4_extent *ex, newex, orig_ex; + struct ext4_extent *ex2 = NULL; + unsigned int ee_len, depth; + int err = 0; + + ext_debug("ext4_split_extents_at: inode %lu, logical" + "block %llu\n", inode->i_ino, (unsigned long long)split); + + ext4_ext_show_leaf(inode, path); + + depth = ext_depth(inode); + ex = path[depth].p_ext; + ee_block = le32_to_cpu(ex->ee_block); + ee_len = ext4_ext_get_actual_len(ex); + newblock = split - ee_block + ext4_ext_pblock(ex); + + BUG_ON(split < ee_block || split >= (ee_block + ee_len)); + + err = ext4_ext_get_access(handle, inode, path + depth); + if (err) + goto out; + + if (split == ee_block) { + /* + * case b: block @split is the block that the extent begins with + * then we just change the state of the extent, and splitting + * is not needed. 
+ */ + if (split_flag & EXT4_EXT_MARK_UNINIT2) + ext4_ext_mark_uninitialized(ex); + else + ext4_ext_mark_initialized(ex); + + if (!(flags & EXT4_GET_BLOCKS_PRE_IO)) + ext4_ext_try_to_merge(inode, path, ex); + + err = ext4_ext_dirty(handle, inode, path + depth); + goto out; + } + + /* case a */ + memcpy(&orig_ex, ex, sizeof(orig_ex)); + ex->ee_len = cpu_to_le16(split - ee_block); + if (split_flag & EXT4_EXT_MARK_UNINIT1) + ext4_ext_mark_uninitialized(ex); + + /* + * path may lead to new leaf, not to original leaf any more + * after ext4_ext_insert_extent() returns, + */ + err = ext4_ext_dirty(handle, inode, path + depth); + if (err) + goto fix_extent_len; + + ex2 = &newex; + ex2->ee_block = cpu_to_le32(split); + ex2->ee_len = cpu_to_le16(ee_len - (split - ee_block)); + ext4_ext_store_pblock(ex2, newblock); + if (split_flag & EXT4_EXT_MARK_UNINIT2) + ext4_ext_mark_uninitialized(ex2); + + err = ext4_ext_insert_extent(handle, inode, path, &newex, flags); + if (err == -ENOSPC && (EXT4_EXT_MAY_ZEROOUT & split_flag)) { + err = ext4_ext_zeroout(inode, &orig_ex); + if (err) + goto fix_extent_len; + /* update the extent length and mark as initialized */ + ex->ee_len = cpu_to_le16(ee_len); + ext4_ext_try_to_merge(inode, path, ex); + err = ext4_ext_dirty(handle, inode, path + depth); + goto out; + } else if (err) + goto fix_extent_len; + +out: + ext4_ext_show_leaf(inode, path); + return err; + +fix_extent_len: + ex->ee_len = orig_ex.ee_len; + ext4_ext_dirty(handle, inode, path + depth); + return err; +} + +/* + * ext4_split_extents() splits an extent and mark extent which is covered + * by @map as split_flags indicates + * + * It may result in splitting the extent into multiple extents (upto three) + * There are three possibilities: + * a> There is no split required + * b> Splits in two extents: Split is happening at either end of the extent + * c> Splits in three extents: Somone is splitting in middle of the extent + * + */ +static int ext4_split_extent(handle_t *handle, + 
struct inode *inode, + struct ext4_ext_path *path, + struct ext4_map_blocks *map, + int split_flag, + int flags) +{ + ext4_lblk_t ee_block; + struct ext4_extent *ex; + unsigned int ee_len, depth; + int err = 0; + int uninitialized; + int split_flag1, flags1; + + depth = ext_depth(inode); + ex = path[depth].p_ext; + ee_block = le32_to_cpu(ex->ee_block); + ee_len = ext4_ext_get_actual_len(ex); + uninitialized = ext4_ext_is_uninitialized(ex); + + if (map->m_lblk + map->m_len < ee_block + ee_len) { + split_flag1 = split_flag & EXT4_EXT_MAY_ZEROOUT ? + EXT4_EXT_MAY_ZEROOUT : 0; + flags1 = flags | EXT4_GET_BLOCKS_PRE_IO; + if (uninitialized) + split_flag1 |= EXT4_EXT_MARK_UNINIT1 | + EXT4_EXT_MARK_UNINIT2; + err = ext4_split_extent_at(handle, inode, path, + map->m_lblk + map->m_len, split_flag1, flags1); + if (err) + goto out; + } + + ext4_ext_drop_refs(path); + path = ext4_ext_find_extent(inode, map->m_lblk, path); + if (IS_ERR(path)) + return PTR_ERR(path); + + if (map->m_lblk >= ee_block) { + split_flag1 = split_flag & EXT4_EXT_MAY_ZEROOUT ? + EXT4_EXT_MAY_ZEROOUT : 0; + if (uninitialized) + split_flag1 |= EXT4_EXT_MARK_UNINIT1; + if (split_flag & EXT4_EXT_MARK_UNINIT2) + split_flag1 |= EXT4_EXT_MARK_UNINIT2; + err = ext4_split_extent_at(handle, inode, path, + map->m_lblk, split_flag1, flags); + if (err) + goto out; + } + + ext4_ext_show_leaf(inode, path); +out: + return err ? err : map->m_len; +} + +#define EXT4_EXT_ZERO_LEN 7 +/* + * This function is called by ext4_ext_map_blocks() if someone tries to write + * to an uninitialized extent. It may result in splitting the uninitialized + * extent into multiple extents (up to three - one initialized and two + * uninitialized). 
+ * There are three possibilities: + * a> There is no split required: Entire extent should be initialized + * b> Splits in two extents: Write is happening at either end of the extent + * c> Splits in three extents: Somone is writing in middle of the extent + * + * Pre-conditions: + * - The extent pointed to by 'path' is uninitialized. + * - The extent pointed to by 'path' contains a superset + * of the logical span [map->m_lblk, map->m_lblk + map->m_len). + * + * Post-conditions on success: + * - the returned value is the number of blocks beyond map->l_lblk + * that are allocated and initialized. + * It is guaranteed to be >= map->m_len. + */ +static int ext4_ext_convert_to_initialized(handle_t *handle, + struct inode *inode, + struct ext4_map_blocks *map, + struct ext4_ext_path *path) +{ + struct ext4_extent_header *eh; + struct ext4_map_blocks split_map; + struct ext4_extent zero_ex; + struct ext4_extent *ex; + ext4_lblk_t ee_block, eof_block; + unsigned int ee_len, depth; + int allocated; + int err = 0; + int split_flag = 0; + + ext_debug("ext4_ext_convert_to_initialized: inode %lu, logical" + "block %llu, max_blocks %u\n", inode->i_ino, + (unsigned long long)map->m_lblk, map->m_len); + + eof_block = (inode->i_size + inode->i_sb->s_blocksize - 1) >> + inode->i_sb->s_blocksize_bits; + if (eof_block < map->m_lblk + map->m_len) + eof_block = map->m_lblk + map->m_len; + + depth = ext_depth(inode); + eh = path[depth].p_hdr; + ex = path[depth].p_ext; + ee_block = le32_to_cpu(ex->ee_block); + ee_len = ext4_ext_get_actual_len(ex); + allocated = ee_len - (map->m_lblk - ee_block); + + trace_ext4_ext_convert_to_initialized_enter(inode, map, ex); + + /* Pre-conditions */ + BUG_ON(!ext4_ext_is_uninitialized(ex)); + BUG_ON(!in_range(map->m_lblk, ee_block, ee_len)); + + /* + * Attempt to transfer newly initialized blocks from the currently + * uninitialized extent to its left neighbor. 
This is much cheaper + * than an insertion followed by a merge as those involve costly + * memmove() calls. This is the common case in steady state for + * workloads doing fallocate(FALLOC_FL_KEEP_SIZE) followed by append + * writes. + * + * Limitations of the current logic: + * - L1: we only deal with writes at the start of the extent. + * The approach could be extended to writes at the end + * of the extent but this scenario was deemed less common. + * - L2: we do not deal with writes covering the whole extent. + * This would require removing the extent if the transfer + * is possible. + * - L3: we only attempt to merge with an extent stored in the + * same extent tree node. + */ + if ((map->m_lblk == ee_block) && /*L1*/ + (map->m_len < ee_len) && /*L2*/ + (ex > EXT_FIRST_EXTENT(eh))) { /*L3*/ + struct ext4_extent *prev_ex; + ext4_lblk_t prev_lblk; + ext4_fsblk_t prev_pblk, ee_pblk; + unsigned int prev_len, write_len; + + prev_ex = ex - 1; + prev_lblk = le32_to_cpu(prev_ex->ee_block); + prev_len = ext4_ext_get_actual_len(prev_ex); + prev_pblk = ext4_ext_pblock(prev_ex); + ee_pblk = ext4_ext_pblock(ex); + write_len = map->m_len; + + /* + * A transfer of blocks from 'ex' to 'prev_ex' is allowed + * upon those conditions: + * - C1: prev_ex is initialized, + * - C2: prev_ex is logically abutting ex, + * - C3: prev_ex is physically abutting ex, + * - C4: prev_ex can receive the additional blocks without + * overflowing the (initialized) length limit. 
+ */ + if ((!ext4_ext_is_uninitialized(prev_ex)) && /*C1*/ + ((prev_lblk + prev_len) == ee_block) && /*C2*/ + ((prev_pblk + prev_len) == ee_pblk) && /*C3*/ + (prev_len < (EXT_INIT_MAX_LEN - write_len))) { /*C4*/ + err = ext4_ext_get_access(handle, inode, path + depth); + if (err) + goto out; + + trace_ext4_ext_convert_to_initialized_fastpath(inode, + map, ex, prev_ex); + + /* Shift the start of ex by 'write_len' blocks */ + ex->ee_block = cpu_to_le32(ee_block + write_len); + ext4_ext_store_pblock(ex, ee_pblk + write_len); + ex->ee_len = cpu_to_le16(ee_len - write_len); + ext4_ext_mark_uninitialized(ex); /* Restore the flag */ + + /* Extend prev_ex by 'write_len' blocks */ + prev_ex->ee_len = cpu_to_le16(prev_len + write_len); + + /* Mark the block containing both extents as dirty */ + ext4_ext_dirty(handle, inode, path + depth); + + /* Update path to point to the right extent */ + path[depth].p_ext = prev_ex; + + /* Result: number of initialized blocks past m_lblk */ + allocated = write_len; + goto out; + } + } + + WARN_ON(map->m_lblk < ee_block); + /* + * It is safe to convert extent to initialized via explicit + * zeroout only if extent is fully insde i_size or new_size. + */ + split_flag |= ee_block + ee_len <= eof_block ? EXT4_EXT_MAY_ZEROOUT : 0; + + /* If extent has less than 2*EXT4_EXT_ZERO_LEN zerout directly */ + if (ee_len <= 2*EXT4_EXT_ZERO_LEN && + (EXT4_EXT_MAY_ZEROOUT & split_flag)) { + err = ext4_ext_zeroout(inode, ex); + if (err) + goto out; + + err = ext4_ext_get_access(handle, inode, path + depth); + if (err) + goto out; + ext4_ext_mark_initialized(ex); + ext4_ext_try_to_merge(inode, path, ex); + err = ext4_ext_dirty(handle, inode, path + depth); + goto out; + } + + /* + * four cases: + * 1. split the extent into three extents. + * 2. split the extent into two extents, zeroout the first half. + * 3. split the extent into two extents, zeroout the second half. + * 4. split the extent into two extents with out zeroout. 
+ */ + split_map.m_lblk = map->m_lblk; + split_map.m_len = map->m_len; + + if (allocated > map->m_len) { + if (allocated <= EXT4_EXT_ZERO_LEN && + (EXT4_EXT_MAY_ZEROOUT & split_flag)) { + /* case 3 */ + zero_ex.ee_block = + cpu_to_le32(map->m_lblk); + zero_ex.ee_len = cpu_to_le16(allocated); + ext4_ext_store_pblock(&zero_ex, + ext4_ext_pblock(ex) + map->m_lblk - ee_block); + err = ext4_ext_zeroout(inode, &zero_ex); + if (err) + goto out; + split_map.m_lblk = map->m_lblk; + split_map.m_len = allocated; + } else if ((map->m_lblk - ee_block + map->m_len < + EXT4_EXT_ZERO_LEN) && + (EXT4_EXT_MAY_ZEROOUT & split_flag)) { + /* case 2 */ + if (map->m_lblk != ee_block) { + zero_ex.ee_block = ex->ee_block; + zero_ex.ee_len = cpu_to_le16(map->m_lblk - + ee_block); + ext4_ext_store_pblock(&zero_ex, + ext4_ext_pblock(ex)); + err = ext4_ext_zeroout(inode, &zero_ex); + if (err) + goto out; + } + + split_map.m_lblk = ee_block; + split_map.m_len = map->m_lblk - ee_block + map->m_len; + allocated = map->m_len; + } + } + + allocated = ext4_split_extent(handle, inode, path, + &split_map, split_flag, 0); + if (allocated < 0) + err = allocated; + +out: + return err ? err : allocated; +} + +/* + * This function is called by ext4_ext_map_blocks() from + * ext4_get_blocks_dio_write() when DIO to write + * to an uninitialized extent. + * + * Writing to an uninitialized extent may result in splitting the uninitialized + * extent into multiple /initialized uninitialized extents (up to three) + * There are three possibilities: + * a> There is no split required: Entire extent should be uninitialized + * b> Splits in two extents: Write is happening at either end of the extent + * c> Splits in three extents: Somone is writing in middle of the extent + * + * One of more index blocks maybe needed if the extent tree grow after + * the uninitialized extent split. To prevent ENOSPC occur at the IO + * complete, we need to split the uninitialized extent before DIO submit + * the IO. 
The uninitialized extent called at this time will be split + * into three uninitialized extent(at most). After IO complete, the part + * being filled will be convert to initialized by the end_io callback function + * via ext4_convert_unwritten_extents(). + * + * Returns the size of uninitialized extent to be written on success. + */ +static int ext4_split_unwritten_extents(handle_t *handle, + struct inode *inode, + struct ext4_map_blocks *map, + struct ext4_ext_path *path, + int flags) +{ + ext4_lblk_t eof_block; + ext4_lblk_t ee_block; + struct ext4_extent *ex; + unsigned int ee_len; + int split_flag = 0, depth; + + ext_debug("ext4_split_unwritten_extents: inode %lu, logical" + "block %llu, max_blocks %u\n", inode->i_ino, + (unsigned long long)map->m_lblk, map->m_len); + + eof_block = (inode->i_size + inode->i_sb->s_blocksize - 1) >> + inode->i_sb->s_blocksize_bits; + if (eof_block < map->m_lblk + map->m_len) + eof_block = map->m_lblk + map->m_len; + /* + * It is safe to convert extent to initialized via explicit + * zeroout only if extent is fully insde i_size or new_size. + */ + depth = ext_depth(inode); + ex = path[depth].p_ext; + ee_block = le32_to_cpu(ex->ee_block); + ee_len = ext4_ext_get_actual_len(ex); + + split_flag |= ee_block + ee_len <= eof_block ? 
EXT4_EXT_MAY_ZEROOUT : 0; + split_flag |= EXT4_EXT_MARK_UNINIT2; + + flags |= EXT4_GET_BLOCKS_PRE_IO; + return ext4_split_extent(handle, inode, path, map, split_flag, flags); +} + +static int ext4_convert_unwritten_extents_endio(handle_t *handle, + struct inode *inode, + struct ext4_ext_path *path) +{ + struct ext4_extent *ex; + int depth; + int err = 0; + + depth = ext_depth(inode); + ex = path[depth].p_ext; + + ext_debug("ext4_convert_unwritten_extents_endio: inode %lu, logical" + "block %llu, max_blocks %u\n", inode->i_ino, + (unsigned long long)le32_to_cpu(ex->ee_block), + ext4_ext_get_actual_len(ex)); + + err = ext4_ext_get_access(handle, inode, path + depth); + if (err) + goto out; + /* first mark the extent as initialized */ + ext4_ext_mark_initialized(ex); + + /* note: ext4_ext_correct_indexes() isn't needed here because + * borders are not changed + */ + ext4_ext_try_to_merge(inode, path, ex); + + /* Mark modified extent as dirty */ + err = ext4_ext_dirty(handle, inode, path + depth); +out: + ext4_ext_show_leaf(inode, path); + return err; +} + +static void unmap_underlying_metadata_blocks(struct block_device *bdev, + sector_t block, int count) +{ + int i; + for (i = 0; i < count; i++) + unmap_underlying_metadata(bdev, block + i); +} + +/* + * Handle EOFBLOCKS_FL flag, clearing it if necessary + */ +static int check_eofblocks_fl(handle_t *handle, struct inode *inode, + ext4_lblk_t lblk, + struct ext4_ext_path *path, + unsigned int len) +{ + int i, depth; + struct ext4_extent_header *eh; + struct ext4_extent *last_ex; + + if (!ext4_test_inode_flag(inode, EXT4_INODE_EOFBLOCKS)) + return 0; + + depth = ext_depth(inode); + eh = path[depth].p_hdr; + + /* + * We're going to remove EOFBLOCKS_FL entirely in future so we + * do not care for this case anymore. Simply remove the flag + * if there are no extents. 
+ */ + if (unlikely(!eh->eh_entries)) + goto out; + last_ex = EXT_LAST_EXTENT(eh); + /* + * We should clear the EOFBLOCKS_FL flag if we are writing the + * last block in the last extent in the file. We test this by + * first checking to see if the caller to + * ext4_ext_get_blocks() was interested in the last block (or + * a block beyond the last block) in the current extent. If + * this turns out to be false, we can bail out from this + * function immediately. + */ + if (lblk + len < le32_to_cpu(last_ex->ee_block) + + ext4_ext_get_actual_len(last_ex)) + return 0; + /* + * If the caller does appear to be planning to write at or + * beyond the end of the current extent, we then test to see + * if the current extent is the last extent in the file, by + * checking to make sure it was reached via the rightmost node + * at each level of the tree. + */ + for (i = depth-1; i >= 0; i--) + if (path[i].p_idx != EXT_LAST_INDEX(path[i].p_hdr)) + return 0; +out: + ext4_clear_inode_flag(inode, EXT4_INODE_EOFBLOCKS); + return ext4_mark_inode_dirty(handle, inode); +} + +/** + * ext4_find_delalloc_range: find delayed allocated block in the given range. + * + * Goes through the buffer heads in the range [lblk_start, lblk_end] and returns + * whether there are any buffers marked for delayed allocation. It returns '1' + * on the first delalloc'ed buffer head found. If no buffer head in the given + * range is marked for delalloc, it returns 0. + * lblk_start should always be <= lblk_end. + * search_hint_reverse is to indicate that searching in reverse from lblk_end to + * lblk_start might be more efficient (i.e., we will likely hit the delalloc'ed + * block sooner). This is useful when blocks are truncated sequentially from + * lblk_start towards lblk_end. 
+ */ +static int ext4_find_delalloc_range(struct inode *inode, + ext4_lblk_t lblk_start, + ext4_lblk_t lblk_end, + int search_hint_reverse) +{ + struct address_space *mapping = inode->i_mapping; + struct buffer_head *head, *bh = NULL; + struct page *page; + ext4_lblk_t i, pg_lblk; + pgoff_t index; + + if (!test_opt(inode->i_sb, DELALLOC)) + return 0; + + /* reverse search wont work if fs block size is less than page size */ + if (inode->i_blkbits < PAGE_CACHE_SHIFT) + search_hint_reverse = 0; + + if (search_hint_reverse) + i = lblk_end; + else + i = lblk_start; + + index = i >> (PAGE_CACHE_SHIFT - inode->i_blkbits); + + while ((i >= lblk_start) && (i <= lblk_end)) { + page = find_get_page(mapping, index); + if (!page) + goto nextpage; + + if (!page_has_buffers(page)) + goto nextpage; + + head = page_buffers(page); + if (!head) + goto nextpage; + + bh = head; + pg_lblk = index << (PAGE_CACHE_SHIFT - + inode->i_blkbits); + do { + if (unlikely(pg_lblk < lblk_start)) { + /* + * This is possible when fs block size is less + * than page size and our cluster starts/ends in + * middle of the page. So we need to skip the + * initial few blocks till we reach the 'lblk' + */ + pg_lblk++; + continue; + } + + /* Check if the buffer is delayed allocated and that it + * is not yet mapped. (when da-buffers are mapped during + * their writeout, their da_mapped bit is set.) + */ + if (buffer_delay(bh) && !buffer_da_mapped(bh)) { + page_cache_release(page); + trace_ext4_find_delalloc_range(inode, + lblk_start, lblk_end, + search_hint_reverse, + 1, i); + return 1; + } + if (search_hint_reverse) + i--; + else + i++; + } while ((i >= lblk_start) && (i <= lblk_end) && + ((bh = bh->b_this_page) != head)); +nextpage: + if (page) + page_cache_release(page); + /* + * Move to next page. 'i' will be the first lblk in the next + * page. 
+ */ + if (search_hint_reverse) + index--; + else + index++; + i = index << (PAGE_CACHE_SHIFT - inode->i_blkbits); + } + + trace_ext4_find_delalloc_range(inode, lblk_start, lblk_end, + search_hint_reverse, 0, 0); + return 0; +} + +int ext4_find_delalloc_cluster(struct inode *inode, ext4_lblk_t lblk, + int search_hint_reverse) +{ + struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); + ext4_lblk_t lblk_start, lblk_end; + lblk_start = lblk & (~(sbi->s_cluster_ratio - 1)); + lblk_end = lblk_start + sbi->s_cluster_ratio - 1; + + return ext4_find_delalloc_range(inode, lblk_start, lblk_end, + search_hint_reverse); +} + +/** + * Determines how many complete clusters (out of those specified by the 'map') + * are under delalloc and were reserved quota for. + * This function is called when we are writing out the blocks that were + * originally written with their allocation delayed, but then the space was + * allocated using fallocate() before the delayed allocation could be resolved. + * The cases to look for are: + * ('=' indicated delayed allocated blocks + * '-' indicates non-delayed allocated blocks) + * (a) partial clusters towards beginning and/or end outside of allocated range + * are not delalloc'ed. + * Ex: + * |----c---=|====c====|====c====|===-c----| + * |++++++ allocated ++++++| + * ==> 4 complete clusters in above example + * + * (b) partial cluster (outside of allocated range) towards either end is + * marked for delayed allocation. In this case, we will exclude that + * cluster. + * Ex: + * |----====c========|========c========| + * |++++++ allocated ++++++| + * ==> 1 complete clusters in above example + * + * Ex: + * |================c================| + * |++++++ allocated ++++++| + * ==> 0 complete clusters in above example + * + * The ext4_da_update_reserve_space will be called only if we + * determine here that there were some "entire" clusters that span + * this 'allocated' range. 
+ * In the non-bigalloc case, this function will just end up returning num_blks + * without ever calling ext4_find_delalloc_range. + */ +static unsigned int +get_reserved_cluster_alloc(struct inode *inode, ext4_lblk_t lblk_start, + unsigned int num_blks) +{ + struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); + ext4_lblk_t alloc_cluster_start, alloc_cluster_end; + ext4_lblk_t lblk_from, lblk_to, c_offset; + unsigned int allocated_clusters = 0; + + alloc_cluster_start = EXT4_B2C(sbi, lblk_start); + alloc_cluster_end = EXT4_B2C(sbi, lblk_start + num_blks - 1); + + /* max possible clusters for this allocation */ + allocated_clusters = alloc_cluster_end - alloc_cluster_start + 1; + + trace_ext4_get_reserved_cluster_alloc(inode, lblk_start, num_blks); + + /* Check towards left side */ + c_offset = lblk_start & (sbi->s_cluster_ratio - 1); + if (c_offset) { + lblk_from = lblk_start & (~(sbi->s_cluster_ratio - 1)); + lblk_to = lblk_from + c_offset - 1; + + if (ext4_find_delalloc_range(inode, lblk_from, lblk_to, 0)) + allocated_clusters--; + } + + /* Now check towards right. 
*/ + c_offset = (lblk_start + num_blks) & (sbi->s_cluster_ratio - 1); + if (allocated_clusters && c_offset) { + lblk_from = lblk_start + num_blks; + lblk_to = lblk_from + (sbi->s_cluster_ratio - c_offset) - 1; + + if (ext4_find_delalloc_range(inode, lblk_from, lblk_to, 0)) + allocated_clusters--; + } + + return allocated_clusters; +} + +static int +ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode, + struct ext4_map_blocks *map, + struct ext4_ext_path *path, int flags, + unsigned int allocated, ext4_fsblk_t newblock) +{ + int ret = 0; + int err = 0; + ext4_io_end_t *io = EXT4_I(inode)->cur_aio_dio; + + ext_debug("ext4_ext_handle_uninitialized_extents: inode %lu, logical " + "block %llu, max_blocks %u, flags %x, allocated %u\n", + inode->i_ino, (unsigned long long)map->m_lblk, map->m_len, + flags, allocated); + ext4_ext_show_leaf(inode, path); + + trace_ext4_ext_handle_uninitialized_extents(inode, map, allocated, + newblock); + + /* get_block() before submit the IO, split the extent */ + if ((flags & EXT4_GET_BLOCKS_PRE_IO)) { + ret = ext4_split_unwritten_extents(handle, inode, map, + path, flags); + /* + * Flag the inode(non aio case) or end_io struct (aio case) + * that this IO needs to conversion to written when IO is + * completed + */ + if (io) + ext4_set_io_unwritten_flag(inode, io); + else + ext4_set_inode_state(inode, EXT4_STATE_DIO_UNWRITTEN); + if (ext4_should_dioread_nolock(inode)) + map->m_flags |= EXT4_MAP_UNINIT; + goto out; + } + /* IO end_io complete, convert the filled extent to written */ + if ((flags & EXT4_GET_BLOCKS_CONVERT)) { + ret = ext4_convert_unwritten_extents_endio(handle, inode, + path); + if (ret >= 0) { + ext4_update_inode_fsync_trans(handle, inode, 1); + err = check_eofblocks_fl(handle, inode, map->m_lblk, + path, map->m_len); + } else + err = ret; + goto out2; + } + /* buffered IO case */ + /* + * repeat fallocate creation request + * we already have an unwritten extent + */ + if (flags & 
EXT4_GET_BLOCKS_UNINIT_EXT) + goto map_out; + + /* buffered READ or buffered write_begin() lookup */ + if ((flags & EXT4_GET_BLOCKS_CREATE) == 0) { + /* + * We have blocks reserved already. We + * return allocated blocks so that delalloc + * won't do block reservation for us. But + * the buffer head will be unmapped so that + * a read from the block returns 0s. + */ + map->m_flags |= EXT4_MAP_UNWRITTEN; + goto out1; + } + + /* buffered write, writepage time, convert*/ + ret = ext4_ext_convert_to_initialized(handle, inode, map, path); + if (ret >= 0) + ext4_update_inode_fsync_trans(handle, inode, 1); +out: + if (ret <= 0) { + err = ret; + goto out2; + } else + allocated = ret; + map->m_flags |= EXT4_MAP_NEW; + /* + * if we allocated more blocks than requested + * we need to make sure we unmap the extra block + * allocated. The actual needed block will get + * unmapped later when we find the buffer_head marked + * new. + */ + if (allocated > map->m_len) { + unmap_underlying_metadata_blocks(inode->i_sb->s_bdev, + newblock + map->m_len, + allocated - map->m_len); + allocated = map->m_len; + } + + /* + * If we have done fallocate with the offset that is already + * delayed allocated, we would have block reservation + * and quota reservation done in the delayed write path. + * But fallocate would have already updated quota and block + * count for this offset. 
So cancel these reservation + */ + if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) { + unsigned int reserved_clusters; + reserved_clusters = get_reserved_cluster_alloc(inode, + map->m_lblk, map->m_len); + if (reserved_clusters) + ext4_da_update_reserve_space(inode, + reserved_clusters, + 0); + } + +map_out: + map->m_flags |= EXT4_MAP_MAPPED; + if ((flags & EXT4_GET_BLOCKS_KEEP_SIZE) == 0) { + err = check_eofblocks_fl(handle, inode, map->m_lblk, path, + map->m_len); + if (err < 0) + goto out2; + } +out1: + if (allocated > map->m_len) + allocated = map->m_len; + ext4_ext_show_leaf(inode, path); + map->m_pblk = newblock; + map->m_len = allocated; +out2: + if (path) { + ext4_ext_drop_refs(path); + kfree(path); + } + return err ? err : allocated; +} + +/* + * get_implied_cluster_alloc - check to see if the requested + * allocation (in the map structure) overlaps with a cluster already + * allocated in an extent. + * @sb The filesystem superblock structure + * @map The requested lblk->pblk mapping + * @ex The extent structure which might contain an implied + * cluster allocation + * + * This function is called by ext4_ext_map_blocks() after we failed to + * find blocks that were already in the inode's extent tree. Hence, + * we know that the beginning of the requested region cannot overlap + * the extent from the inode's extent tree. There are three cases we + * want to catch. 
The first is this case: + * + * |--- cluster # N--| + * |--- extent ---| |---- requested region ---| + * |==========| + * + * The second case that we need to test for is this one: + * + * |--------- cluster # N ----------------| + * |--- requested region --| |------- extent ----| + * |=======================| + * + * The third case is when the requested region lies between two extents + * within the same cluster: + * |------------- cluster # N-------------| + * |----- ex -----| |---- ex_right ----| + * |------ requested region ------| + * |================| + * + * In each of the above cases, we need to set the map->m_pblk and + * map->m_len so it corresponds to the return the extent labelled as + * "|====|" from cluster #N, since it is already in use for data in + * cluster EXT4_B2C(sbi, map->m_lblk). We will then return 1 to + * signal to ext4_ext_map_blocks() that map->m_pblk should be treated + * as a new "allocated" block region. Otherwise, we will return 0 and + * ext4_ext_map_blocks() will then allocate one or more new clusters + * by calling ext4_mb_new_blocks(). 
+ */ +static int get_implied_cluster_alloc(struct super_block *sb, + struct ext4_map_blocks *map, + struct ext4_extent *ex, + struct ext4_ext_path *path) +{ + struct ext4_sb_info *sbi = EXT4_SB(sb); + ext4_lblk_t c_offset = map->m_lblk & (sbi->s_cluster_ratio-1); + ext4_lblk_t ex_cluster_start, ex_cluster_end; + ext4_lblk_t rr_cluster_start; + ext4_lblk_t ee_block = le32_to_cpu(ex->ee_block); + ext4_fsblk_t ee_start = ext4_ext_pblock(ex); + unsigned short ee_len = ext4_ext_get_actual_len(ex); + + /* The extent passed in that we are trying to match */ + ex_cluster_start = EXT4_B2C(sbi, ee_block); + ex_cluster_end = EXT4_B2C(sbi, ee_block + ee_len - 1); + + /* The requested region passed into ext4_map_blocks() */ + rr_cluster_start = EXT4_B2C(sbi, map->m_lblk); + + if ((rr_cluster_start == ex_cluster_end) || + (rr_cluster_start == ex_cluster_start)) { + if (rr_cluster_start == ex_cluster_end) + ee_start += ee_len - 1; + map->m_pblk = (ee_start & ~(sbi->s_cluster_ratio - 1)) + + c_offset; + map->m_len = min(map->m_len, + (unsigned) sbi->s_cluster_ratio - c_offset); + /* + * Check for and handle this case: + * + * |--------- cluster # N-------------| + * |------- extent ----| + * |--- requested region ---| + * |===========| + */ + + if (map->m_lblk < ee_block) + map->m_len = min(map->m_len, ee_block - map->m_lblk); + + /* + * Check for the case where there is already another allocated + * block to the right of 'ex' but before the end of the cluster. 
+ * + * |------------- cluster # N-------------| + * |----- ex -----| |---- ex_right ----| + * |------ requested region ------| + * |================| + */ + if (map->m_lblk > ee_block) { + ext4_lblk_t next = ext4_ext_next_allocated_block(path); + map->m_len = min(map->m_len, next - map->m_lblk); + } + + trace_ext4_get_implied_cluster_alloc_exit(sb, map, 1); + return 1; + } + + trace_ext4_get_implied_cluster_alloc_exit(sb, map, 0); + return 0; +} + + +/* + * Block allocation/map/preallocation routine for extents based files + * + * + * Need to be called with + * down_read(&EXT4_I(inode)->i_data_sem) if not allocating file system block + * (ie, create is zero). Otherwise down_write(&EXT4_I(inode)->i_data_sem) + * + * return > 0, number of of blocks already mapped/allocated + * if create == 0 and these are pre-allocated blocks + * buffer head is unmapped + * otherwise blocks are mapped + * + * return = 0, if plain look up failed (blocks have not been allocated) + * buffer head is unmapped + * + * return < 0, error case. 
+ */ +int ext4_ext_map_blocks(handle_t *handle, struct inode *inode, + struct ext4_map_blocks *map, int flags) +{ + struct ext4_ext_path *path = NULL; + struct ext4_extent newex, *ex, *ex2; + struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); + ext4_fsblk_t newblock = 0; + int free_on_err = 0, err = 0, depth, ret; + unsigned int allocated = 0, offset = 0; + unsigned int allocated_clusters = 0; + struct ext4_allocation_request ar; + ext4_io_end_t *io = EXT4_I(inode)->cur_aio_dio; + ext4_lblk_t cluster_offset; + + ext_debug("blocks %u/%u requested for inode %lu\n", + map->m_lblk, map->m_len, inode->i_ino); + trace_ext4_ext_map_blocks_enter(inode, map->m_lblk, map->m_len, flags); + + /* check in cache */ + if (ext4_ext_in_cache(inode, map->m_lblk, &newex)) { + if (!newex.ee_start_lo && !newex.ee_start_hi) { + if ((sbi->s_cluster_ratio > 1) && + ext4_find_delalloc_cluster(inode, map->m_lblk, 0)) + map->m_flags |= EXT4_MAP_FROM_CLUSTER; + + if ((flags & EXT4_GET_BLOCKS_CREATE) == 0) { + /* + * block isn't allocated yet and + * user doesn't want to allocate it + */ + goto out2; + } + /* we should allocate requested block */ + } else { + /* block is already allocated */ + if (sbi->s_cluster_ratio > 1) + map->m_flags |= EXT4_MAP_FROM_CLUSTER; + newblock = map->m_lblk + - le32_to_cpu(newex.ee_block) + + ext4_ext_pblock(&newex); + /* number of remaining blocks in the extent */ + allocated = ext4_ext_get_actual_len(&newex) - + (map->m_lblk - le32_to_cpu(newex.ee_block)); + goto out; + } + } + + /* find extent for this block */ + path = ext4_ext_find_extent(inode, map->m_lblk, NULL); + if (IS_ERR(path)) { + err = PTR_ERR(path); + path = NULL; + goto out2; + } + + depth = ext_depth(inode); + + /* + * consistent leaf must not be empty; + * this situation is possible, though, _during_ tree modification; + * this is why assert can't be put in ext4_ext_find_extent() + */ + if (unlikely(path[depth].p_ext == NULL && depth != 0)) { + EXT4_ERROR_INODE(inode, "bad extent address " + 
"lblock: %lu, depth: %d pblock %lld", + (unsigned long) map->m_lblk, depth, + path[depth].p_block); + err = -EIO; + goto out2; + } + + ex = path[depth].p_ext; + if (ex) { + ext4_lblk_t ee_block = le32_to_cpu(ex->ee_block); + ext4_fsblk_t ee_start = ext4_ext_pblock(ex); + unsigned short ee_len; + + /* + * Uninitialized extents are treated as holes, except that + * we split out initialized portions during a write. + */ + ee_len = ext4_ext_get_actual_len(ex); + + trace_ext4_ext_show_extent(inode, ee_block, ee_start, ee_len); + + /* if found extent covers block, simply return it */ + if (in_range(map->m_lblk, ee_block, ee_len)) { + newblock = map->m_lblk - ee_block + ee_start; + /* number of remaining blocks in the extent */ + allocated = ee_len - (map->m_lblk - ee_block); + ext_debug("%u fit into %u:%d -> %llu\n", map->m_lblk, + ee_block, ee_len, newblock); + + /* + * Do not put uninitialized extent + * in the cache + */ + if (!ext4_ext_is_uninitialized(ex)) { + ext4_ext_put_in_cache(inode, ee_block, + ee_len, ee_start); + goto out; + } + ret = ext4_ext_handle_uninitialized_extents( + handle, inode, map, path, flags, + allocated, newblock); + return ret; + } + } + + if ((sbi->s_cluster_ratio > 1) && + ext4_find_delalloc_cluster(inode, map->m_lblk, 0)) + map->m_flags |= EXT4_MAP_FROM_CLUSTER; + + /* + * requested block isn't allocated yet; + * we couldn't try to create block if create flag is zero + */ + if ((flags & EXT4_GET_BLOCKS_CREATE) == 0) { + /* + * put just found gap into cache to speed up + * subsequent requests + */ + ext4_ext_put_gap_in_cache(inode, path, map->m_lblk); + goto out2; + } + + /* + * Okay, we need to do block allocation. + */ + map->m_flags &= ~EXT4_MAP_FROM_CLUSTER; + newex.ee_block = cpu_to_le32(map->m_lblk); + cluster_offset = map->m_lblk & (sbi->s_cluster_ratio-1); + + /* + * If we are doing bigalloc, check to see if the extent returned + * by ext4_ext_find_extent() implies a cluster we can use. 
+ */ + if (cluster_offset && ex && + get_implied_cluster_alloc(inode->i_sb, map, ex, path)) { + ar.len = allocated = map->m_len; + newblock = map->m_pblk; + map->m_flags |= EXT4_MAP_FROM_CLUSTER; + goto got_allocated_blocks; + } + + /* find neighbour allocated blocks */ + ar.lleft = map->m_lblk; + err = ext4_ext_search_left(inode, path, &ar.lleft, &ar.pleft); + if (err) + goto out2; + ar.lright = map->m_lblk; + ex2 = NULL; + err = ext4_ext_search_right(inode, path, &ar.lright, &ar.pright, &ex2); + if (err) + goto out2; + + /* Check if the extent after searching to the right implies a + * cluster we can use. */ + if ((sbi->s_cluster_ratio > 1) && ex2 && + get_implied_cluster_alloc(inode->i_sb, map, ex2, path)) { + ar.len = allocated = map->m_len; + newblock = map->m_pblk; + map->m_flags |= EXT4_MAP_FROM_CLUSTER; + goto got_allocated_blocks; + } + + /* + * See if request is beyond maximum number of blocks we can have in + * a single extent. For an initialized extent this limit is + * EXT_INIT_MAX_LEN and for an uninitialized extent this limit is + * EXT_UNINIT_MAX_LEN. + */ + if (map->m_len > EXT_INIT_MAX_LEN && + !(flags & EXT4_GET_BLOCKS_UNINIT_EXT)) + map->m_len = EXT_INIT_MAX_LEN; + else if (map->m_len > EXT_UNINIT_MAX_LEN && + (flags & EXT4_GET_BLOCKS_UNINIT_EXT)) + map->m_len = EXT_UNINIT_MAX_LEN; + + /* Check if we can really insert (m_lblk)::(m_lblk + m_len) extent */ + newex.ee_len = cpu_to_le16(map->m_len); + err = ext4_ext_check_overlap(sbi, inode, &newex, path); + if (err) + allocated = ext4_ext_get_actual_len(&newex); + else + allocated = map->m_len; + + /* allocate new block */ + ar.inode = inode; + ar.goal = ext4_ext_find_goal(inode, path, map->m_lblk); + ar.logical = map->m_lblk; + /* + * We calculate the offset from the beginning of the cluster + * for the logical block number, since when we allocate a + * physical cluster, the physical block should start at the + * same offset from the beginning of the cluster. 
This is + * needed so that future calls to get_implied_cluster_alloc() + * work correctly. + */ + offset = map->m_lblk & (sbi->s_cluster_ratio - 1); + ar.len = EXT4_NUM_B2C(sbi, offset+allocated); + ar.goal -= offset; + ar.logical -= offset; + if (S_ISREG(inode->i_mode)) + ar.flags = EXT4_MB_HINT_DATA; + else + /* disable in-core preallocation for non-regular files */ + ar.flags = 0; + if (flags & EXT4_GET_BLOCKS_NO_NORMALIZE) + ar.flags |= EXT4_MB_HINT_NOPREALLOC; + newblock = ext4_mb_new_blocks(handle, &ar, &err); + if (!newblock) + goto out2; + ext_debug("allocate new block: goal %llu, found %llu/%u\n", + ar.goal, newblock, allocated); + free_on_err = 1; + allocated_clusters = ar.len; + ar.len = EXT4_C2B(sbi, ar.len) - offset; + if (ar.len > allocated) + ar.len = allocated; + +got_allocated_blocks: + /* try to insert new extent into found leaf and return */ + ext4_ext_store_pblock(&newex, newblock + offset); + newex.ee_len = cpu_to_le16(ar.len); + /* Mark uninitialized */ + if (flags & EXT4_GET_BLOCKS_UNINIT_EXT){ + ext4_ext_mark_uninitialized(&newex); + /* + * io_end structure was created for every IO write to an + * uninitialized extent. To avoid unnecessary conversion, + * here we flag the IO that really needs the conversion. + * For non asycn direct IO case, flag the inode state + * that we need to perform conversion when IO is done. + */ + if ((flags & EXT4_GET_BLOCKS_PRE_IO)) { + if (io) + ext4_set_io_unwritten_flag(inode, io); + else + ext4_set_inode_state(inode, + EXT4_STATE_DIO_UNWRITTEN); + } + if (ext4_should_dioread_nolock(inode)) + map->m_flags |= EXT4_MAP_UNINIT; + } + + err = 0; + if ((flags & EXT4_GET_BLOCKS_KEEP_SIZE) == 0) + err = check_eofblocks_fl(handle, inode, map->m_lblk, + path, ar.len); + if (!err) + err = ext4_ext_insert_extent(handle, inode, path, + &newex, flags); + if (err && free_on_err) { + int fb_flags = flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE ? 
+ EXT4_FREE_BLOCKS_NO_QUOT_UPDATE : 0; + /* free data blocks we just allocated */ + /* not a good idea to call discard here directly, + * but otherwise we'd need to call it every free() */ + ext4_discard_preallocations(inode); + ext4_free_blocks(handle, inode, NULL, ext4_ext_pblock(&newex), + ext4_ext_get_actual_len(&newex), fb_flags); + goto out2; + } + + /* previous routine could use block we allocated */ + newblock = ext4_ext_pblock(&newex); + allocated = ext4_ext_get_actual_len(&newex); + if (allocated > map->m_len) + allocated = map->m_len; + map->m_flags |= EXT4_MAP_NEW; + + /* + * Update reserved blocks/metadata blocks after successful + * block allocation which had been deferred till now. + */ + if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) { + unsigned int reserved_clusters; + /* + * Check how many clusters we had reserved this allocated range + */ + reserved_clusters = get_reserved_cluster_alloc(inode, + map->m_lblk, allocated); + if (map->m_flags & EXT4_MAP_FROM_CLUSTER) { + if (reserved_clusters) { + /* + * We have clusters reserved for this range. + * But since we are not doing actual allocation + * and are simply using blocks from previously + * allocated cluster, we should release the + * reservation and not claim quota. + */ + ext4_da_update_reserve_space(inode, + reserved_clusters, 0); + } + } else { + BUG_ON(allocated_clusters < reserved_clusters); + /* We will claim quota for all newly allocated blocks.*/ + ext4_da_update_reserve_space(inode, allocated_clusters, + 1); + if (reserved_clusters < allocated_clusters) { + struct ext4_inode_info *ei = EXT4_I(inode); + int reservation = allocated_clusters - + reserved_clusters; + /* + * It seems we claimed few clusters outside of + * the range of this allocation. We should give + * it back to the reservation pool. This can + * happen in the following case: + * + * * Suppose s_cluster_ratio is 4 (i.e., each + * cluster has 4 blocks. Thus, the clusters + * are [0-3],[4-7],[8-11]... 
+ * * First comes delayed allocation write for + * logical blocks 10 & 11. Since there were no + * previous delayed allocated blocks in the + * range [8-11], we would reserve 1 cluster + * for this write. + * * Next comes write for logical blocks 3 to 8. + * In this case, we will reserve 2 clusters + * (for [0-3] and [4-7]; and not for [8-11] as + * that range has a delayed allocated blocks. + * Thus total reserved clusters now becomes 3. + * * Now, during the delayed allocation writeout + * time, we will first write blocks [3-8] and + * allocate 3 clusters for writing these + * blocks. Also, we would claim all these + * three clusters above. + * * Now when we come here to writeout the + * blocks [10-11], we would expect to claim + * the reservation of 1 cluster we had made + * (and we would claim it since there are no + * more delayed allocated blocks in the range + * [8-11]. But our reserved cluster count had + * already gone to 0. + * + * Thus, at the step 4 above when we determine + * that there are still some unwritten delayed + * allocated blocks outside of our current + * block range, we should increment the + * reserved clusters count so that when the + * remaining blocks finally gets written, we + * could claim them. + */ + dquot_reserve_block(inode, + EXT4_C2B(sbi, reservation)); + spin_lock(&ei->i_block_reservation_lock); + ei->i_reserved_data_blocks += reservation; + spin_unlock(&ei->i_block_reservation_lock); + } + } + } + + /* + * Cache the extent and update transaction to commit on fdatasync only + * when it is _not_ an uninitialized extent. 
+ */ + if ((flags & EXT4_GET_BLOCKS_UNINIT_EXT) == 0) { + ext4_ext_put_in_cache(inode, map->m_lblk, allocated, newblock); + ext4_update_inode_fsync_trans(handle, inode, 1); + } else + ext4_update_inode_fsync_trans(handle, inode, 0); +out: + if (allocated > map->m_len) + allocated = map->m_len; + ext4_ext_show_leaf(inode, path); + map->m_flags |= EXT4_MAP_MAPPED; + map->m_pblk = newblock; + map->m_len = allocated; +out2: + if (path) { + ext4_ext_drop_refs(path); + kfree(path); + } + + trace_ext4_ext_map_blocks_exit(inode, map->m_lblk, + newblock, map->m_len, err ? err : allocated); + + return err ? err : allocated; +} + +void ext4_ext_truncate(struct inode *inode) +{ + struct address_space *mapping = inode->i_mapping; + struct super_block *sb = inode->i_sb; + ext4_lblk_t last_block; + handle_t *handle; + loff_t page_len; + int err = 0; + + /* + * finish any pending end_io work so we won't run the risk of + * converting any truncated blocks to initialized later + */ + ext4_flush_completed_IO(inode); + + /* + * probably first extent we're gonna free will be last in block + */ + err = ext4_writepage_trans_blocks(inode); + handle = ext4_journal_start(inode, err); + if (IS_ERR(handle)) + return; + + if (inode->i_size % PAGE_CACHE_SIZE != 0) { + page_len = PAGE_CACHE_SIZE - + (inode->i_size & (PAGE_CACHE_SIZE - 1)); + + err = ext4_discard_partial_page_buffers(handle, + mapping, inode->i_size, page_len, 0); + + if (err) + goto out_stop; + } + + if (ext4_orphan_add(handle, inode)) + goto out_stop; + + down_write(&EXT4_I(inode)->i_data_sem); + ext4_ext_invalidate_cache(inode); + + ext4_discard_preallocations(inode); + + /* + * TODO: optimization is possible here. + * Probably we need not scan at all, + * because page truncation is enough. 
+ */ + + /* we have to know where to truncate from in crash case */ + EXT4_I(inode)->i_disksize = inode->i_size; + ext4_mark_inode_dirty(handle, inode); + + last_block = (inode->i_size + sb->s_blocksize - 1) + >> EXT4_BLOCK_SIZE_BITS(sb); + err = ext4_ext_remove_space(inode, last_block, EXT_MAX_BLOCKS - 1); + + /* In a multi-transaction truncate, we only make the final + * transaction synchronous. + */ + if (IS_SYNC(inode)) + ext4_handle_sync(handle); + + up_write(&EXT4_I(inode)->i_data_sem); + +out_stop: + /* + * If this was a simple ftruncate() and the file will remain alive, + * then we need to clear up the orphan record which we created above. + * However, if this was a real unlink then we were called by + * ext4_delete_inode(), and we allow that function to clean up the + * orphan info for us. + */ + if (inode->i_nlink) + ext4_orphan_del(handle, inode); + + inode->i_mtime = inode->i_ctime = ext4_current_time(inode); + ext4_mark_inode_dirty(handle, inode); + ext4_journal_stop(handle); +} + +static void ext4_falloc_update_inode(struct inode *inode, + int mode, loff_t new_size, int update_ctime) +{ + struct timespec now; + + if (update_ctime) { + now = current_fs_time(inode->i_sb); + if (!timespec_equal(&inode->i_ctime, &now)) + inode->i_ctime = now; + } + /* + * Update only when preallocation was requested beyond + * the file size. + */ + if (!(mode & FALLOC_FL_KEEP_SIZE)) { + if (new_size > i_size_read(inode)) + i_size_write(inode, new_size); + if (new_size > EXT4_I(inode)->i_disksize) + ext4_update_i_disksize(inode, new_size); + } else { + /* + * Mark that we allocate beyond EOF so the subsequent truncate + * can proceed even if the new size is the same as i_size. + */ + if (new_size > i_size_read(inode)) + ext4_set_inode_flag(inode, EXT4_INODE_EOFBLOCKS); + } + +} + +/* + * preallocate space for a file. This implements ext4's fallocate file + * operation, which gets called from sys_fallocate system call. 
+ * For block-mapped files, posix_fallocate should fall back to the method + * of writing zeroes to the required new blocks (the same behavior which is + * expected for file systems which do not support fallocate() system call). + */ +long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len) +{ /* + * Summary of the flow below: non-extent files and unknown mode bits + * are rejected with -EOPNOTSUPP, FALLOC_FL_PUNCH_HOLE is handed to + * ext4_punch_hole(), and otherwise the range is mapped chunk by + * chunk under i_mutex with one journal handle per ext4_map_blocks() + * call. The return value is the result of the last + * ext4_journal_stop() when the final mapping call returned a + * positive count, else the non-positive value that ended the loop. + */ + struct inode *inode = file->f_path.dentry->d_inode; + handle_t *handle; + loff_t new_size; + unsigned int max_blocks; + int ret = 0; + int ret2 = 0; + int retries = 0; + int flags; + struct ext4_map_blocks map; + unsigned int credits, blkbits = inode->i_blkbits; + + /* + * currently supporting (pre)allocate mode for extent-based + * files _only_ + */ + if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) + return -EOPNOTSUPP; + + /* Return error if mode is not supported */ + if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE)) + return -EOPNOTSUPP; + + if (mode & FALLOC_FL_PUNCH_HOLE) + return ext4_punch_hole(file, offset, len); + + trace_ext4_fallocate_enter(inode, offset, len, mode); + map.m_lblk = offset >> blkbits; + /* + * We can't just convert len to max_blocks because + * If blocksize = 4096 offset = 3072 and len = 2048 + * the range ends at byte 5120 and so touches two blocks even + * though len is smaller than one block. + */ + max_blocks = (EXT4_BLOCK_ALIGN(len + offset, blkbits) >> blkbits) + - map.m_lblk; + /* + * credits to insert 1 extent into extent tree + */ + credits = ext4_chunk_trans_blocks(inode, max_blocks); + mutex_lock(&inode->i_mutex); + ret = inode_newsize_ok(inode, (len + offset)); + if (ret) { + mutex_unlock(&inode->i_mutex); + trace_ext4_fallocate_exit(inode, offset, max_blocks, ret); + return ret; + } + flags = EXT4_GET_BLOCKS_CREATE_UNINIT_EXT; + if (mode & FALLOC_FL_KEEP_SIZE) + flags |= EXT4_GET_BLOCKS_KEEP_SIZE; + /* + * Don't normalize the request if it can fit in one extent so + * that it doesn't get unnecessarily split into multiple + * extents. 
+ */ + if (len <= EXT_UNINIT_MAX_LEN << blkbits) + flags |= EXT4_GET_BLOCKS_NO_NORMALIZE; +retry: + while (ret >= 0 && ret < max_blocks) { /* ret: blocks mapped by the previous pass, 0 on first entry */ + map.m_lblk = map.m_lblk + ret; + map.m_len = max_blocks = max_blocks - ret; + handle = ext4_journal_start(inode, credits); + if (IS_ERR(handle)) { + ret = PTR_ERR(handle); + break; + } + ret = ext4_map_blocks(handle, inode, &map, flags); + if (ret <= 0) { +#ifdef EXT4FS_DEBUG + WARN_ON(ret <= 0); + printk(KERN_ERR "%s: ext4_ext_map_blocks " + "returned error inode#%lu, block=%u, " + "max_blocks=%u", __func__, + inode->i_ino, map.m_lblk, max_blocks); +#endif + ext4_mark_inode_dirty(handle, inode); + ret2 = ext4_journal_stop(handle); + break; + } + /* the last chunk ends at the requested byte, earlier ones at a block boundary */ if ((map.m_lblk + ret) >= (EXT4_BLOCK_ALIGN(offset + len, + blkbits) >> blkbits)) + new_size = offset + len; + else + new_size = ((loff_t) map.m_lblk + ret) << blkbits; + + ext4_falloc_update_inode(inode, mode, new_size, + (map.m_flags & EXT4_MAP_NEW)); + ext4_mark_inode_dirty(handle, inode); + ret2 = ext4_journal_stop(handle); + if (ret2) + break; + } + if (ret == -ENOSPC && + ext4_should_retry_alloc(inode->i_sb, &retries)) { + ret = 0; /* re-enter the loop at the current map.m_lblk with the remaining max_blocks */ + goto retry; + } + mutex_unlock(&inode->i_mutex); + trace_ext4_fallocate_exit(inode, offset, max_blocks, + ret > 0 ? ret2 : ret); + return ret > 0 ? ret2 : ret; +} + +/* + * This function converts a range of blocks to written extents. + * The caller of this function will pass the start offset and the size. + * All unwritten extents within this range will be converted to + * written extents. + * + * This function is called from the direct IO end io call back + * function, to convert the fallocated extents after IO is completed. + * Returns 0 on success. 
+ */ +int ext4_convert_unwritten_extents(struct inode *inode, loff_t offset, + ssize_t len) +{ + handle_t *handle; + unsigned int max_blocks; + int ret = 0; + int ret2 = 0; + struct ext4_map_blocks map; + unsigned int credits, blkbits = inode->i_blkbits; + + map.m_lblk = offset >> blkbits; + /* + * We can't just convert len to max_blocks because + * If blocksize = 4096 offset = 3072 and len = 2048 + */ + max_blocks = ((EXT4_BLOCK_ALIGN(len + offset, blkbits) >> blkbits) - + map.m_lblk); + /* + * credits to insert 1 extent into extent tree + */ + credits = ext4_chunk_trans_blocks(inode, max_blocks); + while (ret >= 0 && ret < max_blocks) { + map.m_lblk += ret; + map.m_len = (max_blocks -= ret); + handle = ext4_journal_start(inode, credits); + if (IS_ERR(handle)) { + ret = PTR_ERR(handle); + break; + } + ret = ext4_map_blocks(handle, inode, &map, + EXT4_GET_BLOCKS_IO_CONVERT_EXT); + if (ret <= 0) { + WARN_ON(ret <= 0); + ext4_msg(inode->i_sb, KERN_ERR, + "%s:%d: inode #%lu: block %u: len %u: " + "ext4_ext_map_blocks returned %d", + __func__, __LINE__, inode->i_ino, map.m_lblk, + map.m_len, ret); + } + ext4_mark_inode_dirty(handle, inode); + ret2 = ext4_journal_stop(handle); + if (ret <= 0 || ret2 ) + break; + } + return ret > 0 ? ret2 : ret; +} + +/* + * Callback function called for each extent to gather FIEMAP information. + */ +static int ext4_ext_fiemap_cb(struct inode *inode, ext4_lblk_t next, + struct ext4_ext_cache *newex, struct ext4_extent *ex, + void *data) +{ + __u64 logical; + __u64 physical; + __u64 length; + __u32 flags = 0; + int ret = 0; + struct fiemap_extent_info *fieinfo = data; + unsigned char blksize_bits; + + blksize_bits = inode->i_sb->s_blocksize_bits; + logical = (__u64)newex->ec_block << blksize_bits; + + if (newex->ec_start == 0) { + /* + * No extent in extent-tree contains block @newex->ec_start, + * then the block may stay in 1)a hole or 2)delayed-extent. + * + * Holes or delayed-extents are processed as follows. + * 1. 
lookup dirty pages with specified range in pagecache. + * If no page is got, then there is no delayed-extent and + * return with EXT_CONTINUE. + * 2. find the 1st mapped buffer, + * 3. check if the mapped buffer is both in the request range + * and a delayed buffer. If not, there is no delayed-extent, + * then return. + * 4. a delayed-extent is found, the extent will be collected. + */ + ext4_lblk_t end = 0; + pgoff_t last_offset; + pgoff_t offset; + pgoff_t index; + pgoff_t start_index = 0; + struct page **pages = NULL; + struct buffer_head *bh = NULL; + struct buffer_head *head = NULL; + unsigned int nr_pages = PAGE_SIZE / sizeof(struct page *); + + pages = kmalloc(PAGE_SIZE, GFP_KERNEL); + if (pages == NULL) + return -ENOMEM; + + offset = logical >> PAGE_SHIFT; +repeat: + last_offset = offset; + head = NULL; + ret = find_get_pages_tag(inode->i_mapping, &offset, + PAGECACHE_TAG_DIRTY, nr_pages, pages); + + if (!(flags & FIEMAP_EXTENT_DELALLOC)) { + /* First time, try to find a mapped buffer. */ + if (ret == 0) { +out: + for (index = 0; index < ret; index++) + page_cache_release(pages[index]); + /* just a hole. */ + kfree(pages); + return EXT_CONTINUE; + } + index = 0; + +next_page: + /* Try to find the 1st mapped buffer. */ + end = ((__u64)pages[index]->index << PAGE_SHIFT) >> + blksize_bits; + if (!page_has_buffers(pages[index])) + goto out; + head = page_buffers(pages[index]); + if (!head) + goto out; + + index++; + bh = head; + do { + if (end >= newex->ec_block + + newex->ec_len) + /* The buffer is out of + * the request range. + */ + goto out; + + if (buffer_mapped(bh) && + end >= newex->ec_block) { + start_index = index - 1; + /* get the 1st mapped buffer. */ + goto found_mapped_buffer; + } + + bh = bh->b_this_page; + end++; + } while (bh != head); + + /* No mapped buffer in the range found in this page, + * We need to look up next page. + */ + if (index >= ret) { + /* There is no page left, but we need to limit + * newex->ec_len. 
+ */ + newex->ec_len = end - newex->ec_block; + goto out; + } + goto next_page; + } else { + /*Find contiguous delayed buffers. */ + if (ret > 0 && pages[0]->index == last_offset) + head = page_buffers(pages[0]); + bh = head; + index = 1; + start_index = 0; + } + +found_mapped_buffer: + if (bh != NULL && buffer_delay(bh)) { + /* 1st or contiguous delayed buffer found. */ + if (!(flags & FIEMAP_EXTENT_DELALLOC)) { + /* + * 1st delayed buffer found, record + * the start of extent. + */ + flags |= FIEMAP_EXTENT_DELALLOC; + newex->ec_block = end; + logical = (__u64)end << blksize_bits; + } + /* Find contiguous delayed buffers. */ + do { + if (!buffer_delay(bh)) + goto found_delayed_extent; + bh = bh->b_this_page; + end++; + } while (bh != head); + + for (; index < ret; index++) { + if (!page_has_buffers(pages[index])) { + bh = NULL; + break; + } + head = page_buffers(pages[index]); + if (!head) { + bh = NULL; + break; + } + + if (pages[index]->index != + pages[start_index]->index + index + - start_index) { + /* Blocks are not contiguous. */ + bh = NULL; + break; + } + bh = head; + do { + if (!buffer_delay(bh)) + /* Delayed-extent ends. */ + goto found_delayed_extent; + bh = bh->b_this_page; + end++; + } while (bh != head); + } + } else if (!(flags & FIEMAP_EXTENT_DELALLOC)) + /* a hole found. */ + goto out; + +found_delayed_extent: + newex->ec_len = min(end - newex->ec_block, + (ext4_lblk_t)EXT_INIT_MAX_LEN); + if (ret == nr_pages && bh != NULL && + newex->ec_len < EXT_INIT_MAX_LEN && + buffer_delay(bh)) { + /* Have not collected an extent and continue. 
*/ + for (index = 0; index < ret; index++) + page_cache_release(pages[index]); + goto repeat; + } + + for (index = 0; index < ret; index++) + page_cache_release(pages[index]); + kfree(pages); + } + + physical = (__u64)newex->ec_start << blksize_bits; + length = (__u64)newex->ec_len << blksize_bits; + + if (ex && ext4_ext_is_uninitialized(ex)) + flags |= FIEMAP_EXTENT_UNWRITTEN; + + if (next == EXT_MAX_BLOCKS) + flags |= FIEMAP_EXTENT_LAST; + + ret = fiemap_fill_next_extent(fieinfo, logical, physical, + length, flags); + if (ret < 0) + return ret; + if (ret == 1) + return EXT_BREAK; + return EXT_CONTINUE; +} +/* fiemap flags we can handle specified here */ +#define EXT4_FIEMAP_FLAGS (FIEMAP_FLAG_SYNC|FIEMAP_FLAG_XATTR) + +static int ext4_xattr_fiemap(struct inode *inode, + struct fiemap_extent_info *fieinfo) +{ + __u64 physical = 0; + __u64 length; + __u32 flags = FIEMAP_EXTENT_LAST; + int blockbits = inode->i_sb->s_blocksize_bits; + int error = 0; + + /* in-inode? */ + if (ext4_test_inode_state(inode, EXT4_STATE_XATTR)) { + struct ext4_iloc iloc; + int offset; /* offset of xattr in inode */ + + error = ext4_get_inode_loc(inode, &iloc); + if (error) + return error; + physical = iloc.bh->b_blocknr << blockbits; + offset = EXT4_GOOD_OLD_INODE_SIZE + + EXT4_I(inode)->i_extra_isize; + physical += offset; + length = EXT4_SB(inode->i_sb)->s_inode_size - offset; + flags |= FIEMAP_EXTENT_DATA_INLINE; + brelse(iloc.bh); + } else { /* external block */ + physical = EXT4_I(inode)->i_file_acl << blockbits; + length = inode->i_sb->s_blocksize; + } + + if (physical) + error = fiemap_fill_next_extent(fieinfo, 0, physical, + length, flags); + return (error < 0 ? 
error : 0); +} + +/* + * ext4_ext_punch_hole + * + * Punches a hole of "length" bytes in a file starting + * at byte "offset" + * + * @inode: The inode of the file to punch a hole in + * @offset: The starting byte offset of the hole + * @length: The length of the hole + * + * Returns the number of blocks removed or negative on err + */ +int ext4_ext_punch_hole(struct file *file, loff_t offset, loff_t length) +{ + struct inode *inode = file->f_path.dentry->d_inode; + struct super_block *sb = inode->i_sb; + ext4_lblk_t first_block, stop_block; + struct address_space *mapping = inode->i_mapping; + handle_t *handle; + loff_t first_page, last_page, page_len; + loff_t first_page_offset, last_page_offset; + int credits, err = 0; + + /* No need to punch hole beyond i_size */ + if (offset >= inode->i_size) + return 0; + + /* + * If the hole extends beyond i_size, set the hole + * to end after the page that contains i_size + */ + if (offset + length > inode->i_size) { + length = inode->i_size + + PAGE_CACHE_SIZE - (inode->i_size & (PAGE_CACHE_SIZE - 1)) - + offset; + } + + first_page = (offset + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; + last_page = (offset + length) >> PAGE_CACHE_SHIFT; + + first_page_offset = first_page << PAGE_CACHE_SHIFT; + last_page_offset = last_page << PAGE_CACHE_SHIFT; + + /* + * Write out all dirty pages to avoid race conditions + * Then release them. 
+ */ + if (mapping->nrpages && mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) { + err = filemap_write_and_wait_range(mapping, + offset, offset + length - 1); + + if (err) + return err; + } + + /* Now release the pages */ + if (last_page_offset > first_page_offset) { + truncate_inode_pages_range(mapping, first_page_offset, + last_page_offset-1); + } + + /* finish any pending end_io work */ + ext4_flush_completed_IO(inode); + + credits = ext4_writepage_trans_blocks(inode); + handle = ext4_journal_start(inode, credits); + if (IS_ERR(handle)) + return PTR_ERR(handle); + + err = ext4_orphan_add(handle, inode); + if (err) + goto out; + + /* + * Now we need to zero out the non-page-aligned data in the + * pages at the start and tail of the hole, and unmap the buffer + * heads for the block aligned regions of the page that were + * completely zeroed. + */ + if (first_page > last_page) { + /* + * If the file space being truncated is contained within a page + * just zero out and unmap the middle of that page + */ + err = ext4_discard_partial_page_buffers(handle, + mapping, offset, length, 0); + + if (err) + goto out; + } else { + /* + * zero out and unmap the partial page that contains + * the start of the hole + */ + page_len = first_page_offset - offset; + if (page_len > 0) { + err = ext4_discard_partial_page_buffers(handle, mapping, + offset, page_len, 0); + if (err) + goto out; + } + + /* + * zero out and unmap the partial page that contains + * the end of the hole + */ + page_len = offset + length - last_page_offset; + if (page_len > 0) { + err = ext4_discard_partial_page_buffers(handle, mapping, + last_page_offset, page_len, 0); + if (err) + goto out; + } + } + + /* + * If i_size is contained in the last page, we need to + * unmap and zero the partial page after i_size + */ + if (inode->i_size >> PAGE_CACHE_SHIFT == last_page && + inode->i_size % PAGE_CACHE_SIZE != 0) { + + page_len = PAGE_CACHE_SIZE - + (inode->i_size & (PAGE_CACHE_SIZE - 1)); + + if (page_len > 0) { + 
err = ext4_discard_partial_page_buffers(handle, + mapping, inode->i_size, page_len, 0); + + if (err) + goto out; + } + } + + first_block = (offset + sb->s_blocksize - 1) >> + EXT4_BLOCK_SIZE_BITS(sb); + stop_block = (offset + length) >> EXT4_BLOCK_SIZE_BITS(sb); + + /* If there are no blocks to remove, return now */ + if (first_block >= stop_block) + goto out; + + down_write(&EXT4_I(inode)->i_data_sem); + ext4_ext_invalidate_cache(inode); + ext4_discard_preallocations(inode); + + err = ext4_ext_remove_space(inode, first_block, stop_block - 1); + + ext4_ext_invalidate_cache(inode); + ext4_discard_preallocations(inode); + + if (IS_SYNC(inode)) + ext4_handle_sync(handle); + + up_write(&EXT4_I(inode)->i_data_sem); + +out: + ext4_orphan_del(handle, inode); + inode->i_mtime = inode->i_ctime = ext4_current_time(inode); + ext4_mark_inode_dirty(handle, inode); + ext4_journal_stop(handle); + return err; +} +int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, + __u64 start, __u64 len) +{ + ext4_lblk_t start_blk; + int error = 0; + + /* fallback to generic here if not in extents fmt */ + if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) + return generic_block_fiemap(inode, fieinfo, start, len, + ext4_get_block); + + if (fiemap_check_flags(fieinfo, EXT4_FIEMAP_FLAGS)) + return -EBADR; + + if (fieinfo->fi_flags & FIEMAP_FLAG_XATTR) { + error = ext4_xattr_fiemap(inode, fieinfo); + } else { + ext4_lblk_t len_blks; + __u64 last_blk; + + start_blk = start >> inode->i_sb->s_blocksize_bits; + last_blk = (start + len - 1) >> inode->i_sb->s_blocksize_bits; + if (last_blk >= EXT_MAX_BLOCKS) + last_blk = EXT_MAX_BLOCKS-1; + len_blks = ((ext4_lblk_t) last_blk) - start_blk + 1; + + /* + * Walk the extent tree gathering extent information. + * ext4_ext_fiemap_cb will push extents back to user. 
+ */ + error = ext4_ext_walk_space(inode, start_blk, len_blks, + ext4_ext_fiemap_cb, fieinfo); + } + + return error; +} diff --git a/fs/ext4/file.c b/fs/ext4/file.c new file mode 100644 index 00000000..cb70f181 --- /dev/null +++ b/fs/ext4/file.c @@ -0,0 +1,262 @@ +/* + * linux/fs/ext4/file.c + * + * Copyright (C) 1992, 1993, 1994, 1995 + * Remy Card (card@masi.ibp.fr) + * Laboratoire MASI - Institut Blaise Pascal + * Universite Pierre et Marie Curie (Paris VI) + * + * from + * + * linux/fs/minix/file.c + * + * Copyright (C) 1991, 1992 Linus Torvalds + * + * ext4 fs regular file handling primitives + * + * 64-bit file support on 64-bit platforms by Jakub Jelinek + * (jj@sunsite.ms.mff.cuni.cz) + */ + +#include +#include +#include +#include +#include +#include +#include "ext4.h" +#include "ext4_jbd2.h" +#include "xattr.h" +#include "acl.h" + +/* + * Called when an inode is released. Note that this is different + * from ext4_file_open: open gets called at every open, but release + * gets called only when /all/ the files are closed. + */ +static int ext4_release_file(struct inode *inode, struct file *filp) +{ + if (ext4_test_inode_state(inode, EXT4_STATE_DA_ALLOC_CLOSE)) { + ext4_alloc_da_blocks(inode); + ext4_clear_inode_state(inode, EXT4_STATE_DA_ALLOC_CLOSE); + } + /* if we are the last writer on the inode, drop the block reservation */ + if ((filp->f_mode & FMODE_WRITE) && + (atomic_read(&inode->i_writecount) == 1) && + !EXT4_I(inode)->i_reserved_data_blocks) + { + down_write(&EXT4_I(inode)->i_data_sem); + ext4_discard_preallocations(inode); + up_write(&EXT4_I(inode)->i_data_sem); + } + if (is_dx(inode) && filp->private_data) + ext4_htree_free_dir_info(filp->private_data); + + return 0; +} + +static void ext4_aiodio_wait(struct inode *inode) +{ + wait_queue_head_t *wq = ext4_ioend_wq(inode); + + wait_event(*wq, (atomic_read(&EXT4_I(inode)->i_aiodio_unwritten) == 0)); +} + +/* + * This tests whether the IO in question is block-aligned or not. 
+ * Ext4 utilizes unwritten extents when hole-filling during direct IO, and they + * are converted to written only after the IO is complete. Until they are + * mapped, these blocks appear as holes, so dio_zero_block() will assume that + * it needs to zero out portions of the start and/or end block. If 2 AIO + * threads are at work on the same unwritten block, they must be synchronized + * or one thread will zero the other's data, causing corruption. + * + * Returns 1 when the write starts below i_size and either its start or its + * end is not a multiple of the block size, 0 otherwise. + */ +static int +ext4_unaligned_aio(struct inode *inode, const struct iovec *iov, + unsigned long nr_segs, loff_t pos) +{ + struct super_block *sb = inode->i_sb; + int blockmask = sb->s_blocksize - 1; + size_t count = iov_length(iov, nr_segs); + loff_t final_size = pos + count; + + if (pos >= inode->i_size) + return 0; + + if ((pos & blockmask) || (final_size & blockmask)) + return 1; + + return 0; +} + +/* + * ext4_file_write - aio_write entry point for regular ext4 files. + * + * For files without EXT4_INODE_EXTENTS the write is refused with -EFBIG + * at or beyond s_bitmap_maxbytes and shortened when it would cross that + * limit. For extent-mapped files, asynchronous O_DIRECT writes that + * ext4_unaligned_aio() reports as unaligned are serialized under the + * per-inode aio mutex after waiting for pending unwritten conversions. + * The write itself is done by generic_file_aio_write(), whose result is + * returned. + */ +static ssize_t +ext4_file_write(struct kiocb *iocb, const struct iovec *iov, + unsigned long nr_segs, loff_t pos) +{ + struct inode *inode = iocb->ki_filp->f_path.dentry->d_inode; + int unaligned_aio = 0; + ssize_t ret; /* same type as the function's return value */ + + /* + * If we have encountered a bitmap-format file, the size limit + * is smaller than s_maxbytes, which is for extent-mapped files. 
+ */ + + if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) { + struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); + size_t length = iov_length(iov, nr_segs); + + if ((pos > sbi->s_bitmap_maxbytes || + (pos == sbi->s_bitmap_maxbytes && length > 0))) + return -EFBIG; + + if (pos + length > sbi->s_bitmap_maxbytes) { + nr_segs = iov_shorten((struct iovec *)iov, nr_segs, + sbi->s_bitmap_maxbytes - pos); + } + } else if (unlikely((iocb->ki_filp->f_flags & O_DIRECT) && + !is_sync_kiocb(iocb))) { + unaligned_aio = ext4_unaligned_aio(inode, iov, nr_segs, pos); + } + + /* Unaligned direct AIO must be serialized; see comment above */ + if (unaligned_aio) { + static unsigned long unaligned_warn_time; + + /* + * Warn about this once per day. printk_timed_ratelimit() + * takes its interval in milliseconds, not jiffies. + */ + if (printk_timed_ratelimit(&unaligned_warn_time, 60*60*24*1000)) + ext4_msg(inode->i_sb, KERN_WARNING, + "Unaligned AIO/DIO on inode %lu by %s; " + "performance will be poor.", + inode->i_ino, current->comm); + mutex_lock(ext4_aio_mutex(inode)); + ext4_aiodio_wait(inode); + } + + ret = generic_file_aio_write(iocb, iov, nr_segs, pos); + + if (unaligned_aio) + mutex_unlock(ext4_aio_mutex(inode)); + + return ret; +} + +/* Page-fault handlers installed on ext4 file mappings by ext4_file_mmap(). */ +static const struct vm_operations_struct ext4_file_vm_ops = { + .fault = filemap_fault, + .page_mkwrite = ext4_page_mkwrite, +}; + +/* + * mmap entry point: fails with -ENOEXEC when the mapping has no readpage + * operation, otherwise marks the file accessed and installs + * ext4_file_vm_ops on the vma. + */ +static int ext4_file_mmap(struct file *file, struct vm_area_struct *vma) +{ + struct address_space *mapping = file->f_mapping; + + if (!mapping->a_ops->readpage) + return -ENOEXEC; + file_accessed(file); + vma->vm_ops = &ext4_file_vm_ops; + vma->vm_flags |= VM_CAN_NONLINEAR; + return 0; +} + +/* + * open entry point. On the first open of a filesystem that is not mounted + * read-only it records the mount point in the superblock; when a journal + * is present and the file is opened for writing it attaches a jbd2_inode + * to the inode. Returns -ENOMEM if that allocation fails, otherwise the + * result of dquot_file_open(). + */ +static int ext4_file_open(struct inode * inode, struct file * filp) +{ + struct super_block *sb = inode->i_sb; + struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); + struct ext4_inode_info *ei = EXT4_I(inode); + struct vfsmount *mnt = filp->f_path.mnt; + struct path path; + char buf[64], *cp; + + if (unlikely(!(sbi->s_mount_flags & EXT4_MF_MNTDIR_SAMPLED) && + !(sb->s_flags & MS_RDONLY))) { + sbi->s_mount_flags 
|= EXT4_MF_MNTDIR_SAMPLED; + /* + * Sample where the filesystem has been mounted and + * store it in the superblock for sysadmin convenience + * when trying to sort through large numbers of block + * devices or filesystem images. + */ + memset(buf, 0, sizeof(buf)); + path.mnt = mnt; + path.dentry = mnt->mnt_root; + cp = d_path(&path, buf, sizeof(buf)); + if (!IS_ERR(cp)) { + strlcpy(sbi->s_es->s_last_mounted, cp, + sizeof(sbi->s_es->s_last_mounted)); + ext4_mark_super_dirty(sb); + } + } + /* + * Set up the jbd2_inode if we are opening the inode for + * writing and the journal is present + */ + if (sbi->s_journal && !ei->jinode && (filp->f_mode & FMODE_WRITE)) { + /* allocate before taking i_lock, then re-check under the lock */ + struct jbd2_inode *jinode = jbd2_alloc_inode(GFP_KERNEL); + + spin_lock(&inode->i_lock); + if (!ei->jinode) { + if (!jinode) { + spin_unlock(&inode->i_lock); + return -ENOMEM; + } + ei->jinode = jinode; + jbd2_journal_init_jbd_inode(ei->jinode, inode); + jinode = NULL; + } + spin_unlock(&inode->i_lock); + /* another opener installed ei->jinode first: drop ours */ + if (unlikely(jinode != NULL)) + jbd2_free_inode(jinode); + } + return dquot_file_open(inode, filp); +} + +/* + * ext4_llseek() copied from generic_file_llseek() to handle both + * block-mapped and extent-mapped maxbytes values. This should + * otherwise be identical with generic_file_llseek(). 
+ */ +loff_t ext4_llseek(struct file *file, loff_t offset, int origin) +{ + struct inode *inode = file->f_mapping->host; + loff_t maxbytes; + + /* pick the size limit that matches the inode's block-mapping format */ if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) + maxbytes = EXT4_SB(inode->i_sb)->s_bitmap_maxbytes; + else + maxbytes = inode->i_sb->s_maxbytes; + + return generic_file_llseek_size(file, offset, origin, maxbytes); +} +/* + * File operations table for ext4 regular files: the entry points defined + * in this file (llseek, aio_write, mmap, open, release) together with + * generic helpers and ext4 routines defined elsewhere. compat_ioctl is + * only present when CONFIG_COMPAT is set. + */ +const struct file_operations ext4_file_operations = { + .llseek = ext4_llseek, + .read = do_sync_read, + .write = do_sync_write, + .aio_read = generic_file_aio_read, + .aio_write = ext4_file_write, + .unlocked_ioctl = ext4_ioctl, +#ifdef CONFIG_COMPAT + .compat_ioctl = ext4_compat_ioctl, +#endif + .mmap = ext4_file_mmap, + .open = ext4_file_open, + .release = ext4_release_file, + .fsync = ext4_sync_file, + .splice_read = generic_file_splice_read, + .splice_write = generic_file_splice_write, + .fallocate = ext4_fallocate, +}; +/* + * Inode operations table for ext4 regular files. The xattr handlers are + * only compiled in when CONFIG_EXT4_FS_XATTR is set. + */ +const struct inode_operations ext4_file_inode_operations = { + .setattr = ext4_setattr, + .getattr = ext4_getattr, +#ifdef CONFIG_EXT4_FS_XATTR + .setxattr = generic_setxattr, + .getxattr = generic_getxattr, + .listxattr = ext4_listxattr, + .removexattr = generic_removexattr, +#endif + .get_acl = ext4_get_acl, + .fiemap = ext4_fiemap, +}; + diff --git a/fs/ext4/fsync.c b/fs/ext4/fsync.c new file mode 100644 index 00000000..bb6c7d81 --- /dev/null +++ b/fs/ext4/fsync.c @@ -0,0 +1,271 @@ +/* + * linux/fs/ext4/fsync.c + * + * Copyright (C) 1993 Stephen Tweedie (sct@redhat.com) + * from + * Copyright (C) 1992 Remy Card (card@masi.ibp.fr) + * Laboratoire MASI - Institut Blaise Pascal + * Universite Pierre et Marie Curie (Paris VI) + * from + * linux/fs/minix/truncate.c Copyright (C) 1991, 1992 Linus Torvalds + * + * ext4fs fsync primitive + * + * Big-endian to little-endian byte-swapping/bitmaps by + * David S. Miller (davem@caip.rutgers.edu), 1995 + * + * Removed unnecessary code duplication for little endian machines + * and excessive __inline__s. 
+ * Andi Kleen, 1997 + * + * Major simplications and cleanup - we only need to do the metadata, because + * we can depend on generic_block_fdatasync() to sync the data blocks. + */ + +#include +#include +#include +#include +#include +#include + +#include "ext4.h" +#include "ext4_jbd2.h" + +#include +/* + * Debug helper. When EXT4FS_DEBUG is defined it logs every entry on the + * inode's i_completed_io_list together with its list neighbours, walking + * the list under i_completed_io_lock; otherwise the body is empty. + */ +static void dump_completed_IO(struct inode * inode) +{ +#ifdef EXT4FS_DEBUG + struct list_head *cur, *before, *after; + ext4_io_end_t *io, *io0, *io1; + unsigned long flags; + + if (list_empty(&EXT4_I(inode)->i_completed_io_list)){ + ext4_debug("inode %lu completed_io list is empty\n", inode->i_ino); + return; + } + + ext4_debug("Dump inode %lu completed_io list \n", inode->i_ino); + spin_lock_irqsave(&EXT4_I(inode)->i_completed_io_lock, flags); + list_for_each_entry(io, &EXT4_I(inode)->i_completed_io_list, list){ + cur = &io->list; + before = cur->prev; + io0 = container_of(before, ext4_io_end_t, list); + after = cur->next; + io1 = container_of(after, ext4_io_end_t, list); + + ext4_debug("io 0x%p from inode %lu,prev 0x%p,next 0x%p\n", + io, inode->i_ino, io0, io1); + } + spin_unlock_irqrestore(&EXT4_I(inode)->i_completed_io_lock, flags); +#endif +} + +/* + * This function is called from ext4_sync_file(), among other callers. + * + * When IO is completed, the work to convert unwritten extents to + * written is queued on workqueue but may not get immediately + * scheduled. When fsync is called, we need to ensure the + * conversion is complete before fsync returns. + * The inode keeps track of a list of pending/completed IO that + * might need to do the conversion. This function walks through + * the list and converts the related unwritten extents for completed IO + * to written. + * The function returns 0 on success, or the most recent negative value + * returned by ext4_end_io_nolock() if any conversion failed. 
+ */ +int ext4_flush_completed_IO(struct inode *inode) +{ + ext4_io_end_t *io; + struct ext4_inode_info *ei = EXT4_I(inode); + unsigned long flags; + int ret = 0; + int ret2 = 0; + + dump_completed_IO(inode); + spin_lock_irqsave(&ei->i_completed_io_lock, flags); + while (!list_empty(&ei->i_completed_io_list)){ + io = list_entry(ei->i_completed_io_list.next, + ext4_io_end_t, list); + list_del_init(&io->list); + io->flag |= EXT4_IO_END_IN_FSYNC; + /* + * Calling ext4_end_io_nolock() to convert completed + * IO to written. + * + * When ext4_sync_file() is called, run_queue() may already be + * about to flush the work corresponding to this io structure. + * It will be upset if it finds that the io structure related + * to the work to be scheduled has been freed. + * + * Thus we need to keep the io structure still valid here after + * conversion finished. The io structure has a flag to + * avoid double converting from both fsync and background work + * queue work. + * + * The list lock is dropped around the conversion call and + * re-taken before the flag is cleared. + */ + spin_unlock_irqrestore(&ei->i_completed_io_lock, flags); + ret = ext4_end_io_nolock(io); + if (ret < 0) + ret2 = ret; /* remember the failure but keep draining the list */ + spin_lock_irqsave(&ei->i_completed_io_lock, flags); + io->flag &= ~EXT4_IO_END_IN_FSYNC; + } + spin_unlock_irqrestore(&ei->i_completed_io_lock, flags); + return (ret2 < 0) ? ret2 : 0; +} + +/* + * If we're not journaling and this is a just-created file, we have to + * sync our parent directory (if it was freshly created) since + * otherwise it will only be written by writeback, leaving a huge + * window during which a crash may lose the file. This may apply for + * the parent directory's parent as well, and so on recursively, if + * they are also freshly created. 
+ */ +static int ext4_sync_parent(struct inode *inode) +{ + struct writeback_control wbc; + struct dentry *dentry = NULL; + struct inode *next; + int ret = 0; + + if (!ext4_test_inode_state(inode, EXT4_STATE_NEWENTRY)) + return 0; + inode = igrab(inode); /* hold our own reference; it moves to each parent as we climb */ + while (ext4_test_inode_state(inode, EXT4_STATE_NEWENTRY)) { + ext4_clear_inode_state(inode, EXT4_STATE_NEWENTRY); + dentry = NULL; + /* pin the first alias dentry while i_lock protects the i_dentry list */ spin_lock(&inode->i_lock); + if (!list_empty(&inode->i_dentry)) { + dentry = list_first_entry(&inode->i_dentry, + struct dentry, d_alias); + dget(dentry); + } + spin_unlock(&inode->i_lock); + if (!dentry) + break; + next = igrab(dentry->d_parent->d_inode); + dput(dentry); + if (!next) + break; + iput(inode); + inode = next; /* from here on "inode" is the parent directory */ + ret = sync_mapping_buffers(inode->i_mapping); + if (ret) + break; + memset(&wbc, 0, sizeof(wbc)); + wbc.sync_mode = WB_SYNC_ALL; + wbc.nr_to_write = 0; /* only write out the inode */ + ret = sync_inode(inode, &wbc); + if (ret) + break; + } + iput(inode); + return ret; +} + +/** + * __sync_inode - generic_file_fsync without the locking and filemap_write + * @inode: inode to sync + * @datasync: only sync essential metadata if true + * + * This is just generic_file_fsync without the locking. This is needed for + * nojournal mode to make sure this inodes data/metadata makes it to disk + * properly. The i_mutex should be held already. + * + * Returns the first error seen: the result of sync_mapping_buffers() if it + * is non-zero, otherwise the result of sync_inode_metadata() when the inode + * needed writing, otherwise 0. + */ +static int __sync_inode(struct inode *inode, int datasync) +{ + int err; + int ret; + + ret = sync_mapping_buffers(inode->i_mapping); + if (!(inode->i_state & I_DIRTY)) + return ret; + if (datasync && !(inode->i_state & I_DIRTY_DATASYNC)) + return ret; + + err = sync_inode_metadata(inode, 1); + if (ret == 0) + ret = err; + return ret; +} + +/* + * akpm: A new design for ext4_sync_file(). + * + * This is only called from sys_fsync(), sys_fdatasync() and sys_msync(). + * There cannot be a transaction open by this task. + * Another task could have dirtied this inode. Its data can be in any + * state in the journalling system. 
+ * + * What we do is just kick off a commit and wait on it. This will snapshot the + * inode to disk. + * + * i_mutex lock is held when entering and exiting this function + */ + +int ext4_sync_file(struct file *file, loff_t start, loff_t end, int datasync) +{ + struct inode *inode = file->f_mapping->host; + struct ext4_inode_info *ei = EXT4_I(inode); + journal_t *journal = EXT4_SB(inode->i_sb)->s_journal; + int ret; + tid_t commit_tid; + bool needs_barrier = false; + + J_ASSERT(ext4_journal_current_handle() == NULL); + + trace_ext4_sync_file_enter(file, datasync); + + ret = filemap_write_and_wait_range(inode->i_mapping, start, end); + if (ret) + return ret; + mutex_lock(&inode->i_mutex); + + if (inode->i_sb->s_flags & MS_RDONLY) + goto out; + + ret = ext4_flush_completed_IO(inode); + if (ret < 0) + goto out; + + if (!journal) { + ret = __sync_inode(inode, datasync); + if (!ret && !list_empty(&inode->i_dentry)) + ret = ext4_sync_parent(inode); + goto out; + } + + /* + * data=writeback,ordered: + * The caller's filemap_fdatawrite()/wait will sync the data. + * Metadata is in the journal, we wait for proper transaction to + * commit here. + * + * data=journal: + * filemap_fdatawrite won't do anything (the buffers are clean). + * ext4_force_commit will write the file data into the journal and + * will wait on that. + * filemap_fdatawait() will encounter a ton of newly-dirtied pages + * (they were dirtied by commit). But that's OK - the blocks are + * safe in-journal, which is all fsync() needs to ensure. + */ + if (ext4_should_journal_data(inode)) { + ret = ext4_force_commit(inode->i_sb); + goto out; + } + + commit_tid = datasync ? 
ei->i_datasync_tid : ei->i_sync_tid; + if (journal->j_flags & JBD2_BARRIER && + !jbd2_trans_will_send_data_barrier(journal, commit_tid)) + needs_barrier = true; + jbd2_log_start_commit(journal, commit_tid); + ret = jbd2_log_wait_commit(journal, commit_tid); + if (needs_barrier) + blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, NULL); + out: + mutex_unlock(&inode->i_mutex); + trace_ext4_sync_file_exit(inode, ret); + return ret; +} diff --git a/fs/ext4/hash.c b/fs/ext4/hash.c new file mode 100644 index 00000000..fa8e4911 --- /dev/null +++ b/fs/ext4/hash.c @@ -0,0 +1,208 @@ +/* + * linux/fs/ext4/hash.c + * + * Copyright (C) 2002 by Theodore Ts'o + * + * This file is released under the GPL v2. + * + * This file may be redistributed under the terms of the GNU Public + * License. + */ + +#include +#include +#include +#include "ext4.h" + +#define DELTA 0x9E3779B9 + +static void TEA_transform(__u32 buf[4], __u32 const in[]) +{ + __u32 sum = 0; + __u32 b0 = buf[0], b1 = buf[1]; + __u32 a = in[0], b = in[1], c = in[2], d = in[3]; + int n = 16; + + do { + sum += DELTA; + b0 += ((b1 << 4)+a) ^ (b1+sum) ^ ((b1 >> 5)+b); + b1 += ((b0 << 4)+c) ^ (b0+sum) ^ ((b0 >> 5)+d); + } while (--n); + + buf[0] += b0; + buf[1] += b1; +} + + +/* The old legacy hash */ +static __u32 dx_hack_hash_unsigned(const char *name, int len) +{ + __u32 hash, hash0 = 0x12a3fe2d, hash1 = 0x37abe8f9; + const unsigned char *ucp = (const unsigned char *) name; + + while (len--) { + hash = hash1 + (hash0 ^ (((int) *ucp++) * 7152373)); + + if (hash & 0x80000000) + hash -= 0x7fffffff; + hash1 = hash0; + hash0 = hash; + } + return hash0 << 1; +} + +static __u32 dx_hack_hash_signed(const char *name, int len) +{ + __u32 hash, hash0 = 0x12a3fe2d, hash1 = 0x37abe8f9; + const signed char *scp = (const signed char *) name; + + while (len--) { + hash = hash1 + (hash0 ^ (((int) *scp++) * 7152373)); + + if (hash & 0x80000000) + hash -= 0x7fffffff; + hash1 = hash0; + hash0 = hash; + } + return hash0 << 1; +} + +static 
void str2hashbuf_signed(const char *msg, int len, __u32 *buf, int num) +{ + __u32 pad, val; + int i; + const signed char *scp = (const signed char *) msg; + + pad = (__u32)len | ((__u32)len << 8); + pad |= pad << 16; + + val = pad; + if (len > num*4) + len = num * 4; + for (i = 0; i < len; i++) { + if ((i % 4) == 0) + val = pad; + val = ((int) scp[i]) + (val << 8); + if ((i % 4) == 3) { + *buf++ = val; + val = pad; + num--; + } + } + if (--num >= 0) + *buf++ = val; + while (--num >= 0) + *buf++ = pad; +} + +static void str2hashbuf_unsigned(const char *msg, int len, __u32 *buf, int num) +{ + __u32 pad, val; + int i; + const unsigned char *ucp = (const unsigned char *) msg; + + pad = (__u32)len | ((__u32)len << 8); + pad |= pad << 16; + + val = pad; + if (len > num*4) + len = num * 4; + for (i = 0; i < len; i++) { + if ((i % 4) == 0) + val = pad; + val = ((int) ucp[i]) + (val << 8); + if ((i % 4) == 3) { + *buf++ = val; + val = pad; + num--; + } + } + if (--num >= 0) + *buf++ = val; + while (--num >= 0) + *buf++ = pad; +} + +/* + * Returns the hash of a filename. If len is 0 and name is NULL, then + * this function can be used to test whether or not a hash version is + * supported. + * + * The seed is an 4 longword (32 bits) "secret" which can be used to + * uniquify a hash. If the seed is all zero's, then some default seed + * may be used. + * + * A particular hash version specifies whether or not the seed is + * represented, and whether or not the returned hash is 32 bits or 64 + * bits. 32 bit hashes will return 0 for the minor hash. 
+ */ +int ext4fs_dirhash(const char *name, int len, struct dx_hash_info *hinfo) +{ + __u32 hash; + __u32 minor_hash = 0; + const char *p; + int i; + __u32 in[8], buf[4]; + void (*str2hashbuf)(const char *, int, __u32 *, int) = + str2hashbuf_signed; + + /* Initialize the default seed for the hash checksum functions */ + buf[0] = 0x67452301; + buf[1] = 0xefcdab89; + buf[2] = 0x98badcfe; + buf[3] = 0x10325476; + + /* Check to see if the seed is all zero's */ + if (hinfo->seed) { + for (i = 0; i < 4; i++) { + if (hinfo->seed[i]) + break; + } + if (i < 4) + memcpy(buf, hinfo->seed, sizeof(buf)); + } + + switch (hinfo->hash_version) { + case DX_HASH_LEGACY_UNSIGNED: + hash = dx_hack_hash_unsigned(name, len); + break; + case DX_HASH_LEGACY: + hash = dx_hack_hash_signed(name, len); + break; + case DX_HASH_HALF_MD4_UNSIGNED: + str2hashbuf = str2hashbuf_unsigned; + case DX_HASH_HALF_MD4: + p = name; + while (len > 0) { + (*str2hashbuf)(p, len, in, 8); + half_md4_transform(buf, in); + len -= 32; + p += 32; + } + minor_hash = buf[2]; + hash = buf[1]; + break; + case DX_HASH_TEA_UNSIGNED: + str2hashbuf = str2hashbuf_unsigned; + case DX_HASH_TEA: + p = name; + while (len > 0) { + (*str2hashbuf)(p, len, in, 4); + TEA_transform(buf, in); + len -= 16; + p += 16; + } + hash = buf[0]; + minor_hash = buf[1]; + break; + default: + hinfo->hash = 0; + return -1; + } + hash = hash & ~1; + if (hash == (EXT4_HTREE_EOF_32BIT << 1)) + hash = (EXT4_HTREE_EOF_32BIT - 1) << 1; + hinfo->hash = hash; + hinfo->minor_hash = minor_hash; + return 0; +} diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c new file mode 100644 index 00000000..b4a7dd56 --- /dev/null +++ b/fs/ext4/ialloc.c @@ -0,0 +1,1161 @@ +/* + * linux/fs/ext4/ialloc.c + * + * Copyright (C) 1992, 1993, 1994, 1995 + * Remy Card (card@masi.ibp.fr) + * Laboratoire MASI - Institut Blaise Pascal + * Universite Pierre et Marie Curie (Paris VI) + * + * BSD ufs-inspired inode and directory allocation by + * Stephen Tweedie (sct@redhat.com), 1993 + 
* Big-endian to little-endian byte-swapping/bitmaps by + * David S. Miller (davem@caip.rutgers.edu), 1995 + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "ext4.h" +#include "ext4_jbd2.h" +#include "xattr.h" +#include "acl.h" + +#include + +/* + * ialloc.c contains the inodes allocation and deallocation routines + */ + +/* + * The free inodes are managed by bitmaps. A file system contains several + * blocks groups. Each group contains 1 bitmap block for blocks, 1 bitmap + * block for inodes, N blocks for the inode table and data blocks. + * + * The file system contains group descriptors which are located after the + * super block. Each descriptor contains the number of the bitmap block and + * the free blocks count in the block. + */ + +/* + * To avoid calling the atomic setbit hundreds or thousands of times, we only + * need to use it within a single byte (to ensure we get endianness right). + * We can use memset for the rest of the bitmap as there are no other users. + */ +void ext4_mark_bitmap_end(int start_bit, int end_bit, char *bitmap) +{ + int i; + + if (start_bit >= end_bit) + return; + + ext4_debug("mark end bits +%d through +%d used\n", start_bit, end_bit); + for (i = start_bit; i < ((start_bit + 7) & ~7UL); i++) + ext4_set_bit(i, bitmap); + if (i < end_bit) + memset(bitmap + (i >> 3), 0xff, (end_bit - i) >> 3); +} + +/* Initializes an uninitialized inode bitmap */ +static unsigned ext4_init_inode_bitmap(struct super_block *sb, + struct buffer_head *bh, + ext4_group_t block_group, + struct ext4_group_desc *gdp) +{ + struct ext4_sb_info *sbi = EXT4_SB(sb); + + J_ASSERT_BH(bh, buffer_locked(bh)); + + /* If checksum is bad mark all blocks and inodes use to prevent + * allocation, essentially implementing a per-group read-only flag. 
*/ + if (!ext4_group_desc_csum_verify(sbi, block_group, gdp)) { + ext4_error(sb, "Checksum bad for group %u", block_group); + ext4_free_group_clusters_set(sb, gdp, 0); + ext4_free_inodes_set(sb, gdp, 0); + ext4_itable_unused_set(sb, gdp, 0); + memset(bh->b_data, 0xff, sb->s_blocksize); + return 0; + } + + memset(bh->b_data, 0, (EXT4_INODES_PER_GROUP(sb) + 7) / 8); + ext4_mark_bitmap_end(EXT4_INODES_PER_GROUP(sb), sb->s_blocksize * 8, + bh->b_data); + + return EXT4_INODES_PER_GROUP(sb); +} + +void ext4_end_bitmap_read(struct buffer_head *bh, int uptodate) +{ + if (uptodate) { + set_buffer_uptodate(bh); + set_bitmap_uptodate(bh); + } + unlock_buffer(bh); + put_bh(bh); +} + +/* + * Read the inode allocation bitmap for a given block_group, reading + * into the specified slot in the superblock's bitmap cache. + * + * Return buffer_head of bitmap on success or NULL. + */ +static struct buffer_head * +ext4_read_inode_bitmap(struct super_block *sb, ext4_group_t block_group) +{ + struct ext4_group_desc *desc; + struct buffer_head *bh = NULL; + ext4_fsblk_t bitmap_blk; + + desc = ext4_get_group_desc(sb, block_group, NULL); + if (!desc) + return NULL; + + bitmap_blk = ext4_inode_bitmap(sb, desc); + bh = sb_getblk(sb, bitmap_blk); + if (unlikely(!bh)) { + ext4_error(sb, "Cannot read inode bitmap - " + "block_group = %u, inode_bitmap = %llu", + block_group, bitmap_blk); + return NULL; + } + if (bitmap_uptodate(bh)) + return bh; + + lock_buffer(bh); + if (bitmap_uptodate(bh)) { + unlock_buffer(bh); + return bh; + } + + ext4_lock_group(sb, block_group); + if (desc->bg_flags & cpu_to_le16(EXT4_BG_INODE_UNINIT)) { + ext4_init_inode_bitmap(sb, bh, block_group, desc); + set_bitmap_uptodate(bh); + set_buffer_uptodate(bh); + ext4_unlock_group(sb, block_group); + unlock_buffer(bh); + return bh; + } + ext4_unlock_group(sb, block_group); + + if (buffer_uptodate(bh)) { + /* + * if not uninit if bh is uptodate, + * bitmap is also uptodate + */ + set_bitmap_uptodate(bh); + unlock_buffer(bh); 
+ return bh; + } + /* + * submit the buffer_head for reading + */ + trace_ext4_load_inode_bitmap(sb, block_group); + bh->b_end_io = ext4_end_bitmap_read; + get_bh(bh); + submit_bh(READ, bh); + wait_on_buffer(bh); + if (!buffer_uptodate(bh)) { + put_bh(bh); + ext4_error(sb, "Cannot read inode bitmap - " + "block_group = %u, inode_bitmap = %llu", + block_group, bitmap_blk); + return NULL; + } + return bh; +} + +/* + * NOTE! When we get the inode, we're the only people + * that have access to it, and as such there are no + * race conditions we have to worry about. The inode + * is not on the hash-lists, and it cannot be reached + * through the filesystem because the directory entry + * has been deleted earlier. + * + * HOWEVER: we must make sure that we get no aliases, + * which means that we have to call "clear_inode()" + * _before_ we mark the inode not in use in the inode + * bitmaps. Otherwise a newly created file might use + * the same inode number (not actually the same pointer + * though), and then we'd have two inodes sharing the + * same inode number and space on the harddisk. 
+ */ +void ext4_free_inode(handle_t *handle, struct inode *inode) +{ + struct super_block *sb = inode->i_sb; + int is_directory; + unsigned long ino; + struct buffer_head *bitmap_bh = NULL; + struct buffer_head *bh2; + ext4_group_t block_group; + unsigned long bit; + struct ext4_group_desc *gdp; + struct ext4_super_block *es; + struct ext4_sb_info *sbi; + int fatal = 0, err, count, cleared; + + if (!sb) { + printk(KERN_ERR "EXT4-fs: %s:%d: inode on " + "nonexistent device\n", __func__, __LINE__); + return; + } + if (atomic_read(&inode->i_count) > 1) { + ext4_msg(sb, KERN_ERR, "%s:%d: inode #%lu: count=%d", + __func__, __LINE__, inode->i_ino, + atomic_read(&inode->i_count)); + return; + } + if (inode->i_nlink) { + ext4_msg(sb, KERN_ERR, "%s:%d: inode #%lu: nlink=%d\n", + __func__, __LINE__, inode->i_ino, inode->i_nlink); + return; + } + sbi = EXT4_SB(sb); + + ino = inode->i_ino; + ext4_debug("freeing inode %lu\n", ino); + trace_ext4_free_inode(inode); + + /* + * Note: we must free any quota before locking the superblock, + * as writing the quota to disk may need the lock as well. 
+ */ + dquot_initialize(inode); + ext4_xattr_delete_inode(handle, inode); + dquot_free_inode(inode); + dquot_drop(inode); + + is_directory = S_ISDIR(inode->i_mode); + + /* Do this BEFORE marking the inode not in use or returning an error */ + ext4_clear_inode(inode); + + es = EXT4_SB(sb)->s_es; + if (ino < EXT4_FIRST_INO(sb) || ino > le32_to_cpu(es->s_inodes_count)) { + ext4_error(sb, "reserved or nonexistent inode %lu", ino); + goto error_return; + } + block_group = (ino - 1) / EXT4_INODES_PER_GROUP(sb); + bit = (ino - 1) % EXT4_INODES_PER_GROUP(sb); + bitmap_bh = ext4_read_inode_bitmap(sb, block_group); + if (!bitmap_bh) + goto error_return; + + BUFFER_TRACE(bitmap_bh, "get_write_access"); + fatal = ext4_journal_get_write_access(handle, bitmap_bh); + if (fatal) + goto error_return; + + fatal = -ESRCH; + gdp = ext4_get_group_desc(sb, block_group, &bh2); + if (gdp) { + BUFFER_TRACE(bh2, "get_write_access"); + fatal = ext4_journal_get_write_access(handle, bh2); + } + ext4_lock_group(sb, block_group); + cleared = ext4_test_and_clear_bit(bit, bitmap_bh->b_data); + if (fatal || !cleared) { + ext4_unlock_group(sb, block_group); + goto out; + } + + count = ext4_free_inodes_count(sb, gdp) + 1; + ext4_free_inodes_set(sb, gdp, count); + if (is_directory) { + count = ext4_used_dirs_count(sb, gdp) - 1; + ext4_used_dirs_set(sb, gdp, count); + percpu_counter_dec(&sbi->s_dirs_counter); + } + gdp->bg_checksum = ext4_group_desc_csum(sbi, block_group, gdp); + ext4_unlock_group(sb, block_group); + + percpu_counter_inc(&sbi->s_freeinodes_counter); + if (sbi->s_log_groups_per_flex) { + ext4_group_t f = ext4_flex_group(sbi, block_group); + + atomic_inc(&sbi->s_flex_groups[f].free_inodes); + if (is_directory) + atomic_dec(&sbi->s_flex_groups[f].used_dirs); + } + BUFFER_TRACE(bh2, "call ext4_handle_dirty_metadata"); + fatal = ext4_handle_dirty_metadata(handle, NULL, bh2); +out: + if (cleared) { + BUFFER_TRACE(bitmap_bh, "call ext4_handle_dirty_metadata"); + err = 
ext4_handle_dirty_metadata(handle, NULL, bitmap_bh); + if (!fatal) + fatal = err; + ext4_mark_super_dirty(sb); + } else + ext4_error(sb, "bit already cleared for inode %lu", ino); + +error_return: + brelse(bitmap_bh); + ext4_std_error(sb, fatal); +} + +struct orlov_stats { + __u32 free_inodes; + __u32 free_clusters; + __u32 used_dirs; +}; + +/* + * Helper function for Orlov's allocator; returns critical information + * for a particular block group or flex_bg. If flex_size is 1, then g + * is a block group number; otherwise it is flex_bg number. + */ +static void get_orlov_stats(struct super_block *sb, ext4_group_t g, + int flex_size, struct orlov_stats *stats) +{ + struct ext4_group_desc *desc; + struct flex_groups *flex_group = EXT4_SB(sb)->s_flex_groups; + + if (flex_size > 1) { + stats->free_inodes = atomic_read(&flex_group[g].free_inodes); + stats->free_clusters = atomic_read(&flex_group[g].free_clusters); + stats->used_dirs = atomic_read(&flex_group[g].used_dirs); + return; + } + + desc = ext4_get_group_desc(sb, g, NULL); + if (desc) { + stats->free_inodes = ext4_free_inodes_count(sb, desc); + stats->free_clusters = ext4_free_group_clusters(sb, desc); + stats->used_dirs = ext4_used_dirs_count(sb, desc); + } else { + stats->free_inodes = 0; + stats->free_clusters = 0; + stats->used_dirs = 0; + } +} + +/* + * Orlov's allocator for directories. + * + * We always try to spread first-level directories. + * + * If there are blockgroups with both free inodes and free blocks counts + * not worse than average we return one with smallest directory count. + * Otherwise we simply return a random group. + * + * For the rest rules look so: + * + * It's OK to put directory into a group unless + * it has too many directories already (max_dirs) or + * it has too few free inodes left (min_inodes) or + * it has too few free blocks left (min_blocks) or + * Parent's group is preferred, if it doesn't satisfy these + * conditions we search cyclically through the rest. 
If none + * of the groups look good we just look for a group with more + * free inodes than average (starting at parent's group). + */ + +static int find_group_orlov(struct super_block *sb, struct inode *parent, + ext4_group_t *group, umode_t mode, + const struct qstr *qstr) +{ + ext4_group_t parent_group = EXT4_I(parent)->i_block_group; + struct ext4_sb_info *sbi = EXT4_SB(sb); + ext4_group_t real_ngroups = ext4_get_groups_count(sb); + int inodes_per_group = EXT4_INODES_PER_GROUP(sb); + unsigned int freei, avefreei, grp_free; + ext4_fsblk_t freeb, avefreec; + unsigned int ndirs; + int max_dirs, min_inodes; + ext4_grpblk_t min_clusters; + ext4_group_t i, grp, g, ngroups; + struct ext4_group_desc *desc; + struct orlov_stats stats; + int flex_size = ext4_flex_bg_size(sbi); + struct dx_hash_info hinfo; + + ngroups = real_ngroups; + if (flex_size > 1) { + ngroups = (real_ngroups + flex_size - 1) >> + sbi->s_log_groups_per_flex; + parent_group >>= sbi->s_log_groups_per_flex; + } + + freei = percpu_counter_read_positive(&sbi->s_freeinodes_counter); + avefreei = freei / ngroups; + freeb = EXT4_C2B(sbi, + percpu_counter_read_positive(&sbi->s_freeclusters_counter)); + avefreec = freeb; + do_div(avefreec, ngroups); + ndirs = percpu_counter_read_positive(&sbi->s_dirs_counter); + + if (S_ISDIR(mode) && + ((parent == sb->s_root->d_inode) || + (ext4_test_inode_flag(parent, EXT4_INODE_TOPDIR)))) { + int best_ndir = inodes_per_group; + int ret = -1; + + if (qstr) { + hinfo.hash_version = DX_HASH_HALF_MD4; + hinfo.seed = sbi->s_hash_seed; + ext4fs_dirhash(qstr->name, qstr->len, &hinfo); + grp = hinfo.hash; + } else + get_random_bytes(&grp, sizeof(grp)); + parent_group = (unsigned)grp % ngroups; + for (i = 0; i < ngroups; i++) { + g = (parent_group + i) % ngroups; + get_orlov_stats(sb, g, flex_size, &stats); + if (!stats.free_inodes) + continue; + if (stats.used_dirs >= best_ndir) + continue; + if (stats.free_inodes < avefreei) + continue; + if (stats.free_clusters < avefreec) + 
continue; + grp = g; + ret = 0; + best_ndir = stats.used_dirs; + } + if (ret) + goto fallback; + found_flex_bg: + if (flex_size == 1) { + *group = grp; + return 0; + } + + /* + * We pack inodes at the beginning of the flexgroup's + * inode tables. Block allocation decisions will do + * something similar, although regular files will + * start at 2nd block group of the flexgroup. See + * ext4_ext_find_goal() and ext4_find_near(). + */ + grp *= flex_size; + for (i = 0; i < flex_size; i++) { + if (grp+i >= real_ngroups) + break; + desc = ext4_get_group_desc(sb, grp+i, NULL); + if (desc && ext4_free_inodes_count(sb, desc)) { + *group = grp+i; + return 0; + } + } + goto fallback; + } + + max_dirs = ndirs / ngroups + inodes_per_group / 16; + min_inodes = avefreei - inodes_per_group*flex_size / 4; + if (min_inodes < 1) + min_inodes = 1; + min_clusters = avefreec - EXT4_CLUSTERS_PER_GROUP(sb)*flex_size / 4; + + /* + * Start looking in the flex group where we last allocated an + * inode for this parent directory + */ + if (EXT4_I(parent)->i_last_alloc_group != ~0) { + parent_group = EXT4_I(parent)->i_last_alloc_group; + if (flex_size > 1) + parent_group >>= sbi->s_log_groups_per_flex; + } + + for (i = 0; i < ngroups; i++) { + grp = (parent_group + i) % ngroups; + get_orlov_stats(sb, grp, flex_size, &stats); + if (stats.used_dirs >= max_dirs) + continue; + if (stats.free_inodes < min_inodes) + continue; + if (stats.free_clusters < min_clusters) + continue; + goto found_flex_bg; + } + +fallback: + ngroups = real_ngroups; + avefreei = freei / ngroups; +fallback_retry: + parent_group = EXT4_I(parent)->i_block_group; + for (i = 0; i < ngroups; i++) { + grp = (parent_group + i) % ngroups; + desc = ext4_get_group_desc(sb, grp, NULL); + if (desc) { + grp_free = ext4_free_inodes_count(sb, desc); + if (grp_free && grp_free >= avefreei) { + *group = grp; + return 0; + } + } + } + + if (avefreei) { + /* + * The free-inodes counter is approximate, and for really small + * filesystems the 
above test can fail to find any blockgroups + */ + avefreei = 0; + goto fallback_retry; + } + + return -1; +} + +static int find_group_other(struct super_block *sb, struct inode *parent, + ext4_group_t *group, umode_t mode) +{ + ext4_group_t parent_group = EXT4_I(parent)->i_block_group; + ext4_group_t i, last, ngroups = ext4_get_groups_count(sb); + struct ext4_group_desc *desc; + int flex_size = ext4_flex_bg_size(EXT4_SB(sb)); + + /* + * Try to place the inode is the same flex group as its + * parent. If we can't find space, use the Orlov algorithm to + * find another flex group, and store that information in the + * parent directory's inode information so that use that flex + * group for future allocations. + */ + if (flex_size > 1) { + int retry = 0; + + try_again: + parent_group &= ~(flex_size-1); + last = parent_group + flex_size; + if (last > ngroups) + last = ngroups; + for (i = parent_group; i < last; i++) { + desc = ext4_get_group_desc(sb, i, NULL); + if (desc && ext4_free_inodes_count(sb, desc)) { + *group = i; + return 0; + } + } + if (!retry && EXT4_I(parent)->i_last_alloc_group != ~0) { + retry = 1; + parent_group = EXT4_I(parent)->i_last_alloc_group; + goto try_again; + } + /* + * If this didn't work, use the Orlov search algorithm + * to find a new flex group; we pass in the mode to + * avoid the topdir algorithms. + */ + *group = parent_group + flex_size; + if (*group > ngroups) + *group = 0; + return find_group_orlov(sb, parent, group, mode, NULL); + } + + /* + * Try to place the inode in its parent directory + */ + *group = parent_group; + desc = ext4_get_group_desc(sb, *group, NULL); + if (desc && ext4_free_inodes_count(sb, desc) && + ext4_free_group_clusters(sb, desc)) + return 0; + + /* + * We're going to place this inode in a different blockgroup from its + * parent. We want to cause files in a common directory to all land in + * the same blockgroup. 
But we want files which are in a different + * directory which shares a blockgroup with our parent to land in a + * different blockgroup. + * + * So add our directory's i_ino into the starting point for the hash. + */ + *group = (*group + parent->i_ino) % ngroups; + + /* + * Use a quadratic hash to find a group with a free inode and some free + * blocks. + */ + for (i = 1; i < ngroups; i <<= 1) { + *group += i; + if (*group >= ngroups) + *group -= ngroups; + desc = ext4_get_group_desc(sb, *group, NULL); + if (desc && ext4_free_inodes_count(sb, desc) && + ext4_free_group_clusters(sb, desc)) + return 0; + } + + /* + * That failed: try linear search for a free inode, even if that group + * has no free blocks. + */ + *group = parent_group; + for (i = 0; i < ngroups; i++) { + if (++*group >= ngroups) + *group = 0; + desc = ext4_get_group_desc(sb, *group, NULL); + if (desc && ext4_free_inodes_count(sb, desc)) + return 0; + } + + return -1; +} + +/* + * There are two policies for allocating an inode. If the new inode is + * a directory, then a forward search is made for a block group with both + * free space and a low directory-to-inode ratio; if that fails, then of + * the groups with above-average free space, that group with the fewest + * directories already is chosen. + * + * For other inodes, search forward from the parent directory's block + * group to find a free inode. 
+ */ +struct inode *ext4_new_inode(handle_t *handle, struct inode *dir, umode_t mode, + const struct qstr *qstr, __u32 goal, uid_t *owner) +{ + struct super_block *sb; + struct buffer_head *inode_bitmap_bh = NULL; + struct buffer_head *group_desc_bh; + ext4_group_t ngroups, group = 0; + unsigned long ino = 0; + struct inode *inode; + struct ext4_group_desc *gdp = NULL; + struct ext4_inode_info *ei; + struct ext4_sb_info *sbi; + int ret2, err = 0; + struct inode *ret; + ext4_group_t i; + ext4_group_t flex_group; + + /* Cannot create files in a deleted directory */ + if (!dir || !dir->i_nlink) + return ERR_PTR(-EPERM); + + sb = dir->i_sb; + ngroups = ext4_get_groups_count(sb); + trace_ext4_request_inode(dir, mode); + inode = new_inode(sb); + if (!inode) + return ERR_PTR(-ENOMEM); + ei = EXT4_I(inode); + sbi = EXT4_SB(sb); + + if (!goal) + goal = sbi->s_inode_goal; + + if (goal && goal <= le32_to_cpu(sbi->s_es->s_inodes_count)) { + group = (goal - 1) / EXT4_INODES_PER_GROUP(sb); + ino = (goal - 1) % EXT4_INODES_PER_GROUP(sb); + ret2 = 0; + goto got_group; + } + + if (S_ISDIR(mode)) + ret2 = find_group_orlov(sb, dir, &group, mode, qstr); + else + ret2 = find_group_other(sb, dir, &group, mode); + +got_group: + EXT4_I(dir)->i_last_alloc_group = group; + err = -ENOSPC; + if (ret2 == -1) + goto out; + + /* + * Normally we will only go through one pass of this loop, + * unless we get unlucky and it turns out the group we selected + * had its last inode grabbed by someone else. 
+ */ + for (i = 0; i < ngroups; i++, ino = 0) { + err = -EIO; + + gdp = ext4_get_group_desc(sb, group, &group_desc_bh); + if (!gdp) + goto fail; + + brelse(inode_bitmap_bh); + inode_bitmap_bh = ext4_read_inode_bitmap(sb, group); + if (!inode_bitmap_bh) + goto fail; + +repeat_in_this_group: + ino = ext4_find_next_zero_bit((unsigned long *) + inode_bitmap_bh->b_data, + EXT4_INODES_PER_GROUP(sb), ino); + if (ino >= EXT4_INODES_PER_GROUP(sb)) { + if (++group == ngroups) + group = 0; + continue; + } + if (group == 0 && (ino+1) < EXT4_FIRST_INO(sb)) { + ext4_error(sb, "reserved inode found cleared - " + "inode=%lu", ino + 1); + continue; + } + ext4_lock_group(sb, group); + ret2 = ext4_test_and_set_bit(ino, inode_bitmap_bh->b_data); + ext4_unlock_group(sb, group); + ino++; /* the inode bitmap is zero-based */ + if (!ret2) + goto got; /* we grabbed the inode! */ + if (ino < EXT4_INODES_PER_GROUP(sb)) + goto repeat_in_this_group; + } + err = -ENOSPC; + goto out; + +got: + /* We may have to initialize the block bitmap if it isn't already */ + if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_GDT_CSUM) && + gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) { + struct buffer_head *block_bitmap_bh; + + block_bitmap_bh = ext4_read_block_bitmap(sb, group); + BUFFER_TRACE(block_bitmap_bh, "get block bitmap access"); + err = ext4_journal_get_write_access(handle, block_bitmap_bh); + if (err) { + brelse(block_bitmap_bh); + goto fail; + } + + BUFFER_TRACE(block_bitmap_bh, "dirty block bitmap"); + err = ext4_handle_dirty_metadata(handle, NULL, block_bitmap_bh); + brelse(block_bitmap_bh); + + /* recheck and clear flag under lock if we still need to */ + ext4_lock_group(sb, group); + if (gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) { + gdp->bg_flags &= cpu_to_le16(~EXT4_BG_BLOCK_UNINIT); + ext4_free_group_clusters_set(sb, gdp, + ext4_free_clusters_after_init(sb, group, gdp)); + gdp->bg_checksum = ext4_group_desc_csum(sbi, group, + gdp); + } + ext4_unlock_group(sb, group); 
+ + if (err) + goto fail; + } + + BUFFER_TRACE(inode_bitmap_bh, "get_write_access"); + err = ext4_journal_get_write_access(handle, inode_bitmap_bh); + if (err) + goto fail; + + BUFFER_TRACE(group_desc_bh, "get_write_access"); + err = ext4_journal_get_write_access(handle, group_desc_bh); + if (err) + goto fail; + + /* Update the relevant bg descriptor fields */ + if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_GDT_CSUM)) { + int free; + struct ext4_group_info *grp = ext4_get_group_info(sb, group); + + down_read(&grp->alloc_sem); /* protect vs itable lazyinit */ + ext4_lock_group(sb, group); /* while we modify the bg desc */ + free = EXT4_INODES_PER_GROUP(sb) - + ext4_itable_unused_count(sb, gdp); + if (gdp->bg_flags & cpu_to_le16(EXT4_BG_INODE_UNINIT)) { + gdp->bg_flags &= cpu_to_le16(~EXT4_BG_INODE_UNINIT); + free = 0; + } + /* + * Check the relative inode number against the last used + * relative inode number in this group. if it is greater + * we need to update the bg_itable_unused count + */ + if (ino > free) + ext4_itable_unused_set(sb, gdp, + (EXT4_INODES_PER_GROUP(sb) - ino)); + up_read(&grp->alloc_sem); + } else { + ext4_lock_group(sb, group); + } + + ext4_free_inodes_set(sb, gdp, ext4_free_inodes_count(sb, gdp) - 1); + if (S_ISDIR(mode)) { + ext4_used_dirs_set(sb, gdp, ext4_used_dirs_count(sb, gdp) + 1); + if (sbi->s_log_groups_per_flex) { + ext4_group_t f = ext4_flex_group(sbi, group); + + atomic_inc(&sbi->s_flex_groups[f].used_dirs); + } + } + if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_GDT_CSUM)) { + gdp->bg_checksum = ext4_group_desc_csum(sbi, group, gdp); + } + ext4_unlock_group(sb, group); + + BUFFER_TRACE(inode_bitmap_bh, "call ext4_handle_dirty_metadata"); + err = ext4_handle_dirty_metadata(handle, NULL, inode_bitmap_bh); + if (err) + goto fail; + + BUFFER_TRACE(group_desc_bh, "call ext4_handle_dirty_metadata"); + err = ext4_handle_dirty_metadata(handle, NULL, group_desc_bh); + if (err) + goto fail; + + 
percpu_counter_dec(&sbi->s_freeinodes_counter); + if (S_ISDIR(mode)) + percpu_counter_inc(&sbi->s_dirs_counter); + ext4_mark_super_dirty(sb); + + if (sbi->s_log_groups_per_flex) { + flex_group = ext4_flex_group(sbi, group); + atomic_dec(&sbi->s_flex_groups[flex_group].free_inodes); + } + if (owner) { + inode->i_mode = mode; + inode->i_uid = owner[0]; + inode->i_gid = owner[1]; + } else if (test_opt(sb, GRPID)) { + inode->i_mode = mode; + inode->i_uid = current_fsuid(); + inode->i_gid = dir->i_gid; + } else + inode_init_owner(inode, dir, mode); + + inode->i_ino = ino + group * EXT4_INODES_PER_GROUP(sb); + /* This is the optimal IO size (for stat), not the fs block size */ + inode->i_blocks = 0; + inode->i_mtime = inode->i_atime = inode->i_ctime = ei->i_crtime = + ext4_current_time(inode); + + memset(ei->i_data, 0, sizeof(ei->i_data)); + ei->i_dir_start_lookup = 0; + ei->i_disksize = 0; + + /* Don't inherit extent flag from directory, amongst others. */ + ei->i_flags = + ext4_mask_flags(mode, EXT4_I(dir)->i_flags & EXT4_FL_INHERITED); + ei->i_file_acl = 0; + ei->i_dtime = 0; + ei->i_block_group = group; + ei->i_last_alloc_group = ~0; + + ext4_set_inode_flags(inode); + if (IS_DIRSYNC(inode)) + ext4_handle_sync(handle); + if (insert_inode_locked(inode) < 0) { + /* + * Likely a bitmap corruption causing inode to be allocated + * twice. 
+ */ + err = -EIO; + goto fail; + } + spin_lock(&sbi->s_next_gen_lock); + inode->i_generation = sbi->s_next_generation++; + spin_unlock(&sbi->s_next_gen_lock); + + ext4_clear_state_flags(ei); /* Only relevant on 32-bit archs */ + ext4_set_inode_state(inode, EXT4_STATE_NEW); + + ei->i_extra_isize = EXT4_SB(sb)->s_want_extra_isize; + + ret = inode; + dquot_initialize(inode); + err = dquot_alloc_inode(inode); + if (err) + goto fail_drop; + + err = ext4_init_acl(handle, inode, dir); + if (err) + goto fail_free_drop; + + err = ext4_init_security(handle, inode, dir, qstr); + if (err) + goto fail_free_drop; + + if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_EXTENTS)) { + /* set extent flag only for directory, file and normal symlink*/ + if (S_ISDIR(mode) || S_ISREG(mode) || S_ISLNK(mode)) { + ext4_set_inode_flag(inode, EXT4_INODE_EXTENTS); + ext4_ext_tree_init(handle, inode); + } + } + + if (ext4_handle_valid(handle)) { + ei->i_sync_tid = handle->h_transaction->t_tid; + ei->i_datasync_tid = handle->h_transaction->t_tid; + } + + err = ext4_mark_inode_dirty(handle, inode); + if (err) { + ext4_std_error(sb, err); + goto fail_free_drop; + } + + ext4_debug("allocating inode %lu\n", inode->i_ino); + trace_ext4_allocate_inode(inode, dir, mode); + goto really_out; +fail: + ext4_std_error(sb, err); +out: + iput(inode); + ret = ERR_PTR(err); +really_out: + brelse(inode_bitmap_bh); + return ret; + +fail_free_drop: + dquot_free_inode(inode); + +fail_drop: + dquot_drop(inode); + inode->i_flags |= S_NOQUOTA; + clear_nlink(inode); + unlock_new_inode(inode); + iput(inode); + brelse(inode_bitmap_bh); + return ERR_PTR(err); +} + +/* Verify that we are loading a valid orphan from disk */ +struct inode *ext4_orphan_get(struct super_block *sb, unsigned long ino) +{ + unsigned long max_ino = le32_to_cpu(EXT4_SB(sb)->s_es->s_inodes_count); + ext4_group_t block_group; + int bit; + struct buffer_head *bitmap_bh; + struct inode *inode = NULL; + long err = -EIO; + + /* Error cases - e2fsck 
has already cleaned up for us */ + if (ino > max_ino) { + ext4_warning(sb, "bad orphan ino %lu! e2fsck was run?", ino); + goto error; + } + + block_group = (ino - 1) / EXT4_INODES_PER_GROUP(sb); + bit = (ino - 1) % EXT4_INODES_PER_GROUP(sb); + bitmap_bh = ext4_read_inode_bitmap(sb, block_group); + if (!bitmap_bh) { + ext4_warning(sb, "inode bitmap error for orphan %lu", ino); + goto error; + } + + /* Having the inode bit set should be a 100% indicator that this + * is a valid orphan (no e2fsck run on fs). Orphans also include + * inodes that were being truncated, so we can't check i_nlink==0. + */ + if (!ext4_test_bit(bit, bitmap_bh->b_data)) + goto bad_orphan; + + inode = ext4_iget(sb, ino); + if (IS_ERR(inode)) + goto iget_failed; + + /* + * If the orphans has i_nlinks > 0 then it should be able to be + * truncated, otherwise it won't be removed from the orphan list + * during processing and an infinite loop will result. + */ + if (inode->i_nlink && !ext4_can_truncate(inode)) + goto bad_orphan; + + if (NEXT_ORPHAN(inode) > max_ino) + goto bad_orphan; + brelse(bitmap_bh); + return inode; + +iget_failed: + err = PTR_ERR(inode); + inode = NULL; +bad_orphan: + ext4_warning(sb, "bad orphan inode %lu! 
e2fsck was run?", ino); + printk(KERN_NOTICE "ext4_test_bit(bit=%d, block=%llu) = %d\n", + bit, (unsigned long long)bitmap_bh->b_blocknr, + ext4_test_bit(bit, bitmap_bh->b_data)); + printk(KERN_NOTICE "inode=%p\n", inode); + if (inode) { + printk(KERN_NOTICE "is_bad_inode(inode)=%d\n", + is_bad_inode(inode)); + printk(KERN_NOTICE "NEXT_ORPHAN(inode)=%u\n", + NEXT_ORPHAN(inode)); + printk(KERN_NOTICE "max_ino=%lu\n", max_ino); + printk(KERN_NOTICE "i_nlink=%u\n", inode->i_nlink); + /* Avoid freeing blocks if we got a bad deleted inode */ + if (inode->i_nlink == 0) + inode->i_blocks = 0; + iput(inode); + } + brelse(bitmap_bh); +error: + return ERR_PTR(err); +} + +unsigned long ext4_count_free_inodes(struct super_block *sb) +{ + unsigned long desc_count; + struct ext4_group_desc *gdp; + ext4_group_t i, ngroups = ext4_get_groups_count(sb); +#ifdef EXT4FS_DEBUG + struct ext4_super_block *es; + unsigned long bitmap_count, x; + struct buffer_head *bitmap_bh = NULL; + + es = EXT4_SB(sb)->s_es; + desc_count = 0; + bitmap_count = 0; + gdp = NULL; + for (i = 0; i < ngroups; i++) { + gdp = ext4_get_group_desc(sb, i, NULL); + if (!gdp) + continue; + desc_count += ext4_free_inodes_count(sb, gdp); + brelse(bitmap_bh); + bitmap_bh = ext4_read_inode_bitmap(sb, i); + if (!bitmap_bh) + continue; + + x = ext4_count_free(bitmap_bh, EXT4_INODES_PER_GROUP(sb) / 8); + printk(KERN_DEBUG "group %lu: stored = %d, counted = %lu\n", + (unsigned long) i, ext4_free_inodes_count(sb, gdp), x); + bitmap_count += x; + } + brelse(bitmap_bh); + printk(KERN_DEBUG "ext4_count_free_inodes: " + "stored = %u, computed = %lu, %lu\n", + le32_to_cpu(es->s_free_inodes_count), desc_count, bitmap_count); + return desc_count; +#else + desc_count = 0; + for (i = 0; i < ngroups; i++) { + gdp = ext4_get_group_desc(sb, i, NULL); + if (!gdp) + continue; + desc_count += ext4_free_inodes_count(sb, gdp); + cond_resched(); + } + return desc_count; +#endif +} + +/* Called at mount-time, super-block is locked */ +unsigned 
long ext4_count_dirs(struct super_block * sb) +{ + unsigned long count = 0; + ext4_group_t i, ngroups = ext4_get_groups_count(sb); + + for (i = 0; i < ngroups; i++) { + struct ext4_group_desc *gdp = ext4_get_group_desc(sb, i, NULL); + if (!gdp) + continue; + count += ext4_used_dirs_count(sb, gdp); + } + return count; +} + +/* + * Zeroes not yet zeroed inode table - just write zeroes through the whole + * inode table. Must be called without any spinlock held. The only place + * where it is called from on active part of filesystem is ext4lazyinit + * thread, so we do not need any special locks, however we have to prevent + * inode allocation from the current group, so we take alloc_sem lock, to + * block ext4_new_inode() until we are finished. + */ +int ext4_init_inode_table(struct super_block *sb, ext4_group_t group, + int barrier) +{ + struct ext4_group_info *grp = ext4_get_group_info(sb, group); + struct ext4_sb_info *sbi = EXT4_SB(sb); + struct ext4_group_desc *gdp = NULL; + struct buffer_head *group_desc_bh; + handle_t *handle; + ext4_fsblk_t blk; + int num, ret = 0, used_blks = 0; + + /* This should not happen, but just to be sure check this */ + if (sb->s_flags & MS_RDONLY) { + ret = 1; + goto out; + } + + gdp = ext4_get_group_desc(sb, group, &group_desc_bh); + if (!gdp) + goto out; + + /* + * We do not need to lock this, because we are the only one + * handling this flag. + */ + if (gdp->bg_flags & cpu_to_le16(EXT4_BG_INODE_ZEROED)) + goto out; + + handle = ext4_journal_start_sb(sb, 1); + if (IS_ERR(handle)) { + ret = PTR_ERR(handle); + goto out; + } + + down_write(&grp->alloc_sem); + /* + * If inode bitmap was already initialized there may be some + * used inodes so we need to skip blocks with used inodes in + * inode table. 
+ */ + if (!(gdp->bg_flags & cpu_to_le16(EXT4_BG_INODE_UNINIT))) + used_blks = DIV_ROUND_UP((EXT4_INODES_PER_GROUP(sb) - + ext4_itable_unused_count(sb, gdp)), + sbi->s_inodes_per_block); + + if ((used_blks < 0) || (used_blks > sbi->s_itb_per_group)) { + ext4_error(sb, "Something is wrong with group %u: " + "used itable blocks: %d; " + "itable unused count: %u", + group, used_blks, + ext4_itable_unused_count(sb, gdp)); + ret = 1; + goto err_out; + } + + blk = ext4_inode_table(sb, gdp) + used_blks; + num = sbi->s_itb_per_group - used_blks; + + BUFFER_TRACE(group_desc_bh, "get_write_access"); + ret = ext4_journal_get_write_access(handle, + group_desc_bh); + if (ret) + goto err_out; + + /* + * Skip zeroout if the inode table is full. But we set the ZEROED + * flag anyway, because obviously, when it is full it does not need + * further zeroing. + */ + if (unlikely(num == 0)) + goto skip_zeroout; + + ext4_debug("going to zero out inode table in group %d\n", + group); + ret = sb_issue_zeroout(sb, blk, num, GFP_NOFS); + if (ret < 0) + goto err_out; + if (barrier) + blkdev_issue_flush(sb->s_bdev, GFP_NOFS, NULL); + +skip_zeroout: + ext4_lock_group(sb, group); + gdp->bg_flags |= cpu_to_le16(EXT4_BG_INODE_ZEROED); + gdp->bg_checksum = ext4_group_desc_csum(sbi, group, gdp); + ext4_unlock_group(sb, group); + + BUFFER_TRACE(group_desc_bh, + "call ext4_handle_dirty_metadata"); + ret = ext4_handle_dirty_metadata(handle, NULL, + group_desc_bh); + +err_out: + up_write(&grp->alloc_sem); + ext4_journal_stop(handle); +out: + return ret; +} diff --git a/fs/ext4/indirect.c b/fs/ext4/indirect.c new file mode 100644 index 00000000..830e1b2b --- /dev/null +++ b/fs/ext4/indirect.c @@ -0,0 +1,1502 @@ +/* + * linux/fs/ext4/indirect.c + * + * from + * + * linux/fs/ext4/inode.c + * + * Copyright (C) 1992, 1993, 1994, 1995 + * Remy Card (card@masi.ibp.fr) + * Laboratoire MASI - Institut Blaise Pascal + * Universite Pierre et Marie Curie (Paris VI) + * + * from + * + * linux/fs/minix/inode.c + * + 
* Copyright (C) 1991, 1992 Linus Torvalds + * + * Goal-directed block allocation by Stephen Tweedie + * (sct@redhat.com), 1993, 1998 + */ + +#include "ext4_jbd2.h" +#include "truncate.h" + +#include + +typedef struct { + __le32 *p; + __le32 key; + struct buffer_head *bh; +} Indirect; + +static inline void add_chain(Indirect *p, struct buffer_head *bh, __le32 *v) +{ + p->key = *(p->p = v); + p->bh = bh; +} + +/** + * ext4_block_to_path - parse the block number into array of offsets + * @inode: inode in question (we are only interested in its superblock) + * @i_block: block number to be parsed + * @offsets: array to store the offsets in + * @boundary: set this non-zero if the referred-to block is likely to be + * followed (on disk) by an indirect block. + * + * To store the locations of file's data ext4 uses a data structure common + * for UNIX filesystems - tree of pointers anchored in the inode, with + * data blocks at leaves and indirect blocks in intermediate nodes. + * This function translates the block number into path in that tree - + * return value is the path length and @offsets[n] is the offset of + * pointer to (n+1)th node in the nth one. If @block is out of range + * (negative or too large) warning is printed and zero returned. + * + * Note: function doesn't find node addresses, so no IO is needed. All + * we need to know is the capacity of indirect blocks (taken from the + * inode->i_sb). + */ + +/* + * Portability note: the last comparison (check that we fit into triple + * indirect block) is spelled differently, because otherwise on an + * architecture with 32-bit longs and 8Kb pages we might get into trouble + * if our filesystem had 8Kb blocks. We might use long long, but that would + * kill us on x86. Oh, well, at least the sign propagation does not matter - + * i_block would have to be negative in the very beginning, so we would not + * get there at all. 
+ */ + +static int ext4_block_to_path(struct inode *inode, + ext4_lblk_t i_block, + ext4_lblk_t offsets[4], int *boundary) +{ + int ptrs = EXT4_ADDR_PER_BLOCK(inode->i_sb); + int ptrs_bits = EXT4_ADDR_PER_BLOCK_BITS(inode->i_sb); + const long direct_blocks = EXT4_NDIR_BLOCKS, + indirect_blocks = ptrs, + double_blocks = (1 << (ptrs_bits * 2)); + int n = 0; + int final = 0; + + if (i_block < direct_blocks) { + offsets[n++] = i_block; + final = direct_blocks; + } else if ((i_block -= direct_blocks) < indirect_blocks) { + offsets[n++] = EXT4_IND_BLOCK; + offsets[n++] = i_block; + final = ptrs; + } else if ((i_block -= indirect_blocks) < double_blocks) { + offsets[n++] = EXT4_DIND_BLOCK; + offsets[n++] = i_block >> ptrs_bits; + offsets[n++] = i_block & (ptrs - 1); + final = ptrs; + } else if (((i_block -= double_blocks) >> (ptrs_bits * 2)) < ptrs) { + offsets[n++] = EXT4_TIND_BLOCK; + offsets[n++] = i_block >> (ptrs_bits * 2); + offsets[n++] = (i_block >> ptrs_bits) & (ptrs - 1); + offsets[n++] = i_block & (ptrs - 1); + final = ptrs; + } else { + ext4_warning(inode->i_sb, "block %lu > max in inode %lu", + i_block + direct_blocks + + indirect_blocks + double_blocks, inode->i_ino); + } + if (boundary) + *boundary = final - 1 - (i_block & (ptrs - 1)); + return n; +} + +/** + * ext4_get_branch - read the chain of indirect blocks leading to data + * @inode: inode in question + * @depth: depth of the chain (1 - direct pointer, etc.) + * @offsets: offsets of pointers in inode/indirect blocks + * @chain: place to store the result + * @err: here we store the error value + * + * Function fills the array of triples and returns %NULL + * if everything went OK or the pointer to the last filled triple + * (incomplete one) otherwise. Upon the return chain[i].key contains + * the number of (i+1)-th block in the chain (as it is stored in memory, + * i.e. 
little-endian 32-bit), chain[i].p contains the address of that + * number (it points into struct inode for i==0 and into the bh->b_data + * for i>0) and chain[i].bh points to the buffer_head of i-th indirect + * block for i>0 and NULL for i==0. In other words, it holds the block + * numbers of the chain, addresses they were taken from (and where we can + * verify that chain did not change) and buffer_heads hosting these + * numbers. + * + * Function stops when it stumbles upon zero pointer (absent block) + * (pointer to last triple returned, *@err == 0) + * or when it gets an IO error reading an indirect block + * (ditto, *@err == -EIO) + * or when it reads all @depth-1 indirect blocks successfully and finds + * the whole chain, all way to the data (returns %NULL, *err == 0). + * + * Need to be called with + * down_read(&EXT4_I(inode)->i_data_sem) + */ +static Indirect *ext4_get_branch(struct inode *inode, int depth, + ext4_lblk_t *offsets, + Indirect chain[4], int *err) +{ + struct super_block *sb = inode->i_sb; + Indirect *p = chain; + struct buffer_head *bh; + + *err = 0; + /* i_data is not going away, no lock needed */ + add_chain(chain, NULL, EXT4_I(inode)->i_data + *offsets); + if (!p->key) + goto no_block; + while (--depth) { + bh = sb_getblk(sb, le32_to_cpu(p->key)); + if (unlikely(!bh)) + goto failure; + + if (!bh_uptodate_or_lock(bh)) { + if (bh_submit_read(bh) < 0) { + put_bh(bh); + goto failure; + } + /* validate block references */ + if (ext4_check_indirect_blockref(inode, bh)) { + put_bh(bh); + goto failure; + } + } + + add_chain(++p, bh, (__le32 *)bh->b_data + *++offsets); + /* Reader: end */ + if (!p->key) + goto no_block; + } + return NULL; + +failure: + *err = -EIO; +no_block: + return p; +} + +/** + * ext4_find_near - find a place for allocation with sufficient locality + * @inode: owner + * @ind: descriptor of indirect block. + * + * This function returns the preferred place for block allocation. 
+ * It is used when heuristic for sequential allocation fails. + * Rules are: + * + if there is a block to the left of our position - allocate near it. + * + if pointer will live in indirect block - allocate near that block. + * + if pointer will live in inode - allocate in the same + * cylinder group. + * + * In the latter case we colour the starting block by the callers PID to + * prevent it from clashing with concurrent allocations for a different inode + * in the same block group. The PID is used here so that functionally related + * files will be close-by on-disk. + * + * Caller must make sure that @ind is valid and will stay that way. + */ +static ext4_fsblk_t ext4_find_near(struct inode *inode, Indirect *ind) +{ + struct ext4_inode_info *ei = EXT4_I(inode); + __le32 *start = ind->bh ? (__le32 *) ind->bh->b_data : ei->i_data; + __le32 *p; + + /* Try to find previous block */ + for (p = ind->p - 1; p >= start; p--) { + if (*p) + return le32_to_cpu(*p); + } + + /* No such thing, so let's try location of indirect block */ + if (ind->bh) + return ind->bh->b_blocknr; + + /* + * It is going to be referred to from the inode itself? OK, just put it + * into the same cylinder group then. + */ + return ext4_inode_to_goal_block(inode); +} + +/** + * ext4_find_goal - find a preferred place for allocation. + * @inode: owner + * @block: block we want + * @partial: pointer to the last triple within a chain + * + * Normally this function find the preferred place for block allocation, + * returns it. + * Because this is only used for non-extent files, we limit the block nr + * to 32 bits. 
+ */ +static ext4_fsblk_t ext4_find_goal(struct inode *inode, ext4_lblk_t block, + Indirect *partial) +{ + ext4_fsblk_t goal; + + /* + * XXX need to get goal block from mballoc's data structures + */ + + goal = ext4_find_near(inode, partial); + goal = goal & EXT4_MAX_BLOCK_FILE_PHYS; + return goal; +} + +/** + * ext4_blks_to_allocate - Look up the block map and count the number + * of direct blocks need to be allocated for the given branch. + * + * @branch: chain of indirect blocks + * @k: number of blocks need for indirect blocks + * @blks: number of data blocks to be mapped. + * @blocks_to_boundary: the offset in the indirect block + * + * return the total number of blocks to be allocate, including the + * direct and indirect blocks. + */ +static int ext4_blks_to_allocate(Indirect *branch, int k, unsigned int blks, + int blocks_to_boundary) +{ + unsigned int count = 0; + + /* + * Simple case, [t,d]Indirect block(s) has not allocated yet + * then it's clear blocks on that path have not allocated + */ + if (k > 0) { + /* right now we don't handle cross boundary allocation */ + if (blks < blocks_to_boundary + 1) + count += blks; + else + count += blocks_to_boundary + 1; + return count; + } + + count++; + while (count < blks && count <= blocks_to_boundary && + le32_to_cpu(*(branch[0].p + count)) == 0) { + count++; + } + return count; +} + +/** + * ext4_alloc_blocks: multiple allocate blocks needed for a branch + * @handle: handle for this transaction + * @inode: inode which needs allocated blocks + * @iblock: the logical block to start allocated at + * @goal: preferred physical block of allocation + * @indirect_blks: the number of blocks need to allocate for indirect + * blocks + * @blks: number of desired blocks + * @new_blocks: on return it will store the new block numbers for + * the indirect blocks(if needed) and the first direct block, + * @err: on return it will store the error code + * + * This function will return the number of blocks allocated as + * 
requested by the passed-in parameters. + */ +static int ext4_alloc_blocks(handle_t *handle, struct inode *inode, + ext4_lblk_t iblock, ext4_fsblk_t goal, + int indirect_blks, int blks, + ext4_fsblk_t new_blocks[4], int *err) +{ + struct ext4_allocation_request ar; + int target, i; + unsigned long count = 0, blk_allocated = 0; + int index = 0; + ext4_fsblk_t current_block = 0; + int ret = 0; + + /* + * Here we try to allocate the requested multiple blocks at once, + * on a best-effort basis. + * To build a branch, we should allocate blocks for + * the indirect blocks(if not allocated yet), and at least + * the first direct block of this branch. That's the + * minimum number of blocks need to allocate(required) + */ + /* first we try to allocate the indirect blocks */ + target = indirect_blks; + while (target > 0) { + count = target; + /* allocating blocks for indirect blocks and direct blocks */ + current_block = ext4_new_meta_blocks(handle, inode, goal, + 0, &count, err); + if (*err) + goto failed_out; + + if (unlikely(current_block + count > EXT4_MAX_BLOCK_FILE_PHYS)) { + EXT4_ERROR_INODE(inode, + "current_block %llu + count %lu > %d!", + current_block, count, + EXT4_MAX_BLOCK_FILE_PHYS); + *err = -EIO; + goto failed_out; + } + + target -= count; + /* allocate blocks for indirect blocks */ + while (index < indirect_blks && count) { + new_blocks[index++] = current_block++; + count--; + } + if (count > 0) { + /* + * save the new block number + * for the first direct block + */ + new_blocks[index] = current_block; + printk(KERN_INFO "%s returned more blocks than " + "requested\n", __func__); + WARN_ON(1); + break; + } + } + + target = blks - count ; + blk_allocated = count; + if (!target) + goto allocated; + /* Now allocate data blocks */ + memset(&ar, 0, sizeof(ar)); + ar.inode = inode; + ar.goal = goal; + ar.len = target; + ar.logical = iblock; + if (S_ISREG(inode->i_mode)) + /* enable in-core preallocation only for regular files */ + ar.flags = EXT4_MB_HINT_DATA; 
+ + current_block = ext4_mb_new_blocks(handle, &ar, err); + if (unlikely(current_block + ar.len > EXT4_MAX_BLOCK_FILE_PHYS)) { + EXT4_ERROR_INODE(inode, + "current_block %llu + ar.len %d > %d!", + current_block, ar.len, + EXT4_MAX_BLOCK_FILE_PHYS); + *err = -EIO; + goto failed_out; + } + + if (*err && (target == blks)) { + /* + * if the allocation failed and we didn't allocate + * any blocks before + */ + goto failed_out; + } + if (!*err) { + if (target == blks) { + /* + * save the new block number + * for the first direct block + */ + new_blocks[index] = current_block; + } + blk_allocated += ar.len; + } +allocated: + /* total number of blocks allocated for direct blocks */ + ret = blk_allocated; + *err = 0; + return ret; +failed_out: + for (i = 0; i < index; i++) + ext4_free_blocks(handle, inode, NULL, new_blocks[i], 1, 0); + return ret; +} + +/** + * ext4_alloc_branch - allocate and set up a chain of blocks. + * @handle: handle for this transaction + * @inode: owner + * @indirect_blks: number of allocated indirect blocks + * @blks: number of allocated direct blocks + * @goal: preferred place for allocation + * @offsets: offsets (in the blocks) to store the pointers to next. + * @branch: place to store the chain in. + * + * This function allocates blocks, zeroes out all but the last one, + * links them into chain and (if we are synchronous) writes them to disk. + * In other words, it prepares a branch that can be spliced onto the + * inode. It stores the information about that chain in the branch[], in + * the same format as ext4_get_branch() would do. We are calling it after + * we had read the existing part of chain and partial points to the last + * triple of that (one with zero ->key). 
Upon the exit we have the same + * picture as after the successful ext4_get_block(), except that in one + * place chain is disconnected - *branch->p is still zero (we did not + * set the last link), but branch->key contains the number that should + * be placed into *branch->p to fill that gap. + * + * If allocation fails we free all blocks we've allocated (and forget + * their buffer_heads) and return the error value the from failed + * ext4_alloc_block() (normally -ENOSPC). Otherwise we set the chain + * as described above and return 0. + */ +static int ext4_alloc_branch(handle_t *handle, struct inode *inode, + ext4_lblk_t iblock, int indirect_blks, + int *blks, ext4_fsblk_t goal, + ext4_lblk_t *offsets, Indirect *branch) +{ + int blocksize = inode->i_sb->s_blocksize; + int i, n = 0; + int err = 0; + struct buffer_head *bh; + int num; + ext4_fsblk_t new_blocks[4]; + ext4_fsblk_t current_block; + + num = ext4_alloc_blocks(handle, inode, iblock, goal, indirect_blks, + *blks, new_blocks, &err); + if (err) + return err; + + branch[0].key = cpu_to_le32(new_blocks[0]); + /* + * metadata blocks and data blocks are allocated. + */ + for (n = 1; n <= indirect_blks; n++) { + /* + * Get buffer_head for parent block, zero it out + * and set the pointer to new one, then send + * parent to disk. 
+ */ + bh = sb_getblk(inode->i_sb, new_blocks[n-1]); + if (unlikely(!bh)) { + err = -EIO; + goto failed; + } + + branch[n].bh = bh; + lock_buffer(bh); + BUFFER_TRACE(bh, "call get_create_access"); + err = ext4_journal_get_create_access(handle, bh); + if (err) { + /* Don't brelse(bh) here; it's done in + * ext4_journal_forget() below */ + unlock_buffer(bh); + goto failed; + } + + memset(bh->b_data, 0, blocksize); + branch[n].p = (__le32 *) bh->b_data + offsets[n]; + branch[n].key = cpu_to_le32(new_blocks[n]); + *branch[n].p = branch[n].key; + if (n == indirect_blks) { + current_block = new_blocks[n]; + /* + * End of chain, update the last new metablock of + * the chain to point to the new allocated + * data blocks numbers + */ + for (i = 1; i < num; i++) + *(branch[n].p + i) = cpu_to_le32(++current_block); + } + BUFFER_TRACE(bh, "marking uptodate"); + set_buffer_uptodate(bh); + unlock_buffer(bh); + + BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata"); + err = ext4_handle_dirty_metadata(handle, inode, bh); + if (err) + goto failed; + } + *blks = num; + return err; +failed: + /* Allocation failed, free what we already allocated */ + ext4_free_blocks(handle, inode, NULL, new_blocks[0], 1, 0); + for (i = 1; i <= n ; i++) { + /* + * branch[i].bh is newly allocated, so there is no + * need to revoke the block, which is why we don't + * need to set EXT4_FREE_BLOCKS_METADATA. + */ + ext4_free_blocks(handle, inode, NULL, new_blocks[i], 1, + EXT4_FREE_BLOCKS_FORGET); + } + for (i = n+1; i < indirect_blks; i++) + ext4_free_blocks(handle, inode, NULL, new_blocks[i], 1, 0); + + ext4_free_blocks(handle, inode, NULL, new_blocks[i], num, 0); + + return err; +} + +/** + * ext4_splice_branch - splice the allocated branch onto inode. 
+ * @handle: handle for this transaction + * @inode: owner + * @block: (logical) number of block we are adding + * @chain: chain of indirect blocks (with a missing link - see + * ext4_alloc_branch) + * @where: location of missing link + * @num: number of indirect blocks we are adding + * @blks: number of direct blocks we are adding + * + * This function fills the missing link and does all housekeeping needed in + * inode (->i_blocks, etc.). In case of success we end up with the full + * chain to new block and return 0. + */ +static int ext4_splice_branch(handle_t *handle, struct inode *inode, + ext4_lblk_t block, Indirect *where, int num, + int blks) +{ + int i; + int err = 0; + ext4_fsblk_t current_block; + + /* + * If we're splicing into a [td]indirect block (as opposed to the + * inode) then we need to get write access to the [td]indirect block + * before the splice. + */ + if (where->bh) { + BUFFER_TRACE(where->bh, "get_write_access"); + err = ext4_journal_get_write_access(handle, where->bh); + if (err) + goto err_out; + } + /* That's it */ + + *where->p = where->key; + + /* + * Update the host buffer_head or inode to point to more just allocated + * direct blocks blocks + */ + if (num == 0 && blks > 1) { + current_block = le32_to_cpu(where->key) + 1; + for (i = 1; i < blks; i++) + *(where->p + i) = cpu_to_le32(current_block++); + } + + /* We are done with atomic stuff, now do the rest of housekeeping */ + /* had we spliced it onto indirect block? */ + if (where->bh) { + /* + * If we spliced it onto an indirect block, we haven't + * altered the inode. Note however that if it is being spliced + * onto an indirect block at the very end of the file (the + * file is growing) then we *will* alter the inode to reflect + * the new i_size. But that is not done here - it is done in + * generic_commit_write->__mark_inode_dirty->ext4_dirty_inode. 
+ */ + jbd_debug(5, "splicing indirect only\n"); + BUFFER_TRACE(where->bh, "call ext4_handle_dirty_metadata"); + err = ext4_handle_dirty_metadata(handle, inode, where->bh); + if (err) + goto err_out; + } else { + /* + * OK, we spliced it into the inode itself on a direct block. + */ + ext4_mark_inode_dirty(handle, inode); + jbd_debug(5, "splicing direct\n"); + } + return err; + +err_out: + for (i = 1; i <= num; i++) { + /* + * branch[i].bh is newly allocated, so there is no + * need to revoke the block, which is why we don't + * need to set EXT4_FREE_BLOCKS_METADATA. + */ + ext4_free_blocks(handle, inode, where[i].bh, 0, 1, + EXT4_FREE_BLOCKS_FORGET); + } + ext4_free_blocks(handle, inode, NULL, le32_to_cpu(where[num].key), + blks, 0); + + return err; +} + +/* + * The ext4_ind_map_blocks() function handles non-extents inodes + * (i.e., using the traditional indirect/double-indirect i_blocks + * scheme) for ext4_map_blocks(). + * + * Allocation strategy is simple: if we have to allocate something, we will + * have to go the whole way to leaf. So let's do it before attaching anything + * to tree, set linkage between the newborn blocks, write them if sync is + * required, recheck the path, free and repeat if check fails, otherwise + * set the last missing link (that will protect us from any truncate-generated + * removals - all blocks on the path are immune now) and possibly force the + * write on the parent block. + * That has a nice additional property: no special recovery from the failed + * allocations is needed - we simply release blocks and do not touch anything + * reachable from inode. + * + * `handle' can be NULL if create == 0. + * + * return > 0, # of blocks mapped or allocated. + * return = 0, if plain lookup failed. + * return < 0, error case. 
+ * + * The ext4_ind_get_blocks() function should be called with + * down_write(&EXT4_I(inode)->i_data_sem) if allocating filesystem + * blocks (i.e., flags has EXT4_GET_BLOCKS_CREATE set) or + * down_read(&EXT4_I(inode)->i_data_sem) if not allocating file system + * blocks. + */ +int ext4_ind_map_blocks(handle_t *handle, struct inode *inode, + struct ext4_map_blocks *map, + int flags) +{ + int err = -EIO; + ext4_lblk_t offsets[4]; + Indirect chain[4]; + Indirect *partial; + ext4_fsblk_t goal; + int indirect_blks; + int blocks_to_boundary = 0; + int depth; + int count = 0; + ext4_fsblk_t first_block = 0; + + trace_ext4_ind_map_blocks_enter(inode, map->m_lblk, map->m_len, flags); + J_ASSERT(!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))); + J_ASSERT(handle != NULL || (flags & EXT4_GET_BLOCKS_CREATE) == 0); + depth = ext4_block_to_path(inode, map->m_lblk, offsets, + &blocks_to_boundary); + + if (depth == 0) + goto out; + + partial = ext4_get_branch(inode, depth, offsets, chain, &err); + + /* Simplest case - block found, no allocation needed */ + if (!partial) { + first_block = le32_to_cpu(chain[depth - 1].key); + count++; + /*map more blocks*/ + while (count < map->m_len && count <= blocks_to_boundary) { + ext4_fsblk_t blk; + + blk = le32_to_cpu(*(chain[depth-1].p + count)); + + if (blk == first_block + count) + count++; + else + break; + } + goto got_it; + } + + /* Next simple case - plain lookup or failed read of indirect block */ + if ((flags & EXT4_GET_BLOCKS_CREATE) == 0 || err == -EIO) + goto cleanup; + + /* + * Okay, we need to do block allocation. 
+ */ + if (EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb, + EXT4_FEATURE_RO_COMPAT_BIGALLOC)) { + EXT4_ERROR_INODE(inode, "Can't allocate blocks for " + "non-extent mapped inodes with bigalloc"); + return -ENOSPC; + } + + goal = ext4_find_goal(inode, map->m_lblk, partial); + + /* the number of blocks need to allocate for [d,t]indirect blocks */ + indirect_blks = (chain + depth) - partial - 1; + + /* + * Next look up the indirect map to count the totoal number of + * direct blocks to allocate for this branch. + */ + count = ext4_blks_to_allocate(partial, indirect_blks, + map->m_len, blocks_to_boundary); + /* + * Block out ext4_truncate while we alter the tree + */ + err = ext4_alloc_branch(handle, inode, map->m_lblk, indirect_blks, + &count, goal, + offsets + (partial - chain), partial); + + /* + * The ext4_splice_branch call will free and forget any buffers + * on the new chain if there is a failure, but that risks using + * up transaction credits, especially for bitmaps where the + * credits cannot be returned. Can we handle this somehow? We + * may need to return -EAGAIN upwards in the worst case. --sct + */ + if (!err) + err = ext4_splice_branch(handle, inode, map->m_lblk, + partial, indirect_blks, count); + if (err) + goto cleanup; + + map->m_flags |= EXT4_MAP_NEW; + + ext4_update_inode_fsync_trans(handle, inode, 1); +got_it: + map->m_flags |= EXT4_MAP_MAPPED; + map->m_pblk = le32_to_cpu(chain[depth-1].key); + map->m_len = count; + if (count > blocks_to_boundary) + map->m_flags |= EXT4_MAP_BOUNDARY; + err = count; + /* Clean up and exit */ + partial = chain + depth - 1; /* the whole chain */ +cleanup: + while (partial > chain) { + BUFFER_TRACE(partial->bh, "call brelse"); + brelse(partial->bh); + partial--; + } +out: + trace_ext4_ind_map_blocks_exit(inode, map->m_lblk, + map->m_pblk, map->m_len, err); + return err; +} + +/* + * O_DIRECT for ext3 (or indirect map) based files + * + * If the O_DIRECT write will extend the file then add this inode to the + * orphan list. 
So recovery will truncate it back to the original size + * if the machine crashes during the write. + * + * If the O_DIRECT write is intantiating holes inside i_size and the machine + * crashes then stale disk data _may_ be exposed inside the file. But current + * VFS code falls back into buffered path in that case so we are safe. + */ +ssize_t ext4_ind_direct_IO(int rw, struct kiocb *iocb, + const struct iovec *iov, loff_t offset, + unsigned long nr_segs) +{ + struct file *file = iocb->ki_filp; + struct inode *inode = file->f_mapping->host; + struct ext4_inode_info *ei = EXT4_I(inode); + handle_t *handle; + ssize_t ret; + int orphan = 0; + size_t count = iov_length(iov, nr_segs); + int retries = 0; + + if (rw == WRITE) { + loff_t final_size = offset + count; + + if (final_size > inode->i_size) { + /* Credits for sb + inode write */ + handle = ext4_journal_start(inode, 2); + if (IS_ERR(handle)) { + ret = PTR_ERR(handle); + goto out; + } + ret = ext4_orphan_add(handle, inode); + if (ret) { + ext4_journal_stop(handle); + goto out; + } + orphan = 1; + ei->i_disksize = inode->i_size; + ext4_journal_stop(handle); + } + } + +retry: + if (rw == READ && ext4_should_dioread_nolock(inode)) { + if (unlikely(!list_empty(&ei->i_completed_io_list))) { + mutex_lock(&inode->i_mutex); + ext4_flush_completed_IO(inode); + mutex_unlock(&inode->i_mutex); + } + ret = __blockdev_direct_IO(rw, iocb, inode, + inode->i_sb->s_bdev, iov, + offset, nr_segs, + ext4_get_block, NULL, NULL, 0); + } else { + ret = blockdev_direct_IO(rw, iocb, inode, iov, + offset, nr_segs, ext4_get_block); + + if (unlikely((rw & WRITE) && ret < 0)) { + loff_t isize = i_size_read(inode); + loff_t end = offset + iov_length(iov, nr_segs); + + if (end > isize) + ext4_truncate_failed_write(inode); + } + } + if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries)) + goto retry; + + if (orphan) { + int err; + + /* Credits for sb + inode write */ + handle = ext4_journal_start(inode, 2); + if (IS_ERR(handle)) { 
+ /* This is really bad luck. We've written the data + * but cannot extend i_size. Bail out and pretend + * the write failed... */ + ret = PTR_ERR(handle); + if (inode->i_nlink) + ext4_orphan_del(NULL, inode); + + goto out; + } + if (inode->i_nlink) + ext4_orphan_del(handle, inode); + if (ret > 0) { + loff_t end = offset + ret; + if (end > inode->i_size) { + ei->i_disksize = end; + i_size_write(inode, end); + /* + * We're going to return a positive `ret' + * here due to non-zero-length I/O, so there's + * no way of reporting error returns from + * ext4_mark_inode_dirty() to userspace. So + * ignore it. + */ + ext4_mark_inode_dirty(handle, inode); + } + } + err = ext4_journal_stop(handle); + if (ret == 0) + ret = err; + } +out: + return ret; +} + +/* + * Calculate the number of metadata blocks need to reserve + * to allocate a new block at @lblocks for non extent file based file + */ +int ext4_ind_calc_metadata_amount(struct inode *inode, sector_t lblock) +{ + struct ext4_inode_info *ei = EXT4_I(inode); + sector_t dind_mask = ~((sector_t)EXT4_ADDR_PER_BLOCK(inode->i_sb) - 1); + int blk_bits; + + if (lblock < EXT4_NDIR_BLOCKS) + return 0; + + lblock -= EXT4_NDIR_BLOCKS; + + if (ei->i_da_metadata_calc_len && + (lblock & dind_mask) == ei->i_da_metadata_calc_last_lblock) { + ei->i_da_metadata_calc_len++; + return 0; + } + ei->i_da_metadata_calc_last_lblock = lblock & dind_mask; + ei->i_da_metadata_calc_len = 1; + blk_bits = order_base_2(lblock); + return (blk_bits / EXT4_ADDR_PER_BLOCK_BITS(inode->i_sb)) + 1; +} + +int ext4_ind_trans_blocks(struct inode *inode, int nrblocks, int chunk) +{ + int indirects; + + /* if nrblocks are contiguous */ + if (chunk) { + /* + * With N contiguous data blocks, we need at most + * N/EXT4_ADDR_PER_BLOCK(inode->i_sb) + 1 indirect blocks, + * 2 dindirect blocks, and 1 tindirect block + */ + return DIV_ROUND_UP(nrblocks, + EXT4_ADDR_PER_BLOCK(inode->i_sb)) + 4; + } + /* + * if nrblocks are not contiguous, worse case, each block touch + * a 
indirect block, and each indirect block touch a double indirect + * block, plus a triple indirect block + */ + indirects = nrblocks * 2 + 1; + return indirects; +} + +/* + * Truncate transactions can be complex and absolutely huge. So we need to + * be able to restart the transaction at a conventient checkpoint to make + * sure we don't overflow the journal. + * + * start_transaction gets us a new handle for a truncate transaction, + * and extend_transaction tries to extend the existing one a bit. If + * extend fails, we need to propagate the failure up and restart the + * transaction in the top-level truncate loop. --sct + */ +static handle_t *start_transaction(struct inode *inode) +{ + handle_t *result; + + result = ext4_journal_start(inode, ext4_blocks_for_truncate(inode)); + if (!IS_ERR(result)) + return result; + + ext4_std_error(inode->i_sb, PTR_ERR(result)); + return result; +} + +/* + * Try to extend this transaction for the purposes of truncation. + * + * Returns 0 if we managed to create more room. If we can't create more + * room, and the transaction must be restarted we return 1. + */ +static int try_to_extend_transaction(handle_t *handle, struct inode *inode) +{ + if (!ext4_handle_valid(handle)) + return 0; + if (ext4_handle_has_enough_credits(handle, EXT4_RESERVE_TRANS_BLOCKS+1)) + return 0; + if (!ext4_journal_extend(handle, ext4_blocks_for_truncate(inode))) + return 0; + return 1; +} + +/* + * Probably it should be a library function... search for first non-zero word + * or memcmp with zero_page, whatever is better for particular architecture. + * Linus? + */ +static inline int all_zeroes(__le32 *p, __le32 *q) +{ + while (p < q) + if (*p++) + return 0; + return 1; +} + +/** + * ext4_find_shared - find the indirect blocks for partial truncation. 
+ * @inode: inode in question + * @depth: depth of the affected branch + * @offsets: offsets of pointers in that branch (see ext4_block_to_path) + * @chain: place to store the pointers to partial indirect blocks + * @top: place to store the (detached) top of branch + * + * This is a helper function used by ext4_truncate(). + * + * When we do truncate() we may have to clean the ends of several + * indirect blocks but leave the blocks themselves alive. Block is + * partially truncated if some data below the new i_size is referred + * from it (and it is on the path to the first completely truncated + * data block, indeed). We have to free the top of that path along + * with everything to the right of the path. Since no allocation + * past the truncation point is possible until ext4_truncate() + * finishes, we may safely do the latter, but top of branch may + * require special attention - pageout below the truncation point + * might try to populate it. + * + * We atomically detach the top of branch from the tree, store the + * block number of its root in *@top, pointers to buffer_heads of + * partially truncated blocks - in @chain[].bh and pointers to + * their last elements that should not be removed - in + * @chain[].p. Return value is the pointer to last filled element + * of @chain. + * + * The work left to caller to do the actual freeing of subtrees: + * a) free the subtree starting from *@top + * b) free the subtrees whose roots are stored in + * (@chain[i].p+1 .. end of @chain[i].bh->b_data) + * c) free the subtrees growing from the inode past the @chain[0]. + * (no partially truncated stuff there).
*/ + +static Indirect *ext4_find_shared(struct inode *inode, int depth, + ext4_lblk_t offsets[4], Indirect chain[4], + __le32 *top) +{ + Indirect *partial, *p; + int k, err; + + *top = 0; + /* Make k index the deepest non-null offset + 1 */ + for (k = depth; k > 1 && !offsets[k-1]; k--) + ; + partial = ext4_get_branch(inode, k, offsets, chain, &err); + /* Writer: pointers */ + if (!partial) + partial = chain + k-1; + /* + * If the branch acquired continuation since we've looked at it - + * fine, it should all survive and (new) top doesn't belong to us. + */ + if (!partial->key && *partial->p) + /* Writer: end */ + goto no_top; + for (p = partial; (p > chain) && all_zeroes((__le32 *) p->bh->b_data, p->p); p--) + ; + /* + * OK, we've found the last block that must survive. The rest of our + * branch should be detached before unlocking. However, if that rest + * of branch is all ours and does not grow immediately from the inode + * it's easier to cheat and just decrement partial->p. + */ + if (p == chain + k - 1 && p > chain) { + p->p--; + } else { + *top = *p->p; + /* Nope, don't do this in ext4. Must leave the tree intact */ +#if 0 + *p->p = 0; +#endif + } + /* Writer: end */ + + while (partial > p) { + brelse(partial->bh); + partial--; + } +no_top: + return partial; +} + +/* + * Zero a number of block pointers in either an inode or an indirect block. + * If we restart the transaction we must again get write access to the + * indirect block for further modification. + * + * We release `count' blocks on disk, but (last - first) may be greater + * than `count' because there can be holes in there. + * + * Return 0 on success, 1 on invalid block range + * and < 0 on fatal error. 
+ */ +static int ext4_clear_blocks(handle_t *handle, struct inode *inode, + struct buffer_head *bh, + ext4_fsblk_t block_to_free, + unsigned long count, __le32 *first, + __le32 *last) +{ + __le32 *p; + int flags = EXT4_FREE_BLOCKS_FORGET | EXT4_FREE_BLOCKS_VALIDATED; + int err; + + if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode)) + flags |= EXT4_FREE_BLOCKS_METADATA; + + if (!ext4_data_block_valid(EXT4_SB(inode->i_sb), block_to_free, + count)) { + EXT4_ERROR_INODE(inode, "attempt to clear invalid " + "blocks %llu len %lu", + (unsigned long long) block_to_free, count); + return 1; + } + + if (try_to_extend_transaction(handle, inode)) { + if (bh) { + BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata"); + err = ext4_handle_dirty_metadata(handle, inode, bh); + if (unlikely(err)) + goto out_err; + } + err = ext4_mark_inode_dirty(handle, inode); + if (unlikely(err)) + goto out_err; + err = ext4_truncate_restart_trans(handle, inode, + ext4_blocks_for_truncate(inode)); + if (unlikely(err)) + goto out_err; + if (bh) { + BUFFER_TRACE(bh, "retaking write access"); + err = ext4_journal_get_write_access(handle, bh); + if (unlikely(err)) + goto out_err; + } + } + + for (p = first; p < last; p++) + *p = 0; + + ext4_free_blocks(handle, inode, NULL, block_to_free, count, flags); + return 0; +out_err: + ext4_std_error(inode->i_sb, err); + return err; +} + +/** + * ext4_free_data - free a list of data blocks + * @handle: handle for this transaction + * @inode: inode we are dealing with + * @this_bh: indirect buffer_head which contains *@first and *@last + * @first: array of block numbers + * @last: points immediately past the end of array + * + * We are freeing all blocks referred from that array (numbers are stored as + * little-endian 32-bit) and updating @inode->i_blocks appropriately. + * + * We accumulate contiguous runs of blocks to free. 
Conveniently, if these + * blocks are contiguous then releasing them at one time will only affect one + * or two bitmap blocks (+ group descriptor(s) and superblock) and we won't + * actually use a lot of journal space. + * + * @this_bh will be %NULL if @first and @last point into the inode's direct + * block pointers. + */ +static void ext4_free_data(handle_t *handle, struct inode *inode, + struct buffer_head *this_bh, + __le32 *first, __le32 *last) +{ + ext4_fsblk_t block_to_free = 0; /* Starting block # of a run */ + unsigned long count = 0; /* Number of blocks in the run */ + __le32 *block_to_free_p = NULL; /* Pointer into inode/ind + corresponding to + block_to_free */ + ext4_fsblk_t nr; /* Current block # */ + __le32 *p; /* Pointer into inode/ind + for current block */ + int err = 0; + + if (this_bh) { /* For indirect block */ + BUFFER_TRACE(this_bh, "get_write_access"); + err = ext4_journal_get_write_access(handle, this_bh); + /* Important: if we can't update the indirect pointers + * to the blocks, we can't free them. */ + if (err) + return; + } + + for (p = first; p < last; p++) { + nr = le32_to_cpu(*p); + if (nr) { + /* accumulate blocks to free if they're contiguous */ + if (count == 0) { + block_to_free = nr; + block_to_free_p = p; + count = 1; + } else if (nr == block_to_free + count) { + count++; + } else { + err = ext4_clear_blocks(handle, inode, this_bh, + block_to_free, count, + block_to_free_p, p); + if (err) + break; + block_to_free = nr; + block_to_free_p = p; + count = 1; + } + } + } + + if (!err && count > 0) + err = ext4_clear_blocks(handle, inode, this_bh, block_to_free, + count, block_to_free_p, p); + if (err < 0) + /* fatal error */ + return; + + if (this_bh) { + BUFFER_TRACE(this_bh, "call ext4_handle_dirty_metadata"); + + /* + * The buffer head should have an attached journal head at this + * point. However, if the data is corrupted and an indirect + * block pointed to itself, it would have been detached when + * the block was cleared. 
Check for this instead of OOPSing. + */ + if ((EXT4_JOURNAL(inode) == NULL) || bh2jh(this_bh)) + ext4_handle_dirty_metadata(handle, inode, this_bh); + else + EXT4_ERROR_INODE(inode, + "circular indirect block detected at " + "block %llu", + (unsigned long long) this_bh->b_blocknr); + } +} + +/** + * ext4_free_branches - free an array of branches + * @handle: JBD handle for this transaction + * @inode: inode we are dealing with + * @parent_bh: the buffer_head which contains *@first and *@last + * @first: array of block numbers + * @last: pointer immediately past the end of array + * @depth: depth of the branches to free + * + * We are freeing all blocks referred from these branches (numbers are + * stored as little-endian 32-bit) and updating @inode->i_blocks + * appropriately. + */ +static void ext4_free_branches(handle_t *handle, struct inode *inode, + struct buffer_head *parent_bh, + __le32 *first, __le32 *last, int depth) +{ + ext4_fsblk_t nr; + __le32 *p; + + if (ext4_handle_is_aborted(handle)) + return; + + if (depth--) { + struct buffer_head *bh; + int addr_per_block = EXT4_ADDR_PER_BLOCK(inode->i_sb); + p = last; + while (--p >= first) { + nr = le32_to_cpu(*p); + if (!nr) + continue; /* A hole */ + + if (!ext4_data_block_valid(EXT4_SB(inode->i_sb), + nr, 1)) { + EXT4_ERROR_INODE(inode, + "invalid indirect mapped " + "block %lu (level %d)", + (unsigned long) nr, depth); + break; + } + + /* Go read the buffer for the next level down */ + bh = sb_bread(inode->i_sb, nr); + + /* + * A read failure? Report error and clear slot + * (should be rare). + */ + if (!bh) { + EXT4_ERROR_INODE_BLOCK(inode, nr, + "Read failure"); + continue; + } + + /* This zaps the entire block. Bottom up. */ + BUFFER_TRACE(bh, "free child branches"); + ext4_free_branches(handle, inode, bh, + (__le32 *) bh->b_data, + (__le32 *) bh->b_data + addr_per_block, + depth); + brelse(bh); + + /* + * Everything below this pointer has been + * released. Now let this top-of-subtree go.
+ * + * We want the freeing of this indirect block to be + * atomic in the journal with the updating of the + * bitmap block which owns it. So make some room in + * the journal. + * + * We zero the parent pointer *after* freeing its + * pointee in the bitmaps, so if extend_transaction() + * for some reason fails to put the bitmap changes and + * the release into the same transaction, recovery + * will merely complain about releasing a free block, + * rather than leaking blocks. + */ + if (ext4_handle_is_aborted(handle)) + return; + if (try_to_extend_transaction(handle, inode)) { + ext4_mark_inode_dirty(handle, inode); + ext4_truncate_restart_trans(handle, inode, + ext4_blocks_for_truncate(inode)); + } + + /* + * The forget flag here is critical because if + * we are journaling (and not doing data + * journaling), we have to make sure a revoke + * record is written to prevent the journal + * replay from overwriting the (former) + * indirect block if it gets reallocated as a + * data block. This must happen in the same + * transaction where the data blocks are + * actually freed. + */ + ext4_free_blocks(handle, inode, NULL, nr, 1, + EXT4_FREE_BLOCKS_METADATA| + EXT4_FREE_BLOCKS_FORGET); + + if (parent_bh) { + /* + * The block which we have just freed is + * pointed to by an indirect block: journal it + */ + BUFFER_TRACE(parent_bh, "get_write_access"); + if (!ext4_journal_get_write_access(handle, + parent_bh)){ + *p = 0; + BUFFER_TRACE(parent_bh, + "call ext4_handle_dirty_metadata"); + ext4_handle_dirty_metadata(handle, + inode, + parent_bh); + } + } + } + } else { + /* We have reached the bottom of the tree. 
*/ + BUFFER_TRACE(parent_bh, "free data blocks"); + ext4_free_data(handle, inode, parent_bh, first, last); + } +} + +void ext4_ind_truncate(struct inode *inode) +{ + handle_t *handle; + struct ext4_inode_info *ei = EXT4_I(inode); + __le32 *i_data = ei->i_data; + int addr_per_block = EXT4_ADDR_PER_BLOCK(inode->i_sb); + struct address_space *mapping = inode->i_mapping; + ext4_lblk_t offsets[4]; + Indirect chain[4]; + Indirect *partial; + __le32 nr = 0; + int n = 0; + ext4_lblk_t last_block, max_block; + loff_t page_len; + unsigned blocksize = inode->i_sb->s_blocksize; + int err; + + handle = start_transaction(inode); + if (IS_ERR(handle)) + return; /* AKPM: return what? */ + + last_block = (inode->i_size + blocksize-1) + >> EXT4_BLOCK_SIZE_BITS(inode->i_sb); + max_block = (EXT4_SB(inode->i_sb)->s_bitmap_maxbytes + blocksize-1) + >> EXT4_BLOCK_SIZE_BITS(inode->i_sb); + + if (inode->i_size % PAGE_CACHE_SIZE != 0) { + page_len = PAGE_CACHE_SIZE - + (inode->i_size & (PAGE_CACHE_SIZE - 1)); + + err = ext4_discard_partial_page_buffers(handle, + mapping, inode->i_size, page_len, 0); + + if (err) + goto out_stop; + } + + if (last_block != max_block) { + n = ext4_block_to_path(inode, last_block, offsets, NULL); + if (n == 0) + goto out_stop; /* error */ + } + + /* + * OK. This truncate is going to happen. We add the inode to the + * orphan list, so that if this truncate spans multiple transactions, + * and we crash, we will resume the truncate when the filesystem + * recovers. It also marks the inode dirty, to catch the new size. + * + * Implication: the file must always be in a sane, consistent + * truncatable state while each transaction commits. + */ + if (ext4_orphan_add(handle, inode)) + goto out_stop; + + /* + * From here we block out all ext4_get_block() callers who want to + * modify the block allocation tree. 
+ */ + down_write(&ei->i_data_sem); + + ext4_discard_preallocations(inode); + + /* + * The orphan list entry will now protect us from any crash which + * occurs before the truncate completes, so it is now safe to propagate + * the new, shorter inode size (held for now in i_size) into the + * on-disk inode. We do this via i_disksize, which is the value which + * ext4 *really* writes onto the disk inode. + */ + ei->i_disksize = inode->i_size; + + if (last_block == max_block) { + /* + * It is unnecessary to free any data blocks if last_block is + * equal to the indirect block limit. + */ + goto out_unlock; + } else if (n == 1) { /* direct blocks */ + ext4_free_data(handle, inode, NULL, i_data+offsets[0], + i_data + EXT4_NDIR_BLOCKS); + goto do_indirects; + } + + partial = ext4_find_shared(inode, n, offsets, chain, &nr); + /* Kill the top of shared branch (not detached) */ + if (nr) { + if (partial == chain) { + /* Shared branch grows from the inode */ + ext4_free_branches(handle, inode, NULL, + &nr, &nr+1, (chain+n-1) - partial); + *partial->p = 0; + /* + * We mark the inode dirty prior to restart, + * and prior to stop. No need for it here. 
+ */ + } else { + /* Shared branch grows from an indirect block */ + BUFFER_TRACE(partial->bh, "get_write_access"); + ext4_free_branches(handle, inode, partial->bh, + partial->p, + partial->p+1, (chain+n-1) - partial); + } + } + /* Clear the ends of indirect blocks on the shared branch */ + while (partial > chain) { + ext4_free_branches(handle, inode, partial->bh, partial->p + 1, + (__le32*)partial->bh->b_data+addr_per_block, + (chain+n-1) - partial); + BUFFER_TRACE(partial->bh, "call brelse"); + brelse(partial->bh); + partial--; + } +do_indirects: + /* Kill the remaining (whole) subtrees */ + switch (offsets[0]) { + default: + nr = i_data[EXT4_IND_BLOCK]; + if (nr) { + ext4_free_branches(handle, inode, NULL, &nr, &nr+1, 1); + i_data[EXT4_IND_BLOCK] = 0; + } + case EXT4_IND_BLOCK: + nr = i_data[EXT4_DIND_BLOCK]; + if (nr) { + ext4_free_branches(handle, inode, NULL, &nr, &nr+1, 2); + i_data[EXT4_DIND_BLOCK] = 0; + } + case EXT4_DIND_BLOCK: + nr = i_data[EXT4_TIND_BLOCK]; + if (nr) { + ext4_free_branches(handle, inode, NULL, &nr, &nr+1, 3); + i_data[EXT4_TIND_BLOCK] = 0; + } + case EXT4_TIND_BLOCK: + ; + } + +out_unlock: + up_write(&ei->i_data_sem); + inode->i_mtime = inode->i_ctime = ext4_current_time(inode); + ext4_mark_inode_dirty(handle, inode); + + /* + * In a multi-transaction truncate, we only make the final transaction + * synchronous + */ + if (IS_SYNC(inode)) + ext4_handle_sync(handle); +out_stop: + /* + * If this was a simple ftruncate(), and the file will remain alive + * then we need to clear up the orphan record which we created above. + * However, if this was a real unlink then we were called by + * ext4_delete_inode(), and we allow that function to clean up the + * orphan info for us. 
+ */ + if (inode->i_nlink) + ext4_orphan_del(handle, inode); + + ext4_journal_stop(handle); + trace_ext4_truncate_exit(inode); +} + diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c new file mode 100644 index 00000000..c77b0bd2 --- /dev/null +++ b/fs/ext4/inode.c @@ -0,0 +1,4676 @@ +/* + * linux/fs/ext4/inode.c + * + * Copyright (C) 1992, 1993, 1994, 1995 + * Remy Card (card@masi.ibp.fr) + * Laboratoire MASI - Institut Blaise Pascal + * Universite Pierre et Marie Curie (Paris VI) + * + * from + * + * linux/fs/minix/inode.c + * + * Copyright (C) 1991, 1992 Linus Torvalds + * + * 64-bit file support on 64-bit platforms by Jakub Jelinek + * (jj@sunsite.ms.mff.cuni.cz) + * + * Assorted race fixes, rewrite of ext4_get_block() by Al Viro, 2000 + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "ext4_jbd2.h" +#include "xattr.h" +#include "acl.h" +#include "truncate.h" + +#include + +#define MPAGE_DA_EXTENT_TAIL 0x01 + +static inline int ext4_begin_ordered_truncate(struct inode *inode, + loff_t new_size) +{ + trace_ext4_begin_ordered_truncate(inode, new_size); + /* + * If jinode is zero, then we never opened the file for + * writing, so there's no need to call + * jbd2_journal_begin_ordered_truncate() since there's no + * outstanding writes we need to flush. 
+ */ + if (!EXT4_I(inode)->jinode) + return 0; + return jbd2_journal_begin_ordered_truncate(EXT4_JOURNAL(inode), + EXT4_I(inode)->jinode, + new_size); +} + +static void ext4_invalidatepage(struct page *page, unsigned long offset); +static int noalloc_get_block_write(struct inode *inode, sector_t iblock, + struct buffer_head *bh_result, int create); +static int ext4_set_bh_endio(struct buffer_head *bh, struct inode *inode); +static void ext4_end_io_buffer_write(struct buffer_head *bh, int uptodate); +static int __ext4_journalled_writepage(struct page *page, unsigned int len); +static int ext4_bh_delay_or_unwritten(handle_t *handle, struct buffer_head *bh); +static int ext4_discard_partial_page_buffers_no_lock(handle_t *handle, + struct inode *inode, struct page *page, loff_t from, + loff_t length, int flags); + +/* + * Test whether an inode is a fast symlink. + */ +static int ext4_inode_is_fast_symlink(struct inode *inode) +{ + int ea_blocks = EXT4_I(inode)->i_file_acl ? + (inode->i_sb->s_blocksize >> 9) : 0; + + return (S_ISLNK(inode->i_mode) && inode->i_blocks - ea_blocks == 0); +} + +/* + * Restart the transaction associated with *handle. This does a commit, + * so before we call here everything must be consistently dirtied against + * this transaction. + */ +int ext4_truncate_restart_trans(handle_t *handle, struct inode *inode, + int nblocks) +{ + int ret; + + /* + * Drop i_data_sem to avoid deadlock with ext4_map_blocks. At this + * moment, get_block can be called only for blocks inside i_size since + * page cache has been already dropped and writes are blocked by + * i_mutex. So we can safely drop the i_data_sem here. + */ + BUG_ON(EXT4_JOURNAL(inode) == NULL); + jbd_debug(2, "restarting handle %p\n", handle); + up_write(&EXT4_I(inode)->i_data_sem); + ret = ext4_journal_restart(handle, nblocks); + down_write(&EXT4_I(inode)->i_data_sem); + ext4_discard_preallocations(inode); + + return ret; +} + +/* + * Called at the last iput() if i_nlink is zero. 
+ */ +void ext4_evict_inode(struct inode *inode) +{ + handle_t *handle; + int err; + + trace_ext4_evict_inode(inode); + + ext4_ioend_wait(inode); + + if (inode->i_nlink) { + /* + * When journalling data dirty buffers are tracked only in the + * journal. So although mm thinks everything is clean and + * ready for reaping the inode might still have some pages to + * write in the running transaction or waiting to be + * checkpointed. Thus calling jbd2_journal_invalidatepage() + * (via truncate_inode_pages()) to discard these buffers can + * cause data loss. Also even if we did not discard these + * buffers, we would have no way to find them after the inode + * is reaped and thus user could see stale data if he tries to + * read them before the transaction is checkpointed. So be + * careful and force everything to disk here... We use + * ei->i_datasync_tid to store the newest transaction + * containing inode's data. + * + * Note that directories do not have this problem because they + * don't use page cache. + */ + if (ext4_should_journal_data(inode) && + (S_ISLNK(inode->i_mode) || S_ISREG(inode->i_mode))) { + journal_t *journal = EXT4_SB(inode->i_sb)->s_journal; + tid_t commit_tid = EXT4_I(inode)->i_datasync_tid; + + jbd2_log_start_commit(journal, commit_tid); + jbd2_log_wait_commit(journal, commit_tid); + filemap_write_and_wait(&inode->i_data); + } + truncate_inode_pages(&inode->i_data, 0); + goto no_delete; + } + + if (!is_bad_inode(inode)) + dquot_initialize(inode); + + if (ext4_should_order_data(inode)) + ext4_begin_ordered_truncate(inode, 0); + truncate_inode_pages(&inode->i_data, 0); + + if (is_bad_inode(inode)) + goto no_delete; + + handle = ext4_journal_start(inode, ext4_blocks_for_truncate(inode)+3); + if (IS_ERR(handle)) { + ext4_std_error(inode->i_sb, PTR_ERR(handle)); + /* + * If we're going to skip the normal cleanup, we still need to + * make sure that the in-core orphan linked list is properly + * cleaned up. 
+ */ + ext4_orphan_del(NULL, inode); + goto no_delete; + } + + if (IS_SYNC(inode)) + ext4_handle_sync(handle); + inode->i_size = 0; + err = ext4_mark_inode_dirty(handle, inode); + if (err) { + ext4_warning(inode->i_sb, + "couldn't mark inode dirty (err %d)", err); + goto stop_handle; + } + if (inode->i_blocks) + ext4_truncate(inode); + + /* + * ext4_ext_truncate() doesn't reserve any slop when it + * restarts journal transactions; therefore there may not be + * enough credits left in the handle to remove the inode from + * the orphan list and set the dtime field. + */ + if (!ext4_handle_has_enough_credits(handle, 3)) { + err = ext4_journal_extend(handle, 3); + if (err > 0) + err = ext4_journal_restart(handle, 3); + if (err != 0) { + ext4_warning(inode->i_sb, + "couldn't extend journal (err %d)", err); + stop_handle: + ext4_journal_stop(handle); + ext4_orphan_del(NULL, inode); + goto no_delete; + } + } + + /* + * Kill off the orphan record which ext4_truncate created. + * AKPM: I think this can be inside the above `if'. + * Note that ext4_orphan_del() has to be able to cope with the + * deletion of a non-existent orphan - this is because we don't + * know if ext4_truncate() actually created an orphan record. + * (Well, we could do this if we need to, but heck - it works) + */ + ext4_orphan_del(handle, inode); + EXT4_I(inode)->i_dtime = get_seconds(); + + /* + * One subtle ordering requirement: if anything has gone wrong + * (transaction abort, IO errors, whatever), then we can still + * do these next steps (the fs will already have been marked as + * having errors), but we can't free the inode if the mark_dirty + * fails. + */ + if (ext4_mark_inode_dirty(handle, inode)) + /* If that failed, just do the required in-core inode clear. */ + ext4_clear_inode(inode); + else + ext4_free_inode(handle, inode); + ext4_journal_stop(handle); + return; +no_delete: + ext4_clear_inode(inode); /* We must guarantee clearing of inode... 
*/ +} + +#ifdef CONFIG_QUOTA +qsize_t *ext4_get_reserved_space(struct inode *inode) +{ + return &EXT4_I(inode)->i_reserved_quota; +} +#endif + +/* + * Calculate the number of metadata blocks need to reserve + * to allocate a block located at @lblock + */ +static int ext4_calc_metadata_amount(struct inode *inode, ext4_lblk_t lblock) +{ + if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) + return ext4_ext_calc_metadata_amount(inode, lblock); + + return ext4_ind_calc_metadata_amount(inode, lblock); +} + +/* + * Called with i_data_sem down, which is important since we can call + * ext4_discard_preallocations() from here. + */ +void ext4_da_update_reserve_space(struct inode *inode, + int used, int quota_claim) +{ + struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); + struct ext4_inode_info *ei = EXT4_I(inode); + + spin_lock(&ei->i_block_reservation_lock); + trace_ext4_da_update_reserve_space(inode, used, quota_claim); + if (unlikely(used > ei->i_reserved_data_blocks)) { + ext4_msg(inode->i_sb, KERN_NOTICE, "%s: ino %lu, used %d " + "with only %d reserved data blocks", + __func__, inode->i_ino, used, + ei->i_reserved_data_blocks); + WARN_ON(1); + used = ei->i_reserved_data_blocks; + } + + /* Update per-inode reservations */ + ei->i_reserved_data_blocks -= used; + ei->i_reserved_meta_blocks -= ei->i_allocated_meta_blocks; + percpu_counter_sub(&sbi->s_dirtyclusters_counter, + used + ei->i_allocated_meta_blocks); + ei->i_allocated_meta_blocks = 0; + + if (ei->i_reserved_data_blocks == 0) { + /* + * We can release all of the reserved metadata blocks + * only when we have written all of the delayed + * allocation blocks. 
+ */ + percpu_counter_sub(&sbi->s_dirtyclusters_counter, + ei->i_reserved_meta_blocks); + ei->i_reserved_meta_blocks = 0; + ei->i_da_metadata_calc_len = 0; + } + spin_unlock(&EXT4_I(inode)->i_block_reservation_lock); + + /* Update quota subsystem for data blocks */ + if (quota_claim) + dquot_claim_block(inode, EXT4_C2B(sbi, used)); + else { + /* + * We did fallocate with an offset that is already delayed + * allocated. So on delayed allocated writeback we should + * not re-claim the quota for fallocated blocks. + */ + dquot_release_reservation_block(inode, EXT4_C2B(sbi, used)); + } + + /* + * If we have done all the pending block allocations and if + * there aren't any writers on the inode, we can discard the + * inode's preallocations. + */ + if ((ei->i_reserved_data_blocks == 0) && + (atomic_read(&inode->i_writecount) == 0)) + ext4_discard_preallocations(inode); +} + +static int __check_block_validity(struct inode *inode, const char *func, + unsigned int line, + struct ext4_map_blocks *map) +{ + if (!ext4_data_block_valid(EXT4_SB(inode->i_sb), map->m_pblk, + map->m_len)) { + ext4_error_inode(inode, func, line, map->m_pblk, + "lblock %lu mapped to illegal pblock " + "(length %d)", (unsigned long) map->m_lblk, + map->m_len); + return -EIO; + } + return 0; +} + +#define check_block_validity(inode, map) \ + __check_block_validity((inode), __func__, __LINE__, (map)) + +/* + * Return the number of contiguous dirty pages in a given inode + * starting at page frame idx. 
+ */ +static pgoff_t ext4_num_dirty_pages(struct inode *inode, pgoff_t idx, + unsigned int max_pages) +{ + struct address_space *mapping = inode->i_mapping; + pgoff_t index; + struct pagevec pvec; + pgoff_t num = 0; + int i, nr_pages, done = 0; + + if (max_pages == 0) + return 0; + pagevec_init(&pvec, 0); + while (!done) { + index = idx; + nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, + PAGECACHE_TAG_DIRTY, + (pgoff_t)PAGEVEC_SIZE); + if (nr_pages == 0) + break; + for (i = 0; i < nr_pages; i++) { + struct page *page = pvec.pages[i]; + struct buffer_head *bh, *head; + + lock_page(page); + if (unlikely(page->mapping != mapping) || + !PageDirty(page) || + PageWriteback(page) || + page->index != idx) { + done = 1; + unlock_page(page); + break; + } + if (page_has_buffers(page)) { + bh = head = page_buffers(page); + do { + if (!buffer_delay(bh) && + !buffer_unwritten(bh)) + done = 1; + bh = bh->b_this_page; + } while (!done && (bh != head)); + } + unlock_page(page); + if (done) + break; + idx++; + num++; + if (num >= max_pages) { + done = 1; + break; + } + } + pagevec_release(&pvec); + } + return num; +} + +/* + * Sets the BH_Da_Mapped bit on the buffer heads corresponding to the given map. 
+ */ +static void set_buffers_da_mapped(struct inode *inode, + struct ext4_map_blocks *map) +{ + struct address_space *mapping = inode->i_mapping; + struct pagevec pvec; + int i, nr_pages; + pgoff_t index, end; + + index = map->m_lblk >> (PAGE_CACHE_SHIFT - inode->i_blkbits); + end = (map->m_lblk + map->m_len - 1) >> + (PAGE_CACHE_SHIFT - inode->i_blkbits); + + pagevec_init(&pvec, 0); + while (index <= end) { + nr_pages = pagevec_lookup(&pvec, mapping, index, + min(end - index + 1, + (pgoff_t)PAGEVEC_SIZE)); + if (nr_pages == 0) + break; + for (i = 0; i < nr_pages; i++) { + struct page *page = pvec.pages[i]; + struct buffer_head *bh, *head; + + if (unlikely(page->mapping != mapping) || + !PageDirty(page)) + break; + + if (page_has_buffers(page)) { + bh = head = page_buffers(page); + do { + set_buffer_da_mapped(bh); + bh = bh->b_this_page; + } while (bh != head); + } + index++; + } + pagevec_release(&pvec); + } +} + +/* + * The ext4_map_blocks() function tries to look up the requested blocks, + * and returns if the blocks are already mapped. + * + * Otherwise it takes the write lock of the i_data_sem and allocate blocks + * and store the allocated blocks in the result buffer head and mark it + * mapped. + * + * If file type is extents based, it will call ext4_ext_map_blocks(), + * Otherwise, call with ext4_ind_map_blocks() to handle indirect mapping + * based files + * + * On success, it returns the number of blocks being mapped or allocate. + * if create==0 and the blocks are pre-allocated and uninitialized block, + * the result buffer head is unmapped. If the create ==1, it will make sure + * the buffer head is mapped. + * + * It returns 0 if plain look up failed (blocks have not been allocated), in + * that case, buffer head is unmapped + * + * It returns the error in case of allocation failure. 
+ */ +int ext4_map_blocks(handle_t *handle, struct inode *inode, + struct ext4_map_blocks *map, int flags) +{ + int retval; + + map->m_flags = 0; + ext_debug("ext4_map_blocks(): inode %lu, flag %d, max_blocks %u," + "logical block %lu\n", inode->i_ino, flags, map->m_len, + (unsigned long) map->m_lblk); + /* + * Try to see if we can get the block without requesting a new + * file system block. + */ + down_read((&EXT4_I(inode)->i_data_sem)); + if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) { + retval = ext4_ext_map_blocks(handle, inode, map, flags & + EXT4_GET_BLOCKS_KEEP_SIZE); + } else { + retval = ext4_ind_map_blocks(handle, inode, map, flags & + EXT4_GET_BLOCKS_KEEP_SIZE); + } + up_read((&EXT4_I(inode)->i_data_sem)); + + if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED) { + int ret = check_block_validity(inode, map); + if (ret != 0) + return ret; + } + + /* If it is only a block(s) look up */ + if ((flags & EXT4_GET_BLOCKS_CREATE) == 0) + return retval; + + /* + * Returns if the blocks have already allocated + * + * Note that if blocks have been preallocated + * ext4_ext_get_block() returns the create = 0 + * with buffer head unmapped. + */ + if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED) + return retval; + + /* + * When we call get_blocks without the create flag, the + * BH_Unwritten flag could have gotten set if the blocks + * requested were part of a uninitialized extent. We need to + * clear this flag now that we are committed to convert all or + * part of the uninitialized extent to be an initialized + * extent. This is because we need to avoid the combination + * of BH_Unwritten and BH_Mapped flags being simultaneously + * set on the buffer_head. + */ + map->m_flags &= ~EXT4_MAP_UNWRITTEN; + + /* + * New blocks allocate and/or writing to uninitialized extent + * will possibly result in updating i_data, so we take + * the write lock of i_data_sem, and call get_blocks() + * with create == 1 flag. 
+ */ + down_write((&EXT4_I(inode)->i_data_sem)); + + /* + * if the caller is from delayed allocation writeout path + * we have already reserved fs blocks for allocation + * let the underlying get_block() function know to + * avoid double accounting + */ + if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) + ext4_set_inode_state(inode, EXT4_STATE_DELALLOC_RESERVED); + /* + * We need to check for EXT4 here because migrate + * could have changed the inode type in between + */ + if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) { + retval = ext4_ext_map_blocks(handle, inode, map, flags); + } else { + retval = ext4_ind_map_blocks(handle, inode, map, flags); + + if (retval > 0 && map->m_flags & EXT4_MAP_NEW) { + /* + * We allocated new blocks which will result in + * i_data's format changing. Force the migrate + * to fail by clearing migrate flags + */ + ext4_clear_inode_state(inode, EXT4_STATE_EXT_MIGRATE); + } + + /* + * Update reserved blocks/metadata blocks after successful + * block allocation which had been deferred till now. We don't + * support fallocate for non extent files. So we can update + * reserve space here. + */ + if ((retval > 0) && + (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE)) + ext4_da_update_reserve_space(inode, retval, 1); + } + if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) { + ext4_clear_inode_state(inode, EXT4_STATE_DELALLOC_RESERVED); + + /* If we have successfully mapped the delayed allocated blocks, + * set the BH_Da_Mapped bit on them. Its important to do this + * under the protection of i_data_sem. + */ + if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED) + set_buffers_da_mapped(inode, map); + } + + up_write((&EXT4_I(inode)->i_data_sem)); + if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED) { + int ret = check_block_validity(inode, map); + if (ret != 0) + return ret; + } + return retval; +} + +/* Maximum number of blocks we map for direct IO at once. 
*/ +#define DIO_MAX_BLOCKS 4096 + +static int _ext4_get_block(struct inode *inode, sector_t iblock, + struct buffer_head *bh, int flags) +{ + handle_t *handle = ext4_journal_current_handle(); + struct ext4_map_blocks map; + int ret = 0, started = 0; + int dio_credits; + + map.m_lblk = iblock; + map.m_len = bh->b_size >> inode->i_blkbits; + + if (flags && !handle) { + /* Direct IO write... */ + if (map.m_len > DIO_MAX_BLOCKS) + map.m_len = DIO_MAX_BLOCKS; + dio_credits = ext4_chunk_trans_blocks(inode, map.m_len); + handle = ext4_journal_start(inode, dio_credits); + if (IS_ERR(handle)) { + ret = PTR_ERR(handle); + return ret; + } + started = 1; + } + + ret = ext4_map_blocks(handle, inode, &map, flags); + if (ret > 0) { + map_bh(bh, inode->i_sb, map.m_pblk); + bh->b_state = (bh->b_state & ~EXT4_MAP_FLAGS) | map.m_flags; + bh->b_size = inode->i_sb->s_blocksize * map.m_len; + ret = 0; + } + if (started) + ext4_journal_stop(handle); + return ret; +} + +int ext4_get_block(struct inode *inode, sector_t iblock, + struct buffer_head *bh, int create) +{ + return _ext4_get_block(inode, iblock, bh, + create ? EXT4_GET_BLOCKS_CREATE : 0); +} + +/* + * `handle' can be NULL if create is zero + */ +struct buffer_head *ext4_getblk(handle_t *handle, struct inode *inode, + ext4_lblk_t block, int create, int *errp) +{ + struct ext4_map_blocks map; + struct buffer_head *bh; + int fatal = 0, err; + + J_ASSERT(handle != NULL || create == 0); + + map.m_lblk = block; + map.m_len = 1; + err = ext4_map_blocks(handle, inode, &map, + create ? EXT4_GET_BLOCKS_CREATE : 0); + + if (err < 0) + *errp = err; + if (err <= 0) + return NULL; + *errp = 0; + + bh = sb_getblk(inode->i_sb, map.m_pblk); + if (!bh) { + *errp = -EIO; + return NULL; + } + if (map.m_flags & EXT4_MAP_NEW) { + J_ASSERT(create != 0); + J_ASSERT(handle != NULL); + + /* + * Now that we do not always journal data, we should + * keep in mind whether this should always journal the + * new buffer as metadata. 
For now, regular file + * writes use ext4_get_block instead, so it's not a + * problem. + */ + lock_buffer(bh); + BUFFER_TRACE(bh, "call get_create_access"); + fatal = ext4_journal_get_create_access(handle, bh); + if (!fatal && !buffer_uptodate(bh)) { + memset(bh->b_data, 0, inode->i_sb->s_blocksize); + set_buffer_uptodate(bh); + } + unlock_buffer(bh); + BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata"); + err = ext4_handle_dirty_metadata(handle, inode, bh); + if (!fatal) + fatal = err; + } else { + BUFFER_TRACE(bh, "not a new buffer"); + } + if (fatal) { + *errp = fatal; + brelse(bh); + bh = NULL; + } + return bh; +} + +struct buffer_head *ext4_bread(handle_t *handle, struct inode *inode, + ext4_lblk_t block, int create, int *err) +{ + struct buffer_head *bh; + + bh = ext4_getblk(handle, inode, block, create, err); + if (!bh) + return bh; + if (buffer_uptodate(bh)) + return bh; + ll_rw_block(READ | REQ_META | REQ_PRIO, 1, &bh); + wait_on_buffer(bh); + if (buffer_uptodate(bh)) + return bh; + put_bh(bh); + *err = -EIO; + return NULL; +} + +static int walk_page_buffers(handle_t *handle, + struct buffer_head *head, + unsigned from, + unsigned to, + int *partial, + int (*fn)(handle_t *handle, + struct buffer_head *bh)) +{ + struct buffer_head *bh; + unsigned block_start, block_end; + unsigned blocksize = head->b_size; + int err, ret = 0; + struct buffer_head *next; + + for (bh = head, block_start = 0; + ret == 0 && (bh != head || !block_start); + block_start = block_end, bh = next) { + next = bh->b_this_page; + block_end = block_start + blocksize; + if (block_end <= from || block_start >= to) { + if (partial && !buffer_uptodate(bh)) + *partial = 1; + continue; + } + err = (*fn)(handle, bh); + if (!ret) + ret = err; + } + return ret; +} + +/* + * To preserve ordering, it is essential that the hole instantiation and + * the data write be encapsulated in a single transaction. 
We cannot + * close off a transaction and start a new one between the ext4_get_block() + * and the commit_write(). So doing the jbd2_journal_start at the start of + * prepare_write() is the right place. + * + * Also, this function can nest inside ext4_writepage() -> + * block_write_full_page(). In that case, we *know* that ext4_writepage() + * has generated enough buffer credits to do the whole page. So we won't + * block on the journal in that case, which is good, because the caller may + * be PF_MEMALLOC. + * + * By accident, ext4 can be reentered when a transaction is open via + * quota file writes. If we were to commit the transaction while thus + * reentered, there can be a deadlock - we would be holding a quota + * lock, and the commit would never complete if another thread had a + * transaction open and was blocking on the quota lock - a ranking + * violation. + * + * So what we do is to rely on the fact that jbd2_journal_stop/journal_start + * will _not_ run commit under these circumstances because handle->h_ref + * is elevated. We'll still have enough credits for the tiny quotafile + * write. + */ +static int do_journal_get_write_access(handle_t *handle, + struct buffer_head *bh) +{ + int dirty = buffer_dirty(bh); + int ret; + + if (!buffer_mapped(bh) || buffer_freed(bh)) + return 0; + /* + * __block_write_begin() could have dirtied some buffers. Clean + * the dirty bit as jbd2_journal_get_write_access() could complain + * otherwise about fs integrity issues. Setting of the dirty bit + * by __block_write_begin() isn't a real problem here as we clear + * the bit before releasing a page lock and thus writeback cannot + * ever write the buffer. 
+ */ + if (dirty) + clear_buffer_dirty(bh); + ret = ext4_journal_get_write_access(handle, bh); + if (!ret && dirty) + ret = ext4_handle_dirty_metadata(handle, NULL, bh); + return ret; +} + +static int ext4_get_block_write(struct inode *inode, sector_t iblock, + struct buffer_head *bh_result, int create); +static int ext4_write_begin(struct file *file, struct address_space *mapping, + loff_t pos, unsigned len, unsigned flags, + struct page **pagep, void **fsdata) +{ + struct inode *inode = mapping->host; + int ret, needed_blocks; + handle_t *handle; + int retries = 0; + struct page *page; + pgoff_t index; + unsigned from, to; + + trace_ext4_write_begin(inode, pos, len, flags); + /* + * Reserve one block more for addition to orphan list in case + * we allocate blocks but write fails for some reason + */ + needed_blocks = ext4_writepage_trans_blocks(inode) + 1; + index = pos >> PAGE_CACHE_SHIFT; + from = pos & (PAGE_CACHE_SIZE - 1); + to = from + len; + +retry: + handle = ext4_journal_start(inode, needed_blocks); + if (IS_ERR(handle)) { + ret = PTR_ERR(handle); + goto out; + } + + /* We cannot recurse into the filesystem as the transaction is already + * started */ + flags |= AOP_FLAG_NOFS; + + page = grab_cache_page_write_begin(mapping, index, flags); + if (!page) { + ext4_journal_stop(handle); + ret = -ENOMEM; + goto out; + } + *pagep = page; + + if (ext4_should_dioread_nolock(inode)) + ret = __block_write_begin(page, pos, len, ext4_get_block_write); + else + ret = __block_write_begin(page, pos, len, ext4_get_block); + + if (!ret && ext4_should_journal_data(inode)) { + ret = walk_page_buffers(handle, page_buffers(page), + from, to, NULL, do_journal_get_write_access); + } + + if (ret) { + unlock_page(page); + page_cache_release(page); + /* + * __block_write_begin may have instantiated a few blocks + * outside i_size. Trim these off again. Don't need + * i_size_read because we hold i_mutex. 
+ * + * Add inode to orphan list in case we crash before + * truncate finishes + */ + if (pos + len > inode->i_size && ext4_can_truncate(inode)) + ext4_orphan_add(handle, inode); + + ext4_journal_stop(handle); + if (pos + len > inode->i_size) { + ext4_truncate_failed_write(inode); + /* + * If truncate failed early the inode might + * still be on the orphan list; we need to + * make sure the inode is removed from the + * orphan list in that case. + */ + if (inode->i_nlink) + ext4_orphan_del(NULL, inode); + } + } + + if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries)) + goto retry; +out: + return ret; +} + +/* For write_end() in data=journal mode */ +static int write_end_fn(handle_t *handle, struct buffer_head *bh) +{ + if (!buffer_mapped(bh) || buffer_freed(bh)) + return 0; + set_buffer_uptodate(bh); + return ext4_handle_dirty_metadata(handle, NULL, bh); +} + +static int ext4_generic_write_end(struct file *file, + struct address_space *mapping, + loff_t pos, unsigned len, unsigned copied, + struct page *page, void *fsdata) +{ + int i_size_changed = 0; + struct inode *inode = mapping->host; + handle_t *handle = ext4_journal_current_handle(); + + copied = block_write_end(file, mapping, pos, len, copied, page, fsdata); + + /* + * No need to use i_size_read() here, the i_size + * cannot change under us because we hold i_mutex. + * + * But it's important to update i_size while still holding page lock: + * page writeout could otherwise come in and zero beyond i_size. + */ + if (pos + copied > inode->i_size) { + i_size_write(inode, pos + copied); + i_size_changed = 1; + } + + if (pos + copied > EXT4_I(inode)->i_disksize) { + /* We need to mark inode dirty even if + * new_i_size is less that inode->i_size + * bu greater than i_disksize.(hint delalloc) + */ + ext4_update_i_disksize(inode, (pos + copied)); + i_size_changed = 1; + } + unlock_page(page); + page_cache_release(page); + + /* + * Don't mark the inode dirty under page lock. 
First, it unnecessarily + * makes the holding time of page lock longer. Second, it forces lock + * ordering of page lock and transaction start for journaling + * filesystems. + */ + if (i_size_changed) + ext4_mark_inode_dirty(handle, inode); + + return copied; +} + +/* + * We need to pick up the new inode size which generic_commit_write gave us + * `file' can be NULL - eg, when called from page_symlink(). + * + * ext4 never places buffers on inode->i_mapping->private_list. metadata + * buffers are managed internally. + */ +static int ext4_ordered_write_end(struct file *file, + struct address_space *mapping, + loff_t pos, unsigned len, unsigned copied, + struct page *page, void *fsdata) +{ + handle_t *handle = ext4_journal_current_handle(); + struct inode *inode = mapping->host; + int ret = 0, ret2; + + trace_ext4_ordered_write_end(inode, pos, len, copied); + ret = ext4_jbd2_file_inode(handle, inode); + + if (ret == 0) { + ret2 = ext4_generic_write_end(file, mapping, pos, len, copied, + page, fsdata); + copied = ret2; + if (pos + len > inode->i_size && ext4_can_truncate(inode)) + /* if we have allocated more blocks and copied + * less. We will have blocks allocated outside + * inode->i_size. So truncate them + */ + ext4_orphan_add(handle, inode); + if (ret2 < 0) + ret = ret2; + } else { + unlock_page(page); + page_cache_release(page); + } + + ret2 = ext4_journal_stop(handle); + if (!ret) + ret = ret2; + + if (pos + len > inode->i_size) { + ext4_truncate_failed_write(inode); + /* + * If truncate failed early the inode might still be + * on the orphan list; we need to make sure the inode + * is removed from the orphan list in that case. + */ + if (inode->i_nlink) + ext4_orphan_del(NULL, inode); + } + + + return ret ? 
ret : copied; +} + +static int ext4_writeback_write_end(struct file *file, + struct address_space *mapping, + loff_t pos, unsigned len, unsigned copied, + struct page *page, void *fsdata) +{ + handle_t *handle = ext4_journal_current_handle(); + struct inode *inode = mapping->host; + int ret = 0, ret2; + + trace_ext4_writeback_write_end(inode, pos, len, copied); + ret2 = ext4_generic_write_end(file, mapping, pos, len, copied, + page, fsdata); + copied = ret2; + if (pos + len > inode->i_size && ext4_can_truncate(inode)) + /* if we have allocated more blocks and copied + * less. We will have blocks allocated outside + * inode->i_size. So truncate them + */ + ext4_orphan_add(handle, inode); + + if (ret2 < 0) + ret = ret2; + + ret2 = ext4_journal_stop(handle); + if (!ret) + ret = ret2; + + if (pos + len > inode->i_size) { + ext4_truncate_failed_write(inode); + /* + * If truncate failed early the inode might still be + * on the orphan list; we need to make sure the inode + * is removed from the orphan list in that case. + */ + if (inode->i_nlink) + ext4_orphan_del(NULL, inode); + } + + return ret ? 
ret : copied; +} + +static int ext4_journalled_write_end(struct file *file, + struct address_space *mapping, + loff_t pos, unsigned len, unsigned copied, + struct page *page, void *fsdata) +{ + handle_t *handle = ext4_journal_current_handle(); + struct inode *inode = mapping->host; + int ret = 0, ret2; + int partial = 0; + unsigned from, to; + loff_t new_i_size; + + trace_ext4_journalled_write_end(inode, pos, len, copied); + from = pos & (PAGE_CACHE_SIZE - 1); + to = from + len; + + BUG_ON(!ext4_handle_valid(handle)); + + if (copied < len) { + if (!PageUptodate(page)) + copied = 0; + page_zero_new_buffers(page, from+copied, to); + } + + ret = walk_page_buffers(handle, page_buffers(page), from, + to, &partial, write_end_fn); + if (!partial) + SetPageUptodate(page); + new_i_size = pos + copied; + if (new_i_size > inode->i_size) + i_size_write(inode, pos+copied); + ext4_set_inode_state(inode, EXT4_STATE_JDATA); + EXT4_I(inode)->i_datasync_tid = handle->h_transaction->t_tid; + if (new_i_size > EXT4_I(inode)->i_disksize) { + ext4_update_i_disksize(inode, new_i_size); + ret2 = ext4_mark_inode_dirty(handle, inode); + if (!ret) + ret = ret2; + } + + unlock_page(page); + page_cache_release(page); + if (pos + len > inode->i_size && ext4_can_truncate(inode)) + /* if we have allocated more blocks and copied + * less. We will have blocks allocated outside + * inode->i_size. So truncate them + */ + ext4_orphan_add(handle, inode); + + ret2 = ext4_journal_stop(handle); + if (!ret) + ret = ret2; + if (pos + len > inode->i_size) { + ext4_truncate_failed_write(inode); + /* + * If truncate failed early the inode might still be + * on the orphan list; we need to make sure the inode + * is removed from the orphan list in that case. + */ + if (inode->i_nlink) + ext4_orphan_del(NULL, inode); + } + + return ret ? 
ret : copied; +} + +/* + * Reserve a single cluster located at lblock + */ +static int ext4_da_reserve_space(struct inode *inode, ext4_lblk_t lblock) +{ + int retries = 0; + struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); + struct ext4_inode_info *ei = EXT4_I(inode); + unsigned int md_needed; + int ret; + + /* + * recalculate the amount of metadata blocks to reserve + * in order to allocate nrblocks + * worse case is one extent per block + */ +repeat: + spin_lock(&ei->i_block_reservation_lock); + md_needed = EXT4_NUM_B2C(sbi, + ext4_calc_metadata_amount(inode, lblock)); + trace_ext4_da_reserve_space(inode, md_needed); + spin_unlock(&ei->i_block_reservation_lock); + + /* + * We will charge metadata quota at writeout time; this saves + * us from metadata over-estimation, though we may go over by + * a small amount in the end. Here we just reserve for data. + */ + ret = dquot_reserve_block(inode, EXT4_C2B(sbi, 1)); + if (ret) + return ret; + /* + * We do still charge estimated metadata to the sb though; + * we cannot afford to run out of free blocks. + */ + if (ext4_claim_free_clusters(sbi, md_needed + 1, 0)) { + dquot_release_reservation_block(inode, EXT4_C2B(sbi, 1)); + if (ext4_should_retry_alloc(inode->i_sb, &retries)) { + yield(); + goto repeat; + } + return -ENOSPC; + } + spin_lock(&ei->i_block_reservation_lock); + ei->i_reserved_data_blocks++; + ei->i_reserved_meta_blocks += md_needed; + spin_unlock(&ei->i_block_reservation_lock); + + return 0; /* success */ +} + +static void ext4_da_release_space(struct inode *inode, int to_free) +{ + struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); + struct ext4_inode_info *ei = EXT4_I(inode); + + if (!to_free) + return; /* Nothing to release, exit */ + + spin_lock(&EXT4_I(inode)->i_block_reservation_lock); + + trace_ext4_da_release_space(inode, to_free); + if (unlikely(to_free > ei->i_reserved_data_blocks)) { + /* + * if there aren't enough reserved blocks, then the + * counter is messed up somewhere. 
Since this + * function is called from invalidate page, it's + * harmless to return without any action. + */ + ext4_msg(inode->i_sb, KERN_NOTICE, "ext4_da_release_space: " + "ino %lu, to_free %d with only %d reserved " + "data blocks", inode->i_ino, to_free, + ei->i_reserved_data_blocks); + WARN_ON(1); + to_free = ei->i_reserved_data_blocks; + } + ei->i_reserved_data_blocks -= to_free; + + if (ei->i_reserved_data_blocks == 0) { + /* + * We can release all of the reserved metadata blocks + * only when we have written all of the delayed + * allocation blocks. + * Note that in case of bigalloc, i_reserved_meta_blocks, + * i_reserved_data_blocks, etc. refer to number of clusters. + */ + percpu_counter_sub(&sbi->s_dirtyclusters_counter, + ei->i_reserved_meta_blocks); + ei->i_reserved_meta_blocks = 0; + ei->i_da_metadata_calc_len = 0; + } + + /* update fs dirty data blocks counter */ + percpu_counter_sub(&sbi->s_dirtyclusters_counter, to_free); + + spin_unlock(&EXT4_I(inode)->i_block_reservation_lock); + + dquot_release_reservation_block(inode, EXT4_C2B(sbi, to_free)); +} + +static void ext4_da_page_release_reservation(struct page *page, + unsigned long offset) +{ + int to_release = 0; + struct buffer_head *head, *bh; + unsigned int curr_off = 0; + struct inode *inode = page->mapping->host; + struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); + int num_clusters; + + head = page_buffers(page); + bh = head; + do { + unsigned int next_off = curr_off + bh->b_size; + + if ((offset <= curr_off) && (buffer_delay(bh))) { + to_release++; + clear_buffer_delay(bh); + clear_buffer_da_mapped(bh); + } + curr_off = next_off; + } while ((bh = bh->b_this_page) != head); + + /* If we have released all the blocks belonging to a cluster, then we + * need to release the reserved space for that cluster. 
*/ + num_clusters = EXT4_NUM_B2C(sbi, to_release); + while (num_clusters > 0) { + ext4_fsblk_t lblk; + lblk = (page->index << (PAGE_CACHE_SHIFT - inode->i_blkbits)) + + ((num_clusters - 1) << sbi->s_cluster_bits); + if (sbi->s_cluster_ratio == 1 || + !ext4_find_delalloc_cluster(inode, lblk, 1)) + ext4_da_release_space(inode, 1); + + num_clusters--; + } +} + +/* + * Delayed allocation stuff + */ + +/* + * mpage_da_submit_io - walks through extent of pages and try to write + * them with writepage() call back + * + * @mpd->inode: inode + * @mpd->first_page: first page of the extent + * @mpd->next_page: page after the last page of the extent + * + * By the time mpage_da_submit_io() is called we expect all blocks + * to be allocated. this may be wrong if allocation failed. + * + * As pages are already locked by write_cache_pages(), we can't use it + */ +static int mpage_da_submit_io(struct mpage_da_data *mpd, + struct ext4_map_blocks *map) +{ + struct pagevec pvec; + unsigned long index, end; + int ret = 0, err, nr_pages, i; + struct inode *inode = mpd->inode; + struct address_space *mapping = inode->i_mapping; + loff_t size = i_size_read(inode); + unsigned int len, block_start; + struct buffer_head *bh, *page_bufs = NULL; + int journal_data = ext4_should_journal_data(inode); + sector_t pblock = 0, cur_logical = 0; + struct ext4_io_submit io_submit; + + BUG_ON(mpd->next_page <= mpd->first_page); + memset(&io_submit, 0, sizeof(io_submit)); + /* + * We need to start from the first_page to the next_page - 1 + * to make sure we also write the mapped dirty buffer_heads. + * If we look at mpd->b_blocknr we would only be looking + * at the currently mapped buffer_heads. 
+ */ + index = mpd->first_page; + end = mpd->next_page - 1; + + pagevec_init(&pvec, 0); + while (index <= end) { + nr_pages = pagevec_lookup(&pvec, mapping, index, PAGEVEC_SIZE); + if (nr_pages == 0) + break; + for (i = 0; i < nr_pages; i++) { + int commit_write = 0, skip_page = 0; + struct page *page = pvec.pages[i]; + + index = page->index; + if (index > end) + break; + + if (index == size >> PAGE_CACHE_SHIFT) + len = size & ~PAGE_CACHE_MASK; + else + len = PAGE_CACHE_SIZE; + if (map) { + cur_logical = index << (PAGE_CACHE_SHIFT - + inode->i_blkbits); + pblock = map->m_pblk + (cur_logical - + map->m_lblk); + } + index++; + + BUG_ON(!PageLocked(page)); + BUG_ON(PageWriteback(page)); + + /* + * If the page does not have buffers (for + * whatever reason), try to create them using + * __block_write_begin. If this fails, + * skip the page and move on. + */ + if (!page_has_buffers(page)) { + if (__block_write_begin(page, 0, len, + noalloc_get_block_write)) { + skip_page: + unlock_page(page); + continue; + } + commit_write = 1; + } + + bh = page_bufs = page_buffers(page); + block_start = 0; + do { + if (!bh) + goto skip_page; + if (map && (cur_logical >= map->m_lblk) && + (cur_logical <= (map->m_lblk + + (map->m_len - 1)))) { + if (buffer_delay(bh)) { + clear_buffer_delay(bh); + bh->b_blocknr = pblock; + } + if (buffer_da_mapped(bh)) + clear_buffer_da_mapped(bh); + if (buffer_unwritten(bh) || + buffer_mapped(bh)) + BUG_ON(bh->b_blocknr != pblock); + if (map->m_flags & EXT4_MAP_UNINIT) + set_buffer_uninit(bh); + clear_buffer_unwritten(bh); + } + + /* + * skip page if block allocation undone and + * block is dirty + */ + if (ext4_bh_delay_or_unwritten(NULL, bh)) + skip_page = 1; + bh = bh->b_this_page; + block_start += bh->b_size; + cur_logical++; + pblock++; + } while (bh != page_bufs); + + if (skip_page) + goto skip_page; + + if (commit_write) + /* mark the buffer_heads as dirty & uptodate */ + block_commit_write(page, 0, len); + + clear_page_dirty_for_io(page); + /* + 
* Delalloc doesn't support data journalling, + * but eventually maybe we'll lift this + * restriction. + */ + if (unlikely(journal_data && PageChecked(page))) + err = __ext4_journalled_writepage(page, len); + else if (test_opt(inode->i_sb, MBLK_IO_SUBMIT)) + err = ext4_bio_write_page(&io_submit, page, + len, mpd->wbc); + else if (buffer_uninit(page_bufs)) { + ext4_set_bh_endio(page_bufs, inode); + err = block_write_full_page_endio(page, + noalloc_get_block_write, + mpd->wbc, ext4_end_io_buffer_write); + } else + err = block_write_full_page(page, + noalloc_get_block_write, mpd->wbc); + + if (!err) + mpd->pages_written++; + /* + * In error case, we have to continue because + * remaining pages are still locked + */ + if (ret == 0) + ret = err; + } + pagevec_release(&pvec); + } + ext4_io_submit(&io_submit); + return ret; +} + +static void ext4_da_block_invalidatepages(struct mpage_da_data *mpd) +{ + int nr_pages, i; + pgoff_t index, end; + struct pagevec pvec; + struct inode *inode = mpd->inode; + struct address_space *mapping = inode->i_mapping; + + index = mpd->first_page; + end = mpd->next_page - 1; + while (index <= end) { + nr_pages = pagevec_lookup(&pvec, mapping, index, PAGEVEC_SIZE); + if (nr_pages == 0) + break; + for (i = 0; i < nr_pages; i++) { + struct page *page = pvec.pages[i]; + if (page->index > end) + break; + BUG_ON(!PageLocked(page)); + BUG_ON(PageWriteback(page)); + block_invalidatepage(page, 0); + ClearPageUptodate(page); + unlock_page(page); + } + index = pvec.pages[nr_pages - 1]->index + 1; + pagevec_release(&pvec); + } + return; +} + +static void ext4_print_free_blocks(struct inode *inode) +{ + struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); + struct super_block *sb = inode->i_sb; + + ext4_msg(sb, KERN_CRIT, "Total free blocks count %lld", + EXT4_C2B(EXT4_SB(inode->i_sb), + ext4_count_free_clusters(inode->i_sb))); + ext4_msg(sb, KERN_CRIT, "Free/Dirty block details"); + ext4_msg(sb, KERN_CRIT, "free_blocks=%lld", + (long long) 
EXT4_C2B(EXT4_SB(inode->i_sb), + percpu_counter_sum(&sbi->s_freeclusters_counter))); + ext4_msg(sb, KERN_CRIT, "dirty_blocks=%lld", + (long long) EXT4_C2B(EXT4_SB(inode->i_sb), + percpu_counter_sum(&sbi->s_dirtyclusters_counter))); + ext4_msg(sb, KERN_CRIT, "Block reservation details"); + ext4_msg(sb, KERN_CRIT, "i_reserved_data_blocks=%u", + EXT4_I(inode)->i_reserved_data_blocks); + ext4_msg(sb, KERN_CRIT, "i_reserved_meta_blocks=%u", + EXT4_I(inode)->i_reserved_meta_blocks); + return; +} + +/* + * mpage_da_map_and_submit - go through given space, map them + * if necessary, and then submit them for I/O + * + * @mpd - bh describing space + * + * The function skips space we know is already mapped to disk blocks. + * + */ +static void mpage_da_map_and_submit(struct mpage_da_data *mpd) +{ + int err, blks, get_blocks_flags; + struct ext4_map_blocks map, *mapp = NULL; + sector_t next = mpd->b_blocknr; + unsigned max_blocks = mpd->b_size >> mpd->inode->i_blkbits; + loff_t disksize = EXT4_I(mpd->inode)->i_disksize; + handle_t *handle = NULL; + + /* + * If the blocks are mapped already, or we couldn't accumulate + * any blocks, then proceed immediately to the submission stage. + */ + if ((mpd->b_size == 0) || + ((mpd->b_state & (1 << BH_Mapped)) && + !(mpd->b_state & (1 << BH_Delay)) && + !(mpd->b_state & (1 << BH_Unwritten)))) + goto submit_io; + + handle = ext4_journal_current_handle(); + BUG_ON(!handle); + + /* + * Call ext4_map_blocks() to allocate any delayed allocation + * blocks, or to convert an uninitialized extent to be + * initialized (in the case where we have written into + * one or more preallocated blocks). + * + * We pass in the magic EXT4_GET_BLOCKS_DELALLOC_RESERVE to + * indicate that we are on the delayed allocation path. This + * affects functions in many different parts of the allocation + * call path. 
This flag exists primarily because we don't + * want to change *many* call functions, so ext4_map_blocks() + * will set the EXT4_STATE_DELALLOC_RESERVED flag once the + * inode's allocation semaphore is taken. + * + * If the blocks in questions were delalloc blocks, set + * EXT4_GET_BLOCKS_DELALLOC_RESERVE so the delalloc accounting + * variables are updated after the blocks have been allocated. + */ + map.m_lblk = next; + map.m_len = max_blocks; + get_blocks_flags = EXT4_GET_BLOCKS_CREATE; + if (ext4_should_dioread_nolock(mpd->inode)) + get_blocks_flags |= EXT4_GET_BLOCKS_IO_CREATE_EXT; + if (mpd->b_state & (1 << BH_Delay)) + get_blocks_flags |= EXT4_GET_BLOCKS_DELALLOC_RESERVE; + + blks = ext4_map_blocks(handle, mpd->inode, &map, get_blocks_flags); + if (blks < 0) { + struct super_block *sb = mpd->inode->i_sb; + + err = blks; + /* + * If get block returns EAGAIN or ENOSPC and there + * appears to be free blocks we will just let + * mpage_da_submit_io() unlock all of the pages. + */ + if (err == -EAGAIN) + goto submit_io; + + if (err == -ENOSPC && ext4_count_free_clusters(sb)) { + mpd->retval = err; + goto submit_io; + } + + /* + * get block failure will cause us to loop in + * writepages, because a_ops->writepage won't be able + * to make progress. The page will be redirtied by + * writepage and writepages will again try to write + * the same. + */ + if (!(EXT4_SB(sb)->s_mount_flags & EXT4_MF_FS_ABORTED)) { + ext4_msg(sb, KERN_CRIT, + "delayed block allocation failed for inode %lu " + "at logical offset %llu with max blocks %zd " + "with error %d", mpd->inode->i_ino, + (unsigned long long) next, + mpd->b_size >> mpd->inode->i_blkbits, err); + ext4_msg(sb, KERN_CRIT, + "This should not happen!! 
Data will be lost\n"); + if (err == -ENOSPC) + ext4_print_free_blocks(mpd->inode); + } + /* invalidate all the pages */ + ext4_da_block_invalidatepages(mpd); + + /* Mark this page range as having been completed */ + mpd->io_done = 1; + return; + } + BUG_ON(blks == 0); + + mapp = ↦ + if (map.m_flags & EXT4_MAP_NEW) { + struct block_device *bdev = mpd->inode->i_sb->s_bdev; + int i; + + for (i = 0; i < map.m_len; i++) + unmap_underlying_metadata(bdev, map.m_pblk + i); + + if (ext4_should_order_data(mpd->inode)) { + err = ext4_jbd2_file_inode(handle, mpd->inode); + if (err) { + /* Only if the journal is aborted */ + mpd->retval = err; + goto submit_io; + } + } + } + + /* + * Update on-disk size along with block allocation. + */ + disksize = ((loff_t) next + blks) << mpd->inode->i_blkbits; + if (disksize > i_size_read(mpd->inode)) + disksize = i_size_read(mpd->inode); + if (disksize > EXT4_I(mpd->inode)->i_disksize) { + ext4_update_i_disksize(mpd->inode, disksize); + err = ext4_mark_inode_dirty(handle, mpd->inode); + if (err) + ext4_error(mpd->inode->i_sb, + "Failed to mark inode %lu dirty", + mpd->inode->i_ino); + } + +submit_io: + mpage_da_submit_io(mpd, mapp); + mpd->io_done = 1; +} + +#define BH_FLAGS ((1 << BH_Uptodate) | (1 << BH_Mapped) | \ + (1 << BH_Delay) | (1 << BH_Unwritten)) + +/* + * mpage_add_bh_to_extent - try to add one more block to extent of blocks + * + * @mpd->lbh - extent of blocks + * @logical - logical number of the block in the file + * @bh - bh of the block (used to access block's state) + * + * the function is used to collect contig. blocks in same state + */ +static void mpage_add_bh_to_extent(struct mpage_da_data *mpd, + sector_t logical, size_t b_size, + unsigned long b_state) +{ + sector_t next; + int nrblocks = mpd->b_size >> mpd->inode->i_blkbits; + + /* + * XXX Don't go larger than mballoc is willing to allocate + * This is a stopgap solution. 
We eventually need to fold + * mpage_da_submit_io() into this function and then call + * ext4_map_blocks() multiple times in a loop + */ + if (nrblocks >= 8*1024*1024/mpd->inode->i_sb->s_blocksize) + goto flush_it; + + /* check if thereserved journal credits might overflow */ + if (!(ext4_test_inode_flag(mpd->inode, EXT4_INODE_EXTENTS))) { + if (nrblocks >= EXT4_MAX_TRANS_DATA) { + /* + * With non-extent format we are limited by the journal + * credit available. Total credit needed to insert + * nrblocks contiguous blocks is dependent on the + * nrblocks. So limit nrblocks. + */ + goto flush_it; + } else if ((nrblocks + (b_size >> mpd->inode->i_blkbits)) > + EXT4_MAX_TRANS_DATA) { + /* + * Adding the new buffer_head would make it cross the + * allowed limit for which we have journal credit + * reserved. So limit the new bh->b_size + */ + b_size = (EXT4_MAX_TRANS_DATA - nrblocks) << + mpd->inode->i_blkbits; + /* we will do mpage_da_submit_io in the next loop */ + } + } + /* + * First block in the extent + */ + if (mpd->b_size == 0) { + mpd->b_blocknr = logical; + mpd->b_size = b_size; + mpd->b_state = b_state & BH_FLAGS; + return; + } + + next = mpd->b_blocknr + nrblocks; + /* + * Can we merge the block to our big extent? + */ + if (logical == next && (b_state & BH_FLAGS) == mpd->b_state) { + mpd->b_size += b_size; + return; + } + +flush_it: + /* + * We couldn't merge the block to our extent, so we + * need to flush current extent and start new one + */ + mpage_da_map_and_submit(mpd); + return; +} + +static int ext4_bh_delay_or_unwritten(handle_t *handle, struct buffer_head *bh) +{ + return (buffer_delay(bh) || buffer_unwritten(bh)) && buffer_dirty(bh); +} + +/* + * This function is grabs code from the very beginning of + * ext4_map_blocks, but assumes that the caller is from delayed write + * time. This function looks up the requested blocks and sets the + * buffer delay bit under the protection of i_data_sem. 
+ */
+static int ext4_da_map_blocks(struct inode *inode, sector_t iblock,
+			      struct ext4_map_blocks *map,
+			      struct buffer_head *bh)
+{
+	struct ext4_inode_info *ei = EXT4_I(inode);
+	sector_t fake_pblk = ~((sector_t) 0xffff);
+	int ret;
+
+	/*
+	 * Pick a placeholder physical block number for delayed buffers;
+	 * fall back to all-ones if the first choice could be a real block
+	 * of this filesystem.
+	 */
+	if (fake_pblk < ext4_blocks_count(EXT4_SB(inode->i_sb)->s_es))
+		fake_pblk = ~0;
+
+	map->m_flags = 0;
+	ext_debug("ext4_da_map_blocks(): inode %lu, max_blocks %u,"
+		  "logical block %lu\n", inode->i_ino, map->m_len,
+		  (unsigned long) map->m_lblk);
+
+	/*
+	 * Pure lookup (no handle, no flags): find out whether the block
+	 * already exists without asking for a new file system block.
+	 */
+	down_read(&ei->i_data_sem);
+	ret = ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS) ?
+		ext4_ext_map_blocks(NULL, inode, map, 0) :
+		ext4_ind_map_blocks(NULL, inode, map, 0);
+
+	/* Anything other than "nothing mapped" goes straight back */
+	if (ret != 0)
+		goto out_unlock;
+
+	/*
+	 * XXX: __block_prepare_write() unmaps passed block,
+	 * is it OK?
+	 *
+	 * A block that lives in an already allocated cluster needs no
+	 * further reservation; everything else must reserve space now.
+	 */
+	if (!(map->m_flags & EXT4_MAP_FROM_CLUSTER)) {
+		ret = ext4_da_reserve_space(inode, iblock);
+		if (ret)
+			goto out_unlock;	/* not enough space to reserve */
+	}
+
+	/*
+	 * EXT4_MAP_FROM_CLUSTER has served its purpose and must not leak
+	 * into bh->b_state.
+	 */
+	map->m_flags &= ~EXT4_MAP_FROM_CLUSTER;
+
+	map_bh(bh, inode->i_sb, fake_pblk);
+	set_buffer_new(bh);
+	set_buffer_delay(bh);
+
+out_unlock:
+	up_read(&ei->i_data_sem);
+
+	return ret;
+}
+
+/*
+ * This is a special get_blocks_t callback which is used by
+ * ext4_da_write_begin().  It will either return mapped block or
+ * reserve space for a single block.
+ *
+ * For delayed buffer_head we have BH_Mapped, BH_New, BH_Delay set.
+ * We also have b_blocknr = -1 and b_bdev initialized properly
+ *
+ * For unwritten buffer_head we have BH_Mapped, BH_New, BH_Unwritten set.
+ * We also have b_blocknr = physicalblock mapping unwritten extent and b_bdev
+ * initialized properly.
+ */
+static int ext4_da_get_block_prep(struct inode *inode, sector_t iblock,
+				  struct buffer_head *bh, int create)
+{
+	struct ext4_map_blocks map;
+	unsigned long state;
+	int nr_mapped;
+
+	BUG_ON(!create);
+	BUG_ON(inode->i_sb->s_blocksize != bh->b_size);
+
+	/* Ask about exactly one logical block. */
+	map.m_len = 1;
+	map.m_lblk = iblock;
+
+	/*
+	 * First find out whether the block is allocated already.
+	 * Preallocated blocks are unmapped but should be treated
+	 * the same as allocated blocks.
+	 */
+	nr_mapped = ext4_da_map_blocks(inode, iblock, &map, bh);
+	if (nr_mapped <= 0)
+		return nr_mapped;
+
+	/* A block was found: record it and its mapping flags in the bh. */
+	map_bh(bh, inode->i_sb, map.m_pblk);
+	state = bh->b_state & ~EXT4_MAP_FLAGS;
+	bh->b_state = state | map.m_flags;
+
+	if (!buffer_unwritten(bh))
+		return 0;
+
+	/*
+	 * A delayed write to an unwritten bh should be marked new and
+	 * mapped. Mapped ensures that we don't do get_block multiple
+	 * times when we write to the same offset, and new ensures that
+	 * we do proper zero out for a partial write.
+	 */
+	set_buffer_new(bh);
+	set_buffer_mapped(bh);
+	return 0;
+}
+
+/*
+ * This function is used as a standard get_block_t callback function
+ * when there is no desire to allocate any blocks. It is used as a
+ * callback function for block_write_begin() and block_write_full_page().
+ * These functions should only try to map a single block at a time.
+ *
+ * Since this function doesn't do block allocations even if the caller
+ * requests it by passing in create=1, it is critically important that
+ * any caller checks to make sure that any buffer heads returned
+ * by this function are either all already mapped or marked for
+ * delayed allocation before calling block_write_full_page(). Otherwise,
+ * b_blocknr could be left uninitialized, and the page write functions will
+ * be taken by surprise.
+ */ +static int noalloc_get_block_write(struct inode *inode, sector_t iblock, + struct buffer_head *bh_result, int create) +{ + BUG_ON(bh_result->b_size != inode->i_sb->s_blocksize); + return _ext4_get_block(inode, iblock, bh_result, 0); +} + +static int bget_one(handle_t *handle, struct buffer_head *bh) +{ + get_bh(bh); + return 0; +} + +static int bput_one(handle_t *handle, struct buffer_head *bh) +{ + put_bh(bh); + return 0; +} + +static int __ext4_journalled_writepage(struct page *page, + unsigned int len) +{ + struct address_space *mapping = page->mapping; + struct inode *inode = mapping->host; + struct buffer_head *page_bufs; + handle_t *handle = NULL; + int ret = 0; + int err; + + ClearPageChecked(page); + page_bufs = page_buffers(page); + BUG_ON(!page_bufs); + walk_page_buffers(handle, page_bufs, 0, len, NULL, bget_one); + /* As soon as we unlock the page, it can go away, but we have + * references to buffers so we are safe */ + unlock_page(page); + + handle = ext4_journal_start(inode, ext4_writepage_trans_blocks(inode)); + if (IS_ERR(handle)) { + ret = PTR_ERR(handle); + goto out; + } + + BUG_ON(!ext4_handle_valid(handle)); + + ret = walk_page_buffers(handle, page_bufs, 0, len, NULL, + do_journal_get_write_access); + + err = walk_page_buffers(handle, page_bufs, 0, len, NULL, + write_end_fn); + if (ret == 0) + ret = err; + EXT4_I(inode)->i_datasync_tid = handle->h_transaction->t_tid; + err = ext4_journal_stop(handle); + if (!ret) + ret = err; + + walk_page_buffers(handle, page_bufs, 0, len, NULL, bput_one); + ext4_set_inode_state(inode, EXT4_STATE_JDATA); +out: + return ret; +} + +static int ext4_set_bh_endio(struct buffer_head *bh, struct inode *inode); +static void ext4_end_io_buffer_write(struct buffer_head *bh, int uptodate); + +/* + * Note that we don't need to start a transaction unless we're journaling data + * because we should have holes filled from ext4_page_mkwrite(). 
+ * We don't even
+ * need to file the inode to the transaction's list in ordered mode because if
+ * we are writing back data added by write(), the inode is already there and if
+ * we are writing back data modified via mmap(), no one guarantees in which
+ * transaction the data will hit the disk. In case we are journaling data, we
+ * cannot start a transaction directly because transaction start ranks above
+ * the page lock, so we have to do some magic.
+ *
+ * This function can get called via...
+ *   - ext4_da_writepages after taking page lock (have journal handle)
+ *   - journal_submit_inode_data_buffers (no journal handle)
+ *   - shrink_page_list via pdflush (no journal handle)
+ *   - grab_page_cache when doing write_begin (have journal handle)
+ *
+ * We don't do any block allocation in this function. If we have a page with
+ * multiple blocks we need to write those buffer_heads that are mapped. This
+ * is important for mmap-based writes. So if we do, with blocksize 1K,
+ *   truncate(f, 1024);
+ *   a = mmap(f, 0, 4096);
+ *   a[0] = 'a';
+ *   truncate(f, 4096);
+ * we have in the page the first buffer_head mapped via the page_mkwrite
+ * callback, but the other buffer_heads would be unmapped but dirty (dirtied
+ * via do_wp_page). So writepage should write the first block. If we modify
+ * the mmap area beyond 1024 we will again get a page fault and the
+ * page_mkwrite callback will do the block allocation and mark the
+ * buffer_heads mapped.
+ *
+ * We redirty the page if any buffer_head in the page is either delayed or
+ * unwritten.
+ *
+ * We can get called recursively, as shown below.
+ *
+ *	ext4_writepage() -> kmalloc() -> __alloc_pages() -> page_launder() ->
+ *		ext4_writepage()
+ *
+ * But since we don't do any block allocation we should not deadlock.
+ * The page also has its dirty flag cleared so we don't get a recursive
+ * page_lock.
+ */
+static int ext4_writepage(struct page *page,
+			  struct writeback_control *wbc)
+{
+	int ret = 0, commit_write = 0;
+	loff_t size;
+	unsigned int len;
+	struct buffer_head *page_bufs = NULL;
+	struct inode *inode = page->mapping->host;
+
+	trace_ext4_writepage(page);
+	size = i_size_read(inode);
+	/*
+	 * The page that contains i_size is only handled up to i_size;
+	 * every other page is handled in full.
+	 */
+	if (page->index == size >> PAGE_CACHE_SHIFT)
+		len = size & ~PAGE_CACHE_MASK;
+	else
+		len = PAGE_CACHE_SIZE;
+
+	/*
+	 * If the page does not have buffers (for whatever reason),
+	 * try to create them using __block_write_begin.  If this
+	 * fails, redirty the page and move on.
+	 */
+	if (!page_has_buffers(page)) {
+		if (__block_write_begin(page, 0, len,
+					noalloc_get_block_write)) {
+			/*
+			 * Shared bail-out: also jumped to from the
+			 * delay/unwritten check below.
+			 */
+		redirty_page:
+			redirty_page_for_writepage(wbc, page);
+			unlock_page(page);
+			return 0;
+		}
+		/* remember to commit the buffers we just created */
+		commit_write = 1;
+	}
+	page_bufs = page_buffers(page);
+	if (walk_page_buffers(NULL, page_bufs, 0, len, NULL,
+			      ext4_bh_delay_or_unwritten)) {
+		/*
+		 * We don't want to do block allocation, so redirty
+		 * the page and return.  We may reach here when we do
+		 * a journal commit via journal_submit_inode_data_buffers.
+		 * We can also reach here via shrink_page_list but it
+		 * should never be for direct reclaim so warn if that
+		 * happens.
+		 */
+		WARN_ON_ONCE((current->flags & (PF_MEMALLOC|PF_KSWAPD)) ==
+								PF_MEMALLOC);
+		goto redirty_page;
+	}
+	if (commit_write)
+		/* now mark the buffer_heads as dirty and uptodate */
+		block_commit_write(page, 0, len);
+
+	if (PageChecked(page) && ext4_should_journal_data(inode))
+		/*
+		 * It's mmapped pagecache.  Add buffers and journal it.  There
+		 * doesn't seem much point in redirtying the page here.
+		 */
+		return __ext4_journalled_writepage(page, len);
+
+	/*
+	 * If the first buffer is flagged uninit, attach an io_end to it and
+	 * complete the write through ext4_end_io_buffer_write(); otherwise
+	 * use the plain block_write_full_page() path.
+	 */
+	if (buffer_uninit(page_bufs)) {
+		ext4_set_bh_endio(page_bufs, inode);
+		ret = block_write_full_page_endio(page, noalloc_get_block_write,
+					    wbc, ext4_end_io_buffer_write);
+	} else
+		ret = block_write_full_page(page, noalloc_get_block_write,
+					    wbc);
+
+	return ret;
+}
+
+/*
+ * This is called via ext4_da_writepages() to
+ * calculate the total number of credits to reserve to fit
+ * a single extent allocation into a single transaction.
+ * ext4_da_writepages() will loop calling this before
+ * the block allocation.
+ */
+
+static int ext4_da_writepages_trans_blocks(struct inode *inode)
+{
+	/* start from the number of data blocks currently reserved */
+	int max_blocks = EXT4_I(inode)->i_reserved_data_blocks;
+
+	/*
+	 * With non-extent format the journal credit needed to
+	 * insert nrblocks contiguous blocks is dependent on the
+	 * number of contiguous blocks. So we will limit the
+	 * number of contiguous blocks to a sane value.
+	 */
+	if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) &&
+	    (max_blocks > EXT4_MAX_TRANS_DATA))
+		max_blocks = EXT4_MAX_TRANS_DATA;
+
+	return ext4_chunk_trans_blocks(inode, max_blocks);
+}
+
+/*
+ * write_cache_pages_da - walk the list of dirty pages of the given
+ * address space and accumulate pages that need writing, and call
+ * mpage_da_map_and_submit to map a single contiguous memory region
+ * and then write them.
+ */ +static int write_cache_pages_da(struct address_space *mapping, + struct writeback_control *wbc, + struct mpage_da_data *mpd, + pgoff_t *done_index) +{ + struct buffer_head *bh, *head; + struct inode *inode = mapping->host; + struct pagevec pvec; + unsigned int nr_pages; + sector_t logical; + pgoff_t index, end; + long nr_to_write = wbc->nr_to_write; + int i, tag, ret = 0; + + memset(mpd, 0, sizeof(struct mpage_da_data)); + mpd->wbc = wbc; + mpd->inode = inode; + pagevec_init(&pvec, 0); + index = wbc->range_start >> PAGE_CACHE_SHIFT; + end = wbc->range_end >> PAGE_CACHE_SHIFT; + + if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages) + tag = PAGECACHE_TAG_TOWRITE; + else + tag = PAGECACHE_TAG_DIRTY; + + *done_index = index; + while (index <= end) { + nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, tag, + min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1); + if (nr_pages == 0) + return 0; + + for (i = 0; i < nr_pages; i++) { + struct page *page = pvec.pages[i]; + + /* + * At this point, the page may be truncated or + * invalidated (changing page->mapping to NULL), or + * even swizzled back from swapper_space to tmpfs file + * mapping. However, page->index will not change + * because we have a reference on the page. 
+ */ + if (page->index > end) + goto out; + + *done_index = page->index + 1; + + /* + * If we can't merge this page, and we have + * accumulated an contiguous region, write it + */ + if ((mpd->next_page != page->index) && + (mpd->next_page != mpd->first_page)) { + mpage_da_map_and_submit(mpd); + goto ret_extent_tail; + } + + lock_page(page); + + /* + * If the page is no longer dirty, or its + * mapping no longer corresponds to inode we + * are writing (which means it has been + * truncated or invalidated), or the page is + * already under writeback and we are not + * doing a data integrity writeback, skip the page + */ + if (!PageDirty(page) || + (PageWriteback(page) && + (wbc->sync_mode == WB_SYNC_NONE)) || + unlikely(page->mapping != mapping)) { + unlock_page(page); + continue; + } + + wait_on_page_writeback(page); + BUG_ON(PageWriteback(page)); + + if (mpd->next_page != page->index) + mpd->first_page = page->index; + mpd->next_page = page->index + 1; + logical = (sector_t) page->index << + (PAGE_CACHE_SHIFT - inode->i_blkbits); + + if (!page_has_buffers(page)) { + mpage_add_bh_to_extent(mpd, logical, + PAGE_CACHE_SIZE, + (1 << BH_Dirty) | (1 << BH_Uptodate)); + if (mpd->io_done) + goto ret_extent_tail; + } else { + /* + * Page with regular buffer heads, + * just add all dirty ones + */ + head = page_buffers(page); + bh = head; + do { + BUG_ON(buffer_locked(bh)); + /* + * We need to try to allocate + * unmapped blocks in the same page. + * Otherwise we won't make progress + * with the page in ext4_writepage + */ + if (ext4_bh_delay_or_unwritten(NULL, bh)) { + mpage_add_bh_to_extent(mpd, logical, + bh->b_size, + bh->b_state); + if (mpd->io_done) + goto ret_extent_tail; + } else if (buffer_dirty(bh) && (buffer_mapped(bh))) { + /* + * mapped dirty buffer. We need + * to update the b_state + * because we look at b_state + * in mpage_da_map_blocks. 
We + * don't update b_size because + * if we find an unmapped + * buffer_head later we need to + * use the b_state flag of that + * buffer_head. + */ + if (mpd->b_size == 0) + mpd->b_state = bh->b_state & BH_FLAGS; + } + logical++; + } while ((bh = bh->b_this_page) != head); + } + + if (nr_to_write > 0) { + nr_to_write--; + if (nr_to_write == 0 && + wbc->sync_mode == WB_SYNC_NONE) + /* + * We stop writing back only if we are + * not doing integrity sync. In case of + * integrity sync we have to keep going + * because someone may be concurrently + * dirtying pages, and we might have + * synced a lot of newly appeared dirty + * pages, but have not synced all of the + * old dirty pages. + */ + goto out; + } + } + pagevec_release(&pvec); + cond_resched(); + } + return 0; +ret_extent_tail: + ret = MPAGE_DA_EXTENT_TAIL; +out: + pagevec_release(&pvec); + cond_resched(); + return ret; +} + + +static int ext4_da_writepages(struct address_space *mapping, + struct writeback_control *wbc) +{ + pgoff_t index; + int range_whole = 0; + handle_t *handle = NULL; + struct mpage_da_data mpd; + struct inode *inode = mapping->host; + int pages_written = 0; + unsigned int max_pages; + int range_cyclic, cycled = 1, io_done = 0; + int needed_blocks, ret = 0; + long desired_nr_to_write, nr_to_writebump = 0; + loff_t range_start = wbc->range_start; + struct ext4_sb_info *sbi = EXT4_SB(mapping->host->i_sb); + pgoff_t done_index = 0; + pgoff_t end; + struct blk_plug plug; + + trace_ext4_da_writepages(inode, wbc); + + /* + * No pages to write? This is mainly a kludge to avoid starting + * a transaction for special inodes like journal inode on last iput() + * because that could violate lock ordering on umount + */ + if (!mapping->nrpages || !mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) + return 0; + + /* + * If the filesystem has aborted, it is read-only, so return + * right away instead of dumping stack traces later on that + * will obscure the real source of the problem. 
We test + * EXT4_MF_FS_ABORTED instead of sb->s_flag's MS_RDONLY because + * the latter could be true if the filesystem is mounted + * read-only, and in that case, ext4_da_writepages should + * *never* be called, so if that ever happens, we would want + * the stack trace. + */ + if (unlikely(sbi->s_mount_flags & EXT4_MF_FS_ABORTED)) + return -EROFS; + + if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX) + range_whole = 1; + + range_cyclic = wbc->range_cyclic; + if (wbc->range_cyclic) { + index = mapping->writeback_index; + if (index) + cycled = 0; + wbc->range_start = index << PAGE_CACHE_SHIFT; + wbc->range_end = LLONG_MAX; + wbc->range_cyclic = 0; + end = -1; + } else { + index = wbc->range_start >> PAGE_CACHE_SHIFT; + end = wbc->range_end >> PAGE_CACHE_SHIFT; + } + + /* + * This works around two forms of stupidity. The first is in + * the writeback code, which caps the maximum number of pages + * written to be 1024 pages. This is wrong on multiple + * levels; different architectues have a different page size, + * which changes the maximum amount of data which gets + * written. Secondly, 4 megabytes is way too small. XFS + * forces this value to be 16 megabytes by multiplying + * nr_to_write parameter by four, and then relies on its + * allocator to allocate larger extents to make them + * contiguous. Unfortunately this brings us to the second + * stupidity, which is that ext4's mballoc code only allocates + * at most 2048 blocks. So we force contiguous writes up to + * the number of dirty blocks in the inode, or + * sbi->max_writeback_mb_bump whichever is smaller. 
+ */ + max_pages = sbi->s_max_writeback_mb_bump << (20 - PAGE_CACHE_SHIFT); + if (!range_cyclic && range_whole) { + if (wbc->nr_to_write == LONG_MAX) + desired_nr_to_write = wbc->nr_to_write; + else + desired_nr_to_write = wbc->nr_to_write * 8; + } else + desired_nr_to_write = ext4_num_dirty_pages(inode, index, + max_pages); + if (desired_nr_to_write > max_pages) + desired_nr_to_write = max_pages; + + if (wbc->nr_to_write < desired_nr_to_write) { + nr_to_writebump = desired_nr_to_write - wbc->nr_to_write; + wbc->nr_to_write = desired_nr_to_write; + } + +retry: + if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages) + tag_pages_for_writeback(mapping, index, end); + + blk_start_plug(&plug); + while (!ret && wbc->nr_to_write > 0) { + + /* + * we insert one extent at a time. So we need + * credit needed for single extent allocation. + * journalled mode is currently not supported + * by delalloc + */ + BUG_ON(ext4_should_journal_data(inode)); + needed_blocks = ext4_da_writepages_trans_blocks(inode); + + /* start a new transaction*/ + handle = ext4_journal_start(inode, needed_blocks); + if (IS_ERR(handle)) { + ret = PTR_ERR(handle); + ext4_msg(inode->i_sb, KERN_CRIT, "%s: jbd2_start: " + "%ld pages, ino %lu; err %d", __func__, + wbc->nr_to_write, inode->i_ino, ret); + blk_finish_plug(&plug); + goto out_writepages; + } + + /* + * Now call write_cache_pages_da() to find the next + * contiguous region of logical blocks that need + * blocks to be allocated by ext4 and submit them. + */ + ret = write_cache_pages_da(mapping, wbc, &mpd, &done_index); + /* + * If we have a contiguous extent of pages and we + * haven't done the I/O yet, map the blocks and submit + * them for I/O. 
+ */ + if (!mpd.io_done && mpd.next_page != mpd.first_page) { + mpage_da_map_and_submit(&mpd); + ret = MPAGE_DA_EXTENT_TAIL; + } + trace_ext4_da_write_pages(inode, &mpd); + wbc->nr_to_write -= mpd.pages_written; + + ext4_journal_stop(handle); + + if ((mpd.retval == -ENOSPC) && sbi->s_journal) { + /* commit the transaction which would + * free blocks released in the transaction + * and try again + */ + jbd2_journal_force_commit_nested(sbi->s_journal); + ret = 0; + } else if (ret == MPAGE_DA_EXTENT_TAIL) { + /* + * Got one extent now try with rest of the pages. + * If mpd.retval is set -EIO, journal is aborted. + * So we don't need to write any more. + */ + pages_written += mpd.pages_written; + ret = mpd.retval; + io_done = 1; + } else if (wbc->nr_to_write) + /* + * There is no more writeout needed + * or we requested for a noblocking writeout + * and we found the device congested + */ + break; + } + blk_finish_plug(&plug); + if (!io_done && !cycled) { + cycled = 1; + index = 0; + wbc->range_start = index << PAGE_CACHE_SHIFT; + wbc->range_end = mapping->writeback_index - 1; + goto retry; + } + + /* Update index */ + wbc->range_cyclic = range_cyclic; + if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0)) + /* + * set the writeback_index so that range_cyclic + * mode will write it back later + */ + mapping->writeback_index = done_index; + +out_writepages: + wbc->nr_to_write -= nr_to_writebump; + wbc->range_start = range_start; + trace_ext4_da_writepages_result(inode, wbc, ret, pages_written); + return ret; +} + +#define FALL_BACK_TO_NONDELALLOC 1 +static int ext4_nonda_switch(struct super_block *sb) +{ + s64 free_blocks, dirty_blocks; + struct ext4_sb_info *sbi = EXT4_SB(sb); + + /* + * switch to non delalloc mode if we are running low + * on free block. 
The free block accounting via percpu + * counters can get slightly wrong with percpu_counter_batch getting + * accumulated on each CPU without updating global counters + * Delalloc need an accurate free block accounting. So switch + * to non delalloc when we are near to error range. + */ + free_blocks = EXT4_C2B(sbi, + percpu_counter_read_positive(&sbi->s_freeclusters_counter)); + dirty_blocks = percpu_counter_read_positive(&sbi->s_dirtyclusters_counter); + if (2 * free_blocks < 3 * dirty_blocks || + free_blocks < (dirty_blocks + EXT4_FREECLUSTERS_WATERMARK)) { + /* + * free block count is less than 150% of dirty blocks + * or free blocks is less than watermark + */ + return 1; + } + /* + * Even if we don't switch but are nearing capacity, + * start pushing delalloc when 1/2 of free blocks are dirty. + */ + if (free_blocks < 2 * dirty_blocks) + writeback_inodes_sb_if_idle(sb, WB_REASON_FS_FREE_SPACE); + + return 0; +} + +static int ext4_da_write_begin(struct file *file, struct address_space *mapping, + loff_t pos, unsigned len, unsigned flags, + struct page **pagep, void **fsdata) +{ + int ret, retries = 0; + struct page *page; + pgoff_t index; + struct inode *inode = mapping->host; + handle_t *handle; + + index = pos >> PAGE_CACHE_SHIFT; + + if (ext4_nonda_switch(inode->i_sb)) { + *fsdata = (void *)FALL_BACK_TO_NONDELALLOC; + return ext4_write_begin(file, mapping, pos, + len, flags, pagep, fsdata); + } + *fsdata = (void *)0; + trace_ext4_da_write_begin(inode, pos, len, flags); +retry: + /* + * With delayed allocation, we don't log the i_disksize update + * if there is delayed block allocation. But we still need + * to journalling the i_disksize update if writes to the end + * of file which has an already mapped buffer. 
+ */ + handle = ext4_journal_start(inode, 1); + if (IS_ERR(handle)) { + ret = PTR_ERR(handle); + goto out; + } + /* We cannot recurse into the filesystem as the transaction is already + * started */ + flags |= AOP_FLAG_NOFS; + + page = grab_cache_page_write_begin(mapping, index, flags); + if (!page) { + ext4_journal_stop(handle); + ret = -ENOMEM; + goto out; + } + *pagep = page; + + ret = __block_write_begin(page, pos, len, ext4_da_get_block_prep); + if (ret < 0) { + unlock_page(page); + ext4_journal_stop(handle); + page_cache_release(page); + /* + * block_write_begin may have instantiated a few blocks + * outside i_size. Trim these off again. Don't need + * i_size_read because we hold i_mutex. + */ + if (pos + len > inode->i_size) + ext4_truncate_failed_write(inode); + } + + if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries)) + goto retry; +out: + return ret; +} + +/* + * Check if we should update i_disksize + * when write to the end of file but not require block allocation + */ +static int ext4_da_should_update_i_disksize(struct page *page, + unsigned long offset) +{ + struct buffer_head *bh; + struct inode *inode = page->mapping->host; + unsigned int idx; + int i; + + bh = page_buffers(page); + idx = offset >> inode->i_blkbits; + + for (i = 0; i < idx; i++) + bh = bh->b_this_page; + + if (!buffer_mapped(bh) || (buffer_delay(bh)) || buffer_unwritten(bh)) + return 0; + return 1; +} + +static int ext4_da_write_end(struct file *file, + struct address_space *mapping, + loff_t pos, unsigned len, unsigned copied, + struct page *page, void *fsdata) +{ + struct inode *inode = mapping->host; + int ret = 0, ret2; + handle_t *handle = ext4_journal_current_handle(); + loff_t new_i_size; + unsigned long start, end; + int write_mode = (int)(unsigned long)fsdata; + + if (write_mode == FALL_BACK_TO_NONDELALLOC) { + switch (ext4_inode_journal_mode(inode)) { + case EXT4_INODE_ORDERED_DATA_MODE: + return ext4_ordered_write_end(file, mapping, pos, + len, copied, 
page, fsdata); + case EXT4_INODE_WRITEBACK_DATA_MODE: + return ext4_writeback_write_end(file, mapping, pos, + len, copied, page, fsdata); + default: + BUG(); + } + } + + trace_ext4_da_write_end(inode, pos, len, copied); + start = pos & (PAGE_CACHE_SIZE - 1); + end = start + copied - 1; + + /* + * generic_write_end() will run mark_inode_dirty() if i_size + * changes. So let's piggyback the i_disksize mark_inode_dirty + * into that. + */ + + new_i_size = pos + copied; + if (copied && new_i_size > EXT4_I(inode)->i_disksize) { + if (ext4_da_should_update_i_disksize(page, end)) { + down_write(&EXT4_I(inode)->i_data_sem); + if (new_i_size > EXT4_I(inode)->i_disksize) { + /* + * Updating i_disksize when extending file + * without needing block allocation + */ + if (ext4_should_order_data(inode)) + ret = ext4_jbd2_file_inode(handle, + inode); + + EXT4_I(inode)->i_disksize = new_i_size; + } + up_write(&EXT4_I(inode)->i_data_sem); + /* We need to mark inode dirty even if + * new_i_size is less that inode->i_size + * bu greater than i_disksize.(hint delalloc) + */ + ext4_mark_inode_dirty(handle, inode); + } + } + ret2 = generic_write_end(file, mapping, pos, len, copied, + page, fsdata); + copied = ret2; + if (ret2 < 0) + ret = ret2; + ret2 = ext4_journal_stop(handle); + if (!ret) + ret = ret2; + + return ret ? ret : copied; +} + +static void ext4_da_invalidatepage(struct page *page, unsigned long offset) +{ + /* + * Drop reserved blocks + */ + BUG_ON(!PageLocked(page)); + if (!page_has_buffers(page)) + goto out; + + ext4_da_page_release_reservation(page, offset); + +out: + ext4_invalidatepage(page, offset); + + return; +} + +/* + * Force all delayed allocation blocks to be allocated for a given inode. + */ +int ext4_alloc_da_blocks(struct inode *inode) +{ + trace_ext4_alloc_da_blocks(inode); + + if (!EXT4_I(inode)->i_reserved_data_blocks && + !EXT4_I(inode)->i_reserved_meta_blocks) + return 0; + + /* + * We do something simple for now. 
The filemap_flush() will + * also start triggering a write of the data blocks, which is + * not strictly speaking necessary (and for users of + * laptop_mode, not even desirable). However, to do otherwise + * would require replicating code paths in: + * + * ext4_da_writepages() -> + * write_cache_pages() ---> (via passed in callback function) + * __mpage_da_writepage() --> + * mpage_add_bh_to_extent() + * mpage_da_map_blocks() + * + * The problem is that write_cache_pages(), located in + * mm/page-writeback.c, marks pages clean in preparation for + * doing I/O, which is not desirable if we're not planning on + * doing I/O at all. + * + * We could call write_cache_pages(), and then redirty all of + * the pages by calling redirty_page_for_writepage() but that + * would be ugly in the extreme. So instead we would need to + * replicate parts of the code in the above functions, + * simplifying them because we wouldn't actually intend to + * write out the pages, but rather only collect contiguous + * logical block extents, call the multi-block allocator, and + * then update the buffer heads with the block allocations. + * + * For now, though, we'll cheat by calling filemap_flush(), + * which will map the blocks, and start the I/O, but not + * actually wait for the I/O to complete. + */ + return filemap_flush(inode->i_mapping); +} + +/* + * bmap() is special. It gets used by applications such as lilo and by + * the swapper to find the on-disk block of a specific piece of data. + * + * Naturally, this is dangerous if the block concerned is still in the + * journal. If somebody makes a swapfile on an ext4 data-journaling + * filesystem and enables swap, then they may get a nasty shock when the + * data getting swapped to that swapfile suddenly gets overwritten by + * the original zero's written out previously to the journal and + * awaiting writeback in the kernel's buffer cache. 
+ * + * So, if we see any bmap calls here on a modified, data-journaled file, + * take extra steps to flush any blocks which might be in the cache. + */ +static sector_t ext4_bmap(struct address_space *mapping, sector_t block) +{ + struct inode *inode = mapping->host; + journal_t *journal; + int err; + + if (mapping_tagged(mapping, PAGECACHE_TAG_DIRTY) && + test_opt(inode->i_sb, DELALLOC)) { + /* + * With delalloc we want to sync the file + * so that we can make sure we allocate + * blocks for file + */ + filemap_write_and_wait(mapping); + } + + if (EXT4_JOURNAL(inode) && + ext4_test_inode_state(inode, EXT4_STATE_JDATA)) { + /* + * This is a REALLY heavyweight approach, but the use of + * bmap on dirty files is expected to be extremely rare: + * only if we run lilo or swapon on a freshly made file + * do we expect this to happen. + * + * (bmap requires CAP_SYS_RAWIO so this does not + * represent an unprivileged user DOS attack --- we'd be + * in trouble if mortal users could trigger this path at + * will.) + * + * NB. EXT4_STATE_JDATA is not set on files other than + * regular files. If somebody wants to bmap a directory + * or symlink and gets confused because the buffer + * hasn't yet been flushed to disk, they deserve + * everything they get. 
+ */ + + ext4_clear_inode_state(inode, EXT4_STATE_JDATA); + journal = EXT4_JOURNAL(inode); + jbd2_journal_lock_updates(journal); + err = jbd2_journal_flush(journal); + jbd2_journal_unlock_updates(journal); + + if (err) + return 0; + } + + return generic_block_bmap(mapping, block, ext4_get_block); +} + +static int ext4_readpage(struct file *file, struct page *page) +{ + trace_ext4_readpage(page); + return mpage_readpage(page, ext4_get_block); +} + +static int +ext4_readpages(struct file *file, struct address_space *mapping, + struct list_head *pages, unsigned nr_pages) +{ + return mpage_readpages(mapping, pages, nr_pages, ext4_get_block); +} + +static void ext4_invalidatepage_free_endio(struct page *page, unsigned long offset) +{ + struct buffer_head *head, *bh; + unsigned int curr_off = 0; + + if (!page_has_buffers(page)) + return; + head = bh = page_buffers(page); + do { + if (offset <= curr_off && test_clear_buffer_uninit(bh) + && bh->b_private) { + ext4_free_io_end(bh->b_private); + bh->b_private = NULL; + bh->b_end_io = NULL; + } + curr_off = curr_off + bh->b_size; + bh = bh->b_this_page; + } while (bh != head); +} + +static void ext4_invalidatepage(struct page *page, unsigned long offset) +{ + journal_t *journal = EXT4_JOURNAL(page->mapping->host); + + trace_ext4_invalidatepage(page, offset); + + /* + * free any io_end structure allocated for buffers to be discarded + */ + if (ext4_should_dioread_nolock(page->mapping->host)) + ext4_invalidatepage_free_endio(page, offset); + /* + * If it's a full truncate we just forget about the pending dirtying + */ + if (offset == 0) + ClearPageChecked(page); + + if (journal) + jbd2_journal_invalidatepage(journal, page, offset); + else + block_invalidatepage(page, offset); +} + +static int ext4_releasepage(struct page *page, gfp_t wait) +{ + journal_t *journal = EXT4_JOURNAL(page->mapping->host); + + trace_ext4_releasepage(page); + + WARN_ON(PageChecked(page)); + if (!page_has_buffers(page)) + return 0; + if (journal) + 
return jbd2_journal_try_to_free_buffers(journal, page, wait);
+	else
+		return try_to_free_buffers(page);
+}
+
+/*
+ * ext4_get_block used when preparing for a DIO write or buffer write.
+ * We allocate an uninitialized extent if blocks haven't been allocated.
+ * The extent will be converted to initialized after the IO is complete.
+ *
+ * Note that 'create' is only reported in the debug message below; the
+ * mapping request itself always passes EXT4_GET_BLOCKS_IO_CREATE_EXT.
+ */
+static int ext4_get_block_write(struct inode *inode, sector_t iblock,
+		   struct buffer_head *bh_result, int create)
+{
+	ext4_debug("ext4_get_block_write: inode %lu, create flag %d\n",
+		   inode->i_ino, create);
+	return _ext4_get_block(inode, iblock, bh_result,
+			       EXT4_GET_BLOCKS_IO_CREATE_EXT);
+}
+/*
+ * ext4_end_io_dio() - completion callback handed to __blockdev_direct_IO()
+ * by ext4_ext_direct_IO().
+ *
+ * If no io_end is attached to the iocb, if nothing was transferred, or if
+ * the io_end is not flagged EXT4_IO_END_UNWRITTEN, the request is finished
+ * right here: aio_complete() for async requests, then inode_dio_done().
+ *
+ * Otherwise the io_end records offset/size (and iocb/result for async
+ * requests), is appended to the inode's i_completed_io_list and its work
+ * item is queued on dio_unwritten_wq.  Neither aio_complete() nor
+ * inode_dio_done() is called from this function on that path.
+ */
+static void ext4_end_io_dio(struct kiocb *iocb, loff_t offset,
+			    ssize_t size, void *private, int ret,
+			    bool is_async)
+{
+	struct inode *inode = iocb->ki_filp->f_path.dentry->d_inode;
+	ext4_io_end_t *io_end = iocb->private;
+	struct workqueue_struct *wq;
+	unsigned long flags;
+	struct ext4_inode_info *ei;
+
+	/* if not async direct IO or dio with 0 bytes write, just return */
+	if (!io_end || !size)
+		goto out;
+
+	ext_debug("ext4_end_io_dio(): io_end 0x%p "
+		  "for inode %lu, iocb 0x%p, offset %llu, size %zd\n",
+		  iocb->private, io_end->inode->i_ino, iocb, offset,
+		  size);
+
+	iocb->private = NULL;
+
+	/*
+	 * if not aio dio with unwritten extents, just free io and return.
+	 * The "out" label sits inside this block so that the early exit
+	 * above shares the completion code below while skipping the
+	 * ext4_free_io_end() call.
+	 */
+	if (!(io_end->flag & EXT4_IO_END_UNWRITTEN)) {
+		ext4_free_io_end(io_end);
+out:
+		if (is_async)
+			aio_complete(iocb, ret, 0);
+		inode_dio_done(inode);
+		return;
+	}
+
+	io_end->offset = offset;
+	io_end->size = size;
+	if (is_async) {
+		io_end->iocb = iocb;
+		io_end->result = ret;
+	}
+	wq = EXT4_SB(io_end->inode->i_sb)->dio_unwritten_wq;
+
+	/* Add the io_end to per-inode completed aio dio list */
+	ei = EXT4_I(io_end->inode);
+	spin_lock_irqsave(&ei->i_completed_io_lock, flags);
+	list_add_tail(&io_end->list, &ei->i_completed_io_list);
+	spin_unlock_irqrestore(&ei->i_completed_io_lock, flags);
+
+	/* queue the work to convert unwritten extents to written */
+	queue_work(wq, &io_end->work);
+}
+/*
+ * ext4_end_io_buffer_write() - b_end_io handler installed by
+ * ext4_set_bh_endio().
+ *
+ * When the buffer's "uninit" bit was set and an io_end is attached, the
+ * io_end is marked through ext4_set_io_unwritten_flag(), appended to the
+ * inode's i_completed_io_list and its work item is queued on
+ * dio_unwritten_wq.  If the superblock is no longer MS_ACTIVE the io_end is
+ * freed instead.  On every path the buffer's b_private/b_end_io hooks are
+ * reset and the write is finished through end_buffer_async_write().
+ */
+static void ext4_end_io_buffer_write(struct buffer_head *bh, int uptodate)
+{
+	ext4_io_end_t *io_end = bh->b_private;
+	struct workqueue_struct *wq;
+	struct inode *inode;
+	unsigned long flags;
+
+	if (!test_clear_buffer_uninit(bh) || !io_end)
+		goto out;
+
+	if (!(io_end->inode->i_sb->s_flags & MS_ACTIVE)) {
+		ext4_msg(io_end->inode->i_sb, KERN_INFO,
+			 "sb umounted, discard end_io request for inode %lu",
+			 io_end->inode->i_ino);
+		ext4_free_io_end(io_end);
+		goto out;
+	}
+
+	/*
+	 * It may be over-defensive here to check EXT4_IO_END_UNWRITTEN now,
+	 * but being more careful is always safe for the future change.
+	 */
+	inode = io_end->inode;
+	ext4_set_io_unwritten_flag(inode, io_end);
+
+	/* Add the io_end to per-inode completed io list */
+	spin_lock_irqsave(&EXT4_I(inode)->i_completed_io_lock, flags);
+	list_add_tail(&io_end->list, &EXT4_I(inode)->i_completed_io_list);
+	spin_unlock_irqrestore(&EXT4_I(inode)->i_completed_io_lock, flags);
+
+	wq = EXT4_SB(inode->i_sb)->dio_unwritten_wq;
+	/* queue the work to convert unwritten extents to written */
+	queue_work(wq, &io_end->work);
+out:
+	bh->b_private = NULL;
+	bh->b_end_io = NULL;
+	clear_buffer_uninit(bh);
+	end_buffer_async_write(bh, uptodate);
+}
+/*
+ * ext4_set_bh_endio() - attach a freshly allocated io_end to @bh and route
+ * the buffer's completion to ext4_end_io_buffer_write().
+ *
+ * The io_end's offset is the byte offset of the buffer's page and its size
+ * is bh->b_size.  The GFP_ATOMIC allocation is retried, after a schedule(),
+ * until it succeeds, so this function always returns 0.
+ */
+static int ext4_set_bh_endio(struct buffer_head *bh, struct inode *inode)
+{
+	ext4_io_end_t *io_end;
+	struct page *page = bh->b_page;
+	loff_t offset = (sector_t)page->index << PAGE_CACHE_SHIFT;
+	size_t size = bh->b_size;
+
+retry:
+	io_end = ext4_init_io_end(inode, GFP_ATOMIC);
+	if (!io_end) {
+		pr_warn_ratelimited("%s: allocation fail\n", __func__);
+		schedule();
+		goto retry;
+	}
+	io_end->offset = offset;
+	io_end->size = size;
+	/*
+	 * We need to hold a reference to the page to make sure it
+	 * doesn't get evicted before ext4_end_io_work() has a chance
+	 * to convert the extent from unwritten to written.
+	 */
+	io_end->page = page;
+	get_page(io_end->page);
+
+	bh->b_private = io_end;
+	bh->b_end_io = ext4_end_io_buffer_write;
+	return 0;
+}
+
+/*
+ * For ext4 extent files, ext4 will do direct-io writes to holes and
+ * preallocated extents without falling back to buffered IO.
+ *
+ * For holes, we fallocate those blocks and mark them as uninitialized.
+ * If those blocks were preallocated, we make sure they are split, but
+ * still keep the range to write as uninitialized.
+ *
+ * The unwritten extents will be converted to written when DIO is completed.
+ * For async direct IO, since the IO may still be pending when we return, we
+ * set up an end_io callback function, which will do the conversion
+ * when the async direct IO has completed.
+ *
+ * As coded below, only writes whose end does not exceed i_size take this
+ * path; reads and writes that end beyond i_size are handed to
+ * ext4_ind_direct_IO().
+ *
+ * If the O_DIRECT write will extend the file then add this inode to the
+ * orphan list.  So recovery will truncate it back to the original size
+ * if the machine crashes during the write.
+ * NOTE(review): no orphan-list handling is visible in this function;
+ * presumably ext4_ind_direct_IO() takes care of it - confirm.
+ */
+static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb,
+			      const struct iovec *iov, loff_t offset,
+			      unsigned long nr_segs)
+{
+	struct file *file = iocb->ki_filp;
+	struct inode *inode = file->f_mapping->host;
+	ssize_t ret;
+	size_t count = iov_length(iov, nr_segs);
+
+	loff_t final_size = offset + count;
+	if (rw == WRITE && final_size <= inode->i_size) {
+		/*
+		 * We could direct write to holes and fallocate.
+		 *
+		 * Allocated blocks to fill the hole are marked as uninitialized
+		 * to prevent a parallel buffered read from exposing the stale
+		 * data before DIO completes the data IO.
+		 *
+		 * As to previously fallocated extents, ext4 get_block
+		 * will just simply mark the buffer mapped but still
+		 * keep the extents uninitialized.
+		 *
+		 * for non AIO case, we will convert those unwritten extents
+		 * to written after return back from blockdev_direct_IO.
+		 *
+		 * for async DIO, the conversion needs to be deferred until
+		 * the IO is completed. The ext4 end_io callback function
+		 * will be called to take care of the conversion work.
+		 * Here for async case, we allocate an io_end structure to
+		 * hook to the iocb.
+		 */
+		iocb->private = NULL;
+		EXT4_I(inode)->cur_aio_dio = NULL;
+		if (!is_sync_kiocb(iocb)) {
+			ext4_io_end_t *io_end =
+				ext4_init_io_end(inode, GFP_NOFS);
+			if (!io_end)
+				return -ENOMEM;
+			io_end->flag |= EXT4_IO_END_DIRECT;
+			iocb->private = io_end;
+			/*
+			 * we save the io structure for current async
+			 * direct IO, so that later ext4_map_blocks()
+			 * could flag the io structure whether there
+			 * are unwritten extents that need to be converted
+			 * when IO is completed.
+			 */
+			EXT4_I(inode)->cur_aio_dio = iocb->private;
+		}
+
+		ret = __blockdev_direct_IO(rw, iocb, inode,
+					 inode->i_sb->s_bdev, iov,
+					 offset, nr_segs,
+					 ext4_get_block_write,
+					 ext4_end_io_dio,
+					 NULL,
+					 DIO_LOCKING);
+		if (iocb->private)
+			EXT4_I(inode)->cur_aio_dio = NULL;
+		/*
+		 * The io_end structure takes a reference to the inode,
+		 * that structure needs to be destroyed and the
+		 * reference to the inode need to be dropped, when IO is
+		 * complete, even with 0 byte write, or failed.
+		 *
+		 * In the successful AIO DIO case, the io_end structure will be
+		 * destroyed and the reference to the inode will be dropped
+		 * after the end_io call back function is called.
+		 *
+		 * In the case there is 0 byte write, or error case, since
+		 * VFS direct IO won't invoke the end_io call back function,
+		 * we need to free the end_io structure here.
+		 */
+		if (ret != -EIOCBQUEUED && ret <= 0 && iocb->private) {
+			ext4_free_io_end(iocb->private);
+			iocb->private = NULL;
+		} else if (ret > 0 && ext4_test_inode_state(inode,
+						EXT4_STATE_DIO_UNWRITTEN)) {
+			int err;
+			/*
+			 * for non AIO case, since the IO is already
+			 * completed, we could do the conversion right here.
+			 * A conversion failure replaces the byte count in
+			 * 'ret' with the error code.
+			 */
+			err = ext4_convert_unwritten_extents(inode,
+							     offset, ret);
+			if (err < 0)
+				ret = err;
+			ext4_clear_inode_state(inode, EXT4_STATE_DIO_UNWRITTEN);
+		}
+		return ret;
+	}
+
+	/* for the write to the end of file case, we fall back to old way */
+	return ext4_ind_direct_IO(rw, iocb, iov, offset, nr_segs);
+}
+/*
+ * ext4_direct_IO() - ->direct_IO method: dispatch to the extent-based or the
+ * indirect-block implementation depending on EXT4_INODE_EXTENTS, wrapped in
+ * the enter/exit trace points.
+ *
+ * Returns 0 without performing any I/O when the inode journals its data.
+ * NOTE(review): presumably a 0 return makes the caller fall back to
+ * buffered I/O - confirm against the generic direct-IO callers.
+ */
+static ssize_t ext4_direct_IO(int rw, struct kiocb *iocb,
+			      const struct iovec *iov, loff_t offset,
+			      unsigned long nr_segs)
+{
+	struct file *file = iocb->ki_filp;
+	struct inode *inode = file->f_mapping->host;
+	ssize_t ret;
+
+	/*
+	 * If we are doing data journalling we don't support O_DIRECT
+	 */
+	if (ext4_should_journal_data(inode))
+		return 0;
+
+	trace_ext4_direct_IO_enter(inode, offset, iov_length(iov, nr_segs), rw);
+	if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
+		ret = ext4_ext_direct_IO(rw, iocb, iov, offset, nr_segs);
+	else
+		ret = ext4_ind_direct_IO(rw, iocb, iov, offset, nr_segs);
+	trace_ext4_direct_IO_exit(inode, offset,
+				iov_length(iov, nr_segs), rw, ret);
+	return ret;
+}
+
+/*
+ * Pages can be marked dirty completely asynchronously from ext4's journalling
+ * activity.  By filemap_sync_pte(), try_to_unmap_one(), etc.  We cannot do
+ * much here because ->set_page_dirty is called under VFS locks.  The page is
+ * not necessarily locked.
+ *
+ * We cannot just dirty the page and leave attached buffers clean, because the
+ * buffers' dirty state is "definitive".  We cannot just set the buffers dirty
+ * or jbddirty because all the journalling code will explode.
+ *
+ * So what we do is to mark the page "pending dirty" and next time writepage
+ * is called, propagate that into the buffers appropriately.
+ */ +static int ext4_journalled_set_page_dirty(struct page *page) +{ + SetPageChecked(page); + return __set_page_dirty_nobuffers(page); +} + +static const struct address_space_operations ext4_ordered_aops = { + .readpage = ext4_readpage, + .readpages = ext4_readpages, + .writepage = ext4_writepage, + .write_begin = ext4_write_begin, + .write_end = ext4_ordered_write_end, + .bmap = ext4_bmap, + .invalidatepage = ext4_invalidatepage, + .releasepage = ext4_releasepage, + .direct_IO = ext4_direct_IO, + .migratepage = buffer_migrate_page, + .is_partially_uptodate = block_is_partially_uptodate, + .error_remove_page = generic_error_remove_page, +}; + +static const struct address_space_operations ext4_writeback_aops = { + .readpage = ext4_readpage, + .readpages = ext4_readpages, + .writepage = ext4_writepage, + .write_begin = ext4_write_begin, + .write_end = ext4_writeback_write_end, + .bmap = ext4_bmap, + .invalidatepage = ext4_invalidatepage, + .releasepage = ext4_releasepage, + .direct_IO = ext4_direct_IO, + .migratepage = buffer_migrate_page, + .is_partially_uptodate = block_is_partially_uptodate, + .error_remove_page = generic_error_remove_page, +}; + +static const struct address_space_operations ext4_journalled_aops = { + .readpage = ext4_readpage, + .readpages = ext4_readpages, + .writepage = ext4_writepage, + .write_begin = ext4_write_begin, + .write_end = ext4_journalled_write_end, + .set_page_dirty = ext4_journalled_set_page_dirty, + .bmap = ext4_bmap, + .invalidatepage = ext4_invalidatepage, + .releasepage = ext4_releasepage, + .direct_IO = ext4_direct_IO, + .is_partially_uptodate = block_is_partially_uptodate, + .error_remove_page = generic_error_remove_page, +}; + +static const struct address_space_operations ext4_da_aops = { + .readpage = ext4_readpage, + .readpages = ext4_readpages, + .writepage = ext4_writepage, + .writepages = ext4_da_writepages, + .write_begin = ext4_da_write_begin, + .write_end = ext4_da_write_end, + .bmap = ext4_bmap, + .invalidatepage 
= ext4_da_invalidatepage, + .releasepage = ext4_releasepage, + .direct_IO = ext4_direct_IO, + .migratepage = buffer_migrate_page, + .is_partially_uptodate = block_is_partially_uptodate, + .error_remove_page = generic_error_remove_page, +}; + +void ext4_set_aops(struct inode *inode) +{ + switch (ext4_inode_journal_mode(inode)) { + case EXT4_INODE_ORDERED_DATA_MODE: + if (test_opt(inode->i_sb, DELALLOC)) + inode->i_mapping->a_ops = &ext4_da_aops; + else + inode->i_mapping->a_ops = &ext4_ordered_aops; + break; + case EXT4_INODE_WRITEBACK_DATA_MODE: + if (test_opt(inode->i_sb, DELALLOC)) + inode->i_mapping->a_ops = &ext4_da_aops; + else + inode->i_mapping->a_ops = &ext4_writeback_aops; + break; + case EXT4_INODE_JOURNAL_DATA_MODE: + inode->i_mapping->a_ops = &ext4_journalled_aops; + break; + default: + BUG(); + } +} + + +/* + * ext4_discard_partial_page_buffers() + * Wrapper function for ext4_discard_partial_page_buffers_no_lock. + * This function finds and locks the page containing the offset + * "from" and passes it to ext4_discard_partial_page_buffers_no_lock. + * Calling functions that already have the page locked should call + * ext4_discard_partial_page_buffers_no_lock directly. + */ +int ext4_discard_partial_page_buffers(handle_t *handle, + struct address_space *mapping, loff_t from, + loff_t length, int flags) +{ + struct inode *inode = mapping->host; + struct page *page; + int err = 0; + + page = find_or_create_page(mapping, from >> PAGE_CACHE_SHIFT, + mapping_gfp_mask(mapping) & ~__GFP_FS); + if (!page) + return -ENOMEM; + + err = ext4_discard_partial_page_buffers_no_lock(handle, inode, page, + from, length, flags); + + unlock_page(page); + page_cache_release(page); + return err; +} + +/* + * ext4_discard_partial_page_buffers_no_lock() + * Zeros a page range of length 'length' starting from offset 'from'. + * Buffer heads that correspond to the block aligned regions of the + * zeroed range will be unmapped. 
Regions that are not block aligned
+ * will have the corresponding buffer head mapped if needed so that
+ * that region of the page can be updated with the partial zero out.
+ *
+ * This function assumes that the page has already been locked.  The
+ * range to be discarded must be contained within the given page.
+ * If the specified range exceeds the end of the page it will be shortened
+ * to the end of the page that corresponds to 'from'.  This function is
+ * appropriate for updating a page and its buffer heads to be unmapped and
+ * zeroed for blocks that have been either released, or are going to be
+ * released.
+ *
+ * handle: The journal handle
+ * inode:  The file's inode
+ * page:   A locked page that contains the offset "from"
+ * from:   The starting byte offset (from the beginning of the file)
+ *         to begin discarding
+ * length: The length of bytes to discard
+ * flags:  Optional flags that may be used:
+ *
+ *         EXT4_DISCARD_PARTIAL_PG_ZERO_UNMAPPED
+ *         Only zero the regions of the page whose buffer heads
+ *         have already been unmapped.  This flag is appropriate
+ *         for updating the contents of a page whose blocks may
+ *         have already been released, and we only want to zero
+ *         out the regions that correspond to those released blocks.
+ *
+ * Returns zero on success or negative on failure.
+ */
+static int ext4_discard_partial_page_buffers_no_lock(handle_t *handle,
+		struct inode *inode, struct page *page, loff_t from,
+		loff_t length, int flags)
+{
+	ext4_fsblk_t index = from >> PAGE_CACHE_SHIFT;	/* page index of 'from' */
+	unsigned int offset = from & (PAGE_CACHE_SIZE-1);	/* offset within page */
+	unsigned int blocksize, max, pos;
+	ext4_lblk_t iblock;
+	struct buffer_head *bh;
+	int err = 0;
+
+	blocksize = inode->i_sb->s_blocksize;
+	max = PAGE_CACHE_SIZE - offset;
+
+	/* 'from' must lie inside the page that was passed in */
+	if (index != page->index)
+		return -EINVAL;
+
+	/*
+	 * correct length if it does not fall between
+	 * 'from' and the end of the page
+	 */
+	if (length > max || length < 0)
+		length = max;
+
+	/* logical block number of the first block in this page */
+	iblock = index << (PAGE_CACHE_SHIFT - inode->i_sb->s_blocksize_bits);
+
+	if (!page_has_buffers(page))
+		create_empty_buffers(page, blocksize, 0);
+
+	/* Find the buffer that contains "offset" */
+	bh = page_buffers(page);
+	pos = blocksize;
+	while (offset >= pos) {
+		bh = bh->b_this_page;
+		iblock++;
+		pos += blocksize;
+	}
+
+	/*
+	 * Walk the range one buffer head at a time.
+	 * NOTE(review): err is reset at the top of every iteration, so the
+	 * value finally returned reflects only the last buffer processed;
+	 * an error on an earlier buffer is overwritten.  Confirm that this
+	 * is intended.
+	 */
+	pos = offset;
+	while (pos < offset + length) {
+		unsigned int end_of_block, range_to_discard;
+
+		err = 0;
+
+		/* The length of space left to zero and unmap */
+		range_to_discard = offset + length - pos;
+
+		/* The length of space until the end of the block */
+		end_of_block = blocksize - (pos & (blocksize-1));
+
+		/*
+		 * Do not unmap or zero past end of block
+		 * for this buffer head
+		 */
+		if (range_to_discard > end_of_block)
+			range_to_discard = end_of_block;
+
+
+		/*
+		 * Skip this buffer head if we are only zeroing unmapped
+		 * regions of the page
+		 */
+		if (flags & EXT4_DISCARD_PARTIAL_PG_ZERO_UNMAPPED &&
+			buffer_mapped(bh))
+				goto next;
+
+		/* If the range is block aligned, unmap */
+		if (range_to_discard == blocksize) {
+			clear_buffer_dirty(bh);
+			bh->b_bdev = NULL;
+			clear_buffer_mapped(bh);
+			clear_buffer_req(bh);
+			clear_buffer_new(bh);
+			clear_buffer_delay(bh);
+			clear_buffer_unwritten(bh);
+			clear_buffer_uptodate(bh);
+			zero_user(page, pos, range_to_discard);
+			BUFFER_TRACE(bh, "Buffer discarded");
+			goto next;
+		}
+
+		/*
+		 * If this block is not completely contained in the range
+		 * to be discarded, then it is not going to be released. Because
+		 * we need to keep this block, we need to make sure this part
+		 * of the page is uptodate before we modify it by writing
+		 * partial zeros on it.
+		 */
+		if (!buffer_mapped(bh)) {
+			/*
+			 * Buffer head must be mapped before we can read
+			 * from the block.  The return value of
+			 * ext4_get_block() is not examined; only the
+			 * resulting mapped state of the buffer is.
+			 */
+			BUFFER_TRACE(bh, "unmapped");
+			ext4_get_block(inode, iblock, bh, 0);
+			/* unmapped? It's a hole - nothing to do */
+			if (!buffer_mapped(bh)) {
+				BUFFER_TRACE(bh, "still unmapped");
+				goto next;
+			}
+		}
+
+		/* Ok, it's mapped. Make sure it's up-to-date */
+		if (PageUptodate(page))
+			set_buffer_uptodate(bh);
+
+		if (!buffer_uptodate(bh)) {
+			err = -EIO;
+			ll_rw_block(READ, 1, &bh);
+			wait_on_buffer(bh);
+			/* Uhhuh. Read error. Complain and punt. */
+			if (!buffer_uptodate(bh))
+				goto next;
+		}
+
+		if (ext4_should_journal_data(inode)) {
+			BUFFER_TRACE(bh, "get write access");
+			err = ext4_journal_get_write_access(handle, bh);
+			if (err)
+				goto next;
+		}
+
+		zero_user(page, pos, range_to_discard);
+
+		err = 0;
+		if (ext4_should_journal_data(inode)) {
+			err = ext4_handle_dirty_metadata(handle, inode, bh);
+		} else
+			mark_buffer_dirty(bh);
+
+		BUFFER_TRACE(bh, "Partial buffer zeroed");
+next:
+		bh = bh->b_this_page;
+		iblock++;
+		pos += range_to_discard;
+	}
+
+	return err;
+}
+/*
+ * ext4_can_truncate() - returns 1 for regular files, directories and
+ * symlinks that are not fast symlinks; returns 0 for everything else.
+ */
+int ext4_can_truncate(struct inode *inode)
+{
+	if (S_ISREG(inode->i_mode))
+		return 1;
+	if (S_ISDIR(inode->i_mode))
+		return 1;
+	if (S_ISLNK(inode->i_mode))
+		return !ext4_inode_is_fast_symlink(inode);
+	return 0;
+}
+
+/*
+ * ext4_punch_hole: punches a hole in a file by releasing the blocks
+ * associated with the given offset and length
+ *
+ * @file:   File whose inode gets the hole
+ * @offset: The offset where the hole will begin
+ * @length: The length of the hole
+ *
+ * Only regular, extent-mapped files on file systems with a cluster ratio
+ * of 1 are handled; every other case returns -EOPNOTSUPP.
+ *
+ * Returns: 0 on success or negative on failure
+ */
+
+int ext4_punch_hole(struct file *file, loff_t offset, loff_t length)
+{
+	struct inode *inode = file->f_path.dentry->d_inode;
+	if (!S_ISREG(inode->i_mode))
+		return -EOPNOTSUPP;
+
+	if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) {
+		/* TODO: Add support for non extent hole punching */
+		return -EOPNOTSUPP;
+	}
+
+	if (EXT4_SB(inode->i_sb)->s_cluster_ratio > 1) {
+		/* TODO: Add support for bigalloc file systems */
+		return -EOPNOTSUPP;
+	}
+
+	return ext4_ext_punch_hole(file, offset, length);
+}
+
+/*
+ * ext4_truncate()
+ *
+ * We block out ext4_get_block() block instantiations across the entire
+ * transaction, and VFS/VM ensures that ext4_truncate() cannot run
+ * simultaneously on behalf of the same inode.
+ *
+ * As we work through the truncate and commit bits of it to the journal there
+ * is one core, guiding principle: the file's tree must always be consistent on
+ * disk.  We must be able to restart the truncate after a crash.
+ *
+ * The file's tree may be transiently inconsistent in memory (although it
+ * probably isn't), but whenever we close off and commit a journal transaction,
+ * the contents of (the filesystem + the journal) must be consistent and
+ * restartable.  It's pretty simple, really: bottom up, right to left (although
+ * left-to-right works OK too).
+ *
+ * Note that at recovery time, journal replay occurs *before* the restart of
+ * truncate against the orphan inode list.
+ *
+ * The committed inode has the new, desired i_size (which is the same as
+ * i_disksize in this case).  After a crash, ext4_orphan_cleanup() will see
+ * that this inode's truncate did not complete and it will again call
+ * ext4_truncate() to have another go.  So there will be instantiated blocks
+ * to the right of the truncation point in a crashed ext4 filesystem.  But
+ * that's fine - as long as they are linked from the inode, the post-crash
+ * ext4_truncate() run will find them and release them.
+ */
+void ext4_truncate(struct inode *inode)
+{
+	trace_ext4_truncate_enter(inode);
+
+	if (!ext4_can_truncate(inode))
+		return;	/* trace_ext4_truncate_exit() is not emitted on this path */
+
+	ext4_clear_inode_flag(inode, EXT4_INODE_EOFBLOCKS);
+
+	if (inode->i_size == 0 && !test_opt(inode->i_sb, NO_AUTO_DA_ALLOC))
+		ext4_set_inode_state(inode, EXT4_STATE_DA_ALLOC_CLOSE);
+
+	/* extent-mapped and indirect-mapped files have separate truncate code */
+	if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
+		ext4_ext_truncate(inode);
+	else
+		ext4_ind_truncate(inode);
+
+	trace_ext4_truncate_exit(inode);
+}
+
+/*
+ * ext4_get_inode_loc returns with an extra refcount against the inode's
+ * underlying buffer_head on success. If 'in_mem' is true, we have all
+ * data in memory that is needed to recreate the on-disk version of this
+ * inode.
+ *
+ * On success 0 is returned and iloc->bh, iloc->block_group and
+ * iloc->offset describe where the raw inode lives.  On failure -EIO is
+ * returned and iloc->bh is left NULL.
+ */
+static int __ext4_get_inode_loc(struct inode *inode,
+				struct ext4_iloc *iloc, int in_mem)
+{
+	struct ext4_group_desc	*gdp;
+	struct buffer_head	*bh;
+	struct super_block	*sb = inode->i_sb;
+	ext4_fsblk_t		block;
+	int			inodes_per_block, inode_offset;
+
+	iloc->bh = NULL;
+	if (!ext4_valid_inum(sb, inode->i_ino))
+		return -EIO;
+
+	iloc->block_group = (inode->i_ino - 1) / EXT4_INODES_PER_GROUP(sb);
+	gdp = ext4_get_group_desc(sb, iloc->block_group, NULL);
+	if (!gdp)
+		return -EIO;
+
+	/*
+	 * Figure out the offset within the block group inode table:
+	 * inode_offset is the inode's index inside its group, 'block' the
+	 * inode-table block holding it and iloc->offset the byte offset of
+	 * the raw inode inside that block.
+	 */
+	inodes_per_block = EXT4_SB(sb)->s_inodes_per_block;
+	inode_offset = ((inode->i_ino - 1) %
+			EXT4_INODES_PER_GROUP(sb));
+	block = ext4_inode_table(sb, gdp) + (inode_offset / inodes_per_block);
+	iloc->offset = (inode_offset % inodes_per_block) * EXT4_INODE_SIZE(sb);
+
+	bh = sb_getblk(sb, block);
+	if (!bh) {
+		EXT4_ERROR_INODE_BLOCK(inode, block,
+				       "unable to read itable block");
+		return -EIO;
+	}
+	if (!buffer_uptodate(bh)) {
+		lock_buffer(bh);
+
+		/*
+		 * If the buffer has the write error flag, we have failed
+		 * to write out another inode in the same block.  In this
+		 * case, we don't have to read the block because we may
+		 * read the old inode data successfully.
+		 */
+		if (buffer_write_io_error(bh) && !buffer_uptodate(bh))
+			set_buffer_uptodate(bh);
+
+		if (buffer_uptodate(bh)) {
+			/* someone brought it uptodate while we waited */
+			unlock_buffer(bh);
+			goto has_buffer;
+		}
+
+		/*
+		 * If we have all information of the inode in memory and this
+		 * is the only valid inode in the block, we need not read the
+		 * block.
+		 */
+		if (in_mem) {
+			struct buffer_head *bitmap_bh;
+			int i, start;
+
+			/* index of the first inode sharing this itable block */
+			start = inode_offset & ~(inodes_per_block - 1);
+
+			/* Is the inode bitmap in cache? */
+			bitmap_bh = sb_getblk(sb, ext4_inode_bitmap(sb, gdp));
+			if (!bitmap_bh)
+				goto make_io;
+
+			/*
+			 * If the inode bitmap isn't in cache then the
+			 * optimisation may end up performing two reads instead
+			 * of one, so skip it.
+			 */
+			if (!buffer_uptodate(bitmap_bh)) {
+				brelse(bitmap_bh);
+				goto make_io;
+			}
+			/* look for any other in-use inode in the same block */
+			for (i = start; i < start + inodes_per_block; i++) {
+				if (i == inode_offset)
+					continue;
+				if (ext4_test_bit(i, bitmap_bh->b_data))
+					break;
+			}
+			brelse(bitmap_bh);
+			if (i == start + inodes_per_block) {
+				/* all other inodes are free, so skip I/O */
+				memset(bh->b_data, 0, bh->b_size);
+				set_buffer_uptodate(bh);
+				unlock_buffer(bh);
+				goto has_buffer;
+			}
+		}
+
+make_io:
+		/*
+		 * If we need to do any I/O, try to pre-readahead extra
+		 * blocks from the inode table.
+		 * NOTE(review): the loop bound below is inclusive, so block
+		 * 'end' itself is also submitted for readahead - confirm
+		 * that this is intended.
+		 */
+		if (EXT4_SB(sb)->s_inode_readahead_blks) {
+			ext4_fsblk_t b, end, table;
+			unsigned num;
+
+			table = ext4_inode_table(sb, gdp);
+			/* s_inode_readahead_blks is always a power of 2 */
+			b = block & ~(EXT4_SB(sb)->s_inode_readahead_blks-1);
+			if (table > b)
+				b = table;
+			end = b + EXT4_SB(sb)->s_inode_readahead_blks;
+			num = EXT4_INODES_PER_GROUP(sb);
+			if (EXT4_HAS_RO_COMPAT_FEATURE(sb,
+				       EXT4_FEATURE_RO_COMPAT_GDT_CSUM))
+				num -= ext4_itable_unused_count(sb, gdp);
+			table += num / inodes_per_block;
+			if (end > table)
+				end = table;
+			while (b <= end)
+				sb_breadahead(sb, b++);
+		}
+
+		/*
+		 * There are other valid inodes in the buffer, this inode
+		 * has in-inode xattrs, or we don't have this inode in memory.
+		 * Read the block from disk.
+		 */
+		trace_ext4_load_inode(inode);
+		get_bh(bh);
+		bh->b_end_io = end_buffer_read_sync;
+		submit_bh(READ | REQ_META | REQ_PRIO, bh);
+		wait_on_buffer(bh);
+		if (!buffer_uptodate(bh)) {
+			EXT4_ERROR_INODE_BLOCK(inode, block,
+					       "unable to read itable block");
+			brelse(bh);
+			return -EIO;
+		}
+	}
+has_buffer:
+	iloc->bh = bh;
+	return 0;
+}
+/*
+ * ext4_get_inode_loc() - wrapper around __ext4_get_inode_loc() that sets
+ * 'in_mem' unless the inode has EXT4_STATE_XATTR set.
+ */
+int ext4_get_inode_loc(struct inode *inode, struct ext4_iloc *iloc)
+{
+	/* We have all inode data except xattrs in memory here. */
+	return __ext4_get_inode_loc(inode, iloc,
+		!ext4_test_inode_state(inode, EXT4_STATE_XATTR));
+}
+/*
+ * ext4_set_inode_flags() - mirror the sync/append/immutable/noatime/dirsync
+ * bits of EXT4_I(inode)->i_flags into the matching S_* bits of
+ * inode->i_flags.  The five S_* bits are cleared first, so a bit that is
+ * not set on the ext4 side ends up cleared on the VFS side.
+ */
+void ext4_set_inode_flags(struct inode *inode)
+{
+	unsigned int flags = EXT4_I(inode)->i_flags;
+
+	inode->i_flags &= ~(S_SYNC|S_APPEND|S_IMMUTABLE|S_NOATIME|S_DIRSYNC);
+	if (flags & EXT4_SYNC_FL)
+		inode->i_flags |= S_SYNC;
+	if (flags & EXT4_APPEND_FL)
+		inode->i_flags |= S_APPEND;
+	if (flags & EXT4_IMMUTABLE_FL)
+		inode->i_flags |= S_IMMUTABLE;
+	if (flags & EXT4_NOATIME_FL)
+		inode->i_flags |= S_NOATIME;
+	if (flags & EXT4_DIRSYNC_FL)
+		inode->i_flags |= S_DIRSYNC;
+}
+
+/*
+ * Propagate flags from i_flags to EXT4_I(inode)->i_flags.
+ *
+ * The new value is computed from a snapshot of both flag words and stored
+ * with cmpxchg(); the loop repeats whenever ei->i_flags changed between the
+ * snapshot and the store.
+ */
+void ext4_get_inode_flags(struct ext4_inode_info *ei)
+{
+	unsigned int vfs_fl;
+	unsigned long old_fl, new_fl;
+
+	do {
+		vfs_fl = ei->vfs_inode.i_flags;
+		old_fl = ei->i_flags;
+		new_fl = old_fl & ~(EXT4_SYNC_FL|EXT4_APPEND_FL|
+				EXT4_IMMUTABLE_FL|EXT4_NOATIME_FL|
+				EXT4_DIRSYNC_FL);
+		if (vfs_fl & S_SYNC)
+			new_fl |= EXT4_SYNC_FL;
+		if (vfs_fl & S_APPEND)
+			new_fl |= EXT4_APPEND_FL;
+		if (vfs_fl & S_IMMUTABLE)
+			new_fl |= EXT4_IMMUTABLE_FL;
+		if (vfs_fl & S_NOATIME)
+			new_fl |= EXT4_NOATIME_FL;
+		if (vfs_fl & S_DIRSYNC)
+			new_fl |= EXT4_DIRSYNC_FL;
+	} while (cmpxchg(&ei->i_flags, old_fl, new_fl) != old_fl);
+}
+/*
+ * ext4_inode_blocks() - decode the block count stored in a raw inode.
+ *
+ * Without the HUGE_FILE feature only i_blocks_lo is used.  With it, the
+ * high 16 bits are combined with the low 32 bits, and if the inode carries
+ * EXT4_INODE_HUGE_FILE the result is additionally shifted left by
+ * (i_blkbits - 9), i.e. converted from file system blocks to 512-byte units.
+ */
+static blkcnt_t ext4_inode_blocks(struct ext4_inode *raw_inode,
+				  struct ext4_inode_info *ei)
+{
+	blkcnt_t i_blocks ;
+	struct inode *inode = &(ei->vfs_inode);
+	struct super_block *sb = inode->i_sb;
+
+	if (EXT4_HAS_RO_COMPAT_FEATURE(sb,
+				EXT4_FEATURE_RO_COMPAT_HUGE_FILE)) {
+		/* we are using combined 48 bit field */
+		i_blocks = ((u64)le16_to_cpu(raw_inode->i_blocks_high)) << 32 |
+					le32_to_cpu(raw_inode->i_blocks_lo);
+		if (ext4_test_inode_flag(inode, EXT4_INODE_HUGE_FILE)) {
+			/* i_blocks represent file system block size */
+			return i_blocks  << (inode->i_blkbits - 9);
+		} else {
+			return i_blocks;
+		}
+	} else {
+		return le32_to_cpu(raw_inode->i_blocks_lo);
+	}
+}
+/*
+ * ext4_iget() - look up inode @ino on @sb through iget_locked().
+ *
+ * An inode that is already set up (not I_NEW) is returned as is.  Otherwise
+ * the raw on-disk inode is located and used to fill in the VFS inode and
+ * the ext4_inode_info, the block/extent references are validated and the
+ * operation tables are chosen from the file type.
+ *
+ * Returns the inode, or an ERR_PTR(): -ENOMEM if iget_locked() fails,
+ * -ESTALE for a deleted inode, -EIO for the consistency checks below, or
+ * the error reported by the helpers that load and validate the inode.
+ */
+struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
+{
+	struct ext4_iloc iloc;
+	struct ext4_inode *raw_inode;
+	struct ext4_inode_info *ei;
+	struct inode *inode;
+	journal_t *journal = EXT4_SB(sb)->s_journal;
+	long ret;
+	int block;
+
+	inode = iget_locked(sb, ino);
+	if (!inode)
+		return ERR_PTR(-ENOMEM);
+	if (!(inode->i_state & I_NEW))
+		return inode;
+
+	ei = EXT4_I(inode);
+	iloc.bh = NULL;
+
+	ret = __ext4_get_inode_loc(inode, &iloc, 0);
+	if (ret < 0)
+		goto bad_inode;
+	raw_inode = ext4_raw_inode(&iloc);
+	inode->i_mode = le16_to_cpu(raw_inode->i_mode);
+	inode->i_uid = (uid_t)le16_to_cpu(raw_inode->i_uid_low);
+	inode->i_gid = (gid_t)le16_to_cpu(raw_inode->i_gid_low);
+	if (!(test_opt(inode->i_sb, NO_UID32))) {
+		inode->i_uid |= le16_to_cpu(raw_inode->i_uid_high) << 16;
+		inode->i_gid |= le16_to_cpu(raw_inode->i_gid_high) << 16;
+	}
+	set_nlink(inode, le16_to_cpu(raw_inode->i_links_count));
+
+	ext4_clear_state_flags(ei);	/* Only relevant on 32-bit archs */
+	ei->i_dir_start_lookup = 0;
+	ei->i_dtime = le32_to_cpu(raw_inode->i_dtime);
+	/* We now have enough fields to check if the inode was active or not.
+	 * This is needed because nfsd might try to access dead inodes
+	 * the test is that same one that e2fsck uses
+	 * NeilBrown 1999oct15
+	 */
+	if (inode->i_nlink == 0) {
+		if (inode->i_mode == 0 ||
+		    !(EXT4_SB(inode->i_sb)->s_mount_state & EXT4_ORPHAN_FS)) {
+			/* this inode is deleted */
+			ret = -ESTALE;
+			goto bad_inode;
+		}
+		/* The only unlinked inodes we let through here have
+		 * valid i_mode and are being read by the orphan
+		 * recovery code: that's fine, we're about to complete
+		 * the process of deleting those. */
+	}
+	ei->i_flags = le32_to_cpu(raw_inode->i_flags);
+	inode->i_blocks = ext4_inode_blocks(raw_inode, ei);
+	ei->i_file_acl = le32_to_cpu(raw_inode->i_file_acl_lo);
+	if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_64BIT))
+		ei->i_file_acl |=
+			((__u64)le16_to_cpu(raw_inode->i_file_acl_high)) << 32;
+	inode->i_size = ext4_isize(raw_inode);
+	ei->i_disksize = inode->i_size;
+#ifdef CONFIG_QUOTA
+	ei->i_reserved_quota = 0;
+#endif
+	inode->i_generation = le32_to_cpu(raw_inode->i_generation);
+	ei->i_block_group = iloc.block_group;
+	ei->i_last_alloc_group = ~0;
+	/*
+	 * NOTE! The in-memory inode i_data array is in little-endian order
+	 * even on big-endian machines: we do NOT byteswap the block numbers!
+	 */
+	for (block = 0; block < EXT4_N_BLOCKS; block++)
+		ei->i_data[block] = raw_inode->i_block[block];
+	INIT_LIST_HEAD(&ei->i_orphan);
+
+	/*
+	 * Set transaction id's of transactions that have to be committed
+	 * to finish f[data]sync. We set them to currently running transaction
+	 * as we cannot be sure that the inode or some of its metadata isn't
+	 * part of the transaction - the inode could have been reclaimed and
+	 * now it is reread from disk.
+	 *
+	 * If there is no running transaction the committing one is used, and
+	 * if there is neither, the journal's j_commit_sequence.
+	 */
+	if (journal) {
+		transaction_t *transaction;
+		tid_t tid;
+
+		read_lock(&journal->j_state_lock);
+		if (journal->j_running_transaction)
+			transaction = journal->j_running_transaction;
+		else
+			transaction = journal->j_committing_transaction;
+		if (transaction)
+			tid = transaction->t_tid;
+		else
+			tid = journal->j_commit_sequence;
+		read_unlock(&journal->j_state_lock);
+		ei->i_sync_tid = tid;
+		ei->i_datasync_tid = tid;
+	}
+
+	if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE) {
+		ei->i_extra_isize = le16_to_cpu(raw_inode->i_extra_isize);
+		/* the extra fields must fit inside the on-disk inode */
+		if (EXT4_GOOD_OLD_INODE_SIZE + ei->i_extra_isize >
+		    EXT4_INODE_SIZE(inode->i_sb)) {
+			ret = -EIO;
+			goto bad_inode;
+		}
+		if (ei->i_extra_isize == 0) {
+			/* The extra space is currently unused. Use it. */
+			ei->i_extra_isize = sizeof(struct ext4_inode) -
+					    EXT4_GOOD_OLD_INODE_SIZE;
+		} else {
+			/* an xattr magic right after the extra fields marks in-inode xattrs */
+			__le32 *magic = (void *)raw_inode +
+					EXT4_GOOD_OLD_INODE_SIZE +
+					ei->i_extra_isize;
+			if (*magic == cpu_to_le32(EXT4_XATTR_MAGIC))
+				ext4_set_inode_state(inode, EXT4_STATE_XATTR);
+		}
+	} else
+		ei->i_extra_isize = 0;
+
+	EXT4_INODE_GET_XTIME(i_ctime, inode, raw_inode);
+	EXT4_INODE_GET_XTIME(i_mtime, inode, raw_inode);
+	EXT4_INODE_GET_XTIME(i_atime, inode, raw_inode);
+	EXT4_EINODE_GET_XTIME(i_crtime, ei, raw_inode);
+
+	inode->i_version = le32_to_cpu(raw_inode->i_disk_version);
+	if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE) {
+		if (EXT4_FITS_IN_INODE(raw_inode, ei, i_version_hi))
+			inode->i_version |=
+			(__u64)(le32_to_cpu(raw_inode->i_version_hi)) << 32;
+	}
+
+	ret = 0;
+	if (ei->i_file_acl &&
+	    !ext4_data_block_valid(EXT4_SB(sb), ei->i_file_acl, 1)) {
+		EXT4_ERROR_INODE(inode, "bad extended attribute block %llu",
+				 ei->i_file_acl);
+		ret = -EIO;
+		goto bad_inode;
+	} else if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) {
+		if (S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) ||
+		    (S_ISLNK(inode->i_mode) &&
+		     !ext4_inode_is_fast_symlink(inode)))
+			/* Validate extent which is part of inode */
+			ret = ext4_ext_check_inode(inode);
+	} else if (S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) ||
+		   (S_ISLNK(inode->i_mode) &&
+		    !ext4_inode_is_fast_symlink(inode))) {
+		/* Validate block references which are part of inode */
+		ret = ext4_ind_check_inode(inode);
+	}
+	if (ret)
+		goto bad_inode;
+
+	/* pick the operation tables that match the file type */
+	if (S_ISREG(inode->i_mode)) {
+		inode->i_op = &ext4_file_inode_operations;
+		inode->i_fop = &ext4_file_operations;
+		ext4_set_aops(inode);
+	} else if (S_ISDIR(inode->i_mode)) {
+		inode->i_op = &ext4_dir_inode_operations;
+		inode->i_fop = &ext4_dir_operations;
+	} else if (S_ISLNK(inode->i_mode)) {
+		if (ext4_inode_is_fast_symlink(inode)) {
+			inode->i_op = &ext4_fast_symlink_inode_operations;
+			nd_terminate_link(ei->i_data, inode->i_size,
+				sizeof(ei->i_data) - 1);
+		} else {
+			inode->i_op = &ext4_symlink_inode_operations;
+			ext4_set_aops(inode);
+		}
+	} else if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode) ||
+	      S_ISFIFO(inode->i_mode) || S_ISSOCK(inode->i_mode)) {
+		inode->i_op = &ext4_special_inode_operations;
+		/* device number: old encoding in i_block[0], else new encoding in i_block[1] */
+		if (raw_inode->i_block[0])
+			init_special_inode(inode, inode->i_mode,
+			   old_decode_dev(le32_to_cpu(raw_inode->i_block[0])));
+		else
+			init_special_inode(inode, inode->i_mode,
+			   new_decode_dev(le32_to_cpu(raw_inode->i_block[1])));
+	} else {
+		ret = -EIO;
+		EXT4_ERROR_INODE(inode, "bogus i_mode (%o)", inode->i_mode);
+		goto bad_inode;
+	}
+	brelse(iloc.bh);
+	ext4_set_inode_flags(inode);
+	unlock_new_inode(inode);
+	return inode;
+
+bad_inode:
+	brelse(iloc.bh);
+	iget_failed(inode);
+	return ERR_PTR(ret);
+}
+/*
+ * ext4_inode_blocks_set() - encode inode->i_blocks into the raw inode.
+ *
+ * Counts that fit in 32 bits are stored in i_blocks_lo alone.  Larger
+ * counts need the HUGE_FILE feature (-EFBIG otherwise): up to 48 bits they
+ * are stored as is, beyond that they are stored shifted right by
+ * (i_blkbits - 9) with EXT4_INODE_HUGE_FILE set on the inode.
+ *
+ * 'handle' is not used by this function.  Returns 0 or -EFBIG.
+ */
+static int ext4_inode_blocks_set(handle_t *handle,
+				struct ext4_inode *raw_inode,
+				struct ext4_inode_info *ei)
+{
+	struct inode *inode = &(ei->vfs_inode);
+	u64 i_blocks = inode->i_blocks;
+	struct super_block *sb = inode->i_sb;
+
+	if (i_blocks <= ~0U) {
+		/*
+		 * i_blocks can be represented in a 32 bit variable
+		 * as multiple of 512 bytes
+		 */
+		raw_inode->i_blocks_lo   = cpu_to_le32(i_blocks);
+		raw_inode->i_blocks_high = 0;
+		ext4_clear_inode_flag(inode, EXT4_INODE_HUGE_FILE);
+		return 0;
+	}
+	if (!EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_HUGE_FILE))
+		return -EFBIG;
+
+	if (i_blocks <= 0xffffffffffffULL) {
+		/*
+		 * i_blocks can be represented in a 48 bit variable
+		 * as multiple of 512 bytes
+		 */
+		raw_inode->i_blocks_lo   = cpu_to_le32(i_blocks);
+		raw_inode->i_blocks_high = cpu_to_le16(i_blocks >> 32);
+		ext4_clear_inode_flag(inode, EXT4_INODE_HUGE_FILE);
+	} else {
+		ext4_set_inode_flag(inode, EXT4_INODE_HUGE_FILE);
+		/* i_block is stored in file system block size */
+		i_blocks = i_blocks >> (inode->i_blkbits - 9);
+		raw_inode->i_blocks_lo   = cpu_to_le32(i_blocks);
+		raw_inode->i_blocks_high = cpu_to_le16(i_blocks >> 32);
+	}
+	return 0;
+}
+
+/*
+ * Post the struct inode info
into an on-disk inode location in the + * buffer-cache. This gobbles the caller's reference to the + * buffer_head in the inode location struct. + * + * The caller must have write access to iloc->bh. + */ +static int ext4_do_update_inode(handle_t *handle, + struct inode *inode, + struct ext4_iloc *iloc) +{ + struct ext4_inode *raw_inode = ext4_raw_inode(iloc); + struct ext4_inode_info *ei = EXT4_I(inode); + struct buffer_head *bh = iloc->bh; + int err = 0, rc, block; + + /* For fields not not tracking in the in-memory inode, + * initialise them to zero for new inodes. */ + if (ext4_test_inode_state(inode, EXT4_STATE_NEW)) + memset(raw_inode, 0, EXT4_SB(inode->i_sb)->s_inode_size); + + ext4_get_inode_flags(ei); + raw_inode->i_mode = cpu_to_le16(inode->i_mode); + if (!(test_opt(inode->i_sb, NO_UID32))) { + raw_inode->i_uid_low = cpu_to_le16(low_16_bits(inode->i_uid)); + raw_inode->i_gid_low = cpu_to_le16(low_16_bits(inode->i_gid)); +/* + * Fix up interoperability with old kernels. Otherwise, old inodes get + * re-used with the upper 16 bits of the uid/gid intact + */ + if (!ei->i_dtime) { + raw_inode->i_uid_high = + cpu_to_le16(high_16_bits(inode->i_uid)); + raw_inode->i_gid_high = + cpu_to_le16(high_16_bits(inode->i_gid)); + } else { + raw_inode->i_uid_high = 0; + raw_inode->i_gid_high = 0; + } + } else { + raw_inode->i_uid_low = + cpu_to_le16(fs_high2lowuid(inode->i_uid)); + raw_inode->i_gid_low = + cpu_to_le16(fs_high2lowgid(inode->i_gid)); + raw_inode->i_uid_high = 0; + raw_inode->i_gid_high = 0; + } + raw_inode->i_links_count = cpu_to_le16(inode->i_nlink); + + EXT4_INODE_SET_XTIME(i_ctime, inode, raw_inode); + EXT4_INODE_SET_XTIME(i_mtime, inode, raw_inode); + EXT4_INODE_SET_XTIME(i_atime, inode, raw_inode); + EXT4_EINODE_SET_XTIME(i_crtime, ei, raw_inode); + + if (ext4_inode_blocks_set(handle, raw_inode, ei)) + goto out_brelse; + raw_inode->i_dtime = cpu_to_le32(ei->i_dtime); + raw_inode->i_flags = cpu_to_le32(ei->i_flags & 0xFFFFFFFF); + if 
(EXT4_SB(inode->i_sb)->s_es->s_creator_os != + cpu_to_le32(EXT4_OS_HURD)) + raw_inode->i_file_acl_high = + cpu_to_le16(ei->i_file_acl >> 32); + raw_inode->i_file_acl_lo = cpu_to_le32(ei->i_file_acl); + ext4_isize_set(raw_inode, ei->i_disksize); + if (ei->i_disksize > 0x7fffffffULL) { + struct super_block *sb = inode->i_sb; + if (!EXT4_HAS_RO_COMPAT_FEATURE(sb, + EXT4_FEATURE_RO_COMPAT_LARGE_FILE) || + EXT4_SB(sb)->s_es->s_rev_level == + cpu_to_le32(EXT4_GOOD_OLD_REV)) { + /* If this is the first large file + * created, add a flag to the superblock. + */ + err = ext4_journal_get_write_access(handle, + EXT4_SB(sb)->s_sbh); + if (err) + goto out_brelse; + ext4_update_dynamic_rev(sb); + EXT4_SET_RO_COMPAT_FEATURE(sb, + EXT4_FEATURE_RO_COMPAT_LARGE_FILE); + ext4_handle_sync(handle); + err = ext4_handle_dirty_super(handle, sb); + } + } + raw_inode->i_generation = cpu_to_le32(inode->i_generation); + if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode)) { + if (old_valid_dev(inode->i_rdev)) { + raw_inode->i_block[0] = + cpu_to_le32(old_encode_dev(inode->i_rdev)); + raw_inode->i_block[1] = 0; + } else { + raw_inode->i_block[0] = 0; + raw_inode->i_block[1] = + cpu_to_le32(new_encode_dev(inode->i_rdev)); + raw_inode->i_block[2] = 0; + } + } else + for (block = 0; block < EXT4_N_BLOCKS; block++) + raw_inode->i_block[block] = ei->i_data[block]; + + raw_inode->i_disk_version = cpu_to_le32(inode->i_version); + if (ei->i_extra_isize) { + if (EXT4_FITS_IN_INODE(raw_inode, ei, i_version_hi)) + raw_inode->i_version_hi = + cpu_to_le32(inode->i_version >> 32); + raw_inode->i_extra_isize = cpu_to_le16(ei->i_extra_isize); + } + + BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata"); + rc = ext4_handle_dirty_metadata(handle, NULL, bh); + if (!err) + err = rc; + ext4_clear_inode_state(inode, EXT4_STATE_NEW); + + ext4_update_inode_fsync_trans(handle, inode, 0); +out_brelse: + brelse(bh); + ext4_std_error(inode->i_sb, err); + return err; +} + +/* + * ext4_write_inode() + * + * We are called 
from a few places: + * + * - Within generic_file_write() for O_SYNC files. + * Here, there will be no transaction running. We wait for any running + * transaction to commit. + * + * - Within sys_sync(), kupdate and such. + * We wait on commit, if told to. + * + * - Within prune_icache() (PF_MEMALLOC == true) + * Here we simply return. We can't afford to block kswapd on the + * journal commit. + * + * In all cases it is actually safe for us to return without doing anything, + * because the inode has been copied into a raw inode buffer in + * ext4_mark_inode_dirty(). This is a correctness thing for O_SYNC and for + * knfsd. + * + * Note that we are absolutely dependent upon all inode dirtiers doing the + * right thing: they *must* call mark_inode_dirty() after dirtying info in + * which we are interested. + * + * It would be a bug for them to not do this. The code: + * + * mark_inode_dirty(inode) + * stuff(); + * inode->i_size = expr; + * + * is in error because a kswapd-driven write_inode() could occur while + * `stuff()' is running, and the new i_size will be lost. Plus the inode + * will no longer be on the superblock's dirty inode list.
+ */ +int ext4_write_inode(struct inode *inode, struct writeback_control *wbc) +{ + int err; + + if (current->flags & PF_MEMALLOC) + return 0; + + if (EXT4_SB(inode->i_sb)->s_journal) { + if (ext4_journal_current_handle()) { + jbd_debug(1, "called recursively, non-PF_MEMALLOC!\n"); + dump_stack(); + return -EIO; + } + + if (wbc->sync_mode != WB_SYNC_ALL) + return 0; + + err = ext4_force_commit(inode->i_sb); + } else { + struct ext4_iloc iloc; + + err = __ext4_get_inode_loc(inode, &iloc, 0); + if (err) + return err; + if (wbc->sync_mode == WB_SYNC_ALL) + sync_dirty_buffer(iloc.bh); + if (buffer_req(iloc.bh) && !buffer_uptodate(iloc.bh)) { + EXT4_ERROR_INODE_BLOCK(inode, iloc.bh->b_blocknr, + "IO error syncing inode"); + err = -EIO; + } + brelse(iloc.bh); + } + return err; +} + +/* + * ext4_setattr() + * + * Called from notify_change. + * + * We want to trap VFS attempts to truncate the file as soon as + * possible. In particular, we want to make sure that when the VFS + * shrinks i_size, we put the inode on the orphan list and modify + * i_disksize immediately, so that during the subsequent flushing of + * dirty pages and freeing of disk blocks, we can guarantee that any + * commit will leave the blocks being flushed in an unused state on + * disk. (On recovery, the inode will get truncated and the blocks will + * be freed, so we have a strong guarantee that no future commit will + * leave these blocks visible to the user.) + * + * Another thing we have to assure is that if we are in ordered mode + * and inode is still attached to the committing transaction, we must + * we start writeout of all the dirty pages which are being truncated. + * This way we are sure that all the data written in the previous + * transaction are already on disk (truncate waits for pages under + * writeback). + * + * Called with inode->i_mutex down. 
+ */ +int ext4_setattr(struct dentry *dentry, struct iattr *attr) +{ + struct inode *inode = dentry->d_inode; + int error, rc = 0; + int orphan = 0; + const unsigned int ia_valid = attr->ia_valid; + + error = inode_change_ok(inode, attr); + if (error) + return error; + + if (is_quota_modification(inode, attr)) + dquot_initialize(inode); + if ((ia_valid & ATTR_UID && attr->ia_uid != inode->i_uid) || + (ia_valid & ATTR_GID && attr->ia_gid != inode->i_gid)) { + handle_t *handle; + + /* (user+group)*(old+new) structure, inode write (sb, + * inode block, ? - but truncate inode update has it) */ + handle = ext4_journal_start(inode, (EXT4_MAXQUOTAS_INIT_BLOCKS(inode->i_sb)+ + EXT4_MAXQUOTAS_DEL_BLOCKS(inode->i_sb))+3); + if (IS_ERR(handle)) { + error = PTR_ERR(handle); + goto err_out; + } + error = dquot_transfer(inode, attr); + if (error) { + ext4_journal_stop(handle); + return error; + } + /* Update corresponding info in inode so that everything is in + * one transaction */ + if (attr->ia_valid & ATTR_UID) + inode->i_uid = attr->ia_uid; + if (attr->ia_valid & ATTR_GID) + inode->i_gid = attr->ia_gid; + error = ext4_mark_inode_dirty(handle, inode); + ext4_journal_stop(handle); + } + + if (attr->ia_valid & ATTR_SIZE) { + inode_dio_wait(inode); + + if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) { + struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); + + if (attr->ia_size > sbi->s_bitmap_maxbytes) + return -EFBIG; + } + } + + if (S_ISREG(inode->i_mode) && + attr->ia_valid & ATTR_SIZE && + (attr->ia_size < inode->i_size)) { + handle_t *handle; + + handle = ext4_journal_start(inode, 3); + if (IS_ERR(handle)) { + error = PTR_ERR(handle); + goto err_out; + } + if (ext4_handle_valid(handle)) { + error = ext4_orphan_add(handle, inode); + orphan = 1; + } + EXT4_I(inode)->i_disksize = attr->ia_size; + rc = ext4_mark_inode_dirty(handle, inode); + if (!error) + error = rc; + ext4_journal_stop(handle); + + if (ext4_should_order_data(inode)) { + error = 
ext4_begin_ordered_truncate(inode, + attr->ia_size); + if (error) { + /* Do as much error cleanup as possible */ + handle = ext4_journal_start(inode, 3); + if (IS_ERR(handle)) { + ext4_orphan_del(NULL, inode); + goto err_out; + } + ext4_orphan_del(handle, inode); + orphan = 0; + ext4_journal_stop(handle); + goto err_out; + } + } + } + + if (attr->ia_valid & ATTR_SIZE) { + if (attr->ia_size != i_size_read(inode)) + truncate_setsize(inode, attr->ia_size); + ext4_truncate(inode); + } + + if (!rc) { + setattr_copy(inode, attr); + mark_inode_dirty(inode); + } + + /* + * If the call to ext4_truncate failed to get a transaction handle at + * all, we need to clean up the in-core orphan list manually. + */ + if (orphan && inode->i_nlink) + ext4_orphan_del(NULL, inode); + + if (!rc && (ia_valid & ATTR_MODE)) + rc = ext4_acl_chmod(inode); + +err_out: + ext4_std_error(inode->i_sb, error); + if (!error) + error = rc; + return error; +} + +int ext4_getattr(struct vfsmount *mnt, struct dentry *dentry, + struct kstat *stat) +{ + struct inode *inode; + unsigned long delalloc_blocks; + + inode = dentry->d_inode; + generic_fillattr(inode, stat); + + /* + * We can't update i_blocks if the block allocation is delayed + * otherwise in the case of system crash before the real block + * allocation is done, we will have i_blocks inconsistent with + * on-disk file blocks. + * We always keep i_blocks updated together with real + * allocation. But to not confuse with user, stat + * will return the blocks that include the delayed allocation + * blocks for this file. 
+ */ + delalloc_blocks = EXT4_I(inode)->i_reserved_data_blocks; + + stat->blocks += (delalloc_blocks << inode->i_sb->s_blocksize_bits)>>9; + return 0; +} + +static int ext4_index_trans_blocks(struct inode *inode, int nrblocks, int chunk) +{ + if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) + return ext4_ind_trans_blocks(inode, nrblocks, chunk); + return ext4_ext_index_trans_blocks(inode, nrblocks, chunk); +} + +/* + * Account for index blocks, block groups bitmaps and block group + * descriptor blocks if modify datablocks and index blocks + * worse case, the indexs blocks spread over different block groups + * + * If datablocks are discontiguous, they are possible to spread over + * different block groups too. If they are contiuguous, with flexbg, + * they could still across block group boundary. + * + * Also account for superblock, inode, quota and xattr blocks + */ +static int ext4_meta_trans_blocks(struct inode *inode, int nrblocks, int chunk) +{ + ext4_group_t groups, ngroups = ext4_get_groups_count(inode->i_sb); + int gdpblocks; + int idxblocks; + int ret = 0; + + /* + * How many index blocks need to touch to modify nrblocks? 
+ * The "Chunk" flag indicating whether the nrblocks is + * physically contiguous on disk + * + * For Direct IO and fallocate, they calls get_block to allocate + * one single extent at a time, so they could set the "Chunk" flag + */ + idxblocks = ext4_index_trans_blocks(inode, nrblocks, chunk); + + ret = idxblocks; + + /* + * Now let's see how many group bitmaps and group descriptors need + * to account + */ + groups = idxblocks; + if (chunk) + groups += 1; + else + groups += nrblocks; + + gdpblocks = groups; + if (groups > ngroups) + groups = ngroups; + if (groups > EXT4_SB(inode->i_sb)->s_gdb_count) + gdpblocks = EXT4_SB(inode->i_sb)->s_gdb_count; + + /* bitmaps and block group descriptor blocks */ + ret += groups + gdpblocks; + + /* Blocks for super block, inode, quota and xattr blocks */ + ret += EXT4_META_TRANS_BLOCKS(inode->i_sb); + + return ret; +} + +/* + * Calculate the total number of credits to reserve to fit + * the modification of a single pages into a single transaction, + * which may include multiple chunks of block allocations. + * + * This could be called via ext4_write_begin() + * + * We need to consider the worse case, when + * one new block per extent. + */ +int ext4_writepage_trans_blocks(struct inode *inode) +{ + int bpp = ext4_journal_blocks_per_page(inode); + int ret; + + ret = ext4_meta_trans_blocks(inode, bpp, 0); + + /* Account for data blocks for journalled mode */ + if (ext4_should_journal_data(inode)) + ret += bpp; + return ret; +} + +/* + * Calculate the journal credits for a chunk of data modification. + * + * This is called from DIO, fallocate or whoever calling + * ext4_map_blocks() to map/allocate a chunk of contiguous disk blocks. + * + * journal buffers for data blocks are not included here, as DIO + * and fallocate do no need to journal data buffers. 
+ */ +int ext4_chunk_trans_blocks(struct inode *inode, int nrblocks) +{ + return ext4_meta_trans_blocks(inode, nrblocks, 1); +} + +/* + * The caller must have previously called ext4_reserve_inode_write(). + * Give this, we know that the caller already has write access to iloc->bh. + */ +int ext4_mark_iloc_dirty(handle_t *handle, + struct inode *inode, struct ext4_iloc *iloc) +{ + int err = 0; + + if (IS_I_VERSION(inode)) + inode_inc_iversion(inode); + + /* the do_update_inode consumes one bh->b_count */ + get_bh(iloc->bh); + + /* ext4_do_update_inode() does jbd2_journal_dirty_metadata */ + err = ext4_do_update_inode(handle, inode, iloc); + put_bh(iloc->bh); + return err; +} + +/* + * On success, We end up with an outstanding reference count against + * iloc->bh. This _must_ be cleaned up later. + */ + +int +ext4_reserve_inode_write(handle_t *handle, struct inode *inode, + struct ext4_iloc *iloc) +{ + int err; + + err = ext4_get_inode_loc(inode, iloc); + if (!err) { + BUFFER_TRACE(iloc->bh, "get_write_access"); + err = ext4_journal_get_write_access(handle, iloc->bh); + if (err) { + brelse(iloc->bh); + iloc->bh = NULL; + } + } + ext4_std_error(inode->i_sb, err); + return err; +} + +/* + * Expand an inode by new_extra_isize bytes. + * Returns 0 on success or negative error number on failure. 
+ */ +static int ext4_expand_extra_isize(struct inode *inode, + unsigned int new_extra_isize, + struct ext4_iloc iloc, + handle_t *handle) +{ + struct ext4_inode *raw_inode; + struct ext4_xattr_ibody_header *header; + + if (EXT4_I(inode)->i_extra_isize >= new_extra_isize) + return 0; + + raw_inode = ext4_raw_inode(&iloc); + + header = IHDR(inode, raw_inode); + + /* No extended attributes present */ + if (!ext4_test_inode_state(inode, EXT4_STATE_XATTR) || + header->h_magic != cpu_to_le32(EXT4_XATTR_MAGIC)) { + memset((void *)raw_inode + EXT4_GOOD_OLD_INODE_SIZE, 0, + new_extra_isize); + EXT4_I(inode)->i_extra_isize = new_extra_isize; + return 0; + } + + /* try to expand with EAs present */ + return ext4_expand_extra_isize_ea(inode, new_extra_isize, + raw_inode, handle); +} + +/* + * What we do here is to mark the in-core inode as clean with respect to inode + * dirtiness (it may still be data-dirty). + * This means that the in-core inode may be reaped by prune_icache + * without having to perform any I/O. This is a very good thing, + * because *any* task may call prune_icache - even ones which + * have a transaction open against a different journal. + * + * Is this cheating? Not really. Sure, we haven't written the + * inode out, but prune_icache isn't a user-visible syncing function. + * Whenever the user wants stuff synced (sys_sync, sys_msync, sys_fsync) + * we start and wait on commits. + * + * Is this efficient/effective? Well, we're being nice to the system + * by cleaning up our inodes proactively so they can be reaped + * without I/O. But we are potentially leaving up to five seconds' + * worth of inodes floating about which prune_icache wants us to + * write out. One way to fix that would be to get prune_icache() + * to do a write_super() to free up some memory. It has the desired + * effect. 
+ */ +int ext4_mark_inode_dirty(handle_t *handle, struct inode *inode) +{ + struct ext4_iloc iloc; + struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); + static unsigned int mnt_count; + int err, ret; + + might_sleep(); + trace_ext4_mark_inode_dirty(inode, _RET_IP_); + err = ext4_reserve_inode_write(handle, inode, &iloc); + if (ext4_handle_valid(handle) && + EXT4_I(inode)->i_extra_isize < sbi->s_want_extra_isize && + !ext4_test_inode_state(inode, EXT4_STATE_NO_EXPAND)) { + /* + * We need extra buffer credits since we may write into EA block + * with this same handle. If journal_extend fails, then it will + * only result in a minor loss of functionality for that inode. + * If this is felt to be critical, then e2fsck should be run to + * force a large enough s_min_extra_isize. + */ + if ((jbd2_journal_extend(handle, + EXT4_DATA_TRANS_BLOCKS(inode->i_sb))) == 0) { + ret = ext4_expand_extra_isize(inode, + sbi->s_want_extra_isize, + iloc, handle); + if (ret) { + ext4_set_inode_state(inode, + EXT4_STATE_NO_EXPAND); + if (mnt_count != + le16_to_cpu(sbi->s_es->s_mnt_count)) { + ext4_warning(inode->i_sb, + "Unable to expand inode %lu. Delete" + " some EAs or run e2fsck.", + inode->i_ino); + mnt_count = + le16_to_cpu(sbi->s_es->s_mnt_count); + } + } + } + } + if (!err) + err = ext4_mark_iloc_dirty(handle, inode, &iloc); + return err; +} + +/* + * ext4_dirty_inode() is called from __mark_inode_dirty() + * + * We're really interested in the case where a file is being extended. + * i_size has been changed by generic_commit_write() and we thus need + * to include the updated inode in the current transaction. + * + * Also, dquot_alloc_block() will always dirty the inode when blocks + * are allocated to the file. + * + * If the inode is marked synchronous, we don't honour that here - doing + * so would cause a commit on atime updates, which we don't bother doing. + * We handle synchronous inodes at the highest possible level. 
+ */ +void ext4_dirty_inode(struct inode *inode, int flags) +{ + handle_t *handle; + + handle = ext4_journal_start(inode, 2); + if (IS_ERR(handle)) + goto out; + + ext4_mark_inode_dirty(handle, inode); + + ext4_journal_stop(handle); +out: + return; +} + +#if 0 +/* + * Bind an inode's backing buffer_head into this transaction, to prevent + * it from being flushed to disk early. Unlike + * ext4_reserve_inode_write, this leaves behind no bh reference and + * returns no iloc structure, so the caller needs to repeat the iloc + * lookup to mark the inode dirty later. + */ +static int ext4_pin_inode(handle_t *handle, struct inode *inode) +{ + struct ext4_iloc iloc; + + int err = 0; + if (handle) { + err = ext4_get_inode_loc(inode, &iloc); + if (!err) { + BUFFER_TRACE(iloc.bh, "get_write_access"); + err = jbd2_journal_get_write_access(handle, iloc.bh); + if (!err) + err = ext4_handle_dirty_metadata(handle, + NULL, + iloc.bh); + brelse(iloc.bh); + } + } + ext4_std_error(inode->i_sb, err); + return err; +} +#endif + +int ext4_change_inode_journal_flag(struct inode *inode, int val) +{ + journal_t *journal; + handle_t *handle; + int err; + + /* + * We have to be very careful here: changing a data block's + * journaling status dynamically is dangerous. If we write a + * data block to the journal, change the status and then delete + * that block, we risk forgetting to revoke the old log record + * from the journal and so a subsequent replay can corrupt data. + * So, first we make sure that the journal is empty and that + * nobody is changing anything. + */ + + journal = EXT4_JOURNAL(inode); + if (!journal) + return 0; + if (is_journal_aborted(journal)) + return -EROFS; + /* We have to allocate physical blocks for delalloc blocks + * before flushing journal. otherwise delalloc blocks can not + * be allocated any more. even more truncate on delalloc blocks + * could trigger BUG by flushing delalloc blocks in journal. + * There is no delalloc block in non-journal data mode. 
+ */ + if (val && test_opt(inode->i_sb, DELALLOC)) { + err = ext4_alloc_da_blocks(inode); + if (err < 0) + return err; + } + + jbd2_journal_lock_updates(journal); + + /* + * OK, there are no updates running now, and all cached data is + * synced to disk. We are now in a completely consistent state + * which doesn't have anything in the journal, and we know that + * no filesystem updates are running, so it is safe to modify + * the inode's in-core data-journaling state flag now. + */ + + if (val) + ext4_set_inode_flag(inode, EXT4_INODE_JOURNAL_DATA); + else { + jbd2_journal_flush(journal); + ext4_clear_inode_flag(inode, EXT4_INODE_JOURNAL_DATA); + } + ext4_set_aops(inode); + + jbd2_journal_unlock_updates(journal); + + /* Finally we can mark the inode as dirty. */ + + handle = ext4_journal_start(inode, 1); + if (IS_ERR(handle)) + return PTR_ERR(handle); + + err = ext4_mark_inode_dirty(handle, inode); + ext4_handle_sync(handle); + ext4_journal_stop(handle); + ext4_std_error(inode->i_sb, err); + + return err; +} + +static int ext4_bh_unmapped(handle_t *handle, struct buffer_head *bh) +{ + return !buffer_mapped(bh); +} + +int ext4_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) +{ + struct page *page = vmf->page; + loff_t size; + unsigned long len; + int ret; + struct file *file = vma->vm_file; + struct inode *inode = file->f_path.dentry->d_inode; + struct address_space *mapping = inode->i_mapping; + handle_t *handle; + get_block_t *get_block; + int retries = 0; + + /* + * This check is racy but catches the common case. We rely on + * __block_page_mkwrite() to do a reliable check. + */ + vfs_check_frozen(inode->i_sb, SB_FREEZE_WRITE); + /* Delalloc case is easy... 
*/ + if (test_opt(inode->i_sb, DELALLOC) && + !ext4_should_journal_data(inode) && + !ext4_nonda_switch(inode->i_sb)) { + do { + ret = __block_page_mkwrite(vma, vmf, + ext4_da_get_block_prep); + } while (ret == -ENOSPC && + ext4_should_retry_alloc(inode->i_sb, &retries)); + goto out_ret; + } + + lock_page(page); + size = i_size_read(inode); + /* Page got truncated from under us? */ + if (page->mapping != mapping || page_offset(page) > size) { + unlock_page(page); + ret = VM_FAULT_NOPAGE; + goto out; + } + + if (page->index == size >> PAGE_CACHE_SHIFT) + len = size & ~PAGE_CACHE_MASK; + else + len = PAGE_CACHE_SIZE; + /* + * Return if we have all the buffers mapped. This avoids the need to do + * journal_start/journal_stop which can block and take a long time + */ + if (page_has_buffers(page)) { + if (!walk_page_buffers(NULL, page_buffers(page), 0, len, NULL, + ext4_bh_unmapped)) { + /* Wait so that we don't change page under IO */ + wait_on_page_writeback(page); + ret = VM_FAULT_LOCKED; + goto out; + } + } + unlock_page(page); + /* OK, we need to fill the hole... 
*/ + if (ext4_should_dioread_nolock(inode)) + get_block = ext4_get_block_write; + else + get_block = ext4_get_block; +retry_alloc: + handle = ext4_journal_start(inode, ext4_writepage_trans_blocks(inode)); + if (IS_ERR(handle)) { + ret = VM_FAULT_SIGBUS; + goto out; + } + ret = __block_page_mkwrite(vma, vmf, get_block); + if (!ret && ext4_should_journal_data(inode)) { + if (walk_page_buffers(handle, page_buffers(page), 0, + PAGE_CACHE_SIZE, NULL, do_journal_get_write_access)) { + unlock_page(page); + ret = VM_FAULT_SIGBUS; + ext4_journal_stop(handle); + goto out; + } + ext4_set_inode_state(inode, EXT4_STATE_JDATA); + } + ext4_journal_stop(handle); + if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries)) + goto retry_alloc; +out_ret: + ret = block_page_mkwrite_return(ret); +out: + return ret; +} diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c new file mode 100644 index 00000000..1365903a --- /dev/null +++ b/fs/ext4/ioctl.c @@ -0,0 +1,509 @@ +/* + * linux/fs/ext4/ioctl.c + * + * Copyright (C) 1993, 1994, 1995 + * Remy Card (card@masi.ibp.fr) + * Laboratoire MASI - Institut Blaise Pascal + * Universite Pierre et Marie Curie (Paris VI) + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include "ext4_jbd2.h" +#include "ext4.h" + +#define MAX_32_NUM ((((unsigned long long) 1) << 32) - 1) + +long ext4_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) +{ + struct inode *inode = filp->f_dentry->d_inode; + struct super_block *sb = inode->i_sb; + struct ext4_inode_info *ei = EXT4_I(inode); + unsigned int flags; + + ext4_debug("cmd = %u, arg = %lu\n", cmd, arg); + + switch (cmd) { + case EXT4_IOC_GETFLAGS: + ext4_get_inode_flags(ei); + flags = ei->i_flags & EXT4_FL_USER_VISIBLE; + return put_user(flags, (int __user *) arg); + case EXT4_IOC_SETFLAGS: { + handle_t *handle = NULL; + int err, migrate = 0; + struct ext4_iloc iloc; + unsigned int oldflags, mask, i; + unsigned int jflag; + + if 
(!inode_owner_or_capable(inode)) + return -EACCES; + + if (get_user(flags, (int __user *) arg)) + return -EFAULT; + + err = mnt_want_write_file(filp); + if (err) + return err; + + flags = ext4_mask_flags(inode->i_mode, flags); + + err = -EPERM; + mutex_lock(&inode->i_mutex); + /* Is it quota file? Do not allow user to mess with it */ + if (IS_NOQUOTA(inode)) + goto flags_out; + + oldflags = ei->i_flags; + + /* The JOURNAL_DATA flag is modifiable only by root */ + jflag = flags & EXT4_JOURNAL_DATA_FL; + + /* + * The IMMUTABLE and APPEND_ONLY flags can only be changed by + * the relevant capability. + * + * This test looks nicer. Thanks to Pauline Middelink + */ + if ((flags ^ oldflags) & (EXT4_APPEND_FL | EXT4_IMMUTABLE_FL)) { + if (!capable(CAP_LINUX_IMMUTABLE)) + goto flags_out; + } + + /* + * The JOURNAL_DATA flag can only be changed by + * the relevant capability. + */ + if ((jflag ^ oldflags) & (EXT4_JOURNAL_DATA_FL)) { + if (!capable(CAP_SYS_RESOURCE)) + goto flags_out; + } + if (oldflags & EXT4_EXTENTS_FL) { + /* We don't support clearning extent flags */ + if (!(flags & EXT4_EXTENTS_FL)) { + err = -EOPNOTSUPP; + goto flags_out; + } + } else if (flags & EXT4_EXTENTS_FL) { + /* migrate the file */ + migrate = 1; + flags &= ~EXT4_EXTENTS_FL; + } + + if (flags & EXT4_EOFBLOCKS_FL) { + /* we don't support adding EOFBLOCKS flag */ + if (!(oldflags & EXT4_EOFBLOCKS_FL)) { + err = -EOPNOTSUPP; + goto flags_out; + } + } else if (oldflags & EXT4_EOFBLOCKS_FL) + ext4_truncate(inode); + + handle = ext4_journal_start(inode, 1); + if (IS_ERR(handle)) { + err = PTR_ERR(handle); + goto flags_out; + } + if (IS_SYNC(inode)) + ext4_handle_sync(handle); + err = ext4_reserve_inode_write(handle, inode, &iloc); + if (err) + goto flags_err; + + for (i = 0, mask = 1; i < 32; i++, mask <<= 1) { + if (!(mask & EXT4_FL_USER_MODIFIABLE)) + continue; + if (mask & flags) + ext4_set_inode_flag(inode, i); + else + ext4_clear_inode_flag(inode, i); + } + + ext4_set_inode_flags(inode); + 
inode->i_ctime = ext4_current_time(inode); + + err = ext4_mark_iloc_dirty(handle, inode, &iloc); +flags_err: + ext4_journal_stop(handle); + if (err) + goto flags_out; + + if ((jflag ^ oldflags) & (EXT4_JOURNAL_DATA_FL)) + err = ext4_change_inode_journal_flag(inode, jflag); + if (err) + goto flags_out; + if (migrate) + err = ext4_ext_migrate(inode); +flags_out: + mutex_unlock(&inode->i_mutex); + mnt_drop_write_file(filp); + return err; + } + case EXT4_IOC_GETVERSION: + case EXT4_IOC_GETVERSION_OLD: + return put_user(inode->i_generation, (int __user *) arg); + case EXT4_IOC_SETVERSION: + case EXT4_IOC_SETVERSION_OLD: { + handle_t *handle; + struct ext4_iloc iloc; + __u32 generation; + int err; + + if (!inode_owner_or_capable(inode)) + return -EPERM; + + err = mnt_want_write_file(filp); + if (err) + return err; + if (get_user(generation, (int __user *) arg)) { + err = -EFAULT; + goto setversion_out; + } + + mutex_lock(&inode->i_mutex); + handle = ext4_journal_start(inode, 1); + if (IS_ERR(handle)) { + err = PTR_ERR(handle); + goto unlock_out; + } + err = ext4_reserve_inode_write(handle, inode, &iloc); + if (err == 0) { + inode->i_ctime = ext4_current_time(inode); + inode->i_generation = generation; + err = ext4_mark_iloc_dirty(handle, inode, &iloc); + } + ext4_journal_stop(handle); + +unlock_out: + mutex_unlock(&inode->i_mutex); +setversion_out: + mnt_drop_write_file(filp); + return err; + } + case EXT4_IOC_GROUP_EXTEND: { + ext4_fsblk_t n_blocks_count; + int err, err2=0; + + err = ext4_resize_begin(sb); + if (err) + return err; + + if (get_user(n_blocks_count, (__u32 __user *)arg)) { + err = -EFAULT; + goto group_extend_out; + } + + if (EXT4_HAS_RO_COMPAT_FEATURE(sb, + EXT4_FEATURE_RO_COMPAT_BIGALLOC)) { + ext4_msg(sb, KERN_ERR, + "Online resizing not supported with bigalloc"); + err = -EOPNOTSUPP; + goto group_extend_out; + } + + err = mnt_want_write_file(filp); + if (err) + goto group_extend_out; + + err = ext4_group_extend(sb, EXT4_SB(sb)->s_es, n_blocks_count); + 
if (EXT4_SB(sb)->s_journal) { + jbd2_journal_lock_updates(EXT4_SB(sb)->s_journal); + err2 = jbd2_journal_flush(EXT4_SB(sb)->s_journal); + jbd2_journal_unlock_updates(EXT4_SB(sb)->s_journal); + } + if (err == 0) + err = err2; + mnt_drop_write_file(filp); +group_extend_out: + ext4_resize_end(sb); + return err; + } + + case EXT4_IOC_MOVE_EXT: { + struct move_extent me; + struct file *donor_filp; + int err; + + if (!(filp->f_mode & FMODE_READ) || + !(filp->f_mode & FMODE_WRITE)) + return -EBADF; + + if (copy_from_user(&me, + (struct move_extent __user *)arg, sizeof(me))) + return -EFAULT; + me.moved_len = 0; + + donor_filp = fget(me.donor_fd); + if (!donor_filp) + return -EBADF; + + if (!(donor_filp->f_mode & FMODE_WRITE)) { + err = -EBADF; + goto mext_out; + } + + if (EXT4_HAS_RO_COMPAT_FEATURE(sb, + EXT4_FEATURE_RO_COMPAT_BIGALLOC)) { + ext4_msg(sb, KERN_ERR, + "Online defrag not supported with bigalloc"); + return -EOPNOTSUPP; + } + + err = mnt_want_write_file(filp); + if (err) + goto mext_out; + + err = ext4_move_extents(filp, donor_filp, me.orig_start, + me.donor_start, me.len, &me.moved_len); + mnt_drop_write_file(filp); + mnt_drop_write(filp->f_path.mnt); + + if (copy_to_user((struct move_extent __user *)arg, + &me, sizeof(me))) + err = -EFAULT; +mext_out: + fput(donor_filp); + return err; + } + + case EXT4_IOC_GROUP_ADD: { + struct ext4_new_group_data input; + int err, err2=0; + + err = ext4_resize_begin(sb); + if (err) + return err; + + if (copy_from_user(&input, (struct ext4_new_group_input __user *)arg, + sizeof(input))) { + err = -EFAULT; + goto group_add_out; + } + + if (EXT4_HAS_RO_COMPAT_FEATURE(sb, + EXT4_FEATURE_RO_COMPAT_BIGALLOC)) { + ext4_msg(sb, KERN_ERR, + "Online resizing not supported with bigalloc"); + err = -EOPNOTSUPP; + goto group_add_out; + } + + err = mnt_want_write_file(filp); + if (err) + goto group_add_out; + + err = ext4_group_add(sb, &input); + if (EXT4_SB(sb)->s_journal) { + jbd2_journal_lock_updates(EXT4_SB(sb)->s_journal); + err2 = 
jbd2_journal_flush(EXT4_SB(sb)->s_journal); + jbd2_journal_unlock_updates(EXT4_SB(sb)->s_journal); + } + if (err == 0) + err = err2; + mnt_drop_write_file(filp); +group_add_out: + ext4_resize_end(sb); + return err; + } + + case EXT4_IOC_MIGRATE: + { + int err; + if (!inode_owner_or_capable(inode)) + return -EACCES; + + err = mnt_want_write_file(filp); + if (err) + return err; + /* + * inode_mutex prevent write and truncate on the file. + * Read still goes through. We take i_data_sem in + * ext4_ext_swap_inode_data before we switch the + * inode format to prevent read. + */ + mutex_lock(&(inode->i_mutex)); + err = ext4_ext_migrate(inode); + mutex_unlock(&(inode->i_mutex)); + mnt_drop_write_file(filp); + return err; + } + + case EXT4_IOC_ALLOC_DA_BLKS: + { + int err; + if (!inode_owner_or_capable(inode)) + return -EACCES; + + err = mnt_want_write_file(filp); + if (err) + return err; + err = ext4_alloc_da_blocks(inode); + mnt_drop_write_file(filp); + return err; + } + + case EXT4_IOC_RESIZE_FS: { + ext4_fsblk_t n_blocks_count; + struct super_block *sb = inode->i_sb; + int err = 0, err2 = 0; + + if (EXT4_HAS_RO_COMPAT_FEATURE(sb, + EXT4_FEATURE_RO_COMPAT_BIGALLOC)) { + ext4_msg(sb, KERN_ERR, + "Online resizing not (yet) supported with bigalloc"); + return -EOPNOTSUPP; + } + + if (EXT4_HAS_INCOMPAT_FEATURE(sb, + EXT4_FEATURE_INCOMPAT_META_BG)) { + ext4_msg(sb, KERN_ERR, + "Online resizing not (yet) supported with meta_bg"); + return -EOPNOTSUPP; + } + + if (copy_from_user(&n_blocks_count, (__u64 __user *)arg, + sizeof(__u64))) { + return -EFAULT; + } + + if (n_blocks_count > MAX_32_NUM && + !EXT4_HAS_INCOMPAT_FEATURE(sb, + EXT4_FEATURE_INCOMPAT_64BIT)) { + ext4_msg(sb, KERN_ERR, + "File system only supports 32-bit block numbers"); + return -EOPNOTSUPP; + } + + err = ext4_resize_begin(sb); + if (err) + return err; + + err = mnt_want_write(filp->f_path.mnt); + if (err) + goto resizefs_out; + + err = ext4_resize_fs(sb, n_blocks_count); + if (EXT4_SB(sb)->s_journal) { + 
jbd2_journal_lock_updates(EXT4_SB(sb)->s_journal); + err2 = jbd2_journal_flush(EXT4_SB(sb)->s_journal); + jbd2_journal_unlock_updates(EXT4_SB(sb)->s_journal); + } + if (err == 0) + err = err2; + mnt_drop_write(filp->f_path.mnt); +resizefs_out: + ext4_resize_end(sb); + return err; + } + + case FITRIM: + { + struct request_queue *q = bdev_get_queue(sb->s_bdev); + struct fstrim_range range; + int ret = 0; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + if (!blk_queue_discard(q)) + return -EOPNOTSUPP; + + if (EXT4_HAS_RO_COMPAT_FEATURE(sb, + EXT4_FEATURE_RO_COMPAT_BIGALLOC)) { + ext4_msg(sb, KERN_ERR, + "FITRIM not supported with bigalloc"); + return -EOPNOTSUPP; + } + + if (copy_from_user(&range, (struct fstrim_range __user *)arg, + sizeof(range))) + return -EFAULT; + + range.minlen = max((unsigned int)range.minlen, + q->limits.discard_granularity); + ret = ext4_trim_fs(sb, &range); + if (ret < 0) + return ret; + + if (copy_to_user((struct fstrim_range __user *)arg, &range, + sizeof(range))) + return -EFAULT; + + return 0; + } + + default: + return -ENOTTY; + } +} + +#ifdef CONFIG_COMPAT +long ext4_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg) +{ + /* These are just misnamed, they actually get/put from/to user an int */ + switch (cmd) { + case EXT4_IOC32_GETFLAGS: + cmd = EXT4_IOC_GETFLAGS; + break; + case EXT4_IOC32_SETFLAGS: + cmd = EXT4_IOC_SETFLAGS; + break; + case EXT4_IOC32_GETVERSION: + cmd = EXT4_IOC_GETVERSION; + break; + case EXT4_IOC32_SETVERSION: + cmd = EXT4_IOC_SETVERSION; + break; + case EXT4_IOC32_GROUP_EXTEND: + cmd = EXT4_IOC_GROUP_EXTEND; + break; + case EXT4_IOC32_GETVERSION_OLD: + cmd = EXT4_IOC_GETVERSION_OLD; + break; + case EXT4_IOC32_SETVERSION_OLD: + cmd = EXT4_IOC_SETVERSION_OLD; + break; + case EXT4_IOC32_GETRSVSZ: + cmd = EXT4_IOC_GETRSVSZ; + break; + case EXT4_IOC32_SETRSVSZ: + cmd = EXT4_IOC_SETRSVSZ; + break; + case EXT4_IOC32_GROUP_ADD: { + struct compat_ext4_new_group_input __user *uinput; + struct 
ext4_new_group_input input; + mm_segment_t old_fs; + int err; + + uinput = compat_ptr(arg); + err = get_user(input.group, &uinput->group); + err |= get_user(input.block_bitmap, &uinput->block_bitmap); + err |= get_user(input.inode_bitmap, &uinput->inode_bitmap); + err |= get_user(input.inode_table, &uinput->inode_table); + err |= get_user(input.blocks_count, &uinput->blocks_count); + err |= get_user(input.reserved_blocks, + &uinput->reserved_blocks); + if (err) + return -EFAULT; + old_fs = get_fs(); + set_fs(KERNEL_DS); + err = ext4_ioctl(file, EXT4_IOC_GROUP_ADD, + (unsigned long) &input); + set_fs(old_fs); + return err; + } + case EXT4_IOC_MOVE_EXT: + case FITRIM: + case EXT4_IOC_RESIZE_FS: + break; + default: + return -ENOIOCTLCMD; + } + return ext4_ioctl(file, cmd, (unsigned long) compat_ptr(arg)); +} +#endif diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c new file mode 100644 index 00000000..6b0a57ea --- /dev/null +++ b/fs/ext4/mballoc.c @@ -0,0 +1,5047 @@ +/* + * Copyright (c) 2003-2006, Cluster File Systems, Inc, info@clusterfs.com + * Written by Alex Tomas + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + + +/* + * mballoc.c contains the multiblocks allocation routines + */ + +#include "ext4_jbd2.h" +#include "mballoc.h" +#include +#include +#include + +/* + * MUSTDO: + * - test ext4_ext_search_left() and ext4_ext_search_right() + * - search for metadata in few groups + * + * TODO v4: + * - normalization should take into account whether file is still open + * - discard preallocations if no free space left (policy?) + * - don't normalize tails + * - quota + * - reservation for superuser + * + * TODO v3: + * - bitmap read-ahead (proposed by Oleg Drokin aka green) + * - track min/max extents in each group for better group selection + * - mb_mark_used() may allocate chunk right after splitting buddy + * - tree of groups sorted by number of free blocks + * - error handling + */ + +/* + * An allocation request asks for a number of blocks + * near to the goal (block) value specified. + * + * During initialization phase of the allocator we decide to use the + * group preallocation or inode preallocation depending on the size of + * the file. The size of the file could be the resulting file size we + * would have after allocation, or the current file size, whichever + * is larger. If the size is less than sbi->s_mb_stream_request we + * select to use the group preallocation. The default value of + * s_mb_stream_request is 16 blocks. This can also be tuned via + * /sys/fs/ext4/<partition>/mb_stream_req. The value is represented in + * terms of number of blocks. + * + * The main motivation for having small files use group preallocation is to + * ensure that we have small files closer together on the disk. 
+ * + * First stage the allocator looks at the inode prealloc list, + * ext4_inode_info->i_prealloc_list, which contains list of prealloc + * spaces for this particular inode. The inode prealloc space is + * represented as: + * + * pa_lstart -> the logical start block for this prealloc space + * pa_pstart -> the physical start block for this prealloc space + * pa_len -> length for this prealloc space (in clusters) + * pa_free -> free space available in this prealloc space (in clusters) + * + * The inode preallocation space is used looking at the _logical_ start + * block. If only the logical file block falls within the range of prealloc + * space we will consume the particular prealloc space. This makes sure that + * we have contiguous physical blocks representing the file blocks + * + * The important thing to be noted in case of inode prealloc space is that + * we don't modify the values associated to inode prealloc space except + * pa_free. + * + * If we are not able to find blocks in the inode prealloc space and if we + * have the group allocation flag set then we look at the locality group + * prealloc space. These are per CPU prealloc list represented as + * + * ext4_sb_info.s_locality_groups[smp_processor_id()] + * + * The reason for having a per cpu locality group is to reduce the contention + * between CPUs. It is possible to get scheduled at this point. + * + * The locality group prealloc space is used looking at whether we have + * enough free space (pa_free) within the prealloc space. + * + * If we can't allocate blocks via inode prealloc or/and locality group + * prealloc then we look at the buddy cache. The buddy cache is represented + * by ext4_sb_info.s_buddy_cache (struct inode) whose file offset gets + * mapped to the buddy and bitmap information regarding different + * groups. The buddy information is attached to buddy cache inode so that + * we can access them through the page cache. 
The information regarding + * each group is loaded via ext4_mb_load_buddy. The information involves + * block bitmap and buddy information. The information is stored in the + * inode as: + * + * { page } + * [ group 0 bitmap][ group 0 buddy] [group 1][ group 1]... + * + * + * one block each for bitmap and buddy information. So for each group we + * take up 2 blocks. A page can contain blocks_per_page (PAGE_CACHE_SIZE / + * blocksize) blocks. So it can have information regarding groups_per_page + * which is blocks_per_page/2 + * + * The buddy cache inode is not stored on disk. The inode is thrown + * away when the filesystem is unmounted. + * + * We look for count number of blocks in the buddy cache. If we were able + * to locate that many free blocks we return with additional information + * regarding the rest of the contiguous physical blocks available + * + * Before allocating blocks via buddy cache we normalize the request + * blocks. This ensures we ask for more blocks than we needed. The extra + * blocks that we get after allocation are added to the respective prealloc + * list. In case of inode preallocation we follow a list of heuristics + * based on file size. This can be found in ext4_mb_normalize_request. If + * we are doing a group prealloc we try to normalize the request to + * sbi->s_mb_group_prealloc. The default value of s_mb_group_prealloc is + * dependent on the cluster size; for non-bigalloc file systems, it is + * 512 blocks. This can be tuned via + * /sys/fs/ext4/<partition>/mb_group_prealloc. The value is represented in + * terms of number of blocks. If we have mounted the file system with -O + * stripe=<value> option the group prealloc request is normalized to the + * smallest multiple of the stripe value (sbi->s_stripe) which is + * greater than the default mb_group_prealloc. + * + * The regular allocator (using the buddy cache) supports a few tunables. 
+ * + * /sys/fs/ext4/<partition>/mb_min_to_scan + * /sys/fs/ext4/<partition>/mb_max_to_scan + * /sys/fs/ext4/<partition>/mb_order2_req + * + * The regular allocator uses buddy scan only if the request len is power of + * 2 blocks and the order of allocation is >= sbi->s_mb_order2_reqs. The + * value of s_mb_order2_reqs can be tuned via + * /sys/fs/ext4/<partition>/mb_order2_req. If the request len is equal to + * stripe size (sbi->s_stripe), we try to search for contiguous block in + * stripe size. This should result in better allocation on RAID setups. If + * not, we search in the specific group using bitmap for best extents. The + * tunables min_to_scan and max_to_scan control the behaviour here. + * min_to_scan indicates how long the mballoc __must__ look for a best + * extent and max_to_scan indicates how long the mballoc __can__ look for a + * best extent in the found extents. Searching for the blocks starts with + * the group specified as the goal value in allocation context via + * ac_g_ex. Each group is first checked based on the criteria whether it + * can be used for allocation. ext4_mb_good_group explains how the groups are + * checked. + * + * Both prealloc spaces are populated as above. So for the first + * request we will hit the buddy cache which will result in this prealloc + * space getting filled. The prealloc space is then later used for the + * subsequent request. + */ + +/* + * mballoc operates on the following data: + * - on-disk bitmap + * - in-core buddy (actually includes buddy and bitmap) + * - preallocation descriptors (PAs) + * + * there are two types of preallocations: + * - inode + * assigned to specific inode and can be used for this inode only. + * it describes part of inode's space preallocated to specific + * physical blocks. any block from that preallocation can be used + * independently. the descriptor just tracks number of blocks left + * unused. so, before taking some block from descriptor, one must + * make sure the corresponding logical block isn't allocated yet. 
this + * also means that freeing any block within descriptor's range + * must discard all preallocated blocks. + * - locality group + * assigned to specific locality group which does not translate to + * permanent set of inodes: inode can join and leave group. space + * from this type of preallocation can be used for any inode. thus + * it's consumed from the beginning to the end. + * + * relation between them can be expressed as: + * in-core buddy = on-disk bitmap + preallocation descriptors + * + * this mean blocks mballoc considers used are: + * - allocated blocks (persistent) + * - preallocated blocks (non-persistent) + * + * consistency in mballoc world means that at any time a block is either + * free or used in ALL structures. notice: "any time" should not be read + * literally -- time is discrete and delimited by locks. + * + * to keep it simple, we don't use block numbers, instead we count number of + * blocks: how many blocks marked used/free in on-disk bitmap, buddy and PA. + * + * all operations can be expressed as: + * - init buddy: buddy = on-disk + PAs + * - new PA: buddy += N; PA = N + * - use inode PA: on-disk += N; PA -= N + * - discard inode PA buddy -= on-disk - PA; PA = 0 + * - use locality group PA on-disk += N; PA -= N + * - discard locality group PA buddy -= PA; PA = 0 + * note: 'buddy -= on-disk - PA' is used to show that on-disk bitmap + * is used in real operation because we can't know actual used + * bits from PA, only from on-disk bitmap + * + * if we follow this strict logic, then all operations above should be atomic. + * given some of them can block, we'd have to use something like semaphores + * killing performance on high-end SMP hardware. let's try to relax it using + * the following knowledge: + * 1) if buddy is referenced, it's already initialized + * 2) while block is used in buddy and the buddy is referenced, + * nobody can re-allocate that block + * 3) we work on bitmaps and '+' actually means 'set bits'. 
if on-disk has + * bit set and PA claims same block, it's OK. IOW, one can set bit in + * on-disk bitmap if buddy has same bit set or/and PA covers corresponded + * block + * + * so, now we're building a concurrency table: + * - init buddy vs. + * - new PA + * blocks for PA are allocated in the buddy, buddy must be referenced + * until PA is linked to allocation group to avoid concurrent buddy init + * - use inode PA + * we need to make sure that either on-disk bitmap or PA has uptodate data + * given (3) we care that PA-=N operation doesn't interfere with init + * - discard inode PA + * the simplest way would be to have buddy initialized by the discard + * - use locality group PA + * again PA-=N must be serialized with init + * - discard locality group PA + * the simplest way would be to have buddy initialized by the discard + * - new PA vs. + * - use inode PA + * i_data_sem serializes them + * - discard inode PA + * discard process must wait until PA isn't used by another process + * - use locality group PA + * some mutex should serialize them + * - discard locality group PA + * discard process must wait until PA isn't used by another process + * - use inode PA + * - use inode PA + * i_data_sem or another mutex should serializes them + * - discard inode PA + * discard process must wait until PA isn't used by another process + * - use locality group PA + * nothing wrong here -- they're different PAs covering different blocks + * - discard locality group PA + * discard process must wait until PA isn't used by another process + * + * now we're ready to make few consequences: + * - PA is referenced and while it is no discard is possible + * - PA is referenced until block isn't marked in on-disk bitmap + * - PA changes only after on-disk bitmap + * - discard must not compete with init. 
either init is done before + * any discard or they're serialized somehow + * - buddy init as sum of on-disk bitmap and PAs is done atomically + * + * a special case when we've used PA to emptiness. no need to modify buddy + * in this case, but we should care about concurrent init + * + */ + + /* + * Logic in few words: + * + * - allocation: + * load group + * find blocks + * mark bits in on-disk bitmap + * release group + * + * - use preallocation: + * find proper PA (per-inode or group) + * load group + * mark bits in on-disk bitmap + * release group + * release PA + * + * - free: + * load group + * mark bits in on-disk bitmap + * release group + * + * - discard preallocations in group: + * mark PAs deleted + * move them onto local list + * load on-disk bitmap + * load group + * remove PA from object (inode or locality group) + * mark free blocks in-core + * + * - discard inode's preallocations: + */ + +/* + * Locking rules + * + * Locks: + * - bitlock on a group (group) + * - object (inode/locality) (object) + * - per-pa lock (pa) + * + * Paths: + * - new pa + * object + * group + * + * - find and use pa: + * pa + * + * - release consumed pa: + * pa + * group + * object + * + * - generate in-core bitmap: + * group + * pa + * + * - discard all for given object (inode, locality group): + * object + * pa + * group + * + * - discard all for given group: + * group + * pa + * group + * object + * + */ +static struct kmem_cache *ext4_pspace_cachep; +static struct kmem_cache *ext4_ac_cachep; +static struct kmem_cache *ext4_free_data_cachep; + +/* We create slab caches for groupinfo data structures based on the + * superblock block size. 
There will be one per mounted filesystem for + * each unique s_blocksize_bits */ +#define NR_GRPINFO_CACHES 8 +static struct kmem_cache *ext4_groupinfo_caches[NR_GRPINFO_CACHES]; + +static const char *ext4_groupinfo_slab_names[NR_GRPINFO_CACHES] = { + "ext4_groupinfo_1k", "ext4_groupinfo_2k", "ext4_groupinfo_4k", + "ext4_groupinfo_8k", "ext4_groupinfo_16k", "ext4_groupinfo_32k", + "ext4_groupinfo_64k", "ext4_groupinfo_128k" +}; + +static void ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap, + ext4_group_t group); +static void ext4_mb_generate_from_freelist(struct super_block *sb, void *bitmap, + ext4_group_t group); +static void ext4_free_data_callback(struct super_block *sb, + struct ext4_journal_cb_entry *jce, int rc); + +static inline void *mb_correct_addr_and_bit(int *bit, void *addr) +{ +#if BITS_PER_LONG == 64 + *bit += ((unsigned long) addr & 7UL) << 3; + addr = (void *) ((unsigned long) addr & ~7UL); +#elif BITS_PER_LONG == 32 + *bit += ((unsigned long) addr & 3UL) << 3; + addr = (void *) ((unsigned long) addr & ~3UL); +#else +#error "how many bits you are?!" 
+#endif + return addr; +} + +static inline int mb_test_bit(int bit, void *addr) +{ + /* + * ext4_test_bit on architecture like powerpc + * needs unsigned long aligned address + */ + addr = mb_correct_addr_and_bit(&bit, addr); + return ext4_test_bit(bit, addr); +} + +static inline void mb_set_bit(int bit, void *addr) +{ + addr = mb_correct_addr_and_bit(&bit, addr); + ext4_set_bit(bit, addr); +} + +static inline void mb_clear_bit(int bit, void *addr) +{ + addr = mb_correct_addr_and_bit(&bit, addr); + ext4_clear_bit(bit, addr); +} + +static inline int mb_find_next_zero_bit(void *addr, int max, int start) +{ + int fix = 0, ret, tmpmax; + addr = mb_correct_addr_and_bit(&fix, addr); + tmpmax = max + fix; + start += fix; + + ret = ext4_find_next_zero_bit(addr, tmpmax, start) - fix; + if (ret > max) + return max; + return ret; +} + +static inline int mb_find_next_bit(void *addr, int max, int start) +{ + int fix = 0, ret, tmpmax; + addr = mb_correct_addr_and_bit(&fix, addr); + tmpmax = max + fix; + start += fix; + + ret = ext4_find_next_bit(addr, tmpmax, start) - fix; + if (ret > max) + return max; + return ret; +} + +static void *mb_find_buddy(struct ext4_buddy *e4b, int order, int *max) +{ + char *bb; + + BUG_ON(e4b->bd_bitmap == e4b->bd_buddy); + BUG_ON(max == NULL); + + if (order > e4b->bd_blkbits + 1) { + *max = 0; + return NULL; + } + + /* at order 0 we see each particular block */ + if (order == 0) { + *max = 1 << (e4b->bd_blkbits + 3); + return e4b->bd_bitmap; + } + + bb = e4b->bd_buddy + EXT4_SB(e4b->bd_sb)->s_mb_offsets[order]; + *max = EXT4_SB(e4b->bd_sb)->s_mb_maxs[order]; + + return bb; +} + +#ifdef DOUBLE_CHECK +static void mb_free_blocks_double(struct inode *inode, struct ext4_buddy *e4b, + int first, int count) +{ + int i; + struct super_block *sb = e4b->bd_sb; + + if (unlikely(e4b->bd_info->bb_bitmap == NULL)) + return; + assert_spin_locked(ext4_group_lock_ptr(sb, e4b->bd_group)); + for (i = 0; i < count; i++) { + if (!mb_test_bit(first + i, 
e4b->bd_info->bb_bitmap)) { + ext4_fsblk_t blocknr; + + blocknr = ext4_group_first_block_no(sb, e4b->bd_group); + blocknr += EXT4_C2B(EXT4_SB(sb), first + i); + ext4_grp_locked_error(sb, e4b->bd_group, + inode ? inode->i_ino : 0, + blocknr, + "freeing block already freed " + "(bit %u)", + first + i); + } + mb_clear_bit(first + i, e4b->bd_info->bb_bitmap); + } +} + +static void mb_mark_used_double(struct ext4_buddy *e4b, int first, int count) +{ + int i; + + if (unlikely(e4b->bd_info->bb_bitmap == NULL)) + return; + assert_spin_locked(ext4_group_lock_ptr(e4b->bd_sb, e4b->bd_group)); + for (i = 0; i < count; i++) { + BUG_ON(mb_test_bit(first + i, e4b->bd_info->bb_bitmap)); + mb_set_bit(first + i, e4b->bd_info->bb_bitmap); + } +} + +static void mb_cmp_bitmaps(struct ext4_buddy *e4b, void *bitmap) +{ + if (memcmp(e4b->bd_info->bb_bitmap, bitmap, e4b->bd_sb->s_blocksize)) { + unsigned char *b1, *b2; + int i; + b1 = (unsigned char *) e4b->bd_info->bb_bitmap; + b2 = (unsigned char *) bitmap; + for (i = 0; i < e4b->bd_sb->s_blocksize; i++) { + if (b1[i] != b2[i]) { + ext4_msg(e4b->bd_sb, KERN_ERR, + "corruption in group %u " + "at byte %u(%u): %x in copy != %x " + "on disk/prealloc", + e4b->bd_group, i, i * 8, b1[i], b2[i]); + BUG(); + } + } + } +} + +#else +static inline void mb_free_blocks_double(struct inode *inode, + struct ext4_buddy *e4b, int first, int count) +{ + return; +} +static inline void mb_mark_used_double(struct ext4_buddy *e4b, + int first, int count) +{ + return; +} +static inline void mb_cmp_bitmaps(struct ext4_buddy *e4b, void *bitmap) +{ + return; +} +#endif + +#ifdef AGGRESSIVE_CHECK + +#define MB_CHECK_ASSERT(assert) \ +do { \ + if (!(assert)) { \ + printk(KERN_EMERG \ + "Assertion failure in %s() at %s:%d: \"%s\"\n", \ + function, file, line, # assert); \ + BUG(); \ + } \ +} while (0) + +static int __mb_check_buddy(struct ext4_buddy *e4b, char *file, + const char *function, int line) +{ + struct super_block *sb = e4b->bd_sb; + int order = 
e4b->bd_blkbits + 1; + int max; + int max2; + int i; + int j; + int k; + int count; + struct ext4_group_info *grp; + int fragments = 0; + int fstart; + struct list_head *cur; + void *buddy; + void *buddy2; + + { + static int mb_check_counter; + if (mb_check_counter++ % 100 != 0) + return 0; + } + + while (order > 1) { + buddy = mb_find_buddy(e4b, order, &max); + MB_CHECK_ASSERT(buddy); + buddy2 = mb_find_buddy(e4b, order - 1, &max2); + MB_CHECK_ASSERT(buddy2); + MB_CHECK_ASSERT(buddy != buddy2); + MB_CHECK_ASSERT(max * 2 == max2); + + count = 0; + for (i = 0; i < max; i++) { + + if (mb_test_bit(i, buddy)) { + /* only single bit in buddy2 may be 1 */ + if (!mb_test_bit(i << 1, buddy2)) { + MB_CHECK_ASSERT( + mb_test_bit((i<<1)+1, buddy2)); + } else if (!mb_test_bit((i << 1) + 1, buddy2)) { + MB_CHECK_ASSERT( + mb_test_bit(i << 1, buddy2)); + } + continue; + } + + /* both bits in buddy2 must be 1 */ + MB_CHECK_ASSERT(mb_test_bit(i << 1, buddy2)); + MB_CHECK_ASSERT(mb_test_bit((i << 1) + 1, buddy2)); + + for (j = 0; j < (1 << order); j++) { + k = (i * (1 << order)) + j; + MB_CHECK_ASSERT( + !mb_test_bit(k, e4b->bd_bitmap)); + } + count++; + } + MB_CHECK_ASSERT(e4b->bd_info->bb_counters[order] == count); + order--; + } + + fstart = -1; + buddy = mb_find_buddy(e4b, 0, &max); + for (i = 0; i < max; i++) { + if (!mb_test_bit(i, buddy)) { + MB_CHECK_ASSERT(i >= e4b->bd_info->bb_first_free); + if (fstart == -1) { + fragments++; + fstart = i; + } + continue; + } + fstart = -1; + /* check used bits only */ + for (j = 0; j < e4b->bd_blkbits + 1; j++) { + buddy2 = mb_find_buddy(e4b, j, &max2); + k = i >> j; + MB_CHECK_ASSERT(k < max2); + MB_CHECK_ASSERT(mb_test_bit(k, buddy2)); + } + } + MB_CHECK_ASSERT(!EXT4_MB_GRP_NEED_INIT(e4b->bd_info)); + MB_CHECK_ASSERT(e4b->bd_info->bb_fragments == fragments); + + grp = ext4_get_group_info(sb, e4b->bd_group); + list_for_each(cur, &grp->bb_prealloc_list) { + ext4_group_t groupnr; + struct ext4_prealloc_space *pa; + pa = list_entry(cur, 
struct ext4_prealloc_space, pa_group_list); + ext4_get_group_no_and_offset(sb, pa->pa_pstart, &groupnr, &k); + MB_CHECK_ASSERT(groupnr == e4b->bd_group); + for (i = 0; i < pa->pa_len; i++) + MB_CHECK_ASSERT(mb_test_bit(k + i, buddy)); + } + return 0; +} +#undef MB_CHECK_ASSERT +#define mb_check_buddy(e4b) __mb_check_buddy(e4b, \ + __FILE__, __func__, __LINE__) +#else +#define mb_check_buddy(e4b) +#endif + +/* + * Divide the free range starting at @first with length @len into + * chunks whose sizes are powers of 2. + * For each chunk, increase bb_counters[] for the chunk's order and, when + * the order is above 0, clear the chunk's bit in the @buddy bitmap of + * that order (located at sbi->s_mb_offsets[order] inside @buddy). + * + * @sb: superblock; supplies s_blocksize_bits and the per-order offsets + * @buddy: start of the buddy data being built + * @first: first free bit of the range + * @len: number of free bits in the range; must not exceed + * EXT4_CLUSTERS_PER_GROUP(sb) + * @grp: group info whose bb_counters[] are updated + */ +static void ext4_mb_mark_free_simple(struct super_block *sb, + void *buddy, ext4_grpblk_t first, ext4_grpblk_t len, + struct ext4_group_info *grp) +{ + struct ext4_sb_info *sbi = EXT4_SB(sb); + ext4_grpblk_t min; + ext4_grpblk_t max; + ext4_grpblk_t chunk; + /* + * Must be wider than 16 bits: 2 << s_blocksize_bits is 0x20000 when + * s_blocksize_bits is 16 (64k blocks). Truncated to 0 it would make + * ffs(first | border) return 0 for first == 0, giving order -1. + */ + unsigned int border; + + BUG_ON(len > EXT4_CLUSTERS_PER_GROUP(sb)); + + /* sentinel bit: caps the chunk order at s_blocksize_bits + 1 */ + border = 2 << sb->s_blocksize_bits; + + while (len > 0) { + /* find how many blocks can be covered since this position */ + max = ffs(first | border) - 1; + + /* find how many blocks of power 2 we need to mark */ + min = fls(len) - 1; + + if (max < min) + min = max; + chunk = 1 << min; + + /* mark multiblock chunks only */ + grp->bb_counters[min]++; + if (min > 0) + mb_clear_bit(first >> min, + buddy + sbi->s_mb_offsets[min]); + + len -= chunk; + first += chunk; + } +} + +/* + * Cache the order of the largest free extent we have available in this block + * group. 
+ */ +static void +mb_set_largest_free_order(struct super_block *sb, struct ext4_group_info *grp) +{ + int i; + int bits; + + grp->bb_largest_free_order = -1; /* uninit; stays -1 if every bb_counters[] entry is 0 */ + /* highest order is s_blocksize_bits + 1; scan downwards and stop at the first non-empty order */ + bits = sb->s_blocksize_bits + 1; + for (i = bits; i >= 0; i--) { + if (grp->bb_counters[i] > 0) { + grp->bb_largest_free_order = i; + break; + } + } +} +/* + * Build the buddy data of @group in @buddy from @bitmap. + * + * Walks the runs of zero bits found in the first EXT4_CLUSTERS_PER_GROUP(sb) + * bits of @bitmap. Each run counts as one fragment; a run longer than one + * bit is handed to ext4_mb_mark_free_simple(), a single free bit only + * increases bb_counters[0]. Sets bb_first_free and bb_fragments, and if the + * number of free bits counted differs from grp->bb_free it reports the + * mismatch through ext4_grp_locked_error() and overwrites bb_free with the + * counted value. Finally refreshes bb_largest_free_order, clears + * EXT4_GROUP_INFO_NEED_INIT_BIT and adds the elapsed get_cycles() delta to + * the superblock statistics under s_bal_lock. + */ +static noinline_for_stack +void ext4_mb_generate_buddy(struct super_block *sb, + void *buddy, void *bitmap, ext4_group_t group) +{ + struct ext4_group_info *grp = ext4_get_group_info(sb, group); + ext4_grpblk_t max = EXT4_CLUSTERS_PER_GROUP(sb); + ext4_grpblk_t i = 0; + ext4_grpblk_t first; + ext4_grpblk_t len; + unsigned free = 0; + unsigned fragments = 0; + unsigned long long period = get_cycles(); + + /* initialize buddy from bitmap, which is the aggregation + * of on-disk bitmap and preallocations */ + i = mb_find_next_zero_bit(bitmap, max, 0); + grp->bb_first_free = i; + while (i < max) { + fragments++; + first = i; + i = mb_find_next_bit(bitmap, max, i); + len = i - first; + free += len; + if (len > 1) + ext4_mb_mark_free_simple(sb, buddy, first, len, grp); + else + grp->bb_counters[0]++; + if (i < max) + i = mb_find_next_zero_bit(bitmap, max, i); + } + grp->bb_fragments = fragments; + + if (free != grp->bb_free) { + ext4_grp_locked_error(sb, group, 0, 0, + "%u clusters in bitmap, %u in gd", + free, grp->bb_free); + /* + * If we intend to continue, we consider the group descriptor + * corrupt and update bb_free using the bitmap value + */ + grp->bb_free = free; + } + mb_set_largest_free_order(sb, grp); + /* buddy data for this group has been generated */ + clear_bit(EXT4_GROUP_INFO_NEED_INIT_BIT, &(grp->bb_state)); + /* account this generation and the cycles it took */ + period = get_cycles() - period; + spin_lock(&EXT4_SB(sb)->s_bal_lock); + EXT4_SB(sb)->s_mb_buddies_generated++; + EXT4_SB(sb)->s_mb_generation_time += period; + spin_unlock(&EXT4_SB(sb)->s_bal_lock); +} + +/* The buddy information is attached to the buddy cache inode + * for convenience. The information regarding each group + * is loaded via ext4_mb_load_buddy. 
The information involve + * block bitmap and buddy information. The information are + * stored in the inode as + * + * { page } + * [ group 0 bitmap][ group 0 buddy] [group 1][ group 1]... + * + * + * one block each for bitmap and buddy information. + * So for each group we take up 2 blocks. A page can + * contain blocks_per_page (PAGE_CACHE_SIZE / blocksize) blocks. + * So it can have information regarding groups_per_page which + * is blocks_per_page/2 + * + * Locking note: This routine takes the block group lock of all groups + * for this page; do not hold this lock when calling this routine! + */ + +static int ext4_mb_init_cache(struct page *page, char *incore) +{ + ext4_group_t ngroups; + int blocksize; + int blocks_per_page; + int groups_per_page; + int err = 0; + int i; + ext4_group_t first_group, group; + int first_block; + struct super_block *sb; + struct buffer_head *bhs; + struct buffer_head **bh; + struct inode *inode; + char *data; + char *bitmap; + struct ext4_group_info *grinfo; + + mb_debug(1, "init page %lu\n", page->index); + + inode = page->mapping->host; + sb = inode->i_sb; + ngroups = ext4_get_groups_count(sb); + blocksize = 1 << inode->i_blkbits; + blocks_per_page = PAGE_CACHE_SIZE / blocksize; + + groups_per_page = blocks_per_page >> 1; + if (groups_per_page == 0) + groups_per_page = 1; + + /* allocate buffer_heads to read bitmaps */ + if (groups_per_page > 1) { + i = sizeof(struct buffer_head *) * groups_per_page; + bh = kzalloc(i, GFP_NOFS); + if (bh == NULL) { + err = -ENOMEM; + goto out; + } + } else + bh = &bhs; + + first_group = page->index * blocks_per_page / 2; + + /* read all groups the page covers into the cache */ + for (i = 0, group = first_group; i < groups_per_page; i++, group++) { + if (group >= ngroups) + break; + + grinfo = ext4_get_group_info(sb, group); + /* + * If page is uptodate then we came here after online resize + * which added some new uninitialized group info structs, so + * we must skip all initialized uptodate 
buddies on the page, + * which may be currently in use by an allocating task. + */ + if (PageUptodate(page) && !EXT4_MB_GRP_NEED_INIT(grinfo)) { + bh[i] = NULL; + continue; + } + if (!(bh[i] = ext4_read_block_bitmap_nowait(sb, group))) { + err = -ENOMEM; + goto out; + } + mb_debug(1, "read bitmap for group %u\n", group); + } + + /* wait for I/O completion */ + for (i = 0, group = first_group; i < groups_per_page; i++, group++) { + if (bh[i] && ext4_wait_block_bitmap(sb, group, bh[i])) { + err = -EIO; + goto out; + } + } + + first_block = page->index * blocks_per_page; + for (i = 0; i < blocks_per_page; i++) { + int group; + + group = (first_block + i) >> 1; + if (group >= ngroups) + break; + + if (!bh[group - first_group]) + /* skip initialized uptodate buddy */ + continue; + + /* + * data carry information regarding this + * particular group in the format specified + * above + * + */ + data = page_address(page) + (i * blocksize); + bitmap = bh[group - first_group]->b_data; + + /* + * We place the buddy block and bitmap block + * close together + */ + if ((first_block + i) & 1) { + /* this is block of buddy */ + BUG_ON(incore == NULL); + mb_debug(1, "put buddy for group %u in page %lu/%x\n", + group, page->index, i * blocksize); + trace_ext4_mb_buddy_bitmap_load(sb, group); + grinfo = ext4_get_group_info(sb, group); + grinfo->bb_fragments = 0; + memset(grinfo->bb_counters, 0, + sizeof(*grinfo->bb_counters) * + (sb->s_blocksize_bits+2)); + /* + * incore got set to the group block bitmap below + */ + ext4_lock_group(sb, group); + /* init the buddy */ + memset(data, 0xff, blocksize); + ext4_mb_generate_buddy(sb, data, incore, group); + ext4_unlock_group(sb, group); + incore = NULL; + } else { + /* this is block of bitmap */ + BUG_ON(incore != NULL); + mb_debug(1, "put bitmap for group %u in page %lu/%x\n", + group, page->index, i * blocksize); + trace_ext4_mb_bitmap_load(sb, group); + + /* see comments in ext4_mb_put_pa() */ + ext4_lock_group(sb, group); + 
memcpy(data, bitmap, blocksize); + + /* mark all preallocated blks used in in-core bitmap */ + ext4_mb_generate_from_pa(sb, data, group); + ext4_mb_generate_from_freelist(sb, data, group); + ext4_unlock_group(sb, group); + + /* set incore so that the buddy information can be + * generated using this + */ + incore = data; + } + } + SetPageUptodate(page); + +out: + if (bh) { + for (i = 0; i < groups_per_page; i++) + brelse(bh[i]); + if (bh != &bhs) + kfree(bh); + } + return err; +} + +/* + * Lock the buddy and bitmap pages. This make sure other parallel init_group + * on the same buddy page doesn't happen whild holding the buddy page lock. + * Return locked buddy and bitmap pages on e4b struct. If buddy and bitmap + * are on the same page e4b->bd_buddy_page is NULL and return value is 0. + */ +static int ext4_mb_get_buddy_page_lock(struct super_block *sb, + ext4_group_t group, struct ext4_buddy *e4b) +{ + struct inode *inode = EXT4_SB(sb)->s_buddy_cache; + int block, pnum, poff; + int blocks_per_page; + struct page *page; + + e4b->bd_buddy_page = NULL; + e4b->bd_bitmap_page = NULL; + + blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize; + /* + * the buddy cache inode stores the block bitmap + * and buddy information in consecutive blocks. + * So for each group we need two blocks. 
+ */ + block = group * 2; + pnum = block / blocks_per_page; + poff = block % blocks_per_page; + page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS); + if (!page) + return -EIO; + BUG_ON(page->mapping != inode->i_mapping); + e4b->bd_bitmap_page = page; + e4b->bd_bitmap = page_address(page) + (poff * sb->s_blocksize); + + if (blocks_per_page >= 2) { + /* buddy and bitmap are on the same page */ + return 0; + } + + block++; + pnum = block / blocks_per_page; + poff = block % blocks_per_page; + page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS); + if (!page) + return -EIO; + BUG_ON(page->mapping != inode->i_mapping); + e4b->bd_buddy_page = page; + return 0; +} + +static void ext4_mb_put_buddy_page_lock(struct ext4_buddy *e4b) +{ + if (e4b->bd_bitmap_page) { + unlock_page(e4b->bd_bitmap_page); + page_cache_release(e4b->bd_bitmap_page); + } + if (e4b->bd_buddy_page) { + unlock_page(e4b->bd_buddy_page); + page_cache_release(e4b->bd_buddy_page); + } +} + +/* + * Locking note: This routine calls ext4_mb_init_cache(), which takes the + * block group lock of all groups for this page; do not hold the BG lock when + * calling this routine! + */ +static noinline_for_stack +int ext4_mb_init_group(struct super_block *sb, ext4_group_t group) +{ + + struct ext4_group_info *this_grp; + struct ext4_buddy e4b; + struct page *page; + int ret = 0; + + mb_debug(1, "init group %u\n", group); + this_grp = ext4_get_group_info(sb, group); + /* + * This ensures that we don't reinit the buddy cache + * page which map to the group from which we are already + * allocating. If we are looking at the buddy cache we would + * have taken a reference using ext4_mb_load_buddy and that + * would have pinned buddy page to page cache. 
+ */ + ret = ext4_mb_get_buddy_page_lock(sb, group, &e4b); + if (ret || !EXT4_MB_GRP_NEED_INIT(this_grp)) { + /* + * somebody initialized the group + * return without doing anything + */ + goto err; + } + + page = e4b.bd_bitmap_page; + ret = ext4_mb_init_cache(page, NULL); + if (ret) + goto err; + if (!PageUptodate(page)) { + ret = -EIO; + goto err; + } + mark_page_accessed(page); + + if (e4b.bd_buddy_page == NULL) { + /* + * If both the bitmap and buddy are in + * the same page we don't need to force + * init the buddy + */ + ret = 0; + goto err; + } + /* init buddy cache */ + page = e4b.bd_buddy_page; + ret = ext4_mb_init_cache(page, e4b.bd_bitmap); + if (ret) + goto err; + if (!PageUptodate(page)) { + ret = -EIO; + goto err; + } + mark_page_accessed(page); +err: + ext4_mb_put_buddy_page_lock(&e4b); + return ret; +} + +/* + * Locking note: This routine calls ext4_mb_init_cache(), which takes the + * block group lock of all groups for this page; do not hold the BG lock when + * calling this routine! + */ +static noinline_for_stack int +ext4_mb_load_buddy(struct super_block *sb, ext4_group_t group, + struct ext4_buddy *e4b) +{ + int blocks_per_page; + int block; + int pnum; + int poff; + struct page *page; + int ret; + struct ext4_group_info *grp; + struct ext4_sb_info *sbi = EXT4_SB(sb); + struct inode *inode = sbi->s_buddy_cache; + + mb_debug(1, "load group %u\n", group); + + blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize; + grp = ext4_get_group_info(sb, group); + + e4b->bd_blkbits = sb->s_blocksize_bits; + e4b->bd_info = grp; + e4b->bd_sb = sb; + e4b->bd_group = group; + e4b->bd_buddy_page = NULL; + e4b->bd_bitmap_page = NULL; + + if (unlikely(EXT4_MB_GRP_NEED_INIT(grp))) { + /* + * we need full data about the group + * to make a good selection + */ + ret = ext4_mb_init_group(sb, group); + if (ret) + return ret; + } + + /* + * the buddy cache inode stores the block bitmap + * and buddy information in consecutive blocks. 
+ * So for each group we need two blocks. + */ + block = group * 2; + pnum = block / blocks_per_page; + poff = block % blocks_per_page; + + /* we could use find_or_create_page(), but it locks page + * what we'd like to avoid in fast path ... */ + page = find_get_page(inode->i_mapping, pnum); + if (page == NULL || !PageUptodate(page)) { + if (page) + /* + * drop the page reference and try + * to get the page with lock. If we + * are not uptodate that implies + * somebody just created the page but + * is yet to initialize the same. So + * wait for it to initialize. + */ + page_cache_release(page); + page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS); + if (page) { + BUG_ON(page->mapping != inode->i_mapping); + if (!PageUptodate(page)) { + ret = ext4_mb_init_cache(page, NULL); + if (ret) { + unlock_page(page); + goto err; + } + mb_cmp_bitmaps(e4b, page_address(page) + + (poff * sb->s_blocksize)); + } + unlock_page(page); + } + } + if (page == NULL || !PageUptodate(page)) { + ret = -EIO; + goto err; + } + e4b->bd_bitmap_page = page; + e4b->bd_bitmap = page_address(page) + (poff * sb->s_blocksize); + mark_page_accessed(page); + + block++; + pnum = block / blocks_per_page; + poff = block % blocks_per_page; + + page = find_get_page(inode->i_mapping, pnum); + if (page == NULL || !PageUptodate(page)) { + if (page) + page_cache_release(page); + page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS); + if (page) { + BUG_ON(page->mapping != inode->i_mapping); + if (!PageUptodate(page)) { + ret = ext4_mb_init_cache(page, e4b->bd_bitmap); + if (ret) { + unlock_page(page); + goto err; + } + } + unlock_page(page); + } + } + if (page == NULL || !PageUptodate(page)) { + ret = -EIO; + goto err; + } + e4b->bd_buddy_page = page; + e4b->bd_buddy = page_address(page) + (poff * sb->s_blocksize); + mark_page_accessed(page); + + BUG_ON(e4b->bd_bitmap_page == NULL); + BUG_ON(e4b->bd_buddy_page == NULL); + + return 0; + +err: + if (page) + page_cache_release(page); + if 
(e4b->bd_bitmap_page) + page_cache_release(e4b->bd_bitmap_page); + if (e4b->bd_buddy_page) + page_cache_release(e4b->bd_buddy_page); + e4b->bd_buddy = NULL; + e4b->bd_bitmap = NULL; + return ret; +} + +static void ext4_mb_unload_buddy(struct ext4_buddy *e4b) +{ + if (e4b->bd_bitmap_page) + page_cache_release(e4b->bd_bitmap_page); + if (e4b->bd_buddy_page) + page_cache_release(e4b->bd_buddy_page); +} + + +static int mb_find_order_for_block(struct ext4_buddy *e4b, int block) +{ + int order = 1; + void *bb; + + BUG_ON(e4b->bd_bitmap == e4b->bd_buddy); + BUG_ON(block >= (1 << (e4b->bd_blkbits + 3))); + + bb = e4b->bd_buddy; + while (order <= e4b->bd_blkbits + 1) { + block = block >> 1; + if (!mb_test_bit(block, bb)) { + /* this block is part of buddy of order 'order' */ + return order; + } + bb += 1 << (e4b->bd_blkbits - order); + order++; + } + return 0; +} + +static void mb_clear_bits(void *bm, int cur, int len) +{ + __u32 *addr; + + len = cur + len; + while (cur < len) { + if ((cur & 31) == 0 && (len - cur) >= 32) { + /* fast path: clear whole word at once */ + addr = bm + (cur >> 3); + *addr = 0; + cur += 32; + continue; + } + mb_clear_bit(cur, bm); + cur++; + } +} + +void ext4_set_bits(void *bm, int cur, int len) +{ + __u32 *addr; + + len = cur + len; + while (cur < len) { + if ((cur & 31) == 0 && (len - cur) >= 32) { + /* fast path: set whole word at once */ + addr = bm + (cur >> 3); + *addr = 0xffffffff; + cur += 32; + continue; + } + mb_set_bit(cur, bm); + cur++; + } +} + +static void mb_free_blocks(struct inode *inode, struct ext4_buddy *e4b, + int first, int count) +{ + int block = 0; + int max = 0; + int order; + void *buddy; + void *buddy2; + struct super_block *sb = e4b->bd_sb; + + BUG_ON(first + count > (sb->s_blocksize << 3)); + assert_spin_locked(ext4_group_lock_ptr(sb, e4b->bd_group)); + mb_check_buddy(e4b); + mb_free_blocks_double(inode, e4b, first, count); + + e4b->bd_info->bb_free += count; + if (first < e4b->bd_info->bb_first_free) + 
/*
 * Describe the free extent that starts at @block.
 *
 * @e4b:    loaded buddy of the group; its group lock must be held
 * @order:  order of the buddy bitmap in which @block is tested; when 0 the
 *          actual order is looked up with mb_find_order_for_block()
 * @block:  bit number within the order-@order bitmap
 * @needed: keep appending following free chunks while the extent is shorter
 *          than this
 * @ex:     filled in with the extent that was found
 *
 * Returns ex->fe_len.  When the bit for @block is set in the tested bitmap,
 * @ex is zeroed and 0 is returned.
 */
static int mb_find_extent(struct ext4_buddy *e4b, int order, int block,
				int needed, struct ext4_free_extent *ex)
{
	/* remembers the requested start; reused below as a scratch bit number */
	int next = block;
	int max;
	void *buddy;

	assert_spin_locked(ext4_group_lock_ptr(e4b->bd_sb, e4b->bd_group));
	BUG_ON(ex == NULL);

	buddy = mb_find_buddy(e4b, order, &max);
	BUG_ON(buddy == NULL);
	BUG_ON(block >= max);
	if (mb_test_bit(block, buddy)) {
		/* bit is set: report an empty extent */
		ex->fe_len = 0;
		ex->fe_start = 0;
		ex->fe_group = 0;
		return 0;
	}

	/* FIXME drop order completely ? */
	if (likely(order == 0)) {
		/* find actual order */
		order = mb_find_order_for_block(e4b, block);
		block = block >> order;
	}

	/* start from the whole chunk of that order ... */
	ex->fe_len = 1 << order;
	ex->fe_start = block << order;
	ex->fe_group = e4b->bd_group;

	/* calc difference from given start */
	/* ... and trim the part of the chunk lying before the requested start */
	next = next - ex->fe_start;
	ex->fe_len -= next;
	ex->fe_start += next;

	while (needed > ex->fe_len &&
	       (buddy = mb_find_buddy(e4b, order, &max))) {

		/* no chunk follows at this order */
		if (block + 1 >= max)
			break;

		/* first order-0 bit of the chunk that follows the current one */
		next = (block + 1) * (1 << order);
		if (mb_test_bit(next, e4b->bd_bitmap))
			break;

		/* append the following chunk, whatever its order is */
		order = mb_find_order_for_block(e4b, next);

		block = next >> order;
		ex->fe_len += 1 << order;
	}

	BUG_ON(ex->fe_start + ex->fe_len > (1 << (e4b->bd_blkbits + 3)));
	return ex->fe_len;
}
*/ + mlen = 1 << ord; + buddy = mb_find_buddy(e4b, ord, &max); + BUG_ON((start >> ord) >= max); + mb_set_bit(start >> ord, buddy); + e4b->bd_info->bb_counters[ord]--; + start += mlen; + len -= mlen; + BUG_ON(len < 0); + continue; + } + + /* store for history */ + if (ret == 0) + ret = len | (ord << 16); + + /* we have to split large buddy */ + BUG_ON(ord <= 0); + buddy = mb_find_buddy(e4b, ord, &max); + mb_set_bit(start >> ord, buddy); + e4b->bd_info->bb_counters[ord]--; + + ord--; + cur = (start >> ord) & ~1U; + buddy = mb_find_buddy(e4b, ord, &max); + mb_clear_bit(cur, buddy); + mb_clear_bit(cur + 1, buddy); + e4b->bd_info->bb_counters[ord]++; + e4b->bd_info->bb_counters[ord]++; + } + mb_set_largest_free_order(e4b->bd_sb, e4b->bd_info); + + ext4_set_bits(e4b->bd_bitmap, ex->fe_start, len0); + mb_check_buddy(e4b); + + return ret; +} + +/* + * Must be called under group lock! + */ +static void ext4_mb_use_best_found(struct ext4_allocation_context *ac, + struct ext4_buddy *e4b) +{ + struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb); + int ret; + + BUG_ON(ac->ac_b_ex.fe_group != e4b->bd_group); + BUG_ON(ac->ac_status == AC_STATUS_FOUND); + + ac->ac_b_ex.fe_len = min(ac->ac_b_ex.fe_len, ac->ac_g_ex.fe_len); + ac->ac_b_ex.fe_logical = ac->ac_g_ex.fe_logical; + ret = mb_mark_used(e4b, &ac->ac_b_ex); + + /* preallocation can change ac_b_ex, thus we store actually + * allocated blocks for history */ + ac->ac_f_ex = ac->ac_b_ex; + + ac->ac_status = AC_STATUS_FOUND; + ac->ac_tail = ret & 0xffff; + ac->ac_buddy = ret >> 16; + + /* + * take the page reference. We want the page to be pinned + * so that we don't get a ext4_mb_init_cache_call for this + * group until we update the bitmap. That would mean we + * double allocate blocks. 
The reference is dropped + * in ext4_mb_release_context + */ + ac->ac_bitmap_page = e4b->bd_bitmap_page; + get_page(ac->ac_bitmap_page); + ac->ac_buddy_page = e4b->bd_buddy_page; + get_page(ac->ac_buddy_page); + /* store last allocated for subsequent stream allocation */ + if (ac->ac_flags & EXT4_MB_STREAM_ALLOC) { + spin_lock(&sbi->s_md_lock); + sbi->s_mb_last_group = ac->ac_f_ex.fe_group; + sbi->s_mb_last_start = ac->ac_f_ex.fe_start; + spin_unlock(&sbi->s_md_lock); + } +} + +/* + * regular allocator, for general purposes allocation + */ + +static void ext4_mb_check_limits(struct ext4_allocation_context *ac, + struct ext4_buddy *e4b, + int finish_group) +{ + struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb); + struct ext4_free_extent *bex = &ac->ac_b_ex; + struct ext4_free_extent *gex = &ac->ac_g_ex; + struct ext4_free_extent ex; + int max; + + if (ac->ac_status == AC_STATUS_FOUND) + return; + /* + * We don't want to scan for a whole year + */ + if (ac->ac_found > sbi->s_mb_max_to_scan && + !(ac->ac_flags & EXT4_MB_HINT_FIRST)) { + ac->ac_status = AC_STATUS_BREAK; + return; + } + + /* + * Haven't found good chunk so far, let's continue + */ + if (bex->fe_len < gex->fe_len) + return; + + if ((finish_group || ac->ac_found > sbi->s_mb_min_to_scan) + && bex->fe_group == e4b->bd_group) { + /* recheck chunk's availability - we don't know + * when it was found (within this lock-unlock + * period or not) */ + max = mb_find_extent(e4b, 0, bex->fe_start, gex->fe_len, &ex); + if (max >= gex->fe_len) { + ext4_mb_use_best_found(ac, e4b); + return; + } + } +} + +/* + * The routine checks whether found extent is good enough. If it is, + * then the extent gets marked used and flag is set to the context + * to stop scanning. Otherwise, the extent is compared with the + * previous found extent and if new one is better, then it's stored + * in the context. Later, the best found extent will be used, if + * mballoc can't find good enough extent. 
+ * + * FIXME: real allocation policy is to be designed yet! + */ +static void ext4_mb_measure_extent(struct ext4_allocation_context *ac, + struct ext4_free_extent *ex, + struct ext4_buddy *e4b) +{ + struct ext4_free_extent *bex = &ac->ac_b_ex; + struct ext4_free_extent *gex = &ac->ac_g_ex; + + BUG_ON(ex->fe_len <= 0); + BUG_ON(ex->fe_len > EXT4_CLUSTERS_PER_GROUP(ac->ac_sb)); + BUG_ON(ex->fe_start >= EXT4_CLUSTERS_PER_GROUP(ac->ac_sb)); + BUG_ON(ac->ac_status != AC_STATUS_CONTINUE); + + ac->ac_found++; + + /* + * The special case - take what you catch first + */ + if (unlikely(ac->ac_flags & EXT4_MB_HINT_FIRST)) { + *bex = *ex; + ext4_mb_use_best_found(ac, e4b); + return; + } + + /* + * Let's check whether the chuck is good enough + */ + if (ex->fe_len == gex->fe_len) { + *bex = *ex; + ext4_mb_use_best_found(ac, e4b); + return; + } + + /* + * If this is first found extent, just store it in the context + */ + if (bex->fe_len == 0) { + *bex = *ex; + return; + } + + /* + * If new found extent is better, store it in the context + */ + if (bex->fe_len < gex->fe_len) { + /* if the request isn't satisfied, any found extent + * larger than previous best one is better */ + if (ex->fe_len > bex->fe_len) + *bex = *ex; + } else if (ex->fe_len > gex->fe_len) { + /* if the request is satisfied, then we try to find + * an extent that still satisfy the request, but is + * smaller than previous one */ + if (ex->fe_len < bex->fe_len) + *bex = *ex; + } + + ext4_mb_check_limits(ac, e4b, 0); +} + +static noinline_for_stack +int ext4_mb_try_best_found(struct ext4_allocation_context *ac, + struct ext4_buddy *e4b) +{ + struct ext4_free_extent ex = ac->ac_b_ex; + ext4_group_t group = ex.fe_group; + int max; + int err; + + BUG_ON(ex.fe_len <= 0); + err = ext4_mb_load_buddy(ac->ac_sb, group, e4b); + if (err) + return err; + + ext4_lock_group(ac->ac_sb, group); + max = mb_find_extent(e4b, 0, ex.fe_start, ex.fe_len, &ex); + + if (max > 0) { + ac->ac_b_ex = ex; + ext4_mb_use_best_found(ac, 
e4b); + } + + ext4_unlock_group(ac->ac_sb, group); + ext4_mb_unload_buddy(e4b); + + return 0; +} + +static noinline_for_stack +int ext4_mb_find_by_goal(struct ext4_allocation_context *ac, + struct ext4_buddy *e4b) +{ + ext4_group_t group = ac->ac_g_ex.fe_group; + int max; + int err; + struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb); + struct ext4_free_extent ex; + + if (!(ac->ac_flags & EXT4_MB_HINT_TRY_GOAL)) + return 0; + + err = ext4_mb_load_buddy(ac->ac_sb, group, e4b); + if (err) + return err; + + ext4_lock_group(ac->ac_sb, group); + max = mb_find_extent(e4b, 0, ac->ac_g_ex.fe_start, + ac->ac_g_ex.fe_len, &ex); + + if (max >= ac->ac_g_ex.fe_len && ac->ac_g_ex.fe_len == sbi->s_stripe) { + ext4_fsblk_t start; + + start = ext4_group_first_block_no(ac->ac_sb, e4b->bd_group) + + ex.fe_start; + /* use do_div to get remainder (would be 64-bit modulo) */ + if (do_div(start, sbi->s_stripe) == 0) { + ac->ac_found++; + ac->ac_b_ex = ex; + ext4_mb_use_best_found(ac, e4b); + } + } else if (max >= ac->ac_g_ex.fe_len) { + BUG_ON(ex.fe_len <= 0); + BUG_ON(ex.fe_group != ac->ac_g_ex.fe_group); + BUG_ON(ex.fe_start != ac->ac_g_ex.fe_start); + ac->ac_found++; + ac->ac_b_ex = ex; + ext4_mb_use_best_found(ac, e4b); + } else if (max > 0 && (ac->ac_flags & EXT4_MB_HINT_MERGE)) { + /* Sometimes, caller may want to merge even small + * number of blocks to an existing extent */ + BUG_ON(ex.fe_len <= 0); + BUG_ON(ex.fe_group != ac->ac_g_ex.fe_group); + BUG_ON(ex.fe_start != ac->ac_g_ex.fe_start); + ac->ac_found++; + ac->ac_b_ex = ex; + ext4_mb_use_best_found(ac, e4b); + } + ext4_unlock_group(ac->ac_sb, group); + ext4_mb_unload_buddy(e4b); + + return 0; +} + +/* + * The routine scans buddy structures (not bitmap!) 
from given order + * to max order and tries to find big enough chunk to satisfy the req + */ +static noinline_for_stack +void ext4_mb_simple_scan_group(struct ext4_allocation_context *ac, + struct ext4_buddy *e4b) +{ + struct super_block *sb = ac->ac_sb; + struct ext4_group_info *grp = e4b->bd_info; + void *buddy; + int i; + int k; + int max; + + BUG_ON(ac->ac_2order <= 0); + for (i = ac->ac_2order; i <= sb->s_blocksize_bits + 1; i++) { + if (grp->bb_counters[i] == 0) + continue; + + buddy = mb_find_buddy(e4b, i, &max); + BUG_ON(buddy == NULL); + + k = mb_find_next_zero_bit(buddy, max, 0); + BUG_ON(k >= max); + + ac->ac_found++; + + ac->ac_b_ex.fe_len = 1 << i; + ac->ac_b_ex.fe_start = k << i; + ac->ac_b_ex.fe_group = e4b->bd_group; + + ext4_mb_use_best_found(ac, e4b); + + BUG_ON(ac->ac_b_ex.fe_len != ac->ac_g_ex.fe_len); + + if (EXT4_SB(sb)->s_mb_stats) + atomic_inc(&EXT4_SB(sb)->s_bal_2orders); + + break; + } +} + +/* + * The routine scans the group and measures all found extents. + * In order to optimize scanning, caller must pass number of + * free blocks in the group, so the routine can know upper limit. + */ +static noinline_for_stack +void ext4_mb_complex_scan_group(struct ext4_allocation_context *ac, + struct ext4_buddy *e4b) +{ + struct super_block *sb = ac->ac_sb; + void *bitmap = e4b->bd_bitmap; + struct ext4_free_extent ex; + int i; + int free; + + free = e4b->bd_info->bb_free; + BUG_ON(free <= 0); + + i = e4b->bd_info->bb_first_free; + + while (free && ac->ac_status == AC_STATUS_CONTINUE) { + i = mb_find_next_zero_bit(bitmap, + EXT4_CLUSTERS_PER_GROUP(sb), i); + if (i >= EXT4_CLUSTERS_PER_GROUP(sb)) { + /* + * IF we have corrupt bitmap, we won't find any + * free blocks even though group info says we + * we have free blocks + */ + ext4_grp_locked_error(sb, e4b->bd_group, 0, 0, + "%d free clusters as per " + "group info. 
But bitmap says 0", + free); + break; + } + + mb_find_extent(e4b, 0, i, ac->ac_g_ex.fe_len, &ex); + BUG_ON(ex.fe_len <= 0); + if (free < ex.fe_len) { + ext4_grp_locked_error(sb, e4b->bd_group, 0, 0, + "%d free clusters as per " + "group info. But got %d blocks", + free, ex.fe_len); + /* + * The number of free blocks differs. This mostly + * indicate that the bitmap is corrupt. So exit + * without claiming the space. + */ + break; + } + + ext4_mb_measure_extent(ac, &ex, e4b); + + i += ex.fe_len; + free -= ex.fe_len; + } + + ext4_mb_check_limits(ac, e4b, 1); +} + +/* + * This is a special case for storages like raid5 + * we try to find stripe-aligned chunks for stripe-size-multiple requests + */ +static noinline_for_stack +void ext4_mb_scan_aligned(struct ext4_allocation_context *ac, + struct ext4_buddy *e4b) +{ + struct super_block *sb = ac->ac_sb; + struct ext4_sb_info *sbi = EXT4_SB(sb); + void *bitmap = e4b->bd_bitmap; + struct ext4_free_extent ex; + ext4_fsblk_t first_group_block; + ext4_fsblk_t a; + ext4_grpblk_t i; + int max; + + BUG_ON(sbi->s_stripe == 0); + + /* find first stripe-aligned block in group */ + first_group_block = ext4_group_first_block_no(sb, e4b->bd_group); + + a = first_group_block + sbi->s_stripe - 1; + do_div(a, sbi->s_stripe); + i = (a * sbi->s_stripe) - first_group_block; + + while (i < EXT4_CLUSTERS_PER_GROUP(sb)) { + if (!mb_test_bit(i, bitmap)) { + max = mb_find_extent(e4b, 0, i, sbi->s_stripe, &ex); + if (max >= sbi->s_stripe) { + ac->ac_found++; + ac->ac_b_ex = ex; + ext4_mb_use_best_found(ac, e4b); + break; + } + } + i += sbi->s_stripe; + } +} + +/* This is now called BEFORE we load the buddy bitmap. 
*/ +static int ext4_mb_good_group(struct ext4_allocation_context *ac, + ext4_group_t group, int cr) +{ + unsigned free, fragments; + int flex_size = ext4_flex_bg_size(EXT4_SB(ac->ac_sb)); + struct ext4_group_info *grp = ext4_get_group_info(ac->ac_sb, group); + + BUG_ON(cr < 0 || cr >= 4); + + /* We only do this if the grp has never been initialized */ + if (unlikely(EXT4_MB_GRP_NEED_INIT(grp))) { + int ret = ext4_mb_init_group(ac->ac_sb, group); + if (ret) + return 0; + } + + free = grp->bb_free; + fragments = grp->bb_fragments; + if (free == 0) + return 0; + if (fragments == 0) + return 0; + + switch (cr) { + case 0: + BUG_ON(ac->ac_2order == 0); + + if (grp->bb_largest_free_order < ac->ac_2order) + return 0; + + /* Avoid using the first bg of a flexgroup for data files */ + if ((ac->ac_flags & EXT4_MB_HINT_DATA) && + (flex_size >= EXT4_FLEX_SIZE_DIR_ALLOC_SCHEME) && + ((group % flex_size) == 0)) + return 0; + + return 1; + case 1: + if ((free / fragments) >= ac->ac_g_ex.fe_len) + return 1; + break; + case 2: + if (free >= ac->ac_g_ex.fe_len) + return 1; + break; + case 3: + return 1; + default: + BUG(); + } + + return 0; +} + +static noinline_for_stack int +ext4_mb_regular_allocator(struct ext4_allocation_context *ac) +{ + ext4_group_t ngroups, group, i; + int cr; + int err = 0; + struct ext4_sb_info *sbi; + struct super_block *sb; + struct ext4_buddy e4b; + + sb = ac->ac_sb; + sbi = EXT4_SB(sb); + ngroups = ext4_get_groups_count(sb); + /* non-extent files are limited to low blocks/groups */ + if (!(ext4_test_inode_flag(ac->ac_inode, EXT4_INODE_EXTENTS))) + ngroups = sbi->s_blockfile_groups; + + BUG_ON(ac->ac_status == AC_STATUS_FOUND); + + /* first, try the goal */ + err = ext4_mb_find_by_goal(ac, &e4b); + if (err || ac->ac_status == AC_STATUS_FOUND) + goto out; + + if (unlikely(ac->ac_flags & EXT4_MB_HINT_GOAL_ONLY)) + goto out; + + /* + * ac->ac2_order is set only if the fe_len is a power of 2 + * if ac2_order is set we also set criteria to 0 so that we + * 
try exact allocation using buddy. + */ + i = fls(ac->ac_g_ex.fe_len); + ac->ac_2order = 0; + /* + * We search using buddy data only if the order of the request + * is greater than equal to the sbi_s_mb_order2_reqs + * You can tune it via /sys/fs/ext4//mb_order2_req + */ + if (i >= sbi->s_mb_order2_reqs) { + /* + * This should tell if fe_len is exactly power of 2 + */ + if ((ac->ac_g_ex.fe_len & (~(1 << (i - 1)))) == 0) + ac->ac_2order = i - 1; + } + + /* if stream allocation is enabled, use global goal */ + if (ac->ac_flags & EXT4_MB_STREAM_ALLOC) { + /* TBD: may be hot point */ + spin_lock(&sbi->s_md_lock); + ac->ac_g_ex.fe_group = sbi->s_mb_last_group; + ac->ac_g_ex.fe_start = sbi->s_mb_last_start; + spin_unlock(&sbi->s_md_lock); + } + + /* Let's just scan groups to find more-less suitable blocks */ + cr = ac->ac_2order ? 0 : 1; + /* + * cr == 0 try to get exact allocation, + * cr == 3 try to get anything + */ +repeat: + for (; cr < 4 && ac->ac_status == AC_STATUS_CONTINUE; cr++) { + ac->ac_criteria = cr; + /* + * searching for the right group start + * from the goal value specified + */ + group = ac->ac_g_ex.fe_group; + + for (i = 0; i < ngroups; group++, i++) { + if (group == ngroups) + group = 0; + + /* This now checks without needing the buddy page */ + if (!ext4_mb_good_group(ac, group, cr)) + continue; + + err = ext4_mb_load_buddy(sb, group, &e4b); + if (err) + goto out; + + ext4_lock_group(sb, group); + + /* + * We need to check again after locking the + * block group + */ + if (!ext4_mb_good_group(ac, group, cr)) { + ext4_unlock_group(sb, group); + ext4_mb_unload_buddy(&e4b); + continue; + } + + ac->ac_groups_scanned++; + if (cr == 0) + ext4_mb_simple_scan_group(ac, &e4b); + else if (cr == 1 && sbi->s_stripe && + !(ac->ac_g_ex.fe_len % sbi->s_stripe)) + ext4_mb_scan_aligned(ac, &e4b); + else + ext4_mb_complex_scan_group(ac, &e4b); + + ext4_unlock_group(sb, group); + ext4_mb_unload_buddy(&e4b); + + if (ac->ac_status != AC_STATUS_CONTINUE) + break; + } + } 
+ + if (ac->ac_b_ex.fe_len > 0 && ac->ac_status != AC_STATUS_FOUND && + !(ac->ac_flags & EXT4_MB_HINT_FIRST)) { + /* + * We've been searching too long. Let's try to allocate + * the best chunk we've found so far + */ + + ext4_mb_try_best_found(ac, &e4b); + if (ac->ac_status != AC_STATUS_FOUND) { + /* + * Someone more lucky has already allocated it. + * The only thing we can do is just take first + * found block(s) + printk(KERN_DEBUG "EXT4-fs: someone won our chunk\n"); + */ + ac->ac_b_ex.fe_group = 0; + ac->ac_b_ex.fe_start = 0; + ac->ac_b_ex.fe_len = 0; + ac->ac_status = AC_STATUS_CONTINUE; + ac->ac_flags |= EXT4_MB_HINT_FIRST; + cr = 3; + atomic_inc(&sbi->s_mb_lost_chunks); + goto repeat; + } + } +out: + return err; +} + +static void *ext4_mb_seq_groups_start(struct seq_file *seq, loff_t *pos) +{ + struct super_block *sb = seq->private; + ext4_group_t group; + + if (*pos < 0 || *pos >= ext4_get_groups_count(sb)) + return NULL; + group = *pos + 1; + return (void *) ((unsigned long) group); +} + +static void *ext4_mb_seq_groups_next(struct seq_file *seq, void *v, loff_t *pos) +{ + struct super_block *sb = seq->private; + ext4_group_t group; + + ++*pos; + if (*pos < 0 || *pos >= ext4_get_groups_count(sb)) + return NULL; + group = *pos + 1; + return (void *) ((unsigned long) group); +} + +static int ext4_mb_seq_groups_show(struct seq_file *seq, void *v) +{ + struct super_block *sb = seq->private; + ext4_group_t group = (ext4_group_t) ((unsigned long) v); + int i; + int err; + struct ext4_buddy e4b; + struct sg { + struct ext4_group_info info; + ext4_grpblk_t counters[16]; + } sg; + + group--; + if (group == 0) + seq_printf(seq, "#%-5s: %-5s %-5s %-5s " + "[ %-5s %-5s %-5s %-5s %-5s %-5s %-5s " + "%-5s %-5s %-5s %-5s %-5s %-5s %-5s ]\n", + "group", "free", "frags", "first", + "2^0", "2^1", "2^2", "2^3", "2^4", "2^5", "2^6", + "2^7", "2^8", "2^9", "2^10", "2^11", "2^12", "2^13"); + + i = (sb->s_blocksize_bits + 2) * sizeof(sg.info.bb_counters[0]) + + sizeof(struct 
ext4_group_info); + err = ext4_mb_load_buddy(sb, group, &e4b); + if (err) { + seq_printf(seq, "#%-5u: I/O error\n", group); + return 0; + } + ext4_lock_group(sb, group); + memcpy(&sg, ext4_get_group_info(sb, group), i); + ext4_unlock_group(sb, group); + ext4_mb_unload_buddy(&e4b); + + seq_printf(seq, "#%-5u: %-5u %-5u %-5u [", group, sg.info.bb_free, + sg.info.bb_fragments, sg.info.bb_first_free); + for (i = 0; i <= 13; i++) + seq_printf(seq, " %-5u", i <= sb->s_blocksize_bits + 1 ? + sg.info.bb_counters[i] : 0); + seq_printf(seq, " ]\n"); + + return 0; +} + +static void ext4_mb_seq_groups_stop(struct seq_file *seq, void *v) +{ +} + +static const struct seq_operations ext4_mb_seq_groups_ops = { + .start = ext4_mb_seq_groups_start, + .next = ext4_mb_seq_groups_next, + .stop = ext4_mb_seq_groups_stop, + .show = ext4_mb_seq_groups_show, +}; + +static int ext4_mb_seq_groups_open(struct inode *inode, struct file *file) +{ + struct super_block *sb = PDE(inode)->data; + int rc; + + rc = seq_open(file, &ext4_mb_seq_groups_ops); + if (rc == 0) { + struct seq_file *m = file->private_data; + m->private = sb; + } + return rc; + +} + +static const struct file_operations ext4_mb_seq_groups_fops = { + .owner = THIS_MODULE, + .open = ext4_mb_seq_groups_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, +}; + +static struct kmem_cache *get_groupinfo_cache(int blocksize_bits) +{ + int cache_index = blocksize_bits - EXT4_MIN_BLOCK_LOG_SIZE; + struct kmem_cache *cachep = ext4_groupinfo_caches[cache_index]; + + BUG_ON(!cachep); + return cachep; +} + +/* Create and initialize ext4_group_info data for the given group. 
*/ +int ext4_mb_add_groupinfo(struct super_block *sb, ext4_group_t group, + struct ext4_group_desc *desc) +{ + int i; + int metalen = 0; + struct ext4_sb_info *sbi = EXT4_SB(sb); + struct ext4_group_info **meta_group_info; + struct kmem_cache *cachep = get_groupinfo_cache(sb->s_blocksize_bits); + + /* + * First check if this group is the first of a reserved block. + * If it's true, we have to allocate a new table of pointers + * to ext4_group_info structures + */ + if (group % EXT4_DESC_PER_BLOCK(sb) == 0) { + metalen = sizeof(*meta_group_info) << + EXT4_DESC_PER_BLOCK_BITS(sb); + meta_group_info = kmalloc(metalen, GFP_KERNEL); + if (meta_group_info == NULL) { + ext4_msg(sb, KERN_ERR, "can't allocate mem " + "for a buddy group"); + goto exit_meta_group_info; + } + sbi->s_group_info[group >> EXT4_DESC_PER_BLOCK_BITS(sb)] = + meta_group_info; + } + + meta_group_info = + sbi->s_group_info[group >> EXT4_DESC_PER_BLOCK_BITS(sb)]; + i = group & (EXT4_DESC_PER_BLOCK(sb) - 1); + + meta_group_info[i] = kmem_cache_alloc(cachep, GFP_KERNEL); + if (meta_group_info[i] == NULL) { + ext4_msg(sb, KERN_ERR, "can't allocate buddy mem"); + goto exit_group_info; + } + memset(meta_group_info[i], 0, kmem_cache_size(cachep)); + set_bit(EXT4_GROUP_INFO_NEED_INIT_BIT, + &(meta_group_info[i]->bb_state)); + + /* + * initialize bb_free to be able to skip + * empty groups without initialization + */ + if (desc->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) { + meta_group_info[i]->bb_free = + ext4_free_clusters_after_init(sb, group, desc); + } else { + meta_group_info[i]->bb_free = + ext4_free_group_clusters(sb, desc); + } + + INIT_LIST_HEAD(&meta_group_info[i]->bb_prealloc_list); + init_rwsem(&meta_group_info[i]->alloc_sem); + meta_group_info[i]->bb_free_root = RB_ROOT; + meta_group_info[i]->bb_largest_free_order = -1; /* uninit */ + +#ifdef DOUBLE_CHECK + { + struct buffer_head *bh; + meta_group_info[i]->bb_bitmap = + kmalloc(sb->s_blocksize, GFP_KERNEL); + 
BUG_ON(meta_group_info[i]->bb_bitmap == NULL); + bh = ext4_read_block_bitmap(sb, group); + BUG_ON(bh == NULL); + memcpy(meta_group_info[i]->bb_bitmap, bh->b_data, + sb->s_blocksize); + put_bh(bh); + } +#endif + + return 0; + +exit_group_info: + /* If a meta_group_info table has been allocated, release it now */ + if (group % EXT4_DESC_PER_BLOCK(sb) == 0) { + kfree(sbi->s_group_info[group >> EXT4_DESC_PER_BLOCK_BITS(sb)]); + sbi->s_group_info[group >> EXT4_DESC_PER_BLOCK_BITS(sb)] = NULL; + } +exit_meta_group_info: + return -ENOMEM; +} /* ext4_mb_add_groupinfo */ + +static int ext4_mb_init_backend(struct super_block *sb) +{ + ext4_group_t ngroups = ext4_get_groups_count(sb); + ext4_group_t i; + struct ext4_sb_info *sbi = EXT4_SB(sb); + struct ext4_super_block *es = sbi->s_es; + int num_meta_group_infos; + int num_meta_group_infos_max; + int array_size; + struct ext4_group_desc *desc; + struct kmem_cache *cachep; + + /* This is the number of blocks used by GDT */ + num_meta_group_infos = (ngroups + EXT4_DESC_PER_BLOCK(sb) - + 1) >> EXT4_DESC_PER_BLOCK_BITS(sb); + + /* + * This is the total number of blocks used by GDT including + * the number of reserved blocks for GDT. + * The s_group_info array is allocated with this value + * to allow a clean online resize without a complex + * manipulation of pointer. + * The drawback is the unused memory when no resize + * occurs but it's very low in terms of pages + * (see comments below) + * Need to handle this properly when META_BG resizing is allowed + */ + num_meta_group_infos_max = num_meta_group_infos + + le16_to_cpu(es->s_reserved_gdt_blocks); + + /* + * array_size is the size of s_group_info array. We round it + * to the next power of two because this approximation is done + * internally by kmalloc so we can have some more memory + * for free here (e.g. may be used for META_BG resize). 
+ */ + array_size = 1; + while (array_size < sizeof(*sbi->s_group_info) * + num_meta_group_infos_max) + array_size = array_size << 1; + /* An 8TB filesystem with 64-bit pointers requires a 4096 byte + * kmalloc. A 128kb malloc should suffice for a 256TB filesystem. + * So a two level scheme suffices for now. */ + sbi->s_group_info = ext4_kvzalloc(array_size, GFP_KERNEL); + if (sbi->s_group_info == NULL) { + ext4_msg(sb, KERN_ERR, "can't allocate buddy meta group"); + return -ENOMEM; + } + sbi->s_buddy_cache = new_inode(sb); + if (sbi->s_buddy_cache == NULL) { + ext4_msg(sb, KERN_ERR, "can't get new inode"); + goto err_freesgi; + } + /* To avoid potentially colliding with an valid on-disk inode number, + * use EXT4_BAD_INO for the buddy cache inode number. This inode is + * not in the inode hash, so it should never be found by iget(), but + * this will avoid confusion if it ever shows up during debugging. */ + sbi->s_buddy_cache->i_ino = EXT4_BAD_INO; + EXT4_I(sbi->s_buddy_cache)->i_disksize = 0; + for (i = 0; i < ngroups; i++) { + desc = ext4_get_group_desc(sb, i, NULL); + if (desc == NULL) { + ext4_msg(sb, KERN_ERR, "can't read descriptor %u", i); + goto err_freebuddy; + } + if (ext4_mb_add_groupinfo(sb, i, desc) != 0) + goto err_freebuddy; + } + + return 0; + +err_freebuddy: + cachep = get_groupinfo_cache(sb->s_blocksize_bits); + while (i-- > 0) + kmem_cache_free(cachep, ext4_get_group_info(sb, i)); + i = num_meta_group_infos; + while (i-- > 0) + kfree(sbi->s_group_info[i]); + iput(sbi->s_buddy_cache); +err_freesgi: + ext4_kvfree(sbi->s_group_info); + return -ENOMEM; +} + +static void ext4_groupinfo_destroy_slabs(void) +{ + int i; + + for (i = 0; i < NR_GRPINFO_CACHES; i++) { + if (ext4_groupinfo_caches[i]) + kmem_cache_destroy(ext4_groupinfo_caches[i]); + ext4_groupinfo_caches[i] = NULL; + } +} + +static int ext4_groupinfo_create_slab(size_t size) +{ + static DEFINE_MUTEX(ext4_grpinfo_slab_create_mutex); + int slab_size; + int blocksize_bits = 
order_base_2(size); + int cache_index = blocksize_bits - EXT4_MIN_BLOCK_LOG_SIZE; + struct kmem_cache *cachep; + + if (cache_index >= NR_GRPINFO_CACHES) + return -EINVAL; + + if (unlikely(cache_index < 0)) + cache_index = 0; + + mutex_lock(&ext4_grpinfo_slab_create_mutex); + if (ext4_groupinfo_caches[cache_index]) { + mutex_unlock(&ext4_grpinfo_slab_create_mutex); + return 0; /* Already created */ + } + + slab_size = offsetof(struct ext4_group_info, + bb_counters[blocksize_bits + 2]); + + cachep = kmem_cache_create(ext4_groupinfo_slab_names[cache_index], + slab_size, 0, SLAB_RECLAIM_ACCOUNT, + NULL); + + ext4_groupinfo_caches[cache_index] = cachep; + + mutex_unlock(&ext4_grpinfo_slab_create_mutex); + if (!cachep) { + printk(KERN_EMERG + "EXT4-fs: no memory for groupinfo slab cache\n"); + return -ENOMEM; + } + + return 0; +} + +int ext4_mb_init(struct super_block *sb, int needs_recovery) +{ + struct ext4_sb_info *sbi = EXT4_SB(sb); + unsigned i, j; + unsigned offset; + unsigned max; + int ret; + + i = (sb->s_blocksize_bits + 2) * sizeof(*sbi->s_mb_offsets); + + sbi->s_mb_offsets = kmalloc(i, GFP_KERNEL); + if (sbi->s_mb_offsets == NULL) { + ret = -ENOMEM; + goto out; + } + + i = (sb->s_blocksize_bits + 2) * sizeof(*sbi->s_mb_maxs); + sbi->s_mb_maxs = kmalloc(i, GFP_KERNEL); + if (sbi->s_mb_maxs == NULL) { + ret = -ENOMEM; + goto out; + } + + ret = ext4_groupinfo_create_slab(sb->s_blocksize); + if (ret < 0) + goto out; + + /* order 0 is regular bitmap */ + sbi->s_mb_maxs[0] = sb->s_blocksize << 3; + sbi->s_mb_offsets[0] = 0; + + i = 1; + offset = 0; + max = sb->s_blocksize << 2; + do { + sbi->s_mb_offsets[i] = offset; + sbi->s_mb_maxs[i] = max; + offset += 1 << (sb->s_blocksize_bits - i); + max = max >> 1; + i++; + } while (i <= sb->s_blocksize_bits + 1); + + spin_lock_init(&sbi->s_md_lock); + spin_lock_init(&sbi->s_bal_lock); + + sbi->s_mb_max_to_scan = MB_DEFAULT_MAX_TO_SCAN; + sbi->s_mb_min_to_scan = MB_DEFAULT_MIN_TO_SCAN; + sbi->s_mb_stats = MB_DEFAULT_STATS; + 
sbi->s_mb_stream_request = MB_DEFAULT_STREAM_THRESHOLD; + sbi->s_mb_order2_reqs = MB_DEFAULT_ORDER2_REQS; + /* + * The default group preallocation is 512, which for 4k block + * sizes translates to 2 megabytes. However for bigalloc file + * systems, this is probably too big (i.e, if the cluster size + * is 1 megabyte, then group preallocation size becomes half a + * gigabyte!). As a default, we will keep a two megabyte + * group pralloc size for cluster sizes up to 64k, and after + * that, we will force a minimum group preallocation size of + * 32 clusters. This translates to 8 megs when the cluster + * size is 256k, and 32 megs when the cluster size is 1 meg, + * which seems reasonable as a default. + */ + sbi->s_mb_group_prealloc = max(MB_DEFAULT_GROUP_PREALLOC >> + sbi->s_cluster_bits, 32); + /* + * If there is a s_stripe > 1, then we set the s_mb_group_prealloc + * to the lowest multiple of s_stripe which is bigger than + * the s_mb_group_prealloc as determined above. We want + * the preallocation size to be an exact multiple of the + * RAID stripe size so that preallocations don't fragment + * the stripes. 
+ */ + if (sbi->s_stripe > 1) { + sbi->s_mb_group_prealloc = roundup( + sbi->s_mb_group_prealloc, sbi->s_stripe); + } + + sbi->s_locality_groups = alloc_percpu(struct ext4_locality_group); + if (sbi->s_locality_groups == NULL) { + ret = -ENOMEM; + goto out_free_groupinfo_slab; + } + for_each_possible_cpu(i) { + struct ext4_locality_group *lg; + lg = per_cpu_ptr(sbi->s_locality_groups, i); + mutex_init(&lg->lg_mutex); + for (j = 0; j < PREALLOC_TB_SIZE; j++) + INIT_LIST_HEAD(&lg->lg_prealloc_list[j]); + spin_lock_init(&lg->lg_prealloc_lock); + } + + /* init file for buddy data */ + ret = ext4_mb_init_backend(sb); + if (ret != 0) + goto out_free_locality_groups; + + if (sbi->s_proc) + proc_create_data("mb_groups", S_IRUGO, sbi->s_proc, + &ext4_mb_seq_groups_fops, sb); + + return 0; + +out_free_locality_groups: + free_percpu(sbi->s_locality_groups); + sbi->s_locality_groups = NULL; +out_free_groupinfo_slab: + ext4_groupinfo_destroy_slabs(); +out: + kfree(sbi->s_mb_offsets); + sbi->s_mb_offsets = NULL; + kfree(sbi->s_mb_maxs); + sbi->s_mb_maxs = NULL; + return ret; +} + +/* need to called with the ext4 group lock held */ +static void ext4_mb_cleanup_pa(struct ext4_group_info *grp) +{ + struct ext4_prealloc_space *pa; + struct list_head *cur, *tmp; + int count = 0; + + list_for_each_safe(cur, tmp, &grp->bb_prealloc_list) { + pa = list_entry(cur, struct ext4_prealloc_space, pa_group_list); + list_del(&pa->pa_group_list); + count++; + kmem_cache_free(ext4_pspace_cachep, pa); + } + if (count) + mb_debug(1, "mballoc: %u PAs left\n", count); + +} + +int ext4_mb_release(struct super_block *sb) +{ + ext4_group_t ngroups = ext4_get_groups_count(sb); + ext4_group_t i; + int num_meta_group_infos; + struct ext4_group_info *grinfo; + struct ext4_sb_info *sbi = EXT4_SB(sb); + struct kmem_cache *cachep = get_groupinfo_cache(sb->s_blocksize_bits); + + if (sbi->s_proc) + remove_proc_entry("mb_groups", sbi->s_proc); + + if (sbi->s_group_info) { + for (i = 0; i < ngroups; i++) { + grinfo 
= ext4_get_group_info(sb, i); +#ifdef DOUBLE_CHECK + kfree(grinfo->bb_bitmap); +#endif + ext4_lock_group(sb, i); + ext4_mb_cleanup_pa(grinfo); + ext4_unlock_group(sb, i); + kmem_cache_free(cachep, grinfo); + } + num_meta_group_infos = (ngroups + + EXT4_DESC_PER_BLOCK(sb) - 1) >> + EXT4_DESC_PER_BLOCK_BITS(sb); + for (i = 0; i < num_meta_group_infos; i++) + kfree(sbi->s_group_info[i]); + ext4_kvfree(sbi->s_group_info); + } + kfree(sbi->s_mb_offsets); + kfree(sbi->s_mb_maxs); + if (sbi->s_buddy_cache) + iput(sbi->s_buddy_cache); + if (sbi->s_mb_stats) { + ext4_msg(sb, KERN_INFO, + "mballoc: %u blocks %u reqs (%u success)", + atomic_read(&sbi->s_bal_allocated), + atomic_read(&sbi->s_bal_reqs), + atomic_read(&sbi->s_bal_success)); + ext4_msg(sb, KERN_INFO, + "mballoc: %u extents scanned, %u goal hits, " + "%u 2^N hits, %u breaks, %u lost", + atomic_read(&sbi->s_bal_ex_scanned), + atomic_read(&sbi->s_bal_goals), + atomic_read(&sbi->s_bal_2orders), + atomic_read(&sbi->s_bal_breaks), + atomic_read(&sbi->s_mb_lost_chunks)); + ext4_msg(sb, KERN_INFO, + "mballoc: %lu generated and it took %Lu", + sbi->s_mb_buddies_generated, + sbi->s_mb_generation_time); + ext4_msg(sb, KERN_INFO, + "mballoc: %u preallocated, %u discarded", + atomic_read(&sbi->s_mb_preallocated), + atomic_read(&sbi->s_mb_discarded)); + } + + free_percpu(sbi->s_locality_groups); + + return 0; +} + +static inline int ext4_issue_discard(struct super_block *sb, + ext4_group_t block_group, ext4_grpblk_t cluster, int count) +{ + ext4_fsblk_t discard_block; + + discard_block = (EXT4_C2B(EXT4_SB(sb), cluster) + + ext4_group_first_block_no(sb, block_group)); + count = EXT4_C2B(EXT4_SB(sb), count); + trace_ext4_discard_blocks(sb, + (unsigned long long) discard_block, count); + return sb_issue_discard(sb, discard_block, count, GFP_NOFS, 0); +} + +/* + * This function is called by the jbd2 layer once the commit has finished, + * so we know we can free the blocks that were released with that commit. 
+ */ +static void ext4_free_data_callback(struct super_block *sb, + struct ext4_journal_cb_entry *jce, + int rc) +{ + struct ext4_free_data *entry = (struct ext4_free_data *)jce; + struct ext4_buddy e4b; + struct ext4_group_info *db; + int err, count = 0, count2 = 0; + + mb_debug(1, "gonna free %u blocks in group %u (0x%p):", + entry->efd_count, entry->efd_group, entry); + + if (test_opt(sb, DISCARD)) + ext4_issue_discard(sb, entry->efd_group, + entry->efd_start_cluster, entry->efd_count); + + err = ext4_mb_load_buddy(sb, entry->efd_group, &e4b); + /* we expect to find existing buddy because it's pinned */ + BUG_ON(err != 0); + + + db = e4b.bd_info; + /* there are blocks to put in buddy to make them really free */ + count += entry->efd_count; + count2++; + ext4_lock_group(sb, entry->efd_group); + /* Take it out of per group rb tree */ + rb_erase(&entry->efd_node, &(db->bb_free_root)); + mb_free_blocks(NULL, &e4b, entry->efd_start_cluster, entry->efd_count); + + /* + * Clear the trimmed flag for the group so that the next + * ext4_trim_fs can trim it. + * If the volume is mounted with -o discard, online discard + * is supported and the free blocks will be trimmed online. 
+ */ + if (!test_opt(sb, DISCARD)) + EXT4_MB_GRP_CLEAR_TRIMMED(db); + + if (!db->bb_free_root.rb_node) { + /* No more items in the per group rb tree + * balance refcounts from ext4_mb_free_metadata() + */ + page_cache_release(e4b.bd_buddy_page); + page_cache_release(e4b.bd_bitmap_page); + } + ext4_unlock_group(sb, entry->efd_group); + kmem_cache_free(ext4_free_data_cachep, entry); + ext4_mb_unload_buddy(&e4b); + + mb_debug(1, "freed %u blocks in %u structures\n", count, count2); +} + +#ifdef CONFIG_EXT4_DEBUG +u8 mb_enable_debug __read_mostly; + +static struct dentry *debugfs_dir; +static struct dentry *debugfs_debug; + +static void __init ext4_create_debugfs_entry(void) +{ + debugfs_dir = debugfs_create_dir("ext4", NULL); + if (debugfs_dir) + debugfs_debug = debugfs_create_u8("mballoc-debug", + S_IRUGO | S_IWUSR, + debugfs_dir, + &mb_enable_debug); +} + +static void ext4_remove_debugfs_entry(void) +{ + debugfs_remove(debugfs_debug); + debugfs_remove(debugfs_dir); +} + +#else + +static void __init ext4_create_debugfs_entry(void) +{ +} + +static void ext4_remove_debugfs_entry(void) +{ +} + +#endif + +int __init ext4_init_mballoc(void) +{ + ext4_pspace_cachep = KMEM_CACHE(ext4_prealloc_space, + SLAB_RECLAIM_ACCOUNT); + if (ext4_pspace_cachep == NULL) + return -ENOMEM; + + ext4_ac_cachep = KMEM_CACHE(ext4_allocation_context, + SLAB_RECLAIM_ACCOUNT); + if (ext4_ac_cachep == NULL) { + kmem_cache_destroy(ext4_pspace_cachep); + return -ENOMEM; + } + + ext4_free_data_cachep = KMEM_CACHE(ext4_free_data, + SLAB_RECLAIM_ACCOUNT); + if (ext4_free_data_cachep == NULL) { + kmem_cache_destroy(ext4_pspace_cachep); + kmem_cache_destroy(ext4_ac_cachep); + return -ENOMEM; + } + ext4_create_debugfs_entry(); + return 0; +} + +void ext4_exit_mballoc(void) +{ + /* + * Wait for completion of call_rcu()'s on ext4_pspace_cachep + * before destroying the slab cache. 
+ */ + rcu_barrier(); + kmem_cache_destroy(ext4_pspace_cachep); + kmem_cache_destroy(ext4_ac_cachep); + kmem_cache_destroy(ext4_free_data_cachep); + ext4_groupinfo_destroy_slabs(); + ext4_remove_debugfs_entry(); +} + + +/* + * Check quota and mark chosen space (ac->ac_b_ex) non-free in bitmaps + * Returns 0 if success or error code + */ +static noinline_for_stack int +ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac, + handle_t *handle, unsigned int reserv_clstrs) +{ + struct buffer_head *bitmap_bh = NULL; + struct ext4_group_desc *gdp; + struct buffer_head *gdp_bh; + struct ext4_sb_info *sbi; + struct super_block *sb; + ext4_fsblk_t block; + int err, len; + + BUG_ON(ac->ac_status != AC_STATUS_FOUND); + BUG_ON(ac->ac_b_ex.fe_len <= 0); + + sb = ac->ac_sb; + sbi = EXT4_SB(sb); + + err = -EIO; + bitmap_bh = ext4_read_block_bitmap(sb, ac->ac_b_ex.fe_group); + if (!bitmap_bh) + goto out_err; + + err = ext4_journal_get_write_access(handle, bitmap_bh); + if (err) + goto out_err; + + err = -EIO; + gdp = ext4_get_group_desc(sb, ac->ac_b_ex.fe_group, &gdp_bh); + if (!gdp) + goto out_err; + + ext4_debug("using block group %u(%d)\n", ac->ac_b_ex.fe_group, + ext4_free_group_clusters(sb, gdp)); + + err = ext4_journal_get_write_access(handle, gdp_bh); + if (err) + goto out_err; + + block = ext4_grp_offs_to_block(sb, &ac->ac_b_ex); + + len = EXT4_C2B(sbi, ac->ac_b_ex.fe_len); + if (!ext4_data_block_valid(sbi, block, len)) { + ext4_error(sb, "Allocating blocks %llu-%llu which overlap " + "fs metadata", block, block+len); + /* File system mounted not to panic on error + * Fix the bitmap and repeat the block allocation + * We leak some of the blocks here. 
+ */ + ext4_lock_group(sb, ac->ac_b_ex.fe_group); + ext4_set_bits(bitmap_bh->b_data, ac->ac_b_ex.fe_start, + ac->ac_b_ex.fe_len); + ext4_unlock_group(sb, ac->ac_b_ex.fe_group); + err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh); + if (!err) + err = -EAGAIN; + goto out_err; + } + + ext4_lock_group(sb, ac->ac_b_ex.fe_group); +#ifdef AGGRESSIVE_CHECK + { + int i; + for (i = 0; i < ac->ac_b_ex.fe_len; i++) { + BUG_ON(mb_test_bit(ac->ac_b_ex.fe_start + i, + bitmap_bh->b_data)); + } + } +#endif + ext4_set_bits(bitmap_bh->b_data, ac->ac_b_ex.fe_start, + ac->ac_b_ex.fe_len); + if (gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) { + gdp->bg_flags &= cpu_to_le16(~EXT4_BG_BLOCK_UNINIT); + ext4_free_group_clusters_set(sb, gdp, + ext4_free_clusters_after_init(sb, + ac->ac_b_ex.fe_group, gdp)); + } + len = ext4_free_group_clusters(sb, gdp) - ac->ac_b_ex.fe_len; + ext4_free_group_clusters_set(sb, gdp, len); + gdp->bg_checksum = ext4_group_desc_csum(sbi, ac->ac_b_ex.fe_group, gdp); + + ext4_unlock_group(sb, ac->ac_b_ex.fe_group); + percpu_counter_sub(&sbi->s_freeclusters_counter, ac->ac_b_ex.fe_len); + /* + * Now reduce the dirty block count also. Should not go negative + */ + if (!(ac->ac_flags & EXT4_MB_DELALLOC_RESERVED)) + /* release all the reserved blocks if non delalloc */ + percpu_counter_sub(&sbi->s_dirtyclusters_counter, + reserv_clstrs); + + if (sbi->s_log_groups_per_flex) { + ext4_group_t flex_group = ext4_flex_group(sbi, + ac->ac_b_ex.fe_group); + atomic_sub(ac->ac_b_ex.fe_len, + &sbi->s_flex_groups[flex_group].free_clusters); + } + + err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh); + if (err) + goto out_err; + err = ext4_handle_dirty_metadata(handle, NULL, gdp_bh); + +out_err: + ext4_mark_super_dirty(sb); + brelse(bitmap_bh); + return err; +} + +/* + * here we normalize request for locality group + * Group request are normalized to s_mb_group_prealloc, which goes to + * s_strip if we set the same via mount option. 
+ * s_mb_group_prealloc can be configured via + * /sys/fs/ext4//mb_group_prealloc + * + * XXX: should we try to preallocate more than the group has now? + */ +static void ext4_mb_normalize_group_request(struct ext4_allocation_context *ac) +{ + struct super_block *sb = ac->ac_sb; + struct ext4_locality_group *lg = ac->ac_lg; + + BUG_ON(lg == NULL); + ac->ac_g_ex.fe_len = EXT4_SB(sb)->s_mb_group_prealloc; + mb_debug(1, "#%u: goal %u blocks for locality group\n", + current->pid, ac->ac_g_ex.fe_len); +} + +/* + * Normalization means making request better in terms of + * size and alignment + */ +static noinline_for_stack void +ext4_mb_normalize_request(struct ext4_allocation_context *ac, + struct ext4_allocation_request *ar) +{ + struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb); + int bsbits, max; + ext4_lblk_t end; + loff_t size, start_off; + loff_t orig_size __maybe_unused; + ext4_lblk_t start; + struct ext4_inode_info *ei = EXT4_I(ac->ac_inode); + struct ext4_prealloc_space *pa; + + /* do normalize only data requests, metadata requests + do not need preallocation */ + if (!(ac->ac_flags & EXT4_MB_HINT_DATA)) + return; + + /* sometime caller may want exact blocks */ + if (unlikely(ac->ac_flags & EXT4_MB_HINT_GOAL_ONLY)) + return; + + /* caller may indicate that preallocation isn't + * required (it's a tail, for example) */ + if (ac->ac_flags & EXT4_MB_HINT_NOPREALLOC) + return; + + if (ac->ac_flags & EXT4_MB_HINT_GROUP_ALLOC) { + ext4_mb_normalize_group_request(ac); + return ; + } + + bsbits = ac->ac_sb->s_blocksize_bits; + + /* first, let's learn actual file size + * given current request is allocated */ + size = ac->ac_o_ex.fe_logical + EXT4_C2B(sbi, ac->ac_o_ex.fe_len); + size = size << bsbits; + if (size < i_size_read(ac->ac_inode)) + size = i_size_read(ac->ac_inode); + orig_size = size; + + /* max size of free chunks */ + max = 2 << bsbits; + +#define NRL_CHECK_SIZE(req, size, max, chunk_size) \ + (req <= (size) || max <= (chunk_size)) + + /* first, try to predict 
filesize */ + /* XXX: should this table be tunable? */ + start_off = 0; + if (size <= 16 * 1024) { + size = 16 * 1024; + } else if (size <= 32 * 1024) { + size = 32 * 1024; + } else if (size <= 64 * 1024) { + size = 64 * 1024; + } else if (size <= 128 * 1024) { + size = 128 * 1024; + } else if (size <= 256 * 1024) { + size = 256 * 1024; + } else if (size <= 512 * 1024) { + size = 512 * 1024; + } else if (size <= 1024 * 1024) { + size = 1024 * 1024; + } else if (NRL_CHECK_SIZE(size, 4 * 1024 * 1024, max, 2 * 1024)) { + start_off = ((loff_t)ac->ac_o_ex.fe_logical >> + (21 - bsbits)) << 21; + size = 2 * 1024 * 1024; + } else if (NRL_CHECK_SIZE(size, 8 * 1024 * 1024, max, 4 * 1024)) { + start_off = ((loff_t)ac->ac_o_ex.fe_logical >> + (22 - bsbits)) << 22; + size = 4 * 1024 * 1024; + } else if (NRL_CHECK_SIZE(ac->ac_o_ex.fe_len, + (8<<20)>>bsbits, max, 8 * 1024)) { + start_off = ((loff_t)ac->ac_o_ex.fe_logical >> + (23 - bsbits)) << 23; + size = 8 * 1024 * 1024; + } else { + start_off = (loff_t)ac->ac_o_ex.fe_logical << bsbits; + size = ac->ac_o_ex.fe_len << bsbits; + } + size = size >> bsbits; + start = start_off >> bsbits; + + /* don't cover already allocated blocks in selected range */ + if (ar->pleft && start <= ar->lleft) { + size -= ar->lleft + 1 - start; + start = ar->lleft + 1; + } + if (ar->pright && start + size - 1 >= ar->lright) + size -= start + size - ar->lright; + + end = start + size; + + /* check we don't cross already preallocated blocks */ + rcu_read_lock(); + list_for_each_entry_rcu(pa, &ei->i_prealloc_list, pa_inode_list) { + ext4_lblk_t pa_end; + + if (pa->pa_deleted) + continue; + spin_lock(&pa->pa_lock); + if (pa->pa_deleted) { + spin_unlock(&pa->pa_lock); + continue; + } + + pa_end = pa->pa_lstart + EXT4_C2B(EXT4_SB(ac->ac_sb), + pa->pa_len); + + /* PA must not overlap original request */ + BUG_ON(!(ac->ac_o_ex.fe_logical >= pa_end || + ac->ac_o_ex.fe_logical < pa->pa_lstart)); + + /* skip PAs this normalized request doesn't overlap with */ + 
if (pa->pa_lstart >= end || pa_end <= start) { + spin_unlock(&pa->pa_lock); + continue; + } + BUG_ON(pa->pa_lstart <= start && pa_end >= end); + + /* adjust start or end to be adjacent to this pa */ + if (pa_end <= ac->ac_o_ex.fe_logical) { + BUG_ON(pa_end < start); + start = pa_end; + } else if (pa->pa_lstart > ac->ac_o_ex.fe_logical) { + BUG_ON(pa->pa_lstart > end); + end = pa->pa_lstart; + } + spin_unlock(&pa->pa_lock); + } + rcu_read_unlock(); + size = end - start; + + /* XXX: extra loop to check we really don't overlap preallocations */ + rcu_read_lock(); + list_for_each_entry_rcu(pa, &ei->i_prealloc_list, pa_inode_list) { + ext4_lblk_t pa_end; + + spin_lock(&pa->pa_lock); + if (pa->pa_deleted == 0) { + pa_end = pa->pa_lstart + EXT4_C2B(EXT4_SB(ac->ac_sb), + pa->pa_len); + BUG_ON(!(start >= pa_end || end <= pa->pa_lstart)); + } + spin_unlock(&pa->pa_lock); + } + rcu_read_unlock(); + + if (start + size <= ac->ac_o_ex.fe_logical && + start > ac->ac_o_ex.fe_logical) { + ext4_msg(ac->ac_sb, KERN_ERR, + "start %lu, size %lu, fe_logical %lu", + (unsigned long) start, (unsigned long) size, + (unsigned long) ac->ac_o_ex.fe_logical); + } + BUG_ON(start + size <= ac->ac_o_ex.fe_logical && + start > ac->ac_o_ex.fe_logical); + BUG_ON(size <= 0 || size > EXT4_CLUSTERS_PER_GROUP(ac->ac_sb)); + + /* now prepare goal request */ + + /* XXX: is it better to align blocks WRT to logical + * placement or satisfy big request as is */ + ac->ac_g_ex.fe_logical = start; + ac->ac_g_ex.fe_len = EXT4_NUM_B2C(sbi, size); + + /* define goal start in order to merge */ + if (ar->pright && (ar->lright == (start + size))) { + /* merge to the right */ + ext4_get_group_no_and_offset(ac->ac_sb, ar->pright - size, + &ac->ac_f_ex.fe_group, + &ac->ac_f_ex.fe_start); + ac->ac_flags |= EXT4_MB_HINT_TRY_GOAL; + } + if (ar->pleft && (ar->lleft + 1 == start)) { + /* merge to the left */ + ext4_get_group_no_and_offset(ac->ac_sb, ar->pleft + 1, + &ac->ac_f_ex.fe_group, + &ac->ac_f_ex.fe_start); + 
ac->ac_flags |= EXT4_MB_HINT_TRY_GOAL; + } + + mb_debug(1, "goal: %u(was %u) blocks at %u\n", (unsigned) size, + (unsigned) orig_size, (unsigned) start); +} + +static void ext4_mb_collect_stats(struct ext4_allocation_context *ac) +{ + struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb); + + if (sbi->s_mb_stats && ac->ac_g_ex.fe_len > 1) { + atomic_inc(&sbi->s_bal_reqs); + atomic_add(ac->ac_b_ex.fe_len, &sbi->s_bal_allocated); + if (ac->ac_b_ex.fe_len >= ac->ac_o_ex.fe_len) + atomic_inc(&sbi->s_bal_success); + atomic_add(ac->ac_found, &sbi->s_bal_ex_scanned); + if (ac->ac_g_ex.fe_start == ac->ac_b_ex.fe_start && + ac->ac_g_ex.fe_group == ac->ac_b_ex.fe_group) + atomic_inc(&sbi->s_bal_goals); + if (ac->ac_found > sbi->s_mb_max_to_scan) + atomic_inc(&sbi->s_bal_breaks); + } + + if (ac->ac_op == EXT4_MB_HISTORY_ALLOC) + trace_ext4_mballoc_alloc(ac); + else + trace_ext4_mballoc_prealloc(ac); +} + +/* + * Called on failure; free up any blocks from the inode PA for this + * context. We don't need this for MB_GROUP_PA because we only change + * pa_free in ext4_mb_release_context(), but on failure, we've already + * zeroed out ac->ac_b_ex.fe_len, so group_pa->pa_free is not changed. 
+ */ +static void ext4_discard_allocated_blocks(struct ext4_allocation_context *ac) +{ + struct ext4_prealloc_space *pa = ac->ac_pa; + int len; + + if (pa && pa->pa_type == MB_INODE_PA) { + len = ac->ac_b_ex.fe_len; + pa->pa_free += len; + } + +} + +/* + * use blocks preallocated to inode + */ +static void ext4_mb_use_inode_pa(struct ext4_allocation_context *ac, + struct ext4_prealloc_space *pa) +{ + struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb); + ext4_fsblk_t start; + ext4_fsblk_t end; + int len; + + /* found preallocated blocks, use them */ + start = pa->pa_pstart + (ac->ac_o_ex.fe_logical - pa->pa_lstart); + end = min(pa->pa_pstart + EXT4_C2B(sbi, pa->pa_len), + start + EXT4_C2B(sbi, ac->ac_o_ex.fe_len)); + len = EXT4_NUM_B2C(sbi, end - start); + ext4_get_group_no_and_offset(ac->ac_sb, start, &ac->ac_b_ex.fe_group, + &ac->ac_b_ex.fe_start); + ac->ac_b_ex.fe_len = len; + ac->ac_status = AC_STATUS_FOUND; + ac->ac_pa = pa; + + BUG_ON(start < pa->pa_pstart); + BUG_ON(end > pa->pa_pstart + EXT4_C2B(sbi, pa->pa_len)); + BUG_ON(pa->pa_free < len); + pa->pa_free -= len; + + mb_debug(1, "use %llu/%u from inode pa %p\n", start, len, pa); +} + +/* + * use blocks preallocated to locality group + */ +static void ext4_mb_use_group_pa(struct ext4_allocation_context *ac, + struct ext4_prealloc_space *pa) +{ + unsigned int len = ac->ac_o_ex.fe_len; + + ext4_get_group_no_and_offset(ac->ac_sb, pa->pa_pstart, + &ac->ac_b_ex.fe_group, + &ac->ac_b_ex.fe_start); + ac->ac_b_ex.fe_len = len; + ac->ac_status = AC_STATUS_FOUND; + ac->ac_pa = pa; + + /* we don't correct pa_pstart or pa_plen here to avoid + * possible race when the group is being loaded concurrently + * instead we correct pa later, after blocks are marked + * in on-disk bitmap -- see ext4_mb_release_context() + * Other CPUs are prevented from allocating from this pa by lg_mutex + */ + mb_debug(1, "use %u/%u from group pa %p\n", pa->pa_lstart-len, len, pa); +} + +/* + * Return the prealloc space that have minimal distance + 
* from the goal block. @cpa is the prealloc + * space that is having currently known minimal distance + * from the goal block. + */ +static struct ext4_prealloc_space * +ext4_mb_check_group_pa(ext4_fsblk_t goal_block, + struct ext4_prealloc_space *pa, + struct ext4_prealloc_space *cpa) +{ + ext4_fsblk_t cur_distance, new_distance; + + if (cpa == NULL) { + atomic_inc(&pa->pa_count); + return pa; + } + cur_distance = abs(goal_block - cpa->pa_pstart); + new_distance = abs(goal_block - pa->pa_pstart); + + if (cur_distance <= new_distance) + return cpa; + + /* drop the previous reference */ + atomic_dec(&cpa->pa_count); + atomic_inc(&pa->pa_count); + return pa; +} + +/* + * search goal blocks in preallocated space + */ +static noinline_for_stack int +ext4_mb_use_preallocated(struct ext4_allocation_context *ac) +{ + struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb); + int order, i; + struct ext4_inode_info *ei = EXT4_I(ac->ac_inode); + struct ext4_locality_group *lg; + struct ext4_prealloc_space *pa, *cpa = NULL; + ext4_fsblk_t goal_block; + + /* only data can be preallocated */ + if (!(ac->ac_flags & EXT4_MB_HINT_DATA)) + return 0; + + /* first, try per-file preallocation */ + rcu_read_lock(); + list_for_each_entry_rcu(pa, &ei->i_prealloc_list, pa_inode_list) { + + /* all fields in this condition don't change, + * so we can skip locking for them */ + if (ac->ac_o_ex.fe_logical < pa->pa_lstart || + ac->ac_o_ex.fe_logical >= (pa->pa_lstart + + EXT4_C2B(sbi, pa->pa_len))) + continue; + + /* non-extent files can't have physical blocks past 2^32 */ + if (!(ext4_test_inode_flag(ac->ac_inode, EXT4_INODE_EXTENTS)) && + (pa->pa_pstart + EXT4_C2B(sbi, pa->pa_len) > + EXT4_MAX_BLOCK_FILE_PHYS)) + continue; + + /* found preallocated blocks, use them */ + spin_lock(&pa->pa_lock); + if (pa->pa_deleted == 0 && pa->pa_free) { + atomic_inc(&pa->pa_count); + ext4_mb_use_inode_pa(ac, pa); + spin_unlock(&pa->pa_lock); + ac->ac_criteria = 10; + rcu_read_unlock(); + return 1; + } + 
spin_unlock(&pa->pa_lock); + } + rcu_read_unlock(); + + /* can we use group allocation? */ + if (!(ac->ac_flags & EXT4_MB_HINT_GROUP_ALLOC)) + return 0; + + /* inode may have no locality group for some reason */ + lg = ac->ac_lg; + if (lg == NULL) + return 0; + order = fls(ac->ac_o_ex.fe_len) - 1; + if (order > PREALLOC_TB_SIZE - 1) + /* The max size of hash table is PREALLOC_TB_SIZE */ + order = PREALLOC_TB_SIZE - 1; + + goal_block = ext4_grp_offs_to_block(ac->ac_sb, &ac->ac_g_ex); + /* + * search for the prealloc space that is having + * minimal distance from the goal block. + */ + for (i = order; i < PREALLOC_TB_SIZE; i++) { + rcu_read_lock(); + list_for_each_entry_rcu(pa, &lg->lg_prealloc_list[i], + pa_inode_list) { + spin_lock(&pa->pa_lock); + if (pa->pa_deleted == 0 && + pa->pa_free >= ac->ac_o_ex.fe_len) { + + cpa = ext4_mb_check_group_pa(goal_block, + pa, cpa); + } + spin_unlock(&pa->pa_lock); + } + rcu_read_unlock(); + } + if (cpa) { + ext4_mb_use_group_pa(ac, cpa); + ac->ac_criteria = 20; + return 1; + } + return 0; +} + +/* + * the function goes through all block freed in the group + * but not yet committed and marks them used in in-core bitmap. + * buddy must be generated from this bitmap + * Need to be called with the ext4 group lock held + */ +static void ext4_mb_generate_from_freelist(struct super_block *sb, void *bitmap, + ext4_group_t group) +{ + struct rb_node *n; + struct ext4_group_info *grp; + struct ext4_free_data *entry; + + grp = ext4_get_group_info(sb, group); + n = rb_first(&(grp->bb_free_root)); + + while (n) { + entry = rb_entry(n, struct ext4_free_data, efd_node); + ext4_set_bits(bitmap, entry->efd_start_cluster, entry->efd_count); + n = rb_next(n); + } + return; +} + +/* + * the function goes through all preallocation in this group and marks them + * used in in-core bitmap. 
buddy must be generated from this bitmap + * Need to be called with ext4 group lock held + */ +static noinline_for_stack +void ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap, + ext4_group_t group) +{ + struct ext4_group_info *grp = ext4_get_group_info(sb, group); + struct ext4_prealloc_space *pa; + struct list_head *cur; + ext4_group_t groupnr; + ext4_grpblk_t start; + int preallocated = 0; + int len; + + /* all form of preallocation discards first load group, + * so the only competing code is preallocation use. + * we don't need any locking here + * notice we do NOT ignore preallocations with pa_deleted + * otherwise we could leave used blocks available for + * allocation in buddy when concurrent ext4_mb_put_pa() + * is dropping preallocation + */ + list_for_each(cur, &grp->bb_prealloc_list) { + pa = list_entry(cur, struct ext4_prealloc_space, pa_group_list); + spin_lock(&pa->pa_lock); + ext4_get_group_no_and_offset(sb, pa->pa_pstart, + &groupnr, &start); + len = pa->pa_len; + spin_unlock(&pa->pa_lock); + if (unlikely(len == 0)) + continue; + BUG_ON(groupnr != group); + ext4_set_bits(bitmap, start, len); + preallocated += len; + } + mb_debug(1, "prellocated %u for group %u\n", preallocated, group); +} + +static void ext4_mb_pa_callback(struct rcu_head *head) +{ + struct ext4_prealloc_space *pa; + pa = container_of(head, struct ext4_prealloc_space, u.pa_rcu); + kmem_cache_free(ext4_pspace_cachep, pa); +} + +/* + * drops a reference to preallocated space descriptor + * if this was the last reference and the space is consumed + */ +static void ext4_mb_put_pa(struct ext4_allocation_context *ac, + struct super_block *sb, struct ext4_prealloc_space *pa) +{ + ext4_group_t grp; + ext4_fsblk_t grp_blk; + + if (!atomic_dec_and_test(&pa->pa_count) || pa->pa_free != 0) + return; + + /* in this short window concurrent discard can set pa_deleted */ + spin_lock(&pa->pa_lock); + if (pa->pa_deleted == 1) { + spin_unlock(&pa->pa_lock); + return; + } + + 
pa->pa_deleted = 1; + spin_unlock(&pa->pa_lock); + + grp_blk = pa->pa_pstart; + /* + * If doing group-based preallocation, pa_pstart may be in the + * next group when pa is used up + */ + if (pa->pa_type == MB_GROUP_PA) + grp_blk--; + + ext4_get_group_no_and_offset(sb, grp_blk, &grp, NULL); + + /* + * possible race: + * + * P1 (buddy init) P2 (regular allocation) + * find block B in PA + * copy on-disk bitmap to buddy + * mark B in on-disk bitmap + * drop PA from group + * mark all PAs in buddy + * + * thus, P1 initializes buddy with B available. to prevent this + * we make "copy" and "mark all PAs" atomic and serialize "drop PA" + * against that pair + */ + ext4_lock_group(sb, grp); + list_del(&pa->pa_group_list); + ext4_unlock_group(sb, grp); + + spin_lock(pa->pa_obj_lock); + list_del_rcu(&pa->pa_inode_list); + spin_unlock(pa->pa_obj_lock); + + call_rcu(&(pa)->u.pa_rcu, ext4_mb_pa_callback); +} + +/* + * creates new preallocated space for given inode + */ +static noinline_for_stack int +ext4_mb_new_inode_pa(struct ext4_allocation_context *ac) +{ + struct super_block *sb = ac->ac_sb; + struct ext4_sb_info *sbi = EXT4_SB(sb); + struct ext4_prealloc_space *pa; + struct ext4_group_info *grp; + struct ext4_inode_info *ei; + + /* preallocate only when found space is larger then requested */ + BUG_ON(ac->ac_o_ex.fe_len >= ac->ac_b_ex.fe_len); + BUG_ON(ac->ac_status != AC_STATUS_FOUND); + BUG_ON(!S_ISREG(ac->ac_inode->i_mode)); + + pa = kmem_cache_alloc(ext4_pspace_cachep, GFP_NOFS); + if (pa == NULL) + return -ENOMEM; + + if (ac->ac_b_ex.fe_len < ac->ac_g_ex.fe_len) { + int winl; + int wins; + int win; + int offs; + + /* we can't allocate as much as normalizer wants. 
+ * so, found space must get proper lstart + * to cover original request */ + BUG_ON(ac->ac_g_ex.fe_logical > ac->ac_o_ex.fe_logical); + BUG_ON(ac->ac_g_ex.fe_len < ac->ac_o_ex.fe_len); + + /* we're limited by original request in that + * logical block must be covered any way + * winl is window we can move our chunk within */ + winl = ac->ac_o_ex.fe_logical - ac->ac_g_ex.fe_logical; + + /* also, we should cover whole original request */ + wins = EXT4_C2B(sbi, ac->ac_b_ex.fe_len - ac->ac_o_ex.fe_len); + + /* the smallest one defines real window */ + win = min(winl, wins); + + offs = ac->ac_o_ex.fe_logical % + EXT4_C2B(sbi, ac->ac_b_ex.fe_len); + if (offs && offs < win) + win = offs; + + ac->ac_b_ex.fe_logical = ac->ac_o_ex.fe_logical - + EXT4_B2C(sbi, win); + BUG_ON(ac->ac_o_ex.fe_logical < ac->ac_b_ex.fe_logical); + BUG_ON(ac->ac_o_ex.fe_len > ac->ac_b_ex.fe_len); + } + + /* preallocation can change ac_b_ex, thus we store actually + * allocated blocks for history */ + ac->ac_f_ex = ac->ac_b_ex; + + pa->pa_lstart = ac->ac_b_ex.fe_logical; + pa->pa_pstart = ext4_grp_offs_to_block(sb, &ac->ac_b_ex); + pa->pa_len = ac->ac_b_ex.fe_len; + pa->pa_free = pa->pa_len; + atomic_set(&pa->pa_count, 1); + spin_lock_init(&pa->pa_lock); + INIT_LIST_HEAD(&pa->pa_inode_list); + INIT_LIST_HEAD(&pa->pa_group_list); + pa->pa_deleted = 0; + pa->pa_type = MB_INODE_PA; + + mb_debug(1, "new inode pa %p: %llu/%u for %u\n", pa, + pa->pa_pstart, pa->pa_len, pa->pa_lstart); + trace_ext4_mb_new_inode_pa(ac, pa); + + ext4_mb_use_inode_pa(ac, pa); + atomic_add(pa->pa_free, &sbi->s_mb_preallocated); + + ei = EXT4_I(ac->ac_inode); + grp = ext4_get_group_info(sb, ac->ac_b_ex.fe_group); + + pa->pa_obj_lock = &ei->i_prealloc_lock; + pa->pa_inode = ac->ac_inode; + + ext4_lock_group(sb, ac->ac_b_ex.fe_group); + list_add(&pa->pa_group_list, &grp->bb_prealloc_list); + ext4_unlock_group(sb, ac->ac_b_ex.fe_group); + + spin_lock(pa->pa_obj_lock); + list_add_rcu(&pa->pa_inode_list, &ei->i_prealloc_list); + 
spin_unlock(pa->pa_obj_lock); + + return 0; +} + +/* + * creates new preallocated space for locality group inodes belongs to + */ +static noinline_for_stack int +ext4_mb_new_group_pa(struct ext4_allocation_context *ac) +{ + struct super_block *sb = ac->ac_sb; + struct ext4_locality_group *lg; + struct ext4_prealloc_space *pa; + struct ext4_group_info *grp; + + /* preallocate only when found space is larger then requested */ + BUG_ON(ac->ac_o_ex.fe_len >= ac->ac_b_ex.fe_len); + BUG_ON(ac->ac_status != AC_STATUS_FOUND); + BUG_ON(!S_ISREG(ac->ac_inode->i_mode)); + + BUG_ON(ext4_pspace_cachep == NULL); + pa = kmem_cache_alloc(ext4_pspace_cachep, GFP_NOFS); + if (pa == NULL) + return -ENOMEM; + + /* preallocation can change ac_b_ex, thus we store actually + * allocated blocks for history */ + ac->ac_f_ex = ac->ac_b_ex; + + pa->pa_pstart = ext4_grp_offs_to_block(sb, &ac->ac_b_ex); + pa->pa_lstart = pa->pa_pstart; + pa->pa_len = ac->ac_b_ex.fe_len; + pa->pa_free = pa->pa_len; + atomic_set(&pa->pa_count, 1); + spin_lock_init(&pa->pa_lock); + INIT_LIST_HEAD(&pa->pa_inode_list); + INIT_LIST_HEAD(&pa->pa_group_list); + pa->pa_deleted = 0; + pa->pa_type = MB_GROUP_PA; + + mb_debug(1, "new group pa %p: %llu/%u for %u\n", pa, + pa->pa_pstart, pa->pa_len, pa->pa_lstart); + trace_ext4_mb_new_group_pa(ac, pa); + + ext4_mb_use_group_pa(ac, pa); + atomic_add(pa->pa_free, &EXT4_SB(sb)->s_mb_preallocated); + + grp = ext4_get_group_info(sb, ac->ac_b_ex.fe_group); + lg = ac->ac_lg; + BUG_ON(lg == NULL); + + pa->pa_obj_lock = &lg->lg_prealloc_lock; + pa->pa_inode = NULL; + + ext4_lock_group(sb, ac->ac_b_ex.fe_group); + list_add(&pa->pa_group_list, &grp->bb_prealloc_list); + ext4_unlock_group(sb, ac->ac_b_ex.fe_group); + + /* + * We will later add the new pa to the right bucket + * after updating the pa_free in ext4_mb_release_context + */ + return 0; +} + +static int ext4_mb_new_preallocation(struct ext4_allocation_context *ac) +{ + int err; + + if (ac->ac_flags & 
EXT4_MB_HINT_GROUP_ALLOC) + err = ext4_mb_new_group_pa(ac); + else + err = ext4_mb_new_inode_pa(ac); + return err; +} + +/* + * finds all unused blocks in on-disk bitmap, frees them in + * in-core bitmap and buddy. + * @pa must be unlinked from inode and group lists, so that + * nobody else can find/use it. + * the caller MUST hold group/inode locks. + * TODO: optimize the case when there are no in-core structures yet + */ +static noinline_for_stack int +ext4_mb_release_inode_pa(struct ext4_buddy *e4b, struct buffer_head *bitmap_bh, + struct ext4_prealloc_space *pa) +{ + struct super_block *sb = e4b->bd_sb; + struct ext4_sb_info *sbi = EXT4_SB(sb); + unsigned int end; + unsigned int next; + ext4_group_t group; + ext4_grpblk_t bit; + unsigned long long grp_blk_start; + int err = 0; + int free = 0; + + BUG_ON(pa->pa_deleted == 0); + ext4_get_group_no_and_offset(sb, pa->pa_pstart, &group, &bit); + grp_blk_start = pa->pa_pstart - EXT4_C2B(sbi, bit); + BUG_ON(group != e4b->bd_group && pa->pa_len != 0); + end = bit + pa->pa_len; + + while (bit < end) { + bit = mb_find_next_zero_bit(bitmap_bh->b_data, end, bit); + if (bit >= end) + break; + next = mb_find_next_bit(bitmap_bh->b_data, end, bit); + mb_debug(1, " free preallocated %u/%u in group %u\n", + (unsigned) ext4_group_first_block_no(sb, group) + bit, + (unsigned) next - bit, (unsigned) group); + free += next - bit; + + trace_ext4_mballoc_discard(sb, NULL, group, bit, next - bit); + trace_ext4_mb_release_inode_pa(pa, (grp_blk_start + + EXT4_C2B(sbi, bit)), + next - bit); + mb_free_blocks(pa->pa_inode, e4b, bit, next - bit); + bit = next + 1; + } + if (free != pa->pa_free) { + ext4_msg(e4b->bd_sb, KERN_CRIT, + "pa %p: logic %lu, phys. 
%lu, len %lu", + pa, (unsigned long) pa->pa_lstart, + (unsigned long) pa->pa_pstart, + (unsigned long) pa->pa_len); + ext4_grp_locked_error(sb, group, 0, 0, "free %u, pa_free %u", + free, pa->pa_free); + /* + * pa is already deleted so we use the value obtained + * from the bitmap and continue. + */ + } + atomic_add(free, &sbi->s_mb_discarded); + + return err; +} + +static noinline_for_stack int +ext4_mb_release_group_pa(struct ext4_buddy *e4b, + struct ext4_prealloc_space *pa) +{ + struct super_block *sb = e4b->bd_sb; + ext4_group_t group; + ext4_grpblk_t bit; + + trace_ext4_mb_release_group_pa(sb, pa); + BUG_ON(pa->pa_deleted == 0); + ext4_get_group_no_and_offset(sb, pa->pa_pstart, &group, &bit); + BUG_ON(group != e4b->bd_group && pa->pa_len != 0); + mb_free_blocks(pa->pa_inode, e4b, bit, pa->pa_len); + atomic_add(pa->pa_len, &EXT4_SB(sb)->s_mb_discarded); + trace_ext4_mballoc_discard(sb, NULL, group, bit, pa->pa_len); + + return 0; +} + +/* + * releases all preallocations in given group + * + * first, we need to decide discard policy: + * - when do we discard + * 1) ENOSPC + * - how many do we discard + * 1) how many requested + */ +static noinline_for_stack int +ext4_mb_discard_group_preallocations(struct super_block *sb, + ext4_group_t group, int needed) +{ + struct ext4_group_info *grp = ext4_get_group_info(sb, group); + struct buffer_head *bitmap_bh = NULL; + struct ext4_prealloc_space *pa, *tmp; + struct list_head list; + struct ext4_buddy e4b; + int err; + int busy = 0; + int free = 0; + + mb_debug(1, "discard preallocation for group %u\n", group); + + if (list_empty(&grp->bb_prealloc_list)) + return 0; + + bitmap_bh = ext4_read_block_bitmap(sb, group); + if (bitmap_bh == NULL) { + ext4_error(sb, "Error reading block bitmap for %u", group); + return 0; + } + + err = ext4_mb_load_buddy(sb, group, &e4b); + if (err) { + ext4_error(sb, "Error loading buddy information for %u", group); + put_bh(bitmap_bh); + return 0; + } + + if (needed == 0) + needed = 
EXT4_CLUSTERS_PER_GROUP(sb) + 1; + + INIT_LIST_HEAD(&list); +repeat: + ext4_lock_group(sb, group); + list_for_each_entry_safe(pa, tmp, + &grp->bb_prealloc_list, pa_group_list) { + spin_lock(&pa->pa_lock); + if (atomic_read(&pa->pa_count)) { + spin_unlock(&pa->pa_lock); + busy = 1; + continue; + } + if (pa->pa_deleted) { + spin_unlock(&pa->pa_lock); + continue; + } + + /* seems this one can be freed ... */ + pa->pa_deleted = 1; + + /* we can trust pa_free ... */ + free += pa->pa_free; + + spin_unlock(&pa->pa_lock); + + list_del(&pa->pa_group_list); + list_add(&pa->u.pa_tmp_list, &list); + } + + /* if we still need more blocks and some PAs were used, try again */ + if (free < needed && busy) { + busy = 0; + ext4_unlock_group(sb, group); + /* + * Yield the CPU here so that we don't get soft lockup + * in non preempt case. + */ + yield(); + goto repeat; + } + + /* found anything to free? */ + if (list_empty(&list)) { + BUG_ON(free != 0); + goto out; + } + + /* now free all selected PAs */ + list_for_each_entry_safe(pa, tmp, &list, u.pa_tmp_list) { + + /* remove from object (inode or locality group) */ + spin_lock(pa->pa_obj_lock); + list_del_rcu(&pa->pa_inode_list); + spin_unlock(pa->pa_obj_lock); + + if (pa->pa_type == MB_GROUP_PA) + ext4_mb_release_group_pa(&e4b, pa); + else + ext4_mb_release_inode_pa(&e4b, bitmap_bh, pa); + + list_del(&pa->u.pa_tmp_list); + call_rcu(&(pa)->u.pa_rcu, ext4_mb_pa_callback); + } + +out: + ext4_unlock_group(sb, group); + ext4_mb_unload_buddy(&e4b); + put_bh(bitmap_bh); + return free; +} + +/* + * releases all non-used preallocated blocks for given inode + * + * It's important to discard preallocations under i_data_sem + * We don't want another block to be served from the prealloc + * space when we are discarding the inode prealloc space. + * + * FIXME!! 
Make sure it is valid at all the call sites + */ +void ext4_discard_preallocations(struct inode *inode) +{ + struct ext4_inode_info *ei = EXT4_I(inode); + struct super_block *sb = inode->i_sb; + struct buffer_head *bitmap_bh = NULL; + struct ext4_prealloc_space *pa, *tmp; + ext4_group_t group = 0; + struct list_head list; + struct ext4_buddy e4b; + int err; + + if (!S_ISREG(inode->i_mode)) { + /*BUG_ON(!list_empty(&ei->i_prealloc_list));*/ + return; + } + + mb_debug(1, "discard preallocation for inode %lu\n", inode->i_ino); + trace_ext4_discard_preallocations(inode); + + INIT_LIST_HEAD(&list); + +repeat: + /* first, collect all pa's in the inode */ + spin_lock(&ei->i_prealloc_lock); + while (!list_empty(&ei->i_prealloc_list)) { + pa = list_entry(ei->i_prealloc_list.next, + struct ext4_prealloc_space, pa_inode_list); + BUG_ON(pa->pa_obj_lock != &ei->i_prealloc_lock); + spin_lock(&pa->pa_lock); + if (atomic_read(&pa->pa_count)) { + /* this shouldn't happen often - nobody should + * use preallocation while we're discarding it */ + spin_unlock(&pa->pa_lock); + spin_unlock(&ei->i_prealloc_lock); + ext4_msg(sb, KERN_ERR, + "uh-oh! used pa while discarding"); + WARN_ON(1); + schedule_timeout_uninterruptible(HZ); + goto repeat; + + } + if (pa->pa_deleted == 0) { + pa->pa_deleted = 1; + spin_unlock(&pa->pa_lock); + list_del_rcu(&pa->pa_inode_list); + list_add(&pa->u.pa_tmp_list, &list); + continue; + } + + /* someone is deleting pa right now */ + spin_unlock(&pa->pa_lock); + spin_unlock(&ei->i_prealloc_lock); + + /* we have to wait here because pa_deleted + * doesn't mean pa is already unlinked from + * the list. 
as we might be called from + * ->clear_inode() the inode will get freed + * and concurrent thread which is unlinking + * pa from inode's list may access already + * freed memory, bad-bad-bad */ + + /* XXX: if this happens too often, we can + * add a flag to force wait only in case + * of ->clear_inode(), but not in case of + * regular truncate */ + schedule_timeout_uninterruptible(HZ); + goto repeat; + } + spin_unlock(&ei->i_prealloc_lock); + + list_for_each_entry_safe(pa, tmp, &list, u.pa_tmp_list) { + BUG_ON(pa->pa_type != MB_INODE_PA); + ext4_get_group_no_and_offset(sb, pa->pa_pstart, &group, NULL); + + err = ext4_mb_load_buddy(sb, group, &e4b); + if (err) { + ext4_error(sb, "Error loading buddy information for %u", + group); + continue; + } + + bitmap_bh = ext4_read_block_bitmap(sb, group); + if (bitmap_bh == NULL) { + ext4_error(sb, "Error reading block bitmap for %u", + group); + ext4_mb_unload_buddy(&e4b); + continue; + } + + ext4_lock_group(sb, group); + list_del(&pa->pa_group_list); + ext4_mb_release_inode_pa(&e4b, bitmap_bh, pa); + ext4_unlock_group(sb, group); + + ext4_mb_unload_buddy(&e4b); + put_bh(bitmap_bh); + + list_del(&pa->u.pa_tmp_list); + call_rcu(&(pa)->u.pa_rcu, ext4_mb_pa_callback); + } +} + +#ifdef CONFIG_EXT4_DEBUG +static void ext4_mb_show_ac(struct ext4_allocation_context *ac) +{ + struct super_block *sb = ac->ac_sb; + ext4_group_t ngroups, i; + + if (!mb_enable_debug || + (EXT4_SB(sb)->s_mount_flags & EXT4_MF_FS_ABORTED)) + return; + + ext4_msg(ac->ac_sb, KERN_ERR, "Can't allocate:" + " Allocation context details:"); + ext4_msg(ac->ac_sb, KERN_ERR, "status %d flags %d", + ac->ac_status, ac->ac_flags); + ext4_msg(ac->ac_sb, KERN_ERR, "orig %lu/%lu/%lu@%lu, " + "goal %lu/%lu/%lu@%lu, " + "best %lu/%lu/%lu@%lu cr %d", + (unsigned long)ac->ac_o_ex.fe_group, + (unsigned long)ac->ac_o_ex.fe_start, + (unsigned long)ac->ac_o_ex.fe_len, + (unsigned long)ac->ac_o_ex.fe_logical, + (unsigned long)ac->ac_g_ex.fe_group, + (unsigned 
long)ac->ac_g_ex.fe_start, + (unsigned long)ac->ac_g_ex.fe_len, + (unsigned long)ac->ac_g_ex.fe_logical, + (unsigned long)ac->ac_b_ex.fe_group, + (unsigned long)ac->ac_b_ex.fe_start, + (unsigned long)ac->ac_b_ex.fe_len, + (unsigned long)ac->ac_b_ex.fe_logical, + (int)ac->ac_criteria); + ext4_msg(ac->ac_sb, KERN_ERR, "%lu scanned, %d found", + ac->ac_ex_scanned, ac->ac_found); + ext4_msg(ac->ac_sb, KERN_ERR, "groups: "); + ngroups = ext4_get_groups_count(sb); + for (i = 0; i < ngroups; i++) { + struct ext4_group_info *grp = ext4_get_group_info(sb, i); + struct ext4_prealloc_space *pa; + ext4_grpblk_t start; + struct list_head *cur; + ext4_lock_group(sb, i); + list_for_each(cur, &grp->bb_prealloc_list) { + pa = list_entry(cur, struct ext4_prealloc_space, + pa_group_list); + spin_lock(&pa->pa_lock); + ext4_get_group_no_and_offset(sb, pa->pa_pstart, + NULL, &start); + spin_unlock(&pa->pa_lock); + printk(KERN_ERR "PA:%u:%d:%u \n", i, + start, pa->pa_len); + } + ext4_unlock_group(sb, i); + + if (grp->bb_free == 0) + continue; + printk(KERN_ERR "%u: %d/%d \n", + i, grp->bb_free, grp->bb_fragments); + } + printk(KERN_ERR "\n"); +} +#else +static inline void ext4_mb_show_ac(struct ext4_allocation_context *ac) +{ + return; +} +#endif + +/* + * We use locality group preallocation for small size file. 
The size of the + * file is determined by the current size or the resulting size after + * allocation which ever is larger + * + * One can tune this size via /sys/fs/ext4//mb_stream_req + */ +static void ext4_mb_group_or_file(struct ext4_allocation_context *ac) +{ + struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb); + int bsbits = ac->ac_sb->s_blocksize_bits; + loff_t size, isize; + + if (!(ac->ac_flags & EXT4_MB_HINT_DATA)) + return; + + if (unlikely(ac->ac_flags & EXT4_MB_HINT_GOAL_ONLY)) + return; + + size = ac->ac_o_ex.fe_logical + EXT4_C2B(sbi, ac->ac_o_ex.fe_len); + isize = (i_size_read(ac->ac_inode) + ac->ac_sb->s_blocksize - 1) + >> bsbits; + + if ((size == isize) && + !ext4_fs_is_busy(sbi) && + (atomic_read(&ac->ac_inode->i_writecount) == 0)) { + ac->ac_flags |= EXT4_MB_HINT_NOPREALLOC; + return; + } + + if (sbi->s_mb_group_prealloc <= 0) { + ac->ac_flags |= EXT4_MB_STREAM_ALLOC; + return; + } + + /* don't use group allocation for large files */ + size = max(size, isize); + if (size > sbi->s_mb_stream_request) { + ac->ac_flags |= EXT4_MB_STREAM_ALLOC; + return; + } + + BUG_ON(ac->ac_lg != NULL); + /* + * locality group prealloc space are per cpu. The reason for having + * per cpu locality group is to reduce the contention between block + * request from multiple CPUs. 
+ */ + ac->ac_lg = __this_cpu_ptr(sbi->s_locality_groups); + + /* we're going to use group allocation */ + ac->ac_flags |= EXT4_MB_HINT_GROUP_ALLOC; + + /* serialize all allocations in the group */ + mutex_lock(&ac->ac_lg->lg_mutex); +} + +static noinline_for_stack int +ext4_mb_initialize_context(struct ext4_allocation_context *ac, + struct ext4_allocation_request *ar) +{ + struct super_block *sb = ar->inode->i_sb; + struct ext4_sb_info *sbi = EXT4_SB(sb); + struct ext4_super_block *es = sbi->s_es; + ext4_group_t group; + unsigned int len; + ext4_fsblk_t goal; + ext4_grpblk_t block; + + /* we can't allocate > group size */ + len = ar->len; + + /* just a dirty hack to filter too big requests */ + if (len >= EXT4_CLUSTERS_PER_GROUP(sb) - 10) + len = EXT4_CLUSTERS_PER_GROUP(sb) - 10; + + /* start searching from the goal */ + goal = ar->goal; + if (goal < le32_to_cpu(es->s_first_data_block) || + goal >= ext4_blocks_count(es)) + goal = le32_to_cpu(es->s_first_data_block); + ext4_get_group_no_and_offset(sb, goal, &group, &block); + + /* set up allocation goals */ + memset(ac, 0, sizeof(struct ext4_allocation_context)); + ac->ac_b_ex.fe_logical = ar->logical & ~(sbi->s_cluster_ratio - 1); + ac->ac_status = AC_STATUS_CONTINUE; + ac->ac_sb = sb; + ac->ac_inode = ar->inode; + ac->ac_o_ex.fe_logical = ac->ac_b_ex.fe_logical; + ac->ac_o_ex.fe_group = group; + ac->ac_o_ex.fe_start = block; + ac->ac_o_ex.fe_len = len; + ac->ac_g_ex = ac->ac_o_ex; + ac->ac_flags = ar->flags; + + /* we have to define context: we'll we work with a file or + * locality group. this is a policy, actually */ + ext4_mb_group_or_file(ac); + + mb_debug(1, "init ac: %u blocks @ %u, goal %u, flags %x, 2^%d, " + "left: %u/%u, right %u/%u to %swritable\n", + (unsigned) ar->len, (unsigned) ar->logical, + (unsigned) ar->goal, ac->ac_flags, ac->ac_2order, + (unsigned) ar->lleft, (unsigned) ar->pleft, + (unsigned) ar->lright, (unsigned) ar->pright, + atomic_read(&ar->inode->i_writecount) ? 
"" : "non-"); + return 0; + +} + +static noinline_for_stack void +ext4_mb_discard_lg_preallocations(struct super_block *sb, + struct ext4_locality_group *lg, + int order, int total_entries) +{ + ext4_group_t group = 0; + struct ext4_buddy e4b; + struct list_head discard_list; + struct ext4_prealloc_space *pa, *tmp; + + mb_debug(1, "discard locality group preallocation\n"); + + INIT_LIST_HEAD(&discard_list); + + spin_lock(&lg->lg_prealloc_lock); + list_for_each_entry_rcu(pa, &lg->lg_prealloc_list[order], + pa_inode_list) { + spin_lock(&pa->pa_lock); + if (atomic_read(&pa->pa_count)) { + /* + * This is the pa that we just used + * for block allocation. So don't + * free that + */ + spin_unlock(&pa->pa_lock); + continue; + } + if (pa->pa_deleted) { + spin_unlock(&pa->pa_lock); + continue; + } + /* only lg prealloc space */ + BUG_ON(pa->pa_type != MB_GROUP_PA); + + /* seems this one can be freed ... */ + pa->pa_deleted = 1; + spin_unlock(&pa->pa_lock); + + list_del_rcu(&pa->pa_inode_list); + list_add(&pa->u.pa_tmp_list, &discard_list); + + total_entries--; + if (total_entries <= 5) { + /* + * we want to keep only 5 entries + * allowing it to grow to 8. This + * mak sure we don't call discard + * soon for this list. + */ + break; + } + } + spin_unlock(&lg->lg_prealloc_lock); + + list_for_each_entry_safe(pa, tmp, &discard_list, u.pa_tmp_list) { + + ext4_get_group_no_and_offset(sb, pa->pa_pstart, &group, NULL); + if (ext4_mb_load_buddy(sb, group, &e4b)) { + ext4_error(sb, "Error loading buddy information for %u", + group); + continue; + } + ext4_lock_group(sb, group); + list_del(&pa->pa_group_list); + ext4_mb_release_group_pa(&e4b, pa); + ext4_unlock_group(sb, group); + + ext4_mb_unload_buddy(&e4b); + list_del(&pa->u.pa_tmp_list); + call_rcu(&(pa)->u.pa_rcu, ext4_mb_pa_callback); + } +} + +/* + * We have incremented pa_count. So it cannot be freed at this + * point. Also we hold lg_mutex. So no parallel allocation is + * possible from this lg. 
That means pa_free cannot be updated. + * + * A parallel ext4_mb_discard_group_preallocations is possible. + * which can cause the lg_prealloc_list to be updated. + */ + +static void ext4_mb_add_n_trim(struct ext4_allocation_context *ac) +{ + int order, added = 0, lg_prealloc_count = 1; + struct super_block *sb = ac->ac_sb; + struct ext4_locality_group *lg = ac->ac_lg; + struct ext4_prealloc_space *tmp_pa, *pa = ac->ac_pa; + + order = fls(pa->pa_free) - 1; + if (order > PREALLOC_TB_SIZE - 1) + /* The max size of hash table is PREALLOC_TB_SIZE */ + order = PREALLOC_TB_SIZE - 1; + /* Add the prealloc space to lg */ + rcu_read_lock(); + list_for_each_entry_rcu(tmp_pa, &lg->lg_prealloc_list[order], + pa_inode_list) { + spin_lock(&tmp_pa->pa_lock); + if (tmp_pa->pa_deleted) { + spin_unlock(&tmp_pa->pa_lock); + continue; + } + if (!added && pa->pa_free < tmp_pa->pa_free) { + /* Add to the tail of the previous entry */ + list_add_tail_rcu(&pa->pa_inode_list, + &tmp_pa->pa_inode_list); + added = 1; + /* + * we want to count the total + * number of entries in the list + */ + } + spin_unlock(&tmp_pa->pa_lock); + lg_prealloc_count++; + } + if (!added) + list_add_tail_rcu(&pa->pa_inode_list, + &lg->lg_prealloc_list[order]); + rcu_read_unlock(); + + /* Now trim the list to be not more than 8 elements */ + if (lg_prealloc_count > 8) { + ext4_mb_discard_lg_preallocations(sb, lg, + order, lg_prealloc_count); + return; + } + return ; +} + +/* + * release all resource we used in allocation + */ +static int ext4_mb_release_context(struct ext4_allocation_context *ac) +{ + struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb); + struct ext4_prealloc_space *pa = ac->ac_pa; + if (pa) { + if (pa->pa_type == MB_GROUP_PA) { + /* see comment in ext4_mb_use_group_pa() */ + spin_lock(&pa->pa_lock); + pa->pa_pstart += EXT4_C2B(sbi, ac->ac_b_ex.fe_len); + pa->pa_lstart += EXT4_C2B(sbi, ac->ac_b_ex.fe_len); + pa->pa_free -= ac->ac_b_ex.fe_len; + pa->pa_len -= ac->ac_b_ex.fe_len; + 
spin_unlock(&pa->pa_lock); + } + } + if (pa) { + /* + * We want to add the pa to the right bucket. + * Remove it from the list and while adding + * make sure the list to which we are adding + * doesn't grow big. + */ + if ((pa->pa_type == MB_GROUP_PA) && likely(pa->pa_free)) { + spin_lock(pa->pa_obj_lock); + list_del_rcu(&pa->pa_inode_list); + spin_unlock(pa->pa_obj_lock); + ext4_mb_add_n_trim(ac); + } + ext4_mb_put_pa(ac, ac->ac_sb, pa); + } + if (ac->ac_bitmap_page) + page_cache_release(ac->ac_bitmap_page); + if (ac->ac_buddy_page) + page_cache_release(ac->ac_buddy_page); + if (ac->ac_flags & EXT4_MB_HINT_GROUP_ALLOC) + mutex_unlock(&ac->ac_lg->lg_mutex); + ext4_mb_collect_stats(ac); + return 0; +} + +static int ext4_mb_discard_preallocations(struct super_block *sb, int needed) +{ + ext4_group_t i, ngroups = ext4_get_groups_count(sb); + int ret; + int freed = 0; + + trace_ext4_mb_discard_preallocations(sb, needed); + for (i = 0; i < ngroups && needed > 0; i++) { + ret = ext4_mb_discard_group_preallocations(sb, i, needed); + freed += ret; + needed -= ret; + } + + return freed; +} + +/* + * Main entry point into mballoc to allocate blocks + * it tries to use preallocation first, then falls back + * to usual allocation + */ +ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle, + struct ext4_allocation_request *ar, int *errp) +{ + int freed; + struct ext4_allocation_context *ac = NULL; + struct ext4_sb_info *sbi; + struct super_block *sb; + ext4_fsblk_t block = 0; + unsigned int inquota = 0; + unsigned int reserv_clstrs = 0; + + sb = ar->inode->i_sb; + sbi = EXT4_SB(sb); + + trace_ext4_request_blocks(ar); + + /* Allow to use superuser reservation for quota file */ + if (IS_NOQUOTA(ar->inode)) + ar->flags |= EXT4_MB_USE_ROOT_BLOCKS; + + /* + * For delayed allocation, we could skip the ENOSPC and + * EDQUOT check, as blocks and quotas have been already + * reserved when data being copied into pagecache. 
+ */ + if (ext4_test_inode_state(ar->inode, EXT4_STATE_DELALLOC_RESERVED)) + ar->flags |= EXT4_MB_DELALLOC_RESERVED; + else { + /* Without delayed allocation we need to verify + * there is enough free blocks to do block allocation + * and verify allocation doesn't exceed the quota limits. + */ + while (ar->len && + ext4_claim_free_clusters(sbi, ar->len, ar->flags)) { + + /* let others to free the space */ + yield(); + ar->len = ar->len >> 1; + } + if (!ar->len) { + *errp = -ENOSPC; + return 0; + } + reserv_clstrs = ar->len; + if (ar->flags & EXT4_MB_USE_ROOT_BLOCKS) { + dquot_alloc_block_nofail(ar->inode, + EXT4_C2B(sbi, ar->len)); + } else { + while (ar->len && + dquot_alloc_block(ar->inode, + EXT4_C2B(sbi, ar->len))) { + + ar->flags |= EXT4_MB_HINT_NOPREALLOC; + ar->len--; + } + } + inquota = ar->len; + if (ar->len == 0) { + *errp = -EDQUOT; + goto out; + } + } + + ac = kmem_cache_alloc(ext4_ac_cachep, GFP_NOFS); + if (!ac) { + ar->len = 0; + *errp = -ENOMEM; + goto out; + } + + *errp = ext4_mb_initialize_context(ac, ar); + if (*errp) { + ar->len = 0; + goto out; + } + + ac->ac_op = EXT4_MB_HISTORY_PREALLOC; + if (!ext4_mb_use_preallocated(ac)) { + ac->ac_op = EXT4_MB_HISTORY_ALLOC; + ext4_mb_normalize_request(ac, ar); +repeat: + /* allocate space in core */ + *errp = ext4_mb_regular_allocator(ac); + if (*errp) + goto errout; + + /* as we've just preallocated more space than + * user requested orinally, we store allocated + * space in a special descriptor */ + if (ac->ac_status == AC_STATUS_FOUND && + ac->ac_o_ex.fe_len < ac->ac_b_ex.fe_len) + ext4_mb_new_preallocation(ac); + } + if (likely(ac->ac_status == AC_STATUS_FOUND)) { + *errp = ext4_mb_mark_diskspace_used(ac, handle, reserv_clstrs); + if (*errp == -EAGAIN) { + /* + * drop the reference that we took + * in ext4_mb_use_best_found + */ + ext4_mb_release_context(ac); + ac->ac_b_ex.fe_group = 0; + ac->ac_b_ex.fe_start = 0; + ac->ac_b_ex.fe_len = 0; + ac->ac_status = AC_STATUS_CONTINUE; + goto repeat; + } else 
if (*errp) + errout: + ext4_discard_allocated_blocks(ac); + else { + block = ext4_grp_offs_to_block(sb, &ac->ac_b_ex); + ar->len = ac->ac_b_ex.fe_len; + } + } else { + freed = ext4_mb_discard_preallocations(sb, ac->ac_o_ex.fe_len); + if (freed) + goto repeat; + *errp = -ENOSPC; + } + + if (*errp) { + ac->ac_b_ex.fe_len = 0; + ar->len = 0; + ext4_mb_show_ac(ac); + } + ext4_mb_release_context(ac); +out: + if (ac) + kmem_cache_free(ext4_ac_cachep, ac); + if (inquota && ar->len < inquota) + dquot_free_block(ar->inode, EXT4_C2B(sbi, inquota - ar->len)); + if (!ar->len) { + if (!ext4_test_inode_state(ar->inode, + EXT4_STATE_DELALLOC_RESERVED)) + /* release all the reserved blocks if non delalloc */ + percpu_counter_sub(&sbi->s_dirtyclusters_counter, + reserv_clstrs); + } + + trace_ext4_allocate_blocks(ar, (unsigned long long)block); + + return block; +} + +/* + * We can merge two free data extents only if the physical blocks + * are contiguous, AND the extents were freed by the same transaction, + * AND the blocks are associated with the same group. 
+ */ +static int can_merge(struct ext4_free_data *entry1, + struct ext4_free_data *entry2) +{ + if ((entry1->efd_tid == entry2->efd_tid) && + (entry1->efd_group == entry2->efd_group) && + ((entry1->efd_start_cluster + entry1->efd_count) == entry2->efd_start_cluster)) + return 1; + return 0; +} + +static noinline_for_stack int +ext4_mb_free_metadata(handle_t *handle, struct ext4_buddy *e4b, + struct ext4_free_data *new_entry) +{ + ext4_group_t group = e4b->bd_group; + ext4_grpblk_t cluster; + struct ext4_free_data *entry; + struct ext4_group_info *db = e4b->bd_info; + struct super_block *sb = e4b->bd_sb; + struct ext4_sb_info *sbi = EXT4_SB(sb); + struct rb_node **n = &db->bb_free_root.rb_node, *node; + struct rb_node *parent = NULL, *new_node; + + BUG_ON(!ext4_handle_valid(handle)); + BUG_ON(e4b->bd_bitmap_page == NULL); + BUG_ON(e4b->bd_buddy_page == NULL); + + new_node = &new_entry->efd_node; + cluster = new_entry->efd_start_cluster; + + if (!*n) { + /* first free block exent. We need to + protect buddy cache from being freed, + * otherwise we'll refresh it from + * on-disk bitmap and lose not-yet-available + * blocks */ + page_cache_get(e4b->bd_buddy_page); + page_cache_get(e4b->bd_bitmap_page); + } + while (*n) { + parent = *n; + entry = rb_entry(parent, struct ext4_free_data, efd_node); + if (cluster < entry->efd_start_cluster) + n = &(*n)->rb_left; + else if (cluster >= (entry->efd_start_cluster + entry->efd_count)) + n = &(*n)->rb_right; + else { + ext4_grp_locked_error(sb, group, 0, + ext4_group_first_block_no(sb, group) + + EXT4_C2B(sbi, cluster), + "Block already on to-be-freed list"); + return 0; + } + } + + rb_link_node(new_node, parent, n); + rb_insert_color(new_node, &db->bb_free_root); + + /* Now try to see the extent can be merged to left and right */ + node = rb_prev(new_node); + if (node) { + entry = rb_entry(node, struct ext4_free_data, efd_node); + if (can_merge(entry, new_entry)) { + new_entry->efd_start_cluster = entry->efd_start_cluster; + 
new_entry->efd_count += entry->efd_count; + rb_erase(node, &(db->bb_free_root)); + ext4_journal_callback_del(handle, &entry->efd_jce); + kmem_cache_free(ext4_free_data_cachep, entry); + } + } + + node = rb_next(new_node); + if (node) { + entry = rb_entry(node, struct ext4_free_data, efd_node); + if (can_merge(new_entry, entry)) { + new_entry->efd_count += entry->efd_count; + rb_erase(node, &(db->bb_free_root)); + ext4_journal_callback_del(handle, &entry->efd_jce); + kmem_cache_free(ext4_free_data_cachep, entry); + } + } + /* Add the extent to transaction's private list */ + ext4_journal_callback_add(handle, ext4_free_data_callback, + &new_entry->efd_jce); + return 0; +} + +/** + * ext4_free_blocks() -- Free given blocks and update quota + * @handle: handle for this transaction + * @inode: inode + * @block: start physical block to free + * @count: number of blocks to count + * @flags: flags used by ext4_free_blocks + */ +void ext4_free_blocks(handle_t *handle, struct inode *inode, + struct buffer_head *bh, ext4_fsblk_t block, + unsigned long count, int flags) +{ + struct buffer_head *bitmap_bh = NULL; + struct super_block *sb = inode->i_sb; + struct ext4_group_desc *gdp; + unsigned long freed = 0; + unsigned int overflow; + ext4_grpblk_t bit; + struct buffer_head *gd_bh; + ext4_group_t block_group; + struct ext4_sb_info *sbi; + struct ext4_buddy e4b; + unsigned int count_clusters; + int err = 0; + int ret; + + if (bh) { + if (block) + BUG_ON(block != bh->b_blocknr); + else + block = bh->b_blocknr; + } + + sbi = EXT4_SB(sb); + if (!(flags & EXT4_FREE_BLOCKS_VALIDATED) && + !ext4_data_block_valid(sbi, block, count)) { + ext4_error(sb, "Freeing blocks not in datazone - " + "block = %llu, count = %lu", block, count); + goto error_return; + } + + ext4_debug("freeing block %llu\n", block); + trace_ext4_free_blocks(inode, block, count, flags); + + if (flags & EXT4_FREE_BLOCKS_FORGET) { + struct buffer_head *tbh = bh; + int i; + + BUG_ON(bh && (count > 1)); + + for (i = 0; 
i < count; i++) { + if (!bh) + tbh = sb_find_get_block(inode->i_sb, + block + i); + if (unlikely(!tbh)) + continue; + ext4_forget(handle, flags & EXT4_FREE_BLOCKS_METADATA, + inode, tbh, block + i); + } + } + + /* + * We need to make sure we don't reuse the freed block until + * after the transaction is committed, which we can do by + * treating the block as metadata, below. We make an + * exception if the inode is to be written in writeback mode + * since writeback mode has weak data consistency guarantees. + */ + if (!ext4_should_writeback_data(inode)) + flags |= EXT4_FREE_BLOCKS_METADATA; + + /* + * If the extent to be freed does not begin on a cluster + * boundary, we need to deal with partial clusters at the + * beginning and end of the extent. Normally we will free + * blocks at the beginning or the end unless we are explicitly + * requested to avoid doing so. + */ + overflow = block & (sbi->s_cluster_ratio - 1); + if (overflow) { + if (flags & EXT4_FREE_BLOCKS_NOFREE_FIRST_CLUSTER) { + overflow = sbi->s_cluster_ratio - overflow; + block += overflow; + if (count > overflow) + count -= overflow; + else + return; + } else { + block -= overflow; + count += overflow; + } + } + overflow = count & (sbi->s_cluster_ratio - 1); + if (overflow) { + if (flags & EXT4_FREE_BLOCKS_NOFREE_LAST_CLUSTER) { + if (count > overflow) + count -= overflow; + else + return; + } else + count += sbi->s_cluster_ratio - overflow; + } + +do_more: + overflow = 0; + ext4_get_group_no_and_offset(sb, block, &block_group, &bit); + + /* + * Check to see if we are freeing blocks across a group + * boundary. 
+ */ + if (EXT4_C2B(sbi, bit) + count > EXT4_BLOCKS_PER_GROUP(sb)) { + overflow = EXT4_C2B(sbi, bit) + count - + EXT4_BLOCKS_PER_GROUP(sb); + count -= overflow; + } + count_clusters = EXT4_B2C(sbi, count); + bitmap_bh = ext4_read_block_bitmap(sb, block_group); + if (!bitmap_bh) { + err = -EIO; + goto error_return; + } + gdp = ext4_get_group_desc(sb, block_group, &gd_bh); + if (!gdp) { + err = -EIO; + goto error_return; + } + + if (in_range(ext4_block_bitmap(sb, gdp), block, count) || + in_range(ext4_inode_bitmap(sb, gdp), block, count) || + in_range(block, ext4_inode_table(sb, gdp), + EXT4_SB(sb)->s_itb_per_group) || + in_range(block + count - 1, ext4_inode_table(sb, gdp), + EXT4_SB(sb)->s_itb_per_group)) { + + ext4_error(sb, "Freeing blocks in system zone - " + "Block = %llu, count = %lu", block, count); + /* err = 0. ext4_std_error should be a no op */ + goto error_return; + } + + BUFFER_TRACE(bitmap_bh, "getting write access"); + err = ext4_journal_get_write_access(handle, bitmap_bh); + if (err) + goto error_return; + + /* + * We are about to modify some metadata. Call the journal APIs + * to unshare ->b_data if a currently-committing transaction is + * using it + */ + BUFFER_TRACE(gd_bh, "get_write_access"); + err = ext4_journal_get_write_access(handle, gd_bh); + if (err) + goto error_return; +#ifdef AGGRESSIVE_CHECK + { + int i; + for (i = 0; i < count_clusters; i++) + BUG_ON(!mb_test_bit(bit + i, bitmap_bh->b_data)); + } +#endif + trace_ext4_mballoc_free(sb, inode, block_group, bit, count_clusters); + + err = ext4_mb_load_buddy(sb, block_group, &e4b); + if (err) + goto error_return; + + if ((flags & EXT4_FREE_BLOCKS_METADATA) && ext4_handle_valid(handle)) { + struct ext4_free_data *new_entry; + /* + * blocks being freed are metadata. 
these blocks shouldn't + * be used until this transaction is committed + */ + new_entry = kmem_cache_alloc(ext4_free_data_cachep, GFP_NOFS); + if (!new_entry) { + ext4_mb_unload_buddy(&e4b); + err = -ENOMEM; + goto error_return; + } + new_entry->efd_start_cluster = bit; + new_entry->efd_group = block_group; + new_entry->efd_count = count_clusters; + new_entry->efd_tid = handle->h_transaction->t_tid; + + ext4_lock_group(sb, block_group); + mb_clear_bits(bitmap_bh->b_data, bit, count_clusters); + ext4_mb_free_metadata(handle, &e4b, new_entry); + } else { + /* need to update group_info->bb_free and bitmap + * with group lock held. generate_buddy look at + * them with group lock_held + */ + ext4_lock_group(sb, block_group); + mb_clear_bits(bitmap_bh->b_data, bit, count_clusters); + mb_free_blocks(inode, &e4b, bit, count_clusters); + } + + ret = ext4_free_group_clusters(sb, gdp) + count_clusters; + ext4_free_group_clusters_set(sb, gdp, ret); + gdp->bg_checksum = ext4_group_desc_csum(sbi, block_group, gdp); + ext4_unlock_group(sb, block_group); + percpu_counter_add(&sbi->s_freeclusters_counter, count_clusters); + + if (sbi->s_log_groups_per_flex) { + ext4_group_t flex_group = ext4_flex_group(sbi, block_group); + atomic_add(count_clusters, + &sbi->s_flex_groups[flex_group].free_clusters); + } + + ext4_mb_unload_buddy(&e4b); + + freed += count; + + if (!(flags & EXT4_FREE_BLOCKS_NO_QUOT_UPDATE)) + dquot_free_block(inode, EXT4_C2B(sbi, count_clusters)); + + /* We dirtied the bitmap block */ + BUFFER_TRACE(bitmap_bh, "dirtied bitmap block"); + err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh); + + /* And the group descriptor block */ + BUFFER_TRACE(gd_bh, "dirtied group descriptor block"); + ret = ext4_handle_dirty_metadata(handle, NULL, gd_bh); + if (!err) + err = ret; + + if (overflow && !err) { + block += count; + count = overflow; + put_bh(bitmap_bh); + goto do_more; + } + ext4_mark_super_dirty(sb); +error_return: + brelse(bitmap_bh); + ext4_std_error(sb, err); + 
return; +} + +/** + * ext4_group_add_blocks() -- Add given blocks to an existing group + * @handle: handle to this transaction + * @sb: super block + * @block: start physcial block to add to the block group + * @count: number of blocks to free + * + * This marks the blocks as free in the bitmap and buddy. + */ +int ext4_group_add_blocks(handle_t *handle, struct super_block *sb, + ext4_fsblk_t block, unsigned long count) +{ + struct buffer_head *bitmap_bh = NULL; + struct buffer_head *gd_bh; + ext4_group_t block_group; + ext4_grpblk_t bit; + unsigned int i; + struct ext4_group_desc *desc; + struct ext4_sb_info *sbi = EXT4_SB(sb); + struct ext4_buddy e4b; + int err = 0, ret, blk_free_count; + ext4_grpblk_t blocks_freed; + + ext4_debug("Adding block(s) %llu-%llu\n", block, block + count - 1); + + if (count == 0) + return 0; + + ext4_get_group_no_and_offset(sb, block, &block_group, &bit); + /* + * Check to see if we are freeing blocks across a group + * boundary. + */ + if (bit + count > EXT4_BLOCKS_PER_GROUP(sb)) { + ext4_warning(sb, "too much blocks added to group %u\n", + block_group); + err = -EINVAL; + goto error_return; + } + + bitmap_bh = ext4_read_block_bitmap(sb, block_group); + if (!bitmap_bh) { + err = -EIO; + goto error_return; + } + + desc = ext4_get_group_desc(sb, block_group, &gd_bh); + if (!desc) { + err = -EIO; + goto error_return; + } + + if (in_range(ext4_block_bitmap(sb, desc), block, count) || + in_range(ext4_inode_bitmap(sb, desc), block, count) || + in_range(block, ext4_inode_table(sb, desc), sbi->s_itb_per_group) || + in_range(block + count - 1, ext4_inode_table(sb, desc), + sbi->s_itb_per_group)) { + ext4_error(sb, "Adding blocks in system zones - " + "Block = %llu, count = %lu", + block, count); + err = -EINVAL; + goto error_return; + } + + BUFFER_TRACE(bitmap_bh, "getting write access"); + err = ext4_journal_get_write_access(handle, bitmap_bh); + if (err) + goto error_return; + + /* + * We are about to modify some metadata. 
Call the journal APIs + * to unshare ->b_data if a currently-committing transaction is + * using it + */ + BUFFER_TRACE(gd_bh, "get_write_access"); + err = ext4_journal_get_write_access(handle, gd_bh); + if (err) + goto error_return; + + for (i = 0, blocks_freed = 0; i < count; i++) { + BUFFER_TRACE(bitmap_bh, "clear bit"); + if (!mb_test_bit(bit + i, bitmap_bh->b_data)) { + ext4_error(sb, "bit already cleared for block %llu", + (ext4_fsblk_t)(block + i)); + BUFFER_TRACE(bitmap_bh, "bit already cleared"); + } else { + blocks_freed++; + } + } + + err = ext4_mb_load_buddy(sb, block_group, &e4b); + if (err) + goto error_return; + + /* + * need to update group_info->bb_free and bitmap + * with group lock held. generate_buddy look at + * them with group lock_held + */ + ext4_lock_group(sb, block_group); + mb_clear_bits(bitmap_bh->b_data, bit, count); + mb_free_blocks(NULL, &e4b, bit, count); + blk_free_count = blocks_freed + ext4_free_group_clusters(sb, desc); + ext4_free_group_clusters_set(sb, desc, blk_free_count); + desc->bg_checksum = ext4_group_desc_csum(sbi, block_group, desc); + ext4_unlock_group(sb, block_group); + percpu_counter_add(&sbi->s_freeclusters_counter, + EXT4_B2C(sbi, blocks_freed)); + + if (sbi->s_log_groups_per_flex) { + ext4_group_t flex_group = ext4_flex_group(sbi, block_group); + atomic_add(EXT4_B2C(sbi, blocks_freed), + &sbi->s_flex_groups[flex_group].free_clusters); + } + + ext4_mb_unload_buddy(&e4b); + + /* We dirtied the bitmap block */ + BUFFER_TRACE(bitmap_bh, "dirtied bitmap block"); + err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh); + + /* And the group descriptor block */ + BUFFER_TRACE(gd_bh, "dirtied group descriptor block"); + ret = ext4_handle_dirty_metadata(handle, NULL, gd_bh); + if (!err) + err = ret; + +error_return: + brelse(bitmap_bh); + ext4_std_error(sb, err); + return err; +} + +/** + * ext4_trim_extent -- function to TRIM one single free extent in the group + * @sb: super block for the file system + * @start: 
starting block of the free extent in the alloc. group
+ * @count:	number of blocks to TRIM
+ * @group:	alloc. group we are working with
+ * @e4b:	ext4 buddy for the group
+ *
+ * Discard "count" blocks beginning at "start" inside "group". The range is
+ * first marked as used in the buddy bitmap so that nobody can allocate it
+ * while the discard is in progress, and it is handed back to the buddy once
+ * the discard has been issued. The caller must hold the group lock; it is
+ * released around the discard and held again on return.
+ */
+static void ext4_trim_extent(struct super_block *sb, int start, int count,
+			     ext4_group_t group, struct ext4_buddy *e4b)
+{
+	/* Extent describing exactly the range we are about to discard. */
+	struct ext4_free_extent ex = {
+		.fe_start = start,
+		.fe_group = group,
+		.fe_len   = count,
+	};
+
+	trace_ext4_trim_extent(sb, group, start, count);
+
+	assert_spin_locked(ext4_group_lock_ptr(sb, group));
+
+	/* Reserve the range in the buddy so it cannot be reused meanwhile. */
+	mb_mark_used(e4b, &ex);
+
+	/* The discard is issued without the group lock held. */
+	ext4_unlock_group(sb, group);
+	ext4_issue_discard(sb, group, start, count);
+	ext4_lock_group(sb, group);
+
+	/* Give the range back to the buddy as free space. */
+	mb_free_blocks(NULL, e4b, start, ex.fe_len);
+}
+
+/** + * ext4_trim_all_free -- function to trim all free space in alloc. group + * @sb: super block for file system + * @group: group to be trimmed + * @start: first group block to examine + * @max: last group block to examine + * @minblocks: minimum extent block count + * + * ext4_trim_all_free walks through group's buddy bitmap searching for free + * extents. When the free block is found, ext4_trim_extent is called to TRIM + * the extent. + * + * + * ext4_trim_all_free walks through group's block bitmap searching for free + * extents. When the free extent is found, mark it as used in group buddy + * bitmap. Then issue a TRIM command on this extent and free the extent in + * the group buddy bitmap. This is done until whole group is scanned.
+ *
+ * Return: the number of blocks trimmed in this group (0 when the group was
+ * skipped because an earlier pass already trimmed it), or a negative errno
+ * when the buddy could not be loaded or a fatal signal interrupted the walk.
+ */
+static ext4_grpblk_t
+ext4_trim_all_free(struct super_block *sb, ext4_group_t group,
+		   ext4_grpblk_t start, ext4_grpblk_t max,
+		   ext4_grpblk_t minblocks)
+{
+	void *bitmap;
+	ext4_grpblk_t next, count = 0, free_count = 0;
+	struct ext4_buddy e4b;
+	int ret;
+
+	trace_ext4_trim_all_free(sb, group, start, max);
+
+	ret = ext4_mb_load_buddy(sb, group, &e4b);
+	if (ret) {
+		ext4_error(sb, "Error in loading buddy "
+				"information for %u", group);
+		return ret;
+	}
+	bitmap = e4b.bd_bitmap;
+
+	ext4_lock_group(sb, group);
+
+	/*
+	 * Nothing to do if the group is already flagged as trimmed and this
+	 * request's minblocks is not below the recorded s_last_trim_minblks.
+	 */
+	if (EXT4_MB_GRP_WAS_TRIMMED(e4b.bd_info) &&
+	    minblocks >= atomic_read(&EXT4_SB(sb)->s_last_trim_minblks))
+		goto out;
+
+	/* Begin no earlier than the group's recorded first free block. */
+	start = (e4b.bd_info->bb_first_free > start) ?
+		e4b.bd_info->bb_first_free : start;
+
+	while (start <= max) {
+		/* [start, next) is the next run of clear (free) bits. */
+		start = mb_find_next_zero_bit(bitmap, max + 1, start);
+		if (start > max)
+			break;
+		next = mb_find_next_bit(bitmap, max + 1, start);
+
+		if ((next - start) >= minblocks) {
+			ext4_trim_extent(sb, start,
+					 next - start, group, &e4b);
+			count += next - start;
+		}
+		free_count += next - start;
+		start = next + 1;
+
+		if (fatal_signal_pending(current)) {
+			/*
+			 * Report the interruption through ret (not count) so
+			 * that the partially walked group is NOT flagged as
+			 * trimmed below.
+			 */
+			ret = -ERESTARTSYS;
+			break;
+		}
+
+		if (need_resched()) {
+			ext4_unlock_group(sb, group);
+			cond_resched();
+			ext4_lock_group(sb, group);
+		}
+
+		/* The free blocks left cannot form a minblocks extent. */
+		if ((e4b.bd_info->bb_free - free_count) < minblocks)
+			break;
+	}
+
+	if (!ret) {
+		ret = count;
+		EXT4_MB_GRP_SET_TRIMMED(e4b.bd_info);
+	}
+out:
+	ext4_unlock_group(sb, group);
+	ext4_mb_unload_buddy(&e4b);
+
+	ext4_debug("trimmed %d blocks in the group %d\n",
+		count, group);
+
+	return ret;
+}
+
+/** + * ext4_trim_fs() -- trim ioctl handle function + * @sb: superblock for filesystem + * @range: fstrim_range structure + * + * start: First Byte to trim + * len: number of Bytes to trim from start + * minlen: minimum extent length in Bytes + * ext4_trim_fs goes through all allocation groups containing Bytes from + * start to start+len. For each such a group ext4_trim_all_free function + * is invoked to trim all free space.
+ *
+ * A range shorter than one filesystem block, a start beyond the end of the
+ * filesystem, or a minlen larger than a group is rejected with -EINVAL.
+ * Otherwise range->len is overwritten with the number of bytes trimmed and
+ * 0 or a negative errno from the per-group work is returned.
+ */
+int ext4_trim_fs(struct super_block *sb, struct fstrim_range *range)
+{
+	struct ext4_group_info *grp;
+	ext4_group_t group, first_group, last_group;
+	ext4_grpblk_t cnt = 0, first_cluster, last_cluster;
+	uint64_t start, end, minlen, trimmed = 0;
+	ext4_fsblk_t first_data_blk =
+			le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block);
+	ext4_fsblk_t max_blks = ext4_blocks_count(EXT4_SB(sb)->s_es);
+	int ret = 0;
+
+	/* Convert the byte based request into filesystem blocks. */
+	start = range->start >> sb->s_blocksize_bits;
+	end = start + (range->len >> sb->s_blocksize_bits) - 1;
+	minlen = range->minlen >> sb->s_blocksize_bits;
+
+	/*
+	 * A length below one block makes "end" equal to start - 1: for
+	 * start == 0 that wraps around and would be clamped to the whole
+	 * filesystem below, so such a request is refused up front.
+	 */
+	if (unlikely(minlen > EXT4_CLUSTERS_PER_GROUP(sb)) ||
+	    unlikely(start >= max_blks) ||
+	    unlikely(range->len < sb->s_blocksize))
+		return -EINVAL;
+	if (end >= max_blks)
+		end = max_blks - 1;
+	if (end <= first_data_blk)
+		goto out;
+	if (start < first_data_blk)
+		start = first_data_blk;
+
+	/* Determine first and last group to examine based on start and end */
+	ext4_get_group_no_and_offset(sb, (ext4_fsblk_t) start,
+				     &first_group, &first_cluster);
+	ext4_get_group_no_and_offset(sb, (ext4_fsblk_t) end,
+				     &last_group, &last_cluster);
+
+	/* end now represents the last cluster to discard in this group */
+	end = EXT4_CLUSTERS_PER_GROUP(sb) - 1;
+
+	for (group = first_group; group <= last_group; group++) {
+		grp = ext4_get_group_info(sb, group);
+		/* We only do this if the grp has never been initialized */
+		if (unlikely(EXT4_MB_GRP_NEED_INIT(grp))) {
+			ret = ext4_mb_init_group(sb, group);
+			if (ret)
+				break;
+		}
+
+		/*
+		 * For all the groups except the last one, last cluster will
+		 * always be EXT4_CLUSTERS_PER_GROUP(sb)-1, so we only need to
+		 * change it for the last group, note that last_cluster is
+		 * already computed earlier by ext4_get_group_no_and_offset()
+		 */
+		if (group == last_group)
+			end = last_cluster;
+
+		/* Groups with less free space than minlen cannot qualify. */
+		if (grp->bb_free >= minlen) {
+			cnt = ext4_trim_all_free(sb, group, first_cluster,
+						end, minlen);
+			if (cnt < 0) {
+				ret = cnt;
+				break;
+			}
+			trimmed += cnt;
+		}
+
+		/*
+		 * For every group except the first one, we are sure
+		 * that the first cluster to discard will be cluster #0.
+		 */
+		first_cluster = 0;
+	}
+
+	/* Remember the threshold of the last complete pass. */
+	if (!ret)
+		atomic_set(&EXT4_SB(sb)->s_last_trim_minblks, minlen);
+
+out:
+	range->len = trimmed * sb->s_blocksize;
+	return ret;
+}
diff --git a/fs/ext4/mballoc.h b/fs/ext4/mballoc.h new file mode 100644 index 00000000..c070618c --- /dev/null +++ b/fs/ext4/mballoc.h @@ -0,0 +1,222 @@ +/* + * fs/ext4/mballoc.h + * + * Written by: Alex Tomas + * + */ +#ifndef _EXT4_MBALLOC_H +#define _EXT4_MBALLOC_H + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "ext4_jbd2.h" +#include "ext4.h" + +/* + * with AGGRESSIVE_CHECK allocator runs consistency checks over + * structures. these checks slow things down a lot + */ +#define AGGRESSIVE_CHECK__ + +/* + * with DOUBLE_CHECK defined mballoc creates persistent in-core + * bitmaps, maintains and uses them to check for double allocations + */ +#define DOUBLE_CHECK__ + +/* + */ +#ifdef CONFIG_EXT4_DEBUG +extern u8 mb_enable_debug; + +#define mb_debug(n, fmt, a...) \ + do { \ + if ((n) <= mb_enable_debug) { \ + printk(KERN_DEBUG "(%s, %d): %s: ", \ + __FILE__, __LINE__, __func__); \ + printk(fmt, ## a); \ + } \ + } while (0) +#else +#define mb_debug(n, fmt, a...) +#endif + +#define EXT4_MB_HISTORY_ALLOC 1 /* allocation */ +#define EXT4_MB_HISTORY_PREALLOC 2 /* preallocated blocks used */ + +/* + * How long mballoc can look for a best extent (in found extents) + */ +#define MB_DEFAULT_MAX_TO_SCAN 200 + +/* + * How long mballoc must look for a best extent + */ +#define MB_DEFAULT_MIN_TO_SCAN 10 + +/* + * How many groups mballoc will scan looking for the best chunk + */ +#define MB_DEFAULT_MAX_GROUPS_TO_SCAN 5 + +/* + * with 'ext4_mb_stats' allocator will collect stats that will be + * shown at umount. The collecting costs though!
+ */ +#define MB_DEFAULT_STATS 0 + +/* + * files smaller than MB_DEFAULT_STREAM_THRESHOLD are served + * by the stream allocator, which purpose is to pack requests + * as close each to other as possible to produce smooth I/O traffic + * We use locality group prealloc space for stream request. + * We can tune the same via /proc/fs/ext4//stream_req + */ +#define MB_DEFAULT_STREAM_THRESHOLD 16 /* 64K */ + +/* + * for which requests use 2^N search using buddies + */ +#define MB_DEFAULT_ORDER2_REQS 2 + +/* + * default group prealloc size 512 blocks + */ +#define MB_DEFAULT_GROUP_PREALLOC 512 + + +struct ext4_free_data { + /* MUST be the first member */ + struct ext4_journal_cb_entry efd_jce; + + /* ext4_free_data private data starts from here */ + + /* this links the free block information from group_info */ + struct rb_node efd_node; + + /* group which free block extent belongs */ + ext4_group_t efd_group; + + /* free block extent */ + ext4_grpblk_t efd_start_cluster; + ext4_grpblk_t efd_count; + + /* transaction which freed this extent */ + tid_t efd_tid; +}; + +struct ext4_prealloc_space { + struct list_head pa_inode_list; + struct list_head pa_group_list; + union { + struct list_head pa_tmp_list; + struct rcu_head pa_rcu; + } u; + spinlock_t pa_lock; + atomic_t pa_count; + unsigned pa_deleted; + ext4_fsblk_t pa_pstart; /* phys. block */ + ext4_lblk_t pa_lstart; /* log. block */ + ext4_grpblk_t pa_len; /* len of preallocated chunk */ + ext4_grpblk_t pa_free; /* how many blocks are free */ + unsigned short pa_type; /* pa type. 
inode or group */ + spinlock_t *pa_obj_lock; + struct inode *pa_inode; /* hack, for history only */ +}; + +enum { + MB_INODE_PA = 0, + MB_GROUP_PA = 1 +}; + +struct ext4_free_extent { + ext4_lblk_t fe_logical; + ext4_grpblk_t fe_start; /* In cluster units */ + ext4_group_t fe_group; + ext4_grpblk_t fe_len; /* In cluster units */ +}; + +/* + * Locality group: + * we try to group all related changes together + * so that writeback can flush/allocate them together as well + * Size of lg_prealloc_list hash is determined by MB_DEFAULT_GROUP_PREALLOC + * (512). We store prealloc space into the hash based on the pa_free blocks + * order value.ie, fls(pa_free)-1; + */ +#define PREALLOC_TB_SIZE 10 +struct ext4_locality_group { + /* for allocator */ + /* to serialize allocates */ + struct mutex lg_mutex; + /* list of preallocations */ + struct list_head lg_prealloc_list[PREALLOC_TB_SIZE]; + spinlock_t lg_prealloc_lock; +}; + +struct ext4_allocation_context { + struct inode *ac_inode; + struct super_block *ac_sb; + + /* original request */ + struct ext4_free_extent ac_o_ex; + + /* goal request (normalized ac_o_ex) */ + struct ext4_free_extent ac_g_ex; + + /* the best found extent */ + struct ext4_free_extent ac_b_ex; + + /* copy of the best found extent taken before preallocation efforts */ + struct ext4_free_extent ac_f_ex; + + /* number of iterations done. 
we have to track to limit searching */ + unsigned long ac_ex_scanned; + __u16 ac_groups_scanned; + __u16 ac_found; + __u16 ac_tail; + __u16 ac_buddy; + __u16 ac_flags; /* allocation hints */ + __u8 ac_status; + __u8 ac_criteria; + __u8 ac_2order; /* if request is to allocate 2^N blocks and + * N > 0, the field stores N, otherwise 0 */ + __u8 ac_op; /* operation, for history only */ + struct page *ac_bitmap_page; + struct page *ac_buddy_page; + struct ext4_prealloc_space *ac_pa; + struct ext4_locality_group *ac_lg; +}; + +#define AC_STATUS_CONTINUE 1 +#define AC_STATUS_FOUND 2 +#define AC_STATUS_BREAK 3 + +struct ext4_buddy { + struct page *bd_buddy_page; + void *bd_buddy; + struct page *bd_bitmap_page; + void *bd_bitmap; + struct ext4_group_info *bd_info; + struct super_block *bd_sb; + __u16 bd_blkbits; + ext4_group_t bd_group; +}; + +static inline ext4_fsblk_t ext4_grp_offs_to_block(struct super_block *sb, + struct ext4_free_extent *fex) +{ + return ext4_group_first_block_no(sb, fex->fe_group) + + (fex->fe_start << EXT4_SB(sb)->s_cluster_bits); +} +#endif diff --git a/fs/ext4/migrate.c b/fs/ext4/migrate.c new file mode 100644 index 00000000..f39f80f8 --- /dev/null +++ b/fs/ext4/migrate.c @@ -0,0 +1,604 @@ +/* + * Copyright IBM Corporation, 2007 + * Author Aneesh Kumar K.V + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of version 2.1 of the GNU Lesser General Public License + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
+ * + */ + +#include +#include "ext4_jbd2.h" + +/* + * The contiguous blocks details which can be + * represented by a single extent + */ +struct migrate_struct { + ext4_lblk_t first_block, last_block, curr_block; + ext4_fsblk_t first_pblock, last_pblock; +}; + +static int finish_range(handle_t *handle, struct inode *inode, + struct migrate_struct *lb) + +{ + int retval = 0, needed; + struct ext4_extent newext; + struct ext4_ext_path *path; + if (lb->first_pblock == 0) + return 0; + + /* Add the extent to temp inode*/ + newext.ee_block = cpu_to_le32(lb->first_block); + newext.ee_len = cpu_to_le16(lb->last_block - lb->first_block + 1); + ext4_ext_store_pblock(&newext, lb->first_pblock); + path = ext4_ext_find_extent(inode, lb->first_block, NULL); + + if (IS_ERR(path)) { + retval = PTR_ERR(path); + path = NULL; + goto err_out; + } + + /* + * Calculate the credit needed to inserting this extent + * Since we are doing this in loop we may accumalate extra + * credit. But below we try to not accumalate too much + * of them by restarting the journal. 
+ */ + needed = ext4_ext_calc_credits_for_single_extent(inode, + lb->last_block - lb->first_block + 1, path); + + /* + * Make sure the credit we accumalated is not really high + */ + if (needed && ext4_handle_has_enough_credits(handle, + EXT4_RESERVE_TRANS_BLOCKS)) { + retval = ext4_journal_restart(handle, needed); + if (retval) + goto err_out; + } else if (needed) { + retval = ext4_journal_extend(handle, needed); + if (retval) { + /* + * IF not able to extend the journal restart the journal + */ + retval = ext4_journal_restart(handle, needed); + if (retval) + goto err_out; + } + } + retval = ext4_ext_insert_extent(handle, inode, path, &newext, 0); +err_out: + if (path) { + ext4_ext_drop_refs(path); + kfree(path); + } + lb->first_pblock = 0; + return retval; +} + +static int update_extent_range(handle_t *handle, struct inode *inode, + ext4_fsblk_t pblock, struct migrate_struct *lb) +{ + int retval; + /* + * See if we can add on to the existing range (if it exists) + */ + if (lb->first_pblock && + (lb->last_pblock+1 == pblock) && + (lb->last_block+1 == lb->curr_block)) { + lb->last_pblock = pblock; + lb->last_block = lb->curr_block; + lb->curr_block++; + return 0; + } + /* + * Start a new range. 
+ */ + retval = finish_range(handle, inode, lb); + lb->first_pblock = lb->last_pblock = pblock; + lb->first_block = lb->last_block = lb->curr_block; + lb->curr_block++; + return retval; +} + +static int update_ind_extent_range(handle_t *handle, struct inode *inode, + ext4_fsblk_t pblock, + struct migrate_struct *lb) +{ + struct buffer_head *bh; + __le32 *i_data; + int i, retval = 0; + unsigned long max_entries = inode->i_sb->s_blocksize >> 2; + + bh = sb_bread(inode->i_sb, pblock); + if (!bh) + return -EIO; + + i_data = (__le32 *)bh->b_data; + for (i = 0; i < max_entries; i++) { + if (i_data[i]) { + retval = update_extent_range(handle, inode, + le32_to_cpu(i_data[i]), lb); + if (retval) + break; + } else { + lb->curr_block++; + } + } + put_bh(bh); + return retval; + +} + +static int update_dind_extent_range(handle_t *handle, struct inode *inode, + ext4_fsblk_t pblock, + struct migrate_struct *lb) +{ + struct buffer_head *bh; + __le32 *i_data; + int i, retval = 0; + unsigned long max_entries = inode->i_sb->s_blocksize >> 2; + + bh = sb_bread(inode->i_sb, pblock); + if (!bh) + return -EIO; + + i_data = (__le32 *)bh->b_data; + for (i = 0; i < max_entries; i++) { + if (i_data[i]) { + retval = update_ind_extent_range(handle, inode, + le32_to_cpu(i_data[i]), lb); + if (retval) + break; + } else { + /* Only update the file block number */ + lb->curr_block += max_entries; + } + } + put_bh(bh); + return retval; + +} + +static int update_tind_extent_range(handle_t *handle, struct inode *inode, + ext4_fsblk_t pblock, + struct migrate_struct *lb) +{ + struct buffer_head *bh; + __le32 *i_data; + int i, retval = 0; + unsigned long max_entries = inode->i_sb->s_blocksize >> 2; + + bh = sb_bread(inode->i_sb, pblock); + if (!bh) + return -EIO; + + i_data = (__le32 *)bh->b_data; + for (i = 0; i < max_entries; i++) { + if (i_data[i]) { + retval = update_dind_extent_range(handle, inode, + le32_to_cpu(i_data[i]), lb); + if (retval) + break; + } else { + /* Only update the file block 
number */ + lb->curr_block += max_entries * max_entries; + } + } + put_bh(bh); + return retval; + +} + +static int extend_credit_for_blkdel(handle_t *handle, struct inode *inode) +{ + int retval = 0, needed; + + if (ext4_handle_has_enough_credits(handle, EXT4_RESERVE_TRANS_BLOCKS+1)) + return 0; + /* + * We are freeing a blocks. During this we touch + * superblock, group descriptor and block bitmap. + * So allocate a credit of 3. We may update + * quota (user and group). + */ + needed = 3 + EXT4_MAXQUOTAS_TRANS_BLOCKS(inode->i_sb); + + if (ext4_journal_extend(handle, needed) != 0) + retval = ext4_journal_restart(handle, needed); + + return retval; +}
+
+/*
+ * Free the block numbered @i_data together with every block that one of its
+ * non-zero 32-bit entries points to. The entries are freed as single blocks;
+ * they are not descended into.
+ *
+ * Returns 0 on success, -EIO if the block cannot be read, or the error from
+ * extend_credit_for_blkdel() when journal credits for a free could not be
+ * obtained (in which case the remaining blocks are left untouched).
+ */
+static int free_dind_blocks(handle_t *handle,
+				struct inode *inode, __le32 i_data)
+{
+	int i, retval;
+	__le32 *tmp_idata;
+	struct buffer_head *bh;
+	unsigned long max_entries = inode->i_sb->s_blocksize >> 2;
+
+	bh = sb_bread(inode->i_sb, le32_to_cpu(i_data));
+	if (!bh)
+		return -EIO;
+
+	tmp_idata = (__le32 *)bh->b_data;
+	for (i = 0; i < max_entries; i++) {
+		if (tmp_idata[i]) {
+			/* Never free a block without credits to journal it. */
+			retval = extend_credit_for_blkdel(handle, inode);
+			if (retval) {
+				put_bh(bh);
+				return retval;
+			}
+			ext4_free_blocks(handle, inode, NULL,
+					 le32_to_cpu(tmp_idata[i]), 1,
+					 EXT4_FREE_BLOCKS_METADATA |
+					 EXT4_FREE_BLOCKS_FORGET);
+		}
+	}
+	put_bh(bh);
+
+	/* Finally release the block that held the entries. */
+	retval = extend_credit_for_blkdel(handle, inode);
+	if (retval)
+		return retval;
+	ext4_free_blocks(handle, inode, NULL, le32_to_cpu(i_data), 1,
+			 EXT4_FREE_BLOCKS_METADATA |
+			 EXT4_FREE_BLOCKS_FORGET);
+	return 0;
+}
+
+static int free_tind_blocks(handle_t *handle, + struct inode *inode, __le32 i_data) +{ + int i, retval = 0; + __le32 *tmp_idata; + struct buffer_head *bh; + unsigned long max_entries = inode->i_sb->s_blocksize >> 2; + + bh = sb_bread(inode->i_sb, le32_to_cpu(i_data)); + if (!bh) + return -EIO; + + tmp_idata = (__le32 *)bh->b_data; + for (i = 0; i < max_entries; i++) { + if (tmp_idata[i]) { + retval = free_dind_blocks(handle, + inode, tmp_idata[i]); + if (retval) { + put_bh(bh); + return retval; + } + } + } + put_bh(bh); + extend_credit_for_blkdel(handle, inode); +
ext4_free_blocks(handle, inode, NULL, le32_to_cpu(i_data), 1, + EXT4_FREE_BLOCKS_METADATA | + EXT4_FREE_BLOCKS_FORGET); + return 0; +} + +static int free_ind_block(handle_t *handle, struct inode *inode, __le32 *i_data) +{ + int retval; + + /* ei->i_data[EXT4_IND_BLOCK] */ + if (i_data[0]) { + extend_credit_for_blkdel(handle, inode); + ext4_free_blocks(handle, inode, NULL, + le32_to_cpu(i_data[0]), 1, + EXT4_FREE_BLOCKS_METADATA | + EXT4_FREE_BLOCKS_FORGET); + } + + /* ei->i_data[EXT4_DIND_BLOCK] */ + if (i_data[1]) { + retval = free_dind_blocks(handle, inode, i_data[1]); + if (retval) + return retval; + } + + /* ei->i_data[EXT4_TIND_BLOCK] */ + if (i_data[2]) { + retval = free_tind_blocks(handle, inode, i_data[2]); + if (retval) + return retval; + } + return 0; +} + +static int ext4_ext_swap_inode_data(handle_t *handle, struct inode *inode, + struct inode *tmp_inode) +{ + int retval; + __le32 i_data[3]; + struct ext4_inode_info *ei = EXT4_I(inode); + struct ext4_inode_info *tmp_ei = EXT4_I(tmp_inode); + + /* + * One credit accounted for writing the + * i_data field of the original inode + */ + retval = ext4_journal_extend(handle, 1); + if (retval) { + retval = ext4_journal_restart(handle, 1); + if (retval) + goto err_out; + } + + i_data[0] = ei->i_data[EXT4_IND_BLOCK]; + i_data[1] = ei->i_data[EXT4_DIND_BLOCK]; + i_data[2] = ei->i_data[EXT4_TIND_BLOCK]; + + down_write(&EXT4_I(inode)->i_data_sem); + /* + * if EXT4_STATE_EXT_MIGRATE is cleared a block allocation + * happened after we started the migrate. We need to + * fail the migrate + */ + if (!ext4_test_inode_state(inode, EXT4_STATE_EXT_MIGRATE)) { + retval = -EAGAIN; + up_write(&EXT4_I(inode)->i_data_sem); + goto err_out; + } else + ext4_clear_inode_state(inode, EXT4_STATE_EXT_MIGRATE); + /* + * We have the extent map build with the tmp inode. 
+ * Now copy the i_data across + */ + ext4_set_inode_flag(inode, EXT4_INODE_EXTENTS); + memcpy(ei->i_data, tmp_ei->i_data, sizeof(ei->i_data)); + + /* + * Update i_blocks with the new blocks that got + * allocated while adding extents for extent index + * blocks. + * + * While converting to extents we need not + * update the orignal inode i_blocks for extent blocks + * via quota APIs. The quota update happened via tmp_inode already. + */ + spin_lock(&inode->i_lock); + inode->i_blocks += tmp_inode->i_blocks; + spin_unlock(&inode->i_lock); + up_write(&EXT4_I(inode)->i_data_sem); + + /* + * We mark the inode dirty after, because we decrement the + * i_blocks when freeing the indirect meta-data blocks + */ + retval = free_ind_block(handle, inode, i_data); + ext4_mark_inode_dirty(handle, inode); + +err_out: + return retval; +} + +static int free_ext_idx(handle_t *handle, struct inode *inode, + struct ext4_extent_idx *ix) +{ + int i, retval = 0; + ext4_fsblk_t block; + struct buffer_head *bh; + struct ext4_extent_header *eh; + + block = ext4_idx_pblock(ix); + bh = sb_bread(inode->i_sb, block); + if (!bh) + return -EIO; + + eh = (struct ext4_extent_header *)bh->b_data; + if (eh->eh_depth != 0) { + ix = EXT_FIRST_INDEX(eh); + for (i = 0; i < le16_to_cpu(eh->eh_entries); i++, ix++) { + retval = free_ext_idx(handle, inode, ix); + if (retval) + break; + } + } + put_bh(bh); + extend_credit_for_blkdel(handle, inode); + ext4_free_blocks(handle, inode, NULL, block, 1, + EXT4_FREE_BLOCKS_METADATA | EXT4_FREE_BLOCKS_FORGET); + return retval; +} + +/* + * Free the extent meta data blocks only + */ +static int free_ext_block(handle_t *handle, struct inode *inode) +{ + int i, retval = 0; + struct ext4_inode_info *ei = EXT4_I(inode); + struct ext4_extent_header *eh = (struct ext4_extent_header *)ei->i_data; + struct ext4_extent_idx *ix; + if (eh->eh_depth == 0) + /* + * No extra blocks allocated for extent meta data + */ + return 0; + ix = EXT_FIRST_INDEX(eh); + for (i = 0; i < 
le16_to_cpu(eh->eh_entries); i++, ix++) { + retval = free_ext_idx(handle, inode, ix); + if (retval) + return retval; + } + return retval; + +} + +int ext4_ext_migrate(struct inode *inode) +{ + handle_t *handle; + int retval = 0, i; + __le32 *i_data; + struct ext4_inode_info *ei; + struct inode *tmp_inode = NULL; + struct migrate_struct lb; + unsigned long max_entries; + __u32 goal; + uid_t owner[2]; + + /* + * If the filesystem does not support extents, or the inode + * already is extent-based, error out. + */ + if (!EXT4_HAS_INCOMPAT_FEATURE(inode->i_sb, + EXT4_FEATURE_INCOMPAT_EXTENTS) || + (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) + return -EINVAL; + + if (S_ISLNK(inode->i_mode) && inode->i_blocks == 0) + /* + * don't migrate fast symlink + */ + return retval; + + handle = ext4_journal_start(inode, + EXT4_DATA_TRANS_BLOCKS(inode->i_sb) + + EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3 + + EXT4_MAXQUOTAS_INIT_BLOCKS(inode->i_sb) + + 1); + if (IS_ERR(handle)) { + retval = PTR_ERR(handle); + return retval; + } + goal = (((inode->i_ino - 1) / EXT4_INODES_PER_GROUP(inode->i_sb)) * + EXT4_INODES_PER_GROUP(inode->i_sb)) + 1; + owner[0] = inode->i_uid; + owner[1] = inode->i_gid; + tmp_inode = ext4_new_inode(handle, inode->i_sb->s_root->d_inode, + S_IFREG, NULL, goal, owner); + if (IS_ERR(tmp_inode)) { + retval = PTR_ERR(tmp_inode); + ext4_journal_stop(handle); + return retval; + } + i_size_write(tmp_inode, i_size_read(inode)); + /* + * Set the i_nlink to zero so it will be deleted later + * when we drop inode reference. + */ + clear_nlink(tmp_inode); + + ext4_ext_tree_init(handle, tmp_inode); + ext4_orphan_add(handle, tmp_inode); + ext4_journal_stop(handle); + + /* + * start with one credit accounted for + * superblock modification. + * + * For the tmp_inode we already have committed the + * trascation that created the inode. 
Later as and + * when we add extents we extent the journal + */ + /* + * Even though we take i_mutex we can still cause block + * allocation via mmap write to holes. If we have allocated + * new blocks we fail migrate. New block allocation will + * clear EXT4_STATE_EXT_MIGRATE flag. The flag is updated + * with i_data_sem held to prevent racing with block + * allocation. + */ + down_read((&EXT4_I(inode)->i_data_sem)); + ext4_set_inode_state(inode, EXT4_STATE_EXT_MIGRATE); + up_read((&EXT4_I(inode)->i_data_sem)); + + handle = ext4_journal_start(inode, 1); + if (IS_ERR(handle)) { + /* + * It is impossible to update on-disk structures without + * a handle, so just rollback in-core changes and live other + * work to orphan_list_cleanup() + */ + ext4_orphan_del(NULL, tmp_inode); + retval = PTR_ERR(handle); + goto out; + } + + ei = EXT4_I(inode); + i_data = ei->i_data; + memset(&lb, 0, sizeof(lb)); + + /* 32 bit block address 4 bytes */ + max_entries = inode->i_sb->s_blocksize >> 2; + for (i = 0; i < EXT4_NDIR_BLOCKS; i++) { + if (i_data[i]) { + retval = update_extent_range(handle, tmp_inode, + le32_to_cpu(i_data[i]), &lb); + if (retval) + goto err_out; + } else + lb.curr_block++; + } + if (i_data[EXT4_IND_BLOCK]) { + retval = update_ind_extent_range(handle, tmp_inode, + le32_to_cpu(i_data[EXT4_IND_BLOCK]), &lb); + if (retval) + goto err_out; + } else + lb.curr_block += max_entries; + if (i_data[EXT4_DIND_BLOCK]) { + retval = update_dind_extent_range(handle, tmp_inode, + le32_to_cpu(i_data[EXT4_DIND_BLOCK]), &lb); + if (retval) + goto err_out; + } else + lb.curr_block += max_entries * max_entries; + if (i_data[EXT4_TIND_BLOCK]) { + retval = update_tind_extent_range(handle, tmp_inode, + le32_to_cpu(i_data[EXT4_TIND_BLOCK]), &lb); + if (retval) + goto err_out; + } + /* + * Build the last extent + */ + retval = finish_range(handle, tmp_inode, &lb); +err_out: + if (retval) + /* + * Failure case delete the extent information with the + * tmp_inode + */ + 
free_ext_block(handle, tmp_inode); + else { + retval = ext4_ext_swap_inode_data(handle, inode, tmp_inode); + if (retval) + /* + * if we fail to swap inode data free the extent + * details of the tmp inode + */ + free_ext_block(handle, tmp_inode); + } + + /* We mark the tmp_inode dirty via ext4_ext_tree_init. */ + if (ext4_journal_extend(handle, 1) != 0) + ext4_journal_restart(handle, 1); + + /* + * Mark the tmp_inode as of size zero + */ + i_size_write(tmp_inode, 0); + + /* + * set the i_blocks count to zero + * so that the ext4_delete_inode does the + * right job + * + * We don't need to take the i_lock because + * the inode is not visible to user space. + */ + tmp_inode->i_blocks = 0; + + /* Reset the extent details */ + ext4_ext_tree_init(handle, tmp_inode); + ext4_journal_stop(handle); +out: + unlock_new_inode(tmp_inode); + iput(tmp_inode); + + return retval; +} diff --git a/fs/ext4/mmp.c b/fs/ext4/mmp.c new file mode 100644 index 00000000..ed6548d8 --- /dev/null +++ b/fs/ext4/mmp.c @@ -0,0 +1,353 @@ +#include +#include +#include +#include +#include + +#include "ext4.h" + +/* + * Write the MMP block using WRITE_SYNC to try to get the block on-disk + * faster. + */ +static int write_mmp_block(struct buffer_head *bh) +{ + mark_buffer_dirty(bh); + lock_buffer(bh); + bh->b_end_io = end_buffer_write_sync; + get_bh(bh); + submit_bh(WRITE_SYNC, bh); + wait_on_buffer(bh); + if (unlikely(!buffer_uptodate(bh))) + return 1; + + return 0; +} + +/* + * Read the MMP block. It _must_ be read from disk and hence we clear the + * uptodate flag on the buffer. + */ +static int read_mmp_block(struct super_block *sb, struct buffer_head **bh, + ext4_fsblk_t mmp_block) +{ + struct mmp_struct *mmp; + + if (*bh) + clear_buffer_uptodate(*bh); + + /* This would be sb_bread(sb, mmp_block), except we need to be sure + * that the MD RAID device cache has been bypassed, and that the read + * is not blocked in the elevator. 
*/ + if (!*bh) + *bh = sb_getblk(sb, mmp_block); + if (*bh) { + get_bh(*bh); + lock_buffer(*bh); + (*bh)->b_end_io = end_buffer_read_sync; + submit_bh(READ_SYNC, *bh); + wait_on_buffer(*bh); + if (!buffer_uptodate(*bh)) { + brelse(*bh); + *bh = NULL; + } + } + if (!*bh) { + ext4_warning(sb, "Error while reading MMP block %llu", + mmp_block); + return -EIO; + } + + mmp = (struct mmp_struct *)((*bh)->b_data); + if (le32_to_cpu(mmp->mmp_magic) != EXT4_MMP_MAGIC) + return -EINVAL; + + return 0; +} + +/* + * Dump as much information as possible to help the admin. + */ +void __dump_mmp_msg(struct super_block *sb, struct mmp_struct *mmp, + const char *function, unsigned int line, const char *msg) +{ + __ext4_warning(sb, function, line, msg); + __ext4_warning(sb, function, line, + "MMP failure info: last update time: %llu, last update " + "node: %s, last update device: %s\n", + (long long unsigned int) le64_to_cpu(mmp->mmp_time), + mmp->mmp_nodename, mmp->mmp_bdevname); +} + +/* + * kmmpd will update the MMP sequence every s_mmp_update_interval seconds + */ +static int kmmpd(void *data) +{ + struct super_block *sb = ((struct mmpd_data *) data)->sb; + struct buffer_head *bh = ((struct mmpd_data *) data)->bh; + struct ext4_super_block *es = EXT4_SB(sb)->s_es; + struct mmp_struct *mmp; + ext4_fsblk_t mmp_block; + u32 seq = 0; + unsigned long failed_writes = 0; + int mmp_update_interval = le16_to_cpu(es->s_mmp_update_interval); + unsigned mmp_check_interval; + unsigned long last_update_time; + unsigned long diff; + int retval; + + mmp_block = le64_to_cpu(es->s_mmp_block); + mmp = (struct mmp_struct *)(bh->b_data); + mmp->mmp_time = cpu_to_le64(get_seconds()); + /* + * Start with the higher mmp_check_interval and reduce it if + * the MMP block is being updated on time. 
+ */ + mmp_check_interval = max(EXT4_MMP_CHECK_MULT * mmp_update_interval, + EXT4_MMP_MIN_CHECK_INTERVAL); + mmp->mmp_check_interval = cpu_to_le16(mmp_check_interval); + bdevname(bh->b_bdev, mmp->mmp_bdevname); + + memcpy(mmp->mmp_nodename, init_utsname()->nodename, + sizeof(mmp->mmp_nodename)); + + while (!kthread_should_stop()) { + if (++seq > EXT4_MMP_SEQ_MAX) + seq = 1; + + mmp->mmp_seq = cpu_to_le32(seq); + mmp->mmp_time = cpu_to_le64(get_seconds()); + last_update_time = jiffies; + + retval = write_mmp_block(bh); + /* + * Don't spew too many error messages. Print one every + * (s_mmp_update_interval * 60) seconds. + */ + if (retval) { + if ((failed_writes % 60) == 0) + ext4_error(sb, "Error writing to MMP block"); + failed_writes++; + } + + if (!(le32_to_cpu(es->s_feature_incompat) & + EXT4_FEATURE_INCOMPAT_MMP)) { + ext4_warning(sb, "kmmpd being stopped since MMP feature" + " has been disabled."); + EXT4_SB(sb)->s_mmp_tsk = NULL; + goto failed; + } + + if (sb->s_flags & MS_RDONLY) { + ext4_warning(sb, "kmmpd being stopped since filesystem " + "has been remounted as readonly."); + EXT4_SB(sb)->s_mmp_tsk = NULL; + goto failed; + } + + diff = jiffies - last_update_time; + if (diff < mmp_update_interval * HZ) + schedule_timeout_interruptible(mmp_update_interval * + HZ - diff); + + /* + * We need to make sure that more than mmp_check_interval + * seconds have not passed since writing. If that has happened + * we need to check if the MMP block is as we left it. 
+ */ + diff = jiffies - last_update_time; + if (diff > mmp_check_interval * HZ) { + struct buffer_head *bh_check = NULL; + struct mmp_struct *mmp_check; + + retval = read_mmp_block(sb, &bh_check, mmp_block); + if (retval) { + ext4_error(sb, "error reading MMP data: %d", + retval); + + EXT4_SB(sb)->s_mmp_tsk = NULL; + goto failed; + } + + mmp_check = (struct mmp_struct *)(bh_check->b_data); + if (mmp->mmp_seq != mmp_check->mmp_seq || + memcmp(mmp->mmp_nodename, mmp_check->mmp_nodename, + sizeof(mmp->mmp_nodename))) { + dump_mmp_msg(sb, mmp_check, + "Error while updating MMP info. " + "The filesystem seems to have been" + " multiply mounted."); + ext4_error(sb, "abort"); + goto failed; + } + put_bh(bh_check); + } + + /* + * Adjust the mmp_check_interval depending on how much time + * it took for the MMP block to be written. + */ + mmp_check_interval = max(min(EXT4_MMP_CHECK_MULT * diff / HZ, + EXT4_MMP_MAX_CHECK_INTERVAL), + EXT4_MMP_MIN_CHECK_INTERVAL); + mmp->mmp_check_interval = cpu_to_le16(mmp_check_interval); + } + + /* + * Unmount seems to be clean. + */ + mmp->mmp_seq = cpu_to_le32(EXT4_MMP_SEQ_CLEAN); + mmp->mmp_time = cpu_to_le64(get_seconds()); + + retval = write_mmp_block(bh); + +failed: + kfree(data); + brelse(bh); + return retval; +} + +/* + * Get a random new sequence number but make sure it is not greater than + * EXT4_MMP_SEQ_MAX. + */ +static unsigned int mmp_new_seq(void) +{ + u32 new_seq; + + do { + get_random_bytes(&new_seq, sizeof(u32)); + } while (new_seq > EXT4_MMP_SEQ_MAX); + + return new_seq; +} + +/* + * Protect the filesystem from being mounted more than once. 
+ */ +int ext4_multi_mount_protect(struct super_block *sb, + ext4_fsblk_t mmp_block) +{ + struct ext4_super_block *es = EXT4_SB(sb)->s_es; + struct buffer_head *bh = NULL; + struct mmp_struct *mmp = NULL; + struct mmpd_data *mmpd_data; + u32 seq; + unsigned int mmp_check_interval = le16_to_cpu(es->s_mmp_update_interval); + unsigned int wait_time = 0; + int retval; + + if (mmp_block < le32_to_cpu(es->s_first_data_block) || + mmp_block >= ext4_blocks_count(es)) { + ext4_warning(sb, "Invalid MMP block in superblock"); + goto failed; + } + + retval = read_mmp_block(sb, &bh, mmp_block); + if (retval) + goto failed; + + mmp = (struct mmp_struct *)(bh->b_data); + + if (mmp_check_interval < EXT4_MMP_MIN_CHECK_INTERVAL) + mmp_check_interval = EXT4_MMP_MIN_CHECK_INTERVAL; + + /* + * If check_interval in MMP block is larger, use that instead of + * update_interval from the superblock. + */ + if (le16_to_cpu(mmp->mmp_check_interval) > mmp_check_interval) + mmp_check_interval = le16_to_cpu(mmp->mmp_check_interval); + + seq = le32_to_cpu(mmp->mmp_seq); + if (seq == EXT4_MMP_SEQ_CLEAN) + goto skip; + + if (seq == EXT4_MMP_SEQ_FSCK) { + dump_mmp_msg(sb, mmp, "fsck is running on the filesystem"); + goto failed; + } + + wait_time = min(mmp_check_interval * 2 + 1, + mmp_check_interval + 60); + + /* Print MMP interval if more than 20 secs. */ + if (wait_time > EXT4_MMP_MIN_CHECK_INTERVAL * 4) + ext4_warning(sb, "MMP interval %u higher than expected, please" + " wait.\n", wait_time * 2); + + if (schedule_timeout_interruptible(HZ * wait_time) != 0) { + ext4_warning(sb, "MMP startup interrupted, failing mount\n"); + goto failed; + } + + retval = read_mmp_block(sb, &bh, mmp_block); + if (retval) + goto failed; + mmp = (struct mmp_struct *)(bh->b_data); + if (seq != le32_to_cpu(mmp->mmp_seq)) { + dump_mmp_msg(sb, mmp, + "Device is already active on another node."); + goto failed; + } + +skip: + /* + * write a new random sequence number. 
+ */ + seq = mmp_new_seq(); + mmp->mmp_seq = cpu_to_le32(seq); + + retval = write_mmp_block(bh); + if (retval) + goto failed; + + /* + * wait for MMP interval and check mmp_seq. + */ + if (schedule_timeout_interruptible(HZ * wait_time) != 0) { + ext4_warning(sb, "MMP startup interrupted, failing mount\n"); + goto failed; + } + + retval = read_mmp_block(sb, &bh, mmp_block); + if (retval) + goto failed; + mmp = (struct mmp_struct *)(bh->b_data); + if (seq != le32_to_cpu(mmp->mmp_seq)) { + dump_mmp_msg(sb, mmp, + "Device is already active on another node."); + goto failed; + } + + mmpd_data = kmalloc(sizeof(struct mmpd_data), GFP_KERNEL); + if (!mmpd_data) { + ext4_warning(sb, "not enough memory for mmpd_data"); + goto failed; + } + mmpd_data->sb = sb; + mmpd_data->bh = bh; + + /* + * Start a kernel thread to update the MMP block periodically. + */ + EXT4_SB(sb)->s_mmp_tsk = kthread_run(kmmpd, mmpd_data, "kmmpd-%s", + bdevname(bh->b_bdev, + mmp->mmp_bdevname)); + if (IS_ERR(EXT4_SB(sb)->s_mmp_tsk)) { + EXT4_SB(sb)->s_mmp_tsk = NULL; + kfree(mmpd_data); + ext4_warning(sb, "Unable to create kmmpd thread for %s.", + sb->s_id); + goto failed; + } + + return 0; + +failed: + brelse(bh); + return 1; +} + + diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c new file mode 100644 index 00000000..c5826c62 --- /dev/null +++ b/fs/ext4/move_extent.c @@ -0,0 +1,1423 @@ +/* + * Copyright (c) 2008,2009 NEC Software Tohoku, Ltd. + * Written by Takashi Sato + * Akira Fujita + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of version 2.1 of the GNU Lesser General Public License + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ */ + +#include +#include +#include +#include "ext4_jbd2.h" +#include "ext4.h" + +/** + * get_ext_path - Find an extent path for designated logical block number. + * + * @inode: an inode which is searched + * @lblock: logical block number to find an extent path + * @path: pointer to an extent path pointer (for output) + * + * ext4_ext_find_extent wrapper. Return 0 on success, or a negative error value + * on failure. + */ +static inline int +get_ext_path(struct inode *inode, ext4_lblk_t lblock, + struct ext4_ext_path **path) +{ + int ret = 0; + + *path = ext4_ext_find_extent(inode, lblock, *path); + if (IS_ERR(*path)) { + ret = PTR_ERR(*path); + *path = NULL; + } else if ((*path)[ext_depth(inode)].p_ext == NULL) + ret = -ENODATA; + + return ret; +} + +/** + * copy_extent_status - Copy the extent's initialization status + * + * @src: an extent for getting initialize status + * @dest: an extent to be set the status + */ +static void +copy_extent_status(struct ext4_extent *src, struct ext4_extent *dest) +{ + if (ext4_ext_is_uninitialized(src)) + ext4_ext_mark_uninitialized(dest); + else + dest->ee_len = cpu_to_le16(ext4_ext_get_actual_len(dest)); +} + +/** + * mext_next_extent - Search for the next extent and set it to "extent" + * + * @inode: inode which is searched + * @path: this will obtain data for the next extent + * @extent: pointer to the next extent we have just gotten + * + * Search the next extent in the array of ext4_ext_path structure (@path) + * and set it to ext4_extent structure (@extent). In addition, the member of + * @path (->p_ext) also points the next extent. Return 0 on success, 1 if + * ext4_ext_path structure refers to the last extent, or a negative error + * value on failure. 
+ */ +static int +mext_next_extent(struct inode *inode, struct ext4_ext_path *path, + struct ext4_extent **extent) +{ + struct ext4_extent_header *eh; + int ppos, leaf_ppos = path->p_depth; + + ppos = leaf_ppos; + if (EXT_LAST_EXTENT(path[ppos].p_hdr) > path[ppos].p_ext) { + /* leaf block */ + *extent = ++path[ppos].p_ext; + path[ppos].p_block = ext4_ext_pblock(path[ppos].p_ext); + return 0; + } + + while (--ppos >= 0) { + if (EXT_LAST_INDEX(path[ppos].p_hdr) > + path[ppos].p_idx) { + int cur_ppos = ppos; + + /* index block */ + path[ppos].p_idx++; + path[ppos].p_block = ext4_idx_pblock(path[ppos].p_idx); + if (path[ppos+1].p_bh) + brelse(path[ppos+1].p_bh); + path[ppos+1].p_bh = + sb_bread(inode->i_sb, path[ppos].p_block); + if (!path[ppos+1].p_bh) + return -EIO; + path[ppos+1].p_hdr = + ext_block_hdr(path[ppos+1].p_bh); + + /* Halfway index block */ + while (++cur_ppos < leaf_ppos) { + path[cur_ppos].p_idx = + EXT_FIRST_INDEX(path[cur_ppos].p_hdr); + path[cur_ppos].p_block = + ext4_idx_pblock(path[cur_ppos].p_idx); + if (path[cur_ppos+1].p_bh) + brelse(path[cur_ppos+1].p_bh); + path[cur_ppos+1].p_bh = sb_bread(inode->i_sb, + path[cur_ppos].p_block); + if (!path[cur_ppos+1].p_bh) + return -EIO; + path[cur_ppos+1].p_hdr = + ext_block_hdr(path[cur_ppos+1].p_bh); + } + + path[leaf_ppos].p_ext = *extent = NULL; + + eh = path[leaf_ppos].p_hdr; + if (le16_to_cpu(eh->eh_entries) == 0) + /* empty leaf is found */ + return -ENODATA; + + /* leaf block */ + path[leaf_ppos].p_ext = *extent = + EXT_FIRST_EXTENT(path[leaf_ppos].p_hdr); + path[leaf_ppos].p_block = + ext4_ext_pblock(path[leaf_ppos].p_ext); + return 0; + } + } + /* We found the last extent */ + return 1; +} + +/** + * mext_check_null_inode - NULL check for two inodes + * + * If inode1 or inode2 is NULL, return -EIO. Otherwise, return 0. 
+ */ +static int +mext_check_null_inode(struct inode *inode1, struct inode *inode2, + const char *function, unsigned int line) +{ + int ret = 0; + + if (inode1 == NULL) { + __ext4_error(inode2->i_sb, function, line, + "Both inodes should not be NULL: " + "inode1 NULL inode2 %lu", inode2->i_ino); + ret = -EIO; + } else if (inode2 == NULL) { + __ext4_error(inode1->i_sb, function, line, + "Both inodes should not be NULL: " + "inode1 %lu inode2 NULL", inode1->i_ino); + ret = -EIO; + } + return ret; +} + +/** + * double_down_write_data_sem - Acquire two inodes' write lock of i_data_sem + * + * @orig_inode: original inode structure + * @donor_inode: donor inode structure + * Acquire write lock of i_data_sem of the two inodes (orig and donor) by + * i_ino order. + */ +static void +double_down_write_data_sem(struct inode *orig_inode, struct inode *donor_inode) +{ + struct inode *first = orig_inode, *second = donor_inode; + + /* + * Use the inode number to provide the stable locking order instead + * of its address, because the C language doesn't guarantee you can + * compare pointers that don't come from the same array. + */ + if (donor_inode->i_ino < orig_inode->i_ino) { + first = donor_inode; + second = orig_inode; + } + + down_write(&EXT4_I(first)->i_data_sem); + down_write_nested(&EXT4_I(second)->i_data_sem, SINGLE_DEPTH_NESTING); +} + +/** + * double_up_write_data_sem - Release two inodes' write lock of i_data_sem + * + * @orig_inode: original inode structure to be released its lock first + * @donor_inode: donor inode structure to be released its lock second + * Release write lock of i_data_sem of two inodes (orig and donor). 
+ */ +static void +double_up_write_data_sem(struct inode *orig_inode, struct inode *donor_inode) +{ + up_write(&EXT4_I(orig_inode)->i_data_sem); + up_write(&EXT4_I(donor_inode)->i_data_sem); +} + +/** + * mext_insert_across_blocks - Insert extents across leaf block + * + * @handle: journal handle + * @orig_inode: original inode + * @o_start: first original extent to be changed + * @o_end: last original extent to be changed + * @start_ext: first new extent to be inserted + * @new_ext: middle of new extent to be inserted + * @end_ext: last new extent to be inserted + * + * Allocate a new leaf block and insert extents into it. Return 0 on success, + * or a negative error value on failure. + */ +static int +mext_insert_across_blocks(handle_t *handle, struct inode *orig_inode, + struct ext4_extent *o_start, struct ext4_extent *o_end, + struct ext4_extent *start_ext, struct ext4_extent *new_ext, + struct ext4_extent *end_ext) +{ + struct ext4_ext_path *orig_path = NULL; + ext4_lblk_t eblock = 0; + int new_flag = 0; + int end_flag = 0; + int err = 0; + + if (start_ext->ee_len && new_ext->ee_len && end_ext->ee_len) { + if (o_start == o_end) { + + /* start_ext new_ext end_ext + * donor |---------|-----------|--------| + * orig |------------------------------| + */ + end_flag = 1; + } else { + + /* start_ext new_ext end_ext + * donor |---------|----------|---------| + * orig |---------------|--------------| + */ + o_end->ee_block = end_ext->ee_block; + o_end->ee_len = end_ext->ee_len; + ext4_ext_store_pblock(o_end, ext4_ext_pblock(end_ext)); + } + + o_start->ee_len = start_ext->ee_len; + eblock = le32_to_cpu(start_ext->ee_block); + new_flag = 1; + + } else if (start_ext->ee_len && new_ext->ee_len && + !end_ext->ee_len && o_start == o_end) { + + /* start_ext new_ext + * donor |--------------|---------------| + * orig |------------------------------| + */ + o_start->ee_len = start_ext->ee_len; + eblock = le32_to_cpu(start_ext->ee_block); + new_flag = 1; + + } else if 
(!start_ext->ee_len && new_ext->ee_len && + end_ext->ee_len && o_start == o_end) { + + /* new_ext end_ext + * donor |--------------|---------------| + * orig |------------------------------| + */ + o_end->ee_block = end_ext->ee_block; + o_end->ee_len = end_ext->ee_len; + ext4_ext_store_pblock(o_end, ext4_ext_pblock(end_ext)); + + /* + * Set 0 to the extent block if new_ext was + * the first block. + */ + if (new_ext->ee_block) + eblock = le32_to_cpu(new_ext->ee_block); + + new_flag = 1; + } else { + ext4_debug("ext4 move extent: Unexpected insert case\n"); + return -EIO; + } + + if (new_flag) { + err = get_ext_path(orig_inode, eblock, &orig_path); + if (err) + goto out; + + if (ext4_ext_insert_extent(handle, orig_inode, + orig_path, new_ext, 0)) + goto out; + } + + if (end_flag) { + err = get_ext_path(orig_inode, + le32_to_cpu(end_ext->ee_block) - 1, &orig_path); + if (err) + goto out; + + if (ext4_ext_insert_extent(handle, orig_inode, + orig_path, end_ext, 0)) + goto out; + } +out: + if (orig_path) { + ext4_ext_drop_refs(orig_path); + kfree(orig_path); + } + + return err; + +} + +/** + * mext_insert_inside_block - Insert new extent to the extent block + * + * @o_start: first original extent to be moved + * @o_end: last original extent to be moved + * @start_ext: first new extent to be inserted + * @new_ext: middle of new extent to be inserted + * @end_ext: last new extent to be inserted + * @eh: extent header of target leaf block + * @range_to_move: used to decide how to insert extent + * + * Insert extents into the leaf block. The extent (@o_start) is overwritten + * by inserted extents. 
+ */ +static void +mext_insert_inside_block(struct ext4_extent *o_start, + struct ext4_extent *o_end, + struct ext4_extent *start_ext, + struct ext4_extent *new_ext, + struct ext4_extent *end_ext, + struct ext4_extent_header *eh, + int range_to_move) +{ + int i = 0; + unsigned long len; + + /* Move the existing extents */ + if (range_to_move && o_end < EXT_LAST_EXTENT(eh)) { + len = (unsigned long)(EXT_LAST_EXTENT(eh) + 1) - + (unsigned long)(o_end + 1); + memmove(o_end + 1 + range_to_move, o_end + 1, len); + } + + /* Insert start entry */ + if (start_ext->ee_len) + o_start[i++].ee_len = start_ext->ee_len; + + /* Insert new entry */ + if (new_ext->ee_len) { + o_start[i] = *new_ext; + ext4_ext_store_pblock(&o_start[i++], ext4_ext_pblock(new_ext)); + } + + /* Insert end entry */ + if (end_ext->ee_len) + o_start[i] = *end_ext; + + /* Increment the total entries counter on the extent block */ + le16_add_cpu(&eh->eh_entries, range_to_move); +} + +/** + * mext_insert_extents - Insert new extent + * + * @handle: journal handle + * @orig_inode: original inode + * @orig_path: path indicates first extent to be changed + * @o_start: first original extent to be changed + * @o_end: last original extent to be changed + * @start_ext: first new extent to be inserted + * @new_ext: middle of new extent to be inserted + * @end_ext: last new extent to be inserted + * + * Call the function to insert extents. If we cannot add more extents into + * the leaf block, we call mext_insert_across_blocks() to create a + * new leaf block. Otherwise call mext_insert_inside_block(). Return 0 + * on success, or a negative error value on failure. 
+ */ +static int +mext_insert_extents(handle_t *handle, struct inode *orig_inode, + struct ext4_ext_path *orig_path, + struct ext4_extent *o_start, + struct ext4_extent *o_end, + struct ext4_extent *start_ext, + struct ext4_extent *new_ext, + struct ext4_extent *end_ext) +{ + struct ext4_extent_header *eh; + unsigned long need_slots, slots_range; + int range_to_move, depth, ret; + + /* + * The extents need to be inserted + * start_extent + new_extent + end_extent. + */ + need_slots = (start_ext->ee_len ? 1 : 0) + (end_ext->ee_len ? 1 : 0) + + (new_ext->ee_len ? 1 : 0); + + /* The number of slots between start and end */ + slots_range = ((unsigned long)(o_end + 1) - (unsigned long)o_start + 1) + / sizeof(struct ext4_extent); + + /* Range to move the end of extent */ + range_to_move = need_slots - slots_range; + depth = orig_path->p_depth; + orig_path += depth; + eh = orig_path->p_hdr; + + if (depth) { + /* Register to journal */ + ret = ext4_journal_get_write_access(handle, orig_path->p_bh); + if (ret) + return ret; + } + + /* Expansion */ + if (range_to_move > 0 && + (range_to_move > le16_to_cpu(eh->eh_max) + - le16_to_cpu(eh->eh_entries))) { + + ret = mext_insert_across_blocks(handle, orig_inode, o_start, + o_end, start_ext, new_ext, end_ext); + if (ret < 0) + return ret; + } else + mext_insert_inside_block(o_start, o_end, start_ext, new_ext, + end_ext, eh, range_to_move); + + if (depth) { + ret = ext4_handle_dirty_metadata(handle, orig_inode, + orig_path->p_bh); + if (ret) + return ret; + } else { + ret = ext4_mark_inode_dirty(handle, orig_inode); + if (ret < 0) + return ret; + } + + return 0; +} + +/** + * mext_leaf_block - Move one leaf extent block into the inode. 
+ * + * @handle: journal handle + * @orig_inode: original inode + * @orig_path: path indicates first extent to be changed + * @dext: donor extent + * @from: start offset on the target file + * + * In order to insert extents into the leaf block, we must divide the extent + * in the leaf block into three extents. The one is located to be inserted + * extents, and the others are located around it. + * + * Therefore, this function creates structures to save extents of the leaf + * block, and inserts extents by calling mext_insert_extents() with + * created extents. Return 0 on success, or a negative error value on failure. + */ +static int +mext_leaf_block(handle_t *handle, struct inode *orig_inode, + struct ext4_ext_path *orig_path, struct ext4_extent *dext, + ext4_lblk_t *from) +{ + struct ext4_extent *oext, *o_start, *o_end, *prev_ext; + struct ext4_extent new_ext, start_ext, end_ext; + ext4_lblk_t new_ext_end; + int oext_alen, new_ext_alen, end_ext_alen; + int depth = ext_depth(orig_inode); + int ret; + + start_ext.ee_block = end_ext.ee_block = 0; + o_start = o_end = oext = orig_path[depth].p_ext; + oext_alen = ext4_ext_get_actual_len(oext); + start_ext.ee_len = end_ext.ee_len = 0; + + new_ext.ee_block = cpu_to_le32(*from); + ext4_ext_store_pblock(&new_ext, ext4_ext_pblock(dext)); + new_ext.ee_len = dext->ee_len; + new_ext_alen = ext4_ext_get_actual_len(&new_ext); + new_ext_end = le32_to_cpu(new_ext.ee_block) + new_ext_alen - 1; + + /* + * Case: original extent is first + * oext |--------| + * new_ext |--| + * start_ext |--| + */ + if (le32_to_cpu(oext->ee_block) < le32_to_cpu(new_ext.ee_block) && + le32_to_cpu(new_ext.ee_block) < + le32_to_cpu(oext->ee_block) + oext_alen) { + start_ext.ee_len = cpu_to_le16(le32_to_cpu(new_ext.ee_block) - + le32_to_cpu(oext->ee_block)); + start_ext.ee_block = oext->ee_block; + copy_extent_status(oext, &start_ext); + } else if (oext > EXT_FIRST_EXTENT(orig_path[depth].p_hdr)) { + prev_ext = oext - 1; + /* + * We can merge new_ext 
into previous extent, + * if these are contiguous and same extent type. + */ + if (ext4_can_extents_be_merged(orig_inode, prev_ext, + &new_ext)) { + o_start = prev_ext; + start_ext.ee_len = cpu_to_le16( + ext4_ext_get_actual_len(prev_ext) + + new_ext_alen); + start_ext.ee_block = oext->ee_block; + copy_extent_status(prev_ext, &start_ext); + new_ext.ee_len = 0; + } + } + + /* + * Case: new_ext_end must be less than oext + * oext |-----------| + * new_ext |-------| + */ + if (le32_to_cpu(oext->ee_block) + oext_alen - 1 < new_ext_end) { + EXT4_ERROR_INODE(orig_inode, + "new_ext_end(%u) should be less than or equal to " + "oext->ee_block(%u) + oext_alen(%d) - 1", + new_ext_end, le32_to_cpu(oext->ee_block), + oext_alen); + ret = -EIO; + goto out; + } + + /* + * Case: new_ext is smaller than original extent + * oext |---------------| + * new_ext |-----------| + * end_ext |---| + */ + if (le32_to_cpu(oext->ee_block) <= new_ext_end && + new_ext_end < le32_to_cpu(oext->ee_block) + oext_alen - 1) { + end_ext.ee_len = + cpu_to_le16(le32_to_cpu(oext->ee_block) + + oext_alen - 1 - new_ext_end); + copy_extent_status(oext, &end_ext); + end_ext_alen = ext4_ext_get_actual_len(&end_ext); + ext4_ext_store_pblock(&end_ext, + (ext4_ext_pblock(o_end) + oext_alen - end_ext_alen)); + end_ext.ee_block = + cpu_to_le32(le32_to_cpu(o_end->ee_block) + + oext_alen - end_ext_alen); + } + + ret = mext_insert_extents(handle, orig_inode, orig_path, o_start, + o_end, &start_ext, &new_ext, &end_ext); +out: + return ret; +} + +/** + * mext_calc_swap_extents - Calculate extents for extent swapping. + * + * @tmp_dext: the extent that will belong to the original inode + * @tmp_oext: the extent that will belong to the donor inode + * @orig_off: block offset of original inode + * @donor_off: block offset of donor inode + * @max_count: the maximum length of extents + * + * Return 0 on success, or a negative error value on failure. 
+ */ +static int +mext_calc_swap_extents(struct ext4_extent *tmp_dext, + struct ext4_extent *tmp_oext, + ext4_lblk_t orig_off, ext4_lblk_t donor_off, + ext4_lblk_t max_count) +{ + ext4_lblk_t diff, orig_diff; + struct ext4_extent dext_old, oext_old; + + BUG_ON(orig_off != donor_off); + + /* original and donor extents have to cover the same block offset */ + if (orig_off < le32_to_cpu(tmp_oext->ee_block) || + le32_to_cpu(tmp_oext->ee_block) + + ext4_ext_get_actual_len(tmp_oext) - 1 < orig_off) + return -ENODATA; + + if (orig_off < le32_to_cpu(tmp_dext->ee_block) || + le32_to_cpu(tmp_dext->ee_block) + + ext4_ext_get_actual_len(tmp_dext) - 1 < orig_off) + return -ENODATA; + + dext_old = *tmp_dext; + oext_old = *tmp_oext; + + /* When tmp_dext is too large, pick up the target range. */ + diff = donor_off - le32_to_cpu(tmp_dext->ee_block); + + ext4_ext_store_pblock(tmp_dext, ext4_ext_pblock(tmp_dext) + diff); + tmp_dext->ee_block = + cpu_to_le32(le32_to_cpu(tmp_dext->ee_block) + diff); + tmp_dext->ee_len = cpu_to_le16(le16_to_cpu(tmp_dext->ee_len) - diff); + + if (max_count < ext4_ext_get_actual_len(tmp_dext)) + tmp_dext->ee_len = cpu_to_le16(max_count); + + orig_diff = orig_off - le32_to_cpu(tmp_oext->ee_block); + ext4_ext_store_pblock(tmp_oext, ext4_ext_pblock(tmp_oext) + orig_diff); + + /* Adjust extent length if donor extent is larger than orig */ + if (ext4_ext_get_actual_len(tmp_dext) > + ext4_ext_get_actual_len(tmp_oext) - orig_diff) + tmp_dext->ee_len = cpu_to_le16(le16_to_cpu(tmp_oext->ee_len) - + orig_diff); + + tmp_oext->ee_len = cpu_to_le16(ext4_ext_get_actual_len(tmp_dext)); + + copy_extent_status(&oext_old, tmp_dext); + copy_extent_status(&dext_old, tmp_oext); + + return 0; +} + +/** + * mext_replace_branches - Replace original extents with new extents + * + * @handle: journal handle + * @orig_inode: original inode + * @donor_inode: donor inode + * @from: block offset of orig_inode + * @count: block count to be replaced + * @err: pointer to save return 
value + * + * Replace original inode extents and donor inode extents page by page. + * We implement this replacement in the following three steps: + * 1. Save the block information of original and donor inodes into + * dummy extents. + * 2. Change the block information of original inode to point at the + * donor inode blocks. + * 3. Change the block information of donor inode to point at the saved + * original inode blocks in the dummy extents. + * + * Return replaced block count. + */ +static int +mext_replace_branches(handle_t *handle, struct inode *orig_inode, + struct inode *donor_inode, ext4_lblk_t from, + ext4_lblk_t count, int *err) +{ + struct ext4_ext_path *orig_path = NULL; + struct ext4_ext_path *donor_path = NULL; + struct ext4_extent *oext, *dext; + struct ext4_extent tmp_dext, tmp_oext; + ext4_lblk_t orig_off = from, donor_off = from; + int depth; + int replaced_count = 0; + int dext_alen; + + /* Protect extent trees against block allocations via delalloc */ + double_down_write_data_sem(orig_inode, donor_inode); + + /* Get the original extent for the block "orig_off" */ + *err = get_ext_path(orig_inode, orig_off, &orig_path); + if (*err) + goto out; + + /* Get the donor extent for the head */ + *err = get_ext_path(donor_inode, donor_off, &donor_path); + if (*err) + goto out; + depth = ext_depth(orig_inode); + oext = orig_path[depth].p_ext; + tmp_oext = *oext; + + depth = ext_depth(donor_inode); + dext = donor_path[depth].p_ext; + tmp_dext = *dext; + + *err = mext_calc_swap_extents(&tmp_dext, &tmp_oext, orig_off, + donor_off, count); + if (*err) + goto out; + + /* Loop for the donor extents */ + while (1) { + /* The extent for donor must be found. 
*/ + if (!dext) { + EXT4_ERROR_INODE(donor_inode, + "The extent for donor must be found"); + *err = -EIO; + goto out; + } else if (donor_off != le32_to_cpu(tmp_dext.ee_block)) { + EXT4_ERROR_INODE(donor_inode, + "Donor offset(%u) and the first block of donor " + "extent(%u) should be equal", + donor_off, + le32_to_cpu(tmp_dext.ee_block)); + *err = -EIO; + goto out; + } + + /* Set donor extent to orig extent */ + *err = mext_leaf_block(handle, orig_inode, + orig_path, &tmp_dext, &orig_off); + if (*err) + goto out; + + /* Set orig extent to donor extent */ + *err = mext_leaf_block(handle, donor_inode, + donor_path, &tmp_oext, &donor_off); + if (*err) + goto out; + + dext_alen = ext4_ext_get_actual_len(&tmp_dext); + replaced_count += dext_alen; + donor_off += dext_alen; + orig_off += dext_alen; + + /* Already moved the expected blocks */ + if (replaced_count >= count) + break; + + if (orig_path) + ext4_ext_drop_refs(orig_path); + *err = get_ext_path(orig_inode, orig_off, &orig_path); + if (*err) + goto out; + depth = ext_depth(orig_inode); + oext = orig_path[depth].p_ext; + tmp_oext = *oext; + + if (donor_path) + ext4_ext_drop_refs(donor_path); + *err = get_ext_path(donor_inode, donor_off, &donor_path); + if (*err) + goto out; + depth = ext_depth(donor_inode); + dext = donor_path[depth].p_ext; + tmp_dext = *dext; + + *err = mext_calc_swap_extents(&tmp_dext, &tmp_oext, orig_off, + donor_off, count - replaced_count); + if (*err) + goto out; + } + +out: + if (orig_path) { + ext4_ext_drop_refs(orig_path); + kfree(orig_path); + } + if (donor_path) { + ext4_ext_drop_refs(donor_path); + kfree(donor_path); + } + + ext4_ext_invalidate_cache(orig_inode); + ext4_ext_invalidate_cache(donor_inode); + + double_up_write_data_sem(orig_inode, donor_inode); + + return replaced_count; +} + +/** + * move_extent_per_page - Move extent data per page + * + * @o_filp: file structure of original file + * @donor_inode: donor inode + * @orig_page_offset: page index on original file + * 
@data_offset_in_page:	block index where data swapping starts
+ * @block_len_in_page:		the number of blocks to be swapped
+ * @uninit:			orig extent is uninitialized or not
+ * @err:			pointer to save return value
+ *
+ * Save the data in original inode blocks and replace original inode extents
+ * with donor inode extents by calling mext_replace_branches().
+ * Finally, write out the saved data in new original inode blocks. Return
+ * replaced block count.
+ *
+ * A journal handle is started here and stopped exactly once on every exit
+ * path.  On failure *err receives the error; an error reported by
+ * mext_replace_branches() takes precedence over later ones.
+ */
+static int
+move_extent_per_page(struct file *o_filp, struct inode *donor_inode,
+		  pgoff_t orig_page_offset, int data_offset_in_page,
+		  int block_len_in_page, int uninit, int *err)
+{
+	struct inode *orig_inode = o_filp->f_dentry->d_inode;
+	struct address_space *mapping = orig_inode->i_mapping;
+	struct buffer_head *bh;
+	struct page *page = NULL;
+	const struct address_space_operations *a_ops = mapping->a_ops;
+	handle_t *handle;
+	ext4_lblk_t orig_blk_offset;
+	/*
+	 * Computed from orig_blk_offset below.  The former initializer
+	 * (orig_page_offset << PAGE_CACHE_SHIFT) was never read and shifted
+	 * in pgoff_t width.
+	 */
+	long long offs;
+	unsigned long blocksize = orig_inode->i_sb->s_blocksize;
+	unsigned int w_flags = 0;
+	unsigned int tmp_data_size, data_size, replaced_size;
+	void *fsdata;
+	int i, jblocks;
+	int err2 = 0;
+	int replaced_count = 0;
+	int blocks_per_page = PAGE_CACHE_SIZE >> orig_inode->i_blkbits;
+
+	/*
+	 * It needs twice the amount of ordinary journal buffers because
+	 * inode and donor_inode may change each different metadata blocks.
+	 */
+	jblocks = ext4_writepage_trans_blocks(orig_inode) * 2;
+	handle = ext4_journal_start(orig_inode, jblocks);
+	if (IS_ERR(handle)) {
+		*err = PTR_ERR(handle);
+		return 0;
+	}
+
+	if (segment_eq(get_fs(), KERNEL_DS))
+		w_flags |= AOP_FLAG_UNINTERRUPTIBLE;
+
+	orig_blk_offset = orig_page_offset * blocks_per_page +
+		data_offset_in_page;
+
+	/*
+	 * If orig extent is uninitialized one,
+	 * it's not necessary force the page into memory
+	 * and then force it to be written out again.
+	 * Just swap data blocks between orig and donor.
+	 */
+	if (uninit) {
+		replaced_count = mext_replace_branches(handle, orig_inode,
+						donor_inode, orig_blk_offset,
+						block_len_in_page, err);
+		goto out2;
+	}
+
+	offs = (long long)orig_blk_offset << orig_inode->i_blkbits;
+
+	/* Calculate data_size */
+	if ((orig_blk_offset + block_len_in_page - 1) ==
+	    ((orig_inode->i_size - 1) >> orig_inode->i_blkbits)) {
+		/* Replace the last block */
+		tmp_data_size = orig_inode->i_size & (blocksize - 1);
+		/*
+		 * If data_size equal zero, it shows data_size is multiples of
+		 * blocksize. So we set appropriate value.
+		 */
+		if (tmp_data_size == 0)
+			tmp_data_size = blocksize;
+
+		data_size = tmp_data_size +
+			((block_len_in_page - 1) << orig_inode->i_blkbits);
+	} else
+		data_size = block_len_in_page << orig_inode->i_blkbits;
+
+	replaced_size = data_size;
+
+	*err = a_ops->write_begin(o_filp, mapping, offs, data_size, w_flags,
+				 &page, &fsdata);
+	if (unlikely(*err < 0))
+		goto out;
+
+	if (!PageUptodate(page)) {
+		a_ops->readpage(o_filp, page);
+		lock_page(page);
+	}
+
+	/*
+	 * try_to_release_page() doesn't call releasepage in writeback mode.
+	 * We should care about the order of writing to the same file
+	 * by multiple move extent processes.
+	 * It needs to call wait_on_page_writeback() to wait for the
+	 * writeback of the page.
+	 */
+	wait_on_page_writeback(page);
+
+	/* Release old bh and drop refs */
+	try_to_release_page(page, 0);
+
+	replaced_count = mext_replace_branches(handle, orig_inode, donor_inode,
+					orig_blk_offset, block_len_in_page,
+					&err2);
+	if (err2) {
+		/* Partial swap: still write back the blocks that did move. */
+		if (replaced_count) {
+			block_len_in_page = replaced_count;
+			replaced_size =
+				block_len_in_page << orig_inode->i_blkbits;
+		} else
+			goto out;
+	}
+
+	if (!page_has_buffers(page))
+		create_empty_buffers(page, 1 << orig_inode->i_blkbits, 0);
+
+	/* Skip to the first buffer of the swapped range within the page. */
+	bh = page_buffers(page);
+	for (i = 0; i < data_offset_in_page; i++)
+		bh = bh->b_this_page;
+
+	/* Re-map each buffer onto the blocks now owned by orig_inode. */
+	for (i = 0; i < block_len_in_page; i++) {
+		*err = ext4_get_block(orig_inode,
+				(sector_t)(orig_blk_offset + i), bh, 0);
+		if (*err < 0)
+			goto out;
+
+		if (bh->b_this_page != NULL)
+			bh = bh->b_this_page;
+	}
+
+	*err = a_ops->write_end(o_filp, mapping, offs, data_size, replaced_size,
+			       page, fsdata);
+	/* write_end() was handed the page; nothing left to release here. */
+	page = NULL;
+
+out:
+	if (unlikely(page)) {
+		if (PageLocked(page))
+			unlock_page(page);
+		page_cache_release(page);
+		/*
+		 * Do NOT stop the journal handle here: control falls through
+		 * to out2, which stops it.  Stopping it in both places would
+		 * call ext4_journal_stop() twice on the same handle.
+		 */
+	}
+out2:
+	ext4_journal_stop(handle);
+
+	if (err2)
+		*err = err2;
+
+	return replaced_count;
+}
+
+/**
+ * mext_check_arguments - Check whether move extent can be done
+ *
+ * @orig_inode:		original inode
+ * @donor_inode:	donor inode
+ * @orig_start:		logical start offset in block for orig
+ * @donor_start:	logical start offset in block for donor
+ * @len:		the number of blocks to be moved
+ *
+ * Check the arguments of ext4_move_extents() whether the files can be
+ * exchanged with each other.
+ * Return 0 on success, or a negative error value on failure.
+ */
+static int
+mext_check_arguments(struct inode *orig_inode,
+		     struct inode *donor_inode, __u64 orig_start,
+		     __u64 donor_start, __u64 *len)
+{
+	ext4_lblk_t o_nblocks, d_nblocks;
+	unsigned int bits = orig_inode->i_blkbits;
+	unsigned int bsize = 1 << bits;
+
+	/* The donor must not carry set-id bits */
+	if (donor_inode->i_mode & (S_ISUID|S_ISGID)) {
+		ext4_debug("ext4 move extent: suid or sgid is set"
+			   " to donor file [ino:orig %lu, donor %lu]\n",
+			   orig_inode->i_ino, donor_inode->i_ino);
+		return -EINVAL;
+	}
+
+	if (IS_IMMUTABLE(donor_inode) || IS_APPEND(donor_inode))
+		return -EPERM;
+
+	/* Ext4 move extent does not support swapfile */
+	if (IS_SWAPFILE(orig_inode) || IS_SWAPFILE(donor_inode)) {
+		ext4_debug("ext4 move extent: The argument files should "
+			"not be swapfile [ino:orig %lu, donor %lu]\n",
+			orig_inode->i_ino, donor_inode->i_ino);
+		return -EINVAL;
+	}
+
+	/* Files should be in the same ext4 FS */
+	if (orig_inode->i_sb != donor_inode->i_sb) {
+		ext4_debug("ext4 move extent: The argument files "
+			"should be in same FS [ino:orig %lu, donor %lu]\n",
+			orig_inode->i_ino, donor_inode->i_ino);
+		return -EINVAL;
+	}
+
+	/* Ext4 move extent supports only extent based file */
+	if (!ext4_test_inode_flag(orig_inode, EXT4_INODE_EXTENTS)) {
+		ext4_debug("ext4 move extent: orig file is not extents "
+			"based file [ino:orig %lu]\n", orig_inode->i_ino);
+		return -EOPNOTSUPP;
+	}
+	if (!ext4_test_inode_flag(donor_inode, EXT4_INODE_EXTENTS)) {
+		ext4_debug("ext4 move extent: donor file is not extents "
+			"based file [ino:donor %lu]\n", donor_inode->i_ino);
+		return -EOPNOTSUPP;
+	}
+
+	/* Neither file may be empty */
+	if (!(orig_inode->i_size && donor_inode->i_size)) {
+		ext4_debug("ext4 move extent: File size is 0 byte\n");
+		return -EINVAL;
+	}
+
+	/* Start offset should be same */
+	if (orig_start != donor_start) {
+		ext4_debug("ext4 move extent: orig and donor's start "
+			"offset are not same [ino:orig %lu, donor %lu]\n",
+			orig_inode->i_ino, donor_inode->i_ino);
+		return -EINVAL;
+	}
+
+	/* Everything must stay below the maximum logical block number */
+	if (orig_start >= EXT_MAX_BLOCKS ||
+	    donor_start >= EXT_MAX_BLOCKS ||
+	    *len > EXT_MAX_BLOCKS ||
+	    orig_start + *len >= EXT_MAX_BLOCKS) {
+		ext4_debug("ext4 move extent: Can't handle over [%u] blocks "
+			"[ino:orig %lu, donor %lu]\n", EXT_MAX_BLOCKS,
+			orig_inode->i_ino, donor_inode->i_ino);
+		return -EINVAL;
+	}
+
+	/* Clamp the range to the block count of the smaller file */
+	if (orig_inode->i_size <= donor_inode->i_size) {
+		o_nblocks = (orig_inode->i_size + bsize - 1) >> bits;
+		if (orig_start >= o_nblocks) {
+			ext4_debug("ext4 move extent: start offset [%llu] "
+				"should be less than original file blocks "
+				"[%u] [ino:orig %lu, donor %lu]\n",
+				 orig_start, o_nblocks,
+				orig_inode->i_ino, donor_inode->i_ino);
+			return -EINVAL;
+		}
+
+		if (orig_start + *len > o_nblocks) {
+			ext4_debug("ext4 move extent: Adjust length "
+				"from %llu to %llu. Because it should be "
+				"less than original file blocks "
+				"[ino:orig %lu, donor %lu]\n",
+				*len, o_nblocks - orig_start,
+				orig_inode->i_ino, donor_inode->i_ino);
+			*len = o_nblocks - orig_start;
+		}
+	} else {
+		d_nblocks = (donor_inode->i_size + bsize - 1) >> bits;
+		/* TODO: eliminate this artificial restriction */
+		if (orig_start >= d_nblocks) {
+			ext4_debug("ext4 move extent: orig start offset "
+			"[%llu] should be less than donor file blocks "
+			"[%u] [ino:orig %lu, donor %lu]\n",
+			orig_start, d_nblocks,
+			orig_inode->i_ino, donor_inode->i_ino);
+			return -EINVAL;
+		}
+
+		/* TODO: eliminate this artificial restriction */
+		if (orig_start + *len > d_nblocks) {
+			ext4_debug("ext4 move extent: End offset [%llu] should "
+				"be less than donor file blocks [%u]."
+				"So adjust length from %llu to %llu "
+				"[ino:orig %lu, donor %lu]\n",
+				orig_start + *len, d_nblocks,
+				*len, d_nblocks - orig_start,
+				orig_inode->i_ino, donor_inode->i_ino);
+			*len = d_nblocks - orig_start;
+		}
+	}
+
+	if (*len == 0) {
+		ext4_debug("ext4 move extent: len should not be 0 "
+			"[ino:orig %lu, donor %lu]\n", orig_inode->i_ino,
+			donor_inode->i_ino);
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
+/**
+ * mext_inode_double_lock - Lock i_mutex on both @inode1 and @inode2
+ *
+ * @inode1:	the inode structure
+ * @inode2:	the inode structure
+ *
+ * Lock two inodes' i_mutex by i_ino order.
+ * If inode1 or inode2 is NULL, return -EIO. Otherwise, return 0.
+ */
+static int
+mext_inode_double_lock(struct inode *inode1, struct inode *inode2)
+{
+	struct inode *first, *second;
+	int ret;
+
+	BUG_ON(inode1 == NULL && inode2 == NULL);
+
+	ret = mext_check_null_inode(inode1, inode2, __func__, __LINE__);
+	if (ret < 0)
+		return ret;
+
+	/* Same inode passed twice: take its mutex only once */
+	if (inode1 == inode2) {
+		mutex_lock(&inode1->i_mutex);
+		return ret;
+	}
+
+	/* The inode with the smaller i_ino is always locked first */
+	if (inode1->i_ino < inode2->i_ino) {
+		first = inode1;
+		second = inode2;
+	} else {
+		first = inode2;
+		second = inode1;
+	}
+	mutex_lock_nested(&first->i_mutex, I_MUTEX_PARENT);
+	mutex_lock_nested(&second->i_mutex, I_MUTEX_CHILD);
+
+	return ret;
+}
+
+/**
+ * mext_inode_double_unlock - Release i_mutex on both @inode1 and @inode2
+ *
+ * @inode1:     the inode that is released first
+ * @inode2:     the inode that is released second
+ *
+ * If inode1 or inode2 is NULL, return -EIO. Otherwise, return 0.
+ */
+
+static int
+mext_inode_double_unlock(struct inode *inode1, struct inode *inode2)
+{
+	int ret;
+
+	BUG_ON(inode1 == NULL && inode2 == NULL);
+
+	ret = mext_check_null_inode(inode1, inode2, __func__, __LINE__);
+	if (ret >= 0) {
+		if (inode1)
+			mutex_unlock(&inode1->i_mutex);
+
+		/* Do not unlock twice when both arguments are the same inode */
+		if (inode2 && inode2 != inode1)
+			mutex_unlock(&inode2->i_mutex);
+	}
+
+	return ret;
+}
+
+/**
+ * ext4_move_extents - Exchange the specified range of a file
+ *
+ * @o_filp:		file structure of the original file
+ * @d_filp:		file structure of the donor file
+ * @orig_start:		start offset in block for orig
+ * @donor_start:	start offset in block for donor
+ * @len:		the number of blocks to be moved
+ * @moved_len:		moved block length
+ *
+ * This function returns 0 and moved block length is set in moved_len
+ * if succeed, otherwise returns error value.
+ *
+ * Note: ext4_move_extents() proceeds in the following order.
+ *  1:ext4_move_extents() calculates the last block number of moving extent
+ *    function by the start block number (orig_start) and the number of blocks
+ *    to be moved (len) specified as arguments.
+ *    If the {orig, donor}_start points a hole, the extent's start offset
+ *    pointed by ext_cur (current extent), holecheck_path, orig_path are set
+ *    after hole behind.
+ *  2:Continue step 3 to step 5, until the holecheck_path points to last_extent
+ *    or the ext_cur exceeds the block_end which is last logical block number.
+ *  3:To get the length of the contiguous area, call mext_next_extent()
+ *    with the ext_cur (initial value is holecheck_path) repeatedly,
+ *    until a non-contiguous extent is found, the start logical block number
+ *    exceeds the block_end or the extent points to the last extent.
+ *  4:Exchange the original inode data with donor inode data
+ *    from orig_page_offset to seq_end_page.
+ *    The start indexes of data are specified as arguments.
+ *    That of the original inode is orig_page_offset,
+ *    and the donor inode is also orig_page_offset
+ *    (To easily handle blocksize != pagesize case, the offset for the
+ *    donor inode is block unit).
+ *  5:Update holecheck_path and orig_path to point at the next extent to
+ *    process, then return to step 2.
+ *  6:Release holecheck_path, orig_path and set the len to moved_len
+ *    which shows the number of moved blocks.
+ *    The moved_len is useful for the command to calculate the file offset
+ *    for starting next move extent ioctl.
+ *  7:Return 0 on success, or a negative error value on failure.
+ *
+ * Note that *moved_len is only ever incremented here, never reset.
+ */
+int
+ext4_move_extents(struct file *o_filp, struct file *d_filp,
+		 __u64 orig_start, __u64 donor_start, __u64 len,
+		 __u64 *moved_len)
+{
+	struct inode *orig_inode = o_filp->f_dentry->d_inode;
+	struct inode *donor_inode = d_filp->f_dentry->d_inode;
+	struct ext4_ext_path *orig_path = NULL, *holecheck_path = NULL;
+	struct ext4_extent *ext_prev, *ext_cur, *ext_dummy;
+	ext4_lblk_t block_start = orig_start;
+	ext4_lblk_t block_end, seq_start, add_blocks, file_end, seq_blocks = 0;
+	ext4_lblk_t rest_blocks;
+	pgoff_t orig_page_offset = 0, seq_end_page;
+	int ret1, ret2, depth, last_extent = 0;
+	int blocks_per_page = PAGE_CACHE_SIZE >> orig_inode->i_blkbits;
+	int data_offset_in_page;
+	int block_len_in_page;
+	int uninit;
+
+	/* orig and donor should be different file */
+	if (orig_inode->i_ino == donor_inode->i_ino) {
+		ext4_debug("ext4 move extent: The argument files should not "
+			"be same file [ino:orig %lu, donor %lu]\n",
+			orig_inode->i_ino, donor_inode->i_ino);
+		return -EINVAL;
+	}
+
+	/* Regular file check */
+	if (!S_ISREG(orig_inode->i_mode) || !S_ISREG(donor_inode->i_mode)) {
+		ext4_debug("ext4 move extent: The argument files should be "
+			"regular file [ino:orig %lu, donor %lu]\n",
+			orig_inode->i_ino, donor_inode->i_ino);
+		return -EINVAL;
+	}
+
+	/* Protect orig and donor inodes against a truncate */
+	ret1 = mext_inode_double_lock(orig_inode, donor_inode);
+	if (ret1 < 0)
+		return ret1;
+
+	/* Protect extent tree against block allocations via delalloc */
+	double_down_write_data_sem(orig_inode, donor_inode);
+	/* Check the filesystem environment whether move_extent can be done */
+	ret1 = mext_check_arguments(orig_inode, donor_inode, orig_start,
+				    donor_start, &len);
+	if (ret1)
+		goto out;
+
+	/* Last block of the file and last block of the requested range */
+	file_end = (i_size_read(orig_inode) - 1) >> orig_inode->i_blkbits;
+	block_end = block_start + len - 1;
+	if (file_end < block_end)
+		len -= block_end - file_end;
+
+	ret1 = get_ext_path(orig_inode, block_start, &orig_path);
+	if (ret1)
+		goto out;
+
+	/* Get path structure to check the hole */
+	ret1 = get_ext_path(orig_inode, block_start, &holecheck_path);
+	if (ret1)
+		goto out;
+
+	depth = ext_depth(orig_inode);
+	ext_cur = holecheck_path[depth].p_ext;
+
+	/*
+	 * Get proper starting location of block replacement if block_start was
+	 * within the hole.
+	 */
+	if (le32_to_cpu(ext_cur->ee_block) +
+		ext4_ext_get_actual_len(ext_cur) - 1 < block_start) {
+		/*
+		 * The hole exists between extents or the tail of
+		 * original file.
+		 */
+		last_extent = mext_next_extent(orig_inode,
+					holecheck_path, &ext_cur);
+		if (last_extent < 0) {
+			ret1 = last_extent;
+			goto out;
+		}
+		/* Advance orig_path in step; the extent it yields is unused */
+		last_extent = mext_next_extent(orig_inode, orig_path,
+							&ext_dummy);
+		if (last_extent < 0) {
+			ret1 = last_extent;
+			goto out;
+		}
+		seq_start = le32_to_cpu(ext_cur->ee_block);
+	} else if (le32_to_cpu(ext_cur->ee_block) > block_start)
+		/* The hole exists at the beginning of original file. */
+		seq_start = le32_to_cpu(ext_cur->ee_block);
+	else
+		seq_start = block_start;
+
+	/* No blocks within the specified range. */
+	if (le32_to_cpu(ext_cur->ee_block) > block_end) {
+		ext4_debug("ext4 move extent: The specified range of file "
+							"may be the hole\n");
+		ret1 = -EINVAL;
+		goto out;
+	}
+
+	/* Adjust start blocks */
+	add_blocks = min(le32_to_cpu(ext_cur->ee_block) +
+			 ext4_ext_get_actual_len(ext_cur), block_end + 1) -
+		     max(le32_to_cpu(ext_cur->ee_block), block_start);
+
+	/* Each pass accumulates one contiguous run, then swaps it by page */
+	while (!last_extent && le32_to_cpu(ext_cur->ee_block) <= block_end) {
+		seq_blocks += add_blocks;
+
+		/* Adjust tail blocks */
+		if (seq_start + seq_blocks - 1 > block_end)
+			seq_blocks = block_end - seq_start + 1;
+
+		ext_prev = ext_cur;
+		last_extent = mext_next_extent(orig_inode, holecheck_path,
+						&ext_cur);
+		if (last_extent < 0) {
+			ret1 = last_extent;
+			break;
+		}
+		add_blocks = ext4_ext_get_actual_len(ext_cur);
+
+		/*
+		 * Extend the length of contiguous block (seq_blocks)
+		 * if extents are contiguous.
+		 */
+		if (ext4_can_extents_be_merged(orig_inode,
+					       ext_prev, ext_cur) &&
+		    block_end >= le32_to_cpu(ext_cur->ee_block) &&
+		    !last_extent)
+			continue;
+
+		/* Is original extent is uninitialized */
+		uninit = ext4_ext_is_uninitialized(ext_prev);
+
+		data_offset_in_page = seq_start % blocks_per_page;
+
+		/*
+		 * Calculate data blocks count that should be swapped
+		 * at the first page.
+		 */
+		if (data_offset_in_page + seq_blocks > blocks_per_page) {
+			/* Swapped blocks are across pages */
+			block_len_in_page =
+					blocks_per_page - data_offset_in_page;
+		} else {
+			/* Swapped blocks are in a page */
+			block_len_in_page = seq_blocks;
+		}
+
+		orig_page_offset = seq_start >>
+				(PAGE_CACHE_SHIFT - orig_inode->i_blkbits);
+		seq_end_page = (seq_start + seq_blocks - 1) >>
+				(PAGE_CACHE_SHIFT - orig_inode->i_blkbits);
+		/* From here on seq_start is the start of the NEXT run */
+		seq_start = le32_to_cpu(ext_cur->ee_block);
+		rest_blocks = seq_blocks;
+
+		/*
+		 * Up semaphore to avoid following problems:
+		 * a. transaction deadlock among ext4_journal_start,
+		 *    ->write_begin via pagefault, and jbd2_journal_commit
+		 * b. racing with ->readpage, ->write_begin, and ext4_get_block
+		 *    in move_extent_per_page
+		 */
+		double_up_write_data_sem(orig_inode, donor_inode);
+
+		while (orig_page_offset <= seq_end_page) {
+
+			/* Swap original branches with new branches */
+			block_len_in_page = move_extent_per_page(
+						o_filp, donor_inode,
+						orig_page_offset,
+						data_offset_in_page,
+						block_len_in_page, uninit,
+						&ret1);
+
+			/* Count how many blocks we have exchanged */
+			*moved_len += block_len_in_page;
+			if (ret1 < 0)
+				break;
+			if (*moved_len > len) {
+				EXT4_ERROR_INODE(orig_inode,
+					"We replaced blocks too much! "
+					"sum of replaced: %llu requested: %llu",
+					*moved_len, len);
+				ret1 = -EIO;
+				break;
+			}
+
+			/* Every page after the first starts at block 0 */
+			orig_page_offset++;
+			data_offset_in_page = 0;
+			rest_blocks -= block_len_in_page;
+			if (rest_blocks > blocks_per_page)
+				block_len_in_page = blocks_per_page;
+			else
+				block_len_in_page = rest_blocks;
+		}
+
+		/* Retake the semaphores first: "out" releases them again */
+		double_down_write_data_sem(orig_inode, donor_inode);
+		if (ret1 < 0)
+			break;
+
+		/* Decrease buffer counter */
+		if (holecheck_path)
+			ext4_ext_drop_refs(holecheck_path);
+		ret1 = get_ext_path(orig_inode, seq_start, &holecheck_path);
+		if (ret1)
+			break;
+		depth = holecheck_path->p_depth;
+
+		/* Decrease buffer counter */
+		if (orig_path)
+			ext4_ext_drop_refs(orig_path);
+		ret1 = get_ext_path(orig_inode, seq_start, &orig_path);
+		if (ret1)
+			break;
+
+		ext_cur = holecheck_path[depth].p_ext;
+		add_blocks = ext4_ext_get_actual_len(ext_cur);
+		seq_blocks = 0;
+
+	}
+out:
+	/* Discard preallocations on both inodes once any block has moved */
+	if (*moved_len) {
+		ext4_discard_preallocations(orig_inode);
+		ext4_discard_preallocations(donor_inode);
+	}
+
+	if (orig_path) {
+		ext4_ext_drop_refs(orig_path);
+		kfree(orig_path);
+	}
+	if (holecheck_path) {
+		ext4_ext_drop_refs(holecheck_path);
+		kfree(holecheck_path);
+	}
+	double_up_write_data_sem(orig_inode, donor_inode);
+	ret2 = mext_inode_double_unlock(orig_inode, donor_inode);
+
+	/* An error from the move itself wins over an unlock error */
+	if (ret1)
+		return ret1;
+	else if (ret2)
+		return ret2;
+
+	return 0;
+}
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c new file
mode 100644
index 00000000..0a94cbbe
--- /dev/null
+++ b/fs/ext4/namei.c
@@ -0,0 +1,2607 @@
+/*
+ *  linux/fs/ext4/namei.c
+ *
+ * Copyright (C) 1992, 1993, 1994, 1995
+ * Remy Card (card@masi.ibp.fr)
+ * Laboratoire MASI - Institut Blaise Pascal
+ * Universite Pierre et Marie Curie (Paris VI)
+ *
+ *  from
+ *
+ *  linux/fs/minix/namei.c
+ *
+ *  Copyright (C) 1991, 1992  Linus Torvalds
+ *
+ *  Big-endian to little-endian byte-swapping/bitmaps by
+ *        David S. Miller (davem@caip.rutgers.edu), 1995
+ *  Directory entry file type support and forward compatibility hooks
+ *	for B-tree directories by Theodore Ts'o (tytso@mit.edu), 1998
+ *  Hash Tree Directory indexing (c)
+ *	Daniel Phillips, 2001
+ *  Hash Tree Directory indexing porting
+ *	Christopher Li, 2002
+ *  Hash Tree Directory indexing cleanup
+ *	Theodore Ts'o, 2002
+ */
+
+#include <linux/fs.h>
+#include <linux/pagemap.h>
+#include <linux/jbd2.h>
+#include <linux/time.h>
+#include <linux/fcntl.h>
+#include <linux/stat.h>
+#include <linux/string.h>
+#include <linux/quotaops.h>
+#include <linux/buffer_head.h>
+#include <linux/bio.h>
+#include "ext4.h"
+#include "ext4_jbd2.h"
+
+#include "xattr.h"
+#include "acl.h"
+
+#include <trace/events/ext4.h>
+/*
+ * define how far ahead to read directories while searching them.
+ */
+#define NAMEI_RA_CHUNKS  2
+#define NAMEI_RA_BLOCKS  4
+#define NAMEI_RA_SIZE	     (NAMEI_RA_CHUNKS * NAMEI_RA_BLOCKS)
+#define NAMEI_RA_INDEX(c,b)  (((c) * NAMEI_RA_BLOCKS) + (b))
+
+/*
+ * Append one block to @inode.  *block receives the logical number of the new
+ * block (i_size expressed in filesystem blocks).  On success i_size and
+ * i_disksize have grown by one block and the returned buffer has journal
+ * write access.  On failure NULL is returned; *err is written by
+ * ext4_bread() or by ext4_journal_get_write_access().
+ */
+static struct buffer_head *ext4_append(handle_t *handle,
+					struct inode *inode,
+					ext4_lblk_t *block, int *err)
+{
+	struct buffer_head *new_bh;
+
+	*block = inode->i_size >> inode->i_sb->s_blocksize_bits;
+
+	new_bh = ext4_bread(handle, inode, *block, 1, err);
+	if (!new_bh)
+		return NULL;
+
+	inode->i_size += inode->i_sb->s_blocksize;
+	EXT4_I(inode)->i_disksize = inode->i_size;
+
+	*err = ext4_journal_get_write_access(handle, new_bh);
+	if (*err) {
+		brelse(new_bh);
+		return NULL;
+	}
+	return new_bh;
+}
+
+#ifndef assert
+#define assert(test) J_ASSERT(test)
+#endif
+
+#ifdef DX_DEBUG
+#define dxtrace(command) command
+#else
+#define dxtrace(command)
+#endif
+
+struct fake_dirent
+{
+	__le32 inode;
+	__le16 rec_len;
+	u8 name_len;
+	u8 file_type;
+};
+
+struct dx_countlimit
+{
+	__le16 limit;
+	__le16 count;
+};
+
+struct dx_entry
+{
+	__le32 hash;
+	__le32 block;
+};
+
+/*
+ * dx_root_info is laid out so that if it should somehow get overlaid by a
+ * dirent the two low bits of the hash version will be zero.  Therefore, the
+ * hash version mod 4 should never be 0.  Sincerely, the paranoia department.
+ */
+
+/* Layout of directory block 0 of an indexed directory, as read by dx_probe() */
+struct dx_root
+{
+	struct fake_dirent dot;
+	char dot_name[4];
+	struct fake_dirent dotdot;
+	char dotdot_name[4];
+	struct dx_root_info
+	{
+		__le32 reserved_zero;
+		u8 hash_version;
+		u8 info_length; /* 8 */
+		u8 indirect_levels;
+		u8 unused_flags;
+	}
+	info;
+	struct dx_entry	entries[0];
+};
+
+/* Layout of an interior index block: a fake dirent followed by the entries */
+struct dx_node
+{
+	struct fake_dirent fake;
+	struct dx_entry	entries[0];
+};
+
+
+/* One level of a lookup path: the block, its entry array, the chosen entry */
+struct dx_frame
+{
+	struct buffer_head *bh;
+	struct dx_entry *entries;
+	struct dx_entry *at;
+};
+
+struct dx_map_entry
+{
+	u32 hash;
+	u16 offs;
+	u16 size;
+};
+
+static inline ext4_lblk_t dx_get_block(struct dx_entry *entry);
+static void dx_set_block(struct dx_entry *entry, ext4_lblk_t value);
+static inline unsigned dx_get_hash(struct dx_entry *entry);
+static void dx_set_hash(struct dx_entry *entry, unsigned value);
+static unsigned dx_get_count(struct dx_entry *entries);
+static unsigned dx_get_limit(struct dx_entry *entries);
+static void dx_set_count(struct dx_entry *entries, unsigned value);
+static void dx_set_limit(struct dx_entry *entries, unsigned value);
+static unsigned dx_root_limit(struct inode *dir, unsigned infosize);
+static unsigned dx_node_limit(struct inode *dir);
+static struct dx_frame *dx_probe(const struct qstr *d_name,
+				 struct inode *dir,
+				 struct dx_hash_info *hinfo,
+				 struct dx_frame *frame,
+				 int *err);
+static void dx_release(struct dx_frame *frames);
+static int dx_make_map(struct ext4_dir_entry_2 *de, unsigned blocksize,
+		       struct dx_hash_info *hinfo, struct dx_map_entry map[]);
+static void dx_sort_map(struct dx_map_entry *map, unsigned count);
+static struct ext4_dir_entry_2 *dx_move_dirents(char *from, char *to,
+		struct dx_map_entry *offsets, int count, unsigned blocksize);
+static struct ext4_dir_entry_2* dx_pack_dirents(char *base, unsigned blocksize);
+static void dx_insert_block(struct dx_frame *frame,
+					u32 hash, ext4_lblk_t block);
+static int ext4_htree_next_block(struct inode *dir, __u32 hash,
+				 struct dx_frame *frame,
+				 struct dx_frame *frames,
+				 __u32 *start_hash);
+static struct buffer_head * ext4_dx_find_entry(struct inode *dir,
+		const struct qstr *d_name,
+		struct ext4_dir_entry_2 **res_dir,
+		int *err);
+static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry,
+			     struct inode *inode);
+
+/*
+ * p is at least 6 bytes before the end of page
+ *
+ * Returns the entry that starts rec_len bytes after @p.
+ */
+static inline struct ext4_dir_entry_2 *
+ext4_next_entry(struct ext4_dir_entry_2 *p, unsigned long blocksize)
+{
+	return (struct ext4_dir_entry_2 *)((char *)p +
+		ext4_rec_len_from_disk(p->rec_len, blocksize));
+}
+
+/*
+ * Future: use high four bits of block for coalesce-on-delete flags
+ * Mask them off for now.
+ *
+ * NOTE(review): the mask below clears the high EIGHT bits (0x00ffffff), not
+ * four as the sentence above says -- confirm which of the two is intended.
+ */
+
+static inline ext4_lblk_t dx_get_block(struct dx_entry *entry)
+{
+	return le32_to_cpu(entry->block) & 0x00ffffff;
+}
+
+static inline void dx_set_block(struct dx_entry *entry, ext4_lblk_t value)
+{
+	entry->block = cpu_to_le32(value);
+}
+
+static inline unsigned dx_get_hash(struct dx_entry *entry)
+{
+	return le32_to_cpu(entry->hash);
+}
+
+static inline void dx_set_hash(struct dx_entry *entry, unsigned value)
+{
+	entry->hash = cpu_to_le32(value);
+}
+
+/*
+ * The count/limit accessors below overlay a struct dx_countlimit on the
+ * first dx_entry of the array.
+ */
+static inline unsigned dx_get_count(struct dx_entry *entries)
+{
+	return le16_to_cpu(((struct dx_countlimit *) entries)->count);
+}
+
+static inline unsigned dx_get_limit(struct dx_entry *entries)
+{
+	return le16_to_cpu(((struct dx_countlimit *) entries)->limit);
+}
+
+static inline void dx_set_count(struct dx_entry *entries, unsigned value)
+{
+	((struct dx_countlimit *) entries)->count = cpu_to_le16(value);
+}
+
+static inline void dx_set_limit(struct dx_entry *entries, unsigned value)
+{
+	((struct dx_countlimit *) entries)->limit = cpu_to_le16(value);
+}
+
+/*
+ * Number of dx_entry slots in the root block: the block size minus the
+ * record lengths for name lengths 1 and 2 and the @infosize bytes.
+ */
+static inline unsigned dx_root_limit(struct inode *dir, unsigned infosize)
+{
+	unsigned entry_space = dir->i_sb->s_blocksize - EXT4_DIR_REC_LEN(1) -
+		EXT4_DIR_REC_LEN(2) - infosize;
+	return entry_space / sizeof(struct dx_entry);
+}
+
+/*
+ * Number of dx_entry slots in an interior index block: the block size minus
+ * the record length for a zero-length name.
+ */
+static inline unsigned dx_node_limit(struct inode *dir)
+{
+	unsigned entry_space = dir->i_sb->s_blocksize - EXT4_DIR_REC_LEN(0);
+	return entry_space / sizeof(struct dx_entry);
+}
+
+/*
+ * Debug
+ */
+#ifdef DX_DEBUG
+/* Print every hash->block pair of one index block (entry 0 shown as hash 0) */
+static void dx_show_index(char * label, struct dx_entry *entries)
+{
+	int i, n = dx_get_count (entries);
+	printk(KERN_DEBUG "%s index ", label);
+	for (i = 0; i < n; i++) {
+		printk("%x->%lu ", i ? dx_get_hash(entries + i) :
+				0, (unsigned long)dx_get_block(entries + i));
+	}
+	printk("\n");
+}
+
+struct stats
+{
+	unsigned names;
+	unsigned space;
+	unsigned bcount;
+};
+
+/* Walk one leaf block; count in-use entries and the space they require */
+static struct stats dx_show_leaf(struct dx_hash_info *hinfo, struct ext4_dir_entry_2 *de,
+				 int size, int show_names)
+{
+	unsigned names = 0, space = 0;
+	char *base = (char *) de;
+	struct dx_hash_info h = *hinfo;
+
+	printk("names: ");
+	while ((char *) de < base + size)
+	{
+		if (de->inode)
+		{
+			if (show_names)
+			{
+				int len = de->name_len;
+				char *name = de->name;
+				while (len--) printk("%c", *name++);
+				ext4fs_dirhash(de->name, de->name_len, &h);
+				printk(":%x.%u ", h.hash,
+				       (unsigned) ((char *) de - base));
+			}
+			space += EXT4_DIR_REC_LEN(de->name_len);
+			names++;
+		}
+		de = ext4_next_entry(de, size);
+	}
+	printk("(%i)\n", names);
+	return (struct stats) { names, space, 1 };
+}
+
+/*
+ * Recursively dump an index level: @levels > 0 descends into child index
+ * blocks, @levels == 0 dumps leaves.  Blocks that cannot be read are skipped.
+ */
+struct stats dx_show_entries(struct dx_hash_info *hinfo, struct inode *dir,
+			     struct dx_entry *entries, int levels)
+{
+	unsigned blocksize = dir->i_sb->s_blocksize;
+	unsigned count = dx_get_count(entries), names = 0, space = 0, i;
+	unsigned bcount = 0;
+	struct buffer_head *bh;
+	int err;
+	printk("%i indexed blocks...\n", count);
+	for (i = 0; i < count; i++, entries++)
+	{
+		ext4_lblk_t block = dx_get_block(entries);
+		ext4_lblk_t hash  = i ? dx_get_hash(entries): 0;
+		u32 range = i < count - 1? (dx_get_hash(entries + 1) - hash): ~hash;
+		struct stats stats;
+		printk("%s%3u:%03u hash %8x/%8x ",levels?"":" ", i, block, hash, range);
+		if (!(bh = ext4_bread (NULL,dir, block, 0,&err))) continue;
+		stats = levels?
+		   dx_show_entries(hinfo, dir, ((struct dx_node *) bh->b_data)->entries, levels - 1):
+		   dx_show_leaf(hinfo, (struct ext4_dir_entry_2 *) bh->b_data, blocksize, 0);
+		names += stats.names;
+		space += stats.space;
+		bcount += stats.bcount;
+		brelse(bh);
+	}
+	if (bcount)
+		printk(KERN_DEBUG "%snames %u, fullness %u (%u%%)\n",
+		       levels ? "" : " ", names, space/bcount,
+		       (space/bcount)*100/blocksize);
+	return (struct stats) { names, space, bcount};
+}
+#endif /* DX_DEBUG */
+
+/*
+ * Probe for a directory leaf block to search.
+ *
+ * dx_probe can return ERR_BAD_DX_DIR, which means there was a format
+ * error in the directory index, and the caller should fall back to
+ * searching the directory normally.  The callers of dx_probe **MUST**
+ * check for this error code, and make sure it never gets reflected
+ * back to userspace.
+ *
+ * On success the last frame filled in is returned and every frame from
+ * frame_in up to it holds a buffer reference.  On failure NULL is
+ * returned and the buffers read so far are released.
+ */
+static struct dx_frame *
+dx_probe(const struct qstr *d_name, struct inode *dir,
+	 struct dx_hash_info *hinfo, struct dx_frame *frame_in, int *err)
+{
+	unsigned count, indirect;
+	struct dx_entry *at, *entries, *p, *q, *m;
+	struct dx_root *root;
+	struct buffer_head *bh;
+	struct dx_frame *frame = frame_in;
+	u32 hash;
+
+	frame->bh = NULL;
+	/* The index root lives in directory block 0 */
+	if (!(bh = ext4_bread (NULL,dir, 0, 0, err)))
+		goto fail;
+	root = (struct dx_root *) bh->b_data;
+	if (root->info.hash_version != DX_HASH_TEA &&
+	    root->info.hash_version != DX_HASH_HALF_MD4 &&
+	    root->info.hash_version != DX_HASH_LEGACY) {
+		ext4_warning(dir->i_sb, "Unrecognised inode hash code %d",
+			     root->info.hash_version);
+		brelse(bh);
+		*err = ERR_BAD_DX_DIR;
+		goto fail;
+	}
+	hinfo->hash_version = root->info.hash_version;
+	if (hinfo->hash_version <= DX_HASH_TEA)
+		hinfo->hash_version += EXT4_SB(dir->i_sb)->s_hash_unsigned;
+	hinfo->seed = EXT4_SB(dir->i_sb)->s_hash_seed;
+	/* Without a name, the hash already stored in hinfo is used as is */
+	if (d_name)
+		ext4fs_dirhash(d_name->name, d_name->len, hinfo);
+	hash = hinfo->hash;
+
+	if (root->info.unused_flags & 1) {
+		ext4_warning(dir->i_sb, "Unimplemented inode hash flags: %#06x",
+			     root->info.unused_flags);
+		brelse(bh);
+		*err = ERR_BAD_DX_DIR;
+		goto fail;
+	}
+
+	/* At most one interior level below the root is accepted */
+	if ((indirect = root->info.indirect_levels) > 1) {
+		ext4_warning(dir->i_sb, "Unimplemented inode hash depth: %#06x",
+			     root->info.indirect_levels);
+		brelse(bh);
+		*err = ERR_BAD_DX_DIR;
+		goto fail;
+	}
+
+	entries = (struct dx_entry *) (((char *)&root->info) +
+				       root->info.info_length);
+
+	if (dx_get_limit(entries) != dx_root_limit(dir,
+						   root->info.info_length)) {
+		ext4_warning(dir->i_sb, "dx entry: limit != root limit");
+		brelse(bh);
+		*err = ERR_BAD_DX_DIR;
+		goto fail;
+	}
+
+	dxtrace(printk("Look up %x", hash));
+	while (1)
+	{
+		count = dx_get_count(entries);
+		if (!count || count > dx_get_limit(entries)) {
+			ext4_warning(dir->i_sb,
+				     "dx entry: no count or count > limit");
+			brelse(bh);
+			*err = ERR_BAD_DX_DIR;
+			goto fail2;
+		}
+
+		/*
+		 * Binary search over entries[1..count-1]; on exit p - 1 is
+		 * the last entry whose hash is <= hash (entry 0 if none).
+		 */
+		p = entries + 1;
+		q = entries + count - 1;
+		while (p <= q)
+		{
+			m = p + (q - p)/2;
+			dxtrace(printk("."));
+			if (dx_get_hash(m) > hash)
+				q = m - 1;
+			else
+				p = m + 1;
+		}
+
+		if (0) // linear search cross check
+		{
+			unsigned n = count - 1;
+			at = entries;
+			while (n--)
+			{
+				dxtrace(printk(","));
+				if (dx_get_hash(++at) > hash)
+				{
+					at--;
+					break;
+				}
+			}
+			assert (at == p - 1);
+		}
+
+		at = p - 1;
+		dxtrace(printk(" %x->%u\n", at == entries? 0: dx_get_hash(at), dx_get_block(at)));
+		frame->bh = bh;
+		frame->entries = entries;
+		frame->at = at;
+		/* No further index level: this frame points at the leaf */
+		if (!indirect--) return frame;
+		if (!(bh = ext4_bread (NULL,dir, dx_get_block(at), 0, err)))
+			goto fail2;
+		at = entries = ((struct dx_node *) bh->b_data)->entries;
+		if (dx_get_limit(entries) != dx_node_limit (dir)) {
+			ext4_warning(dir->i_sb,
+				     "dx entry: limit != node limit");
+			brelse(bh);
+			*err = ERR_BAD_DX_DIR;
+			goto fail2;
+		}
+		frame++;
+		frame->bh = NULL;
+	}
+fail2:
+	/* Release every frame filled in so far; the current one may be NULL */
+	while (frame >= frame_in) {
+		brelse(frame->bh);
+		frame--;
+	}
+fail:
+	if (*err == ERR_BAD_DX_DIR)
+		ext4_warning(dir->i_sb,
+			     "Corrupt dir inode %lu, running e2fsck is "
+			     "recommended.", dir->i_ino);
+	return NULL;
+}
+
+/*
+ * Release the buffers held by a frame array.  Does nothing if frames[0]
+ * holds no buffer; frames[1] is released only when the root block records
+ * a non-zero number of indirect levels.
+ */
+static void dx_release (struct dx_frame *frames)
+{
+	if (frames[0].bh == NULL)
+		return;
+
+	if (((struct dx_root *) frames[0].bh->b_data)->info.indirect_levels)
+		brelse(frames[1].bh);
+	brelse(frames[0].bh);
+}
+
+/*
+ * This function increments the frame pointer to search the next leaf
+ * block, and reads in the necessary intervening nodes if the search
+ * should be necessary.  Whether or not the search is necessary is
+ * controlled by the hash parameter.  If the hash value is even, then
+ * the search is only continued if the next block starts with that
+ * hash value.  This is used if we are searching for a specific file.
+ *
+ * If the hash value is HASH_NB_ALWAYS, then always go to the next block.
+ *
+ * This function returns 1 if the caller should continue to search,
+ * or 0 if it should not.  If there is an error reading one of the
+ * index blocks, it will return a negative error code.
+ *
+ * If start_hash is non-null, it will be filled in with the starting
+ * hash of the next page.
+ */ +static int ext4_htree_next_block(struct inode *dir, __u32 hash, + struct dx_frame *frame, + struct dx_frame *frames, + __u32 *start_hash) +{ + struct dx_frame *p; + struct buffer_head *bh; + int err, num_frames = 0; + __u32 bhash; + + p = frame; + /* + * Find the next leaf page by incrementing the frame pointer. + * If we run out of entries in the interior node, loop around and + * increment pointer in the parent node. When we break out of + * this loop, num_frames indicates the number of interior + * nodes need to be read. + */ + while (1) { + if (++(p->at) < p->entries + dx_get_count(p->entries)) + break; + if (p == frames) + return 0; + num_frames++; + p--; + } + + /* + * If the hash is 1, then continue only if the next page has a + * continuation hash of any value. This is used for readdir + * handling. Otherwise, check to see if the hash matches the + * desired contiuation hash. If it doesn't, return since + * there's no point to read in the successive index pages. + */ + bhash = dx_get_hash(p->at); + if (start_hash) + *start_hash = bhash; + if ((hash & 1) == 0) { + if ((bhash & ~1) != hash) + return 0; + } + /* + * If the hash is HASH_NB_ALWAYS, we always go to the next + * block so no check is necessary + */ + while (num_frames--) { + if (!(bh = ext4_bread(NULL, dir, dx_get_block(p->at), + 0, &err))) + return err; /* Failure */ + p++; + brelse(p->bh); + p->bh = bh; + p->at = p->entries = ((struct dx_node *) bh->b_data)->entries; + } + return 1; +} + + +/* + * This function fills a red-black tree with information from a + * directory block. It returns the number directory entries loaded + * into the tree. If there is an error it is returned in err. 
+ */ +static int htree_dirblock_to_tree(struct file *dir_file, + struct inode *dir, ext4_lblk_t block, + struct dx_hash_info *hinfo, + __u32 start_hash, __u32 start_minor_hash) +{ + struct buffer_head *bh; + struct ext4_dir_entry_2 *de, *top; + int err, count = 0; + + dxtrace(printk(KERN_INFO "In htree dirblock_to_tree: block %lu\n", + (unsigned long)block)); + if (!(bh = ext4_bread (NULL, dir, block, 0, &err))) + return err; + + de = (struct ext4_dir_entry_2 *) bh->b_data; + top = (struct ext4_dir_entry_2 *) ((char *) de + + dir->i_sb->s_blocksize - + EXT4_DIR_REC_LEN(0)); + for (; de < top; de = ext4_next_entry(de, dir->i_sb->s_blocksize)) { + if (ext4_check_dir_entry(dir, NULL, de, bh, + (block<i_sb)) + + ((char *)de - bh->b_data))) { + /* On error, skip the f_pos to the next block. */ + dir_file->f_pos = (dir_file->f_pos | + (dir->i_sb->s_blocksize - 1)) + 1; + brelse(bh); + return count; + } + ext4fs_dirhash(de->name, de->name_len, hinfo); + if ((hinfo->hash < start_hash) || + ((hinfo->hash == start_hash) && + (hinfo->minor_hash < start_minor_hash))) + continue; + if (de->inode == 0) + continue; + if ((err = ext4_htree_store_dirent(dir_file, + hinfo->hash, hinfo->minor_hash, de)) != 0) { + brelse(bh); + return err; + } + count++; + } + brelse(bh); + return count; +} + + +/* + * This function fills a red-black tree with information from a + * directory. We start scanning the directory in hash order, starting + * at start_hash and start_minor_hash. + * + * This function returns the number of entries inserted into the tree, + * or a negative error code. 
+ */ +int ext4_htree_fill_tree(struct file *dir_file, __u32 start_hash, + __u32 start_minor_hash, __u32 *next_hash) +{ + struct dx_hash_info hinfo; + struct ext4_dir_entry_2 *de; + struct dx_frame frames[2], *frame; + struct inode *dir; + ext4_lblk_t block; + int count = 0; + int ret, err; + __u32 hashval; + + dxtrace(printk(KERN_DEBUG "In htree_fill_tree, start hash: %x:%x\n", + start_hash, start_minor_hash)); + dir = dir_file->f_path.dentry->d_inode; + if (!(ext4_test_inode_flag(dir, EXT4_INODE_INDEX))) { + hinfo.hash_version = EXT4_SB(dir->i_sb)->s_def_hash_version; + if (hinfo.hash_version <= DX_HASH_TEA) + hinfo.hash_version += + EXT4_SB(dir->i_sb)->s_hash_unsigned; + hinfo.seed = EXT4_SB(dir->i_sb)->s_hash_seed; + count = htree_dirblock_to_tree(dir_file, dir, 0, &hinfo, + start_hash, start_minor_hash); + *next_hash = ~0; + return count; + } + hinfo.hash = start_hash; + hinfo.minor_hash = 0; + frame = dx_probe(NULL, dir, &hinfo, frames, &err); + if (!frame) + return err; + + /* Add '.' and '..' 
from the htree header */ + if (!start_hash && !start_minor_hash) { + de = (struct ext4_dir_entry_2 *) frames[0].bh->b_data; + if ((err = ext4_htree_store_dirent(dir_file, 0, 0, de)) != 0) + goto errout; + count++; + } + if (start_hash < 2 || (start_hash ==2 && start_minor_hash==0)) { + de = (struct ext4_dir_entry_2 *) frames[0].bh->b_data; + de = ext4_next_entry(de, dir->i_sb->s_blocksize); + if ((err = ext4_htree_store_dirent(dir_file, 2, 0, de)) != 0) + goto errout; + count++; + } + + while (1) { + block = dx_get_block(frame->at); + ret = htree_dirblock_to_tree(dir_file, dir, block, &hinfo, + start_hash, start_minor_hash); + if (ret < 0) { + err = ret; + goto errout; + } + count += ret; + hashval = ~0; + ret = ext4_htree_next_block(dir, HASH_NB_ALWAYS, + frame, frames, &hashval); + *next_hash = hashval; + if (ret < 0) { + err = ret; + goto errout; + } + /* + * Stop if: (a) there are no more entries, or + * (b) we have inserted at least one entry and the + * next hash value is not a continuation + */ + if ((ret == 0) || + (count && ((hashval & 1) == 0))) + break; + } + dx_release(frames); + dxtrace(printk(KERN_DEBUG "Fill tree: returned %d entries, " + "next hash: %x\n", count, *next_hash)); + return count; +errout: + dx_release(frames); + return (err); +} + + +/* + * Directory block splitting, compacting + */ + +/* + * Create map of hash values, offsets, and sizes, stored at end of block. + * Returns number of entries mapped. 
+ */ +static int dx_make_map(struct ext4_dir_entry_2 *de, unsigned blocksize, + struct dx_hash_info *hinfo, + struct dx_map_entry *map_tail) +{ + int count = 0; + char *base = (char *) de; + struct dx_hash_info h = *hinfo; + + while ((char *) de < base + blocksize) { + if (de->name_len && de->inode) { + ext4fs_dirhash(de->name, de->name_len, &h); + map_tail--; + map_tail->hash = h.hash; + map_tail->offs = ((char *) de - base)>>2; + map_tail->size = le16_to_cpu(de->rec_len); + count++; + cond_resched(); + } + /* XXX: do we need to check rec_len == 0 case? -Chris */ + de = ext4_next_entry(de, blocksize); + } + return count; +} + +/* Sort map by hash value */ +static void dx_sort_map (struct dx_map_entry *map, unsigned count) +{ + struct dx_map_entry *p, *q, *top = map + count - 1; + int more; + /* Combsort until bubble sort doesn't suck */ + while (count > 2) { + count = count*10/13; + if (count - 9 < 2) /* 9, 10 -> 11 */ + count = 11; + for (p = top, q = p - count; q >= map; p--, q--) + if (p->hash < q->hash) + swap(*p, *q); + } + /* Garden variety bubble sort */ + do { + more = 0; + q = top; + while (q-- > map) { + if (q[1].hash >= q[0].hash) + continue; + swap(*(q+1), *q); + more = 1; + } + } while(more); +} + +static void dx_insert_block(struct dx_frame *frame, u32 hash, ext4_lblk_t block) +{ + struct dx_entry *entries = frame->entries; + struct dx_entry *old = frame->at, *new = old + 1; + int count = dx_get_count(entries); + + assert(count < dx_get_limit(entries)); + assert(old < entries + count); + memmove(new + 1, new, (char *)(entries + count) - (char *)(new)); + dx_set_hash(new, hash); + dx_set_block(new, block); + dx_set_count(entries, count + 1); +} + +static void ext4_update_dx_flag(struct inode *inode) +{ + if (!EXT4_HAS_COMPAT_FEATURE(inode->i_sb, + EXT4_FEATURE_COMPAT_DIR_INDEX)) + ext4_clear_inode_flag(inode, EXT4_INODE_INDEX); +} + +/* + * NOTE! unlike strncmp, ext4_match returns 1 for success, 0 for failure. 
+ * + * `len <= EXT4_NAME_LEN' is guaranteed by caller. + * `de != NULL' is guaranteed by caller. + */ +static inline int ext4_match (int len, const char * const name, + struct ext4_dir_entry_2 * de) +{ + if (len != de->name_len) + return 0; + if (!de->inode) + return 0; + return !memcmp(name, de->name, len); +} + +/* + * Returns 0 if not found, -1 on failure, and 1 on success + */ +static inline int search_dirblock(struct buffer_head *bh, + struct inode *dir, + const struct qstr *d_name, + unsigned int offset, + struct ext4_dir_entry_2 ** res_dir) +{ + struct ext4_dir_entry_2 * de; + char * dlimit; + int de_len; + const char *name = d_name->name; + int namelen = d_name->len; + + de = (struct ext4_dir_entry_2 *) bh->b_data; + dlimit = bh->b_data + dir->i_sb->s_blocksize; + while ((char *) de < dlimit) { + /* this code is executed quadratically often */ + /* do minimal checking `by hand' */ + + if ((char *) de + namelen <= dlimit && + ext4_match (namelen, name, de)) { + /* found a match - just to be sure, do a full check */ + if (ext4_check_dir_entry(dir, NULL, de, bh, offset)) + return -1; + *res_dir = de; + return 1; + } + /* prevent looping on a bad block */ + de_len = ext4_rec_len_from_disk(de->rec_len, + dir->i_sb->s_blocksize); + if (de_len <= 0) + return -1; + offset += de_len; + de = (struct ext4_dir_entry_2 *) ((char *) de + de_len); + } + return 0; +} + + +/* + * ext4_find_entry() + * + * finds an entry in the specified directory with the wanted name. It + * returns the cache buffer in which the entry was found, and the entry + * itself (as a parameter - res_dir). It does NOT read the inode of the + * entry - you'll have to do that yourself if you want to. + * + * The returned buffer_head has ->b_count elevated. The caller is expected + * to brelse() it when appropriate. 
+ */ +static struct buffer_head * ext4_find_entry (struct inode *dir, + const struct qstr *d_name, + struct ext4_dir_entry_2 ** res_dir) +{ + struct super_block *sb; + struct buffer_head *bh_use[NAMEI_RA_SIZE]; + struct buffer_head *bh, *ret = NULL; + ext4_lblk_t start, block, b; + const u8 *name = d_name->name; + int ra_max = 0; /* Number of bh's in the readahead + buffer, bh_use[] */ + int ra_ptr = 0; /* Current index into readahead + buffer */ + int num = 0; + ext4_lblk_t nblocks; + int i, err; + int namelen; + + *res_dir = NULL; + sb = dir->i_sb; + namelen = d_name->len; + if (namelen > EXT4_NAME_LEN) + return NULL; + if ((namelen <= 2) && (name[0] == '.') && + (name[1] == '.' || name[1] == '\0')) { + /* + * "." or ".." will only be in the first block + * NFS may look up ".."; "." should be handled by the VFS + */ + block = start = 0; + nblocks = 1; + goto restart; + } + if (is_dx(dir)) { + bh = ext4_dx_find_entry(dir, d_name, res_dir, &err); + /* + * On success, or if the error was file not found, + * return. Otherwise, fall back to doing a search the + * old fashioned way. + */ + if (bh || (err != ERR_BAD_DX_DIR)) + return bh; + dxtrace(printk(KERN_DEBUG "ext4_find_entry: dx failed, " + "falling back\n")); + } + nblocks = dir->i_size >> EXT4_BLOCK_SIZE_BITS(sb); + start = EXT4_I(dir)->i_dir_start_lookup; + if (start >= nblocks) + start = 0; + block = start; +restart: + do { + /* + * We deal with the read-ahead logic here. + */ + if (ra_ptr >= ra_max) { + /* Refill the readahead buffer */ + ra_ptr = 0; + b = block; + for (ra_max = 0; ra_max < NAMEI_RA_SIZE; ra_max++) { + /* + * Terminate if we reach the end of the + * directory and must wrap, or if our + * search has finished at this block. 
+ */ + if (b >= nblocks || (num && block == start)) { + bh_use[ra_max] = NULL; + break; + } + num++; + bh = ext4_getblk(NULL, dir, b++, 0, &err); + bh_use[ra_max] = bh; + if (bh) + ll_rw_block(READ | REQ_META | REQ_PRIO, + 1, &bh); + } + } + if ((bh = bh_use[ra_ptr++]) == NULL) + goto next; + wait_on_buffer(bh); + if (!buffer_uptodate(bh)) { + /* read error, skip block & hope for the best */ + EXT4_ERROR_INODE(dir, "reading directory lblock %lu", + (unsigned long) block); + brelse(bh); + goto next; + } + i = search_dirblock(bh, dir, d_name, + block << EXT4_BLOCK_SIZE_BITS(sb), res_dir); + if (i == 1) { + EXT4_I(dir)->i_dir_start_lookup = block; + ret = bh; + goto cleanup_and_exit; + } else { + brelse(bh); + if (i < 0) + goto cleanup_and_exit; + } + next: + if (++block >= nblocks) + block = 0; + } while (block != start); + + /* + * If the directory has grown while we were searching, then + * search the last part of the directory before giving up. + */ + block = nblocks; + nblocks = dir->i_size >> EXT4_BLOCK_SIZE_BITS(sb); + if (block < nblocks) { + start = 0; + goto restart; + } + +cleanup_and_exit: + /* Clean up the read-ahead blocks */ + for (; ra_ptr < ra_max; ra_ptr++) + brelse(bh_use[ra_ptr]); + return ret; +} + +static struct buffer_head * ext4_dx_find_entry(struct inode *dir, const struct qstr *d_name, + struct ext4_dir_entry_2 **res_dir, int *err) +{ + struct super_block * sb = dir->i_sb; + struct dx_hash_info hinfo; + struct dx_frame frames[2], *frame; + struct buffer_head *bh; + ext4_lblk_t block; + int retval; + + if (!(frame = dx_probe(d_name, dir, &hinfo, frames, err))) + return NULL; + do { + block = dx_get_block(frame->at); + if (!(bh = ext4_bread(NULL, dir, block, 0, err))) + goto errout; + + retval = search_dirblock(bh, dir, d_name, + block << EXT4_BLOCK_SIZE_BITS(sb), + res_dir); + if (retval == 1) { /* Success! 
*/ + dx_release(frames); + return bh; + } + brelse(bh); + if (retval == -1) { + *err = ERR_BAD_DX_DIR; + goto errout; + } + + /* Check to see if we should continue to search */ + retval = ext4_htree_next_block(dir, hinfo.hash, frame, + frames, NULL); + if (retval < 0) { + ext4_warning(sb, + "error reading index page in directory #%lu", + dir->i_ino); + *err = retval; + goto errout; + } + } while (retval == 1); + + *err = -ENOENT; +errout: + dxtrace(printk(KERN_DEBUG "%s not found\n", d_name->name)); + dx_release (frames); + return NULL; +} + +static struct dentry *ext4_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd) +{ + struct inode *inode; + struct ext4_dir_entry_2 *de; + struct buffer_head *bh; + + if (dentry->d_name.len > EXT4_NAME_LEN) + return ERR_PTR(-ENAMETOOLONG); + + bh = ext4_find_entry(dir, &dentry->d_name, &de); + inode = NULL; + if (bh) { + __u32 ino = le32_to_cpu(de->inode); + brelse(bh); + if (!ext4_valid_inum(dir->i_sb, ino)) { + EXT4_ERROR_INODE(dir, "bad inode number: %u", ino); + return ERR_PTR(-EIO); + } + if (unlikely(ino == dir->i_ino)) { + EXT4_ERROR_INODE(dir, "'%.*s' linked to parent dir", + dentry->d_name.len, + dentry->d_name.name); + return ERR_PTR(-EIO); + } + inode = ext4_iget(dir->i_sb, ino); + if (inode == ERR_PTR(-ESTALE)) { + EXT4_ERROR_INODE(dir, + "deleted inode referenced: %u", + ino); + return ERR_PTR(-EIO); + } + } + return d_splice_alias(inode, dentry); +} + + +struct dentry *ext4_get_parent(struct dentry *child) +{ + __u32 ino; + static const struct qstr dotdot = { + .name = "..", + .len = 2, + }; + struct ext4_dir_entry_2 * de; + struct buffer_head *bh; + + bh = ext4_find_entry(child->d_inode, &dotdot, &de); + if (!bh) + return ERR_PTR(-ENOENT); + ino = le32_to_cpu(de->inode); + brelse(bh); + + if (!ext4_valid_inum(child->d_inode->i_sb, ino)) { + EXT4_ERROR_INODE(child->d_inode, + "bad parent inode number: %u", ino); + return ERR_PTR(-EIO); + } + + return 
d_obtain_alias(ext4_iget(child->d_inode->i_sb, ino)); +} + +#define S_SHIFT 12 +static unsigned char ext4_type_by_mode[S_IFMT >> S_SHIFT] = { + [S_IFREG >> S_SHIFT] = EXT4_FT_REG_FILE, + [S_IFDIR >> S_SHIFT] = EXT4_FT_DIR, + [S_IFCHR >> S_SHIFT] = EXT4_FT_CHRDEV, + [S_IFBLK >> S_SHIFT] = EXT4_FT_BLKDEV, + [S_IFIFO >> S_SHIFT] = EXT4_FT_FIFO, + [S_IFSOCK >> S_SHIFT] = EXT4_FT_SOCK, + [S_IFLNK >> S_SHIFT] = EXT4_FT_SYMLINK, +}; + +static inline void ext4_set_de_type(struct super_block *sb, + struct ext4_dir_entry_2 *de, + umode_t mode) { + if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_FILETYPE)) + de->file_type = ext4_type_by_mode[(mode & S_IFMT)>>S_SHIFT]; +} + +/* + * Move count entries from end of map between two memory locations. + * Returns pointer to last entry moved. + */ +static struct ext4_dir_entry_2 * +dx_move_dirents(char *from, char *to, struct dx_map_entry *map, int count, + unsigned blocksize) +{ + unsigned rec_len = 0; + + while (count--) { + struct ext4_dir_entry_2 *de = (struct ext4_dir_entry_2 *) + (from + (map->offs<<2)); + rec_len = EXT4_DIR_REC_LEN(de->name_len); + memcpy (to, de, rec_len); + ((struct ext4_dir_entry_2 *) to)->rec_len = + ext4_rec_len_to_disk(rec_len, blocksize); + de->inode = 0; + map++; + to += rec_len; + } + return (struct ext4_dir_entry_2 *) (to - rec_len); +} + +/* + * Compact each dir entry in the range to the minimal rec_len. + * Returns pointer to last entry in range. 
+ */ +static struct ext4_dir_entry_2* dx_pack_dirents(char *base, unsigned blocksize) +{ + struct ext4_dir_entry_2 *next, *to, *prev, *de = (struct ext4_dir_entry_2 *) base; + unsigned rec_len = 0; + + prev = to = de; + while ((char*)de < base + blocksize) { + next = ext4_next_entry(de, blocksize); + if (de->inode && de->name_len) { + rec_len = EXT4_DIR_REC_LEN(de->name_len); + if (de > to) + memmove(to, de, rec_len); + to->rec_len = ext4_rec_len_to_disk(rec_len, blocksize); + prev = to; + to = (struct ext4_dir_entry_2 *) (((char *) to) + rec_len); + } + de = next; + } + return prev; +} + +/* + * Split a full leaf block to make room for a new dir entry. + * Allocate a new block, and move entries so that they are approx. equally full. + * Returns pointer to de in block into which the new entry will be inserted. + */ +static struct ext4_dir_entry_2 *do_split(handle_t *handle, struct inode *dir, + struct buffer_head **bh,struct dx_frame *frame, + struct dx_hash_info *hinfo, int *error) +{ + unsigned blocksize = dir->i_sb->s_blocksize; + unsigned count, continued; + struct buffer_head *bh2; + ext4_lblk_t newblock; + u32 hash2; + struct dx_map_entry *map; + char *data1 = (*bh)->b_data, *data2; + unsigned split, move, size; + struct ext4_dir_entry_2 *de = NULL, *de2; + int err = 0, i; + + bh2 = ext4_append (handle, dir, &newblock, &err); + if (!(bh2)) { + brelse(*bh); + *bh = NULL; + goto errout; + } + + BUFFER_TRACE(*bh, "get_write_access"); + err = ext4_journal_get_write_access(handle, *bh); + if (err) + goto journal_error; + + BUFFER_TRACE(frame->bh, "get_write_access"); + err = ext4_journal_get_write_access(handle, frame->bh); + if (err) + goto journal_error; + + data2 = bh2->b_data; + + /* create map in the end of data2 block */ + map = (struct dx_map_entry *) (data2 + blocksize); + count = dx_make_map((struct ext4_dir_entry_2 *) data1, + blocksize, hinfo, map); + map -= count; + dx_sort_map(map, count); + /* Split the existing block in the middle, size-wise */ + 
size = 0; + move = 0; + for (i = count-1; i >= 0; i--) { + /* is more than half of this entry in 2nd half of the block? */ + if (size + map[i].size/2 > blocksize/2) + break; + size += map[i].size; + move++; + } + /* map index at which we will split */ + split = count - move; + hash2 = map[split].hash; + continued = hash2 == map[split - 1].hash; + dxtrace(printk(KERN_INFO "Split block %lu at %x, %i/%i\n", + (unsigned long)dx_get_block(frame->at), + hash2, split, count-split)); + + /* Fancy dance to stay within two buffers */ + de2 = dx_move_dirents(data1, data2, map + split, count - split, blocksize); + de = dx_pack_dirents(data1, blocksize); + de->rec_len = ext4_rec_len_to_disk(data1 + blocksize - (char *) de, + blocksize); + de2->rec_len = ext4_rec_len_to_disk(data2 + blocksize - (char *) de2, + blocksize); + dxtrace(dx_show_leaf (hinfo, (struct ext4_dir_entry_2 *) data1, blocksize, 1)); + dxtrace(dx_show_leaf (hinfo, (struct ext4_dir_entry_2 *) data2, blocksize, 1)); + + /* Which block gets the new entry? */ + if (hinfo->hash >= hash2) + { + swap(*bh, bh2); + de = de2; + } + dx_insert_block(frame, hash2 + continued, newblock); + err = ext4_handle_dirty_metadata(handle, dir, bh2); + if (err) + goto journal_error; + err = ext4_handle_dirty_metadata(handle, dir, frame->bh); + if (err) + goto journal_error; + brelse(bh2); + dxtrace(dx_show_index("frame", frame->entries)); + return de; + +journal_error: + brelse(*bh); + brelse(bh2); + *bh = NULL; + ext4_std_error(dir->i_sb, err); +errout: + *error = err; + return NULL; +} + +/* + * Add a new entry into a directory (leaf) block. If de is non-NULL, + * it points to a directory entry which is guaranteed to be large + * enough for new directory entry. If de is NULL, then + * add_dirent_to_buf will attempt search the directory block for + * space. It will return -ENOSPC if no space is available, and -EIO + * and -EEXIST if directory entry already exists. 
+ */ +static int add_dirent_to_buf(handle_t *handle, struct dentry *dentry, + struct inode *inode, struct ext4_dir_entry_2 *de, + struct buffer_head *bh) +{ + struct inode *dir = dentry->d_parent->d_inode; + const char *name = dentry->d_name.name; + int namelen = dentry->d_name.len; + unsigned int offset = 0; + unsigned int blocksize = dir->i_sb->s_blocksize; + unsigned short reclen; + int nlen, rlen, err; + char *top; + + reclen = EXT4_DIR_REC_LEN(namelen); + if (!de) { + de = (struct ext4_dir_entry_2 *)bh->b_data; + top = bh->b_data + blocksize - reclen; + while ((char *) de <= top) { + if (ext4_check_dir_entry(dir, NULL, de, bh, offset)) + return -EIO; + if (ext4_match(namelen, name, de)) + return -EEXIST; + nlen = EXT4_DIR_REC_LEN(de->name_len); + rlen = ext4_rec_len_from_disk(de->rec_len, blocksize); + if ((de->inode? rlen - nlen: rlen) >= reclen) + break; + de = (struct ext4_dir_entry_2 *)((char *)de + rlen); + offset += rlen; + } + if ((char *) de > top) + return -ENOSPC; + } + BUFFER_TRACE(bh, "get_write_access"); + err = ext4_journal_get_write_access(handle, bh); + if (err) { + ext4_std_error(dir->i_sb, err); + return err; + } + + /* By now the buffer is marked for journaling */ + nlen = EXT4_DIR_REC_LEN(de->name_len); + rlen = ext4_rec_len_from_disk(de->rec_len, blocksize); + if (de->inode) { + struct ext4_dir_entry_2 *de1 = (struct ext4_dir_entry_2 *)((char *)de + nlen); + de1->rec_len = ext4_rec_len_to_disk(rlen - nlen, blocksize); + de->rec_len = ext4_rec_len_to_disk(nlen, blocksize); + de = de1; + } + de->file_type = EXT4_FT_UNKNOWN; + if (inode) { + de->inode = cpu_to_le32(inode->i_ino); + ext4_set_de_type(dir->i_sb, de, inode->i_mode); + } else + de->inode = 0; + de->name_len = namelen; + memcpy(de->name, name, namelen); + /* + * XXX shouldn't update any times until successful + * completion of syscall, but too many callers depend + * on this. 
+ * + * XXX similarly, too many callers depend on + * ext4_new_inode() setting the times, but error + * recovery deletes the inode, so the worst that can + * happen is that the times are slightly out of date + * and/or different from the directory change time. + */ + dir->i_mtime = dir->i_ctime = ext4_current_time(dir); + ext4_update_dx_flag(dir); + dir->i_version++; + ext4_mark_inode_dirty(handle, dir); + BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata"); + err = ext4_handle_dirty_metadata(handle, dir, bh); + if (err) + ext4_std_error(dir->i_sb, err); + return 0; +} + +/* + * This converts a one block unindexed directory to a 3 block indexed + * directory, and adds the dentry to the indexed directory. + */ +static int make_indexed_dir(handle_t *handle, struct dentry *dentry, + struct inode *inode, struct buffer_head *bh) +{ + struct inode *dir = dentry->d_parent->d_inode; + const char *name = dentry->d_name.name; + int namelen = dentry->d_name.len; + struct buffer_head *bh2; + struct dx_root *root; + struct dx_frame frames[2], *frame; + struct dx_entry *entries; + struct ext4_dir_entry_2 *de, *de2; + char *data1, *top; + unsigned len; + int retval; + unsigned blocksize; + struct dx_hash_info hinfo; + ext4_lblk_t block; + struct fake_dirent *fde; + + blocksize = dir->i_sb->s_blocksize; + dxtrace(printk(KERN_DEBUG "Creating index: inode %lu\n", dir->i_ino)); + retval = ext4_journal_get_write_access(handle, bh); + if (retval) { + ext4_std_error(dir->i_sb, retval); + brelse(bh); + return retval; + } + root = (struct dx_root *) bh->b_data; + + /* The 0th block becomes the root, move the dirents out */ + fde = &root->dotdot; + de = (struct ext4_dir_entry_2 *)((char *)fde + + ext4_rec_len_from_disk(fde->rec_len, blocksize)); + if ((char *) de >= (((char *) root) + blocksize)) { + EXT4_ERROR_INODE(dir, "invalid rec_len for '..'"); + brelse(bh); + return -EIO; + } + len = ((char *) root) + blocksize - (char *) de; + + /* Allocate new block for the 0th block's dirents */ 
+ bh2 = ext4_append(handle, dir, &block, &retval); + if (!(bh2)) { + brelse(bh); + return retval; + } + ext4_set_inode_flag(dir, EXT4_INODE_INDEX); + data1 = bh2->b_data; + + memcpy (data1, de, len); + de = (struct ext4_dir_entry_2 *) data1; + top = data1 + len; + while ((char *)(de2 = ext4_next_entry(de, blocksize)) < top) + de = de2; + de->rec_len = ext4_rec_len_to_disk(data1 + blocksize - (char *) de, + blocksize); + /* Initialize the root; the dot dirents already exist */ + de = (struct ext4_dir_entry_2 *) (&root->dotdot); + de->rec_len = ext4_rec_len_to_disk(blocksize - EXT4_DIR_REC_LEN(2), + blocksize); + memset (&root->info, 0, sizeof(root->info)); + root->info.info_length = sizeof(root->info); + root->info.hash_version = EXT4_SB(dir->i_sb)->s_def_hash_version; + entries = root->entries; + dx_set_block(entries, 1); + dx_set_count(entries, 1); + dx_set_limit(entries, dx_root_limit(dir, sizeof(root->info))); + + /* Initialize as for dx_probe */ + hinfo.hash_version = root->info.hash_version; + if (hinfo.hash_version <= DX_HASH_TEA) + hinfo.hash_version += EXT4_SB(dir->i_sb)->s_hash_unsigned; + hinfo.seed = EXT4_SB(dir->i_sb)->s_hash_seed; + ext4fs_dirhash(name, namelen, &hinfo); + frame = frames; + frame->entries = entries; + frame->at = entries; + frame->bh = bh; + bh = bh2; + + ext4_handle_dirty_metadata(handle, dir, frame->bh); + ext4_handle_dirty_metadata(handle, dir, bh); + + de = do_split(handle,dir, &bh, frame, &hinfo, &retval); + if (!de) { + /* + * Even if the block split failed, we have to properly write + * out all the changes we did so far. Otherwise we can end up + * with corrupted filesystem. + */ + ext4_mark_inode_dirty(handle, dir); + dx_release(frames); + return retval; + } + dx_release(frames); + + retval = add_dirent_to_buf(handle, dentry, inode, de, bh); + brelse(bh); + return retval; +} + +/* + * ext4_add_entry() + * + * adds a file entry to the specified directory, using the same + * semantics as ext4_find_entry(). 
It returns NULL if it failed. + * + * NOTE!! The inode part of 'de' is left at 0 - which means you + * may not sleep between calling this and putting something into + * the entry, as someone else might have used it while you slept. + */ +static int ext4_add_entry(handle_t *handle, struct dentry *dentry, + struct inode *inode) +{ + struct inode *dir = dentry->d_parent->d_inode; + struct buffer_head *bh; + struct ext4_dir_entry_2 *de; + struct super_block *sb; + int retval; + int dx_fallback=0; + unsigned blocksize; + ext4_lblk_t block, blocks; + + sb = dir->i_sb; + blocksize = sb->s_blocksize; + if (!dentry->d_name.len) + return -EINVAL; + if (is_dx(dir)) { + retval = ext4_dx_add_entry(handle, dentry, inode); + if (!retval || (retval != ERR_BAD_DX_DIR)) + return retval; + ext4_clear_inode_flag(dir, EXT4_INODE_INDEX); + dx_fallback++; + ext4_mark_inode_dirty(handle, dir); + } + blocks = dir->i_size >> sb->s_blocksize_bits; + for (block = 0; block < blocks; block++) { + bh = ext4_bread(handle, dir, block, 0, &retval); + if(!bh) + return retval; + retval = add_dirent_to_buf(handle, dentry, inode, NULL, bh); + if (retval != -ENOSPC) { + brelse(bh); + return retval; + } + + if (blocks == 1 && !dx_fallback && + EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_DIR_INDEX)) + return make_indexed_dir(handle, dentry, inode, bh); + brelse(bh); + } + bh = ext4_append(handle, dir, &block, &retval); + if (!bh) + return retval; + de = (struct ext4_dir_entry_2 *) bh->b_data; + de->inode = 0; + de->rec_len = ext4_rec_len_to_disk(blocksize, blocksize); + retval = add_dirent_to_buf(handle, dentry, inode, de, bh); + brelse(bh); + if (retval == 0) + ext4_set_inode_state(inode, EXT4_STATE_NEWENTRY); + return retval; +} + +/* + * Returns 0 for success, or a negative error value + */ +static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry, + struct inode *inode) +{ + struct dx_frame frames[2], *frame; + struct dx_entry *entries, *at; + struct dx_hash_info hinfo; + struct 
buffer_head *bh; + struct inode *dir = dentry->d_parent->d_inode; + struct super_block *sb = dir->i_sb; + struct ext4_dir_entry_2 *de; + int err; + + frame = dx_probe(&dentry->d_name, dir, &hinfo, frames, &err); + if (!frame) + return err; + entries = frame->entries; + at = frame->at; + + if (!(bh = ext4_bread(handle,dir, dx_get_block(frame->at), 0, &err))) + goto cleanup; + + BUFFER_TRACE(bh, "get_write_access"); + err = ext4_journal_get_write_access(handle, bh); + if (err) + goto journal_error; + + err = add_dirent_to_buf(handle, dentry, inode, NULL, bh); + if (err != -ENOSPC) + goto cleanup; + + /* Block full, should compress but for now just split */ + dxtrace(printk(KERN_DEBUG "using %u of %u node entries\n", + dx_get_count(entries), dx_get_limit(entries))); + /* Need to split index? */ + if (dx_get_count(entries) == dx_get_limit(entries)) { + ext4_lblk_t newblock; + unsigned icount = dx_get_count(entries); + int levels = frame - frames; + struct dx_entry *entries2; + struct dx_node *node2; + struct buffer_head *bh2; + + if (levels && (dx_get_count(frames->entries) == + dx_get_limit(frames->entries))) { + ext4_warning(sb, "Directory index full!"); + err = -ENOSPC; + goto cleanup; + } + bh2 = ext4_append (handle, dir, &newblock, &err); + if (!(bh2)) + goto cleanup; + node2 = (struct dx_node *)(bh2->b_data); + entries2 = node2->entries; + memset(&node2->fake, 0, sizeof(struct fake_dirent)); + node2->fake.rec_len = ext4_rec_len_to_disk(sb->s_blocksize, + sb->s_blocksize); + BUFFER_TRACE(frame->bh, "get_write_access"); + err = ext4_journal_get_write_access(handle, frame->bh); + if (err) + goto journal_error; + if (levels) { + unsigned icount1 = icount/2, icount2 = icount - icount1; + unsigned hash2 = dx_get_hash(entries + icount1); + dxtrace(printk(KERN_DEBUG "Split index %i/%i\n", + icount1, icount2)); + + BUFFER_TRACE(frame->bh, "get_write_access"); /* index root */ + err = ext4_journal_get_write_access(handle, + frames[0].bh); + if (err) + goto journal_error; + 
+			memcpy((char *) entries2, (char *) (entries + icount1),
+			       icount2 * sizeof(struct dx_entry));
+			dx_set_count(entries, icount1);
+			dx_set_count(entries2, icount2);
+			dx_set_limit(entries2, dx_node_limit(dir));
+
+			/* Which index block gets the new entry? */
+			if (at - entries >= icount1) {
+				frame->at = at = at - entries - icount1 + entries2;
+				frame->entries = entries = entries2;
+				swap(frame->bh, bh2);
+			}
+			dx_insert_block(frames + 0, hash2, newblock);
+			dxtrace(dx_show_index("node", frames[1].entries));
+			dxtrace(dx_show_index("node",
+			       ((struct dx_node *) bh2->b_data)->entries));
+			err = ext4_handle_dirty_metadata(handle, dir, bh2);
+			if (err)
+				goto journal_error;
+			brelse (bh2);
+		} else {
+			dxtrace(printk(KERN_DEBUG
+				       "Creating second level index...\n"));
+			memcpy((char *) entries2, (char *) entries,
+			       icount * sizeof(struct dx_entry));
+			dx_set_limit(entries2, dx_node_limit(dir));
+
+			/* Set up root */
+			dx_set_count(entries, 1);
+			dx_set_block(entries + 0, newblock);
+			((struct dx_root *) frames[0].bh->b_data)->info.indirect_levels = 1;
+
+			/* Add new access path frame */
+			frame = frames + 1;
+			frame->at = at = at - entries + entries2;
+			frame->entries = entries = entries2;
+			frame->bh = bh2;
+			err = ext4_journal_get_write_access(handle,
+							     frame->bh);
+			if (err)
+				goto journal_error;
+		}
+		err = ext4_handle_dirty_metadata(handle, dir, frames[0].bh);
+		if (err) {
+			ext4_std_error(inode->i_sb, err);
+			goto cleanup;
+		}
+	}
+	de = do_split(handle, dir, &bh, frame, &hinfo, &err);
+	if (!de)
+		goto cleanup;
+	err = add_dirent_to_buf(handle, dentry, inode, de, bh);
+	goto cleanup;
+
+journal_error:
+	ext4_std_error(dir->i_sb, err);
+cleanup:
+	if (bh)
+		brelse(bh);
+	dx_release(frames);
+	return err;
+}
+
+/*
+ * ext4_delete_entry deletes a directory entry by merging it with the
+ * previous entry
+ *
+ * The entries stored in @bh are walked from the start of the buffer until
+ * @de_del is reached; every entry visited is first validated with
+ * ext4_check_dir_entry().  If an earlier entry exists in the buffer, its
+ * rec_len is enlarged by the rec_len of @de_del.  Otherwise @de_del is the
+ * first entry of the buffer and only its inode field is set to 0.
+ * dir->i_version is bumped and the buffer is handed to
+ * ext4_handle_dirty_metadata().
+ *
+ * Returns 0 on success, -EIO when an entry fails validation, -ENOENT when
+ * @de_del is not found in @bh, or the error reported by
+ * ext4_journal_get_write_access() / ext4_handle_dirty_metadata() (those
+ * errors are also passed to ext4_std_error()).
+ */
+static int ext4_delete_entry(handle_t *handle,
+			     struct inode *dir,
+			     struct ext4_dir_entry_2 *de_del,
+			     struct buffer_head *bh)
+{
+	struct ext4_dir_entry_2 *de, *pde;
+	unsigned int blocksize = dir->i_sb->s_blocksize;
+	int i, err;
+
+	i = 0;		/* byte offset of 'de' inside the buffer */
+	pde = NULL;	/* entry preceding 'de', NULL for the first one */
+	de = (struct ext4_dir_entry_2 *) bh->b_data;
+	while (i < bh->b_size) {
+		if (ext4_check_dir_entry(dir, NULL, de, bh, i))
+			return -EIO;
+		if (de == de_del) {
+			BUFFER_TRACE(bh, "get_write_access");
+			err = ext4_journal_get_write_access(handle, bh);
+			if (unlikely(err)) {
+				ext4_std_error(dir->i_sb, err);
+				return err;
+			}
+			if (pde)
+				/* let the previous record span the deleted one */
+				pde->rec_len = ext4_rec_len_to_disk(
+					ext4_rec_len_from_disk(pde->rec_len,
+							       blocksize) +
+					ext4_rec_len_from_disk(de->rec_len,
+							       blocksize),
+					blocksize);
+			else
+				de->inode = 0;
+			dir->i_version++;
+			BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata");
+			err = ext4_handle_dirty_metadata(handle, dir, bh);
+			if (unlikely(err)) {
+				ext4_std_error(dir->i_sb, err);
+				return err;
+			}
+			return 0;
+		}
+		i += ext4_rec_len_from_disk(de->rec_len, blocksize);
+		pde = de;
+		de = ext4_next_entry(de, blocksize);
+	}
+	return -ENOENT;
+}
+
+/*
+ * DIR_NLINK feature is set if 1) nlinks > EXT4_LINK_MAX or 2) nlinks == 2,
+ * since this indicates that nlinks count was previously 1.
+ *
+ * The link count of @inode is always incremented first.  Only for inodes
+ * for which is_dx() is true, and only when the new count is at least
+ * EXT4_LINK_MAX or exactly 2, the count is then reset to 1 and the
+ * RO_COMPAT DIR_NLINK feature is set on the superblock.  @handle is not
+ * used by this function.
+ */
+static void ext4_inc_count(handle_t *handle, struct inode *inode)
+{
+	inc_nlink(inode);
+	if (is_dx(inode) && inode->i_nlink > 1) {
+		/* limit is 16-bit i_links_count */
+		if (inode->i_nlink >= EXT4_LINK_MAX || inode->i_nlink == 2) {
+			set_nlink(inode, 1);
+			EXT4_SET_RO_COMPAT_FEATURE(inode->i_sb,
+					      EXT4_FEATURE_RO_COMPAT_DIR_NLINK);
+		}
+	}
+}
+
+/*
+ * If a directory had nlink == 1, then we should let it be 1. This indicates
+ * directory has >EXT4_LINK_MAX subdirs.
+ *
+ * The link count is dropped for every non-directory inode, and for a
+ * directory only while its count is still above 2.  @handle is not used
+ * by this function.
+ */
+static void ext4_dec_count(handle_t *handle, struct inode *inode)
+{
+	if (!S_ISDIR(inode->i_mode) || inode->i_nlink > 2)
+		drop_nlink(inode);
+}
+
+/*
+ * Add a directory entry for a newly created non-directory @inode.
+ *
+ * On success the inode is marked dirty, attached to @dentry with
+ * d_instantiate() and unlocked, and 0 is returned.  On failure the link
+ * count is dropped, the inode is unlocked and released with iput(), and
+ * the error from ext4_add_entry() is returned.
+ */
+static int ext4_add_nondir(handle_t *handle,
+		struct dentry *dentry, struct inode *inode)
+{
+	int err = ext4_add_entry(handle, dentry, inode);
+	if (!err) {
+		ext4_mark_inode_dirty(handle, inode);
+		d_instantiate(dentry, inode);
+		unlock_new_inode(inode);
+		return 0;
+	}
+	drop_nlink(inode);
+	unlock_new_inode(inode);
+	iput(inode);
+	return err;
+}
+
+/*
+ * By the time this is called, we already have created
+ * the directory cache entry for the new file, but it
+ * is so far negative - it has no inode.
+ *
+ * If the create succeeds, we fill in the inode information
+ * with d_instantiate().
+ *
+ * A new inode is allocated with ext4_new_inode(), given the regular-file
+ * inode/file operations and linked into @dir by ext4_add_nondir().  The
+ * whole sequence is restarted from "retry" when it fails with -ENOSPC and
+ * ext4_should_retry_alloc() agrees.  @nd is not used by this function.
+ *
+ * Returns 0 on success or a negative errno.
+ */
+static int ext4_create(struct inode *dir, struct dentry *dentry, umode_t mode,
+		       struct nameidata *nd)
+{
+	handle_t *handle;
+	struct inode *inode;
+	int err, retries = 0;
+
+	dquot_initialize(dir);
+
+retry:
+	handle = ext4_journal_start(dir, EXT4_DATA_TRANS_BLOCKS(dir->i_sb) +
+					EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3 +
+					EXT4_MAXQUOTAS_INIT_BLOCKS(dir->i_sb));
+	if (IS_ERR(handle))
+		return PTR_ERR(handle);
+
+	if (IS_DIRSYNC(dir))
+		ext4_handle_sync(handle);
+
+	inode = ext4_new_inode(handle, dir, mode, &dentry->d_name, 0, NULL);
+	err = PTR_ERR(inode);	/* only meaningful when IS_ERR(inode) */
+	if (!IS_ERR(inode)) {
+		inode->i_op = &ext4_file_inode_operations;
+		inode->i_fop = &ext4_file_operations;
+		ext4_set_aops(inode);
+		err = ext4_add_nondir(handle, dentry, inode);
+	}
+	ext4_journal_stop(handle);
+	if (err == -ENOSPC && ext4_should_retry_alloc(dir->i_sb, &retries))
+		goto retry;
+	return err;
+}
+/*
+ * Create a special inode for device number @rdev under @dentry in @dir.
+ *
+ * Returns -EINVAL when @rdev fails new_valid_dev().  Otherwise the flow
+ * mirrors ext4_create(): allocate the inode, set it up with
+ * init_special_inode(), link it with ext4_add_nondir() and retry the
+ * whole transaction on -ENOSPC while ext4_should_retry_alloc() allows.
+ */
+static int ext4_mknod(struct inode *dir, struct dentry *dentry,
+		      umode_t mode, dev_t rdev)
+{
+	handle_t *handle;
+	struct inode *inode;
+	int err, retries = 0;
+
+	if (!new_valid_dev(rdev))
+		return -EINVAL;
+
+	dquot_initialize(dir);
+
+retry:
+	handle = ext4_journal_start(dir, EXT4_DATA_TRANS_BLOCKS(dir->i_sb) +
+					EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3 +
+					EXT4_MAXQUOTAS_INIT_BLOCKS(dir->i_sb));
+	if (IS_ERR(handle))
+		return PTR_ERR(handle);
+
+	if (IS_DIRSYNC(dir))
+		ext4_handle_sync(handle);
+
+	inode = ext4_new_inode(handle, dir, mode, &dentry->d_name, 0, NULL);
+	err = PTR_ERR(inode);
+	if (!IS_ERR(inode)) {
+		init_special_inode(inode, inode->i_mode, rdev);
+#ifdef CONFIG_EXT4_FS_XATTR
+		inode->i_op = &ext4_special_inode_operations;
+#endif
+		err = ext4_add_nondir(handle, dentry, inode);
+	}
+	ext4_journal_stop(handle);
+	if (err == -ENOSPC && ext4_should_retry_alloc(dir->i_sb, &retries))
+		goto retry;
+	return err;
+}
+/*
+ * Create the directory @dentry inside @dir.
+ *
+ * Returns -EMLINK up front when EXT4_DIR_LINK_MAX(dir) is true.  Otherwise
+ * a directory inode is allocated, its first block is read/created with
+ * ext4_bread() and filled with the "." and ".." entries, the new inode is
+ * given link count 2 and entered into @dir, and finally the link count of
+ * @dir is raised with ext4_inc_count().
+ *
+ * Failures after the inode was allocated go through out_clear_inode, which
+ * clears the new inode's link count, unlocks it and drops it with iput().
+ * The whole transaction is retried on -ENOSPC while
+ * ext4_should_retry_alloc() allows.
+ */
+static int ext4_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
+{
+	handle_t *handle;
+	struct inode *inode;
+	struct buffer_head *dir_block = NULL;
+	struct ext4_dir_entry_2 *de;
+	unsigned int blocksize = dir->i_sb->s_blocksize;
+	int err, retries = 0;
+
+	if (EXT4_DIR_LINK_MAX(dir))
+		return -EMLINK;
+
+	dquot_initialize(dir);
+
+retry:
+	handle = ext4_journal_start(dir, EXT4_DATA_TRANS_BLOCKS(dir->i_sb) +
+					EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3 +
+					EXT4_MAXQUOTAS_INIT_BLOCKS(dir->i_sb));
+	if (IS_ERR(handle))
+		return PTR_ERR(handle);
+
+	if (IS_DIRSYNC(dir))
+		ext4_handle_sync(handle);
+
+	inode = ext4_new_inode(handle, dir, S_IFDIR | mode,
+			       &dentry->d_name, 0, NULL);
+	err = PTR_ERR(inode);
+	if (IS_ERR(inode))
+		goto out_stop;
+
+	inode->i_op = &ext4_dir_inode_operations;
+	inode->i_fop = &ext4_dir_operations;
+	inode->i_size = EXT4_I(inode)->i_disksize = inode->i_sb->s_blocksize;
+	dir_block = ext4_bread(handle, inode, 0, 1, &err);
+	if (!dir_block)
+		goto out_clear_inode;
+	BUFFER_TRACE(dir_block, "get_write_access");
+	err = ext4_journal_get_write_access(handle, dir_block);
+	if (err)
+		goto out_clear_inode;
+	/* first entry: "." pointing at the new directory itself */
+	de = (struct ext4_dir_entry_2 *) dir_block->b_data;
+	de->inode = cpu_to_le32(inode->i_ino);
+	de->name_len = 1;
+	de->rec_len = ext4_rec_len_to_disk(EXT4_DIR_REC_LEN(de->name_len),
+					   blocksize);
+	strcpy(de->name, ".");
+	ext4_set_de_type(dir->i_sb, de, S_IFDIR);
+	/* second entry: ".." pointing at the parent, spanning the rest of the block */
+	de = ext4_next_entry(de, blocksize);
+	de->inode = cpu_to_le32(dir->i_ino);
+	de->rec_len = ext4_rec_len_to_disk(blocksize - EXT4_DIR_REC_LEN(1),
+					   blocksize);
+	de->name_len = 2;
+	strcpy(de->name, "..");
+	ext4_set_de_type(dir->i_sb, de, S_IFDIR);
+	set_nlink(inode, 2);
+	BUFFER_TRACE(dir_block, "call ext4_handle_dirty_metadata");
+	err = ext4_handle_dirty_metadata(handle, inode, dir_block);
+	if (err)
+		goto out_clear_inode;
+	err = ext4_mark_inode_dirty(handle, inode);
+	if (!err)
+		err = ext4_add_entry(handle, dentry, inode);
+	if (err) {
+out_clear_inode:	/* also entered by goto from outside this if-block */
+		clear_nlink(inode);
+		unlock_new_inode(inode);
+		ext4_mark_inode_dirty(handle, inode);
+		iput(inode);
+		goto out_stop;
+	}
+	ext4_inc_count(handle, dir);
+	ext4_update_dx_flag(dir);
+	err = ext4_mark_inode_dirty(handle, dir);
+	if (err)
+		/* NOTE(review): the ext4_inc_count(dir) above is not undone
+		 * on this path - confirm whether that is intended. */
+		goto out_clear_inode;
+	d_instantiate(dentry, inode);
+	unlock_new_inode(inode);
+out_stop:
+	brelse(dir_block);
+	ext4_journal_stop(handle);
+	if (err == -ENOSPC && ext4_should_retry_alloc(dir->i_sb, &retries))
+		goto retry;
+	return err;
+}
+
+/*
+ * routine to check that the specified directory is empty (for rmdir)
+ *
+ * Returns 0 as soon as an entry with a non-zero inode number is found
+ * after the first two entries, and 1 otherwise.  Note that 1 is also
+ * returned when the directory is too small, its first block cannot be
+ * read, or the first two entries are not a valid "." / ".." pair.
+ * Blocks that cannot be read and entries that fail
+ * ext4_check_dir_entry() are skipped rather than treated as content.
+ */
+static int empty_dir(struct inode *inode)
+{
+	unsigned int offset;
+	struct buffer_head *bh;
+	struct ext4_dir_entry_2 *de, *de1;
+	struct super_block *sb;
+	int err = 0;
+
+	sb = inode->i_sb;
+	if (inode->i_size < EXT4_DIR_REC_LEN(1) + EXT4_DIR_REC_LEN(2) ||
+	    !(bh = ext4_bread(NULL, inode, 0, 0, &err))) {
+		if (err)
+			EXT4_ERROR_INODE(inode,
+				"error %d reading directory lblock 0", err);
+		else
+			ext4_warning(inode->i_sb,
+				     "bad directory (dir #%lu) - no data block",
+				     inode->i_ino);
+		return 1;
+	}
+	de = (struct ext4_dir_entry_2 *) bh->b_data;
+	de1 = ext4_next_entry(de, sb->s_blocksize);
+	if (le32_to_cpu(de->inode) != inode->i_ino ||
+			!le32_to_cpu(de1->inode) ||
+			strcmp(".", de->name) ||
+			strcmp("..", de1->name)) {
+		ext4_warning(inode->i_sb,
+			     "bad directory (dir #%lu) - no `.' or `..'",
+			     inode->i_ino);
+		brelse(bh);
+		return 1;
+	}
+	/* start scanning right after "." and ".." */
+	offset = ext4_rec_len_from_disk(de->rec_len, sb->s_blocksize) +
+		 ext4_rec_len_from_disk(de1->rec_len, sb->s_blocksize);
+	de = ext4_next_entry(de1, sb->s_blocksize);
+	while (offset < inode->i_size) {
+		if (!bh ||
+		    (void *) de >= (void *) (bh->b_data+sb->s_blocksize)) {
+			/* ran off the current block: load the one holding 'offset' */
+			unsigned int lblock;
+			err = 0;
+			brelse(bh);
+			lblock = offset >> EXT4_BLOCK_SIZE_BITS(sb);
+			bh = ext4_bread(NULL, inode, lblock, 0, &err);
+			if (!bh) {
+				if (err)
+					EXT4_ERROR_INODE(inode,
+						"error %d reading directory "
+						"lblock %u", err, lblock);
+				offset += sb->s_blocksize;
+				continue;
+			}
+			de = (struct ext4_dir_entry_2 *) bh->b_data;
+		}
+		if (ext4_check_dir_entry(inode, NULL, de, bh, offset)) {
+			/* bad entry: skip to the start of the next block */
+			de = (struct ext4_dir_entry_2 *)(bh->b_data +
+							 sb->s_blocksize);
+			offset = (offset | (sb->s_blocksize - 1)) + 1;
+			continue;
+		}
+		if (le32_to_cpu(de->inode)) {
+			brelse(bh);
+			return 0;
+		}
+		offset += ext4_rec_len_from_disk(de->rec_len, sb->s_blocksize);
+		de = ext4_next_entry(de, sb->s_blocksize);
+	}
+	brelse(bh);
+	return 1;
+}
+
+/* ext4_orphan_add() links an unlinked or truncated inode into a list of
+ * such inodes, starting at the superblock, in case we crash before the
+ * file is closed/deleted, or in case the inode truncate spans multiple
+ * transactions and the last transaction is not recovered after a crash.
+ *
+ * At filesystem recovery time, we walk this list deleting unlinked
+ * inodes and truncating linked inodes in ext4_orphan_cleanup().
+ *
+ * Returns 0 without doing anything when ext4_handle_valid(handle) is
+ * false or the inode is already on the in-memory orphan list.  Otherwise
+ * the work is done under s_orphan_lock and the result (0 or a negative
+ * errno, also passed to ext4_std_error()) is returned.
+ */
+int ext4_orphan_add(handle_t *handle, struct inode *inode)
+{
+	struct super_block *sb = inode->i_sb;
+	struct ext4_iloc iloc;
+	int err = 0, rc;
+
+	if (!ext4_handle_valid(handle))
+		return 0;
+
+	mutex_lock(&EXT4_SB(sb)->s_orphan_lock);
+	if (!list_empty(&EXT4_I(inode)->i_orphan))
+		goto out_unlock;
+
+	/*
+	 * Orphan handling is only valid for files with data blocks
+	 * being truncated, or files being unlinked. Note that we either
+	 * hold i_mutex, or the inode can not be referenced from outside,
+	 * so i_nlink should not be bumped due to race
+	 */
+	J_ASSERT((S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) ||
+		  S_ISLNK(inode->i_mode)) || inode->i_nlink == 0);
+
+	BUFFER_TRACE(EXT4_SB(sb)->s_sbh, "get_write_access");
+	err = ext4_journal_get_write_access(handle, EXT4_SB(sb)->s_sbh);
+	if (err)
+		goto out_unlock;
+
+	err = ext4_reserve_inode_write(handle, inode, &iloc);
+	if (err)
+		goto out_unlock;
+	/*
+	 * Due to previous errors inode may be already a part of on-disk
+	 * orphan list. If so skip on-disk list modification.
+	 *
+	 * NOTE(review): this shortcut skips ext4_mark_iloc_dirty(), while
+	 * ext4_orphan_del() releases iloc.bh with brelse() on its error
+	 * paths - confirm whether iloc.bh needs releasing here.
+	 */
+	if (NEXT_ORPHAN(inode) && NEXT_ORPHAN(inode) <=
+		(le32_to_cpu(EXT4_SB(sb)->s_es->s_inodes_count)))
+			goto mem_insert;
+
+	/* Insert this inode at the head of the on-disk orphan list... */
+	NEXT_ORPHAN(inode) = le32_to_cpu(EXT4_SB(sb)->s_es->s_last_orphan);
+	EXT4_SB(sb)->s_es->s_last_orphan = cpu_to_le32(inode->i_ino);
+	err = ext4_handle_dirty_metadata(handle, NULL, EXT4_SB(sb)->s_sbh);
+	rc = ext4_mark_iloc_dirty(handle, inode, &iloc);
+	if (!err)	/* keep the first error seen */
+		err = rc;
+
+	/* Only add to the head of the in-memory list if all the
+	 * previous operations succeeded.  If the orphan_add is going to
+	 * fail (possibly taking the journal offline), we can't risk
+	 * leaving the inode on the orphan list: stray orphan-list
+	 * entries can cause panics at unmount time.
+	 *
+	 * This is safe: on error we're going to ignore the orphan list
+	 * anyway on the next recovery. */
+mem_insert:
+	if (!err)
+		list_add(&EXT4_I(inode)->i_orphan, &EXT4_SB(sb)->s_orphan);
+
+	jbd_debug(4, "superblock will point to %lu\n", inode->i_ino);
+	jbd_debug(4, "orphan inode %lu will point to %d\n",
+			inode->i_ino, NEXT_ORPHAN(inode));
+out_unlock:
+	mutex_unlock(&EXT4_SB(sb)->s_orphan_lock);
+	ext4_std_error(inode->i_sb, err);
+	return err;
+}
+
+/*
+ * ext4_orphan_del() removes an unlinked or truncated inode from the list
+ * of such inodes stored on disk, because it is finally being cleaned up.
+ *
+ * The inode is always unlinked from the in-memory list first.  The
+ * on-disk list is then patched: if the inode was at the head, the
+ * superblock's s_last_orphan is pointed at its successor, otherwise the
+ * NEXT_ORPHAN of the preceding inode is.  When the filesystem has a
+ * journal but @handle is NULL, only the in-memory removal is done.
+ *
+ * Returns 0 or a negative errno.
+ */
+int ext4_orphan_del(handle_t *handle, struct inode *inode)
+{
+	struct list_head *prev;
+	struct ext4_inode_info *ei = EXT4_I(inode);
+	struct ext4_sb_info *sbi;
+	__u32 ino_next;
+	struct ext4_iloc iloc;
+	int err = 0;
+
+	/* ext4_handle_valid() assumes a valid handle_t pointer */
+	if (handle && !ext4_handle_valid(handle))
+		return 0;
+
+	mutex_lock(&EXT4_SB(inode->i_sb)->s_orphan_lock);
+	if (list_empty(&ei->i_orphan))
+		goto out;
+
+	/* remember the neighbours before the list entry is unlinked */
+	ino_next = NEXT_ORPHAN(inode);
+	prev = ei->i_orphan.prev;
+	sbi = EXT4_SB(inode->i_sb);
+
+	jbd_debug(4, "remove inode %lu from orphan list\n", inode->i_ino);
+
+	list_del_init(&ei->i_orphan);
+
+	/* If we're on an error path, we may not have a valid
+	 * transaction handle with which to update the orphan list on
+	 * disk, but we still need to remove the inode from the linked
+	 * list in memory. */
+	if (sbi->s_journal && !handle)
+		goto out;
+
+	err = ext4_reserve_inode_write(handle, inode, &iloc);
+	if (err)
+		goto out_err;
+
+	if (prev == &sbi->s_orphan) {
+		/* inode was the list head: update the superblock */
+		jbd_debug(4, "superblock will point to %u\n", ino_next);
+		BUFFER_TRACE(sbi->s_sbh, "get_write_access");
+		err = ext4_journal_get_write_access(handle, sbi->s_sbh);
+		if (err)
+			goto out_brelse;
+		sbi->s_es->s_last_orphan = cpu_to_le32(ino_next);
+		err = ext4_handle_dirty_metadata(handle, NULL, sbi->s_sbh);
+	} else {
+		struct ext4_iloc iloc2;
+		struct inode *i_prev =
+			&list_entry(prev, struct ext4_inode_info, i_orphan)->vfs_inode;
+
+		jbd_debug(4, "orphan inode %lu will point to %u\n",
+			  i_prev->i_ino, ino_next);
+		err = ext4_reserve_inode_write(handle, i_prev, &iloc2);
+		if (err)
+			goto out_brelse;
+		NEXT_ORPHAN(i_prev) = ino_next;
+		err = ext4_mark_iloc_dirty(handle, i_prev, &iloc2);
+	}
+	if (err)
+		goto out_brelse;
+	NEXT_ORPHAN(inode) = 0;
+	err = ext4_mark_iloc_dirty(handle, inode, &iloc);
+
+out_err:
+	ext4_std_error(inode->i_sb, err);
+out:
+	mutex_unlock(&EXT4_SB(inode->i_sb)->s_orphan_lock);
+	return err;
+
+out_brelse:
+	brelse(iloc.bh);
+	goto out_err;
+}
+/*
+ * Remove the directory @dentry from @dir.
+ *
+ * Returns -ENOENT when the name is not found in @dir, -EIO when the entry
+ * found does not carry the inode number of dentry->d_inode, -ENOTEMPTY
+ * when empty_dir() reports content, or the error from
+ * ext4_delete_entry().  On success the victim's link count and size are
+ * cleared, it is put on the orphan list, timestamps are updated and the
+ * link count of @dir is dropped with ext4_dec_count().
+ */
+static int ext4_rmdir(struct inode *dir, struct dentry *dentry)
+{
+	int retval;
+	struct inode *inode;
+	struct buffer_head *bh;
+	struct ext4_dir_entry_2 *de;
+	handle_t *handle;
+
+	/* Initialize quotas before so that eventual writes go in
+	 * separate transaction */
+	dquot_initialize(dir);
+	dquot_initialize(dentry->d_inode);
+
+	handle = ext4_journal_start(dir, EXT4_DELETE_TRANS_BLOCKS(dir->i_sb));
+	if (IS_ERR(handle))
+		return PTR_ERR(handle);
+
+	retval = -ENOENT;
+	bh = ext4_find_entry(dir, &dentry->d_name, &de);
+	if (!bh)
+		goto end_rmdir;
+
+	if (IS_DIRSYNC(dir))
+		ext4_handle_sync(handle);
+
+	inode = dentry->d_inode;
+
+	retval = -EIO;
+	if (le32_to_cpu(de->inode) != inode->i_ino)
+		goto end_rmdir;
+
+	retval = -ENOTEMPTY;
+	if (!empty_dir(inode))
+		goto end_rmdir;
+
+	retval = ext4_delete_entry(handle, dir, de, bh);
+	if (retval)
+		goto end_rmdir;
+	if (!EXT4_DIR_LINK_EMPTY(inode))
+		ext4_warning(inode->i_sb,
+			     "empty directory has too many links (%d)",
+			     inode->i_nlink);
+	inode->i_version++;
+	clear_nlink(inode);
+	/* There's no need to set i_disksize: the fact that i_nlink is
+	 * zero will ensure that the right thing happens during any
+	 * recovery. */
+	inode->i_size = 0;
+	ext4_orphan_add(handle, inode);
+	inode->i_ctime = dir->i_ctime = dir->i_mtime = ext4_current_time(inode);
+	ext4_mark_inode_dirty(handle, inode);
+	ext4_dec_count(handle, dir);
+	ext4_update_dx_flag(dir);
+	ext4_mark_inode_dirty(handle, dir);
+
+end_rmdir:
+	ext4_journal_stop(handle);
+	brelse(bh);
+	return retval;
+}
+/*
+ * Remove the name @dentry from @dir and drop one link of its inode.
+ *
+ * Returns -ENOENT when the name is not found, -EIO when the entry found
+ * does not carry the inode number of dentry->d_inode, or the error from
+ * ext4_delete_entry().  An inode whose link count is already 0 is warned
+ * about and treated as having one link.  When the count reaches 0 the
+ * inode is put on the orphan list.
+ */
+static int ext4_unlink(struct inode *dir, struct dentry *dentry)
+{
+	int retval;
+	struct inode *inode;
+	struct buffer_head *bh;
+	struct ext4_dir_entry_2 *de;
+	handle_t *handle;
+
+	trace_ext4_unlink_enter(dir, dentry);
+	/* Initialize quotas before so that eventual writes go
+	 * in separate transaction */
+	dquot_initialize(dir);
+	dquot_initialize(dentry->d_inode);
+
+	handle = ext4_journal_start(dir, EXT4_DELETE_TRANS_BLOCKS(dir->i_sb));
+	if (IS_ERR(handle))
+		return PTR_ERR(handle);
+
+	if (IS_DIRSYNC(dir))
+		ext4_handle_sync(handle);
+
+	retval = -ENOENT;
+	bh = ext4_find_entry(dir, &dentry->d_name, &de);
+	if (!bh)
+		goto end_unlink;
+
+	inode = dentry->d_inode;
+
+	retval = -EIO;
+	if (le32_to_cpu(de->inode) != inode->i_ino)
+		goto end_unlink;
+
+	if (!inode->i_nlink) {
+		ext4_warning(inode->i_sb,
+			     "Deleting nonexistent file (%lu), %d",
+			     inode->i_ino, inode->i_nlink);
+		set_nlink(inode, 1);
+	}
+	retval = ext4_delete_entry(handle, dir, de, bh);
+	if (retval)
+		goto end_unlink;
+	dir->i_ctime = dir->i_mtime = ext4_current_time(dir);
+	ext4_update_dx_flag(dir);
+	ext4_mark_inode_dirty(handle, dir);
+	drop_nlink(inode);
+	if (!inode->i_nlink)
+		ext4_orphan_add(handle, inode);
+	inode->i_ctime = ext4_current_time(inode);
+	ext4_mark_inode_dirty(handle, inode);
+	retval = 0;
+
+end_unlink:
+	ext4_journal_stop(handle);
+	brelse(bh);
+	trace_ext4_unlink_exit(dentry, retval);
+	return retval;
+}
+/*
+ * Create the symbolic link @dentry in @dir pointing at @symname.
+ *
+ * Returns -ENAMETOOLONG when strlen(symname) + 1 exceeds the block size.
+ * Targets of at most EXT4_N_BLOCKS * 4 bytes (including the NUL) are
+ * copied straight into the inode's i_data ("fast" symlink).  Longer
+ * targets are written with __page_symlink() between two transactions
+ * while the inode sits on the orphan list.  The first transaction is
+ * retried on -ENOSPC while ext4_should_retry_alloc() allows.
+ */
+static int ext4_symlink(struct inode *dir,
+			struct dentry *dentry, const char *symname)
+{
+	handle_t *handle;
+	struct inode *inode;
+	int l, err, retries = 0;
+	int credits;
+
+	l = strlen(symname)+1;	/* length including the terminating NUL */
+	if (l > dir->i_sb->s_blocksize)
+		return -ENAMETOOLONG;
+
+	dquot_initialize(dir);
+
+	if (l > EXT4_N_BLOCKS * 4) {
+		/*
+		 * For non-fast symlinks, we just allocate inode and put it on
+		 * orphan list in the first transaction => we need bitmap,
+		 * group descriptor, sb, inode block, quota blocks, and
+		 * possibly selinux xattr blocks.
+		 */
+		credits = 4 + EXT4_MAXQUOTAS_INIT_BLOCKS(dir->i_sb) +
+			  EXT4_XATTR_TRANS_BLOCKS;
+	} else {
+		/*
+		 * Fast symlink. We have to add entry to directory
+		 * (EXT4_DATA_TRANS_BLOCKS + EXT4_INDEX_EXTRA_TRANS_BLOCKS),
+		 * allocate new inode (bitmap, group descriptor, inode block,
+		 * quota blocks, sb is already counted in previous macros).
+		 */
+		credits = EXT4_DATA_TRANS_BLOCKS(dir->i_sb) +
+			  EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3 +
+			  EXT4_MAXQUOTAS_INIT_BLOCKS(dir->i_sb);
+	}
+retry:
+	handle = ext4_journal_start(dir, credits);
+	if (IS_ERR(handle))
+		return PTR_ERR(handle);
+
+	if (IS_DIRSYNC(dir))
+		ext4_handle_sync(handle);
+
+	inode = ext4_new_inode(handle, dir, S_IFLNK|S_IRWXUGO,
+			       &dentry->d_name, 0, NULL);
+	err = PTR_ERR(inode);
+	if (IS_ERR(inode))
+		goto out_stop;
+
+	if (l > EXT4_N_BLOCKS * 4) {
+		inode->i_op = &ext4_symlink_inode_operations;
+		ext4_set_aops(inode);
+		/*
+		 * We cannot call page_symlink() with transaction started
+		 * because it calls into ext4_write_begin() which can wait
+		 * for transaction commit if we are running out of space
+		 * and thus we deadlock. So we have to stop transaction now
+		 * and restart it when symlink contents is written.
+		 * 
+		 * To keep fs consistent in case of crash, we have to put inode
+		 * to orphan list in the mean time.
+		 */
+		drop_nlink(inode);
+		err = ext4_orphan_add(handle, inode);
+		ext4_journal_stop(handle);
+		if (err)
+			goto err_drop_inode;
+		err = __page_symlink(inode, symname, l, 1);
+		if (err)
+			goto err_drop_inode;
+		/*
+		 * Now inode is being linked into dir (EXT4_DATA_TRANS_BLOCKS
+		 * + EXT4_INDEX_EXTRA_TRANS_BLOCKS), inode is also modified
+		 */
+		handle = ext4_journal_start(dir,
+				EXT4_DATA_TRANS_BLOCKS(dir->i_sb) +
+				EXT4_INDEX_EXTRA_TRANS_BLOCKS + 1);
+		if (IS_ERR(handle)) {
+			err = PTR_ERR(handle);
+			goto err_drop_inode;
+		}
+		set_nlink(inode, 1);
+		err = ext4_orphan_del(handle, inode);
+		if (err) {
+			ext4_journal_stop(handle);
+			clear_nlink(inode);
+			goto err_drop_inode;
+		}
+	} else {
+		/* clear the extent format for fast symlink */
+		ext4_clear_inode_flag(inode, EXT4_INODE_EXTENTS);
+		inode->i_op = &ext4_fast_symlink_inode_operations;
+		memcpy((char *)&EXT4_I(inode)->i_data, symname, l);
+		inode->i_size = l-1;
+	}
+	EXT4_I(inode)->i_disksize = inode->i_size;
+	err = ext4_add_nondir(handle, dentry, inode);
+out_stop:
+	ext4_journal_stop(handle);
+	if (err == -ENOSPC && ext4_should_retry_alloc(dir->i_sb, &retries))
+		goto retry;
+	return err;
+err_drop_inode:	/* reached only when no transaction is running */
+	unlock_new_inode(inode);
+	iput(inode);
+	return err;
+}
+/*
+ * Create the hard link @dentry in @dir to the inode of @old_dentry.
+ *
+ * Returns -EMLINK when the inode already has EXT4_LINK_MAX or more links.
+ * The link count and an inode reference are taken before the entry is
+ * added, and both are given back if ext4_add_entry() fails.  The
+ * transaction is retried on -ENOSPC while ext4_should_retry_alloc()
+ * allows.
+ */
+static int ext4_link(struct dentry *old_dentry,
+		     struct inode *dir, struct dentry *dentry)
+{
+	handle_t *handle;
+	struct inode *inode = old_dentry->d_inode;
+	int err, retries = 0;
+
+	if (inode->i_nlink >= EXT4_LINK_MAX)
+		return -EMLINK;
+
+	dquot_initialize(dir);
+
+retry:
+	handle = ext4_journal_start(dir, EXT4_DATA_TRANS_BLOCKS(dir->i_sb) +
+					EXT4_INDEX_EXTRA_TRANS_BLOCKS);
+	if (IS_ERR(handle))
+		return PTR_ERR(handle);
+
+	if (IS_DIRSYNC(dir))
+		ext4_handle_sync(handle);
+
+	inode->i_ctime = ext4_current_time(inode);
+	ext4_inc_count(handle, inode);
+	ihold(inode);
+
+	err = ext4_add_entry(handle, dentry, inode);
+	if (!err) {
+		ext4_mark_inode_dirty(handle, inode);
+		d_instantiate(dentry, inode);
+	} else {
+		drop_nlink(inode);
+		iput(inode);
+	}
+	ext4_journal_stop(handle);
+	if (err == -ENOSPC && ext4_should_retry_alloc(dir->i_sb, &retries))
+		goto retry;
+	return err;
+}
+/*
+ * Inode field of the entry that follows the first entry in the directory
+ * block at @buffer (ext4_mkdir() stores ".." there).  Usable as an lvalue.
+ */
+#define PARENT_INO(buffer, size) \
+	(ext4_next_entry((struct ext4_dir_entry_2 *)(buffer), size)->inode)
+
+/*
+ * Anybody can rename anything with this: the permission checks are left to the
+ * higher-level routines.
+ *
+ * Outline of what the body does:
+ *  - look up the old entry; -ENOENT if it is missing or names another inode
+ *  - when a directory is moved: the target, if any, must be empty
+ *    (-ENOTEMPTY), the ".." slot of the moved directory must name @old_dir
+ *    (-EIO) and @new_dir must have room for another link (-EMLINK)
+ *  - add a new entry, or overwrite the inode number of the existing target
+ *    entry
+ *  - delete the old entry, looking it up again if it is no longer where it
+ *    was found
+ *  - fix up link counts, timestamps and the ".." slot, and put a replaced
+ *    inode whose link count reached 0 on the orphan list
+ *
+ * Returns 0 on success or a negative errno.
+ */
+static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry,
+		       struct inode *new_dir, struct dentry *new_dentry)
+{
+	handle_t *handle;
+	struct inode *old_inode, *new_inode;
+	struct buffer_head *old_bh, *new_bh, *dir_bh;
+	struct ext4_dir_entry_2 *old_de, *new_de;
+	int retval, force_da_alloc = 0;
+
+	dquot_initialize(old_dir);
+	dquot_initialize(new_dir);
+
+	old_bh = new_bh = dir_bh = NULL;
+
+	/* Initialize quotas before so that eventual writes go
+	 * in separate transaction */
+	if (new_dentry->d_inode)
+		dquot_initialize(new_dentry->d_inode);
+	handle = ext4_journal_start(old_dir, 2 *
+					EXT4_DATA_TRANS_BLOCKS(old_dir->i_sb) +
+					EXT4_INDEX_EXTRA_TRANS_BLOCKS + 2);
+	if (IS_ERR(handle))
+		return PTR_ERR(handle);
+
+	if (IS_DIRSYNC(old_dir) || IS_DIRSYNC(new_dir))
+		ext4_handle_sync(handle);
+
+	old_bh = ext4_find_entry(old_dir, &old_dentry->d_name, &old_de);
+	/*
+	 *  Check for inode number is _not_ due to possible IO errors.
+	 *  We might rmdir the source, keep it as pwd of some process
+	 *  and merrily kill the link to whatever was created under the
+	 *  same name. Goodbye sticky bit ;-<
+	 */
+	old_inode = old_dentry->d_inode;
+	retval = -ENOENT;
+	if (!old_bh || le32_to_cpu(old_de->inode) != old_inode->i_ino)
+		goto end_rename;
+
+	new_inode = new_dentry->d_inode;
+	new_bh = ext4_find_entry(new_dir, &new_dentry->d_name, &new_de);
+	if (new_bh) {
+		if (!new_inode) {
+			/* entry found but the dentry is negative: ignore it */
+			brelse(new_bh);
+			new_bh = NULL;
+		}
+	}
+	if (S_ISDIR(old_inode->i_mode)) {
+		if (new_inode) {
+			retval = -ENOTEMPTY;
+			if (!empty_dir(new_inode))
+				goto end_rename;
+		}
+		retval = -EIO;
+		dir_bh = ext4_bread(handle, old_inode, 0, 0, &retval);
+		if (!dir_bh)
+			goto end_rename;
+		if (le32_to_cpu(PARENT_INO(dir_bh->b_data,
+				old_dir->i_sb->s_blocksize)) != old_dir->i_ino)
+			goto end_rename;
+		retval = -EMLINK;
+		if (!new_inode && new_dir != old_dir &&
+		    EXT4_DIR_LINK_MAX(new_dir))
+			goto end_rename;
+		BUFFER_TRACE(dir_bh, "get_write_access");
+		retval = ext4_journal_get_write_access(handle, dir_bh);
+		if (retval)
+			goto end_rename;
+	}
+	if (!new_bh) {
+		retval = ext4_add_entry(handle, new_dentry, old_inode);
+		if (retval)
+			goto end_rename;
+	} else {
+		/* reuse the existing target entry in place */
+		BUFFER_TRACE(new_bh, "get write access");
+		retval = ext4_journal_get_write_access(handle, new_bh);
+		if (retval)
+			goto end_rename;
+		new_de->inode = cpu_to_le32(old_inode->i_ino);
+		if (EXT4_HAS_INCOMPAT_FEATURE(new_dir->i_sb,
+					      EXT4_FEATURE_INCOMPAT_FILETYPE))
+			new_de->file_type = old_de->file_type;
+		new_dir->i_version++;
+		new_dir->i_ctime = new_dir->i_mtime =
+					ext4_current_time(new_dir);
+		ext4_mark_inode_dirty(handle, new_dir);
+		BUFFER_TRACE(new_bh, "call ext4_handle_dirty_metadata");
+		retval = ext4_handle_dirty_metadata(handle, new_dir, new_bh);
+		if (unlikely(retval)) {
+			ext4_std_error(new_dir->i_sb, retval);
+			goto end_rename;
+		}
+		brelse(new_bh);
+		new_bh = NULL;
+	}
+
+	/*
+	 * Like most other Unix systems, set the ctime for inodes on a
+	 * rename.
+	 */
+	old_inode->i_ctime = ext4_current_time(old_inode);
+	ext4_mark_inode_dirty(handle, old_inode);
+
+	/*
+	 * ok, that's it
+	 */
+	if (le32_to_cpu(old_de->inode) != old_inode->i_ino ||
+	    old_de->name_len != old_dentry->d_name.len ||
+	    strncmp(old_de->name, old_dentry->d_name.name, old_de->name_len) ||
+	    (retval = ext4_delete_entry(handle, old_dir,
+					old_de, old_bh)) == -ENOENT) {
+		/* old_de could have moved from under us during htree split, so
+		 * make sure that we are deleting the right entry.  We might
+		 * also be pointing to a stale entry in the unused part of
+		 * old_bh so just checking inum and the name isn't enough. */
+		struct buffer_head *old_bh2;
+		struct ext4_dir_entry_2 *old_de2;
+
+		old_bh2 = ext4_find_entry(old_dir, &old_dentry->d_name, &old_de2);
+		if (old_bh2) {
+			retval = ext4_delete_entry(handle, old_dir,
+						   old_de2, old_bh2);
+			brelse(old_bh2);
+		}
+	}
+	if (retval) {
+		/* only warned about; the rename carries on regardless */
+		ext4_warning(old_dir->i_sb,
+				"Deleting old file (%lu), %d, error=%d",
+				old_dir->i_ino, old_dir->i_nlink, retval);
+	}
+
+	if (new_inode) {
+		ext4_dec_count(handle, new_inode);
+		new_inode->i_ctime = ext4_current_time(new_inode);
+	}
+	old_dir->i_ctime = old_dir->i_mtime = ext4_current_time(old_dir);
+	ext4_update_dx_flag(old_dir);
+	if (dir_bh) {
+		/* a directory was moved: repoint its ".." at new_dir */
+		PARENT_INO(dir_bh->b_data, new_dir->i_sb->s_blocksize) =
+						cpu_to_le32(new_dir->i_ino);
+		BUFFER_TRACE(dir_bh, "call ext4_handle_dirty_metadata");
+		retval = ext4_handle_dirty_metadata(handle, old_inode, dir_bh);
+		if (retval) {
+			ext4_std_error(old_dir->i_sb, retval);
+			goto end_rename;
+		}
+		ext4_dec_count(handle, old_dir);
+		if (new_inode) {
+			/* checked empty_dir above, can't have another parent,
+			 * ext4_dec_count() won't work for many-linked dirs */
+			clear_nlink(new_inode);
+		} else {
+			ext4_inc_count(handle, new_dir);
+			ext4_update_dx_flag(new_dir);
+			ext4_mark_inode_dirty(handle, new_dir);
+		}
+	}
+	ext4_mark_inode_dirty(handle, old_dir);
+	if (new_inode) {
+		ext4_mark_inode_dirty(handle, new_inode);
+		if (!new_inode->i_nlink)
+			ext4_orphan_add(handle, new_inode);
+		if (!test_opt(new_dir->i_sb, NO_AUTO_DA_ALLOC))
+			force_da_alloc = 1;
+	}
+	retval = 0;
+
+end_rename:
+	brelse(dir_bh);
+	brelse(old_bh);
+	brelse(new_bh);
+	ext4_journal_stop(handle);
+	if (retval == 0 && force_da_alloc)
+		ext4_alloc_da_blocks(old_inode);
+	return retval;
+}
+
+/*
+ * directories can handle most operations...
+ *
+ * Inode operations table for directories.  The xattr entries are only
+ * compiled in when CONFIG_EXT4_FS_XATTR is defined.
+ */
+const struct inode_operations ext4_dir_inode_operations = {
+	.create		= ext4_create,
+	.lookup		= ext4_lookup,
+	.link		= ext4_link,
+	.unlink		= ext4_unlink,
+	.symlink	= ext4_symlink,
+	.mkdir		= ext4_mkdir,
+	.rmdir		= ext4_rmdir,
+	.mknod		= ext4_mknod,
+	.rename		= ext4_rename,
+	.setattr	= ext4_setattr,
+#ifdef CONFIG_EXT4_FS_XATTR
+	.setxattr	= generic_setxattr,
+	.getxattr	= generic_getxattr,
+	.listxattr	= ext4_listxattr,
+	.removexattr	= generic_removexattr,
+#endif
+	.get_acl	= ext4_get_acl,
+	.fiemap         = ext4_fiemap,
+};
+/*
+ * Inode operations table that ext4_mknod() installs on special inodes
+ * when CONFIG_EXT4_FS_XATTR is defined.
+ */
+const struct inode_operations ext4_special_inode_operations = {
+	.setattr	= ext4_setattr,
+#ifdef CONFIG_EXT4_FS_XATTR
+	.setxattr	= generic_setxattr,
+	.getxattr	= generic_getxattr,
+	.listxattr	= ext4_listxattr,
+	.removexattr	= generic_removexattr,
+#endif
+	.get_acl	= ext4_get_acl,
+};
diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c
new file mode 100644
index 00000000..dcdeef16
--- /dev/null
+++ b/fs/ext4/page-io.c
@@ -0,0 +1,433 @@
+/*
+ * linux/fs/ext4/page-io.c
+ *
+ * This contains the new page_io functions for ext4
+ *
+ * Written by Theodore Ts'o, 2010.
+ */
+/*
+ * NOTE(review): the system header names of the following #include lines
+ * are missing in this copy of the patch (they look like they were lost
+ * when the page was extracted) - restore them from the original file.
+ */
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include "ext4_jbd2.h"
+#include "xattr.h"
+#include "acl.h"
+#include "ext4_extents.h"
+/*
+ * Slab caches created by ext4_init_pageio(); the objects allocated from
+ * them in this file are ext4_io_page and ext4_io_end structures.
+ */
+static struct kmem_cache *io_page_cachep, *io_end_cachep;
+/*
+ * Create the two slab caches used by this file.
+ *
+ * Returns 0 on success or -ENOMEM.  If the second cache cannot be
+ * created, the first one is destroyed again before returning.
+ */
+int __init ext4_init_pageio(void)
+{
+	io_page_cachep = KMEM_CACHE(ext4_io_page, SLAB_RECLAIM_ACCOUNT);
+	if (io_page_cachep == NULL)
+		return -ENOMEM;
+	io_end_cachep = KMEM_CACHE(ext4_io_end, SLAB_RECLAIM_ACCOUNT);
+	if (io_end_cachep == NULL) {
+		kmem_cache_destroy(io_page_cachep);
+		return -ENOMEM;
+	}
+	return 0;
+}
+/*
+ * Destroy the caches created by ext4_init_pageio().
+ */
+void ext4_exit_pageio(void)
+{
+	kmem_cache_destroy(io_end_cachep);
+	kmem_cache_destroy(io_page_cachep);
+}
+/*
+ * Wait on the inode's ioend wait queue until i_ioend_count is zero, i.e.
+ * until every io_end set up by ext4_init_io_end() for this inode has been
+ * released by ext4_free_io_end().
+ */
+void ext4_ioend_wait(struct inode *inode)
+{
+	wait_queue_head_t *wq = ext4_ioend_wq(inode);
+
+	wait_event(*wq, (atomic_read(&EXT4_I(inode)->i_ioend_count) == 0));
+}
+/*
+ * Drop one reference on @io_page.  When the last reference goes away,
+ * writeback on the page is ended, the page reference is dropped and the
+ * io_page is returned to its cache.
+ */
+static void put_io_page(struct ext4_io_page *io_page)
+{
+	if (atomic_dec_and_test(&io_page->p_count)) {
+		end_page_writeback(io_page->p_page);
+		put_page(io_page->p_page);
+		kmem_cache_free(io_page_cachep, io_page);
+	}
+}
+/*
+ * Release @io: drop its page reference (if any) and its references on
+ * all attached io_pages, decrement the inode's i_ioend_count (waking the
+ * waiters of ext4_ioend_wait() when it reaches zero) and free the
+ * structure.  @io must not be NULL.
+ */
+void ext4_free_io_end(ext4_io_end_t *io)
+{
+	int i;
+
+	BUG_ON(!io);
+	if (io->page)
+		put_page(io->page);
+	for (i = 0; i < io->num_io_pages; i++)
+		put_io_page(io->pages[i]);
+	io->num_io_pages = 0;
+	if (atomic_dec_and_test(&EXT4_I(io->inode)->i_ioend_count))
+		wake_up_all(ext4_ioend_wq(io->inode));
+	kmem_cache_free(io_end_cachep, io);
+}
+
+/*
+ * check a range of space and convert unwritten extents to written.
+ *
+ * Called with inode->i_mutex; we depend on this when we manipulate
+ * io->flag, since we could otherwise race with ext4_flush_completed_IO()
+ *
+ * The range converted is io->offset / io->size.  A conversion failure is
+ * logged at KERN_EMERG level but does not stop the rest of the
+ * completion: the attached iocb (if any) is completed, inode_dio_done()
+ * is called for EXT4_IO_END_DIRECT requests and i_aiodio_unwritten is
+ * decremented.
+ *
+ * Returns the result of ext4_convert_unwritten_extents().
+ */
+int ext4_end_io_nolock(ext4_io_end_t *io)
+{
+	struct inode *inode = io->inode;
+	loff_t offset = io->offset;
+	ssize_t size = io->size;
+	int ret = 0;
+
+	ext4_debug("ext4_end_io_nolock: io 0x%p from inode %lu,list->next 0x%p,"
+		   "list->prev 0x%p\n",
+		   io, inode->i_ino, io->list.next, io->list.prev);
+
+	ret = ext4_convert_unwritten_extents(inode, offset, size);
+	if (ret < 0) {
+		ext4_msg(inode->i_sb, KERN_EMERG,
+			 "failed to convert unwritten extents to written "
+			 "extents -- potential data loss!  "
+			 "(inode %lu, offset %llu, size %zd, error %d)",
+			 inode->i_ino, offset, size, ret);
+	}
+
+	if (io->iocb)
+		aio_complete(io->iocb, io->result, 0);
+
+	if (io->flag & EXT4_IO_END_DIRECT)
+		inode_dio_done(inode);
+	/* Wake up anyone waiting on unwritten extent conversion */
+	if (atomic_dec_and_test(&EXT4_I(inode)->i_aiodio_unwritten))
+		wake_up_all(ext4_ioend_wq(io->inode));
+	return ret;
+}
+
+/*
+ * work on completed aio dio IO, to convert unwritten extents to extents
+ *
+ * Work function installed by ext4_init_io_end().  Under
+ * i_completed_io_lock it decides between three outcomes:
+ *  - the io_end is flagged EXT4_IO_END_IN_FSYNC, or i_mutex cannot be
+ *    taken without sleeping: mark it EXT4_IO_END_QUEUED and queue the
+ *    work again
+ *  - the io_end is no longer on a list: just free it
+ *  - otherwise: unlink it, run ext4_end_io_nolock() under i_mutex and
+ *    free it
+ */
+static void ext4_end_io_work(struct work_struct *work)
+{
+	ext4_io_end_t		*io = container_of(work, ext4_io_end_t, work);
+	struct inode		*inode = io->inode;
+	struct ext4_inode_info	*ei = EXT4_I(inode);
+	unsigned long		flags;
+
+	spin_lock_irqsave(&ei->i_completed_io_lock, flags);
+	if (io->flag & EXT4_IO_END_IN_FSYNC)
+		goto requeue;
+	if (list_empty(&io->list)) {
+		spin_unlock_irqrestore(&ei->i_completed_io_lock, flags);
+		goto free;
+	}
+
+	if (!mutex_trylock(&inode->i_mutex)) {
+		bool was_queued;
+requeue:	/* also entered by goto, with i_completed_io_lock held */
+		was_queued = !!(io->flag & EXT4_IO_END_QUEUED);
+		io->flag |= EXT4_IO_END_QUEUED;
+		spin_unlock_irqrestore(&ei->i_completed_io_lock, flags);
+		/*
+		 * Requeue the work instead of waiting so that the work
+		 * items queued after this can be processed.
+		 */
+		queue_work(EXT4_SB(inode->i_sb)->dio_unwritten_wq, &io->work);
+		/*
+		 * To prevent the ext4-dio-unwritten thread from keeping
+		 * requeueing end_io requests and occupying cpu for too long,
+		 * yield the cpu if it sees an end_io request that has already
+		 * been requeued.
+		 */
+		if (was_queued)
+			yield();
+		return;
+	}
+	list_del_init(&io->list);
+	spin_unlock_irqrestore(&ei->i_completed_io_lock, flags);
+	(void) ext4_end_io_nolock(io);
+	mutex_unlock(&inode->i_mutex);
+free:
+	ext4_free_io_end(io);
+}
+/*
+ * Allocate a zeroed io_end for @inode using the allocation flags @flags.
+ *
+ * On success i_ioend_count of the inode is incremented and the work item
+ * and list head of the io_end are initialised.  Returns NULL when the
+ * allocation fails.
+ */
+ext4_io_end_t *ext4_init_io_end(struct inode *inode, gfp_t flags)
+{
+	ext4_io_end_t *io = kmem_cache_zalloc(io_end_cachep, flags);
+	if (io) {
+		atomic_inc(&EXT4_I(inode)->i_ioend_count);
+		io->inode = inode;
+		INIT_WORK(&io->work, ext4_end_io_work);
+		INIT_LIST_HEAD(&io->list);
+	}
+	return io;
+}
+
+/*
+ * Print a buffer I/O error compatible with the fs/buffer.c.  This
+ * provides compatibility with dmesg scrapers that look for a specific
+ * buffer I/O error message.  We really need a unified error reporting
+ * structure to userspace ala Digital Unix's uerf system, but it's
+ * probably not going to happen in my lifetime, due to LKML politics...
+ */
+static void buffer_io_error(struct buffer_head *bh)
+{
+	char b[BDEVNAME_SIZE];
+	printk(KERN_ERR "Buffer I/O error on device %s, logical block %llu\n",
+			bdevname(bh->b_bdev, b),
+			(unsigned long long)bh->b_blocknr);
+}
+/*
+ * Completion callback installed on bios by io_submit_init().
+ *
+ * The bio is released, then every io_page attached to the io_end is
+ * dropped with put_io_page().  When @error is set (and the bio is not
+ * flagged BIO_UPTODATE) each page is first marked with an error and every
+ * buffer that lies completely inside the io_end's range is reported with
+ * buffer_io_error().  An io_end without EXT4_IO_END_UNWRITTEN is freed
+ * here; otherwise it is appended to the inode's completed-io list and its
+ * work item is queued on dio_unwritten_wq.
+ */
+static void ext4_end_bio(struct bio *bio, int error)
+{
+	ext4_io_end_t *io_end = bio->bi_private;
+	struct workqueue_struct *wq;
+	struct inode *inode;
+	unsigned long flags;
+	int i;
+	sector_t bi_sector = bio->bi_sector;	/* saved: bio is put below */
+
+	BUG_ON(!io_end);
+	bio->bi_private = NULL;
+	bio->bi_end_io = NULL;
+	if (test_bit(BIO_UPTODATE, &bio->bi_flags))
+		error = 0;
+	bio_put(bio);
+
+	for (i = 0; i < io_end->num_io_pages; i++) {
+		struct page *page = io_end->pages[i]->p_page;
+		struct buffer_head *bh, *head;
+		loff_t offset;
+		loff_t io_end_offset;
+
+		if (error) {
+			SetPageError(page);
+			set_bit(AS_EIO, &page->mapping->flags);
+			head = page_buffers(page);
+			BUG_ON(!head);
+
+			io_end_offset = io_end->offset + io_end->size;
+
+			offset = (sector_t) page->index << PAGE_CACHE_SHIFT;
+			bh = head;
+			do {
+				if ((offset >= io_end->offset) &&
+				    (offset+bh->b_size <= io_end_offset))
+					buffer_io_error(bh);
+
+				offset += bh->b_size;
+				bh = bh->b_this_page;
+			} while (bh != head);
+		}
+
+		put_io_page(io_end->pages[i]);
+	}
+	io_end->num_io_pages = 0;
+	inode = io_end->inode;
+
+	if (error) {
+		io_end->flag |= EXT4_IO_END_ERROR;
+		ext4_warning(inode->i_sb, "I/O error writing to inode %lu "
+			     "(offset %llu size %ld starting block %llu)",
+			     inode->i_ino,
+			     (unsigned long long) io_end->offset,
+			     (long) io_end->size,
+			     (unsigned long long)
+			     bi_sector >> (inode->i_blkbits - 9));
+	}
+
+	if (!(io_end->flag & EXT4_IO_END_UNWRITTEN)) {
+		ext4_free_io_end(io_end);
+		return;
+	}
+
+	/* Add the io_end to per-inode completed io list*/
+	spin_lock_irqsave(&EXT4_I(inode)->i_completed_io_lock, flags);
+	list_add_tail(&io_end->list, &EXT4_I(inode)->i_completed_io_list);
+	spin_unlock_irqrestore(&EXT4_I(inode)->i_completed_io_lock, flags);
+
+	wq = EXT4_SB(inode->i_sb)->dio_unwritten_wq;
+	/* queue the work to convert unwritten extents to written */
+	queue_work(wq, &io_end->work);
+}
+/*
+ * Submit the bio accumulated in @io, if there is one, and reset @io so
+ * that the next buffer starts a new bio and io_end.
+ */
+void ext4_io_submit(struct ext4_io_submit *io)
+{
+	struct bio *bio = io->io_bio;
+
+	if (bio) {
+		/* hold an extra reference across submit_bio() so the flag
+		 * check below can still look at the bio */
+		bio_get(io->io_bio);
+		submit_bio(io->io_op, io->io_bio);
+		BUG_ON(bio_flagged(io->io_bio, BIO_EOPNOTSUPP));
+		bio_put(io->io_bio);
+	}
+	io->io_bio = NULL;
+	io->io_op = 0;
+	io->io_end = NULL;
+}
+/*
+ * Start a new bio and io_end in @io for a write beginning at @bh.
+ *
+ * Returns 0 on success or -ENOMEM when no io_end can be allocated.
+ */
+static int io_submit_init(struct ext4_io_submit *io,
+			  struct inode *inode,
+			  struct writeback_control *wbc,
+			  struct buffer_head *bh)
+{
+	ext4_io_end_t *io_end;
+	struct page *page = bh->b_page;
+	int nvecs = bio_get_nr_vecs(bh->b_bdev);
+	struct bio *bio;
+
+	io_end = ext4_init_io_end(inode, GFP_NOFS);
+	if (!io_end)
+		return -ENOMEM;
+	bio = bio_alloc(GFP_NOIO, min(nvecs, BIO_MAX_PAGES));
+	bio->bi_sector = bh->b_blocknr * (bh->b_size >> 9);
+	bio->bi_bdev = bh->b_bdev;
+	bio->bi_private = io->io_end = io_end;
+	bio->bi_end_io = ext4_end_bio;
+
+	/* NOTE(review): the shift is done in the type of page->index, while
+	 * ext4_end_bio() casts page->index before shifting - confirm this
+	 * cannot truncate where page->index is narrower than loff_t. */
+	io_end->offset = (page->index << PAGE_CACHE_SHIFT) + bh_offset(bh);
+
+	io->io_bio = bio;
+	io->io_op = (wbc->sync_mode == WB_SYNC_ALL ?  WRITE_SYNC : WRITE);
+	io->io_next_block = bh->b_blocknr;
+	return 0;
+}
+/*
+ * Add the buffer @bh, which belongs to @io_page, to the bio being built
+ * in @io.
+ *
+ * Unmapped or delayed buffers are not written: the pending bio is
+ * submitted and 0 is returned (an unmapped buffer also has its dirty bit
+ * cleared).  The pending bio is likewise submitted and a new one started
+ * when @bh is not the next block expected, when the io_end already holds
+ * MAX_IO_PAGES other pages, or when bio_add_page() cannot take the whole
+ * buffer.
+ *
+ * Returns 0 on success or the error from io_submit_init().
+ */
+static int io_submit_add_bh(struct ext4_io_submit *io,
+			    struct ext4_io_page *io_page,
+			    struct inode *inode,
+			    struct writeback_control *wbc,
+			    struct buffer_head *bh)
+{
+	ext4_io_end_t *io_end;
+	int ret;
+
+	if (buffer_new(bh)) {
+		clear_buffer_new(bh);
+		unmap_underlying_metadata(bh->b_bdev, bh->b_blocknr);
+	}
+
+	if (!buffer_mapped(bh) || buffer_delay(bh)) {
+		if (!buffer_mapped(bh))
+			clear_buffer_dirty(bh);
+		if (io->io_bio)
+			ext4_io_submit(io);
+		return 0;
+	}
+
+	if (io->io_bio && bh->b_blocknr != io->io_next_block) {
+submit_and_retry:	/* also entered by goto from the checks below */
+		ext4_io_submit(io);
+	}
+	if (io->io_bio == NULL) {
+		ret = io_submit_init(io, inode, wbc, bh);
+		if (ret)
+			return ret;
+	}
+	io_end = io->io_end;
+	if ((io_end->num_io_pages >= MAX_IO_PAGES) &&
+	    (io_end->pages[io_end->num_io_pages-1] != io_page))
+		goto submit_and_retry;
+	if (buffer_uninit(bh))
+		ext4_set_io_unwritten_flag(inode, io_end);
+	/* NOTE(review): size and io_next_block are advanced before
+	 * bio_add_page(); if the add falls short, the io_end that is then
+	 * submitted has already been grown by this buffer - confirm this
+	 * is intended. */
+	io->io_end->size += bh->b_size;
+	io->io_next_block++;
+	ret = bio_add_page(io->io_bio, bh->b_page, bh->b_size, bh_offset(bh));
+	if (ret != bh->b_size)
+		goto submit_and_retry;
+	if ((io_end->num_io_pages == 0) ||
+	    (io_end->pages[io_end->num_io_pages-1] != io_page)) {
+		/* first buffer of this page in the io_end: take a reference */
+		io_end->pages[io_end->num_io_pages++] = io_page;
+		atomic_inc(&io_page->p_count);
+	}
+	return 0;
+}
+
+int ext4_bio_write_page(struct ext4_io_submit *io,
+			struct page *page,
+			int len,
+			struct writeback_control *wbc)
+{
+	struct inode *inode = page->mapping->host;
+	unsigned block_start, block_end, blocksize;
+	struct ext4_io_page *io_page;
+	struct buffer_head *bh, *head;
+	int ret = 0;
+
+	blocksize = 1 << inode->i_blkbits;
+
+	BUG_ON(!PageLocked(page));
+	BUG_ON(PageWriteback(page));
+
+	io_page = kmem_cache_alloc(io_page_cachep, GFP_NOFS);
+	if (!io_page) {
+		set_page_dirty(page);
+		unlock_page(page);
+		return -ENOMEM;
+	}
+	io_page->p_page = page;
+	atomic_set(&io_page->p_count, 1);
+	get_page(page);
+	set_page_writeback(page);
+ ClearPageError(page); + + for (bh = head = page_buffers(page), block_start = 0; + bh != head || !block_start; + block_start = block_end, bh = bh->b_this_page) { + + block_end = block_start + blocksize; + if (block_start >= len) { + /* + * Comments copied from block_write_full_page_endio: + * + * The page straddles i_size. It must be zeroed out on + * each and every writepage invocation because it may + * be mmapped. "A file is mapped in multiples of the + * page size. For a file that is not a multiple of + * the page size, the remaining memory is zeroed when + * mapped, and writes to that region are not written + * out to the file." + */ + zero_user_segment(page, block_start, block_end); + clear_buffer_dirty(bh); + set_buffer_uptodate(bh); + continue; + } + clear_buffer_dirty(bh); + ret = io_submit_add_bh(io, io_page, inode, wbc, bh); + if (ret) { + /* + * We only get here on ENOMEM. Not much else + * we can do but mark the page as dirty, and + * better luck next time. + */ + set_page_dirty(page); + break; + } + } + unlock_page(page); + /* + * If the page was truncated before we could do the writeback, + * or we had a memory allocation error while trying to write + * the first buffer head, we won't have submitted any pages for + * I/O. In that case we need to make sure we've cleared the + * PageWriteback bit from the page to prevent the system from + * wedging later on. + */ + put_io_page(io_page); + return ret; +} diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c new file mode 100644 index 00000000..53589ff8 --- /dev/null +++ b/fs/ext4/resize.c @@ -0,0 +1,1689 @@ +/* + * linux/fs/ext4/resize.c + * + * Support for resizing an ext4 filesystem while it is mounted. + * + * Copyright (C) 2001, 2002 Andreas Dilger + * + * This could probably be made into a module, because it is not often in use. 
+ */ + + +#define EXT4FS_DEBUG + +#include +#include + +#include "ext4_jbd2.h" + +int ext4_resize_begin(struct super_block *sb) +{ + int ret = 0; + + if (!capable(CAP_SYS_RESOURCE)) + return -EPERM; + + /* + * We are not allowed to do online-resizing on a filesystem mounted + * with error, because it can destroy the filesystem easily. + */ + if (EXT4_SB(sb)->s_mount_state & EXT4_ERROR_FS) { + ext4_warning(sb, "There are errors in the filesystem, " + "so online resizing is not allowed\n"); + return -EPERM; + } + + if (test_and_set_bit_lock(EXT4_RESIZING, &EXT4_SB(sb)->s_resize_flags)) + ret = -EBUSY; + + return ret; +} + +void ext4_resize_end(struct super_block *sb) +{ + clear_bit_unlock(EXT4_RESIZING, &EXT4_SB(sb)->s_resize_flags); + smp_mb__after_clear_bit(); +} + +#define outside(b, first, last) ((b) < (first) || (b) >= (last)) +#define inside(b, first, last) ((b) >= (first) && (b) < (last)) + +static int verify_group_input(struct super_block *sb, + struct ext4_new_group_data *input) +{ + struct ext4_sb_info *sbi = EXT4_SB(sb); + struct ext4_super_block *es = sbi->s_es; + ext4_fsblk_t start = ext4_blocks_count(es); + ext4_fsblk_t end = start + input->blocks_count; + ext4_group_t group = input->group; + ext4_fsblk_t itend = input->inode_table + sbi->s_itb_per_group; + unsigned overhead = ext4_bg_has_super(sb, group) ? + (1 + ext4_bg_num_gdb(sb, group) + + le16_to_cpu(es->s_reserved_gdt_blocks)) : 0; + ext4_fsblk_t metaend = start + overhead; + struct buffer_head *bh = NULL; + ext4_grpblk_t free_blocks_count, offset; + int err = -EINVAL; + + input->free_blocks_count = free_blocks_count = + input->blocks_count - 2 - overhead - sbi->s_itb_per_group; + + if (test_opt(sb, DEBUG)) + printk(KERN_DEBUG "EXT4-fs: adding %s group %u: %u blocks " + "(%d free, %u reserved)\n", + ext4_bg_has_super(sb, input->group) ? 
"normal" : + "no-super", input->group, input->blocks_count, + free_blocks_count, input->reserved_blocks); + + ext4_get_group_no_and_offset(sb, start, NULL, &offset); + if (group != sbi->s_groups_count) + ext4_warning(sb, "Cannot add at group %u (only %u groups)", + input->group, sbi->s_groups_count); + else if (offset != 0) + ext4_warning(sb, "Last group not full"); + else if (input->reserved_blocks > input->blocks_count / 5) + ext4_warning(sb, "Reserved blocks too high (%u)", + input->reserved_blocks); + else if (free_blocks_count < 0) + ext4_warning(sb, "Bad blocks count %u", + input->blocks_count); + else if (!(bh = sb_bread(sb, end - 1))) + ext4_warning(sb, "Cannot read last block (%llu)", + end - 1); + else if (outside(input->block_bitmap, start, end)) + ext4_warning(sb, "Block bitmap not in group (block %llu)", + (unsigned long long)input->block_bitmap); + else if (outside(input->inode_bitmap, start, end)) + ext4_warning(sb, "Inode bitmap not in group (block %llu)", + (unsigned long long)input->inode_bitmap); + else if (outside(input->inode_table, start, end) || + outside(itend - 1, start, end)) + ext4_warning(sb, "Inode table not in group (blocks %llu-%llu)", + (unsigned long long)input->inode_table, itend - 1); + else if (input->inode_bitmap == input->block_bitmap) + ext4_warning(sb, "Block bitmap same as inode bitmap (%llu)", + (unsigned long long)input->block_bitmap); + else if (inside(input->block_bitmap, input->inode_table, itend)) + ext4_warning(sb, "Block bitmap (%llu) in inode table " + "(%llu-%llu)", + (unsigned long long)input->block_bitmap, + (unsigned long long)input->inode_table, itend - 1); + else if (inside(input->inode_bitmap, input->inode_table, itend)) + ext4_warning(sb, "Inode bitmap (%llu) in inode table " + "(%llu-%llu)", + (unsigned long long)input->inode_bitmap, + (unsigned long long)input->inode_table, itend - 1); + else if (inside(input->block_bitmap, start, metaend)) + ext4_warning(sb, "Block bitmap (%llu) in GDT table (%llu-%llu)", 
+ (unsigned long long)input->block_bitmap, + start, metaend - 1); + else if (inside(input->inode_bitmap, start, metaend)) + ext4_warning(sb, "Inode bitmap (%llu) in GDT table (%llu-%llu)", + (unsigned long long)input->inode_bitmap, + start, metaend - 1); + else if (inside(input->inode_table, start, metaend) || + inside(itend - 1, start, metaend)) + ext4_warning(sb, "Inode table (%llu-%llu) overlaps GDT table " + "(%llu-%llu)", + (unsigned long long)input->inode_table, + itend - 1, start, metaend - 1); + else + err = 0; + brelse(bh); + + return err; +} + +/* + * ext4_new_flex_group_data is used by 64bit-resize interface to add a flex + * group each time. + */ +struct ext4_new_flex_group_data { + struct ext4_new_group_data *groups; /* new_group_data for groups + in the flex group */ + __u16 *bg_flags; /* block group flags of groups + in @groups */ + ext4_group_t count; /* number of groups in @groups + */ +}; + +/* + * alloc_flex_gd() allocates a ext4_new_flex_group_data with size of + * @flexbg_size. + * + * Returns NULL on failure otherwise address of the allocated structure. 
+ */
+static struct ext4_new_flex_group_data *alloc_flex_gd(unsigned long flexbg_size)
+{
+	struct ext4_new_flex_group_data *flex_gd;
+
+	flex_gd = kmalloc(sizeof(*flex_gd), GFP_NOFS);
+	if (flex_gd == NULL)
+		goto out3;
+
+	/*
+	 * Guard the multiplication below against overflow.  The bound has to
+	 * use the size of the array element that is actually allocated
+	 * (ext4_new_group_data), not the size of the container struct.
+	 */
+	if (flexbg_size >= UINT_MAX / sizeof(struct ext4_new_group_data))
+		goto out2;
+	flex_gd->count = flexbg_size;
+
+	flex_gd->groups = kmalloc(sizeof(struct ext4_new_group_data) *
+				  flexbg_size, GFP_NOFS);
+	if (flex_gd->groups == NULL)
+		goto out2;
+
+	flex_gd->bg_flags = kmalloc(flexbg_size * sizeof(__u16), GFP_NOFS);
+	if (flex_gd->bg_flags == NULL)
+		goto out1;
+
+	return flex_gd;
+
+out1:
+	kfree(flex_gd->groups);
+out2:
+	kfree(flex_gd);
+out3:
+	return NULL;
+}
+
+/* Free a structure obtained from alloc_flex_gd(), including both arrays. */
+static void free_flex_gd(struct ext4_new_flex_group_data *flex_gd)
+{
+	kfree(flex_gd->bg_flags);
+	kfree(flex_gd->groups);
+	kfree(flex_gd);
+}
+
+/*
+ * ext4_alloc_group_tables() allocates block bitmaps, inode bitmaps
+ * and inode tables for a flex group.
+ *
+ * This function is used by 64bit-resize.  Note that this function allocates
+ * group tables from the 1st group of groups contained by @flexgd, which may
+ * be a partial of a flex group.
+ *
+ * @sb: super block of fs to which the groups belong
+ * @flex_gd: the new groups; the block_bitmap, inode_bitmap and inode_table
+ *           fields of every entry in @flex_gd->groups are filled in here,
+ *           and free_blocks_count / bg_flags of whichever group ends up
+ *           hosting a table are adjusted
+ * @flexbg_size: flex group size; used as a mask in the BUG_ON() below, so
+ *               presumably a power of two -- TODO confirm with callers
+ *
+ * Tables are carved out of a window [start_blk, last_blk) that begins just
+ * past the superblock/GDT overhead of src_group and is extended over the
+ * following groups until one with a superblock backup is met.  When the
+ * window is exhausted the code jumps back to next_group and opens a new
+ * window at the group where the previous one stopped; the three *_index
+ * counters keep their values across those jumps.
+ */
+static void ext4_alloc_group_tables(struct super_block *sb,
+				struct ext4_new_flex_group_data *flex_gd,
+				int flexbg_size)
+{
+	struct ext4_new_group_data *group_data = flex_gd->groups;
+	struct ext4_super_block *es = EXT4_SB(sb)->s_es;
+	ext4_fsblk_t start_blk;
+	ext4_fsblk_t last_blk;
+	ext4_group_t src_group;
+	ext4_group_t bb_index = 0;
+	ext4_group_t ib_index = 0;
+	ext4_group_t it_index = 0;
+	ext4_group_t group;
+	ext4_group_t last_group;
+	unsigned overhead;
+
+	BUG_ON(flex_gd->count == 0 || group_data == NULL);
+
+	src_group = group_data[0].group;
+	last_group  = src_group + flex_gd->count - 1;
+
+	BUG_ON((flexbg_size > 1) && ((src_group & ~(flexbg_size - 1)) !=
+	       (last_group & ~(flexbg_size - 1))));
+next_group:
+	group = group_data[0].group;
+	start_blk = ext4_group_first_block_no(sb, src_group);
+	last_blk = start_blk + group_data[src_group - group].blocks_count;
+
+	overhead = ext4_bg_has_super(sb, src_group) ?
+		   (1 + ext4_bg_num_gdb(sb, src_group) +
+		    le16_to_cpu(es->s_reserved_gdt_blocks)) : 0;
+
+	start_blk += overhead;
+
+	BUG_ON(src_group >= group_data[0].group + flex_gd->count);
+	/* We collect contiguous blocks as much as possible. */
+	src_group++;
+	for (; src_group <= last_group; src_group++)
+		if (!ext4_bg_has_super(sb, src_group))
+			last_blk += group_data[src_group - group].blocks_count;
+		else
+			break;
+
+	/* Allocate block bitmaps */
+	for (; bb_index < flex_gd->count; bb_index++) {
+		if (start_blk >= last_blk)
+			goto next_group;
+		group_data[bb_index].block_bitmap = start_blk++;
+		ext4_get_group_no_and_offset(sb, start_blk - 1, &group, NULL);
+		group -= group_data[0].group;	/* index of the hosting group */
+		group_data[group].free_blocks_count--;
+		if (flexbg_size > 1)
+			flex_gd->bg_flags[group] &= ~EXT4_BG_BLOCK_UNINIT;
+	}
+
+	/* Allocate inode bitmaps */
+	for (; ib_index < flex_gd->count; ib_index++) {
+		if (start_blk >= last_blk)
+			goto next_group;
+		group_data[ib_index].inode_bitmap = start_blk++;
+		ext4_get_group_no_and_offset(sb, start_blk - 1, &group, NULL);
+		group -= group_data[0].group;	/* index of the hosting group */
+		group_data[group].free_blocks_count--;
+		if (flexbg_size > 1)
+			flex_gd->bg_flags[group] &= ~EXT4_BG_BLOCK_UNINIT;
+	}
+
+	/* Allocate inode tables */
+	for (; it_index < flex_gd->count; it_index++) {
+		if (start_blk + EXT4_SB(sb)->s_itb_per_group > last_blk)
+			goto next_group;
+		group_data[it_index].inode_table = start_blk;
+		ext4_get_group_no_and_offset(sb, start_blk, &group, NULL);
+		group -= group_data[0].group;	/* index of the hosting group */
+		group_data[group].free_blocks_count -=
+					EXT4_SB(sb)->s_itb_per_group;
+		if (flexbg_size > 1)
+			flex_gd->bg_flags[group] &= ~EXT4_BG_BLOCK_UNINIT;
+
+		start_blk += EXT4_SB(sb)->s_itb_per_group;
+	}
+
+	if (test_opt(sb, DEBUG)) {
+		int i;
+		group = group_data[0].group;
+
+		printk(KERN_DEBUG "EXT4-fs: adding a flex group with "
+		       "%d groups, flexbg size is %d:\n", flex_gd->count,
+		       flexbg_size);
+
+		for (i = 0; i < flex_gd->count; i++) {
+			printk(KERN_DEBUG "adding %s group %u: %u "
+			       "blocks (%d free)\n",
+			       ext4_bg_has_super(sb, group + i) ? "normal" :
+			       "no-super", group + i,
+			       group_data[i].blocks_count,
+			       group_data[i].free_blocks_count);
+		}
+	}
+}
+
+static struct buffer_head *bclean(handle_t *handle, struct super_block *sb,
+				  ext4_fsblk_t blk)
+{
+	struct buffer_head *bh;
+	int err;
+
+	bh = sb_getblk(sb, blk);
+	if (!bh)
+		return ERR_PTR(-EIO);	/* failures are ERR_PTR(), never NULL */
+	if ((err = ext4_journal_get_write_access(handle, bh))) {
+		brelse(bh);
+		bh = ERR_PTR(err);
+	} else {
+		/* zero-filled buffer, journal write access already held */
+		memset(bh->b_data, 0, sb->s_blocksize);
+		set_buffer_uptodate(bh);
+	}
+
+	return bh;
+}
+
+/*
+ * If we have fewer than thresh credits, extend by EXT4_MAX_TRANS_DATA.
+ * If that fails, restart the transaction & regain write access for the
+ * buffer head which is used for block_bitmap modifications.
+ *
+ * Returns 0 when the handle can be used further, or a negative error from
+ * ext4_journal_extend() / ext4_journal_restart().  NOTE(review): nothing
+ * in this function re-acquires write access after a restart, so that part
+ * of the description above appears to be left to the caller -- confirm.
+ */
+static int extend_or_restart_transaction(handle_t *handle, int thresh)
+{
+	int err;
+
+	if (ext4_handle_has_enough_credits(handle, thresh))
+		return 0;
+
+	err = ext4_journal_extend(handle, EXT4_MAX_TRANS_DATA);
+	if (err < 0)
+		return err;
+	if (err) {	/* positive: could not extend, so restart instead */
+		err = ext4_journal_restart(handle, EXT4_MAX_TRANS_DATA);
+		if (err)
+			return err;
+	}
+
+	return 0;
+}
+
+/*
+ * set_flexbg_block_bitmap() mark @count blocks starting from @block used.
+ *
+ * Helper function for ext4_setup_new_group_blocks() which set .
+ * + * @sb: super block + * @handle: journal handle + * @flex_gd: flex group data + */ +static int set_flexbg_block_bitmap(struct super_block *sb, handle_t *handle, + struct ext4_new_flex_group_data *flex_gd, + ext4_fsblk_t block, ext4_group_t count) +{ + ext4_group_t count2; + + ext4_debug("mark blocks [%llu/%u] used\n", block, count); + for (count2 = count; count > 0; count -= count2, block += count2) { + ext4_fsblk_t start; + struct buffer_head *bh; + ext4_group_t group; + int err; + + ext4_get_group_no_and_offset(sb, block, &group, NULL); + start = ext4_group_first_block_no(sb, group); + group -= flex_gd->groups[0].group; + + count2 = sb->s_blocksize * 8 - (block - start); + if (count2 > count) + count2 = count; + + if (flex_gd->bg_flags[group] & EXT4_BG_BLOCK_UNINIT) { + BUG_ON(flex_gd->count > 1); + continue; + } + + err = extend_or_restart_transaction(handle, 1); + if (err) + return err; + + bh = sb_getblk(sb, flex_gd->groups[group].block_bitmap); + if (!bh) + return -EIO; + + err = ext4_journal_get_write_access(handle, bh); + if (err) + return err; + ext4_debug("mark block bitmap %#04llx (+%llu/%u)\n", block, + block - start, count2); + ext4_set_bits(bh->b_data, block - start, count2); + + err = ext4_handle_dirty_metadata(handle, NULL, bh); + if (unlikely(err)) + return err; + brelse(bh); + } + + return 0; +} + +/* + * Set up the block and inode bitmaps, and the inode table for the new groups. + * This doesn't need to be part of the main transaction, since we are only + * changing blocks outside the actual filesystem. We still do journaling to + * ensure the recovery is correct in case of a failure just after resize. + * If any part of this fails, we simply abort the resize. + * + * setup_new_flex_group_blocks handles a flex group as follow: + * 1. copy super block and GDT, and initialize group tables if necessary. + * In this step, we only set bits in blocks bitmaps for blocks taken by + * super block and GDT. + * 2. 
allocate group tables in block bitmaps, that is, set bits in block + * bitmap for blocks taken by group tables. + */ +static int setup_new_flex_group_blocks(struct super_block *sb, + struct ext4_new_flex_group_data *flex_gd) +{ + int group_table_count[] = {1, 1, EXT4_SB(sb)->s_itb_per_group}; + ext4_fsblk_t start; + ext4_fsblk_t block; + struct ext4_sb_info *sbi = EXT4_SB(sb); + struct ext4_super_block *es = sbi->s_es; + struct ext4_new_group_data *group_data = flex_gd->groups; + __u16 *bg_flags = flex_gd->bg_flags; + handle_t *handle; + ext4_group_t group, count; + struct buffer_head *bh = NULL; + int reserved_gdb, i, j, err = 0, err2; + + BUG_ON(!flex_gd->count || !group_data || + group_data[0].group != sbi->s_groups_count); + + reserved_gdb = le16_to_cpu(es->s_reserved_gdt_blocks); + + /* This transaction may be extended/restarted along the way */ + handle = ext4_journal_start_sb(sb, EXT4_MAX_TRANS_DATA); + if (IS_ERR(handle)) + return PTR_ERR(handle); + + group = group_data[0].group; + for (i = 0; i < flex_gd->count; i++, group++) { + unsigned long gdblocks; + + gdblocks = ext4_bg_num_gdb(sb, group); + start = ext4_group_first_block_no(sb, group); + + /* Copy all of the GDT blocks into the backup in this group */ + for (j = 0, block = start + 1; j < gdblocks; j++, block++) { + struct buffer_head *gdb; + + ext4_debug("update backup group %#04llx\n", block); + err = extend_or_restart_transaction(handle, 1); + if (err) + goto out; + + gdb = sb_getblk(sb, block); + if (!gdb) { + err = -EIO; + goto out; + } + + err = ext4_journal_get_write_access(handle, gdb); + if (err) { + brelse(gdb); + goto out; + } + memcpy(gdb->b_data, sbi->s_group_desc[j]->b_data, + gdb->b_size); + set_buffer_uptodate(gdb); + + err = ext4_handle_dirty_metadata(handle, NULL, gdb); + if (unlikely(err)) { + brelse(gdb); + goto out; + } + brelse(gdb); + } + + /* Zero out all of the reserved backup group descriptor + * table blocks + */ + if (ext4_bg_has_super(sb, group)) { + err = 
sb_issue_zeroout(sb, gdblocks + start + 1, + reserved_gdb, GFP_NOFS); + if (err) + goto out; + } + + /* Initialize group tables of the grop @group */ + if (!(bg_flags[i] & EXT4_BG_INODE_ZEROED)) + goto handle_bb; + + /* Zero out all of the inode table blocks */ + block = group_data[i].inode_table; + ext4_debug("clear inode table blocks %#04llx -> %#04lx\n", + block, sbi->s_itb_per_group); + err = sb_issue_zeroout(sb, block, sbi->s_itb_per_group, + GFP_NOFS); + if (err) + goto out; + +handle_bb: + if (bg_flags[i] & EXT4_BG_BLOCK_UNINIT) + goto handle_ib; + + /* Initialize block bitmap of the @group */ + block = group_data[i].block_bitmap; + err = extend_or_restart_transaction(handle, 1); + if (err) + goto out; + + bh = bclean(handle, sb, block); + if (IS_ERR(bh)) { + err = PTR_ERR(bh); + goto out; + } + if (ext4_bg_has_super(sb, group)) { + ext4_debug("mark backup superblock %#04llx (+0)\n", + start); + ext4_set_bits(bh->b_data, 0, gdblocks + reserved_gdb + + 1); + } + ext4_mark_bitmap_end(group_data[i].blocks_count, + sb->s_blocksize * 8, bh->b_data); + err = ext4_handle_dirty_metadata(handle, NULL, bh); + if (err) + goto out; + brelse(bh); + +handle_ib: + if (bg_flags[i] & EXT4_BG_INODE_UNINIT) + continue; + + /* Initialize inode bitmap of the @group */ + block = group_data[i].inode_bitmap; + err = extend_or_restart_transaction(handle, 1); + if (err) + goto out; + /* Mark unused entries in inode bitmap used */ + bh = bclean(handle, sb, block); + if (IS_ERR(bh)) { + err = PTR_ERR(bh); + goto out; + } + + ext4_mark_bitmap_end(EXT4_INODES_PER_GROUP(sb), + sb->s_blocksize * 8, bh->b_data); + err = ext4_handle_dirty_metadata(handle, NULL, bh); + if (err) + goto out; + brelse(bh); + } + bh = NULL; + + /* Mark group tables in block bitmap */ + for (j = 0; j < GROUP_TABLE_COUNT; j++) { + count = group_table_count[j]; + start = (&group_data[0].block_bitmap)[j]; + block = start; + for (i = 1; i < flex_gd->count; i++) { + block += group_table_count[j]; + if (block == 
(&group_data[i].block_bitmap)[j]) { + count += group_table_count[j]; + continue; + } + err = set_flexbg_block_bitmap(sb, handle, + flex_gd, start, count); + if (err) + goto out; + count = group_table_count[j]; + start = group_data[i].block_bitmap; + block = start; + } + + if (count) { + err = set_flexbg_block_bitmap(sb, handle, + flex_gd, start, count); + if (err) + goto out; + } + } + +out: + brelse(bh); + err2 = ext4_journal_stop(handle); + if (err2 && !err) + err = err2; + + return err; +} + +/* + * Iterate through the groups which hold BACKUP superblock/GDT copies in an + * ext4 filesystem. The counters should be initialized to 1, 5, and 7 before + * calling this for the first time. In a sparse filesystem it will be the + * sequence of powers of 3, 5, and 7: 1, 3, 5, 7, 9, 25, 27, 49, 81, ... + * For a non-sparse filesystem it will be every group: 1, 2, 3, 4, ... + */ +static unsigned ext4_list_backups(struct super_block *sb, unsigned *three, + unsigned *five, unsigned *seven) +{ + unsigned *min = three; + int mult = 3; + unsigned ret; + + if (!EXT4_HAS_RO_COMPAT_FEATURE(sb, + EXT4_FEATURE_RO_COMPAT_SPARSE_SUPER)) { + ret = *min; + *min += 1; + return ret; + } + + if (*five < *min) { + min = five; + mult = 5; + } + if (*seven < *min) { + min = seven; + mult = 7; + } + + ret = *min; + *min *= mult; + + return ret; +} + +/* + * Check that all of the backup GDT blocks are held in the primary GDT block. + * It is assumed that they are stored in group order. Returns the number of + * groups in current filesystem that have BACKUPS, or -ve error code. 
+ */
+static int verify_reserved_gdb(struct super_block *sb,
+			       ext4_group_t end,
+			       struct buffer_head *primary)
+{
+	const ext4_fsblk_t blk = primary->b_blocknr;
+	__le32 *entry = (__le32 *)primary->b_data;
+	unsigned three = 1, five = 5, seven = 7;
+	unsigned grp;
+	int nr_backups = 0;
+
+	for (;;) {
+		grp = ext4_list_backups(sb, &three, &five, &seven);
+		if (grp >= end)
+			break;
+
+		/* each slot must point at this block's copy in group grp */
+		if (le32_to_cpu(*entry) !=
+		    grp * EXT4_BLOCKS_PER_GROUP(sb) + blk) {
+			ext4_warning(sb, "reserved GDT %llu"
+				     " missing grp %d (%llu)",
+				     blk, grp,
+				     grp *
+				     (ext4_fsblk_t)EXT4_BLOCKS_PER_GROUP(sb) +
+				     blk);
+			return -EINVAL;
+		}
+		entry++;
+
+		nr_backups++;
+		if (nr_backups > EXT4_ADDR_PER_BLOCK(sb))
+			return -EFBIG;
+	}
+
+	return nr_backups;
+}
+
+/*
+ * Called when we need to bring a reserved group descriptor table block into
+ * use from the resize inode.  The primary copy of the new GDT block currently
+ * is an indirect block (under the double indirect block in the resize inode).
+ * The new backup GDT blocks will be stored as leaf blocks in this indirect
+ * block, in group order.  Even though we know all the block numbers we need,
+ * we check to ensure that the resize inode has actually reserved these blocks.
+ *
+ * Don't need to update the block bitmaps because the blocks are still in use.
+ *
+ * We get all of the error cases out of the way, so that we are sure to not
+ * fail once we start modifying the data on disk, because JBD has no rollback.
+ */ +static int add_new_gdb(handle_t *handle, struct inode *inode, + ext4_group_t group) +{ + struct super_block *sb = inode->i_sb; + struct ext4_super_block *es = EXT4_SB(sb)->s_es; + unsigned long gdb_num = group / EXT4_DESC_PER_BLOCK(sb); + ext4_fsblk_t gdblock = EXT4_SB(sb)->s_sbh->b_blocknr + 1 + gdb_num; + struct buffer_head **o_group_desc, **n_group_desc; + struct buffer_head *dind; + struct buffer_head *gdb_bh; + int gdbackups; + struct ext4_iloc iloc; + __le32 *data; + int err; + + if (test_opt(sb, DEBUG)) + printk(KERN_DEBUG + "EXT4-fs: ext4_add_new_gdb: adding group block %lu\n", + gdb_num); + + /* + * If we are not using the primary superblock/GDT copy don't resize, + * because the user tools have no way of handling this. Probably a + * bad time to do it anyways. + */ + if (EXT4_SB(sb)->s_sbh->b_blocknr != + le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block)) { + ext4_warning(sb, "won't resize using backup superblock at %llu", + (unsigned long long)EXT4_SB(sb)->s_sbh->b_blocknr); + return -EPERM; + } + + gdb_bh = sb_bread(sb, gdblock); + if (!gdb_bh) + return -EIO; + + gdbackups = verify_reserved_gdb(sb, group, gdb_bh); + if (gdbackups < 0) { + err = gdbackups; + goto exit_bh; + } + + data = EXT4_I(inode)->i_data + EXT4_DIND_BLOCK; + dind = sb_bread(sb, le32_to_cpu(*data)); + if (!dind) { + err = -EIO; + goto exit_bh; + } + + data = (__le32 *)dind->b_data; + if (le32_to_cpu(data[gdb_num % EXT4_ADDR_PER_BLOCK(sb)]) != gdblock) { + ext4_warning(sb, "new group %u GDT block %llu not reserved", + group, gdblock); + err = -EINVAL; + goto exit_dind; + } + + err = ext4_journal_get_write_access(handle, EXT4_SB(sb)->s_sbh); + if (unlikely(err)) + goto exit_dind; + + err = ext4_journal_get_write_access(handle, gdb_bh); + if (unlikely(err)) + goto exit_sbh; + + err = ext4_journal_get_write_access(handle, dind); + if (unlikely(err)) + ext4_std_error(sb, err); + + /* ext4_reserve_inode_write() gets a reference on the iloc */ + err = ext4_reserve_inode_write(handle, 
inode, &iloc); + if (unlikely(err)) + goto exit_dindj; + + n_group_desc = ext4_kvmalloc((gdb_num + 1) * + sizeof(struct buffer_head *), + GFP_NOFS); + if (!n_group_desc) { + err = -ENOMEM; + ext4_warning(sb, "not enough memory for %lu groups", + gdb_num + 1); + goto exit_inode; + } + + /* + * Finally, we have all of the possible failures behind us... + * + * Remove new GDT block from inode double-indirect block and clear out + * the new GDT block for use (which also "frees" the backup GDT blocks + * from the reserved inode). We don't need to change the bitmaps for + * these blocks, because they are marked as in-use from being in the + * reserved inode, and will become GDT blocks (primary and backup). + */ + data[gdb_num % EXT4_ADDR_PER_BLOCK(sb)] = 0; + err = ext4_handle_dirty_metadata(handle, NULL, dind); + if (unlikely(err)) { + ext4_std_error(sb, err); + goto exit_inode; + } + inode->i_blocks -= (gdbackups + 1) * sb->s_blocksize >> 9; + ext4_mark_iloc_dirty(handle, inode, &iloc); + memset(gdb_bh->b_data, 0, sb->s_blocksize); + err = ext4_handle_dirty_metadata(handle, NULL, gdb_bh); + if (unlikely(err)) { + ext4_std_error(sb, err); + goto exit_inode; + } + brelse(dind); + + o_group_desc = EXT4_SB(sb)->s_group_desc; + memcpy(n_group_desc, o_group_desc, + EXT4_SB(sb)->s_gdb_count * sizeof(struct buffer_head *)); + n_group_desc[gdb_num] = gdb_bh; + EXT4_SB(sb)->s_group_desc = n_group_desc; + EXT4_SB(sb)->s_gdb_count++; + ext4_kvfree(o_group_desc); + + le16_add_cpu(&es->s_reserved_gdt_blocks, -1); + err = ext4_handle_dirty_metadata(handle, NULL, EXT4_SB(sb)->s_sbh); + if (err) + ext4_std_error(sb, err); + + return err; + +exit_inode: + ext4_kvfree(n_group_desc); + /* ext4_handle_release_buffer(handle, iloc.bh); */ + brelse(iloc.bh); +exit_dindj: + /* ext4_handle_release_buffer(handle, dind); */ +exit_sbh: + /* ext4_handle_release_buffer(handle, EXT4_SB(sb)->s_sbh); */ +exit_dind: + brelse(dind); +exit_bh: + brelse(gdb_bh); + + ext4_debug("leaving with error %d\n", 
err); + return err; +} + +/* + * Called when we are adding a new group which has a backup copy of each of + * the GDT blocks (i.e. sparse group) and there are reserved GDT blocks. + * We need to add these reserved backup GDT blocks to the resize inode, so + * that they are kept for future resizing and not allocated to files. + * + * Each reserved backup GDT block will go into a different indirect block. + * The indirect blocks are actually the primary reserved GDT blocks, + * so we know in advance what their block numbers are. We only get the + * double-indirect block to verify it is pointing to the primary reserved + * GDT blocks so we don't overwrite a data block by accident. The reserved + * backup GDT blocks are stored in their reserved primary GDT block. + */ +static int reserve_backup_gdb(handle_t *handle, struct inode *inode, + ext4_group_t group) +{ + struct super_block *sb = inode->i_sb; + int reserved_gdb =le16_to_cpu(EXT4_SB(sb)->s_es->s_reserved_gdt_blocks); + struct buffer_head **primary; + struct buffer_head *dind; + struct ext4_iloc iloc; + ext4_fsblk_t blk; + __le32 *data, *end; + int gdbackups = 0; + int res, i; + int err; + + primary = kmalloc(reserved_gdb * sizeof(*primary), GFP_NOFS); + if (!primary) + return -ENOMEM; + + data = EXT4_I(inode)->i_data + EXT4_DIND_BLOCK; + dind = sb_bread(sb, le32_to_cpu(*data)); + if (!dind) { + err = -EIO; + goto exit_free; + } + + blk = EXT4_SB(sb)->s_sbh->b_blocknr + 1 + EXT4_SB(sb)->s_gdb_count; + data = (__le32 *)dind->b_data + (EXT4_SB(sb)->s_gdb_count % + EXT4_ADDR_PER_BLOCK(sb)); + end = (__le32 *)dind->b_data + EXT4_ADDR_PER_BLOCK(sb); + + /* Get each reserved primary GDT block and verify it holds backups */ + for (res = 0; res < reserved_gdb; res++, blk++) { + if (le32_to_cpu(*data) != blk) { + ext4_warning(sb, "reserved block %llu" + " not at offset %ld", + blk, + (long)(data - (__le32 *)dind->b_data)); + err = -EINVAL; + goto exit_bh; + } + primary[res] = sb_bread(sb, blk); + if (!primary[res]) { + 
err = -EIO; + goto exit_bh; + } + gdbackups = verify_reserved_gdb(sb, group, primary[res]); + if (gdbackups < 0) { + brelse(primary[res]); + err = gdbackups; + goto exit_bh; + } + if (++data >= end) + data = (__le32 *)dind->b_data; + } + + for (i = 0; i < reserved_gdb; i++) { + if ((err = ext4_journal_get_write_access(handle, primary[i]))) { + /* + int j; + for (j = 0; j < i; j++) + ext4_handle_release_buffer(handle, primary[j]); + */ + goto exit_bh; + } + } + + if ((err = ext4_reserve_inode_write(handle, inode, &iloc))) + goto exit_bh; + + /* + * Finally we can add each of the reserved backup GDT blocks from + * the new group to its reserved primary GDT block. + */ + blk = group * EXT4_BLOCKS_PER_GROUP(sb); + for (i = 0; i < reserved_gdb; i++) { + int err2; + data = (__le32 *)primary[i]->b_data; + /* printk("reserving backup %lu[%u] = %lu\n", + primary[i]->b_blocknr, gdbackups, + blk + primary[i]->b_blocknr); */ + data[gdbackups] = cpu_to_le32(blk + primary[i]->b_blocknr); + err2 = ext4_handle_dirty_metadata(handle, NULL, primary[i]); + if (!err) + err = err2; + } + inode->i_blocks += reserved_gdb * sb->s_blocksize >> 9; + ext4_mark_iloc_dirty(handle, inode, &iloc); + +exit_bh: + while (--res >= 0) + brelse(primary[res]); + brelse(dind); + +exit_free: + kfree(primary); + + return err; +} + +/* + * Update the backup copies of the ext4 metadata. These don't need to be part + * of the main resize transaction, because e2fsck will re-write them if there + * is a problem (basically only OOM will cause a problem). However, we + * _should_ update the backups if possible, in case the primary gets trashed + * for some reason and we need to run e2fsck from a backup superblock. The + * important part is that the new block and inode counts are in the backup + * superblocks, and the location of the new group metadata in the GDT backups. 
+ * + * We do not need take the s_resize_lock for this, because these + * blocks are not otherwise touched by the filesystem code when it is + * mounted. We don't need to worry about last changing from + * sbi->s_groups_count, because the worst that can happen is that we + * do not copy the full number of backups at this time. The resize + * which changed s_groups_count will backup again. + */ +static void update_backups(struct super_block *sb, + int blk_off, char *data, int size) +{ + struct ext4_sb_info *sbi = EXT4_SB(sb); + const ext4_group_t last = sbi->s_groups_count; + const int bpg = EXT4_BLOCKS_PER_GROUP(sb); + unsigned three = 1; + unsigned five = 5; + unsigned seven = 7; + ext4_group_t group; + int rest = sb->s_blocksize - size; + handle_t *handle; + int err = 0, err2; + + handle = ext4_journal_start_sb(sb, EXT4_MAX_TRANS_DATA); + if (IS_ERR(handle)) { + group = 1; + err = PTR_ERR(handle); + goto exit_err; + } + + while ((group = ext4_list_backups(sb, &three, &five, &seven)) < last) { + struct buffer_head *bh; + + /* Out of journal space, and can't get more - abort - so sad */ + if (ext4_handle_valid(handle) && + handle->h_buffer_credits == 0 && + ext4_journal_extend(handle, EXT4_MAX_TRANS_DATA) && + (err = ext4_journal_restart(handle, EXT4_MAX_TRANS_DATA))) + break; + + bh = sb_getblk(sb, group * bpg + blk_off); + if (!bh) { + err = -EIO; + break; + } + ext4_debug("update metadata backup %#04lx\n", + (unsigned long)bh->b_blocknr); + if ((err = ext4_journal_get_write_access(handle, bh))) + break; + lock_buffer(bh); + memcpy(bh->b_data, data, size); + if (rest) + memset(bh->b_data + size, 0, rest); + set_buffer_uptodate(bh); + unlock_buffer(bh); + err = ext4_handle_dirty_metadata(handle, NULL, bh); + if (unlikely(err)) + ext4_std_error(sb, err); + brelse(bh); + } + if ((err2 = ext4_journal_stop(handle)) && !err) + err = err2; + + /* + * Ugh! Need to have e2fsck write the backup copies. 
It is too + * late to revert the resize, we shouldn't fail just because of + * the backup copies (they are only needed in case of corruption). + * + * However, if we got here we have a journal problem too, so we + * can't really start a transaction to mark the superblock. + * Chicken out and just set the flag on the hope it will be written + * to disk, and if not - we will simply wait until next fsck. + */ +exit_err: + if (err) { + ext4_warning(sb, "can't update backup for group %u (err %d), " + "forcing fsck on next reboot", group, err); + sbi->s_mount_state &= ~EXT4_VALID_FS; + sbi->s_es->s_state &= cpu_to_le16(~EXT4_VALID_FS); + mark_buffer_dirty(sbi->s_sbh); + } +} + +/* + * ext4_add_new_descs() adds @count group descriptor of groups + * starting at @group + * + * @handle: journal handle + * @sb: super block + * @group: the group no. of the first group desc to be added + * @resize_inode: the resize inode + * @count: number of group descriptors to be added + */ +static int ext4_add_new_descs(handle_t *handle, struct super_block *sb, + ext4_group_t group, struct inode *resize_inode, + ext4_group_t count) +{ + struct ext4_sb_info *sbi = EXT4_SB(sb); + struct ext4_super_block *es = sbi->s_es; + struct buffer_head *gdb_bh; + int i, gdb_off, gdb_num, err = 0; + + for (i = 0; i < count; i++, group++) { + int reserved_gdb = ext4_bg_has_super(sb, group) ? + le16_to_cpu(es->s_reserved_gdt_blocks) : 0; + + gdb_off = group % EXT4_DESC_PER_BLOCK(sb); + gdb_num = group / EXT4_DESC_PER_BLOCK(sb); + + /* + * We will only either add reserved group blocks to a backup group + * or remove reserved blocks for the first group in a new group block. + * Doing both would be mean more complex code, and sane people don't + * use non-sparse filesystems anymore. This is already checked above. 
+ */ + if (gdb_off) { + gdb_bh = sbi->s_group_desc[gdb_num]; + err = ext4_journal_get_write_access(handle, gdb_bh); + + if (!err && reserved_gdb && ext4_bg_num_gdb(sb, group)) + err = reserve_backup_gdb(handle, resize_inode, group); + } else + err = add_new_gdb(handle, resize_inode, group); + if (err) + break; + } + return err; +} + +/* + * ext4_setup_new_descs() will set up the group descriptor descriptors of a flex bg + */ +static int ext4_setup_new_descs(handle_t *handle, struct super_block *sb, + struct ext4_new_flex_group_data *flex_gd) +{ + struct ext4_new_group_data *group_data = flex_gd->groups; + struct ext4_group_desc *gdp; + struct ext4_sb_info *sbi = EXT4_SB(sb); + struct buffer_head *gdb_bh; + ext4_group_t group; + __u16 *bg_flags = flex_gd->bg_flags; + int i, gdb_off, gdb_num, err = 0; + + + for (i = 0; i < flex_gd->count; i++, group_data++, bg_flags++) { + group = group_data->group; + + gdb_off = group % EXT4_DESC_PER_BLOCK(sb); + gdb_num = group / EXT4_DESC_PER_BLOCK(sb); + + /* + * get_write_access() has been called on gdb_bh by ext4_add_new_desc(). 
+ */ + gdb_bh = sbi->s_group_desc[gdb_num]; + /* Update group descriptor block for new group */ + gdp = (struct ext4_group_desc *)((char *)gdb_bh->b_data + + gdb_off * EXT4_DESC_SIZE(sb)); + + memset(gdp, 0, EXT4_DESC_SIZE(sb)); + ext4_block_bitmap_set(sb, gdp, group_data->block_bitmap); + ext4_inode_bitmap_set(sb, gdp, group_data->inode_bitmap); + ext4_inode_table_set(sb, gdp, group_data->inode_table); + ext4_free_group_clusters_set(sb, gdp, + EXT4_B2C(sbi, group_data->free_blocks_count)); + ext4_free_inodes_set(sb, gdp, EXT4_INODES_PER_GROUP(sb)); + gdp->bg_flags = cpu_to_le16(*bg_flags); + gdp->bg_checksum = ext4_group_desc_csum(sbi, group, gdp); + + err = ext4_handle_dirty_metadata(handle, NULL, gdb_bh); + if (unlikely(err)) { + ext4_std_error(sb, err); + break; + } + + /* + * We can allocate memory for mb_alloc based on the new group + * descriptor + */ + err = ext4_mb_add_groupinfo(sb, group, gdp); + if (err) + break; + } + return err; +} + +/* + * ext4_update_super() updates the super block so that the newly added + * groups can be seen by the filesystem. + * + * @sb: super block + * @flex_gd: new added groups + */ +static void ext4_update_super(struct super_block *sb, + struct ext4_new_flex_group_data *flex_gd) +{ + ext4_fsblk_t blocks_count = 0; + ext4_fsblk_t free_blocks = 0; + ext4_fsblk_t reserved_blocks = 0; + struct ext4_new_group_data *group_data = flex_gd->groups; + struct ext4_sb_info *sbi = EXT4_SB(sb); + struct ext4_super_block *es = sbi->s_es; + int i; + + BUG_ON(flex_gd->count == 0 || group_data == NULL); + /* + * Make the new blocks and inodes valid next. We do this before + * increasing the group count so that once the group is enabled, + * all of its blocks and inodes are already valid. + * + * We always allocate group-by-group, then block-by-block or + * inode-by-inode within a group, so enabling these + * blocks/inodes before the group is live won't actually let us + * allocate the new space yet. 
+ */ + for (i = 0; i < flex_gd->count; i++) { + blocks_count += group_data[i].blocks_count; + free_blocks += group_data[i].free_blocks_count; + } + + reserved_blocks = ext4_r_blocks_count(es) * 100; + do_div(reserved_blocks, ext4_blocks_count(es)); + reserved_blocks *= blocks_count; + do_div(reserved_blocks, 100); + + ext4_blocks_count_set(es, ext4_blocks_count(es) + blocks_count); + ext4_free_blocks_count_set(es, ext4_free_blocks_count(es) + free_blocks); + le32_add_cpu(&es->s_inodes_count, EXT4_INODES_PER_GROUP(sb) * + flex_gd->count); + le32_add_cpu(&es->s_free_inodes_count, EXT4_INODES_PER_GROUP(sb) * + flex_gd->count); + + /* + * We need to protect s_groups_count against other CPUs seeing + * inconsistent state in the superblock. + * + * The precise rules we use are: + * + * * Writers must perform a smp_wmb() after updating all + * dependent data and before modifying the groups count + * + * * Readers must perform an smp_rmb() after reading the groups + * count and before reading any dependent data. + * + * NB. These rules can be relaxed when checking the group count + * while freeing data, as we can only allocate from a block + * group after serialising against the group count, and we can + * only then free after serialising in turn against that + * allocation. + */ + smp_wmb(); + + /* Update the global fs size fields */ + sbi->s_groups_count += flex_gd->count; + + /* Update the reserved block counts only once the new group is + * active. 
*/ + ext4_r_blocks_count_set(es, ext4_r_blocks_count(es) + + reserved_blocks); + + /* Update the free space counts */ + percpu_counter_add(&sbi->s_freeclusters_counter, + EXT4_B2C(sbi, free_blocks)); + percpu_counter_add(&sbi->s_freeinodes_counter, + EXT4_INODES_PER_GROUP(sb) * flex_gd->count); + + if (EXT4_HAS_INCOMPAT_FEATURE(sb, + EXT4_FEATURE_INCOMPAT_FLEX_BG) && + sbi->s_log_groups_per_flex) { + ext4_group_t flex_group; + flex_group = ext4_flex_group(sbi, group_data[0].group); + atomic_add(EXT4_B2C(sbi, free_blocks), + &sbi->s_flex_groups[flex_group].free_clusters); + atomic_add(EXT4_INODES_PER_GROUP(sb) * flex_gd->count, + &sbi->s_flex_groups[flex_group].free_inodes); + } + + if (test_opt(sb, DEBUG)) + printk(KERN_DEBUG "EXT4-fs: added group %u:" + "%llu blocks(%llu free %llu reserved)\n", flex_gd->count, + blocks_count, free_blocks, reserved_blocks); +} + +/* Add a flex group to an fs. Ensure we handle all possible error conditions + * _before_ we start modifying the filesystem, because we cannot abort the + * transaction and not have it write the data to disk. + */ +static int ext4_flex_group_add(struct super_block *sb, + struct inode *resize_inode, + struct ext4_new_flex_group_data *flex_gd) +{ + struct ext4_sb_info *sbi = EXT4_SB(sb); + struct ext4_super_block *es = sbi->s_es; + ext4_fsblk_t o_blocks_count; + ext4_grpblk_t last; + ext4_group_t group; + handle_t *handle; + unsigned reserved_gdb; + int err = 0, err2 = 0, credit; + + BUG_ON(!flex_gd->count || !flex_gd->groups || !flex_gd->bg_flags); + + reserved_gdb = le16_to_cpu(es->s_reserved_gdt_blocks); + o_blocks_count = ext4_blocks_count(es); + ext4_get_group_no_and_offset(sb, o_blocks_count, &group, &last); + BUG_ON(last); + + err = setup_new_flex_group_blocks(sb, flex_gd); + if (err) + goto exit; + /* + * We will always be modifying at least the superblock and GDT + * block. If we are adding a group past the last current GDT block, + * we will also modify the inode and the dindirect block. 
If we + * are adding a group with superblock/GDT backups we will also + * modify each of the reserved GDT dindirect blocks. + */ + credit = flex_gd->count * 4 + reserved_gdb; + handle = ext4_journal_start_sb(sb, credit); + if (IS_ERR(handle)) { + err = PTR_ERR(handle); + goto exit; + } + + err = ext4_journal_get_write_access(handle, sbi->s_sbh); + if (err) + goto exit_journal; + + group = flex_gd->groups[0].group; + BUG_ON(group != EXT4_SB(sb)->s_groups_count); + err = ext4_add_new_descs(handle, sb, group, + resize_inode, flex_gd->count); + if (err) + goto exit_journal; + + err = ext4_setup_new_descs(handle, sb, flex_gd); + if (err) + goto exit_journal; + + ext4_update_super(sb, flex_gd); + + err = ext4_handle_dirty_super(handle, sb); + +exit_journal: + err2 = ext4_journal_stop(handle); + if (!err) + err = err2; + + if (!err) { + int i; + update_backups(sb, sbi->s_sbh->b_blocknr, (char *)es, + sizeof(struct ext4_super_block)); + for (i = 0; i < flex_gd->count; i++, group++) { + struct buffer_head *gdb_bh; + int gdb_num; + gdb_num = group / EXT4_BLOCKS_PER_GROUP(sb); + gdb_bh = sbi->s_group_desc[gdb_num]; + update_backups(sb, gdb_bh->b_blocknr, gdb_bh->b_data, + gdb_bh->b_size); + } + } +exit: + return err; +} + +static int ext4_setup_next_flex_gd(struct super_block *sb, + struct ext4_new_flex_group_data *flex_gd, + ext4_fsblk_t n_blocks_count, + unsigned long flexbg_size) +{ + struct ext4_super_block *es = EXT4_SB(sb)->s_es; + struct ext4_new_group_data *group_data = flex_gd->groups; + ext4_fsblk_t o_blocks_count; + ext4_group_t n_group; + ext4_group_t group; + ext4_group_t last_group; + ext4_grpblk_t last; + ext4_grpblk_t blocks_per_group; + unsigned long i; + + blocks_per_group = EXT4_BLOCKS_PER_GROUP(sb); + + o_blocks_count = ext4_blocks_count(es); + + if (o_blocks_count == n_blocks_count) + return 0; + + ext4_get_group_no_and_offset(sb, o_blocks_count, &group, &last); + BUG_ON(last); + ext4_get_group_no_and_offset(sb, n_blocks_count - 1, &n_group, &last); + + 
last_group = group | (flexbg_size - 1); + if (last_group > n_group) + last_group = n_group; + + flex_gd->count = last_group - group + 1; + + for (i = 0; i < flex_gd->count; i++) { + int overhead; + + group_data[i].group = group + i; + group_data[i].blocks_count = blocks_per_group; + overhead = ext4_bg_has_super(sb, group + i) ? + (1 + ext4_bg_num_gdb(sb, group + i) + + le16_to_cpu(es->s_reserved_gdt_blocks)) : 0; + group_data[i].free_blocks_count = blocks_per_group - overhead; + if (EXT4_HAS_RO_COMPAT_FEATURE(sb, + EXT4_FEATURE_RO_COMPAT_GDT_CSUM)) + flex_gd->bg_flags[i] = EXT4_BG_BLOCK_UNINIT | + EXT4_BG_INODE_UNINIT; + else + flex_gd->bg_flags[i] = EXT4_BG_INODE_ZEROED; + } + + if (last_group == n_group && + EXT4_HAS_RO_COMPAT_FEATURE(sb, + EXT4_FEATURE_RO_COMPAT_GDT_CSUM)) + /* We need to initialize block bitmap of last group. */ + flex_gd->bg_flags[i - 1] &= ~EXT4_BG_BLOCK_UNINIT; + + if ((last_group == n_group) && (last != blocks_per_group - 1)) { + group_data[i - 1].blocks_count = last + 1; + group_data[i - 1].free_blocks_count -= blocks_per_group- + last - 1; + } + + return 1; +} + +/* Add group descriptor data to an existing or new group descriptor block. + * Ensure we handle all possible error conditions _before_ we start modifying + * the filesystem, because we cannot abort the transaction and not have it + * write the data to disk. + * + * If we are on a GDT block boundary, we need to get the reserved GDT block. + * Otherwise, we may need to add backup GDT blocks for a sparse group. + * + * We only need to hold the superblock lock while we are actually adding + * in the new group's counts to the superblock. Prior to that we have + * not really "added" the group at all. We re-check that we are still + * adding in the last group in case things have changed since verifying. 
+ */ +int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input) +{ + struct ext4_new_flex_group_data flex_gd; + struct ext4_sb_info *sbi = EXT4_SB(sb); + struct ext4_super_block *es = sbi->s_es; + int reserved_gdb = ext4_bg_has_super(sb, input->group) ? + le16_to_cpu(es->s_reserved_gdt_blocks) : 0; + struct inode *inode = NULL; + int gdb_off, gdb_num; + int err; + __u16 bg_flags = 0; + + gdb_num = input->group / EXT4_DESC_PER_BLOCK(sb); + gdb_off = input->group % EXT4_DESC_PER_BLOCK(sb); + + if (gdb_off == 0 && !EXT4_HAS_RO_COMPAT_FEATURE(sb, + EXT4_FEATURE_RO_COMPAT_SPARSE_SUPER)) { + ext4_warning(sb, "Can't resize non-sparse filesystem further"); + return -EPERM; + } + + if (ext4_blocks_count(es) + input->blocks_count < + ext4_blocks_count(es)) { + ext4_warning(sb, "blocks_count overflow"); + return -EINVAL; + } + + if (le32_to_cpu(es->s_inodes_count) + EXT4_INODES_PER_GROUP(sb) < + le32_to_cpu(es->s_inodes_count)) { + ext4_warning(sb, "inodes_count overflow"); + return -EINVAL; + } + + if (reserved_gdb || gdb_off == 0) { + if (!EXT4_HAS_COMPAT_FEATURE(sb, + EXT4_FEATURE_COMPAT_RESIZE_INODE) + || !le16_to_cpu(es->s_reserved_gdt_blocks)) { + ext4_warning(sb, + "No reserved GDT blocks, can't resize"); + return -EPERM; + } + inode = ext4_iget(sb, EXT4_RESIZE_INO); + if (IS_ERR(inode)) { + ext4_warning(sb, "Error opening resize inode"); + return PTR_ERR(inode); + } + } + + + err = verify_group_input(sb, input); + if (err) + goto out; + + flex_gd.count = 1; + flex_gd.groups = input; + flex_gd.bg_flags = &bg_flags; + err = ext4_flex_group_add(sb, inode, &flex_gd); +out: + iput(inode); + return err; +} /* ext4_group_add */ + +/* + * extend a group without checking assuming that checking has been done. 
+ */ +static int ext4_group_extend_no_check(struct super_block *sb, + ext4_fsblk_t o_blocks_count, ext4_grpblk_t add) +{ + struct ext4_super_block *es = EXT4_SB(sb)->s_es; + handle_t *handle; + int err = 0, err2; + + /* We will update the superblock, one block bitmap, and + * one group descriptor via ext4_group_add_blocks(). + */ + handle = ext4_journal_start_sb(sb, 3); + if (IS_ERR(handle)) { + err = PTR_ERR(handle); + ext4_warning(sb, "error %d on journal start", err); + return err; + } + + err = ext4_journal_get_write_access(handle, EXT4_SB(sb)->s_sbh); + if (err) { + ext4_warning(sb, "error %d on journal write access", err); + goto errout; + } + + ext4_blocks_count_set(es, o_blocks_count + add); + ext4_free_blocks_count_set(es, ext4_free_blocks_count(es) + add); + ext4_debug("freeing blocks %llu through %llu\n", o_blocks_count, + o_blocks_count + add); + /* We add the blocks to the bitmap and set the group need init bit */ + err = ext4_group_add_blocks(handle, sb, o_blocks_count, add); + if (err) + goto errout; + ext4_handle_dirty_super(handle, sb); + ext4_debug("freed blocks %llu through %llu\n", o_blocks_count, + o_blocks_count + add); +errout: + err2 = ext4_journal_stop(handle); + if (err2 && !err) + err = err2; + + if (!err) { + if (test_opt(sb, DEBUG)) + printk(KERN_DEBUG "EXT4-fs: extended group to %llu " + "blocks\n", ext4_blocks_count(es)); + update_backups(sb, EXT4_SB(sb)->s_sbh->b_blocknr, (char *)es, + sizeof(struct ext4_super_block)); + } + return err; +} + +/* + * Extend the filesystem to the new number of blocks specified. This entry + * point is only used to extend the current filesystem to the end of the last + * existing group. It can be accessed via ioctl, or by "remount,resize=" + * for emergencies (because it has no dependencies on reserved blocks). 
+ * + * If we _really_ wanted, we could use default values to call ext4_group_add() + * allow the "remount" trick to work for arbitrary resizing, assuming enough + * GDT blocks are reserved to grow to the desired size. + */ +int ext4_group_extend(struct super_block *sb, struct ext4_super_block *es, + ext4_fsblk_t n_blocks_count) +{ + ext4_fsblk_t o_blocks_count; + ext4_grpblk_t last; + ext4_grpblk_t add; + struct buffer_head *bh; + int err; + ext4_group_t group; + + o_blocks_count = ext4_blocks_count(es); + + if (test_opt(sb, DEBUG)) + ext4_msg(sb, KERN_DEBUG, + "extending last group from %llu to %llu blocks", + o_blocks_count, n_blocks_count); + + if (n_blocks_count == 0 || n_blocks_count == o_blocks_count) + return 0; + + if (n_blocks_count > (sector_t)(~0ULL) >> (sb->s_blocksize_bits - 9)) { + ext4_msg(sb, KERN_ERR, + "filesystem too large to resize to %llu blocks safely", + n_blocks_count); + if (sizeof(sector_t) < 8) + ext4_warning(sb, "CONFIG_LBDAF not enabled"); + return -EINVAL; + } + + if (n_blocks_count < o_blocks_count) { + ext4_warning(sb, "can't shrink FS - resize aborted"); + return -EINVAL; + } + + /* Handle the remaining blocks in the last group only. 
*/ + ext4_get_group_no_and_offset(sb, o_blocks_count, &group, &last); + + if (last == 0) { + ext4_warning(sb, "need to use ext2online to resize further"); + return -EPERM; + } + + add = EXT4_BLOCKS_PER_GROUP(sb) - last; + + if (o_blocks_count + add < o_blocks_count) { + ext4_warning(sb, "blocks_count overflow"); + return -EINVAL; + } + + if (o_blocks_count + add > n_blocks_count) + add = n_blocks_count - o_blocks_count; + + if (o_blocks_count + add < n_blocks_count) + ext4_warning(sb, "will only finish group (%llu blocks, %u new)", + o_blocks_count + add, add); + + /* See if the device is actually as big as what was requested */ + bh = sb_bread(sb, o_blocks_count + add - 1); + if (!bh) { + ext4_warning(sb, "can't read last block, resize aborted"); + return -ENOSPC; + } + brelse(bh); + + err = ext4_group_extend_no_check(sb, o_blocks_count, add); + return err; +} /* ext4_group_extend */ + +/* + * ext4_resize_fs() resizes a fs to new size specified by @n_blocks_count + * + * @sb: super block of the fs to be resized + * @n_blocks_count: the number of blocks resides in the resized fs + */ +int ext4_resize_fs(struct super_block *sb, ext4_fsblk_t n_blocks_count) +{ + struct ext4_new_flex_group_data *flex_gd = NULL; + struct ext4_sb_info *sbi = EXT4_SB(sb); + struct ext4_super_block *es = sbi->s_es; + struct buffer_head *bh; + struct inode *resize_inode; + ext4_fsblk_t o_blocks_count; + ext4_group_t o_group; + ext4_group_t n_group; + ext4_grpblk_t offset, add; + unsigned long n_desc_blocks; + unsigned long o_desc_blocks; + unsigned long desc_blocks; + int err = 0, flexbg_size = 1; + + o_blocks_count = ext4_blocks_count(es); + + if (test_opt(sb, DEBUG)) + ext4_msg(sb, KERN_DEBUG, "resizing filesystem from %llu " + "to %llu blocks", o_blocks_count, n_blocks_count); + + if (n_blocks_count < o_blocks_count) { + /* On-line shrinking not supported */ + ext4_warning(sb, "can't shrink FS - resize aborted"); + return -EINVAL; + } + + if (n_blocks_count == o_blocks_count) + /* 
Nothing need to do */ + return 0; + + ext4_get_group_no_and_offset(sb, n_blocks_count - 1, &n_group, &offset); + ext4_get_group_no_and_offset(sb, o_blocks_count - 1, &o_group, &offset); + + n_desc_blocks = (n_group + EXT4_DESC_PER_BLOCK(sb)) / + EXT4_DESC_PER_BLOCK(sb); + o_desc_blocks = (sbi->s_groups_count + EXT4_DESC_PER_BLOCK(sb) - 1) / + EXT4_DESC_PER_BLOCK(sb); + desc_blocks = n_desc_blocks - o_desc_blocks; + + if (desc_blocks && + (!EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_RESIZE_INODE) || + le16_to_cpu(es->s_reserved_gdt_blocks) < desc_blocks)) { + ext4_warning(sb, "No reserved GDT blocks, can't resize"); + return -EPERM; + } + + resize_inode = ext4_iget(sb, EXT4_RESIZE_INO); + if (IS_ERR(resize_inode)) { + ext4_warning(sb, "Error opening resize inode"); + return PTR_ERR(resize_inode); + } + + /* See if the device is actually as big as what was requested */ + bh = sb_bread(sb, n_blocks_count - 1); + if (!bh) { + ext4_warning(sb, "can't read last block, resize aborted"); + return -ENOSPC; + } + brelse(bh); + + /* extend the last group */ + if (n_group == o_group) + add = n_blocks_count - o_blocks_count; + else + add = EXT4_BLOCKS_PER_GROUP(sb) - (offset + 1); + if (add > 0) { + err = ext4_group_extend_no_check(sb, o_blocks_count, add); + if (err) + goto out; + } + + if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_FLEX_BG) && + es->s_log_groups_per_flex) + flexbg_size = 1 << es->s_log_groups_per_flex; + + o_blocks_count = ext4_blocks_count(es); + if (o_blocks_count == n_blocks_count) + goto out; + + flex_gd = alloc_flex_gd(flexbg_size); + if (flex_gd == NULL) { + err = -ENOMEM; + goto out; + } + + /* Add flex groups. Note that a regular group is a + * flex group with 1 group. 
+ */ + while (ext4_setup_next_flex_gd(sb, flex_gd, n_blocks_count, + flexbg_size)) { + ext4_alloc_group_tables(sb, flex_gd, flexbg_size); + err = ext4_flex_group_add(sb, resize_inode, flex_gd); + if (unlikely(err)) + break; + } + +out: + if (flex_gd) + free_flex_gd(flex_gd); + + iput(resize_inode); + if (test_opt(sb, DEBUG)) + ext4_msg(sb, KERN_DEBUG, "resized filesystem from %llu " + "upto %llu blocks", o_blocks_count, n_blocks_count); + return err; +} diff --git a/fs/ext4/super.c b/fs/ext4/super.c new file mode 100644 index 00000000..a68703a5 --- /dev/null +++ b/fs/ext4/super.c @@ -0,0 +1,4980 @@ +/* + * linux/fs/ext4/super.c + * + * Copyright (C) 1992, 1993, 1994, 1995 + * Remy Card (card@masi.ibp.fr) + * Laboratoire MASI - Institut Blaise Pascal + * Universite Pierre et Marie Curie (Paris VI) + * + * from + * + * linux/fs/minix/inode.c + * + * Copyright (C) 1991, 1992 Linus Torvalds + * + * Big-endian to little-endian byte-swapping/bitmaps by + * David S. Miller (davem@caip.rutgers.edu), 1995 + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include "ext4.h" +#include "ext4_extents.h" +#include "ext4_jbd2.h" +#include "xattr.h" +#include "acl.h" +#include "mballoc.h" + +#define CREATE_TRACE_POINTS +#include + +static struct proc_dir_entry *ext4_proc_root; +static struct kset *ext4_kset; +static struct ext4_lazy_init *ext4_li_info; +static struct mutex ext4_li_mtx; +static struct ext4_features *ext4_feat; + +static int ext4_load_journal(struct super_block *, struct ext4_super_block *, + unsigned long journal_devnum); +static int ext4_show_options(struct seq_file *seq, struct dentry *root); +static int ext4_commit_super(struct super_block *sb, int sync); +static void ext4_mark_recovery_complete(struct super_block *sb, + struct ext4_super_block 
*es); +static void ext4_clear_journal_err(struct super_block *sb, + struct ext4_super_block *es); +static int ext4_sync_fs(struct super_block *sb, int wait); +static const char *ext4_decode_error(struct super_block *sb, int errno, + char nbuf[16]); +static int ext4_remount(struct super_block *sb, int *flags, char *data); +static int ext4_statfs(struct dentry *dentry, struct kstatfs *buf); +static int ext4_unfreeze(struct super_block *sb); +static void ext4_write_super(struct super_block *sb); +static int ext4_freeze(struct super_block *sb); +static struct dentry *ext4_mount(struct file_system_type *fs_type, int flags, + const char *dev_name, void *data); +static inline int ext2_feature_set_ok(struct super_block *sb); +static inline int ext3_feature_set_ok(struct super_block *sb); +static int ext4_feature_set_ok(struct super_block *sb, int readonly); +static void ext4_destroy_lazyinit_thread(void); +static void ext4_unregister_li_request(struct super_block *sb); +static void ext4_clear_request_list(void); + +#if !defined(CONFIG_EXT2_FS) && !defined(CONFIG_EXT2_FS_MODULE) && defined(CONFIG_EXT4_USE_FOR_EXT23) +static struct file_system_type ext2_fs_type = { + .owner = THIS_MODULE, + .name = "ext2", + .mount = ext4_mount, + .kill_sb = kill_block_super, + .fs_flags = FS_REQUIRES_DEV, +}; +#define IS_EXT2_SB(sb) ((sb)->s_bdev->bd_holder == &ext2_fs_type) +#else +#define IS_EXT2_SB(sb) (0) +#endif + + +#if !defined(CONFIG_EXT3_FS) && !defined(CONFIG_EXT3_FS_MODULE) && defined(CONFIG_EXT4_USE_FOR_EXT23) +static struct file_system_type ext3_fs_type = { + .owner = THIS_MODULE, + .name = "ext3", + .mount = ext4_mount, + .kill_sb = kill_block_super, + .fs_flags = FS_REQUIRES_DEV, +}; +#define IS_EXT3_SB(sb) ((sb)->s_bdev->bd_holder == &ext3_fs_type) +#else +#define IS_EXT3_SB(sb) (0) +#endif + +void *ext4_kvmalloc(size_t size, gfp_t flags) +{ + void *ret; + + ret = kmalloc(size, flags); + if (!ret) + ret = __vmalloc(size, flags, PAGE_KERNEL); + return ret; +} + +void 
*ext4_kvzalloc(size_t size, gfp_t flags) +{ + void *ret; + + ret = kzalloc(size, flags); + if (!ret) + ret = __vmalloc(size, flags | __GFP_ZERO, PAGE_KERNEL); + return ret; +} + +void ext4_kvfree(void *ptr) +{ + if (is_vmalloc_addr(ptr)) + vfree(ptr); + else + kfree(ptr); + +} + +ext4_fsblk_t ext4_block_bitmap(struct super_block *sb, + struct ext4_group_desc *bg) +{ + return le32_to_cpu(bg->bg_block_bitmap_lo) | + (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT ? + (ext4_fsblk_t)le32_to_cpu(bg->bg_block_bitmap_hi) << 32 : 0); +} + +ext4_fsblk_t ext4_inode_bitmap(struct super_block *sb, + struct ext4_group_desc *bg) +{ + return le32_to_cpu(bg->bg_inode_bitmap_lo) | + (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT ? + (ext4_fsblk_t)le32_to_cpu(bg->bg_inode_bitmap_hi) << 32 : 0); +} + +ext4_fsblk_t ext4_inode_table(struct super_block *sb, + struct ext4_group_desc *bg) +{ + return le32_to_cpu(bg->bg_inode_table_lo) | + (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT ? + (ext4_fsblk_t)le32_to_cpu(bg->bg_inode_table_hi) << 32 : 0); +} + +__u32 ext4_free_group_clusters(struct super_block *sb, + struct ext4_group_desc *bg) +{ + return le16_to_cpu(bg->bg_free_blocks_count_lo) | + (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT ? + (__u32)le16_to_cpu(bg->bg_free_blocks_count_hi) << 16 : 0); +} + +__u32 ext4_free_inodes_count(struct super_block *sb, + struct ext4_group_desc *bg) +{ + return le16_to_cpu(bg->bg_free_inodes_count_lo) | + (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT ? + (__u32)le16_to_cpu(bg->bg_free_inodes_count_hi) << 16 : 0); +} + +__u32 ext4_used_dirs_count(struct super_block *sb, + struct ext4_group_desc *bg) +{ + return le16_to_cpu(bg->bg_used_dirs_count_lo) | + (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT ? 
+ (__u32)le16_to_cpu(bg->bg_used_dirs_count_hi) << 16 : 0); +} + +__u32 ext4_itable_unused_count(struct super_block *sb, + struct ext4_group_desc *bg) +{ + return le16_to_cpu(bg->bg_itable_unused_lo) | + (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT ? + (__u32)le16_to_cpu(bg->bg_itable_unused_hi) << 16 : 0); +} + +void ext4_block_bitmap_set(struct super_block *sb, + struct ext4_group_desc *bg, ext4_fsblk_t blk) +{ + bg->bg_block_bitmap_lo = cpu_to_le32((u32)blk); + if (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT) + bg->bg_block_bitmap_hi = cpu_to_le32(blk >> 32); +} + +void ext4_inode_bitmap_set(struct super_block *sb, + struct ext4_group_desc *bg, ext4_fsblk_t blk) +{ + bg->bg_inode_bitmap_lo = cpu_to_le32((u32)blk); + if (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT) + bg->bg_inode_bitmap_hi = cpu_to_le32(blk >> 32); +} + +void ext4_inode_table_set(struct super_block *sb, + struct ext4_group_desc *bg, ext4_fsblk_t blk) +{ + bg->bg_inode_table_lo = cpu_to_le32((u32)blk); + if (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT) + bg->bg_inode_table_hi = cpu_to_le32(blk >> 32); +} + +void ext4_free_group_clusters_set(struct super_block *sb, + struct ext4_group_desc *bg, __u32 count) +{ + bg->bg_free_blocks_count_lo = cpu_to_le16((__u16)count); + if (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT) + bg->bg_free_blocks_count_hi = cpu_to_le16(count >> 16); +} + +void ext4_free_inodes_set(struct super_block *sb, + struct ext4_group_desc *bg, __u32 count) +{ + bg->bg_free_inodes_count_lo = cpu_to_le16((__u16)count); + if (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT) + bg->bg_free_inodes_count_hi = cpu_to_le16(count >> 16); +} + +void ext4_used_dirs_set(struct super_block *sb, + struct ext4_group_desc *bg, __u32 count) +{ + bg->bg_used_dirs_count_lo = cpu_to_le16((__u16)count); + if (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT) + bg->bg_used_dirs_count_hi = cpu_to_le16(count >> 16); +} + +void ext4_itable_unused_set(struct super_block *sb, + struct 
ext4_group_desc *bg, __u32 count) +{ + bg->bg_itable_unused_lo = cpu_to_le16((__u16)count); + if (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT) + bg->bg_itable_unused_hi = cpu_to_le16(count >> 16); +} + + +/* Just increment the non-pointer handle value: current->journal_info is + * reinterpreted as an integer reference count, bumped by one and stored + * back. BUG if the count has already reached EXT4_NOJOURNAL_MAX_REF_COUNT. */ +static handle_t *ext4_get_nojournal(void) +{ + handle_t *handle = current->journal_info; + unsigned long ref_cnt = (unsigned long)handle; + + BUG_ON(ref_cnt >= EXT4_NOJOURNAL_MAX_REF_COUNT); + + ref_cnt++; + handle = (handle_t *)ref_cnt; + + current->journal_info = handle; + return handle; +} + + +/* Decrement the non-pointer handle value and store the result back in + * current->journal_info. BUG if the count is already zero. */ +static void ext4_put_nojournal(handle_t *handle) +{ + unsigned long ref_cnt = (unsigned long)handle; + + BUG_ON(ref_cnt == 0); + + ref_cnt--; + handle = (handle_t *)ref_cnt; + + current->journal_info = handle; +} + +/* + * Wrappers for jbd2_journal_start/end. + * + * The only special thing we need to do here is to make sure that all + * journal_end calls result in the superblock being marked dirty, so + * that sync() will call the filesystem's write_super callback if + * appropriate. + * + * To avoid j_barrier hold in userspace when a user calls freeze(), + * ext4 prevents a new handle from being started by s_frozen, which + * is in an upper layer. + * + * Returns ERR_PTR(-EROFS) on a read-only mount or when the journal has + * aborted, a counted pseudo-handle from ext4_get_nojournal() when the + * filesystem has no journal, and the jbd2_journal_start() result otherwise. + */ +handle_t *ext4_journal_start_sb(struct super_block *sb, int nblocks) +{ + journal_t *journal; + handle_t *handle; + + trace_ext4_journal_start(sb, nblocks, _RET_IP_); + if (sb->s_flags & MS_RDONLY) + return ERR_PTR(-EROFS); + + journal = EXT4_SB(sb)->s_journal; + handle = ext4_journal_current_handle(); + + /* + * If a handle has been started, it should be allowed to + * finish, otherwise deadlock could happen between freeze + * and others (e.g. truncate) due to the restart of the + * journal handle if the filesystem is frozen and active + * handles are not stopped.
+ */ + if (!handle) + vfs_check_frozen(sb, SB_FREEZE_TRANS); + + if (!journal) + return ext4_get_nojournal(); + /* + * Special case here: if the journal has aborted behind our + * backs (eg. EIO in the commit thread), then we still need to + * take the FS itself readonly cleanly. + */ + if (is_journal_aborted(journal)) { + ext4_abort(sb, "Detected aborted journal"); + return ERR_PTR(-EROFS); + } + return jbd2_journal_start(journal, nblocks); +} + +/* + * The only special thing we need to do here is to make sure that all + * jbd2_journal_stop calls result in the superblock being marked dirty, so + * that sync() will call the filesystem's write_super callback if + * appropriate. + * + * For a handle that is not valid (the no-journal case) only the reference + * count is dropped and 0 is returned. Otherwise an error already stored in + * handle->h_err takes precedence over the jbd2_journal_stop() result, and + * any resulting error is reported through __ext4_std_error() using the + * where/line of the caller. + */ +int __ext4_journal_stop(const char *where, unsigned int line, handle_t *handle) +{ + struct super_block *sb; + int err; + int rc; + + if (!ext4_handle_valid(handle)) { + ext4_put_nojournal(handle); + return 0; + } + sb = handle->h_transaction->t_journal->j_private; + err = handle->h_err; + rc = jbd2_journal_stop(handle); + + if (!err) + err = rc; + if (err) + __ext4_std_error(sb, where, line, err); + return err; +} +/* Record err in the handle (an earlier h_err is kept) and, unless the handle is already aborted, log the failure and abort the handle. BUGs on a handle that is not valid. */ +void ext4_journal_abort_handle(const char *caller, unsigned int line, + const char *err_fn, struct buffer_head *bh, + handle_t *handle, int err) +{ + char nbuf[16]; + const char *errstr = ext4_decode_error(NULL, err, nbuf); + + BUG_ON(!ext4_handle_valid(handle)); + + if (bh) + BUFFER_TRACE(bh, "abort"); + + if (!handle->h_err) + handle->h_err = err; + + if (is_handle_aborted(handle)) + return; + + printk(KERN_ERR "EXT4-fs: %s:%d: aborting transaction: %s in %s\n", + caller, line, errstr, err_fn); + + jbd2_journal_abort_handle(handle); +} +/* Flag the filesystem as having errors (in-memory mount state and superblock s_state), record time/function/line of the latest error and, if none was recorded before, of the first error, arm the 24-hour s_err_report timer when the error count was zero, and increment the error count. This helper does not itself call ext4_commit_super(). */ +static void __save_error_info(struct super_block *sb, const char *func, + unsigned int line) +{ + struct ext4_super_block *es = EXT4_SB(sb)->s_es; + + EXT4_SB(sb)->s_mount_state |= EXT4_ERROR_FS; + es->s_state |= cpu_to_le16(EXT4_ERROR_FS); + es->s_last_error_time = cpu_to_le32(get_seconds()); + strncpy(es->s_last_error_func, func,
sizeof(es->s_last_error_func)); + es->s_last_error_line = cpu_to_le32(line); + if (!es->s_first_error_time) { + es->s_first_error_time = es->s_last_error_time; + strncpy(es->s_first_error_func, func, + sizeof(es->s_first_error_func)); + es->s_first_error_line = cpu_to_le32(line); + es->s_first_error_ino = es->s_last_error_ino; + es->s_first_error_block = es->s_last_error_block; + } + /* + * Start the daily error reporting function if it hasn't been + * started already + */ + if (!es->s_error_count) + mod_timer(&EXT4_SB(sb)->s_err_report, jiffies + 24*60*60*HZ); + es->s_error_count = cpu_to_le32(le32_to_cpu(es->s_error_count) + 1); +} +/* Record the error details with __save_error_info() and then call ext4_commit_super(sb, 1). */ +static void save_error_info(struct super_block *sb, const char *func, + unsigned int line) +{ + __save_error_info(sb, func, line); + ext4_commit_super(sb, 1); +} + +/* + * The del_gendisk() function uninitializes the disk-specific data + * structures, including the bdi structure, without telling anyone + * else. Once this happens, any attempt to call mark_buffer_dirty() + * (for example, by ext4_commit_super), will cause a kernel OOPS. + * This is a kludge to prevent these oops until we can put in a proper + * hook in del_gendisk() to inform the VFS and file system layers.
+ */ +static int block_device_ejected(struct super_block *sb) +{ + struct inode *bd_inode = sb->s_bdev->bd_inode; + struct backing_dev_info *bdi = bd_inode->i_mapping->backing_dev_info; + + return bdi->dev == NULL; +} +/* Run and unlink every callback entry queued on txn->t_private_list. s_md_lock guards the list and is dropped around each jce_func() call; each callback receives the aborted status of the journal as its error argument. */ +static void ext4_journal_commit_callback(journal_t *journal, transaction_t *txn) +{ + struct super_block *sb = journal->j_private; + struct ext4_sb_info *sbi = EXT4_SB(sb); + int error = is_journal_aborted(journal); + struct ext4_journal_cb_entry *jce, *tmp; + + spin_lock(&sbi->s_md_lock); + list_for_each_entry_safe(jce, tmp, &txn->t_private_list, jce_list) { + list_del_init(&jce->jce_list); + spin_unlock(&sbi->s_md_lock); + jce->jce_func(sb, jce, error); + spin_lock(&sbi->s_md_lock); + } + spin_unlock(&sbi->s_md_lock); +} + +/* Deal with the reporting of failure conditions on a filesystem such as + * inconsistencies detected or read IO failures. + * + * On ext2, we can store the error state of the filesystem in the + * superblock. That is not possible on ext4, because we may have other + * write ordering constraints on the superblock which prevent us from + * writing it out straight away; and given that the journal is about to + * be aborted, we can't rely on the current, or future, transactions to + * write out the superblock safely. + * + * We'll just use the jbd2_journal_abort() error code to record an error in + * the journal instead. On recovery, the journal will complain about + * that error until we've noted it down and cleared it.
+ */ +/* Policy applied by the code below: nothing is done on a read-only mount; unless ERRORS_CONT is set the filesystem is flagged EXT4_MF_FS_ABORTED and the journal (if any) is aborted with -EIO; ERRORS_RO additionally sets MS_RDONLY; ERRORS_PANIC panics. */ +static void ext4_handle_error(struct super_block *sb) +{ + if (sb->s_flags & MS_RDONLY) + return; + + if (!test_opt(sb, ERRORS_CONT)) { + journal_t *journal = EXT4_SB(sb)->s_journal; + + EXT4_SB(sb)->s_mount_flags |= EXT4_MF_FS_ABORTED; + if (journal) + jbd2_journal_abort(journal, -EIO); + } + if (test_opt(sb, ERRORS_RO)) { + ext4_msg(sb, KERN_CRIT, "Remounting filesystem read-only"); + sb->s_flags |= MS_RDONLY; + } + if (test_opt(sb, ERRORS_PANIC)) + panic("EXT4-fs (device %s): panic forced after error\n", + sb->s_id); +} +/* Print a formatted KERN_CRIT error tagged with device, function, line and current->comm, save the error info to the superblock, then apply the error policy via ext4_handle_error(). */ +void __ext4_error(struct super_block *sb, const char *function, + unsigned int line, const char *fmt, ...) +{ + struct va_format vaf; + va_list args; + + va_start(args, fmt); + vaf.fmt = fmt; + vaf.va = &args; + printk(KERN_CRIT "EXT4-fs error (device %s): %s:%d: comm %s: %pV\n", + sb->s_id, function, line, current->comm, &vaf); + va_end(args); + save_error_info(sb, function, line); + + ext4_handle_error(sb); +} +/* Inode-specific error report: stores the inode number and block in the last-error fields of the superblock before saving the error info, and includes the inode (plus the block when it is non-zero) in the message. */ +void ext4_error_inode(struct inode *inode, const char *function, + unsigned int line, ext4_fsblk_t block, + const char *fmt, ...) +{ + va_list args; + struct va_format vaf; + struct ext4_super_block *es = EXT4_SB(inode->i_sb)->s_es; + + es->s_last_error_ino = cpu_to_le32(inode->i_ino); + es->s_last_error_block = cpu_to_le64(block); + save_error_info(inode->i_sb, function, line); + va_start(args, fmt); + vaf.fmt = fmt; + vaf.va = &args; + if (block) + printk(KERN_CRIT "EXT4-fs error (device %s): %s:%d: " + "inode #%lu: block %llu: comm %s: %pV\n", + inode->i_sb->s_id, function, line, inode->i_ino, + block, current->comm, &vaf); + else + printk(KERN_CRIT "EXT4-fs error (device %s): %s:%d: " + "inode #%lu: comm %s: %pV\n", + inode->i_sb->s_id, function, line, inode->i_ino, + current->comm, &vaf); + va_end(args); + + ext4_handle_error(inode->i_sb); +} +/* File-specific error report: like the inode variant it logs inode and optional block, and it also prints the path obtained from d_path(); only the inode number is stored in the superblock here. */ +void ext4_error_file(struct file *file, const char *function, + unsigned int line, ext4_fsblk_t block, + const char *fmt, ...)
+{ + va_list args; + struct va_format vaf; + struct ext4_super_block *es; + struct inode *inode = file->f_dentry->d_inode; + char pathname[80], *path; + + es = EXT4_SB(inode->i_sb)->s_es; + es->s_last_error_ino = cpu_to_le32(inode->i_ino); + save_error_info(inode->i_sb, function, line); + path = d_path(&(file->f_path), pathname, sizeof(pathname)); + if (IS_ERR(path)) + path = "(unknown)"; + va_start(args, fmt); + vaf.fmt = fmt; + vaf.va = &args; + if (block) + printk(KERN_CRIT + "EXT4-fs error (device %s): %s:%d: inode #%lu: " + "block %llu: comm %s: path %s: %pV\n", + inode->i_sb->s_id, function, line, inode->i_ino, + block, current->comm, path, &vaf); + else + printk(KERN_CRIT + "EXT4-fs error (device %s): %s:%d: inode #%lu: " + "comm %s: path %s: %pV\n", + inode->i_sb->s_id, function, line, inode->i_ino, + current->comm, path, &vaf); + va_end(args); + + ext4_handle_error(inode->i_sb); +} +/* Map an errno value to a message string. -EIO and -ENOMEM have fixed strings; -EROFS yields the journal-aborted string when sb is NULL or the journal carries JBD2_ABORT, else the read-only string. Any other value is formatted into nbuf when nbuf is non-NULL; otherwise NULL is returned. */ +static const char *ext4_decode_error(struct super_block *sb, int errno, + char nbuf[16]) +{ + char *errstr = NULL; + + switch (errno) { + case -EIO: + errstr = "IO failure"; + break; + case -ENOMEM: + errstr = "Out of memory"; + break; + case -EROFS: + if (!sb || (EXT4_SB(sb)->s_journal && + EXT4_SB(sb)->s_journal->j_flags & JBD2_ABORT)) + errstr = "Journal has aborted"; + else + errstr = "Readonly filesystem"; + break; + default: + /* If the caller passed in an extra buffer for unknown + * errors, textualise them now. Else we just return + * NULL. */ + if (nbuf) { + /* Check for truncated error codes... */ + if (snprintf(nbuf, 16, "error %d", -errno) >= 0) + errstr = nbuf; + } + break; + } + + return errstr; +} + +/* __ext4_std_error decodes expected errors from journaling functions + * automatically and invokes the appropriate error response.
*/ + +void __ext4_std_error(struct super_block *sb, const char *function, + unsigned int line, int errno) +{ + char nbuf[16]; + const char *errstr; + + /* Special case: if the error is EROFS, and we're not already + * inside a transaction, then there's really no point in logging + * an error. */ + if (errno == -EROFS && journal_current_handle() == NULL && + (sb->s_flags & MS_RDONLY)) + return; + + errstr = ext4_decode_error(sb, errno, nbuf); + printk(KERN_CRIT "EXT4-fs error (device %s) in %s:%d: %s\n", + sb->s_id, function, line, errstr); + save_error_info(sb, function, line); + + ext4_handle_error(sb); +} + +/* + * ext4_abort is a much stronger failure handler than ext4_error. The + * abort function may be used to deal with unrecoverable failures such + * as journal IO errors or ENOMEM at a critical moment in log management. + * + * We unconditionally force the filesystem into an ABORT|READONLY state, + * unless the error response on the fs has been set to panic in which + * case we take the easy way out and panic immediately. + * + * Note that save_error_info() runs once on entry and, when the filesystem + * was not yet read-only, a second time after MS_RDONLY has been set and + * the journal aborted. + */ + +void __ext4_abort(struct super_block *sb, const char *function, + unsigned int line, const char *fmt, ...) +{ + va_list args; + + save_error_info(sb, function, line); + va_start(args, fmt); + printk(KERN_CRIT "EXT4-fs error (device %s): %s:%d: ", sb->s_id, + function, line); + vprintk(fmt, args); + printk("\n"); + va_end(args); + + if ((sb->s_flags & MS_RDONLY) == 0) { + ext4_msg(sb, KERN_CRIT, "Remounting filesystem read-only"); + sb->s_flags |= MS_RDONLY; + EXT4_SB(sb)->s_mount_flags |= EXT4_MF_FS_ABORTED; + if (EXT4_SB(sb)->s_journal) + jbd2_journal_abort(EXT4_SB(sb)->s_journal, -EIO); + save_error_info(sb, function, line); + } + if (test_opt(sb, ERRORS_PANIC)) + panic("EXT4-fs panic from previous error\n"); +} +/* printk() a formatted message prefixed with the given prefix string, the EXT4-fs tag and sb->s_id; callers in this file pass a KERN_* level as prefix. */ +void ext4_msg(struct super_block *sb, const char *prefix, const char *fmt, ...)
+{ + struct va_format vaf; + va_list args; + + va_start(args, fmt); + vaf.fmt = fmt; + vaf.va = &args; + printk("%sEXT4-fs (%s): %pV\n", prefix, sb->s_id, &vaf); + va_end(args); +} +/* Print a formatted KERN_WARNING message tagged with device, function and line. Nothing is recorded in the superblock and no error policy is applied. */ +void __ext4_warning(struct super_block *sb, const char *function, + unsigned int line, const char *fmt, ...) +{ + struct va_format vaf; + va_list args; + + va_start(args, fmt); + vaf.fmt = fmt; + vaf.va = &args; + printk(KERN_WARNING "EXT4-fs warning (device %s): %s:%d: %pV\n", + sb->s_id, function, line, &vaf); + va_end(args); +} +/* Report an error for group grp; the __releases/__acquires annotations and the unlock/lock pair below indicate the group lock is held on entry and on return. Records ino/block and the error info in the superblock image, prints the message, then either commits the superblock and returns (ERRORS_CONT) or drops the group lock around ext4_handle_error() and retakes it. */ +void __ext4_grp_locked_error(const char *function, unsigned int line, + struct super_block *sb, ext4_group_t grp, + unsigned long ino, ext4_fsblk_t block, + const char *fmt, ...) +__releases(bitlock) +__acquires(bitlock) +{ + struct va_format vaf; + va_list args; + struct ext4_super_block *es = EXT4_SB(sb)->s_es; + + es->s_last_error_ino = cpu_to_le32(ino); + es->s_last_error_block = cpu_to_le64(block); + __save_error_info(sb, function, line); + + va_start(args, fmt); + + vaf.fmt = fmt; + vaf.va = &args; + printk(KERN_CRIT "EXT4-fs error (device %s): %s:%d: group %u, ", + sb->s_id, function, line, grp); + if (ino) + printk(KERN_CONT "inode %lu: ", ino); + if (block) + printk(KERN_CONT "block %llu:", (unsigned long long) block); + printk(KERN_CONT "%pV\n", &vaf); + va_end(args); + + if (test_opt(sb, ERRORS_CONT)) { + ext4_commit_super(sb, 0); + return; + } + + ext4_unlock_group(sb, grp); + ext4_handle_error(sb); + /* + * We only get here in the ERRORS_RO case; relocking the group + * may be dangerous, but nothing bad will happen since the + * filesystem will have already been marked read/only and the + * journal has been aborted. We return 1 as a hint to callers + * who might want to use the return value from + * ext4_grp_locked_error() to distinguish between the + * ERRORS_CONT and ERRORS_RO case, and perhaps return more + * aggressively from the ext4 function in question, with a + * more appropriate error code. + * + * NOTE(review): this function is declared void, so no value is + * actually returned; the remark about returning 1 looks stale.
+ */ + ext4_lock_group(sb, grp); + return; +} +/* If the superblock revision is not newer than EXT4_GOOD_OLD_REV, warn and switch it to EXT4_DYNAMIC_REV, filling in s_first_ino and s_inode_size with the good-old defaults. Only the in-memory superblock image is modified here. */ +void ext4_update_dynamic_rev(struct super_block *sb) +{ + struct ext4_super_block *es = EXT4_SB(sb)->s_es; + + if (le32_to_cpu(es->s_rev_level) > EXT4_GOOD_OLD_REV) + return; + + ext4_warning(sb, + "updating to rev %d because of new feature flag, " + "running e2fsck is recommended", + EXT4_DYNAMIC_REV); + + es->s_first_ino = cpu_to_le32(EXT4_GOOD_OLD_FIRST_INO); + es->s_inode_size = cpu_to_le16(EXT4_GOOD_OLD_INODE_SIZE); + es->s_rev_level = cpu_to_le32(EXT4_DYNAMIC_REV); + /* leave es->s_feature_*compat flags alone */ + /* es->s_uuid will be set by e2fsck if empty */ + + /* + * The rest of the superblock fields should be zero, and if not it + * means they are likely already in use, so leave them alone. We + * can leave it up to e2fsck to clean up any inconsistencies there. + */ +} + +/* + * Open the external journal device + * + * Returns the opened block device, or NULL after logging the failure. + */ +static struct block_device *ext4_blkdev_get(dev_t dev, struct super_block *sb) +{ + struct block_device *bdev; + char b[BDEVNAME_SIZE]; + + bdev = blkdev_get_by_dev(dev, FMODE_READ|FMODE_WRITE|FMODE_EXCL, sb); + if (IS_ERR(bdev)) + goto fail; + return bdev; + +fail: + ext4_msg(sb, KERN_ERR, "failed to open journal device %s: %ld", + __bdevname(dev, b), PTR_ERR(bdev)); + return NULL; +} + +/* + * Release the journal device + */ +static int ext4_blkdev_put(struct block_device *bdev) +{ + return blkdev_put(bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL); +} +/* Release sbi->journal_bdev if one is set and clear the pointer; returns the ext4_blkdev_put() result, or -ENODEV when no journal device was recorded. */ +static int ext4_blkdev_remove(struct ext4_sb_info *sbi) +{ + struct block_device *bdev; + int ret = -ENODEV; + + bdev = sbi->journal_bdev; + if (bdev) { + ret = ext4_blkdev_put(bdev); + sbi->journal_bdev = NULL; + } + return ret; +} +/* Convert an i_orphan list node into the VFS inode embedded in the same ext4_inode_info. */ +static inline struct inode *orphan_list_entry(struct list_head *l) +{ + return &list_entry(l, struct ext4_inode_info, i_orphan)->vfs_inode; +} +/* Debug helper: log the s_last_orphan value from the superblock and one line per inode on the in-memory sbi->s_orphan list. */ +static void dump_orphan_list(struct super_block *sb, struct ext4_sb_info *sbi) +{ + struct list_head *l; + + ext4_msg(sb, KERN_ERR, "sb orphan head is %d",
le32_to_cpu(sbi->s_es->s_last_orphan)); + + printk(KERN_ERR "sb_info orphan list:\n"); + list_for_each(l, &sbi->s_orphan) { + struct inode *inode = orphan_list_entry(l); + printk(KERN_ERR " " + "inode %s:%lu at %p: mode %o, nlink %d, next %d\n", + inode->i_sb->s_id, inode->i_ino, inode, + inode->i_mode, inode->i_nlink, + NEXT_ORPHAN(inode)); + } +} +/* Tear down the ext4 state attached to sb: unregister the li request, disable quotas, flush and destroy the dio_unwritten workqueue, destroy the journal, release mballoc/extent/xattr state, write the superblock back when the fs is writable or dirty, then free group descriptors, counters, quota file names, the journal device and finally the ext4_sb_info itself. */ +static void ext4_put_super(struct super_block *sb) +{ + struct ext4_sb_info *sbi = EXT4_SB(sb); + struct ext4_super_block *es = sbi->s_es; + int i, err; + + ext4_unregister_li_request(sb); + dquot_disable(sb, -1, DQUOT_USAGE_ENABLED | DQUOT_LIMITS_ENABLED); + + flush_workqueue(sbi->dio_unwritten_wq); + destroy_workqueue(sbi->dio_unwritten_wq); + + lock_super(sb); + if (sbi->s_journal) { + err = jbd2_journal_destroy(sbi->s_journal); + sbi->s_journal = NULL; + if (err < 0) + ext4_abort(sb, "Couldn't clean up the journal"); + } + + del_timer(&sbi->s_err_report); + ext4_release_system_zone(sb); + ext4_mb_release(sb); + ext4_ext_release(sb); + ext4_xattr_put_super(sb); + + if (!(sb->s_flags & MS_RDONLY)) { + EXT4_CLEAR_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER); + es->s_state = cpu_to_le16(sbi->s_mount_state); + } + if (sb->s_dirt || !(sb->s_flags & MS_RDONLY)) + ext4_commit_super(sb, 1); + + if (sbi->s_proc) { + remove_proc_entry("options", sbi->s_proc); + remove_proc_entry(sb->s_id, ext4_proc_root); + } + kobject_del(&sbi->s_kobj); + + for (i = 0; i < sbi->s_gdb_count; i++) + brelse(sbi->s_group_desc[i]); + ext4_kvfree(sbi->s_group_desc); + ext4_kvfree(sbi->s_flex_groups); + percpu_counter_destroy(&sbi->s_freeclusters_counter); + percpu_counter_destroy(&sbi->s_freeinodes_counter); + percpu_counter_destroy(&sbi->s_dirs_counter); + percpu_counter_destroy(&sbi->s_dirtyclusters_counter); + brelse(sbi->s_sbh); +#ifdef CONFIG_QUOTA + for (i = 0; i < MAXQUOTAS; i++) + kfree(sbi->s_qf_names[i]); +#endif + + /* Debugging code just in case the in-memory inode orphan list + * isn't empty.
The on-disk one can be non-empty if we've + * detected an error and taken the fs readonly, but the + * in-memory list had better be clean by this point. */ + if (!list_empty(&sbi->s_orphan)) + dump_orphan_list(sb, sbi); + J_ASSERT(list_empty(&sbi->s_orphan)); + + invalidate_bdev(sb->s_bdev); + if (sbi->journal_bdev && sbi->journal_bdev != sb->s_bdev) { + /* + * Invalidate the journal device's buffers. We don't want them + * floating about in memory - the physical journal device may + * be hotswapped, and it breaks the `ro-after' testing code. + */ + sync_blockdev(sbi->journal_bdev); + invalidate_bdev(sbi->journal_bdev); + ext4_blkdev_remove(sbi); + } + if (sbi->s_mmp_tsk) + kthread_stop(sbi->s_mmp_tsk); + sb->s_fs_info = NULL; + /* + * Now that we are completely done shutting down the + * superblock, we need to actually destroy the kobject. + */ + unlock_super(sb); + kobject_put(&sbi->s_kobj); + wait_for_completion(&sbi->s_kobj_unregister); + kfree(sbi->s_blockgroup_lock); + kfree(sbi); +} +/* Slab cache from which struct ext4_inode_info objects are allocated. */ +static struct kmem_cache *ext4_inode_cachep; + +/* + * Called inside transaction, so use GFP_NOFS + * + * Allocates an ext4_inode_info from ext4_inode_cachep, resets its per-use + * fields and returns the embedded VFS inode, or NULL if the allocation + * fails. + */ +static struct inode *ext4_alloc_inode(struct super_block *sb) +{ + struct ext4_inode_info *ei; + + ei = kmem_cache_alloc(ext4_inode_cachep, GFP_NOFS); + if (!ei) + return NULL; + + ei->vfs_inode.i_version = 1; + ei->vfs_inode.i_data.writeback_index = 0; + memset(&ei->i_cached_extent, 0, sizeof(struct ext4_ext_cache)); + INIT_LIST_HEAD(&ei->i_prealloc_list); + spin_lock_init(&ei->i_prealloc_lock); + ei->i_reserved_data_blocks = 0; + ei->i_reserved_meta_blocks = 0; + ei->i_allocated_meta_blocks = 0; + ei->i_da_metadata_calc_len = 0; + spin_lock_init(&(ei->i_block_reservation_lock)); +#ifdef CONFIG_QUOTA + ei->i_reserved_quota = 0; +#endif + ei->jinode = NULL; + INIT_LIST_HEAD(&ei->i_completed_io_list); + spin_lock_init(&ei->i_completed_io_lock); + ei->cur_aio_dio = NULL; + ei->i_sync_tid = 0; + ei->i_datasync_tid = 0; + atomic_set(&ei->i_ioend_count, 0); +
atomic_set(&ei->i_aiodio_unwritten, 0); + + return &ei->vfs_inode; +} +/* Return the generic_drop_inode() verdict unchanged, emitting a tracepoint with it. */ +static int ext4_drop_inode(struct inode *inode) +{ + int drop = generic_drop_inode(inode); + + trace_ext4_drop_inode(inode, drop); + return drop; +} +/* RCU callback: hand the ext4_inode_info that contains this inode back to ext4_inode_cachep. */ +static void ext4_i_callback(struct rcu_head *head) +{ + struct inode *inode = container_of(head, struct inode, i_rcu); + kmem_cache_free(ext4_inode_cachep, EXT4_I(inode)); +} +/* Log a message, hex dump and stack trace if the inode is still linked on an orphan list, then queue the actual free through call_rcu() with ext4_i_callback(). */ +static void ext4_destroy_inode(struct inode *inode) +{ + if (!list_empty(&(EXT4_I(inode)->i_orphan))) { + ext4_msg(inode->i_sb, KERN_ERR, + "Inode %lu (%p): orphan list check failed!", + inode->i_ino, EXT4_I(inode)); + print_hex_dump(KERN_INFO, "", DUMP_PREFIX_ADDRESS, 16, 4, + EXT4_I(inode), sizeof(struct ext4_inode_info), + true); + dump_stack(); + } + call_rcu(&inode->i_rcu, ext4_i_callback); +} +/* Constructor passed to kmem_cache_create(): initialises the orphan list head, the rw-semaphores and the embedded VFS inode. */ +static void init_once(void *foo) +{ + struct ext4_inode_info *ei = (struct ext4_inode_info *) foo; + + INIT_LIST_HEAD(&ei->i_orphan); +#ifdef CONFIG_EXT4_FS_XATTR + init_rwsem(&ei->xattr_sem); +#endif + init_rwsem(&ei->i_data_sem); + inode_init_once(&ei->vfs_inode); +} +/* Create the ext4_inode_cache slab sized for struct ext4_inode_info with init_once as constructor; returns 0, or -ENOMEM if creation fails. */ +static int init_inodecache(void) +{ + ext4_inode_cachep = kmem_cache_create("ext4_inode_cache", + sizeof(struct ext4_inode_info), + 0, (SLAB_RECLAIM_ACCOUNT| + SLAB_MEM_SPREAD), + init_once); + if (ext4_inode_cachep == NULL) + return -ENOMEM; + return 0; +} +/* Destroy the slab cache created by init_inodecache(). */ +static void destroy_inodecache(void) +{ + kmem_cache_destroy(ext4_inode_cachep); +} +/* Release per-inode state: invalidate inode buffers, end writeback, drop quota references, discard preallocations and, if a jbd2 inode is attached, release and free it. */ +void ext4_clear_inode(struct inode *inode) +{ + invalidate_inode_buffers(inode); + end_writeback(inode); + dquot_drop(inode); + ext4_discard_preallocations(inode); + if (EXT4_I(inode)->jinode) { + jbd2_journal_release_jbd_inode(EXT4_JOURNAL(inode), + EXT4_I(inode)->jinode); + jbd2_free_inode(EXT4_I(inode)->jinode); + EXT4_I(inode)->jinode = NULL; + } +} +/* Inode lookup used by the file-handle helpers below. Returns ERR_PTR(-ESTALE) for inode numbers below EXT4_FIRST_INO(sb) other than EXT4_ROOT_INO, for numbers above s_inodes_count, and for a non-zero generation that does not match; errors from ext4_iget() are passed through. */ +static struct inode *ext4_nfs_get_inode(struct super_block *sb, + u64 ino, u32 generation) +{ + struct inode *inode; + + if (ino < EXT4_FIRST_INO(sb) && ino != EXT4_ROOT_INO) + return ERR_PTR(-ESTALE); + if (ino
> le32_to_cpu(EXT4_SB(sb)->s_es->s_inodes_count)) + return ERR_PTR(-ESTALE); + + /* iget isn't really right if the inode is currently unallocated!! + * + * ext4_read_inode will return a bad_inode if the inode had been + * deleted, so we should be safe. + * + * Currently we don't know the generation for parent directory, so + * a generation of 0 means "accept any" + */ + inode = ext4_iget(sb, ino); + if (IS_ERR(inode)) + return ERR_CAST(inode); + if (generation && inode->i_generation != generation) { + iput(inode); + return ERR_PTR(-ESTALE); + } + + return inode; +} +/* Decode a file handle to a dentry through generic_fh_to_dentry(), with ext4_nfs_get_inode() doing the inode lookup. */ +static struct dentry *ext4_fh_to_dentry(struct super_block *sb, struct fid *fid, + int fh_len, int fh_type) +{ + return generic_fh_to_dentry(sb, fid, fh_len, fh_type, + ext4_nfs_get_inode); +} +/* Decode the parent from a file handle through generic_fh_to_parent(), with ext4_nfs_get_inode() doing the inode lookup. */ +static struct dentry *ext4_fh_to_parent(struct super_block *sb, struct fid *fid, + int fh_len, int fh_type) +{ + return generic_fh_to_parent(sb, fid, fh_len, fh_type, + ext4_nfs_get_inode); +} + +/* + * Try to release metadata pages (indirect blocks, directories) which are + * mapped via the block device. Since these pages could have journal heads + * which would prevent try_to_free_buffers() from freeing them, we must use + * jbd2 layer's try_to_free_buffers() function to release them. + */ +static int bdev_try_to_free_page(struct super_block *sb, struct page *page, + gfp_t wait) +{ + journal_t *journal = EXT4_SB(sb)->s_journal; + + WARN_ON(PageChecked(page)); + if (!page_has_buffers(page)) + return 0; + if (journal) + return jbd2_journal_try_to_free_buffers(journal, page, + wait & ~__GFP_WAIT); + return try_to_free_buffers(page); +} + +#ifdef CONFIG_QUOTA +#define QTYPE2NAME(t) ((t) == USRQUOTA ?
"user" : "group") +#define QTYPE2MOPT(on, t) ((t) == USRQUOTA?((on)##USRJQUOTA):((on)##GRPJQUOTA)) +/* Forward declarations of the quota handlers referenced by the operation tables below. */ +static int ext4_write_dquot(struct dquot *dquot); +static int ext4_acquire_dquot(struct dquot *dquot); +static int ext4_release_dquot(struct dquot *dquot); +static int ext4_mark_dquot_dirty(struct dquot *dquot); +static int ext4_write_info(struct super_block *sb, int type); +static int ext4_quota_on(struct super_block *sb, int type, int format_id, + struct path *path); +static int ext4_quota_off(struct super_block *sb, int type); +static int ext4_quota_on_mount(struct super_block *sb, int type); +static ssize_t ext4_quota_read(struct super_block *sb, int type, char *data, + size_t len, loff_t off); +static ssize_t ext4_quota_write(struct super_block *sb, int type, + const char *data, size_t len, loff_t off); +/* dquot callbacks: ext4 handlers for reserved space, dquot write/acquire/release/dirty and info write, with the generic dquot_alloc/dquot_destroy for allocation. */ +static const struct dquot_operations ext4_quota_operations = { + .get_reserved_space = ext4_get_reserved_space, + .write_dquot = ext4_write_dquot, + .acquire_dquot = ext4_acquire_dquot, + .release_dquot = ext4_release_dquot, + .mark_dirty = ext4_mark_dquot_dirty, + .write_info = ext4_write_info, + .alloc_dquot = dquot_alloc, + .destroy_dquot = dquot_destroy, +}; +/* quotactl callbacks: ext4 handlers for quota on/off, generic dquot_* helpers for everything else. */ +static const struct quotactl_ops ext4_qctl_operations = { + .quota_on = ext4_quota_on, + .quota_off = ext4_quota_off, + .quota_sync = dquot_quota_sync, + .get_info = dquot_get_dqinfo, + .set_info = dquot_set_dqinfo, + .get_dqblk = dquot_get_dqblk, + .set_dqblk = dquot_set_dqblk +}; +#endif +/* Super operations table that includes the sync_fs and freeze/unfreeze handlers. */ +static const struct super_operations ext4_sops = { + .alloc_inode = ext4_alloc_inode, + .destroy_inode = ext4_destroy_inode, + .write_inode = ext4_write_inode, + .dirty_inode = ext4_dirty_inode, + .drop_inode = ext4_drop_inode, + .evict_inode = ext4_evict_inode, + .put_super = ext4_put_super, + .sync_fs = ext4_sync_fs, + .freeze_fs = ext4_freeze, + .unfreeze_fs = ext4_unfreeze, + .statfs = ext4_statfs, + .remount_fs = ext4_remount, + .show_options = ext4_show_options, +#ifdef CONFIG_QUOTA + .quota_read =
ext4_quota_read, + .quota_write = ext4_quota_write, +#endif + .bdev_try_to_free_page = bdev_try_to_free_page, +}; +/* Super operations variant named for the no-journal case: it wires up .write_super and has no .sync_fs, .freeze_fs or .unfreeze_fs entries. */ +static const struct super_operations ext4_nojournal_sops = { + .alloc_inode = ext4_alloc_inode, + .destroy_inode = ext4_destroy_inode, + .write_inode = ext4_write_inode, + .dirty_inode = ext4_dirty_inode, + .drop_inode = ext4_drop_inode, + .evict_inode = ext4_evict_inode, + .write_super = ext4_write_super, + .put_super = ext4_put_super, + .statfs = ext4_statfs, + .remount_fs = ext4_remount, + .show_options = ext4_show_options, +#ifdef CONFIG_QUOTA + .quota_read = ext4_quota_read, + .quota_write = ext4_quota_write, +#endif + .bdev_try_to_free_page = bdev_try_to_free_page, +}; +/* File-handle export operations (fh_to_dentry, fh_to_parent, get_parent). */ +static const struct export_operations ext4_export_ops = { + .fh_to_dentry = ext4_fh_to_dentry, + .fh_to_parent = ext4_fh_to_parent, + .get_parent = ext4_get_parent, +}; +/* Token identifiers for mount options; Opt_err is also the sentinel of the pattern table that follows. */ +enum { + Opt_bsd_df, Opt_minix_df, Opt_grpid, Opt_nogrpid, + Opt_resgid, Opt_resuid, Opt_sb, Opt_err_cont, Opt_err_panic, Opt_err_ro, + Opt_nouid32, Opt_debug, Opt_removed, + Opt_user_xattr, Opt_nouser_xattr, Opt_acl, Opt_noacl, + Opt_auto_da_alloc, Opt_noauto_da_alloc, Opt_noload, + Opt_commit, Opt_min_batch_time, Opt_max_batch_time, + Opt_journal_dev, Opt_journal_checksum, Opt_journal_async_commit, + Opt_abort, Opt_data_journal, Opt_data_ordered, Opt_data_writeback, + Opt_data_err_abort, Opt_data_err_ignore, + Opt_usrjquota, Opt_grpjquota, Opt_offusrjquota, Opt_offgrpjquota, + Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0, Opt_jqfmt_vfsv1, Opt_quota, + Opt_noquota, Opt_barrier, Opt_nobarrier, Opt_err, + Opt_usrquota, Opt_grpquota, Opt_i_version, + Opt_stripe, Opt_delalloc, Opt_nodelalloc, Opt_mblk_io_submit, + Opt_nomblk_io_submit, Opt_block_validity, Opt_noblock_validity, + Opt_inode_readahead_blks, Opt_journal_ioprio, + Opt_dioread_nolock, Opt_dioread_lock, + Opt_discard, Opt_nodiscard, Opt_init_itable, Opt_noinit_itable, +}; +/* Mount option patterns mapped to Opt_* tokens. Several spellings may share one token, and options mapped to Opt_removed are recognised only so they can be reported as ignored. */ +static const match_table_t tokens = { + {Opt_bsd_df, "bsddf"}, + {Opt_minix_df,
"minixdf"}, + {Opt_grpid, "grpid"}, + {Opt_grpid, "bsdgroups"}, + {Opt_nogrpid, "nogrpid"}, + {Opt_nogrpid, "sysvgroups"}, + {Opt_resgid, "resgid=%u"}, + {Opt_resuid, "resuid=%u"}, + {Opt_sb, "sb=%u"}, + {Opt_err_cont, "errors=continue"}, + {Opt_err_panic, "errors=panic"}, + {Opt_err_ro, "errors=remount-ro"}, + {Opt_nouid32, "nouid32"}, + {Opt_debug, "debug"}, + {Opt_removed, "oldalloc"}, + {Opt_removed, "orlov"}, + {Opt_user_xattr, "user_xattr"}, + {Opt_nouser_xattr, "nouser_xattr"}, + {Opt_acl, "acl"}, + {Opt_noacl, "noacl"}, + {Opt_noload, "norecovery"}, + {Opt_noload, "noload"}, + {Opt_removed, "nobh"}, + {Opt_removed, "bh"}, + {Opt_commit, "commit=%u"}, + {Opt_min_batch_time, "min_batch_time=%u"}, + {Opt_max_batch_time, "max_batch_time=%u"}, + {Opt_journal_dev, "journal_dev=%u"}, + {Opt_journal_checksum, "journal_checksum"}, + {Opt_journal_async_commit, "journal_async_commit"}, + {Opt_abort, "abort"}, + {Opt_data_journal, "data=journal"}, + {Opt_data_ordered, "data=ordered"}, + {Opt_data_writeback, "data=writeback"}, + {Opt_data_err_abort, "data_err=abort"}, + {Opt_data_err_ignore, "data_err=ignore"}, + {Opt_offusrjquota, "usrjquota="}, + {Opt_usrjquota, "usrjquota=%s"}, + {Opt_offgrpjquota, "grpjquota="}, + {Opt_grpjquota, "grpjquota=%s"}, + {Opt_jqfmt_vfsold, "jqfmt=vfsold"}, + {Opt_jqfmt_vfsv0, "jqfmt=vfsv0"}, + {Opt_jqfmt_vfsv1, "jqfmt=vfsv1"}, + {Opt_grpquota, "grpquota"}, + {Opt_noquota, "noquota"}, + {Opt_quota, "quota"}, + {Opt_usrquota, "usrquota"}, + {Opt_barrier, "barrier=%u"}, + {Opt_barrier, "barrier"}, + {Opt_nobarrier, "nobarrier"}, + {Opt_i_version, "i_version"}, + {Opt_stripe, "stripe=%u"}, + {Opt_delalloc, "delalloc"}, + {Opt_nodelalloc, "nodelalloc"}, + {Opt_mblk_io_submit, "mblk_io_submit"}, + {Opt_nomblk_io_submit, "nomblk_io_submit"}, + {Opt_block_validity, "block_validity"}, + {Opt_noblock_validity, "noblock_validity"}, + {Opt_inode_readahead_blks, "inode_readahead_blks=%u"}, + {Opt_journal_ioprio, "journal_ioprio=%u"}, + 
{Opt_auto_da_alloc, "auto_da_alloc=%u"}, + {Opt_auto_da_alloc, "auto_da_alloc"}, + {Opt_noauto_da_alloc, "noauto_da_alloc"}, + {Opt_dioread_nolock, "dioread_nolock"}, + {Opt_dioread_lock, "dioread_lock"}, + {Opt_discard, "discard"}, + {Opt_nodiscard, "nodiscard"}, + {Opt_init_itable, "init_itable=%u"}, + {Opt_init_itable, "init_itable"}, + {Opt_noinit_itable, "noinit_itable"}, + {Opt_removed, "check=none"}, /* mount option from ext2/3 */ + {Opt_removed, "nocheck"}, /* mount option from ext2/3 */ + {Opt_removed, "reservation"}, /* mount option from ext2/3 */ + {Opt_removed, "noreservation"}, /* mount option from ext2/3 */ + {Opt_removed, "journal=%u"}, /* mount option from ext2/3 */ + {Opt_err, NULL}, +}; + +static ext4_fsblk_t get_sb_block(void **data) +{ + ext4_fsblk_t sb_block; + char *options = (char *) *data; + + if (!options || strncmp(options, "sb=", 3) != 0) + return 1; /* Default location */ + + options += 3; + /* TODO: use simple_strtoll with >32bit ext4 */ + sb_block = simple_strtoul(options, &options, 0); + if (*options && *options != ',') { + printk(KERN_ERR "EXT4-fs: Invalid sb specification: %s\n", + (char *) *data); + return 1; + } + if (*options == ',') + options++; + *data = (void *) options; + + return sb_block; +} + +#define DEFAULT_JOURNAL_IOPRIO (IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, 3)) +static char deprecated_msg[] = "Mount option \"%s\" will be removed by %s\n" + "Contact linux-ext4@vger.kernel.org if you think we should keep it.\n"; + +#ifdef CONFIG_QUOTA +static int set_qf_name(struct super_block *sb, int qtype, substring_t *args) +{ + struct ext4_sb_info *sbi = EXT4_SB(sb); + char *qname; + + if (sb_any_quota_loaded(sb) && + !sbi->s_qf_names[qtype]) { + ext4_msg(sb, KERN_ERR, + "Cannot change journaled " + "quota options when quota turned on"); + return -1; + } + qname = match_strdup(args); + if (!qname) { + ext4_msg(sb, KERN_ERR, + "Not enough memory for storing quotafile name"); + return -1; + } + if (sbi->s_qf_names[qtype] && + 
strcmp(sbi->s_qf_names[qtype], qname)) { + ext4_msg(sb, KERN_ERR, + "%s quota file already specified", QTYPE2NAME(qtype)); + kfree(qname); + return -1; + } + sbi->s_qf_names[qtype] = qname; + if (strchr(sbi->s_qf_names[qtype], '/')) { + ext4_msg(sb, KERN_ERR, + "quotafile must be on filesystem root"); + kfree(sbi->s_qf_names[qtype]); + sbi->s_qf_names[qtype] = NULL; + return -1; + } + set_opt(sb, QUOTA); + return 1; +} + +static int clear_qf_name(struct super_block *sb, int qtype) +{ + + struct ext4_sb_info *sbi = EXT4_SB(sb); + + if (sb_any_quota_loaded(sb) && + sbi->s_qf_names[qtype]) { + ext4_msg(sb, KERN_ERR, "Cannot change journaled quota options" + " when quota turned on"); + return -1; + } + /* + * The space will be released later when all options are confirmed + * to be correct + */ + sbi->s_qf_names[qtype] = NULL; + return 1; +} +#endif + +#define MOPT_SET 0x0001 +#define MOPT_CLEAR 0x0002 +#define MOPT_NOSUPPORT 0x0004 +#define MOPT_EXPLICIT 0x0008 +#define MOPT_CLEAR_ERR 0x0010 +#define MOPT_GTE0 0x0020 +#ifdef CONFIG_QUOTA +#define MOPT_Q 0 +#define MOPT_QFMT 0x0040 +#else +#define MOPT_Q MOPT_NOSUPPORT +#define MOPT_QFMT MOPT_NOSUPPORT +#endif +#define MOPT_DATAJ 0x0080 + +static const struct mount_opts { + int token; + int mount_opt; + int flags; +} ext4_mount_opts[] = { + {Opt_minix_df, EXT4_MOUNT_MINIX_DF, MOPT_SET}, + {Opt_bsd_df, EXT4_MOUNT_MINIX_DF, MOPT_CLEAR}, + {Opt_grpid, EXT4_MOUNT_GRPID, MOPT_SET}, + {Opt_nogrpid, EXT4_MOUNT_GRPID, MOPT_CLEAR}, + {Opt_mblk_io_submit, EXT4_MOUNT_MBLK_IO_SUBMIT, MOPT_SET}, + {Opt_nomblk_io_submit, EXT4_MOUNT_MBLK_IO_SUBMIT, MOPT_CLEAR}, + {Opt_block_validity, EXT4_MOUNT_BLOCK_VALIDITY, MOPT_SET}, + {Opt_noblock_validity, EXT4_MOUNT_BLOCK_VALIDITY, MOPT_CLEAR}, + {Opt_dioread_nolock, EXT4_MOUNT_DIOREAD_NOLOCK, MOPT_SET}, + {Opt_dioread_lock, EXT4_MOUNT_DIOREAD_NOLOCK, MOPT_CLEAR}, + {Opt_discard, EXT4_MOUNT_DISCARD, MOPT_SET}, + {Opt_nodiscard, EXT4_MOUNT_DISCARD, MOPT_CLEAR}, + {Opt_delalloc, 
EXT4_MOUNT_DELALLOC, MOPT_SET | MOPT_EXPLICIT}, + {Opt_nodelalloc, EXT4_MOUNT_DELALLOC, MOPT_CLEAR | MOPT_EXPLICIT}, + {Opt_journal_checksum, EXT4_MOUNT_JOURNAL_CHECKSUM, MOPT_SET}, + {Opt_journal_async_commit, (EXT4_MOUNT_JOURNAL_ASYNC_COMMIT | + EXT4_MOUNT_JOURNAL_CHECKSUM), MOPT_SET}, + {Opt_noload, EXT4_MOUNT_NOLOAD, MOPT_SET}, + {Opt_err_panic, EXT4_MOUNT_ERRORS_PANIC, MOPT_SET | MOPT_CLEAR_ERR}, + {Opt_err_ro, EXT4_MOUNT_ERRORS_RO, MOPT_SET | MOPT_CLEAR_ERR}, + {Opt_err_cont, EXT4_MOUNT_ERRORS_CONT, MOPT_SET | MOPT_CLEAR_ERR}, + {Opt_data_err_abort, EXT4_MOUNT_DATA_ERR_ABORT, MOPT_SET}, + {Opt_data_err_ignore, EXT4_MOUNT_DATA_ERR_ABORT, MOPT_CLEAR}, + {Opt_barrier, EXT4_MOUNT_BARRIER, MOPT_SET}, + {Opt_nobarrier, EXT4_MOUNT_BARRIER, MOPT_CLEAR}, + {Opt_noauto_da_alloc, EXT4_MOUNT_NO_AUTO_DA_ALLOC, MOPT_SET}, + {Opt_auto_da_alloc, EXT4_MOUNT_NO_AUTO_DA_ALLOC, MOPT_CLEAR}, + {Opt_noinit_itable, EXT4_MOUNT_INIT_INODE_TABLE, MOPT_CLEAR}, + {Opt_commit, 0, MOPT_GTE0}, + {Opt_max_batch_time, 0, MOPT_GTE0}, + {Opt_min_batch_time, 0, MOPT_GTE0}, + {Opt_inode_readahead_blks, 0, MOPT_GTE0}, + {Opt_init_itable, 0, MOPT_GTE0}, + {Opt_stripe, 0, MOPT_GTE0}, + {Opt_data_journal, EXT4_MOUNT_JOURNAL_DATA, MOPT_DATAJ}, + {Opt_data_ordered, EXT4_MOUNT_ORDERED_DATA, MOPT_DATAJ}, + {Opt_data_writeback, EXT4_MOUNT_WRITEBACK_DATA, MOPT_DATAJ}, +#ifdef CONFIG_EXT4_FS_XATTR + {Opt_user_xattr, EXT4_MOUNT_XATTR_USER, MOPT_SET}, + {Opt_nouser_xattr, EXT4_MOUNT_XATTR_USER, MOPT_CLEAR}, +#else + {Opt_user_xattr, 0, MOPT_NOSUPPORT}, + {Opt_nouser_xattr, 0, MOPT_NOSUPPORT}, +#endif +#ifdef CONFIG_EXT4_FS_POSIX_ACL + {Opt_acl, EXT4_MOUNT_POSIX_ACL, MOPT_SET}, + {Opt_noacl, EXT4_MOUNT_POSIX_ACL, MOPT_CLEAR}, +#else + {Opt_acl, 0, MOPT_NOSUPPORT}, + {Opt_noacl, 0, MOPT_NOSUPPORT}, +#endif + {Opt_nouid32, EXT4_MOUNT_NO_UID32, MOPT_SET}, + {Opt_debug, EXT4_MOUNT_DEBUG, MOPT_SET}, + {Opt_quota, EXT4_MOUNT_QUOTA | EXT4_MOUNT_USRQUOTA, MOPT_SET | MOPT_Q}, + {Opt_usrquota, EXT4_MOUNT_QUOTA | 
EXT4_MOUNT_USRQUOTA, + MOPT_SET | MOPT_Q}, + {Opt_grpquota, EXT4_MOUNT_QUOTA | EXT4_MOUNT_GRPQUOTA, + MOPT_SET | MOPT_Q}, + {Opt_noquota, (EXT4_MOUNT_QUOTA | EXT4_MOUNT_USRQUOTA | + EXT4_MOUNT_GRPQUOTA), MOPT_CLEAR | MOPT_Q}, + {Opt_usrjquota, 0, MOPT_Q}, + {Opt_grpjquota, 0, MOPT_Q}, + {Opt_offusrjquota, 0, MOPT_Q}, + {Opt_offgrpjquota, 0, MOPT_Q}, + {Opt_jqfmt_vfsold, QFMT_VFS_OLD, MOPT_QFMT}, + {Opt_jqfmt_vfsv0, QFMT_VFS_V0, MOPT_QFMT}, + {Opt_jqfmt_vfsv1, QFMT_VFS_V1, MOPT_QFMT}, + {Opt_err, 0, 0} +}; + +static int handle_mount_opt(struct super_block *sb, char *opt, int token, + substring_t *args, unsigned long *journal_devnum, + unsigned int *journal_ioprio, int is_remount) +{ + struct ext4_sb_info *sbi = EXT4_SB(sb); + const struct mount_opts *m; + int arg = 0; + +#ifdef CONFIG_QUOTA + if (token == Opt_usrjquota) + return set_qf_name(sb, USRQUOTA, &args[0]); + else if (token == Opt_grpjquota) + return set_qf_name(sb, GRPQUOTA, &args[0]); + else if (token == Opt_offusrjquota) + return clear_qf_name(sb, USRQUOTA); + else if (token == Opt_offgrpjquota) + return clear_qf_name(sb, GRPQUOTA); +#endif + if (args->from && match_int(args, &arg)) + return -1; + switch (token) { + case Opt_noacl: + case Opt_nouser_xattr: + ext4_msg(sb, KERN_WARNING, deprecated_msg, opt, "3.5"); + break; + case Opt_sb: + return 1; /* handled by get_sb_block() */ + case Opt_removed: + ext4_msg(sb, KERN_WARNING, + "Ignoring removed %s option", opt); + return 1; + case Opt_resuid: + sbi->s_resuid = arg; + return 1; + case Opt_resgid: + sbi->s_resgid = arg; + return 1; + case Opt_abort: + sbi->s_mount_flags |= EXT4_MF_FS_ABORTED; + return 1; + case Opt_i_version: + sb->s_flags |= MS_I_VERSION; + return 1; + case Opt_journal_dev: + if (is_remount) { + ext4_msg(sb, KERN_ERR, + "Cannot specify journal on remount"); + return -1; + } + *journal_devnum = arg; + return 1; + case Opt_journal_ioprio: + if (arg < 0 || arg > 7) + return -1; + *journal_ioprio = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, arg); + 
return 1; + } + + for (m = ext4_mount_opts; m->token != Opt_err; m++) { + if (token != m->token) + continue; + if (args->from && (m->flags & MOPT_GTE0) && (arg < 0)) + return -1; + if (m->flags & MOPT_EXPLICIT) + set_opt2(sb, EXPLICIT_DELALLOC); + if (m->flags & MOPT_CLEAR_ERR) + clear_opt(sb, ERRORS_MASK); + if (token == Opt_noquota && sb_any_quota_loaded(sb)) { + ext4_msg(sb, KERN_ERR, "Cannot change quota " + "options when quota turned on"); + return -1; + } + + if (m->flags & MOPT_NOSUPPORT) { + ext4_msg(sb, KERN_ERR, "%s option not supported", opt); + } else if (token == Opt_commit) { + if (arg == 0) + arg = JBD2_DEFAULT_MAX_COMMIT_AGE; + sbi->s_commit_interval = HZ * arg; + } else if (token == Opt_max_batch_time) { + if (arg == 0) + arg = EXT4_DEF_MAX_BATCH_TIME; + sbi->s_max_batch_time = arg; + } else if (token == Opt_min_batch_time) { + sbi->s_min_batch_time = arg; + } else if (token == Opt_inode_readahead_blks) { + if (arg > (1 << 30)) + return -1; + if (arg && !is_power_of_2(arg)) { + ext4_msg(sb, KERN_ERR, + "EXT4-fs: inode_readahead_blks" + " must be a power of 2"); + return -1; + } + sbi->s_inode_readahead_blks = arg; + } else if (token == Opt_init_itable) { + set_opt(sb, INIT_INODE_TABLE); + if (!args->from) + arg = EXT4_DEF_LI_WAIT_MULT; + sbi->s_li_wait_mult = arg; + } else if (token == Opt_stripe) { + sbi->s_stripe = arg; + } else if (m->flags & MOPT_DATAJ) { + if (is_remount) { + if (!sbi->s_journal) + ext4_msg(sb, KERN_WARNING, "Remounting file system with no journal so ignoring journalled data option"); + else if (test_opt(sb, DATA_FLAGS) != + m->mount_opt) { + ext4_msg(sb, KERN_ERR, + "Cannot change data mode on remount"); + return -1; + } + } else { + clear_opt(sb, DATA_FLAGS); + sbi->s_mount_opt |= m->mount_opt; + } +#ifdef CONFIG_QUOTA + } else if (m->flags & MOPT_QFMT) { + if (sb_any_quota_loaded(sb) && + sbi->s_jquota_fmt != m->mount_opt) { + ext4_msg(sb, KERN_ERR, "Cannot " + "change journaled quota options " + "when quota turned on"); + 
return -1; + } + sbi->s_jquota_fmt = m->mount_opt; +#endif + } else { + if (!args->from) + arg = 1; + if (m->flags & MOPT_CLEAR) + arg = !arg; + else if (unlikely(!(m->flags & MOPT_SET))) { + ext4_msg(sb, KERN_WARNING, + "buggy handling of option %s", opt); + WARN_ON(1); + return -1; + } + if (arg != 0) + sbi->s_mount_opt |= m->mount_opt; + else + sbi->s_mount_opt &= ~m->mount_opt; + } + return 1; + } + ext4_msg(sb, KERN_ERR, "Unrecognized mount option \"%s\" " + "or missing value", opt); + return -1; +} + +static int parse_options(char *options, struct super_block *sb, + unsigned long *journal_devnum, + unsigned int *journal_ioprio, + int is_remount) +{ +#ifdef CONFIG_QUOTA + struct ext4_sb_info *sbi = EXT4_SB(sb); +#endif + char *p; + substring_t args[MAX_OPT_ARGS]; + int token; + + if (!options) + return 1; + + while ((p = strsep(&options, ",")) != NULL) { + if (!*p) + continue; + /* + * Initialize args struct so we know whether arg was + * found; some options take optional arguments. 
+ */ + args[0].to = args[0].from = 0; + token = match_token(p, tokens, args); + if (handle_mount_opt(sb, p, token, args, journal_devnum, + journal_ioprio, is_remount) < 0) + return 0; + } +#ifdef CONFIG_QUOTA + if (sbi->s_qf_names[USRQUOTA] || sbi->s_qf_names[GRPQUOTA]) { + if (test_opt(sb, USRQUOTA) && sbi->s_qf_names[USRQUOTA]) + clear_opt(sb, USRQUOTA); + + if (test_opt(sb, GRPQUOTA) && sbi->s_qf_names[GRPQUOTA]) + clear_opt(sb, GRPQUOTA); + + if (test_opt(sb, GRPQUOTA) || test_opt(sb, USRQUOTA)) { + ext4_msg(sb, KERN_ERR, "old and new quota " + "format mixing"); + return 0; + } + + if (!sbi->s_jquota_fmt) { + ext4_msg(sb, KERN_ERR, "journaled quota format " + "not specified"); + return 0; + } + } else { + if (sbi->s_jquota_fmt) { + ext4_msg(sb, KERN_ERR, "journaled quota format " + "specified with no journaling " + "enabled"); + return 0; + } + } +#endif + return 1; +} + +static inline void ext4_show_quota_options(struct seq_file *seq, + struct super_block *sb) +{ +#if defined(CONFIG_QUOTA) + struct ext4_sb_info *sbi = EXT4_SB(sb); + + if (sbi->s_jquota_fmt) { + char *fmtname = ""; + + switch (sbi->s_jquota_fmt) { + case QFMT_VFS_OLD: + fmtname = "vfsold"; + break; + case QFMT_VFS_V0: + fmtname = "vfsv0"; + break; + case QFMT_VFS_V1: + fmtname = "vfsv1"; + break; + } + seq_printf(seq, ",jqfmt=%s", fmtname); + } + + if (sbi->s_qf_names[USRQUOTA]) + seq_printf(seq, ",usrjquota=%s", sbi->s_qf_names[USRQUOTA]); + + if (sbi->s_qf_names[GRPQUOTA]) + seq_printf(seq, ",grpjquota=%s", sbi->s_qf_names[GRPQUOTA]); + + if (test_opt(sb, USRQUOTA)) + seq_puts(seq, ",usrquota"); + + if (test_opt(sb, GRPQUOTA)) + seq_puts(seq, ",grpquota"); +#endif +} + +static const char *token2str(int token) +{ + static const struct match_token *t; + + for (t = tokens; t->token != Opt_err; t++) + if (t->token == token && !strchr(t->pattern, '=')) + break; + return t->pattern; +} + +/* + * Show an option if + * - it's set to a non-default value OR + * - if the per-sb default is different from 
the global default + */ +static int _ext4_show_options(struct seq_file *seq, struct super_block *sb, + int nodefs) +{ + struct ext4_sb_info *sbi = EXT4_SB(sb); + struct ext4_super_block *es = sbi->s_es; + int def_errors, def_mount_opt = nodefs ? 0 : sbi->s_def_mount_opt; + const struct mount_opts *m; + char sep = nodefs ? '\n' : ','; + +#define SEQ_OPTS_PUTS(str) seq_printf(seq, "%c" str, sep) +#define SEQ_OPTS_PRINT(str, arg) seq_printf(seq, "%c" str, sep, arg) + + if (sbi->s_sb_block != 1) + SEQ_OPTS_PRINT("sb=%llu", sbi->s_sb_block); + + for (m = ext4_mount_opts; m->token != Opt_err; m++) { + int want_set = m->flags & MOPT_SET; + if (((m->flags & (MOPT_SET|MOPT_CLEAR)) == 0) || + (m->flags & MOPT_CLEAR_ERR)) + continue; + if (!(m->mount_opt & (sbi->s_mount_opt ^ def_mount_opt))) + continue; /* skip if same as the default */ + if ((want_set && + (sbi->s_mount_opt & m->mount_opt) != m->mount_opt) || + (!want_set && (sbi->s_mount_opt & m->mount_opt))) + continue; /* select Opt_noFoo vs Opt_Foo */ + SEQ_OPTS_PRINT("%s", token2str(m->token)); + } + + if (nodefs || sbi->s_resuid != EXT4_DEF_RESUID || + le16_to_cpu(es->s_def_resuid) != EXT4_DEF_RESUID) + SEQ_OPTS_PRINT("resuid=%u", sbi->s_resuid); + if (nodefs || sbi->s_resgid != EXT4_DEF_RESGID || + le16_to_cpu(es->s_def_resgid) != EXT4_DEF_RESGID) + SEQ_OPTS_PRINT("resgid=%u", sbi->s_resgid); + def_errors = nodefs ? 
-1 : le16_to_cpu(es->s_errors); + if (test_opt(sb, ERRORS_RO) && def_errors != EXT4_ERRORS_RO) + SEQ_OPTS_PUTS("errors=remount-ro"); + if (test_opt(sb, ERRORS_CONT) && def_errors != EXT4_ERRORS_CONTINUE) + SEQ_OPTS_PUTS("errors=continue"); + if (test_opt(sb, ERRORS_PANIC) && def_errors != EXT4_ERRORS_PANIC) + SEQ_OPTS_PUTS("errors=panic"); + if (nodefs || sbi->s_commit_interval != JBD2_DEFAULT_MAX_COMMIT_AGE*HZ) + SEQ_OPTS_PRINT("commit=%lu", sbi->s_commit_interval / HZ); + if (nodefs || sbi->s_min_batch_time != EXT4_DEF_MIN_BATCH_TIME) + SEQ_OPTS_PRINT("min_batch_time=%u", sbi->s_min_batch_time); + if (nodefs || sbi->s_max_batch_time != EXT4_DEF_MAX_BATCH_TIME) + SEQ_OPTS_PRINT("max_batch_time=%u", sbi->s_max_batch_time); + if (sb->s_flags & MS_I_VERSION) + SEQ_OPTS_PUTS("i_version"); + if (nodefs || sbi->s_stripe) + SEQ_OPTS_PRINT("stripe=%lu", sbi->s_stripe); + if (EXT4_MOUNT_DATA_FLAGS & (sbi->s_mount_opt ^ def_mount_opt)) { + if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA) + SEQ_OPTS_PUTS("data=journal"); + else if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_ORDERED_DATA) + SEQ_OPTS_PUTS("data=ordered"); + else if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_WRITEBACK_DATA) + SEQ_OPTS_PUTS("data=writeback"); + } + if (nodefs || + sbi->s_inode_readahead_blks != EXT4_DEF_INODE_READAHEAD_BLKS) + SEQ_OPTS_PRINT("inode_readahead_blks=%u", + sbi->s_inode_readahead_blks); + + if (nodefs || (test_opt(sb, INIT_INODE_TABLE) && + (sbi->s_li_wait_mult != EXT4_DEF_LI_WAIT_MULT))) + SEQ_OPTS_PRINT("init_itable=%u", sbi->s_li_wait_mult); + + ext4_show_quota_options(seq, sb); + return 0; +} + +static int ext4_show_options(struct seq_file *seq, struct dentry *root) +{ + return _ext4_show_options(seq, root->d_sb, 0); +} + +static int options_seq_show(struct seq_file *seq, void *offset) +{ + struct super_block *sb = seq->private; + int rc; + + seq_puts(seq, (sb->s_flags & MS_RDONLY) ? 
"ro" : "rw"); + rc = _ext4_show_options(seq, sb, 1); + seq_puts(seq, "\n"); + return rc; +} + +static int options_open_fs(struct inode *inode, struct file *file) +{ + return single_open(file, options_seq_show, PDE(inode)->data); +} + +static const struct file_operations ext4_seq_options_fops = { + .owner = THIS_MODULE, + .open = options_open_fs, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + +static int ext4_setup_super(struct super_block *sb, struct ext4_super_block *es, + int read_only) +{ + struct ext4_sb_info *sbi = EXT4_SB(sb); + int res = 0; + + if (le32_to_cpu(es->s_rev_level) > EXT4_MAX_SUPP_REV) { + ext4_msg(sb, KERN_ERR, "revision level too high, " + "forcing read-only mode"); + res = MS_RDONLY; + } + if (read_only) + goto done; + if (!(sbi->s_mount_state & EXT4_VALID_FS)) + ext4_msg(sb, KERN_WARNING, "warning: mounting unchecked fs, " + "running e2fsck is recommended"); + else if ((sbi->s_mount_state & EXT4_ERROR_FS)) + ext4_msg(sb, KERN_WARNING, + "warning: mounting fs with errors, " + "running e2fsck is recommended"); + else if ((__s16) le16_to_cpu(es->s_max_mnt_count) > 0 && + le16_to_cpu(es->s_mnt_count) >= + (unsigned short) (__s16) le16_to_cpu(es->s_max_mnt_count)) + ext4_msg(sb, KERN_WARNING, + "warning: maximal mount count reached, " + "running e2fsck is recommended"); + else if (le32_to_cpu(es->s_checkinterval) && + (le32_to_cpu(es->s_lastcheck) + + le32_to_cpu(es->s_checkinterval) <= get_seconds())) + ext4_msg(sb, KERN_WARNING, + "warning: checktime reached, " + "running e2fsck is recommended"); + if (!sbi->s_journal) + es->s_state &= cpu_to_le16(~EXT4_VALID_FS); + if (!(__s16) le16_to_cpu(es->s_max_mnt_count)) + es->s_max_mnt_count = cpu_to_le16(EXT4_DFL_MAX_MNT_COUNT); + le16_add_cpu(&es->s_mnt_count, 1); + es->s_mtime = cpu_to_le32(get_seconds()); + ext4_update_dynamic_rev(sb); + if (sbi->s_journal) + EXT4_SET_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER); + + ext4_commit_super(sb, 1); +done: + if 
(test_opt(sb, DEBUG)) + printk(KERN_INFO "[EXT4 FS bs=%lu, gc=%u, " + "bpg=%lu, ipg=%lu, mo=%04x, mo2=%04x]\n", + sb->s_blocksize, + sbi->s_groups_count, + EXT4_BLOCKS_PER_GROUP(sb), + EXT4_INODES_PER_GROUP(sb), + sbi->s_mount_opt, sbi->s_mount_opt2); + + cleancache_init_fs(sb); + return res; +} + +static int ext4_fill_flex_info(struct super_block *sb) +{ + struct ext4_sb_info *sbi = EXT4_SB(sb); + struct ext4_group_desc *gdp = NULL; + ext4_group_t flex_group_count; + ext4_group_t flex_group; + unsigned int groups_per_flex = 0; + size_t size; + int i; + + sbi->s_log_groups_per_flex = sbi->s_es->s_log_groups_per_flex; + if (sbi->s_log_groups_per_flex < 1 || sbi->s_log_groups_per_flex > 31) { + sbi->s_log_groups_per_flex = 0; + return 1; + } + groups_per_flex = 1 << sbi->s_log_groups_per_flex; + + /* We allocate both existing and potentially added groups */ + flex_group_count = ((sbi->s_groups_count + groups_per_flex - 1) + + ((le16_to_cpu(sbi->s_es->s_reserved_gdt_blocks) + 1) << + EXT4_DESC_PER_BLOCK_BITS(sb))) / groups_per_flex; + size = flex_group_count * sizeof(struct flex_groups); + sbi->s_flex_groups = ext4_kvzalloc(size, GFP_KERNEL); + if (sbi->s_flex_groups == NULL) { + ext4_msg(sb, KERN_ERR, "not enough memory for %u flex groups", + flex_group_count); + goto failed; + } + + for (i = 0; i < sbi->s_groups_count; i++) { + gdp = ext4_get_group_desc(sb, i, NULL); + + flex_group = ext4_flex_group(sbi, i); + atomic_add(ext4_free_inodes_count(sb, gdp), + &sbi->s_flex_groups[flex_group].free_inodes); + atomic_add(ext4_free_group_clusters(sb, gdp), + &sbi->s_flex_groups[flex_group].free_clusters); + atomic_add(ext4_used_dirs_count(sb, gdp), + &sbi->s_flex_groups[flex_group].used_dirs); + } + + return 1; +failed: + return 0; +} + +__le16 ext4_group_desc_csum(struct ext4_sb_info *sbi, __u32 block_group, + struct ext4_group_desc *gdp) +{ + __u16 crc = 0; + + if (sbi->s_es->s_feature_ro_compat & + cpu_to_le32(EXT4_FEATURE_RO_COMPAT_GDT_CSUM)) { + int offset = 
offsetof(struct ext4_group_desc, bg_checksum); + __le32 le_group = cpu_to_le32(block_group); + + crc = crc16(~0, sbi->s_es->s_uuid, sizeof(sbi->s_es->s_uuid)); + crc = crc16(crc, (__u8 *)&le_group, sizeof(le_group)); + crc = crc16(crc, (__u8 *)gdp, offset); + offset += sizeof(gdp->bg_checksum); /* skip checksum */ + /* for checksum of struct ext4_group_desc do the rest...*/ + if ((sbi->s_es->s_feature_incompat & + cpu_to_le32(EXT4_FEATURE_INCOMPAT_64BIT)) && + offset < le16_to_cpu(sbi->s_es->s_desc_size)) + crc = crc16(crc, (__u8 *)gdp + offset, + le16_to_cpu(sbi->s_es->s_desc_size) - + offset); + } + + return cpu_to_le16(crc); +} + +int ext4_group_desc_csum_verify(struct ext4_sb_info *sbi, __u32 block_group, + struct ext4_group_desc *gdp) +{ + if ((sbi->s_es->s_feature_ro_compat & + cpu_to_le32(EXT4_FEATURE_RO_COMPAT_GDT_CSUM)) && + (gdp->bg_checksum != ext4_group_desc_csum(sbi, block_group, gdp))) + return 0; + + return 1; +} + +/* Called at mount-time, super-block is locked */ +static int ext4_check_descriptors(struct super_block *sb, + ext4_group_t *first_not_zeroed) +{ + struct ext4_sb_info *sbi = EXT4_SB(sb); + ext4_fsblk_t first_block = le32_to_cpu(sbi->s_es->s_first_data_block); + ext4_fsblk_t last_block; + ext4_fsblk_t block_bitmap; + ext4_fsblk_t inode_bitmap; + ext4_fsblk_t inode_table; + int flexbg_flag = 0; + ext4_group_t i, grp = sbi->s_groups_count; + + if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_FLEX_BG)) + flexbg_flag = 1; + + ext4_debug("Checking group descriptors"); + + for (i = 0; i < sbi->s_groups_count; i++) { + struct ext4_group_desc *gdp = ext4_get_group_desc(sb, i, NULL); + + if (i == sbi->s_groups_count - 1 || flexbg_flag) + last_block = ext4_blocks_count(sbi->s_es) - 1; + else + last_block = first_block + + (EXT4_BLOCKS_PER_GROUP(sb) - 1); + + if ((grp == sbi->s_groups_count) && + !(gdp->bg_flags & cpu_to_le16(EXT4_BG_INODE_ZEROED))) + grp = i; + + block_bitmap = ext4_block_bitmap(sb, gdp); + if (block_bitmap < first_block || 
block_bitmap > last_block) { + ext4_msg(sb, KERN_ERR, "ext4_check_descriptors: " + "Block bitmap for group %u not in group " + "(block %llu)!", i, block_bitmap); + return 0; + } + inode_bitmap = ext4_inode_bitmap(sb, gdp); + if (inode_bitmap < first_block || inode_bitmap > last_block) { + ext4_msg(sb, KERN_ERR, "ext4_check_descriptors: " + "Inode bitmap for group %u not in group " + "(block %llu)!", i, inode_bitmap); + return 0; + } + inode_table = ext4_inode_table(sb, gdp); + if (inode_table < first_block || + inode_table + sbi->s_itb_per_group - 1 > last_block) { + ext4_msg(sb, KERN_ERR, "ext4_check_descriptors: " + "Inode table for group %u not in group " + "(block %llu)!", i, inode_table); + return 0; + } + ext4_lock_group(sb, i); + if (!ext4_group_desc_csum_verify(sbi, i, gdp)) { + ext4_msg(sb, KERN_ERR, "ext4_check_descriptors: " + "Checksum for group %u failed (%u!=%u)", + i, le16_to_cpu(ext4_group_desc_csum(sbi, i, + gdp)), le16_to_cpu(gdp->bg_checksum)); + if (!(sb->s_flags & MS_RDONLY)) { + ext4_unlock_group(sb, i); + return 0; + } + } + ext4_unlock_group(sb, i); + if (!flexbg_flag) + first_block += EXT4_BLOCKS_PER_GROUP(sb); + } + if (NULL != first_not_zeroed) + *first_not_zeroed = grp; + + ext4_free_blocks_count_set(sbi->s_es, + EXT4_C2B(sbi, ext4_count_free_clusters(sb))); + sbi->s_es->s_free_inodes_count =cpu_to_le32(ext4_count_free_inodes(sb)); + return 1; +} + +/* ext4_orphan_cleanup() walks a singly-linked list of inodes (starting at + * the superblock) which were deleted from all directories, but held open by + * a process at the time of a crash. We walk the list and try to delete these + * inodes at recovery time (only with a read-write filesystem). + * + * In order to keep the orphan inode chain consistent during traversal (in + * case of crash during recovery), we link each inode into the superblock + * orphan list_head and handle it the same way as an inode deletion during + * normal operation (which journals the operations for us). 
+ * + * We only do an iget() and an iput() on each inode, which is very safe if we + * accidentally point at an in-use or already deleted inode. The worst that + * can happen in this case is that we get a "bit already cleared" message from + * ext4_free_inode(). The only reason we would point at a wrong inode is if + * e2fsck was run on this filesystem, and it must have already done the orphan + * inode cleanup for us, so we can safely abort without any further action. + */ +static void ext4_orphan_cleanup(struct super_block *sb, + struct ext4_super_block *es) +{ + unsigned int s_flags = sb->s_flags; + int nr_orphans = 0, nr_truncates = 0; +#ifdef CONFIG_QUOTA + int i; +#endif + if (!es->s_last_orphan) { + jbd_debug(4, "no orphan inodes to clean up\n"); + return; + } + + if (bdev_read_only(sb->s_bdev)) { + ext4_msg(sb, KERN_ERR, "write access " + "unavailable, skipping orphan cleanup"); + return; + } + + /* Check if feature set would not allow a r/w mount */ + if (!ext4_feature_set_ok(sb, 0)) { + ext4_msg(sb, KERN_INFO, "Skipping orphan cleanup due to " + "unknown ROCOMPAT features"); + return; + } + + if (EXT4_SB(sb)->s_mount_state & EXT4_ERROR_FS) { + if (es->s_last_orphan) + jbd_debug(1, "Errors on filesystem, " + "clearing orphan list.\n"); + es->s_last_orphan = 0; + jbd_debug(1, "Skipping orphan recovery on fs with errors.\n"); + return; + } + + if (s_flags & MS_RDONLY) { + ext4_msg(sb, KERN_INFO, "orphan cleanup on readonly fs"); + sb->s_flags &= ~MS_RDONLY; + } +#ifdef CONFIG_QUOTA + /* Needed for iput() to work correctly and not trash data */ + sb->s_flags |= MS_ACTIVE; + /* Turn on quotas so that they are updated correctly */ + for (i = 0; i < MAXQUOTAS; i++) { + if (EXT4_SB(sb)->s_qf_names[i]) { + int ret = ext4_quota_on_mount(sb, i); + if (ret < 0) + ext4_msg(sb, KERN_ERR, + "Cannot turn on journaled " + "quota: error %d", ret); + } + } +#endif + + while (es->s_last_orphan) { + struct inode *inode; + + inode = ext4_orphan_get(sb, 
le32_to_cpu(es->s_last_orphan)); + if (IS_ERR(inode)) { + es->s_last_orphan = 0; + break; + } + + list_add(&EXT4_I(inode)->i_orphan, &EXT4_SB(sb)->s_orphan); + dquot_initialize(inode); + if (inode->i_nlink) { + ext4_msg(sb, KERN_DEBUG, + "%s: truncating inode %lu to %lld bytes", + __func__, inode->i_ino, inode->i_size); + jbd_debug(2, "truncating inode %lu to %lld bytes\n", + inode->i_ino, inode->i_size); + ext4_truncate(inode); + nr_truncates++; + } else { + ext4_msg(sb, KERN_DEBUG, + "%s: deleting unreferenced inode %lu", + __func__, inode->i_ino); + jbd_debug(2, "deleting unreferenced inode %lu\n", + inode->i_ino); + nr_orphans++; + } + iput(inode); /* The delete magic happens here! */ + } + +#define PLURAL(x) (x), ((x) == 1) ? "" : "s" + + if (nr_orphans) + ext4_msg(sb, KERN_INFO, "%d orphan inode%s deleted", + PLURAL(nr_orphans)); + if (nr_truncates) + ext4_msg(sb, KERN_INFO, "%d truncate%s cleaned up", + PLURAL(nr_truncates)); +#ifdef CONFIG_QUOTA + /* Turn quotas off */ + for (i = 0; i < MAXQUOTAS; i++) { + if (sb_dqopt(sb)->files[i]) + dquot_quota_off(sb, i); + } +#endif + sb->s_flags = s_flags; /* Restore MS_RDONLY status */ +} + +/* + * Maximal extent format file size. + * Resulting logical blkno at s_maxbytes must fit in our on-disk + * extent format containers, within a sector_t, and within i_blocks + * in the vfs. ext4 inode has 48 bits of i_block in fsblock units, + * so that won't be a limiting factor. + * + * However there is other limiting factor. We do store extents in the form + * of starting block and length, hence the resulting length of the extent + * covering maximum file size must fit into on-disk format containers as + * well. Given that length is always by 1 unit bigger than max unit (because + * we count 0 as well) we have to lower the s_maxbytes by one fs block. + * + * Note, this does *not* consider any metadata overhead for vfs i_blocks. 
+ */ +static loff_t ext4_max_size(int blkbits, int has_huge_files) +{ + loff_t res; + loff_t upper_limit = MAX_LFS_FILESIZE; + + /* small i_blocks in vfs inode? */ + if (!has_huge_files || sizeof(blkcnt_t) < sizeof(u64)) { + /* + * CONFIG_LBDAF is not enabled implies the inode + * i_block represent total blocks in 512 bytes + * 32 == size of vfs inode i_blocks * 8 + */ + upper_limit = (1LL << 32) - 1; + + /* total blocks in file system block size */ + upper_limit >>= (blkbits - 9); + upper_limit <<= blkbits; + } + + /* + * 32-bit extent-start container, ee_block. We lower the maxbytes + * by one fs block, so ee_len can cover the extent of maximum file + * size + */ + res = (1LL << 32) - 1; + res <<= blkbits; + + /* Sanity check against vm- & vfs- imposed limits */ + if (res > upper_limit) + res = upper_limit; + + return res; +} + +/* + * Maximal bitmap file size. There is a direct, and {,double-,triple-}indirect + * block limit, and also a limit of (2^48 - 1) 512-byte sectors in i_blocks. + * We need to be 1 filesystem block less than the 2^48 sector limit. + */ +static loff_t ext4_max_bitmap_size(int bits, int has_huge_files) +{ + loff_t res = EXT4_NDIR_BLOCKS; + int meta_blocks; + loff_t upper_limit; + /* This is calculated to be the largest file size for a dense, block + * mapped file such that the file's total number of 512-byte sectors, + * including data and all indirect blocks, does not exceed (2^48 - 1). + * + * __u32 i_blocks_lo and _u16 i_blocks_high represent the total + * number of 512-byte sectors of the file. 
+ */ + + if (!has_huge_files || sizeof(blkcnt_t) < sizeof(u64)) { + /* + * !has_huge_files or CONFIG_LBDAF not enabled implies that + * the inode i_block field represents total file blocks in + * 2^32 512-byte sectors == size of vfs inode i_blocks * 8 + */ + upper_limit = (1LL << 32) - 1; + + /* total blocks in file system block size */ + upper_limit >>= (bits - 9); + + } else { + /* + * We use 48 bit ext4_inode i_blocks + * With EXT4_HUGE_FILE_FL set the i_blocks + * represent total number of blocks in + * file system block size + */ + upper_limit = (1LL << 48) - 1; + + } + + /* indirect blocks */ + meta_blocks = 1; + /* double indirect blocks */ + meta_blocks += 1 + (1LL << (bits-2)); + /* tripple indirect blocks */ + meta_blocks += 1 + (1LL << (bits-2)) + (1LL << (2*(bits-2))); + + upper_limit -= meta_blocks; + upper_limit <<= bits; + + res += 1LL << (bits-2); + res += 1LL << (2*(bits-2)); + res += 1LL << (3*(bits-2)); + res <<= bits; + if (res > upper_limit) + res = upper_limit; + + if (res > MAX_LFS_FILESIZE) + res = MAX_LFS_FILESIZE; + + return res; +} + +static ext4_fsblk_t descriptor_loc(struct super_block *sb, + ext4_fsblk_t logical_sb_block, int nr) +{ + struct ext4_sb_info *sbi = EXT4_SB(sb); + ext4_group_t bg, first_meta_bg; + int has_super = 0; + + first_meta_bg = le32_to_cpu(sbi->s_es->s_first_meta_bg); + + if (!EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_META_BG) || + nr < first_meta_bg) + return logical_sb_block + nr + 1; + bg = sbi->s_desc_per_block * nr; + if (ext4_bg_has_super(sb, bg)) + has_super = 1; + + return (has_super + ext4_group_first_block_no(sb, bg)); +} + +/** + * ext4_get_stripe_size: Get the stripe size. + * @sbi: In memory super block info + * + * If we have specified it via mount option, then + * use the mount option value. If the value specified at mount time is + * greater than the blocks per group use the super block value. + * If the super block value is greater than blocks per group return 0. 
+ * Allocator needs it be less than blocks per group. + * + */ +static unsigned long ext4_get_stripe_size(struct ext4_sb_info *sbi) +{ + unsigned long stride = le16_to_cpu(sbi->s_es->s_raid_stride); + unsigned long stripe_width = + le32_to_cpu(sbi->s_es->s_raid_stripe_width); + int ret; + + if (sbi->s_stripe && sbi->s_stripe <= sbi->s_blocks_per_group) + ret = sbi->s_stripe; + else if (stripe_width <= sbi->s_blocks_per_group) + ret = stripe_width; + else if (stride <= sbi->s_blocks_per_group) + ret = stride; + else + ret = 0; + + /* + * If the stripe width is 1, this makes no sense and + * we set it to 0 to turn off stripe handling code. + */ + if (ret <= 1) + ret = 0; + + return ret; +} + +/* sysfs supprt */ + +struct ext4_attr { + struct attribute attr; + ssize_t (*show)(struct ext4_attr *, struct ext4_sb_info *, char *); + ssize_t (*store)(struct ext4_attr *, struct ext4_sb_info *, + const char *, size_t); + int offset; +}; + +static int parse_strtoul(const char *buf, + unsigned long max, unsigned long *value) +{ + char *endp; + + *value = simple_strtoul(skip_spaces(buf), &endp, 0); + endp = skip_spaces(endp); + if (*endp || *value > max) + return -EINVAL; + + return 0; +} + +static ssize_t delayed_allocation_blocks_show(struct ext4_attr *a, + struct ext4_sb_info *sbi, + char *buf) +{ + return snprintf(buf, PAGE_SIZE, "%llu\n", + (s64) EXT4_C2B(sbi, + percpu_counter_sum(&sbi->s_dirtyclusters_counter))); +} + +static ssize_t session_write_kbytes_show(struct ext4_attr *a, + struct ext4_sb_info *sbi, char *buf) +{ + struct super_block *sb = sbi->s_buddy_cache->i_sb; + + if (!sb->s_bdev->bd_part) + return snprintf(buf, PAGE_SIZE, "0\n"); + return snprintf(buf, PAGE_SIZE, "%lu\n", + (part_stat_read(sb->s_bdev->bd_part, sectors[1]) - + sbi->s_sectors_written_start) >> 1); +} + +static ssize_t lifetime_write_kbytes_show(struct ext4_attr *a, + struct ext4_sb_info *sbi, char *buf) +{ + struct super_block *sb = sbi->s_buddy_cache->i_sb; + + if (!sb->s_bdev->bd_part) + 
return snprintf(buf, PAGE_SIZE, "0\n"); + return snprintf(buf, PAGE_SIZE, "%llu\n", + (unsigned long long)(sbi->s_kbytes_written + + ((part_stat_read(sb->s_bdev->bd_part, sectors[1]) - + EXT4_SB(sb)->s_sectors_written_start) >> 1))); +} + +static ssize_t inode_readahead_blks_store(struct ext4_attr *a, + struct ext4_sb_info *sbi, + const char *buf, size_t count) +{ + unsigned long t; + + if (parse_strtoul(buf, 0x40000000, &t)) + return -EINVAL; + + if (t && !is_power_of_2(t)) + return -EINVAL; + + sbi->s_inode_readahead_blks = t; + return count; +} + +static ssize_t sbi_ui_show(struct ext4_attr *a, + struct ext4_sb_info *sbi, char *buf) +{ + unsigned int *ui = (unsigned int *) (((char *) sbi) + a->offset); + + return snprintf(buf, PAGE_SIZE, "%u\n", *ui); +} + +static ssize_t sbi_ui_store(struct ext4_attr *a, + struct ext4_sb_info *sbi, + const char *buf, size_t count) +{ + unsigned int *ui = (unsigned int *) (((char *) sbi) + a->offset); + unsigned long t; + + if (parse_strtoul(buf, 0xffffffff, &t)) + return -EINVAL; + *ui = t; + return count; +} + +#define EXT4_ATTR_OFFSET(_name,_mode,_show,_store,_elname) \ +static struct ext4_attr ext4_attr_##_name = { \ + .attr = {.name = __stringify(_name), .mode = _mode }, \ + .show = _show, \ + .store = _store, \ + .offset = offsetof(struct ext4_sb_info, _elname), \ +} +#define EXT4_ATTR(name, mode, show, store) \ +static struct ext4_attr ext4_attr_##name = __ATTR(name, mode, show, store) + +#define EXT4_INFO_ATTR(name) EXT4_ATTR(name, 0444, NULL, NULL) +#define EXT4_RO_ATTR(name) EXT4_ATTR(name, 0444, name##_show, NULL) +#define EXT4_RW_ATTR(name) EXT4_ATTR(name, 0644, name##_show, name##_store) +#define EXT4_RW_ATTR_SBI_UI(name, elname) \ + EXT4_ATTR_OFFSET(name, 0644, sbi_ui_show, sbi_ui_store, elname) +#define ATTR_LIST(name) &ext4_attr_##name.attr + +EXT4_RO_ATTR(delayed_allocation_blocks); +EXT4_RO_ATTR(session_write_kbytes); +EXT4_RO_ATTR(lifetime_write_kbytes); +EXT4_ATTR_OFFSET(inode_readahead_blks, 0644, 
sbi_ui_show, + inode_readahead_blks_store, s_inode_readahead_blks); +EXT4_RW_ATTR_SBI_UI(inode_goal, s_inode_goal); +EXT4_RW_ATTR_SBI_UI(mb_stats, s_mb_stats); +EXT4_RW_ATTR_SBI_UI(mb_max_to_scan, s_mb_max_to_scan); +EXT4_RW_ATTR_SBI_UI(mb_min_to_scan, s_mb_min_to_scan); +EXT4_RW_ATTR_SBI_UI(mb_order2_req, s_mb_order2_reqs); +EXT4_RW_ATTR_SBI_UI(mb_stream_req, s_mb_stream_request); +EXT4_RW_ATTR_SBI_UI(mb_group_prealloc, s_mb_group_prealloc); +EXT4_RW_ATTR_SBI_UI(max_writeback_mb_bump, s_max_writeback_mb_bump); + +static struct attribute *ext4_attrs[] = { + ATTR_LIST(delayed_allocation_blocks), + ATTR_LIST(session_write_kbytes), + ATTR_LIST(lifetime_write_kbytes), + ATTR_LIST(inode_readahead_blks), + ATTR_LIST(inode_goal), + ATTR_LIST(mb_stats), + ATTR_LIST(mb_max_to_scan), + ATTR_LIST(mb_min_to_scan), + ATTR_LIST(mb_order2_req), + ATTR_LIST(mb_stream_req), + ATTR_LIST(mb_group_prealloc), + ATTR_LIST(max_writeback_mb_bump), + NULL, +}; + +/* Features this copy of ext4 supports */ +EXT4_INFO_ATTR(lazy_itable_init); +EXT4_INFO_ATTR(batched_discard); + +static struct attribute *ext4_feat_attrs[] = { + ATTR_LIST(lazy_itable_init), + ATTR_LIST(batched_discard), + NULL, +}; + +static ssize_t ext4_attr_show(struct kobject *kobj, + struct attribute *attr, char *buf) +{ + struct ext4_sb_info *sbi = container_of(kobj, struct ext4_sb_info, + s_kobj); + struct ext4_attr *a = container_of(attr, struct ext4_attr, attr); + + return a->show ? a->show(a, sbi, buf) : 0; +} + +static ssize_t ext4_attr_store(struct kobject *kobj, + struct attribute *attr, + const char *buf, size_t len) +{ + struct ext4_sb_info *sbi = container_of(kobj, struct ext4_sb_info, + s_kobj); + struct ext4_attr *a = container_of(attr, struct ext4_attr, attr); + + return a->store ? 
a->store(a, sbi, buf, len) : 0; +} + +static void ext4_sb_release(struct kobject *kobj) +{ + struct ext4_sb_info *sbi = container_of(kobj, struct ext4_sb_info, + s_kobj); + complete(&sbi->s_kobj_unregister); +} + +static const struct sysfs_ops ext4_attr_ops = { + .show = ext4_attr_show, + .store = ext4_attr_store, +}; + +static struct kobj_type ext4_ktype = { + .default_attrs = ext4_attrs, + .sysfs_ops = &ext4_attr_ops, + .release = ext4_sb_release, +}; + +static void ext4_feat_release(struct kobject *kobj) +{ + complete(&ext4_feat->f_kobj_unregister); +} + +static struct kobj_type ext4_feat_ktype = { + .default_attrs = ext4_feat_attrs, + .sysfs_ops = &ext4_attr_ops, + .release = ext4_feat_release, +}; + +/* + * Check whether this filesystem can be mounted based on + * the features present and the RDONLY/RDWR mount requested. + * Returns 1 if this filesystem can be mounted as requested, + * 0 if it cannot be. + */ +static int ext4_feature_set_ok(struct super_block *sb, int readonly) +{ + if (EXT4_HAS_INCOMPAT_FEATURE(sb, ~EXT4_FEATURE_INCOMPAT_SUPP)) { + ext4_msg(sb, KERN_ERR, + "Couldn't mount because of " + "unsupported optional features (%x)", + (le32_to_cpu(EXT4_SB(sb)->s_es->s_feature_incompat) & + ~EXT4_FEATURE_INCOMPAT_SUPP)); + return 0; + } + + if (readonly) + return 1; + + /* Check that feature set is OK for a read-write mount */ + if (EXT4_HAS_RO_COMPAT_FEATURE(sb, ~EXT4_FEATURE_RO_COMPAT_SUPP)) { + ext4_msg(sb, KERN_ERR, "couldn't mount RDWR because of " + "unsupported optional features (%x)", + (le32_to_cpu(EXT4_SB(sb)->s_es->s_feature_ro_compat) & + ~EXT4_FEATURE_RO_COMPAT_SUPP)); + return 0; + } + /* + * Large file size enabled file system can only be mounted + * read-write on 32-bit systems if kernel is built with CONFIG_LBDAF + */ + if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_HUGE_FILE)) { + if (sizeof(blkcnt_t) < sizeof(u64)) { + ext4_msg(sb, KERN_ERR, "Filesystem with huge files " + "cannot be mounted RDWR without " + 
"CONFIG_LBDAF"); + return 0; + } + } + if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_BIGALLOC) && + !EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_EXTENTS)) { + ext4_msg(sb, KERN_ERR, + "Can't support bigalloc feature without " + "extents feature\n"); + return 0; + } + return 1; +} + +/* + * This function is called once a day if we have errors logged + * on the file system + */ +static void print_daily_error_info(unsigned long arg) +{ + struct super_block *sb = (struct super_block *) arg; + struct ext4_sb_info *sbi; + struct ext4_super_block *es; + + sbi = EXT4_SB(sb); + es = sbi->s_es; + + if (es->s_error_count) + ext4_msg(sb, KERN_NOTICE, "error count: %u", + le32_to_cpu(es->s_error_count)); + if (es->s_first_error_time) { + printk(KERN_NOTICE "EXT4-fs (%s): initial error at %u: %.*s:%d", + sb->s_id, le32_to_cpu(es->s_first_error_time), + (int) sizeof(es->s_first_error_func), + es->s_first_error_func, + le32_to_cpu(es->s_first_error_line)); + if (es->s_first_error_ino) + printk(": inode %u", + le32_to_cpu(es->s_first_error_ino)); + if (es->s_first_error_block) + printk(": block %llu", (unsigned long long) + le64_to_cpu(es->s_first_error_block)); + printk("\n"); + } + if (es->s_last_error_time) { + printk(KERN_NOTICE "EXT4-fs (%s): last error at %u: %.*s:%d", + sb->s_id, le32_to_cpu(es->s_last_error_time), + (int) sizeof(es->s_last_error_func), + es->s_last_error_func, + le32_to_cpu(es->s_last_error_line)); + if (es->s_last_error_ino) + printk(": inode %u", + le32_to_cpu(es->s_last_error_ino)); + if (es->s_last_error_block) + printk(": block %llu", (unsigned long long) + le64_to_cpu(es->s_last_error_block)); + printk("\n"); + } + mod_timer(&sbi->s_err_report, jiffies + 24*60*60*HZ); /* Once a day */ +} + +/* Find next suitable group and run ext4_init_inode_table */ +static int ext4_run_li_request(struct ext4_li_request *elr) +{ + struct ext4_group_desc *gdp = NULL; + ext4_group_t group, ngroups; + struct super_block *sb; + unsigned long timeout = 
0; + int ret = 0; + + sb = elr->lr_super; + ngroups = EXT4_SB(sb)->s_groups_count; + + for (group = elr->lr_next_group; group < ngroups; group++) { + gdp = ext4_get_group_desc(sb, group, NULL); + if (!gdp) { + ret = 1; + break; + } + + if (!(gdp->bg_flags & cpu_to_le16(EXT4_BG_INODE_ZEROED))) + break; + } + + if (group == ngroups) + ret = 1; + + if (!ret) { + timeout = jiffies; + ret = ext4_init_inode_table(sb, group, + elr->lr_timeout ? 0 : 1); + if (elr->lr_timeout == 0) { + timeout = (jiffies - timeout) * + elr->lr_sbi->s_li_wait_mult; + elr->lr_timeout = timeout; + } + elr->lr_next_sched = jiffies + elr->lr_timeout; + elr->lr_next_group = group + 1; + } + + return ret; +} + +/* + * Remove lr_request from the list_request and free the + * request structure. Should be called with li_list_mtx held + */ +static void ext4_remove_li_request(struct ext4_li_request *elr) +{ + struct ext4_sb_info *sbi; + + if (!elr) + return; + + sbi = elr->lr_sbi; + + list_del(&elr->lr_request); + sbi->s_li_request = NULL; + kfree(elr); +} + +static void ext4_unregister_li_request(struct super_block *sb) +{ + mutex_lock(&ext4_li_mtx); + if (!ext4_li_info) { + mutex_unlock(&ext4_li_mtx); + return; + } + + mutex_lock(&ext4_li_info->li_list_mtx); + ext4_remove_li_request(EXT4_SB(sb)->s_li_request); + mutex_unlock(&ext4_li_info->li_list_mtx); + mutex_unlock(&ext4_li_mtx); +} + +static struct task_struct *ext4_lazyinit_task; + +/* + * This is the function where ext4lazyinit thread lives. It walks + * through the request list searching for next scheduled filesystem. + * When such a fs is found, run the lazy initialization request + * (ext4_rn_li_request) and keep track of the time spend in this + * function. Based on that time we compute next schedule time of + * the request. When walking through the list is complete, compute + * next waking time and put itself into sleep. 
+ */ +static int ext4_lazyinit_thread(void *arg) +{ + struct ext4_lazy_init *eli = (struct ext4_lazy_init *)arg; + struct list_head *pos, *n; + struct ext4_li_request *elr; + unsigned long next_wakeup, cur; + + BUG_ON(NULL == eli); + +cont_thread: + while (true) { + next_wakeup = MAX_JIFFY_OFFSET; + + mutex_lock(&eli->li_list_mtx); + if (list_empty(&eli->li_request_list)) { + mutex_unlock(&eli->li_list_mtx); + goto exit_thread; + } + + list_for_each_safe(pos, n, &eli->li_request_list) { + elr = list_entry(pos, struct ext4_li_request, + lr_request); + + if (time_after_eq(jiffies, elr->lr_next_sched)) { + if (ext4_run_li_request(elr) != 0) { + /* error, remove the lazy_init job */ + ext4_remove_li_request(elr); + continue; + } + } + + if (time_before(elr->lr_next_sched, next_wakeup)) + next_wakeup = elr->lr_next_sched; + } + mutex_unlock(&eli->li_list_mtx); + + try_to_freeze(); + + cur = jiffies; + if ((time_after_eq(cur, next_wakeup)) || + (MAX_JIFFY_OFFSET == next_wakeup)) { + cond_resched(); + continue; + } + + schedule_timeout_interruptible(next_wakeup - cur); + + if (kthread_should_stop()) { + ext4_clear_request_list(); + goto exit_thread; + } + } + +exit_thread: + /* + * It looks like the request list is empty, but we need + * to check it under the li_list_mtx lock, to prevent any + * additions into it, and of course we should lock ext4_li_mtx + * to atomically free the list and ext4_li_info, because at + * this point another ext4 filesystem could be registering + * new one. 
+ */ + mutex_lock(&ext4_li_mtx); + mutex_lock(&eli->li_list_mtx); + if (!list_empty(&eli->li_request_list)) { + mutex_unlock(&eli->li_list_mtx); + mutex_unlock(&ext4_li_mtx); + goto cont_thread; + } + mutex_unlock(&eli->li_list_mtx); + kfree(ext4_li_info); + ext4_li_info = NULL; + mutex_unlock(&ext4_li_mtx); + + return 0; +} + +static void ext4_clear_request_list(void) +{ + struct list_head *pos, *n; + struct ext4_li_request *elr; + + mutex_lock(&ext4_li_info->li_list_mtx); + list_for_each_safe(pos, n, &ext4_li_info->li_request_list) { + elr = list_entry(pos, struct ext4_li_request, + lr_request); + ext4_remove_li_request(elr); + } + mutex_unlock(&ext4_li_info->li_list_mtx); +} + +static int ext4_run_lazyinit_thread(void) +{ + ext4_lazyinit_task = kthread_run(ext4_lazyinit_thread, + ext4_li_info, "ext4lazyinit"); + if (IS_ERR(ext4_lazyinit_task)) { + int err = PTR_ERR(ext4_lazyinit_task); + ext4_clear_request_list(); + kfree(ext4_li_info); + ext4_li_info = NULL; + printk(KERN_CRIT "EXT4-fs: error %d creating inode table " + "initialization thread\n", + err); + return err; + } + ext4_li_info->li_state |= EXT4_LAZYINIT_RUNNING; + return 0; +} + +/* + * Check whether it make sense to run itable init. thread or not. + * If there is at least one uninitialized inode table, return + * corresponding group number, else the loop goes through all + * groups and return total number of groups. 
+ */ +static ext4_group_t ext4_has_uninit_itable(struct super_block *sb) +{ + ext4_group_t group, ngroups = EXT4_SB(sb)->s_groups_count; + struct ext4_group_desc *gdp = NULL; + + for (group = 0; group < ngroups; group++) { + gdp = ext4_get_group_desc(sb, group, NULL); + if (!gdp) + continue; + + if (!(gdp->bg_flags & cpu_to_le16(EXT4_BG_INODE_ZEROED))) + break; + } + + return group; +} + +static int ext4_li_info_new(void) +{ + struct ext4_lazy_init *eli = NULL; + + eli = kzalloc(sizeof(*eli), GFP_KERNEL); + if (!eli) + return -ENOMEM; + + INIT_LIST_HEAD(&eli->li_request_list); + mutex_init(&eli->li_list_mtx); + + eli->li_state |= EXT4_LAZYINIT_QUIT; + + ext4_li_info = eli; + + return 0; +} + +static struct ext4_li_request *ext4_li_request_new(struct super_block *sb, + ext4_group_t start) +{ + struct ext4_sb_info *sbi = EXT4_SB(sb); + struct ext4_li_request *elr; + unsigned long rnd; + + elr = kzalloc(sizeof(*elr), GFP_KERNEL); + if (!elr) + return NULL; + + elr->lr_super = sb; + elr->lr_sbi = sbi; + elr->lr_next_group = start; + + /* + * Randomize first schedule time of the request to + * spread the inode table initialization requests + * better. + */ + get_random_bytes(&rnd, sizeof(rnd)); + elr->lr_next_sched = jiffies + (unsigned long)rnd % + (EXT4_DEF_LI_MAX_START_DELAY * HZ); + + return elr; +} + +static int ext4_register_li_request(struct super_block *sb, + ext4_group_t first_not_zeroed) +{ + struct ext4_sb_info *sbi = EXT4_SB(sb); + struct ext4_li_request *elr; + ext4_group_t ngroups = EXT4_SB(sb)->s_groups_count; + int ret = 0; + + if (sbi->s_li_request != NULL) { + /* + * Reset timeout so it can be computed again, because + * s_li_wait_mult might have changed. 
+ */ + sbi->s_li_request->lr_timeout = 0; + return 0; + } + + if (first_not_zeroed == ngroups || + (sb->s_flags & MS_RDONLY) || + !test_opt(sb, INIT_INODE_TABLE)) + return 0; + + elr = ext4_li_request_new(sb, first_not_zeroed); + if (!elr) + return -ENOMEM; + + mutex_lock(&ext4_li_mtx); + + if (NULL == ext4_li_info) { + ret = ext4_li_info_new(); + if (ret) + goto out; + } + + mutex_lock(&ext4_li_info->li_list_mtx); + list_add(&elr->lr_request, &ext4_li_info->li_request_list); + mutex_unlock(&ext4_li_info->li_list_mtx); + + sbi->s_li_request = elr; + /* + * set elr to NULL here since it has been inserted to + * the request_list and the removal and free of it is + * handled by ext4_clear_request_list from now on. + */ + elr = NULL; + + if (!(ext4_li_info->li_state & EXT4_LAZYINIT_RUNNING)) { + ret = ext4_run_lazyinit_thread(); + if (ret) + goto out; + } +out: + mutex_unlock(&ext4_li_mtx); + if (ret) + kfree(elr); + return ret; +} + +/* + * We do not need to lock anything since this is called on + * module unload. + */ +static void ext4_destroy_lazyinit_thread(void) +{ + /* + * If thread exited earlier + * there's nothing to be done. 
+ */ + if (!ext4_li_info || !ext4_lazyinit_task) + return; + + kthread_stop(ext4_lazyinit_task); +} + +static int ext4_fill_super(struct super_block *sb, void *data, int silent) +{ + char *orig_data = kstrdup(data, GFP_KERNEL); + struct buffer_head *bh; + struct ext4_super_block *es = NULL; + struct ext4_sb_info *sbi; + ext4_fsblk_t block; + ext4_fsblk_t sb_block = get_sb_block(&data); + ext4_fsblk_t logical_sb_block; + unsigned long offset = 0; + unsigned long journal_devnum = 0; + unsigned long def_mount_opts; + struct inode *root; + char *cp; + const char *descr; + int ret = -ENOMEM; + int blocksize, clustersize; + unsigned int db_count; + unsigned int i; + int needs_recovery, has_huge_files, has_bigalloc; + __u64 blocks_count; + int err; + unsigned int journal_ioprio = DEFAULT_JOURNAL_IOPRIO; + ext4_group_t first_not_zeroed; + + sbi = kzalloc(sizeof(*sbi), GFP_KERNEL); + if (!sbi) + goto out_free_orig; + + sbi->s_blockgroup_lock = + kzalloc(sizeof(struct blockgroup_lock), GFP_KERNEL); + if (!sbi->s_blockgroup_lock) { + kfree(sbi); + goto out_free_orig; + } + sb->s_fs_info = sbi; + sbi->s_mount_opt = 0; + sbi->s_resuid = EXT4_DEF_RESUID; + sbi->s_resgid = EXT4_DEF_RESGID; + sbi->s_inode_readahead_blks = EXT4_DEF_INODE_READAHEAD_BLKS; + sbi->s_sb_block = sb_block; + if (sb->s_bdev->bd_part) + sbi->s_sectors_written_start = + part_stat_read(sb->s_bdev->bd_part, sectors[1]); + + /* Cleanup superblock name */ + for (cp = sb->s_id; (cp = strchr(cp, '/'));) + *cp = '!'; + + ret = -EINVAL; + blocksize = sb_min_blocksize(sb, EXT4_MIN_BLOCK_SIZE); + if (!blocksize) { + ext4_msg(sb, KERN_ERR, "unable to set blocksize"); + goto out_fail; + } + + /* + * The ext4 superblock will not be buffer aligned for other than 1kB + * block sizes. We need to calculate the offset from buffer start. 
+ */ + if (blocksize != EXT4_MIN_BLOCK_SIZE) { + logical_sb_block = sb_block * EXT4_MIN_BLOCK_SIZE; + offset = do_div(logical_sb_block, blocksize); + } else { + logical_sb_block = sb_block; + } + + if (!(bh = sb_bread(sb, logical_sb_block))) { + ext4_msg(sb, KERN_ERR, "unable to read superblock"); + goto out_fail; + } + /* + * Note: s_es must be initialized as soon as possible because + * some ext4 macro-instructions depend on its value + */ + es = (struct ext4_super_block *) (((char *)bh->b_data) + offset); + sbi->s_es = es; + sb->s_magic = le16_to_cpu(es->s_magic); + if (sb->s_magic != EXT4_SUPER_MAGIC) + goto cantfind_ext4; + sbi->s_kbytes_written = le64_to_cpu(es->s_kbytes_written); + + /* Set defaults before we parse the mount options */ + def_mount_opts = le32_to_cpu(es->s_default_mount_opts); + set_opt(sb, INIT_INODE_TABLE); + if (def_mount_opts & EXT4_DEFM_DEBUG) + set_opt(sb, DEBUG); + if (def_mount_opts & EXT4_DEFM_BSDGROUPS) + set_opt(sb, GRPID); + if (def_mount_opts & EXT4_DEFM_UID16) + set_opt(sb, NO_UID32); + /* xattr user namespace & acls are now defaulted on */ +#ifdef CONFIG_EXT4_FS_XATTR + set_opt(sb, XATTR_USER); +#endif +#ifdef CONFIG_EXT4_FS_POSIX_ACL + set_opt(sb, POSIX_ACL); +#endif + set_opt(sb, MBLK_IO_SUBMIT); + if ((def_mount_opts & EXT4_DEFM_JMODE) == EXT4_DEFM_JMODE_DATA) + set_opt(sb, JOURNAL_DATA); + else if ((def_mount_opts & EXT4_DEFM_JMODE) == EXT4_DEFM_JMODE_ORDERED) + set_opt(sb, ORDERED_DATA); + else if ((def_mount_opts & EXT4_DEFM_JMODE) == EXT4_DEFM_JMODE_WBACK) + set_opt(sb, WRITEBACK_DATA); + + if (le16_to_cpu(sbi->s_es->s_errors) == EXT4_ERRORS_PANIC) + set_opt(sb, ERRORS_PANIC); + else if (le16_to_cpu(sbi->s_es->s_errors) == EXT4_ERRORS_CONTINUE) + set_opt(sb, ERRORS_CONT); + else + set_opt(sb, ERRORS_RO); + if (def_mount_opts & EXT4_DEFM_BLOCK_VALIDITY) + set_opt(sb, BLOCK_VALIDITY); + if (def_mount_opts & EXT4_DEFM_DISCARD) + set_opt(sb, DISCARD); + + sbi->s_resuid = le16_to_cpu(es->s_def_resuid); + sbi->s_resgid = 
le16_to_cpu(es->s_def_resgid); + sbi->s_commit_interval = JBD2_DEFAULT_MAX_COMMIT_AGE * HZ; + sbi->s_min_batch_time = EXT4_DEF_MIN_BATCH_TIME; + sbi->s_max_batch_time = EXT4_DEF_MAX_BATCH_TIME; + + if ((def_mount_opts & EXT4_DEFM_NOBARRIER) == 0) + set_opt(sb, BARRIER); + + /* + * enable delayed allocation by default + * Use -o nodelalloc to turn it off + */ + if (!IS_EXT3_SB(sb) && + ((def_mount_opts & EXT4_DEFM_NODELALLOC) == 0)) + set_opt(sb, DELALLOC); + + /* + * set default s_li_wait_mult for lazyinit, for the case there is + * no mount option specified. + */ + sbi->s_li_wait_mult = EXT4_DEF_LI_WAIT_MULT; + + if (!parse_options((char *) sbi->s_es->s_mount_opts, sb, + &journal_devnum, &journal_ioprio, 0)) { + ext4_msg(sb, KERN_WARNING, + "failed to parse options in superblock: %s", + sbi->s_es->s_mount_opts); + } + sbi->s_def_mount_opt = sbi->s_mount_opt; + if (!parse_options((char *) data, sb, &journal_devnum, + &journal_ioprio, 0)) + goto failed_mount; + + if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA) { + printk_once(KERN_WARNING "EXT4-fs: Warning: mounting " + "with data=journal disables delayed " + "allocation and O_DIRECT support!\n"); + if (test_opt2(sb, EXPLICIT_DELALLOC)) { + ext4_msg(sb, KERN_ERR, "can't mount with " + "both data=journal and delalloc"); + goto failed_mount; + } + if (test_opt(sb, DIOREAD_NOLOCK)) { + ext4_msg(sb, KERN_ERR, "can't mount with " + "both data=journal and delalloc"); + goto failed_mount; + } + if (test_opt(sb, DELALLOC)) + clear_opt(sb, DELALLOC); + } + + blocksize = BLOCK_SIZE << le32_to_cpu(es->s_log_block_size); + if (test_opt(sb, DIOREAD_NOLOCK)) { + if (blocksize < PAGE_SIZE) { + ext4_msg(sb, KERN_ERR, "can't mount with " + "dioread_nolock if block size != PAGE_SIZE"); + goto failed_mount; + } + } + + sb->s_flags = (sb->s_flags & ~MS_POSIXACL) | + (test_opt(sb, POSIX_ACL) ? 
MS_POSIXACL : 0); + + if (le32_to_cpu(es->s_rev_level) == EXT4_GOOD_OLD_REV && + (EXT4_HAS_COMPAT_FEATURE(sb, ~0U) || + EXT4_HAS_RO_COMPAT_FEATURE(sb, ~0U) || + EXT4_HAS_INCOMPAT_FEATURE(sb, ~0U))) + ext4_msg(sb, KERN_WARNING, + "feature flags set on rev 0 fs, " + "running e2fsck is recommended"); + + if (IS_EXT2_SB(sb)) { + if (ext2_feature_set_ok(sb)) + ext4_msg(sb, KERN_INFO, "mounting ext2 file system " + "using the ext4 subsystem"); + else { + ext4_msg(sb, KERN_ERR, "couldn't mount as ext2 due " + "to feature incompatibilities"); + goto failed_mount; + } + } + + if (IS_EXT3_SB(sb)) { + if (ext3_feature_set_ok(sb)) + ext4_msg(sb, KERN_INFO, "mounting ext3 file system " + "using the ext4 subsystem"); + else { + ext4_msg(sb, KERN_ERR, "couldn't mount as ext3 due " + "to feature incompatibilities"); + goto failed_mount; + } + } + + /* + * Check feature flags regardless of the revision level, since we + * previously didn't change the revision level when setting the flags, + * so there is a chance incompat flags are set on a rev 0 filesystem. 
+ */ + if (!ext4_feature_set_ok(sb, (sb->s_flags & MS_RDONLY))) + goto failed_mount; + + if (blocksize < EXT4_MIN_BLOCK_SIZE || + blocksize > EXT4_MAX_BLOCK_SIZE) { + ext4_msg(sb, KERN_ERR, + "Unsupported filesystem blocksize %d", blocksize); + goto failed_mount; + } + + if (sb->s_blocksize != blocksize) { + /* Validate the filesystem blocksize */ + if (!sb_set_blocksize(sb, blocksize)) { + ext4_msg(sb, KERN_ERR, "bad block size %d", + blocksize); + goto failed_mount; + } + + brelse(bh); + logical_sb_block = sb_block * EXT4_MIN_BLOCK_SIZE; + offset = do_div(logical_sb_block, blocksize); + bh = sb_bread(sb, logical_sb_block); + if (!bh) { + ext4_msg(sb, KERN_ERR, + "Can't read superblock on 2nd try"); + goto failed_mount; + } + es = (struct ext4_super_block *)(((char *)bh->b_data) + offset); + sbi->s_es = es; + if (es->s_magic != cpu_to_le16(EXT4_SUPER_MAGIC)) { + ext4_msg(sb, KERN_ERR, + "Magic mismatch, very weird!"); + goto failed_mount; + } + } + + has_huge_files = EXT4_HAS_RO_COMPAT_FEATURE(sb, + EXT4_FEATURE_RO_COMPAT_HUGE_FILE); + sbi->s_bitmap_maxbytes = ext4_max_bitmap_size(sb->s_blocksize_bits, + has_huge_files); + sb->s_maxbytes = ext4_max_size(sb->s_blocksize_bits, has_huge_files); + + if (le32_to_cpu(es->s_rev_level) == EXT4_GOOD_OLD_REV) { + sbi->s_inode_size = EXT4_GOOD_OLD_INODE_SIZE; + sbi->s_first_ino = EXT4_GOOD_OLD_FIRST_INO; + } else { + sbi->s_inode_size = le16_to_cpu(es->s_inode_size); + sbi->s_first_ino = le32_to_cpu(es->s_first_ino); + if ((sbi->s_inode_size < EXT4_GOOD_OLD_INODE_SIZE) || + (!is_power_of_2(sbi->s_inode_size)) || + (sbi->s_inode_size > blocksize)) { + ext4_msg(sb, KERN_ERR, + "unsupported inode size: %d", + sbi->s_inode_size); + goto failed_mount; + } + if (sbi->s_inode_size > EXT4_GOOD_OLD_INODE_SIZE) + sb->s_time_gran = 1 << (EXT4_EPOCH_BITS - 2); + } + + sbi->s_desc_size = le16_to_cpu(es->s_desc_size); + if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_64BIT)) { + if (sbi->s_desc_size < EXT4_MIN_DESC_SIZE_64BIT || + 
sbi->s_desc_size > EXT4_MAX_DESC_SIZE || + !is_power_of_2(sbi->s_desc_size)) { + ext4_msg(sb, KERN_ERR, + "unsupported descriptor size %lu", + sbi->s_desc_size); + goto failed_mount; + } + } else + sbi->s_desc_size = EXT4_MIN_DESC_SIZE; + + sbi->s_blocks_per_group = le32_to_cpu(es->s_blocks_per_group); + sbi->s_inodes_per_group = le32_to_cpu(es->s_inodes_per_group); + if (EXT4_INODE_SIZE(sb) == 0 || EXT4_INODES_PER_GROUP(sb) == 0) + goto cantfind_ext4; + + sbi->s_inodes_per_block = blocksize / EXT4_INODE_SIZE(sb); + if (sbi->s_inodes_per_block == 0) + goto cantfind_ext4; + sbi->s_itb_per_group = sbi->s_inodes_per_group / + sbi->s_inodes_per_block; + sbi->s_desc_per_block = blocksize / EXT4_DESC_SIZE(sb); + sbi->s_sbh = bh; + sbi->s_mount_state = le16_to_cpu(es->s_state); + sbi->s_addr_per_block_bits = ilog2(EXT4_ADDR_PER_BLOCK(sb)); + sbi->s_desc_per_block_bits = ilog2(EXT4_DESC_PER_BLOCK(sb)); + + for (i = 0; i < 4; i++) + sbi->s_hash_seed[i] = le32_to_cpu(es->s_hash_seed[i]); + sbi->s_def_hash_version = es->s_def_hash_version; + i = le32_to_cpu(es->s_flags); + if (i & EXT2_FLAGS_UNSIGNED_HASH) + sbi->s_hash_unsigned = 3; + else if ((i & EXT2_FLAGS_SIGNED_HASH) == 0) { +#ifdef __CHAR_UNSIGNED__ + es->s_flags |= cpu_to_le32(EXT2_FLAGS_UNSIGNED_HASH); + sbi->s_hash_unsigned = 3; +#else + es->s_flags |= cpu_to_le32(EXT2_FLAGS_SIGNED_HASH); +#endif + } + + /* Handle clustersize */ + clustersize = BLOCK_SIZE << le32_to_cpu(es->s_log_cluster_size); + has_bigalloc = EXT4_HAS_RO_COMPAT_FEATURE(sb, + EXT4_FEATURE_RO_COMPAT_BIGALLOC); + if (has_bigalloc) { + if (clustersize < blocksize) { + ext4_msg(sb, KERN_ERR, + "cluster size (%d) smaller than " + "block size (%d)", clustersize, blocksize); + goto failed_mount; + } + sbi->s_cluster_bits = le32_to_cpu(es->s_log_cluster_size) - + le32_to_cpu(es->s_log_block_size); + sbi->s_clusters_per_group = + le32_to_cpu(es->s_clusters_per_group); + if (sbi->s_clusters_per_group > blocksize * 8) { + ext4_msg(sb, KERN_ERR, + "#clusters 
per group too big: %lu", + sbi->s_clusters_per_group); + goto failed_mount; + } + if (sbi->s_blocks_per_group != + (sbi->s_clusters_per_group * (clustersize / blocksize))) { + ext4_msg(sb, KERN_ERR, "blocks per group (%lu) and " + "clusters per group (%lu) inconsistent", + sbi->s_blocks_per_group, + sbi->s_clusters_per_group); + goto failed_mount; + } + } else { + if (clustersize != blocksize) { + ext4_warning(sb, "fragment/cluster size (%d) != " + "block size (%d)", clustersize, + blocksize); + clustersize = blocksize; + } + if (sbi->s_blocks_per_group > blocksize * 8) { + ext4_msg(sb, KERN_ERR, + "#blocks per group too big: %lu", + sbi->s_blocks_per_group); + goto failed_mount; + } + sbi->s_clusters_per_group = sbi->s_blocks_per_group; + sbi->s_cluster_bits = 0; + } + sbi->s_cluster_ratio = clustersize / blocksize; + + if (sbi->s_inodes_per_group > blocksize * 8) { + ext4_msg(sb, KERN_ERR, + "#inodes per group too big: %lu", + sbi->s_inodes_per_group); + goto failed_mount; + } + + /* + * Test whether we have more sectors than will fit in sector_t, + * and whether the max offset is addressable by the page cache. + */ + err = generic_check_addressable(sb->s_blocksize_bits, + ext4_blocks_count(es)); + if (err) { + ext4_msg(sb, KERN_ERR, "filesystem" + " too large to mount safely on this system"); + if (sizeof(sector_t) < 8) + ext4_msg(sb, KERN_WARNING, "CONFIG_LBDAF not enabled"); + ret = err; + goto failed_mount; + } + + if (EXT4_BLOCKS_PER_GROUP(sb) == 0) + goto cantfind_ext4; + + /* check blocks count against device size */ + blocks_count = sb->s_bdev->bd_inode->i_size >> sb->s_blocksize_bits; + if (blocks_count && ext4_blocks_count(es) > blocks_count) { + ext4_msg(sb, KERN_WARNING, "bad geometry: block count %llu " + "exceeds size of device (%llu blocks)", + ext4_blocks_count(es), blocks_count); + goto failed_mount; + } + + /* + * It makes no sense for the first data block to be beyond the end + * of the filesystem. 
+ */ + if (le32_to_cpu(es->s_first_data_block) >= ext4_blocks_count(es)) { + ext4_msg(sb, KERN_WARNING, "bad geometry: first data " + "block %u is beyond end of filesystem (%llu)", + le32_to_cpu(es->s_first_data_block), + ext4_blocks_count(es)); + goto failed_mount; + } + blocks_count = (ext4_blocks_count(es) - + le32_to_cpu(es->s_first_data_block) + + EXT4_BLOCKS_PER_GROUP(sb) - 1); + do_div(blocks_count, EXT4_BLOCKS_PER_GROUP(sb)); + if (blocks_count > ((uint64_t)1<<32) - EXT4_DESC_PER_BLOCK(sb)) { + ext4_msg(sb, KERN_WARNING, "groups count too large: %u " + "(block count %llu, first data block %u, " + "blocks per group %lu)", sbi->s_groups_count, + ext4_blocks_count(es), + le32_to_cpu(es->s_first_data_block), + EXT4_BLOCKS_PER_GROUP(sb)); + goto failed_mount; + } + sbi->s_groups_count = blocks_count; + sbi->s_blockfile_groups = min_t(ext4_group_t, sbi->s_groups_count, + (EXT4_MAX_BLOCK_FILE_PHYS / EXT4_BLOCKS_PER_GROUP(sb))); + db_count = (sbi->s_groups_count + EXT4_DESC_PER_BLOCK(sb) - 1) / + EXT4_DESC_PER_BLOCK(sb); + sbi->s_group_desc = ext4_kvmalloc(db_count * + sizeof(struct buffer_head *), + GFP_KERNEL); + if (sbi->s_group_desc == NULL) { + ext4_msg(sb, KERN_ERR, "not enough memory"); + goto failed_mount; + } + + if (ext4_proc_root) + sbi->s_proc = proc_mkdir(sb->s_id, ext4_proc_root); + + if (sbi->s_proc) + proc_create_data("options", S_IRUGO, sbi->s_proc, + &ext4_seq_options_fops, sb); + + bgl_lock_init(sbi->s_blockgroup_lock); + + for (i = 0; i < db_count; i++) { + block = descriptor_loc(sb, logical_sb_block, i); + sbi->s_group_desc[i] = sb_bread(sb, block); + if (!sbi->s_group_desc[i]) { + ext4_msg(sb, KERN_ERR, + "can't read group descriptor %d", i); + db_count = i; + goto failed_mount2; + } + } + if (!ext4_check_descriptors(sb, &first_not_zeroed)) { + ext4_msg(sb, KERN_ERR, "group descriptors corrupted!"); + goto failed_mount2; + } + if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_FLEX_BG)) + if (!ext4_fill_flex_info(sb)) { + ext4_msg(sb, 
KERN_ERR, + "unable to initialize " + "flex_bg meta info!"); + goto failed_mount2; + } + + sbi->s_gdb_count = db_count; + get_random_bytes(&sbi->s_next_generation, sizeof(u32)); + spin_lock_init(&sbi->s_next_gen_lock); + + init_timer(&sbi->s_err_report); + sbi->s_err_report.function = print_daily_error_info; + sbi->s_err_report.data = (unsigned long) sb; + + err = percpu_counter_init(&sbi->s_freeclusters_counter, + ext4_count_free_clusters(sb)); + if (!err) { + err = percpu_counter_init(&sbi->s_freeinodes_counter, + ext4_count_free_inodes(sb)); + } + if (!err) { + err = percpu_counter_init(&sbi->s_dirs_counter, + ext4_count_dirs(sb)); + } + if (!err) { + err = percpu_counter_init(&sbi->s_dirtyclusters_counter, 0); + } + if (err) { + ext4_msg(sb, KERN_ERR, "insufficient memory"); + goto failed_mount3; + } + + sbi->s_stripe = ext4_get_stripe_size(sbi); + sbi->s_max_writeback_mb_bump = 128; + + /* + * set up enough so that it can read an inode + */ + if (!test_opt(sb, NOLOAD) && + EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_HAS_JOURNAL)) + sb->s_op = &ext4_sops; + else + sb->s_op = &ext4_nojournal_sops; + sb->s_export_op = &ext4_export_ops; + sb->s_xattr = ext4_xattr_handlers; +#ifdef CONFIG_QUOTA + sb->s_qcop = &ext4_qctl_operations; + sb->dq_op = &ext4_quota_operations; +#endif + memcpy(sb->s_uuid, es->s_uuid, sizeof(es->s_uuid)); + + INIT_LIST_HEAD(&sbi->s_orphan); /* unlinked but open files */ + mutex_init(&sbi->s_orphan_lock); + sbi->s_resize_flags = 0; + + sb->s_root = NULL; + + needs_recovery = (es->s_last_orphan != 0 || + EXT4_HAS_INCOMPAT_FEATURE(sb, + EXT4_FEATURE_INCOMPAT_RECOVER)); + + if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_MMP) && + !(sb->s_flags & MS_RDONLY)) + if (ext4_multi_mount_protect(sb, le64_to_cpu(es->s_mmp_block))) + goto failed_mount3; + + /* + * The first inode we look at is the journal inode. Don't try + * root first: it may be modified in the journal! 
+ */ + if (!test_opt(sb, NOLOAD) && + EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_HAS_JOURNAL)) { + if (ext4_load_journal(sb, es, journal_devnum)) + goto failed_mount3; + } else if (test_opt(sb, NOLOAD) && !(sb->s_flags & MS_RDONLY) && + EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER)) { + ext4_msg(sb, KERN_ERR, "required journal recovery " + "suppressed and not mounted read-only"); + goto failed_mount_wq; + } else { + clear_opt(sb, DATA_FLAGS); + sbi->s_journal = NULL; + needs_recovery = 0; + goto no_journal; + } + + if (ext4_blocks_count(es) > 0xffffffffULL && + !jbd2_journal_set_features(EXT4_SB(sb)->s_journal, 0, 0, + JBD2_FEATURE_INCOMPAT_64BIT)) { + ext4_msg(sb, KERN_ERR, "Failed to set 64-bit journal feature"); + goto failed_mount_wq; + } + + if (test_opt(sb, JOURNAL_ASYNC_COMMIT)) { + jbd2_journal_set_features(sbi->s_journal, + JBD2_FEATURE_COMPAT_CHECKSUM, 0, + JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT); + } else if (test_opt(sb, JOURNAL_CHECKSUM)) { + jbd2_journal_set_features(sbi->s_journal, + JBD2_FEATURE_COMPAT_CHECKSUM, 0, 0); + jbd2_journal_clear_features(sbi->s_journal, 0, 0, + JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT); + } else { + jbd2_journal_clear_features(sbi->s_journal, + JBD2_FEATURE_COMPAT_CHECKSUM, 0, + JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT); + } + + /* We have now updated the journal if required, so we can + * validate the data journaling mode. 
*/ + switch (test_opt(sb, DATA_FLAGS)) { + case 0: + /* No mode set, assume a default based on the journal + * capabilities: ORDERED_DATA if the journal can + * cope, else JOURNAL_DATA + */ + if (jbd2_journal_check_available_features + (sbi->s_journal, 0, 0, JBD2_FEATURE_INCOMPAT_REVOKE)) + set_opt(sb, ORDERED_DATA); + else + set_opt(sb, JOURNAL_DATA); + break; + + case EXT4_MOUNT_ORDERED_DATA: + case EXT4_MOUNT_WRITEBACK_DATA: + if (!jbd2_journal_check_available_features + (sbi->s_journal, 0, 0, JBD2_FEATURE_INCOMPAT_REVOKE)) { + ext4_msg(sb, KERN_ERR, "Journal does not support " + "requested data journaling mode"); + goto failed_mount_wq; + } + default: + break; + } + set_task_ioprio(sbi->s_journal->j_task, journal_ioprio); + + sbi->s_journal->j_commit_callback = ext4_journal_commit_callback; + + /* + * The journal may have updated the bg summary counts, so we + * need to update the global counters. + */ + percpu_counter_set(&sbi->s_freeclusters_counter, + ext4_count_free_clusters(sb)); + percpu_counter_set(&sbi->s_freeinodes_counter, + ext4_count_free_inodes(sb)); + percpu_counter_set(&sbi->s_dirs_counter, + ext4_count_dirs(sb)); + percpu_counter_set(&sbi->s_dirtyclusters_counter, 0); + +no_journal: + /* + * The maximum number of concurrent works can be high and + * concurrency isn't really necessary. Limit it to 1. + */ + EXT4_SB(sb)->dio_unwritten_wq = + alloc_workqueue("ext4-dio-unwritten", WQ_MEM_RECLAIM | WQ_UNBOUND, 1); + if (!EXT4_SB(sb)->dio_unwritten_wq) { + printk(KERN_ERR "EXT4-fs: failed to create DIO workqueue\n"); + goto failed_mount_wq; + } + + /* + * The jbd2_journal_load will have done any necessary log recovery, + * so we can safely mount the rest of the filesystem now. 
+ */ + + root = ext4_iget(sb, EXT4_ROOT_INO); + if (IS_ERR(root)) { + ext4_msg(sb, KERN_ERR, "get root inode failed"); + ret = PTR_ERR(root); + root = NULL; + goto failed_mount4; + } + if (!S_ISDIR(root->i_mode) || !root->i_blocks || !root->i_size) { + ext4_msg(sb, KERN_ERR, "corrupt root inode, run e2fsck"); + iput(root); + goto failed_mount4; + } + sb->s_root = d_make_root(root); + if (!sb->s_root) { + ext4_msg(sb, KERN_ERR, "get root dentry failed"); + ret = -ENOMEM; + goto failed_mount4; + } + + if (ext4_setup_super(sb, es, sb->s_flags & MS_RDONLY)) + sb->s_flags |= MS_RDONLY; + + /* determine the minimum size of new large inodes, if present */ + if (sbi->s_inode_size > EXT4_GOOD_OLD_INODE_SIZE) { + sbi->s_want_extra_isize = sizeof(struct ext4_inode) - + EXT4_GOOD_OLD_INODE_SIZE; + if (EXT4_HAS_RO_COMPAT_FEATURE(sb, + EXT4_FEATURE_RO_COMPAT_EXTRA_ISIZE)) { + if (sbi->s_want_extra_isize < + le16_to_cpu(es->s_want_extra_isize)) + sbi->s_want_extra_isize = + le16_to_cpu(es->s_want_extra_isize); + if (sbi->s_want_extra_isize < + le16_to_cpu(es->s_min_extra_isize)) + sbi->s_want_extra_isize = + le16_to_cpu(es->s_min_extra_isize); + } + } + /* Check if enough inode space is available */ + if (EXT4_GOOD_OLD_INODE_SIZE + sbi->s_want_extra_isize > + sbi->s_inode_size) { + sbi->s_want_extra_isize = sizeof(struct ext4_inode) - + EXT4_GOOD_OLD_INODE_SIZE; + ext4_msg(sb, KERN_INFO, "required extra inode space not" + "available"); + } + + err = ext4_setup_system_zone(sb); + if (err) { + ext4_msg(sb, KERN_ERR, "failed to initialize system " + "zone (%d)", err); + goto failed_mount4a; + } + + ext4_ext_init(sb); + err = ext4_mb_init(sb, needs_recovery); + if (err) { + ext4_msg(sb, KERN_ERR, "failed to initialize mballoc (%d)", + err); + goto failed_mount5; + } + + err = ext4_register_li_request(sb, first_not_zeroed); + if (err) + goto failed_mount6; + + sbi->s_kobj.kset = ext4_kset; + init_completion(&sbi->s_kobj_unregister); + err = kobject_init_and_add(&sbi->s_kobj, 
&ext4_ktype, NULL, + "%s", sb->s_id); + if (err) + goto failed_mount7; + + EXT4_SB(sb)->s_mount_state |= EXT4_ORPHAN_FS; + ext4_orphan_cleanup(sb, es); + EXT4_SB(sb)->s_mount_state &= ~EXT4_ORPHAN_FS; + if (needs_recovery) { + ext4_msg(sb, KERN_INFO, "recovery complete"); + ext4_mark_recovery_complete(sb, es); + } + if (EXT4_SB(sb)->s_journal) { + if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA) + descr = " journalled data mode"; + else if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_ORDERED_DATA) + descr = " ordered data mode"; + else + descr = " writeback data mode"; + } else + descr = "out journal"; + + ext4_msg(sb, KERN_INFO, "mounted filesystem with%s. " + "Opts: %s%s%s", descr, sbi->s_es->s_mount_opts, + *sbi->s_es->s_mount_opts ? "; " : "", orig_data); + + if (es->s_error_count) + mod_timer(&sbi->s_err_report, jiffies + 300*HZ); /* 5 minutes */ + + kfree(orig_data); + return 0; + +cantfind_ext4: + if (!silent) + ext4_msg(sb, KERN_ERR, "VFS: Can't find ext4 filesystem"); + goto failed_mount; + +failed_mount7: + ext4_unregister_li_request(sb); +failed_mount6: + ext4_mb_release(sb); +failed_mount5: + ext4_ext_release(sb); + ext4_release_system_zone(sb); +failed_mount4a: + dput(sb->s_root); + sb->s_root = NULL; +failed_mount4: + ext4_msg(sb, KERN_ERR, "mount failed"); + destroy_workqueue(EXT4_SB(sb)->dio_unwritten_wq); +failed_mount_wq: + if (sbi->s_journal) { + jbd2_journal_destroy(sbi->s_journal); + sbi->s_journal = NULL; + } +failed_mount3: + del_timer(&sbi->s_err_report); + if (sbi->s_flex_groups) + ext4_kvfree(sbi->s_flex_groups); + percpu_counter_destroy(&sbi->s_freeclusters_counter); + percpu_counter_destroy(&sbi->s_freeinodes_counter); + percpu_counter_destroy(&sbi->s_dirs_counter); + percpu_counter_destroy(&sbi->s_dirtyclusters_counter); + if (sbi->s_mmp_tsk) + kthread_stop(sbi->s_mmp_tsk); +failed_mount2: + for (i = 0; i < db_count; i++) + brelse(sbi->s_group_desc[i]); + ext4_kvfree(sbi->s_group_desc); +failed_mount: + if (sbi->s_proc) { + 
remove_proc_entry("options", sbi->s_proc); + remove_proc_entry(sb->s_id, ext4_proc_root); + } +#ifdef CONFIG_QUOTA + for (i = 0; i < MAXQUOTAS; i++) + kfree(sbi->s_qf_names[i]); +#endif + ext4_blkdev_remove(sbi); + brelse(bh); +out_fail: + sb->s_fs_info = NULL; + kfree(sbi->s_blockgroup_lock); + kfree(sbi); +out_free_orig: + kfree(orig_data); + return ret; +} + +/* + * Setup any per-fs journal parameters now. We'll do this both on + * initial mount, once the journal has been initialised but before we've + * done any recovery; and again on any subsequent remount. + */ +static void ext4_init_journal_params(struct super_block *sb, journal_t *journal) +{ + struct ext4_sb_info *sbi = EXT4_SB(sb); + + journal->j_commit_interval = sbi->s_commit_interval; + journal->j_min_batch_time = sbi->s_min_batch_time; + journal->j_max_batch_time = sbi->s_max_batch_time; + + write_lock(&journal->j_state_lock); + if (test_opt(sb, BARRIER)) + journal->j_flags |= JBD2_BARRIER; + else + journal->j_flags &= ~JBD2_BARRIER; + if (test_opt(sb, DATA_ERR_ABORT)) + journal->j_flags |= JBD2_ABORT_ON_SYNCDATA_ERR; + else + journal->j_flags &= ~JBD2_ABORT_ON_SYNCDATA_ERR; + write_unlock(&journal->j_state_lock); +} + +static journal_t *ext4_get_journal(struct super_block *sb, + unsigned int journal_inum) +{ + struct inode *journal_inode; + journal_t *journal; + + BUG_ON(!EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_HAS_JOURNAL)); + + /* First, test for the existence of a valid inode on disk. Bad + * things happen if we iget() an unused inode, as the subsequent + * iput() will try to delete it. 
*/ + + journal_inode = ext4_iget(sb, journal_inum); + if (IS_ERR(journal_inode)) { + ext4_msg(sb, KERN_ERR, "no journal found"); + return NULL; + } + if (!journal_inode->i_nlink) { + make_bad_inode(journal_inode); + iput(journal_inode); + ext4_msg(sb, KERN_ERR, "journal inode is deleted"); + return NULL; + } + + jbd_debug(2, "Journal inode found at %p: %lld bytes\n", + journal_inode, journal_inode->i_size); + if (!S_ISREG(journal_inode->i_mode)) { + ext4_msg(sb, KERN_ERR, "invalid journal inode"); + iput(journal_inode); + return NULL; + } + + journal = jbd2_journal_init_inode(journal_inode); + if (!journal) { + ext4_msg(sb, KERN_ERR, "Could not load journal inode"); + iput(journal_inode); + return NULL; + } + journal->j_private = sb; + ext4_init_journal_params(sb, journal); + return journal; +} + +static journal_t *ext4_get_dev_journal(struct super_block *sb, + dev_t j_dev) +{ + struct buffer_head *bh; + journal_t *journal; + ext4_fsblk_t start; + ext4_fsblk_t len; + int hblock, blocksize; + ext4_fsblk_t sb_block; + unsigned long offset; + struct ext4_super_block *es; + struct block_device *bdev; + + BUG_ON(!EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_HAS_JOURNAL)); + + bdev = ext4_blkdev_get(j_dev, sb); + if (bdev == NULL) + return NULL; + + blocksize = sb->s_blocksize; + hblock = bdev_logical_block_size(bdev); + if (blocksize < hblock) { + ext4_msg(sb, KERN_ERR, + "blocksize too small for journal device"); + goto out_bdev; + } + + sb_block = EXT4_MIN_BLOCK_SIZE / blocksize; + offset = EXT4_MIN_BLOCK_SIZE % blocksize; + set_blocksize(bdev, blocksize); + if (!(bh = __bread(bdev, sb_block, blocksize))) { + ext4_msg(sb, KERN_ERR, "couldn't read superblock of " + "external journal"); + goto out_bdev; + } + + es = (struct ext4_super_block *) (((char *)bh->b_data) + offset); + if ((le16_to_cpu(es->s_magic) != EXT4_SUPER_MAGIC) || + !(le32_to_cpu(es->s_feature_incompat) & + EXT4_FEATURE_INCOMPAT_JOURNAL_DEV)) { + ext4_msg(sb, KERN_ERR, "external journal has " + "bad 
superblock"); + brelse(bh); + goto out_bdev; + } + + if (memcmp(EXT4_SB(sb)->s_es->s_journal_uuid, es->s_uuid, 16)) { + ext4_msg(sb, KERN_ERR, "journal UUID does not match"); + brelse(bh); + goto out_bdev; + } + + len = ext4_blocks_count(es); + start = sb_block + 1; + brelse(bh); /* we're done with the superblock */ + + journal = jbd2_journal_init_dev(bdev, sb->s_bdev, + start, len, blocksize); + if (!journal) { + ext4_msg(sb, KERN_ERR, "failed to create device journal"); + goto out_bdev; + } + journal->j_private = sb; + ll_rw_block(READ, 1, &journal->j_sb_buffer); + wait_on_buffer(journal->j_sb_buffer); + if (!buffer_uptodate(journal->j_sb_buffer)) { + ext4_msg(sb, KERN_ERR, "I/O error on journal device"); + goto out_journal; + } + if (be32_to_cpu(journal->j_superblock->s_nr_users) != 1) { + ext4_msg(sb, KERN_ERR, "External journal has more than one " + "user (unsupported) - %d", + be32_to_cpu(journal->j_superblock->s_nr_users)); + goto out_journal; + } + EXT4_SB(sb)->journal_bdev = bdev; + ext4_init_journal_params(sb, journal); + return journal; + +out_journal: + jbd2_journal_destroy(journal); +out_bdev: + ext4_blkdev_put(bdev); + return NULL; +} + +static int ext4_load_journal(struct super_block *sb, + struct ext4_super_block *es, + unsigned long journal_devnum) +{ + journal_t *journal; + unsigned int journal_inum = le32_to_cpu(es->s_journal_inum); + dev_t journal_dev; + int err = 0; + int really_read_only; + + BUG_ON(!EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_HAS_JOURNAL)); + + if (journal_devnum && + journal_devnum != le32_to_cpu(es->s_journal_dev)) { + ext4_msg(sb, KERN_INFO, "external journal device major/minor " + "numbers have changed"); + journal_dev = new_decode_dev(journal_devnum); + } else + journal_dev = new_decode_dev(le32_to_cpu(es->s_journal_dev)); + + really_read_only = bdev_read_only(sb->s_bdev); + + /* + * Are we loading a blank journal or performing recovery after a + * crash? 
For recovery, we need to check in advance whether we + * can get read-write access to the device. + */ + if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER)) { + if (sb->s_flags & MS_RDONLY) { + ext4_msg(sb, KERN_INFO, "INFO: recovery " + "required on readonly filesystem"); + if (really_read_only) { + ext4_msg(sb, KERN_ERR, "write access " + "unavailable, cannot proceed"); + return -EROFS; + } + ext4_msg(sb, KERN_INFO, "write access will " + "be enabled during recovery"); + } + } + + if (journal_inum && journal_dev) { + ext4_msg(sb, KERN_ERR, "filesystem has both journal " + "and inode journals!"); + return -EINVAL; + } + + if (journal_inum) { + if (!(journal = ext4_get_journal(sb, journal_inum))) + return -EINVAL; + } else { + if (!(journal = ext4_get_dev_journal(sb, journal_dev))) + return -EINVAL; + } + + if (!(journal->j_flags & JBD2_BARRIER)) + ext4_msg(sb, KERN_INFO, "barriers disabled"); + + if (!EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER)) + err = jbd2_journal_wipe(journal, !really_read_only); + if (!err) { + char *save = kmalloc(EXT4_S_ERR_LEN, GFP_KERNEL); + if (save) + memcpy(save, ((char *) es) + + EXT4_S_ERR_START, EXT4_S_ERR_LEN); + err = jbd2_journal_load(journal); + if (save) + memcpy(((char *) es) + EXT4_S_ERR_START, + save, EXT4_S_ERR_LEN); + kfree(save); + } + + if (err) { + ext4_msg(sb, KERN_ERR, "error loading journal"); + jbd2_journal_destroy(journal); + return err; + } + + EXT4_SB(sb)->s_journal = journal; + ext4_clear_journal_err(sb, es); + + if (!really_read_only && journal_devnum && + journal_devnum != le32_to_cpu(es->s_journal_dev)) { + es->s_journal_dev = cpu_to_le32(journal_devnum); + + /* Make sure we flush the recovery flag to disk. 
*/ + ext4_commit_super(sb, 1); + } + + return 0; +} + +static int ext4_commit_super(struct super_block *sb, int sync) +{ + struct ext4_super_block *es = EXT4_SB(sb)->s_es; + struct buffer_head *sbh = EXT4_SB(sb)->s_sbh; + int error = 0; + + if (!sbh || block_device_ejected(sb)) + return error; + if (buffer_write_io_error(sbh)) { + /* + * Oh, dear. A previous attempt to write the + * superblock failed. This could happen because the + * USB device was yanked out. Or it could happen to + * be a transient write error and maybe the block will + * be remapped. Nothing we can do but to retry the + * write and hope for the best. + */ + ext4_msg(sb, KERN_ERR, "previous I/O error to " + "superblock detected"); + clear_buffer_write_io_error(sbh); + set_buffer_uptodate(sbh); + } + /* + * If the file system is mounted read-only, don't update the + * superblock write time. This avoids updating the superblock + * write time when we are mounting the root file system + * read/only but we need to replay the journal; at that point, + * for people who are east of GMT and who make their clock + * tick in localtime for Windows bug-for-bug compatibility, + * the clock is set in the future, and this will cause e2fsck + * to complain and force a full file system check. 
+ */ + if (!(sb->s_flags & MS_RDONLY)) + es->s_wtime = cpu_to_le32(get_seconds()); + if (sb->s_bdev->bd_part) + es->s_kbytes_written = + cpu_to_le64(EXT4_SB(sb)->s_kbytes_written + + ((part_stat_read(sb->s_bdev->bd_part, sectors[1]) - + EXT4_SB(sb)->s_sectors_written_start) >> 1)); + else + es->s_kbytes_written = + cpu_to_le64(EXT4_SB(sb)->s_kbytes_written); + ext4_free_blocks_count_set(es, + EXT4_C2B(EXT4_SB(sb), percpu_counter_sum_positive( + &EXT4_SB(sb)->s_freeclusters_counter))); + es->s_free_inodes_count = + cpu_to_le32(percpu_counter_sum_positive( + &EXT4_SB(sb)->s_freeinodes_counter)); + sb->s_dirt = 0; + BUFFER_TRACE(sbh, "marking dirty"); + mark_buffer_dirty(sbh); + if (sync) { + error = sync_dirty_buffer(sbh); + if (error) + return error; + + error = buffer_write_io_error(sbh); + if (error) { + ext4_msg(sb, KERN_ERR, "I/O error while writing " + "superblock"); + clear_buffer_write_io_error(sbh); + set_buffer_uptodate(sbh); + } + } + return error; +} + +/* + * Have we just finished recovery? If so, and if we are mounting (or + * remounting) the filesystem readonly, then we will end up with a + * consistent fs on disk. Record that fact. + */ +static void ext4_mark_recovery_complete(struct super_block *sb, + struct ext4_super_block *es) +{ + journal_t *journal = EXT4_SB(sb)->s_journal; + + if (!EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_HAS_JOURNAL)) { + BUG_ON(journal != NULL); + return; + } + jbd2_journal_lock_updates(journal); + if (jbd2_journal_flush(journal) < 0) + goto out; + + if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER) && + sb->s_flags & MS_RDONLY) { + EXT4_CLEAR_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER); + ext4_commit_super(sb, 1); + } + +out: + jbd2_journal_unlock_updates(journal); +} + +/* + * If we are mounting (or read-write remounting) a filesystem whose journal + * has recorded an error from a previous lifetime, move that error to the + * main filesystem now. 
+ */ +static void ext4_clear_journal_err(struct super_block *sb, + struct ext4_super_block *es) +{ + journal_t *journal; + int j_errno; + const char *errstr; + + BUG_ON(!EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_HAS_JOURNAL)); + + journal = EXT4_SB(sb)->s_journal; + + /* + * Now check for any error status which may have been recorded in the + * journal by a prior ext4_error() or ext4_abort() + */ + + j_errno = jbd2_journal_errno(journal); + if (j_errno) { + char nbuf[16]; + + errstr = ext4_decode_error(sb, j_errno, nbuf); + ext4_warning(sb, "Filesystem error recorded " + "from previous mount: %s", errstr); + ext4_warning(sb, "Marking fs in need of filesystem check."); + + EXT4_SB(sb)->s_mount_state |= EXT4_ERROR_FS; + es->s_state |= cpu_to_le16(EXT4_ERROR_FS); + ext4_commit_super(sb, 1); + + jbd2_journal_clear_err(journal); + } +} + +/* + * Force the running and committing transactions to commit, + * and wait on the commit. + */ +int ext4_force_commit(struct super_block *sb) +{ + journal_t *journal; + int ret = 0; + + if (sb->s_flags & MS_RDONLY) + return 0; + + journal = EXT4_SB(sb)->s_journal; + if (journal) { + vfs_check_frozen(sb, SB_FREEZE_TRANS); + ret = ext4_journal_force_commit(journal); + } + + return ret; +} + +static void ext4_write_super(struct super_block *sb) +{ + lock_super(sb); + ext4_commit_super(sb, 1); + unlock_super(sb); +} + +static int ext4_sync_fs(struct super_block *sb, int wait) +{ + int ret = 0; + tid_t target; + struct ext4_sb_info *sbi = EXT4_SB(sb); + + trace_ext4_sync_fs(sb, wait); + flush_workqueue(sbi->dio_unwritten_wq); + if (jbd2_journal_start_commit(sbi->s_journal, &target)) { + if (wait) + jbd2_log_wait_commit(sbi->s_journal, target); + } + return ret; +} + +/* + * LVM calls this function before a (read-only) snapshot is created. This + * gives us a chance to flush the journal completely and mark the fs clean. 
+ * + * Note that only this function cannot bring a filesystem to be in a clean + * state independently, because ext4 prevents a new handle from being started + * by @sb->s_frozen, which stays in an upper layer. It thus needs help from + * the upper layer. + */ +static int ext4_freeze(struct super_block *sb) +{ + int error = 0; + journal_t *journal; + + if (sb->s_flags & MS_RDONLY) + return 0; + + journal = EXT4_SB(sb)->s_journal; + + /* Now we set up the journal barrier. */ + jbd2_journal_lock_updates(journal); + + /* + * Don't clear the needs_recovery flag if we failed to flush + * the journal. + */ + error = jbd2_journal_flush(journal); + if (error < 0) + goto out; + + /* Journal blocked and flushed, clear needs_recovery flag. */ + EXT4_CLEAR_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER); + error = ext4_commit_super(sb, 1); +out: + /* we rely on s_frozen to stop further updates */ + jbd2_journal_unlock_updates(EXT4_SB(sb)->s_journal); + return error; +} + +/* + * Called by LVM after the snapshot is done. We need to reset the RECOVER + * flag here, even though the filesystem is not technically dirty yet. + */ +static int ext4_unfreeze(struct super_block *sb) +{ + if (sb->s_flags & MS_RDONLY) + return 0; + + lock_super(sb); + /* Reset the needs_recovery flag before the fs is unlocked. 
/*
 * Apply a new set of mount options to an already mounted filesystem.
 *
 * @sb:    the mounted superblock
 * @flags: requested mount flags; only MS_RDONLY is examined here
 * @data:  option string handed to parse_options()
 *
 * The current flags and options are saved first so that any failure can
 * roll them back at restore_opts.  The whole update runs under
 * lock_super().  Returns 0 on success, or a negative errno (-EINVAL,
 * -EROFS, or the error from dquot_suspend()) with the old options back in
 * place.
 */
static int ext4_remount(struct super_block *sb, int *flags, char *data)
{
	struct ext4_super_block *es;
	struct ext4_sb_info *sbi = EXT4_SB(sb);
	unsigned long old_sb_flags;
	struct ext4_mount_options old_opts;
	int enable_quota = 0;
	ext4_group_t g;
	unsigned int journal_ioprio = DEFAULT_JOURNAL_IOPRIO;
	int err = 0;
#ifdef CONFIG_QUOTA
	int i;
#endif
	/* Private copy of the option string, used only for the log message */
	char *orig_data = kstrdup(data, GFP_KERNEL);

	/* Store the original options */
	lock_super(sb);
	old_sb_flags = sb->s_flags;
	old_opts.s_mount_opt = sbi->s_mount_opt;
	old_opts.s_mount_opt2 = sbi->s_mount_opt2;
	old_opts.s_resuid = sbi->s_resuid;
	old_opts.s_resgid = sbi->s_resgid;
	old_opts.s_commit_interval = sbi->s_commit_interval;
	old_opts.s_min_batch_time = sbi->s_min_batch_time;
	old_opts.s_max_batch_time = sbi->s_max_batch_time;
#ifdef CONFIG_QUOTA
	old_opts.s_jquota_fmt = sbi->s_jquota_fmt;
	for (i = 0; i < MAXQUOTAS; i++)
		old_opts.s_qf_names[i] = sbi->s_qf_names[i];
#endif
	/* Start from the journal thread's current I/O priority, if it has one */
	if (sbi->s_journal && sbi->s_journal->j_task->io_context)
		journal_ioprio = sbi->s_journal->j_task->io_context->ioprio;

	/*
	 * Allow the "check" option to be passed as a remount option.
	 */
	if (!parse_options(data, sb, NULL, &journal_ioprio, 1)) {
		err = -EINVAL;
		goto restore_opts;
	}

	if (sbi->s_mount_flags & EXT4_MF_FS_ABORTED)
		ext4_abort(sb, "Abort forced by user");

	/* Make MS_POSIXACL follow the (possibly changed) POSIX_ACL option */
	sb->s_flags = (sb->s_flags & ~MS_POSIXACL) |
		(test_opt(sb, POSIX_ACL) ? MS_POSIXACL : 0);

	es = sbi->s_es;

	if (sbi->s_journal) {
		ext4_init_journal_params(sb, sbi->s_journal);
		set_task_ioprio(sbi->s_journal->j_task, journal_ioprio);
	}

	/* Only a change of the read-only state needs the work below */
	if ((*flags & MS_RDONLY) != (sb->s_flags & MS_RDONLY)) {
		if (sbi->s_mount_flags & EXT4_MF_FS_ABORTED) {
			err = -EROFS;
			goto restore_opts;
		}

		if (*flags & MS_RDONLY) {
			err = dquot_suspend(sb, -1);
			if (err < 0)
				goto restore_opts;

			/*
			 * First of all, the unconditional stuff we have to do
			 * to disable replay of the journal when we next remount
			 */
			sb->s_flags |= MS_RDONLY;

			/*
			 * OK, test if we are remounting a valid rw partition
			 * readonly, and if so set the rdonly flag and then
			 * mark the partition as valid again.
			 */
			if (!(es->s_state & cpu_to_le16(EXT4_VALID_FS)) &&
			    (sbi->s_mount_state & EXT4_VALID_FS))
				es->s_state = cpu_to_le16(sbi->s_mount_state);

			if (sbi->s_journal)
				ext4_mark_recovery_complete(sb, es);
		} else {
			/* Make sure we can mount this feature set readwrite */
			if (!ext4_feature_set_ok(sb, 0)) {
				err = -EROFS;
				goto restore_opts;
			}
			/*
			 * Make sure the group descriptor checksums
			 * are sane.  If they aren't, refuse to remount r/w.
			 */
			for (g = 0; g < sbi->s_groups_count; g++) {
				struct ext4_group_desc *gdp =
					ext4_get_group_desc(sb, g, NULL);

				if (!ext4_group_desc_csum_verify(sbi, g, gdp)) {
					ext4_msg(sb, KERN_ERR,
	       "ext4_remount: Checksum for group %u failed (%u!=%u)",
		g, le16_to_cpu(ext4_group_desc_csum(sbi, g, gdp)),
					       le16_to_cpu(gdp->bg_checksum));
					err = -EINVAL;
					goto restore_opts;
				}
			}

			/*
			 * If we have an unprocessed orphan list hanging
			 * around from a previously readonly bdev mount,
			 * require a full umount/remount for now.
			 */
			if (es->s_last_orphan) {
				ext4_msg(sb, KERN_WARNING, "Couldn't "
				       "remount RDWR because of unprocessed "
				       "orphan inode list. Please "
				       "umount/remount instead");
				err = -EINVAL;
				goto restore_opts;
			}

			/*
			 * Mounting a RDONLY partition read-write, so reread
			 * and store the current valid flag.  (It may have
			 * been changed by e2fsck since we originally mounted
			 * the partition.)
			 */
			if (sbi->s_journal)
				ext4_clear_journal_err(sb, es);
			sbi->s_mount_state = le16_to_cpu(es->s_state);
			/* MS_RDONLY is dropped only if ext4_setup_super() returns 0 */
			if (!ext4_setup_super(sb, es, 0))
				sb->s_flags &= ~MS_RDONLY;
			if (EXT4_HAS_INCOMPAT_FEATURE(sb,
						     EXT4_FEATURE_INCOMPAT_MMP))
				if (ext4_multi_mount_protect(sb,
						le64_to_cpu(es->s_mmp_block))) {
					err = -EROFS;
					goto restore_opts;
				}
			/* Quotas are resumed after the superblock lock is dropped */
			enable_quota = 1;
		}
	}

	/*
	 * Reinitialize lazy itable initialization thread based on
	 * current settings
	 */
	if ((sb->s_flags & MS_RDONLY) || !test_opt(sb, INIT_INODE_TABLE))
		ext4_unregister_li_request(sb);
	else {
		ext4_group_t first_not_zeroed;
		first_not_zeroed = ext4_has_uninit_itable(sb);
		ext4_register_li_request(sb, first_not_zeroed);
	}

	ext4_setup_system_zone(sb);
	/* Without a journal the superblock is written out directly */
	if (sbi->s_journal == NULL)
		ext4_commit_super(sb, 1);

#ifdef CONFIG_QUOTA
	/* Release old quota file names */
	for (i = 0; i < MAXQUOTAS; i++)
		if (old_opts.s_qf_names[i] &&
		    old_opts.s_qf_names[i] != sbi->s_qf_names[i])
			kfree(old_opts.s_qf_names[i]);
#endif
	unlock_super(sb);
	if (enable_quota)
		dquot_resume(sb, -1);

	ext4_msg(sb, KERN_INFO, "re-mounted. Opts: %s", orig_data);
	kfree(orig_data);
	return 0;

restore_opts:
	/* Failure: put every saved flag and option back as it was on entry */
	sb->s_flags = old_sb_flags;
	sbi->s_mount_opt = old_opts.s_mount_opt;
	sbi->s_mount_opt2 = old_opts.s_mount_opt2;
	sbi->s_resuid = old_opts.s_resuid;
	sbi->s_resgid = old_opts.s_resgid;
	sbi->s_commit_interval = old_opts.s_commit_interval;
	sbi->s_min_batch_time = old_opts.s_min_batch_time;
	sbi->s_max_batch_time = old_opts.s_max_batch_time;
#ifdef CONFIG_QUOTA
	sbi->s_jquota_fmt = old_opts.s_jquota_fmt;
	for (i = 0; i < MAXQUOTAS; i++) {
		/* Free only names that differ from the saved ones */
		if (sbi->s_qf_names[i] &&
		    old_opts.s_qf_names[i] != sbi->s_qf_names[i])
			kfree(sbi->s_qf_names[i]);
		sbi->s_qf_names[i] = old_opts.s_qf_names[i];
	}
#endif
	unlock_super(sb);
	kfree(orig_data);
	return err;
}
+ */ +static int ext4_statfs(struct dentry *dentry, struct kstatfs *buf) +{ + struct super_block *sb = dentry->d_sb; + struct ext4_sb_info *sbi = EXT4_SB(sb); + struct ext4_super_block *es = sbi->s_es; + struct ext4_group_desc *gdp; + u64 fsid; + s64 bfree; + + if (test_opt(sb, MINIX_DF)) { + sbi->s_overhead_last = 0; + } else if (es->s_overhead_clusters) { + sbi->s_overhead_last = le32_to_cpu(es->s_overhead_clusters); + } else if (sbi->s_blocks_last != ext4_blocks_count(es)) { + ext4_group_t i, ngroups = ext4_get_groups_count(sb); + ext4_fsblk_t overhead = 0; + + /* + * Compute the overhead (FS structures). This is constant + * for a given filesystem unless the number of block groups + * changes so we cache the previous value until it does. + */ + + /* + * All of the blocks before first_data_block are + * overhead + */ + overhead = EXT4_B2C(sbi, le32_to_cpu(es->s_first_data_block)); + + /* + * Add the overhead found in each block group + */ + for (i = 0; i < ngroups; i++) { + gdp = ext4_get_group_desc(sb, i, NULL); + overhead += ext4_num_overhead_clusters(sb, i, gdp); + cond_resched(); + } + sbi->s_overhead_last = overhead; + smp_wmb(); + sbi->s_blocks_last = ext4_blocks_count(es); + } + + buf->f_type = EXT4_SUPER_MAGIC; + buf->f_bsize = sb->s_blocksize; + buf->f_blocks = (ext4_blocks_count(es) - + EXT4_C2B(sbi, sbi->s_overhead_last)); + bfree = percpu_counter_sum_positive(&sbi->s_freeclusters_counter) - + percpu_counter_sum_positive(&sbi->s_dirtyclusters_counter); + /* prevent underflow in case that few free space is available */ + buf->f_bfree = EXT4_C2B(sbi, max_t(s64, bfree, 0)); + buf->f_bavail = buf->f_bfree - ext4_r_blocks_count(es); + if (buf->f_bfree < ext4_r_blocks_count(es)) + buf->f_bavail = 0; + buf->f_files = le32_to_cpu(es->s_inodes_count); + buf->f_ffree = percpu_counter_sum_positive(&sbi->s_freeinodes_counter); + buf->f_namelen = EXT4_NAME_LEN; + fsid = le64_to_cpup((void *)es->s_uuid) ^ + le64_to_cpup((void *)es->s_uuid + sizeof(u64)); + 
buf->f_fsid.val[0] = fsid & 0xFFFFFFFFUL; + buf->f_fsid.val[1] = (fsid >> 32) & 0xFFFFFFFFUL; + + return 0; +} + +/* Helper function for writing quotas on sync - we need to start transaction + * before quota file is locked for write. Otherwise the are possible deadlocks: + * Process 1 Process 2 + * ext4_create() quota_sync() + * jbd2_journal_start() write_dquot() + * dquot_initialize() down(dqio_mutex) + * down(dqio_mutex) jbd2_journal_start() + * + */ + +#ifdef CONFIG_QUOTA + +static inline struct inode *dquot_to_inode(struct dquot *dquot) +{ + return sb_dqopt(dquot->dq_sb)->files[dquot->dq_type]; +} + +static int ext4_write_dquot(struct dquot *dquot) +{ + int ret, err; + handle_t *handle; + struct inode *inode; + + inode = dquot_to_inode(dquot); + handle = ext4_journal_start(inode, + EXT4_QUOTA_TRANS_BLOCKS(dquot->dq_sb)); + if (IS_ERR(handle)) + return PTR_ERR(handle); + ret = dquot_commit(dquot); + err = ext4_journal_stop(handle); + if (!ret) + ret = err; + return ret; +} + +static int ext4_acquire_dquot(struct dquot *dquot) +{ + int ret, err; + handle_t *handle; + + handle = ext4_journal_start(dquot_to_inode(dquot), + EXT4_QUOTA_INIT_BLOCKS(dquot->dq_sb)); + if (IS_ERR(handle)) + return PTR_ERR(handle); + ret = dquot_acquire(dquot); + err = ext4_journal_stop(handle); + if (!ret) + ret = err; + return ret; +} + +static int ext4_release_dquot(struct dquot *dquot) +{ + int ret, err; + handle_t *handle; + + handle = ext4_journal_start(dquot_to_inode(dquot), + EXT4_QUOTA_DEL_BLOCKS(dquot->dq_sb)); + if (IS_ERR(handle)) { + /* Release dquot anyway to avoid endless cycle in dqput() */ + dquot_release(dquot); + return PTR_ERR(handle); + } + ret = dquot_release(dquot); + err = ext4_journal_stop(handle); + if (!ret) + ret = err; + return ret; +} + +static int ext4_mark_dquot_dirty(struct dquot *dquot) +{ + /* Are we journaling quotas? 
*/ + if (EXT4_SB(dquot->dq_sb)->s_qf_names[USRQUOTA] || + EXT4_SB(dquot->dq_sb)->s_qf_names[GRPQUOTA]) { + dquot_mark_dquot_dirty(dquot); + return ext4_write_dquot(dquot); + } else { + return dquot_mark_dquot_dirty(dquot); + } +} + +static int ext4_write_info(struct super_block *sb, int type) +{ + int ret, err; + handle_t *handle; + + /* Data block + inode block */ + handle = ext4_journal_start(sb->s_root->d_inode, 2); + if (IS_ERR(handle)) + return PTR_ERR(handle); + ret = dquot_commit_info(sb, type); + err = ext4_journal_stop(handle); + if (!ret) + ret = err; + return ret; +} + +/* + * Turn on quotas during mount time - we need to find + * the quota file and such... + */ +static int ext4_quota_on_mount(struct super_block *sb, int type) +{ + return dquot_quota_on_mount(sb, EXT4_SB(sb)->s_qf_names[type], + EXT4_SB(sb)->s_jquota_fmt, type); +} + +/* + * Standard function to be called on quota_on + */ +static int ext4_quota_on(struct super_block *sb, int type, int format_id, + struct path *path) +{ + int err; + + if (!test_opt(sb, QUOTA)) + return -EINVAL; + + /* Quotafile not on the same filesystem? */ + if (path->dentry->d_sb != sb) + return -EXDEV; + /* Journaling quota? */ + if (EXT4_SB(sb)->s_qf_names[type]) { + /* Quotafile not in fs root? */ + if (path->dentry->d_parent != sb->s_root) + ext4_msg(sb, KERN_WARNING, + "Quota file not on filesystem root. " + "Journaled quota will not work"); + } + + /* + * When we journal data on quota file, we have to flush journal to see + * all updates to the file when we bypass pagecache... + */ + if (EXT4_SB(sb)->s_journal && + ext4_should_journal_data(path->dentry->d_inode)) { + /* + * We don't need to lock updates but journal_flush() could + * otherwise be livelocked... 
+ */ + jbd2_journal_lock_updates(EXT4_SB(sb)->s_journal); + err = jbd2_journal_flush(EXT4_SB(sb)->s_journal); + jbd2_journal_unlock_updates(EXT4_SB(sb)->s_journal); + if (err) + return err; + } + + return dquot_quota_on(sb, type, format_id, path); +} + +static int ext4_quota_off(struct super_block *sb, int type) +{ + struct inode *inode = sb_dqopt(sb)->files[type]; + handle_t *handle; + + /* Force all delayed allocation blocks to be allocated. + * Caller already holds s_umount sem */ + if (test_opt(sb, DELALLOC)) + sync_filesystem(sb); + + if (!inode) + goto out; + + /* Update modification times of quota files when userspace can + * start looking at them */ + handle = ext4_journal_start(inode, 1); + if (IS_ERR(handle)) + goto out; + inode->i_mtime = inode->i_ctime = CURRENT_TIME; + ext4_mark_inode_dirty(handle, inode); + ext4_journal_stop(handle); + +out: + return dquot_quota_off(sb, type); +} + +/* Read data from quotafile - avoid pagecache and such because we cannot afford + * acquiring the locks... As quota files are never truncated and quota code + * itself serializes the operations (and no one else should touch the files) + * we don't have to be afraid of races */ +static ssize_t ext4_quota_read(struct super_block *sb, int type, char *data, + size_t len, loff_t off) +{ + struct inode *inode = sb_dqopt(sb)->files[type]; + ext4_lblk_t blk = off >> EXT4_BLOCK_SIZE_BITS(sb); + int err = 0; + int offset = off & (sb->s_blocksize - 1); + int tocopy; + size_t toread; + struct buffer_head *bh; + loff_t i_size = i_size_read(inode); + + if (off > i_size) + return 0; + if (off+len > i_size) + len = i_size-off; + toread = len; + while (toread > 0) { + tocopy = sb->s_blocksize - offset < toread ? + sb->s_blocksize - offset : toread; + bh = ext4_bread(NULL, inode, blk, 0, &err); + if (err) + return err; + if (!bh) /* A hole? 
*/ + memset(data, 0, tocopy); + else + memcpy(data, bh->b_data+offset, tocopy); + brelse(bh); + offset = 0; + toread -= tocopy; + data += tocopy; + blk++; + } + return len; +} + +/* Write to quotafile (we know the transaction is already started and has + * enough credits) */ +static ssize_t ext4_quota_write(struct super_block *sb, int type, + const char *data, size_t len, loff_t off) +{ + struct inode *inode = sb_dqopt(sb)->files[type]; + ext4_lblk_t blk = off >> EXT4_BLOCK_SIZE_BITS(sb); + int err = 0; + int offset = off & (sb->s_blocksize - 1); + struct buffer_head *bh; + handle_t *handle = journal_current_handle(); + + if (EXT4_SB(sb)->s_journal && !handle) { + ext4_msg(sb, KERN_WARNING, "Quota write (off=%llu, len=%llu)" + " cancelled because transaction is not started", + (unsigned long long)off, (unsigned long long)len); + return -EIO; + } + /* + * Since we account only one data block in transaction credits, + * then it is impossible to cross a block boundary. + */ + if (sb->s_blocksize - offset < len) { + ext4_msg(sb, KERN_WARNING, "Quota write (off=%llu, len=%llu)" + " cancelled because not block aligned", + (unsigned long long)off, (unsigned long long)len); + return -EIO; + } + + mutex_lock_nested(&inode->i_mutex, I_MUTEX_QUOTA); + bh = ext4_bread(handle, inode, blk, 1, &err); + if (!bh) + goto out; + err = ext4_journal_get_write_access(handle, bh); + if (err) { + brelse(bh); + goto out; + } + lock_buffer(bh); + memcpy(bh->b_data+offset, data, len); + flush_dcache_page(bh->b_page); + unlock_buffer(bh); + err = ext4_handle_dirty_metadata(handle, NULL, bh); + brelse(bh); +out: + if (err) { + mutex_unlock(&inode->i_mutex); + return err; + } + if (inode->i_size < off + len) { + i_size_write(inode, off + len); + EXT4_I(inode)->i_disksize = inode->i_size; + ext4_mark_inode_dirty(handle, inode); + } + mutex_unlock(&inode->i_mutex); + return len; +} + +#endif + +static struct dentry *ext4_mount(struct file_system_type *fs_type, int flags, + const char *dev_name, 
void *data) +{ + return mount_bdev(fs_type, flags, dev_name, data, ext4_fill_super); +} + +#if !defined(CONFIG_EXT2_FS) && !defined(CONFIG_EXT2_FS_MODULE) && defined(CONFIG_EXT4_USE_FOR_EXT23) +static inline void register_as_ext2(void) +{ + int err = register_filesystem(&ext2_fs_type); + if (err) + printk(KERN_WARNING + "EXT4-fs: Unable to register as ext2 (%d)\n", err); +} + +static inline void unregister_as_ext2(void) +{ + unregister_filesystem(&ext2_fs_type); +} + +static inline int ext2_feature_set_ok(struct super_block *sb) +{ + if (EXT4_HAS_INCOMPAT_FEATURE(sb, ~EXT2_FEATURE_INCOMPAT_SUPP)) + return 0; + if (sb->s_flags & MS_RDONLY) + return 1; + if (EXT4_HAS_RO_COMPAT_FEATURE(sb, ~EXT2_FEATURE_RO_COMPAT_SUPP)) + return 0; + return 1; +} +MODULE_ALIAS("ext2"); +#else +static inline void register_as_ext2(void) { } +static inline void unregister_as_ext2(void) { } +static inline int ext2_feature_set_ok(struct super_block *sb) { return 0; } +#endif + +#if !defined(CONFIG_EXT3_FS) && !defined(CONFIG_EXT3_FS_MODULE) && defined(CONFIG_EXT4_USE_FOR_EXT23) +static inline void register_as_ext3(void) +{ + int err = register_filesystem(&ext3_fs_type); + if (err) + printk(KERN_WARNING + "EXT4-fs: Unable to register as ext3 (%d)\n", err); +} + +static inline void unregister_as_ext3(void) +{ + unregister_filesystem(&ext3_fs_type); +} + +static inline int ext3_feature_set_ok(struct super_block *sb) +{ + if (EXT4_HAS_INCOMPAT_FEATURE(sb, ~EXT3_FEATURE_INCOMPAT_SUPP)) + return 0; + if (!EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_HAS_JOURNAL)) + return 0; + if (sb->s_flags & MS_RDONLY) + return 1; + if (EXT4_HAS_RO_COMPAT_FEATURE(sb, ~EXT3_FEATURE_RO_COMPAT_SUPP)) + return 0; + return 1; +} +MODULE_ALIAS("ext3"); +#else +static inline void register_as_ext3(void) { } +static inline void unregister_as_ext3(void) { } +static inline int ext3_feature_set_ok(struct super_block *sb) { return 0; } +#endif + +static struct file_system_type ext4_fs_type = { + .owner = THIS_MODULE, + 
.name = "ext4", + .mount = ext4_mount, + .kill_sb = kill_block_super, + .fs_flags = FS_REQUIRES_DEV, +}; + +static int __init ext4_init_feat_adverts(void) +{ + struct ext4_features *ef; + int ret = -ENOMEM; + + ef = kzalloc(sizeof(struct ext4_features), GFP_KERNEL); + if (!ef) + goto out; + + ef->f_kobj.kset = ext4_kset; + init_completion(&ef->f_kobj_unregister); + ret = kobject_init_and_add(&ef->f_kobj, &ext4_feat_ktype, NULL, + "features"); + if (ret) { + kfree(ef); + goto out; + } + + ext4_feat = ef; + ret = 0; +out: + return ret; +} + +static void ext4_exit_feat_adverts(void) +{ + kobject_put(&ext4_feat->f_kobj); + wait_for_completion(&ext4_feat->f_kobj_unregister); + kfree(ext4_feat); +} + +/* Shared across all ext4 file systems */ +wait_queue_head_t ext4__ioend_wq[EXT4_WQ_HASH_SZ]; +struct mutex ext4__aio_mutex[EXT4_WQ_HASH_SZ]; + +static int __init ext4_init_fs(void) +{ + int i, err; + + ext4_li_info = NULL; + mutex_init(&ext4_li_mtx); + + ext4_check_flag_values(); + + for (i = 0; i < EXT4_WQ_HASH_SZ; i++) { + mutex_init(&ext4__aio_mutex[i]); + init_waitqueue_head(&ext4__ioend_wq[i]); + } + + err = ext4_init_pageio(); + if (err) + return err; + err = ext4_init_system_zone(); + if (err) + goto out6; + ext4_kset = kset_create_and_add("ext4", NULL, fs_kobj); + if (!ext4_kset) + goto out5; + ext4_proc_root = proc_mkdir("fs/ext4", NULL); + + err = ext4_init_feat_adverts(); + if (err) + goto out4; + + err = ext4_init_mballoc(); + if (err) + goto out3; + + err = ext4_init_xattr(); + if (err) + goto out2; + err = init_inodecache(); + if (err) + goto out1; + register_as_ext3(); + register_as_ext2(); + err = register_filesystem(&ext4_fs_type); + if (err) + goto out; + + return 0; +out: + unregister_as_ext2(); + unregister_as_ext3(); + destroy_inodecache(); +out1: + ext4_exit_xattr(); +out2: + ext4_exit_mballoc(); +out3: + ext4_exit_feat_adverts(); +out4: + if (ext4_proc_root) + remove_proc_entry("fs/ext4", NULL); + kset_unregister(ext4_kset); +out5: + 
ext4_exit_system_zone(); +out6: + ext4_exit_pageio(); + return err; +} + +static void __exit ext4_exit_fs(void) +{ + ext4_destroy_lazyinit_thread(); + unregister_as_ext2(); + unregister_as_ext3(); + unregister_filesystem(&ext4_fs_type); + destroy_inodecache(); + ext4_exit_xattr(); + ext4_exit_mballoc(); + ext4_exit_feat_adverts(); + remove_proc_entry("fs/ext4", NULL); + kset_unregister(ext4_kset); + ext4_exit_system_zone(); + ext4_exit_pageio(); +} + +MODULE_AUTHOR("Remy Card, Stephen Tweedie, Andrew Morton, Andreas Dilger, Theodore Ts'o and others"); +MODULE_DESCRIPTION("Fourth Extended Filesystem"); +MODULE_LICENSE("GPL"); +module_init(ext4_init_fs) +module_exit(ext4_exit_fs) diff --git a/fs/ext4/symlink.c b/fs/ext4/symlink.c new file mode 100644 index 00000000..ed9354af --- /dev/null +++ b/fs/ext4/symlink.c @@ -0,0 +1,56 @@ +/* + * linux/fs/ext4/symlink.c + * + * Only fast symlinks left here - the rest is done by generic code. AV, 1999 + * + * Copyright (C) 1992, 1993, 1994, 1995 + * Remy Card (card@masi.ibp.fr) + * Laboratoire MASI - Institut Blaise Pascal + * Universite Pierre et Marie Curie (Paris VI) + * + * from + * + * linux/fs/minix/symlink.c + * + * Copyright (C) 1991, 1992 Linus Torvalds + * + * ext4 symlink handling code + */ + +#include +#include +#include +#include "ext4.h" +#include "xattr.h" + +static void *ext4_follow_link(struct dentry *dentry, struct nameidata *nd) +{ + struct ext4_inode_info *ei = EXT4_I(dentry->d_inode); + nd_set_link(nd, (char *) ei->i_data); + return NULL; +} + +const struct inode_operations ext4_symlink_inode_operations = { + .readlink = generic_readlink, + .follow_link = page_follow_link_light, + .put_link = page_put_link, + .setattr = ext4_setattr, +#ifdef CONFIG_EXT4_FS_XATTR + .setxattr = generic_setxattr, + .getxattr = generic_getxattr, + .listxattr = ext4_listxattr, + .removexattr = generic_removexattr, +#endif +}; + +const struct inode_operations ext4_fast_symlink_inode_operations = { + .readlink = generic_readlink, 
+ .follow_link = ext4_follow_link, + .setattr = ext4_setattr, +#ifdef CONFIG_EXT4_FS_XATTR + .setxattr = generic_setxattr, + .getxattr = generic_getxattr, + .listxattr = ext4_listxattr, + .removexattr = generic_removexattr, +#endif +}; diff --git a/fs/ext4/truncate.h b/fs/ext4/truncate.h new file mode 100644 index 00000000..011ba667 --- /dev/null +++ b/fs/ext4/truncate.h @@ -0,0 +1,43 @@ +/* + * linux/fs/ext4/truncate.h + * + * Common inline functions needed for truncate support + */ + +/* + * Truncate blocks that were not used by write. We have to truncate the + * pagecache as well so that corresponding buffers get properly unmapped. + */ +static inline void ext4_truncate_failed_write(struct inode *inode) +{ + truncate_inode_pages(inode->i_mapping, inode->i_size); + ext4_truncate(inode); +} + +/* + * Work out how many blocks we need to proceed with the next chunk of a + * truncate transaction. + */ +static inline unsigned long ext4_blocks_for_truncate(struct inode *inode) +{ + ext4_lblk_t needed; + + needed = inode->i_blocks >> (inode->i_sb->s_blocksize_bits - 9); + + /* Give ourselves just enough room to cope with inodes in which + * i_blocks is corrupt: we've seen disk corruptions in the past + * which resulted in random data in an inode which looked enough + * like a regular file for ext4 to try to delete it. Things + * will go a bit crazy if that happens, but at least we should + * try not to panic the whole kernel. */ + if (needed < 2) + needed = 2; + + /* But we need to bound the transaction so we don't overflow the + * journal. */ + if (needed > EXT4_MAX_TRANS_DATA) + needed = EXT4_MAX_TRANS_DATA; + + return EXT4_DATA_TRANS_BLOCKS(inode->i_sb) + needed; +} + diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c new file mode 100644 index 00000000..e88748e5 --- /dev/null +++ b/fs/ext4/xattr.c @@ -0,0 +1,1608 @@ +/* + * linux/fs/ext4/xattr.c + * + * Copyright (C) 2001-2003 Andreas Gruenbacher, + * + * Fix by Harrison Xing . 
+ * Ext4 code with a lot of help from Eric Jarman . + * Extended attributes for symlinks and special files added per + * suggestion of Luka Renko . + * xattr consolidation Copyright (c) 2004 James Morris , + * Red Hat Inc. + * ea-in-inode support by Alex Tomas aka bzzz + * and Andreas Gruenbacher . + */ + +/* + * Extended attributes are stored directly in inodes (on file systems with + * inodes bigger than 128 bytes) and on additional disk blocks. The i_file_acl + * field contains the block number if an inode uses an additional block. All + * attributes must fit in the inode and one additional block. Blocks that + * contain the identical set of attributes may be shared among several inodes. + * Identical blocks are detected by keeping a cache of blocks that have + * recently been accessed. + * + * The attributes in inodes and on blocks have a different header; the entries + * are stored in the same format: + * + * +------------------+ + * | header | + * | entry 1 | | + * | entry 2 | | growing downwards + * | entry 3 | v + * | four null bytes | + * | . . . | + * | value 1 | ^ + * | value 3 | | growing upwards + * | value 2 | | + * +------------------+ + * + * The header is followed by multiple entry descriptors. In disk blocks, the + * entry descriptors are kept sorted. In inodes, they are unsorted. The + * attribute values are aligned to the end of the block in no specific order. + * + * Locking strategy + * ---------------- + * EXT4_I(inode)->i_file_acl is protected by EXT4_I(inode)->xattr_sem. + * EA blocks are only changed if they are exclusive to an inode, so + * holding xattr_sem also means that nothing but the EA block's reference + * count can change. Multiple writers to the same block are synchronized + * by the buffer lock. 
+ */ + +#include +#include +#include +#include +#include +#include +#include "ext4_jbd2.h" +#include "ext4.h" +#include "xattr.h" +#include "acl.h" + +#define BHDR(bh) ((struct ext4_xattr_header *)((bh)->b_data)) +#define ENTRY(ptr) ((struct ext4_xattr_entry *)(ptr)) +#define BFIRST(bh) ENTRY(BHDR(bh)+1) +#define IS_LAST_ENTRY(entry) (*(__u32 *)(entry) == 0) + +#ifdef EXT4_XATTR_DEBUG +# define ea_idebug(inode, f...) do { \ + printk(KERN_DEBUG "inode %s:%lu: ", \ + inode->i_sb->s_id, inode->i_ino); \ + printk(f); \ + printk("\n"); \ + } while (0) +# define ea_bdebug(bh, f...) do { \ + char b[BDEVNAME_SIZE]; \ + printk(KERN_DEBUG "block %s:%lu: ", \ + bdevname(bh->b_bdev, b), \ + (unsigned long) bh->b_blocknr); \ + printk(f); \ + printk("\n"); \ + } while (0) +#else +# define ea_idebug(inode, fmt, ...) no_printk(fmt, ##__VA_ARGS__) +# define ea_bdebug(bh, fmt, ...) no_printk(fmt, ##__VA_ARGS__) +#endif + +static void ext4_xattr_cache_insert(struct buffer_head *); +static struct buffer_head *ext4_xattr_cache_find(struct inode *, + struct ext4_xattr_header *, + struct mb_cache_entry **); +static void ext4_xattr_rehash(struct ext4_xattr_header *, + struct ext4_xattr_entry *); +static int ext4_xattr_list(struct dentry *dentry, char *buffer, + size_t buffer_size); + +static struct mb_cache *ext4_xattr_cache; + +static const struct xattr_handler *ext4_xattr_handler_map[] = { + [EXT4_XATTR_INDEX_USER] = &ext4_xattr_user_handler, +#ifdef CONFIG_EXT4_FS_POSIX_ACL + [EXT4_XATTR_INDEX_POSIX_ACL_ACCESS] = &ext4_xattr_acl_access_handler, + [EXT4_XATTR_INDEX_POSIX_ACL_DEFAULT] = &ext4_xattr_acl_default_handler, +#endif + [EXT4_XATTR_INDEX_TRUSTED] = &ext4_xattr_trusted_handler, +#ifdef CONFIG_EXT4_FS_SECURITY + [EXT4_XATTR_INDEX_SECURITY] = &ext4_xattr_security_handler, +#endif +}; + +const struct xattr_handler *ext4_xattr_handlers[] = { + &ext4_xattr_user_handler, + &ext4_xattr_trusted_handler, +#ifdef CONFIG_EXT4_FS_POSIX_ACL + &ext4_xattr_acl_access_handler, + 
&ext4_xattr_acl_default_handler, +#endif +#ifdef CONFIG_EXT4_FS_SECURITY + &ext4_xattr_security_handler, +#endif + NULL +}; + +static inline const struct xattr_handler * +ext4_xattr_handler(int name_index) +{ + const struct xattr_handler *handler = NULL; + + if (name_index > 0 && name_index < ARRAY_SIZE(ext4_xattr_handler_map)) + handler = ext4_xattr_handler_map[name_index]; + return handler; +} + +/* + * Inode operation listxattr() + * + * dentry->d_inode->i_mutex: don't care + */ +ssize_t +ext4_listxattr(struct dentry *dentry, char *buffer, size_t size) +{ + return ext4_xattr_list(dentry, buffer, size); +} + +static int +ext4_xattr_check_names(struct ext4_xattr_entry *entry, void *end) +{ + while (!IS_LAST_ENTRY(entry)) { + struct ext4_xattr_entry *next = EXT4_XATTR_NEXT(entry); + if ((void *)next >= end) + return -EIO; + entry = next; + } + return 0; +} + +static inline int +ext4_xattr_check_block(struct buffer_head *bh) +{ + if (BHDR(bh)->h_magic != cpu_to_le32(EXT4_XATTR_MAGIC) || + BHDR(bh)->h_blocks != cpu_to_le32(1)) + return -EIO; + return ext4_xattr_check_names(BFIRST(bh), bh->b_data + bh->b_size); +} + +static inline int +ext4_xattr_check_entry(struct ext4_xattr_entry *entry, size_t size) +{ + size_t value_size = le32_to_cpu(entry->e_value_size); + + if (entry->e_value_block != 0 || value_size > size || + le16_to_cpu(entry->e_value_offs) + value_size > size) + return -EIO; + return 0; +} + +static int +ext4_xattr_find_entry(struct ext4_xattr_entry **pentry, int name_index, + const char *name, size_t size, int sorted) +{ + struct ext4_xattr_entry *entry; + size_t name_len; + int cmp = 1; + + if (name == NULL) + return -EINVAL; + name_len = strlen(name); + entry = *pentry; + for (; !IS_LAST_ENTRY(entry); entry = EXT4_XATTR_NEXT(entry)) { + cmp = name_index - entry->e_name_index; + if (!cmp) + cmp = name_len - entry->e_name_len; + if (!cmp) + cmp = memcmp(name, entry->e_name, name_len); + if (cmp <= 0 && (sorted || cmp == 0)) + break; + } + *pentry = entry; 
+ if (!cmp && ext4_xattr_check_entry(entry, size)) + return -EIO; + return cmp ? -ENODATA : 0; +} + +static int +ext4_xattr_block_get(struct inode *inode, int name_index, const char *name, + void *buffer, size_t buffer_size) +{ + struct buffer_head *bh = NULL; + struct ext4_xattr_entry *entry; + size_t size; + int error; + + ea_idebug(inode, "name=%d.%s, buffer=%p, buffer_size=%ld", + name_index, name, buffer, (long)buffer_size); + + error = -ENODATA; + if (!EXT4_I(inode)->i_file_acl) + goto cleanup; + ea_idebug(inode, "reading block %llu", + (unsigned long long)EXT4_I(inode)->i_file_acl); + bh = sb_bread(inode->i_sb, EXT4_I(inode)->i_file_acl); + if (!bh) + goto cleanup; + ea_bdebug(bh, "b_count=%d, refcount=%d", + atomic_read(&(bh->b_count)), le32_to_cpu(BHDR(bh)->h_refcount)); + if (ext4_xattr_check_block(bh)) { +bad_block: + EXT4_ERROR_INODE(inode, "bad block %llu", + EXT4_I(inode)->i_file_acl); + error = -EIO; + goto cleanup; + } + ext4_xattr_cache_insert(bh); + entry = BFIRST(bh); + error = ext4_xattr_find_entry(&entry, name_index, name, bh->b_size, 1); + if (error == -EIO) + goto bad_block; + if (error) + goto cleanup; + size = le32_to_cpu(entry->e_value_size); + if (buffer) { + error = -ERANGE; + if (size > buffer_size) + goto cleanup; + memcpy(buffer, bh->b_data + le16_to_cpu(entry->e_value_offs), + size); + } + error = size; + +cleanup: + brelse(bh); + return error; +} + +static int +ext4_xattr_ibody_get(struct inode *inode, int name_index, const char *name, + void *buffer, size_t buffer_size) +{ + struct ext4_xattr_ibody_header *header; + struct ext4_xattr_entry *entry; + struct ext4_inode *raw_inode; + struct ext4_iloc iloc; + size_t size; + void *end; + int error; + + if (!ext4_test_inode_state(inode, EXT4_STATE_XATTR)) + return -ENODATA; + error = ext4_get_inode_loc(inode, &iloc); + if (error) + return error; + raw_inode = ext4_raw_inode(&iloc); + header = IHDR(inode, raw_inode); + entry = IFIRST(header); + end = (void *)raw_inode + 
EXT4_SB(inode->i_sb)->s_inode_size; + error = ext4_xattr_check_names(entry, end); + if (error) + goto cleanup; + error = ext4_xattr_find_entry(&entry, name_index, name, + end - (void *)entry, 0); + if (error) + goto cleanup; + size = le32_to_cpu(entry->e_value_size); + if (buffer) { + error = -ERANGE; + if (size > buffer_size) + goto cleanup; + memcpy(buffer, (void *)IFIRST(header) + + le16_to_cpu(entry->e_value_offs), size); + } + error = size; + +cleanup: + brelse(iloc.bh); + return error; +} + +/* + * ext4_xattr_get() + * + * Copy an extended attribute into the buffer + * provided, or compute the buffer size required. + * Buffer is NULL to compute the size of the buffer required. + * + * Returns a negative error number on failure, or the number of bytes + * used / required on success. + */ +int +ext4_xattr_get(struct inode *inode, int name_index, const char *name, + void *buffer, size_t buffer_size) +{ + int error; + + down_read(&EXT4_I(inode)->xattr_sem); + error = ext4_xattr_ibody_get(inode, name_index, name, buffer, + buffer_size); + if (error == -ENODATA) + error = ext4_xattr_block_get(inode, name_index, name, buffer, + buffer_size); + up_read(&EXT4_I(inode)->xattr_sem); + return error; +} + +static int +ext4_xattr_list_entries(struct dentry *dentry, struct ext4_xattr_entry *entry, + char *buffer, size_t buffer_size) +{ + size_t rest = buffer_size; + + for (; !IS_LAST_ENTRY(entry); entry = EXT4_XATTR_NEXT(entry)) { + const struct xattr_handler *handler = + ext4_xattr_handler(entry->e_name_index); + + if (handler) { + size_t size = handler->list(dentry, buffer, rest, + entry->e_name, + entry->e_name_len, + handler->flags); + if (buffer) { + if (size > rest) + return -ERANGE; + buffer += size; + } + rest -= size; + } + } + return buffer_size - rest; +} + +static int +ext4_xattr_block_list(struct dentry *dentry, char *buffer, size_t buffer_size) +{ + struct inode *inode = dentry->d_inode; + struct buffer_head *bh = NULL; + int error; + + ea_idebug(inode, 
"buffer=%p, buffer_size=%ld", + buffer, (long)buffer_size); + + error = 0; + if (!EXT4_I(inode)->i_file_acl) + goto cleanup; + ea_idebug(inode, "reading block %llu", + (unsigned long long)EXT4_I(inode)->i_file_acl); + bh = sb_bread(inode->i_sb, EXT4_I(inode)->i_file_acl); + error = -EIO; + if (!bh) + goto cleanup; + ea_bdebug(bh, "b_count=%d, refcount=%d", + atomic_read(&(bh->b_count)), le32_to_cpu(BHDR(bh)->h_refcount)); + if (ext4_xattr_check_block(bh)) { + EXT4_ERROR_INODE(inode, "bad block %llu", + EXT4_I(inode)->i_file_acl); + error = -EIO; + goto cleanup; + } + ext4_xattr_cache_insert(bh); + error = ext4_xattr_list_entries(dentry, BFIRST(bh), buffer, buffer_size); + +cleanup: + brelse(bh); + + return error; +} + +static int +ext4_xattr_ibody_list(struct dentry *dentry, char *buffer, size_t buffer_size) +{ + struct inode *inode = dentry->d_inode; + struct ext4_xattr_ibody_header *header; + struct ext4_inode *raw_inode; + struct ext4_iloc iloc; + void *end; + int error; + + if (!ext4_test_inode_state(inode, EXT4_STATE_XATTR)) + return 0; + error = ext4_get_inode_loc(inode, &iloc); + if (error) + return error; + raw_inode = ext4_raw_inode(&iloc); + header = IHDR(inode, raw_inode); + end = (void *)raw_inode + EXT4_SB(inode->i_sb)->s_inode_size; + error = ext4_xattr_check_names(IFIRST(header), end); + if (error) + goto cleanup; + error = ext4_xattr_list_entries(dentry, IFIRST(header), + buffer, buffer_size); + +cleanup: + brelse(iloc.bh); + return error; +} + +/* + * ext4_xattr_list() + * + * Copy a list of attribute names into the buffer + * provided, or compute the buffer size required. + * Buffer is NULL to compute the size of the buffer required. + * + * Returns a negative error number on failure, or the number of bytes + * used / required on success. 
+ */ +static int +ext4_xattr_list(struct dentry *dentry, char *buffer, size_t buffer_size) +{ + int ret, ret2; + + down_read(&EXT4_I(dentry->d_inode)->xattr_sem); + ret = ret2 = ext4_xattr_ibody_list(dentry, buffer, buffer_size); + if (ret < 0) + goto errout; + if (buffer) { + buffer += ret; + buffer_size -= ret; + } + ret = ext4_xattr_block_list(dentry, buffer, buffer_size); + if (ret < 0) + goto errout; + ret += ret2; +errout: + up_read(&EXT4_I(dentry->d_inode)->xattr_sem); + return ret; +} + +/* + * If the EXT4_FEATURE_COMPAT_EXT_ATTR feature of this file system is + * not set, set it. + */ +static void ext4_xattr_update_super_block(handle_t *handle, + struct super_block *sb) +{ + if (EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_EXT_ATTR)) + return; + + if (ext4_journal_get_write_access(handle, EXT4_SB(sb)->s_sbh) == 0) { + EXT4_SET_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_EXT_ATTR); + ext4_handle_dirty_super(handle, sb); + } +} + +/* + * Release the xattr block BH: If the reference count is > 1, decrement + * it; otherwise free the block. 
+ */ +static void +ext4_xattr_release_block(handle_t *handle, struct inode *inode, + struct buffer_head *bh) +{ + struct mb_cache_entry *ce = NULL; + int error = 0; + + ce = mb_cache_entry_get(ext4_xattr_cache, bh->b_bdev, bh->b_blocknr); + error = ext4_journal_get_write_access(handle, bh); + if (error) + goto out; + + lock_buffer(bh); + if (BHDR(bh)->h_refcount == cpu_to_le32(1)) { + ea_bdebug(bh, "refcount now=0; freeing"); + if (ce) + mb_cache_entry_free(ce); + get_bh(bh); + ext4_free_blocks(handle, inode, bh, 0, 1, + EXT4_FREE_BLOCKS_METADATA | + EXT4_FREE_BLOCKS_FORGET); + unlock_buffer(bh); + } else { + le32_add_cpu(&BHDR(bh)->h_refcount, -1); + if (ce) + mb_cache_entry_release(ce); + unlock_buffer(bh); + error = ext4_handle_dirty_metadata(handle, inode, bh); + if (IS_SYNC(inode)) + ext4_handle_sync(handle); + dquot_free_block(inode, 1); + ea_bdebug(bh, "refcount now=%d; releasing", + le32_to_cpu(BHDR(bh)->h_refcount)); + } +out: + ext4_std_error(inode->i_sb, error); + return; +} + +/* + * Find the available free space for EAs. This also returns the total number of + * bytes used by EA entries. 
+ */ +static size_t ext4_xattr_free_space(struct ext4_xattr_entry *last, + size_t *min_offs, void *base, int *total) +{ + for (; !IS_LAST_ENTRY(last); last = EXT4_XATTR_NEXT(last)) { + *total += EXT4_XATTR_LEN(last->e_name_len); + if (!last->e_value_block && last->e_value_size) { + size_t offs = le16_to_cpu(last->e_value_offs); + if (offs < *min_offs) + *min_offs = offs; + } + } + return (*min_offs - ((void *)last - base) - sizeof(__u32)); +} + +struct ext4_xattr_info { + int name_index; + const char *name; + const void *value; + size_t value_len; +}; + +struct ext4_xattr_search { + struct ext4_xattr_entry *first; + void *base; + void *end; + struct ext4_xattr_entry *here; + int not_found; +}; + +static int +ext4_xattr_set_entry(struct ext4_xattr_info *i, struct ext4_xattr_search *s) +{ + struct ext4_xattr_entry *last; + size_t free, min_offs = s->end - s->base, name_len = strlen(i->name); + + /* Compute min_offs and last. */ + last = s->first; + for (; !IS_LAST_ENTRY(last); last = EXT4_XATTR_NEXT(last)) { + if (!last->e_value_block && last->e_value_size) { + size_t offs = le16_to_cpu(last->e_value_offs); + if (offs < min_offs) + min_offs = offs; + } + } + free = min_offs - ((void *)last - s->base) - sizeof(__u32); + if (!s->not_found) { + if (!s->here->e_value_block && s->here->e_value_size) { + size_t size = le32_to_cpu(s->here->e_value_size); + free += EXT4_XATTR_SIZE(size); + } + free += EXT4_XATTR_LEN(name_len); + } + if (i->value) { + if (free < EXT4_XATTR_SIZE(i->value_len) || + free < EXT4_XATTR_LEN(name_len) + + EXT4_XATTR_SIZE(i->value_len)) + return -ENOSPC; + } + + if (i->value && s->not_found) { + /* Insert the new name. 
*/ + size_t size = EXT4_XATTR_LEN(name_len); + size_t rest = (void *)last - (void *)s->here + sizeof(__u32); + memmove((void *)s->here + size, s->here, rest); + memset(s->here, 0, size); + s->here->e_name_index = i->name_index; + s->here->e_name_len = name_len; + memcpy(s->here->e_name, i->name, name_len); + } else { + if (!s->here->e_value_block && s->here->e_value_size) { + void *first_val = s->base + min_offs; + size_t offs = le16_to_cpu(s->here->e_value_offs); + void *val = s->base + offs; + size_t size = EXT4_XATTR_SIZE( + le32_to_cpu(s->here->e_value_size)); + + if (i->value && size == EXT4_XATTR_SIZE(i->value_len)) { + /* The old and the new value have the same + size. Just replace. */ + s->here->e_value_size = + cpu_to_le32(i->value_len); + memset(val + size - EXT4_XATTR_PAD, 0, + EXT4_XATTR_PAD); /* Clear pad bytes. */ + memcpy(val, i->value, i->value_len); + return 0; + } + + /* Remove the old value. */ + memmove(first_val + size, first_val, val - first_val); + memset(first_val, 0, size); + s->here->e_value_size = 0; + s->here->e_value_offs = 0; + min_offs += size; + + /* Adjust all value offsets. */ + last = s->first; + while (!IS_LAST_ENTRY(last)) { + size_t o = le16_to_cpu(last->e_value_offs); + if (!last->e_value_block && + last->e_value_size && o < offs) + last->e_value_offs = + cpu_to_le16(o + size); + last = EXT4_XATTR_NEXT(last); + } + } + if (!i->value) { + /* Remove the old name. */ + size_t size = EXT4_XATTR_LEN(name_len); + last = ENTRY((void *)last - size); + memmove(s->here, (void *)s->here + size, + (void *)last - (void *)s->here + sizeof(__u32)); + memset(last, 0, size); + } + } + + if (i->value) { + /* Insert the new value. */ + s->here->e_value_size = cpu_to_le32(i->value_len); + if (i->value_len) { + size_t size = EXT4_XATTR_SIZE(i->value_len); + void *val = s->base + min_offs - size; + s->here->e_value_offs = cpu_to_le16(min_offs - size); + memset(val + size - EXT4_XATTR_PAD, 0, + EXT4_XATTR_PAD); /* Clear the pad bytes. 
*/ + memcpy(val, i->value, i->value_len); + } + } + return 0; +} + +struct ext4_xattr_block_find { + struct ext4_xattr_search s; + struct buffer_head *bh; +}; + +static int +ext4_xattr_block_find(struct inode *inode, struct ext4_xattr_info *i, + struct ext4_xattr_block_find *bs) +{ + struct super_block *sb = inode->i_sb; + int error; + + ea_idebug(inode, "name=%d.%s, value=%p, value_len=%ld", + i->name_index, i->name, i->value, (long)i->value_len); + + if (EXT4_I(inode)->i_file_acl) { + /* The inode already has an extended attribute block. */ + bs->bh = sb_bread(sb, EXT4_I(inode)->i_file_acl); + error = -EIO; + if (!bs->bh) + goto cleanup; + ea_bdebug(bs->bh, "b_count=%d, refcount=%d", + atomic_read(&(bs->bh->b_count)), + le32_to_cpu(BHDR(bs->bh)->h_refcount)); + if (ext4_xattr_check_block(bs->bh)) { + EXT4_ERROR_INODE(inode, "bad block %llu", + EXT4_I(inode)->i_file_acl); + error = -EIO; + goto cleanup; + } + /* Find the named attribute. */ + bs->s.base = BHDR(bs->bh); + bs->s.first = BFIRST(bs->bh); + bs->s.end = bs->bh->b_data + bs->bh->b_size; + bs->s.here = bs->s.first; + error = ext4_xattr_find_entry(&bs->s.here, i->name_index, + i->name, bs->bh->b_size, 1); + if (error && error != -ENODATA) + goto cleanup; + bs->s.not_found = error; + } + error = 0; + +cleanup: + return error; +} + +static int +ext4_xattr_block_set(handle_t *handle, struct inode *inode, + struct ext4_xattr_info *i, + struct ext4_xattr_block_find *bs) +{ + struct super_block *sb = inode->i_sb; + struct buffer_head *new_bh = NULL; + struct ext4_xattr_search *s = &bs->s; + struct mb_cache_entry *ce = NULL; + int error = 0; + +#define header(x) ((struct ext4_xattr_header *)(x)) + + if (i->value && i->value_len > sb->s_blocksize) + return -ENOSPC; + if (s->base) { + ce = mb_cache_entry_get(ext4_xattr_cache, bs->bh->b_bdev, + bs->bh->b_blocknr); + error = ext4_journal_get_write_access(handle, bs->bh); + if (error) + goto cleanup; + lock_buffer(bs->bh); + + if (header(s->base)->h_refcount == 
cpu_to_le32(1)) { + if (ce) { + mb_cache_entry_free(ce); + ce = NULL; + } + ea_bdebug(bs->bh, "modifying in-place"); + error = ext4_xattr_set_entry(i, s); + if (!error) { + if (!IS_LAST_ENTRY(s->first)) + ext4_xattr_rehash(header(s->base), + s->here); + ext4_xattr_cache_insert(bs->bh); + } + unlock_buffer(bs->bh); + if (error == -EIO) + goto bad_block; + if (!error) + error = ext4_handle_dirty_metadata(handle, + inode, + bs->bh); + if (error) + goto cleanup; + goto inserted; + } else { + int offset = (char *)s->here - bs->bh->b_data; + + unlock_buffer(bs->bh); + ext4_handle_release_buffer(handle, bs->bh); + if (ce) { + mb_cache_entry_release(ce); + ce = NULL; + } + ea_bdebug(bs->bh, "cloning"); + s->base = kmalloc(bs->bh->b_size, GFP_NOFS); + error = -ENOMEM; + if (s->base == NULL) + goto cleanup; + memcpy(s->base, BHDR(bs->bh), bs->bh->b_size); + s->first = ENTRY(header(s->base)+1); + header(s->base)->h_refcount = cpu_to_le32(1); + s->here = ENTRY(s->base + offset); + s->end = s->base + bs->bh->b_size; + } + } else { + /* Allocate a buffer where we construct the new block. */ + s->base = kzalloc(sb->s_blocksize, GFP_NOFS); + /* assert(header == s->base) */ + error = -ENOMEM; + if (s->base == NULL) + goto cleanup; + header(s->base)->h_magic = cpu_to_le32(EXT4_XATTR_MAGIC); + header(s->base)->h_blocks = cpu_to_le32(1); + header(s->base)->h_refcount = cpu_to_le32(1); + s->first = ENTRY(header(s->base)+1); + s->here = ENTRY(header(s->base)+1); + s->end = s->base + sb->s_blocksize; + } + + error = ext4_xattr_set_entry(i, s); + if (error == -EIO) + goto bad_block; + if (error) + goto cleanup; + if (!IS_LAST_ENTRY(s->first)) + ext4_xattr_rehash(header(s->base), s->here); + +inserted: + if (!IS_LAST_ENTRY(s->first)) { + new_bh = ext4_xattr_cache_find(inode, header(s->base), &ce); + if (new_bh) { + /* We found an identical block in the cache. */ + if (new_bh == bs->bh) + ea_bdebug(new_bh, "keeping"); + else { + /* The old block is released after updating + the inode. 
*/ + error = dquot_alloc_block(inode, 1); + if (error) + goto cleanup; + error = ext4_journal_get_write_access(handle, + new_bh); + if (error) + goto cleanup_dquot; + lock_buffer(new_bh); + le32_add_cpu(&BHDR(new_bh)->h_refcount, 1); + ea_bdebug(new_bh, "reusing; refcount now=%d", + le32_to_cpu(BHDR(new_bh)->h_refcount)); + unlock_buffer(new_bh); + error = ext4_handle_dirty_metadata(handle, + inode, + new_bh); + if (error) + goto cleanup_dquot; + } + mb_cache_entry_release(ce); + ce = NULL; + } else if (bs->bh && s->base == bs->bh->b_data) { + /* We were modifying this block in-place. */ + ea_bdebug(bs->bh, "keeping this block"); + new_bh = bs->bh; + get_bh(new_bh); + } else { + /* We need to allocate a new block */ + ext4_fsblk_t goal, block; + + goal = ext4_group_first_block_no(sb, + EXT4_I(inode)->i_block_group); + + /* non-extent files can't have physical blocks past 2^32 */ + if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) + goal = goal & EXT4_MAX_BLOCK_FILE_PHYS; + + /* + * take i_data_sem because we will test + * i_delalloc_reserved_flag in ext4_mb_new_blocks + */ + down_read((&EXT4_I(inode)->i_data_sem)); + block = ext4_new_meta_blocks(handle, inode, goal, 0, + NULL, &error); + up_read((&EXT4_I(inode)->i_data_sem)); + if (error) + goto cleanup; + + if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) + BUG_ON(block > EXT4_MAX_BLOCK_FILE_PHYS); + + ea_idebug(inode, "creating block %llu", + (unsigned long long)block); + + new_bh = sb_getblk(sb, block); + if (!new_bh) { +getblk_failed: + ext4_free_blocks(handle, inode, NULL, block, 1, + EXT4_FREE_BLOCKS_METADATA); + error = -EIO; + goto cleanup; + } + lock_buffer(new_bh); + error = ext4_journal_get_create_access(handle, new_bh); + if (error) { + unlock_buffer(new_bh); + goto getblk_failed; + } + memcpy(new_bh->b_data, s->base, new_bh->b_size); + set_buffer_uptodate(new_bh); + unlock_buffer(new_bh); + ext4_xattr_cache_insert(new_bh); + error = ext4_handle_dirty_metadata(handle, + inode, new_bh); + if 
(error) + goto cleanup; + } + } + + /* Update the inode. */ + EXT4_I(inode)->i_file_acl = new_bh ? new_bh->b_blocknr : 0; + + /* Drop the previous xattr block. */ + if (bs->bh && bs->bh != new_bh) + ext4_xattr_release_block(handle, inode, bs->bh); + error = 0; + +cleanup: + if (ce) + mb_cache_entry_release(ce); + brelse(new_bh); + if (!(bs->bh && s->base == bs->bh->b_data)) + kfree(s->base); + + return error; + +cleanup_dquot: + dquot_free_block(inode, 1); + goto cleanup; + +bad_block: + EXT4_ERROR_INODE(inode, "bad block %llu", + EXT4_I(inode)->i_file_acl); + goto cleanup; + +#undef header +} + +struct ext4_xattr_ibody_find { + struct ext4_xattr_search s; + struct ext4_iloc iloc; +}; + +static int +ext4_xattr_ibody_find(struct inode *inode, struct ext4_xattr_info *i, + struct ext4_xattr_ibody_find *is) +{ + struct ext4_xattr_ibody_header *header; + struct ext4_inode *raw_inode; + int error; + + if (EXT4_I(inode)->i_extra_isize == 0) + return 0; + raw_inode = ext4_raw_inode(&is->iloc); + header = IHDR(inode, raw_inode); + is->s.base = is->s.first = IFIRST(header); + is->s.here = is->s.first; + is->s.end = (void *)raw_inode + EXT4_SB(inode->i_sb)->s_inode_size; + if (ext4_test_inode_state(inode, EXT4_STATE_XATTR)) { + error = ext4_xattr_check_names(IFIRST(header), is->s.end); + if (error) + return error; + /* Find the named attribute. 
*/ + error = ext4_xattr_find_entry(&is->s.here, i->name_index, + i->name, is->s.end - + (void *)is->s.base, 0); + if (error && error != -ENODATA) + return error; + is->s.not_found = error; + } + return 0; +} + +static int +ext4_xattr_ibody_set(handle_t *handle, struct inode *inode, + struct ext4_xattr_info *i, + struct ext4_xattr_ibody_find *is) +{ + struct ext4_xattr_ibody_header *header; + struct ext4_xattr_search *s = &is->s; + int error; + + if (EXT4_I(inode)->i_extra_isize == 0) + return -ENOSPC; + error = ext4_xattr_set_entry(i, s); + if (error) + return error; + header = IHDR(inode, ext4_raw_inode(&is->iloc)); + if (!IS_LAST_ENTRY(s->first)) { + header->h_magic = cpu_to_le32(EXT4_XATTR_MAGIC); + ext4_set_inode_state(inode, EXT4_STATE_XATTR); + } else { + header->h_magic = cpu_to_le32(0); + ext4_clear_inode_state(inode, EXT4_STATE_XATTR); + } + return 0; +} + +/* + * ext4_xattr_set_handle() + * + * Create, replace or remove an extended attribute for this inode. Value + * is NULL to remove an existing extended attribute, and non-NULL to + * either replace an existing extended attribute, or create a new extended + * attribute. The flags XATTR_REPLACE and XATTR_CREATE + * specify that an extended attribute must exist and must not exist + * previous to the call, respectively. + * + * Returns 0, or a negative error number on failure. 
+ */ +int +ext4_xattr_set_handle(handle_t *handle, struct inode *inode, int name_index, + const char *name, const void *value, size_t value_len, + int flags) +{ + struct ext4_xattr_info i = { + .name_index = name_index, + .name = name, + .value = value, + .value_len = value_len, + + }; + struct ext4_xattr_ibody_find is = { + .s = { .not_found = -ENODATA, }, + }; + struct ext4_xattr_block_find bs = { + .s = { .not_found = -ENODATA, }, + }; + unsigned long no_expand; + int error; + + if (!name) + return -EINVAL; + if (strlen(name) > 255) + return -ERANGE; + down_write(&EXT4_I(inode)->xattr_sem); + no_expand = ext4_test_inode_state(inode, EXT4_STATE_NO_EXPAND); + ext4_set_inode_state(inode, EXT4_STATE_NO_EXPAND); + + error = ext4_reserve_inode_write(handle, inode, &is.iloc); + if (error) + goto cleanup; + + if (ext4_test_inode_state(inode, EXT4_STATE_NEW)) { + struct ext4_inode *raw_inode = ext4_raw_inode(&is.iloc); + memset(raw_inode, 0, EXT4_SB(inode->i_sb)->s_inode_size); + ext4_clear_inode_state(inode, EXT4_STATE_NEW); + } + + error = ext4_xattr_ibody_find(inode, &i, &is); + if (error) + goto cleanup; + if (is.s.not_found) + error = ext4_xattr_block_find(inode, &i, &bs); + if (error) + goto cleanup; + if (is.s.not_found && bs.s.not_found) { + error = -ENODATA; + if (flags & XATTR_REPLACE) + goto cleanup; + error = 0; + if (!value) + goto cleanup; + } else { + error = -EEXIST; + if (flags & XATTR_CREATE) + goto cleanup; + } + if (!value) { + if (!is.s.not_found) + error = ext4_xattr_ibody_set(handle, inode, &i, &is); + else if (!bs.s.not_found) + error = ext4_xattr_block_set(handle, inode, &i, &bs); + } else { + error = ext4_xattr_ibody_set(handle, inode, &i, &is); + if (!error && !bs.s.not_found) { + i.value = NULL; + error = ext4_xattr_block_set(handle, inode, &i, &bs); + } else if (error == -ENOSPC) { + if (EXT4_I(inode)->i_file_acl && !bs.s.base) { + error = ext4_xattr_block_find(inode, &i, &bs); + if (error) + goto cleanup; + } + error = 
ext4_xattr_block_set(handle, inode, &i, &bs); + if (error) + goto cleanup; + if (!is.s.not_found) { + i.value = NULL; + error = ext4_xattr_ibody_set(handle, inode, &i, + &is); + } + } + } + if (!error) { + ext4_xattr_update_super_block(handle, inode->i_sb); + inode->i_ctime = ext4_current_time(inode); + if (!value) + ext4_clear_inode_state(inode, EXT4_STATE_NO_EXPAND); + error = ext4_mark_iloc_dirty(handle, inode, &is.iloc); + /* + * The bh is consumed by ext4_mark_iloc_dirty, even with + * error != 0. + */ + is.iloc.bh = NULL; + if (IS_SYNC(inode)) + ext4_handle_sync(handle); + } + +cleanup: + brelse(is.iloc.bh); + brelse(bs.bh); + if (no_expand == 0) + ext4_clear_inode_state(inode, EXT4_STATE_NO_EXPAND); + up_write(&EXT4_I(inode)->xattr_sem); + return error; +} + +/* + * ext4_xattr_set() + * + * Like ext4_xattr_set_handle, but start from an inode. This extended + * attribute modification is a filesystem transaction by itself. + * + * Returns 0, or a negative error number on failure. + */ +int +ext4_xattr_set(struct inode *inode, int name_index, const char *name, + const void *value, size_t value_len, int flags) +{ + handle_t *handle; + int error, retries = 0; + +retry: + handle = ext4_journal_start(inode, EXT4_DATA_TRANS_BLOCKS(inode->i_sb)); + if (IS_ERR(handle)) { + error = PTR_ERR(handle); + } else { + int error2; + + error = ext4_xattr_set_handle(handle, inode, name_index, name, + value, value_len, flags); + error2 = ext4_journal_stop(handle); + if (error == -ENOSPC && + ext4_should_retry_alloc(inode->i_sb, &retries)) + goto retry; + if (error == 0) + error = error2; + } + + return error; +} + +/* + * Shift the EA entries in the inode to create space for the increased + * i_extra_isize. 
+ */ +static void ext4_xattr_shift_entries(struct ext4_xattr_entry *entry, + int value_offs_shift, void *to, + void *from, size_t n, int blocksize) +{ + struct ext4_xattr_entry *last = entry; + int new_offs; + + /* Adjust the value offsets of the entries */ + for (; !IS_LAST_ENTRY(last); last = EXT4_XATTR_NEXT(last)) { + if (!last->e_value_block && last->e_value_size) { + new_offs = le16_to_cpu(last->e_value_offs) + + value_offs_shift; + BUG_ON(new_offs + le32_to_cpu(last->e_value_size) + > blocksize); + last->e_value_offs = cpu_to_le16(new_offs); + } + } + /* Shift the entries by n bytes */ + memmove(to, from, n); +} + +/* + * Expand an inode by new_extra_isize bytes when EAs are present. + * Returns 0 on success or negative error number on failure. + */ +int ext4_expand_extra_isize_ea(struct inode *inode, int new_extra_isize, + struct ext4_inode *raw_inode, handle_t *handle) +{ + struct ext4_xattr_ibody_header *header; + struct ext4_xattr_entry *entry, *last, *first; + struct buffer_head *bh = NULL; + struct ext4_xattr_ibody_find *is = NULL; + struct ext4_xattr_block_find *bs = NULL; + char *buffer = NULL, *b_entry_name = NULL; + size_t min_offs, free; + int total_ino, total_blk; + void *base, *start, *end; + int extra_isize = 0, error = 0, tried_min_extra_isize = 0; + int s_min_extra_isize = le16_to_cpu(EXT4_SB(inode->i_sb)->s_es->s_min_extra_isize); + + down_write(&EXT4_I(inode)->xattr_sem); +retry: + if (EXT4_I(inode)->i_extra_isize >= new_extra_isize) { + up_write(&EXT4_I(inode)->xattr_sem); + return 0; + } + + header = IHDR(inode, raw_inode); + entry = IFIRST(header); + + /* + * Check if enough free space is available in the inode to shift the + * entries ahead by new_extra_isize. 
+ */ + + base = start = entry; + end = (void *)raw_inode + EXT4_SB(inode->i_sb)->s_inode_size; + min_offs = end - base; + last = entry; + total_ino = sizeof(struct ext4_xattr_ibody_header); + + free = ext4_xattr_free_space(last, &min_offs, base, &total_ino); + if (free >= new_extra_isize) { + entry = IFIRST(header); + ext4_xattr_shift_entries(entry, EXT4_I(inode)->i_extra_isize + - new_extra_isize, (void *)raw_inode + + EXT4_GOOD_OLD_INODE_SIZE + new_extra_isize, + (void *)header, total_ino, + inode->i_sb->s_blocksize); + EXT4_I(inode)->i_extra_isize = new_extra_isize; + error = 0; + goto cleanup; + } + + /* + * Enough free space isn't available in the inode, check if + * EA block can hold new_extra_isize bytes. + */ + if (EXT4_I(inode)->i_file_acl) { + bh = sb_bread(inode->i_sb, EXT4_I(inode)->i_file_acl); + error = -EIO; + if (!bh) + goto cleanup; + if (ext4_xattr_check_block(bh)) { + EXT4_ERROR_INODE(inode, "bad block %llu", + EXT4_I(inode)->i_file_acl); + error = -EIO; + goto cleanup; + } + base = BHDR(bh); + first = BFIRST(bh); + end = bh->b_data + bh->b_size; + min_offs = end - base; + free = ext4_xattr_free_space(first, &min_offs, base, + &total_blk); + if (free < new_extra_isize) { + if (!tried_min_extra_isize && s_min_extra_isize) { + tried_min_extra_isize++; + new_extra_isize = s_min_extra_isize; + brelse(bh); + goto retry; + } + error = -1; + goto cleanup; + } + } else { + free = inode->i_sb->s_blocksize; + } + + while (new_extra_isize > 0) { + size_t offs, size, entry_size; + struct ext4_xattr_entry *small_entry = NULL; + struct ext4_xattr_info i = { + .value = NULL, + .value_len = 0, + }; + unsigned int total_size; /* EA entry size + value size */ + unsigned int shift_bytes; /* No. of bytes to shift EAs by? 
*/ + unsigned int min_total_size = ~0U; + + is = kzalloc(sizeof(struct ext4_xattr_ibody_find), GFP_NOFS); + bs = kzalloc(sizeof(struct ext4_xattr_block_find), GFP_NOFS); + if (!is || !bs) { + error = -ENOMEM; + goto cleanup; + } + + is->s.not_found = -ENODATA; + bs->s.not_found = -ENODATA; + is->iloc.bh = NULL; + bs->bh = NULL; + + last = IFIRST(header); + /* Find the entry best suited to be pushed into EA block */ + entry = NULL; + for (; !IS_LAST_ENTRY(last); last = EXT4_XATTR_NEXT(last)) { + total_size = + EXT4_XATTR_SIZE(le32_to_cpu(last->e_value_size)) + + EXT4_XATTR_LEN(last->e_name_len); + if (total_size <= free && total_size < min_total_size) { + if (total_size < new_extra_isize) { + small_entry = last; + } else { + entry = last; + min_total_size = total_size; + } + } + } + + if (entry == NULL) { + if (small_entry) { + entry = small_entry; + } else { + if (!tried_min_extra_isize && + s_min_extra_isize) { + tried_min_extra_isize++; + new_extra_isize = s_min_extra_isize; + goto retry; + } + error = -1; + goto cleanup; + } + } + offs = le16_to_cpu(entry->e_value_offs); + size = le32_to_cpu(entry->e_value_size); + entry_size = EXT4_XATTR_LEN(entry->e_name_len); + i.name_index = entry->e_name_index, + buffer = kmalloc(EXT4_XATTR_SIZE(size), GFP_NOFS); + b_entry_name = kmalloc(entry->e_name_len + 1, GFP_NOFS); + if (!buffer || !b_entry_name) { + error = -ENOMEM; + goto cleanup; + } + /* Save the entry name and the entry value */ + memcpy(buffer, (void *)IFIRST(header) + offs, + EXT4_XATTR_SIZE(size)); + memcpy(b_entry_name, entry->e_name, entry->e_name_len); + b_entry_name[entry->e_name_len] = '\0'; + i.name = b_entry_name; + + error = ext4_get_inode_loc(inode, &is->iloc); + if (error) + goto cleanup; + + error = ext4_xattr_ibody_find(inode, &i, is); + if (error) + goto cleanup; + + /* Remove the chosen entry from the inode */ + error = ext4_xattr_ibody_set(handle, inode, &i, is); + if (error) + goto cleanup; + + entry = IFIRST(header); + if (entry_size + 
EXT4_XATTR_SIZE(size) >= new_extra_isize) + shift_bytes = new_extra_isize; + else + shift_bytes = entry_size + size; + /* Adjust the offsets and shift the remaining entries ahead */ + ext4_xattr_shift_entries(entry, EXT4_I(inode)->i_extra_isize - + shift_bytes, (void *)raw_inode + + EXT4_GOOD_OLD_INODE_SIZE + extra_isize + shift_bytes, + (void *)header, total_ino - entry_size, + inode->i_sb->s_blocksize); + + extra_isize += shift_bytes; + new_extra_isize -= shift_bytes; + EXT4_I(inode)->i_extra_isize = extra_isize; + + i.name = b_entry_name; + i.value = buffer; + i.value_len = size; + error = ext4_xattr_block_find(inode, &i, bs); + if (error) + goto cleanup; + + /* Add entry which was removed from the inode into the block */ + error = ext4_xattr_block_set(handle, inode, &i, bs); + if (error) + goto cleanup; + kfree(b_entry_name); + kfree(buffer); + b_entry_name = NULL; + buffer = NULL; + brelse(is->iloc.bh); + kfree(is); + kfree(bs); + } + brelse(bh); + up_write(&EXT4_I(inode)->xattr_sem); + return 0; + +cleanup: + kfree(b_entry_name); + kfree(buffer); + if (is) + brelse(is->iloc.bh); + kfree(is); + kfree(bs); + brelse(bh); + up_write(&EXT4_I(inode)->xattr_sem); + return error; +} + + + +/* + * ext4_xattr_delete_inode() + * + * Free extended attribute resources associated with this inode. This + * is called immediately before an inode is freed. We have exclusive + * access to the inode. 
+ */ +void +ext4_xattr_delete_inode(handle_t *handle, struct inode *inode) +{ + struct buffer_head *bh = NULL; + + if (!EXT4_I(inode)->i_file_acl) + goto cleanup; + bh = sb_bread(inode->i_sb, EXT4_I(inode)->i_file_acl); + if (!bh) { + EXT4_ERROR_INODE(inode, "block %llu read error", + EXT4_I(inode)->i_file_acl); + goto cleanup; + } + if (BHDR(bh)->h_magic != cpu_to_le32(EXT4_XATTR_MAGIC) || + BHDR(bh)->h_blocks != cpu_to_le32(1)) { + EXT4_ERROR_INODE(inode, "bad block %llu", + EXT4_I(inode)->i_file_acl); + goto cleanup; + } + ext4_xattr_release_block(handle, inode, bh); + EXT4_I(inode)->i_file_acl = 0; + +cleanup: + brelse(bh); +} + +/* + * ext4_xattr_put_super() + * + * This is called when a file system is unmounted. + */ +void +ext4_xattr_put_super(struct super_block *sb) +{ + mb_cache_shrink(sb->s_bdev); +} + +/* + * ext4_xattr_cache_insert() + * + * Create a new entry in the extended attribute cache, and insert + * it unless such an entry is already in the cache. + * + * Returns 0, or a negative error number on failure. + */ +static void +ext4_xattr_cache_insert(struct buffer_head *bh) +{ + __u32 hash = le32_to_cpu(BHDR(bh)->h_hash); + struct mb_cache_entry *ce; + int error; + + ce = mb_cache_entry_alloc(ext4_xattr_cache, GFP_NOFS); + if (!ce) { + ea_bdebug(bh, "out of memory"); + return; + } + error = mb_cache_entry_insert(ce, bh->b_bdev, bh->b_blocknr, hash); + if (error) { + mb_cache_entry_free(ce); + if (error == -EBUSY) { + ea_bdebug(bh, "already in cache"); + error = 0; + } + } else { + ea_bdebug(bh, "inserting [%x]", (int)hash); + mb_cache_entry_release(ce); + } +} + +/* + * ext4_xattr_cmp() + * + * Compare two extended attribute blocks for equality. + * + * Returns 0 if the blocks are equal, 1 if they differ, and + * a negative error number on errors. 
+ */ +static int +ext4_xattr_cmp(struct ext4_xattr_header *header1, + struct ext4_xattr_header *header2) +{ + struct ext4_xattr_entry *entry1, *entry2; + + entry1 = ENTRY(header1+1); + entry2 = ENTRY(header2+1); + while (!IS_LAST_ENTRY(entry1)) { + if (IS_LAST_ENTRY(entry2)) + return 1; + if (entry1->e_hash != entry2->e_hash || + entry1->e_name_index != entry2->e_name_index || + entry1->e_name_len != entry2->e_name_len || + entry1->e_value_size != entry2->e_value_size || + memcmp(entry1->e_name, entry2->e_name, entry1->e_name_len)) + return 1; + if (entry1->e_value_block != 0 || entry2->e_value_block != 0) + return -EIO; + if (memcmp((char *)header1 + le16_to_cpu(entry1->e_value_offs), + (char *)header2 + le16_to_cpu(entry2->e_value_offs), + le32_to_cpu(entry1->e_value_size))) + return 1; + + entry1 = EXT4_XATTR_NEXT(entry1); + entry2 = EXT4_XATTR_NEXT(entry2); + } + if (!IS_LAST_ENTRY(entry2)) + return 1; + return 0; +} + +/* + * ext4_xattr_cache_find() + * + * Find an identical extended attribute block. + * + * Returns a pointer to the block found, or NULL if such a block was + * not found or an error occurred. 
+ */ +static struct buffer_head * +ext4_xattr_cache_find(struct inode *inode, struct ext4_xattr_header *header, + struct mb_cache_entry **pce) +{ + __u32 hash = le32_to_cpu(header->h_hash); + struct mb_cache_entry *ce; + + if (!header->h_hash) + return NULL; /* never share */ + ea_idebug(inode, "looking for cached blocks [%x]", (int)hash); +again: + ce = mb_cache_entry_find_first(ext4_xattr_cache, inode->i_sb->s_bdev, + hash); + while (ce) { + struct buffer_head *bh; + + if (IS_ERR(ce)) { + if (PTR_ERR(ce) == -EAGAIN) + goto again; + break; + } + bh = sb_bread(inode->i_sb, ce->e_block); + if (!bh) { + EXT4_ERROR_INODE(inode, "block %lu read error", + (unsigned long) ce->e_block); + } else if (le32_to_cpu(BHDR(bh)->h_refcount) >= + EXT4_XATTR_REFCOUNT_MAX) { + ea_idebug(inode, "block %lu refcount %d>=%d", + (unsigned long) ce->e_block, + le32_to_cpu(BHDR(bh)->h_refcount), + EXT4_XATTR_REFCOUNT_MAX); + } else if (ext4_xattr_cmp(header, BHDR(bh)) == 0) { + *pce = ce; + return bh; + } + brelse(bh); + ce = mb_cache_entry_find_next(ce, inode->i_sb->s_bdev, hash); + } + return NULL; +} + +#define NAME_HASH_SHIFT 5 +#define VALUE_HASH_SHIFT 16 + +/* + * ext4_xattr_hash_entry() + * + * Compute the hash of an extended attribute. 
+ */ +static inline void ext4_xattr_hash_entry(struct ext4_xattr_header *header, + struct ext4_xattr_entry *entry) +{ + __u32 hash = 0; + char *name = entry->e_name; + int n; + + for (n = 0; n < entry->e_name_len; n++) { + hash = (hash << NAME_HASH_SHIFT) ^ + (hash >> (8*sizeof(hash) - NAME_HASH_SHIFT)) ^ + *name++; + } + + if (entry->e_value_block == 0 && entry->e_value_size != 0) { + __le32 *value = (__le32 *)((char *)header + + le16_to_cpu(entry->e_value_offs)); + for (n = (le32_to_cpu(entry->e_value_size) + + EXT4_XATTR_ROUND) >> EXT4_XATTR_PAD_BITS; n; n--) { + hash = (hash << VALUE_HASH_SHIFT) ^ + (hash >> (8*sizeof(hash) - VALUE_HASH_SHIFT)) ^ + le32_to_cpu(*value++); + } + } + entry->e_hash = cpu_to_le32(hash); +} + +#undef NAME_HASH_SHIFT +#undef VALUE_HASH_SHIFT + +#define BLOCK_HASH_SHIFT 16 + +/* + * ext4_xattr_rehash() + * + * Re-compute the extended attribute hash value after an entry has changed. + */ +static void ext4_xattr_rehash(struct ext4_xattr_header *header, + struct ext4_xattr_entry *entry) +{ + struct ext4_xattr_entry *here; + __u32 hash = 0; + + ext4_xattr_hash_entry(header, entry); + here = ENTRY(header+1); + while (!IS_LAST_ENTRY(here)) { + if (!here->e_hash) { + /* Block is not shared if an entry's hash value == 0 */ + hash = 0; + break; + } + hash = (hash << BLOCK_HASH_SHIFT) ^ + (hash >> (8*sizeof(hash) - BLOCK_HASH_SHIFT)) ^ + le32_to_cpu(here->e_hash); + here = EXT4_XATTR_NEXT(here); + } + header->h_hash = cpu_to_le32(hash); +} + +#undef BLOCK_HASH_SHIFT + +int __init +ext4_init_xattr(void) +{ + ext4_xattr_cache = mb_cache_create("ext4_xattr", 6); + if (!ext4_xattr_cache) + return -ENOMEM; + return 0; +} + +void +ext4_exit_xattr(void) +{ + if (ext4_xattr_cache) + mb_cache_destroy(ext4_xattr_cache); + ext4_xattr_cache = NULL; +} diff --git a/fs/ext4/xattr.h b/fs/ext4/xattr.h new file mode 100644 index 00000000..25b7387f --- /dev/null +++ b/fs/ext4/xattr.h @@ -0,0 +1,155 @@ +/* + File: fs/ext4/xattr.h + + On-disk format of extended 
attributes for the ext4 filesystem. + + (C) 2001 Andreas Gruenbacher, +*/ + +#include + +/* Magic value in attribute blocks */ +#define EXT4_XATTR_MAGIC 0xEA020000 + +/* Maximum number of references to one attribute block */ +#define EXT4_XATTR_REFCOUNT_MAX 1024 + +/* Name indexes */ +#define EXT4_XATTR_INDEX_USER 1 +#define EXT4_XATTR_INDEX_POSIX_ACL_ACCESS 2 +#define EXT4_XATTR_INDEX_POSIX_ACL_DEFAULT 3 +#define EXT4_XATTR_INDEX_TRUSTED 4 +#define EXT4_XATTR_INDEX_LUSTRE 5 +#define EXT4_XATTR_INDEX_SECURITY 6 + +struct ext4_xattr_header { + __le32 h_magic; /* magic number for identification */ + __le32 h_refcount; /* reference count */ + __le32 h_blocks; /* number of disk blocks used */ + __le32 h_hash; /* hash value of all attributes */ + __u32 h_reserved[4]; /* zero right now */ +}; + +struct ext4_xattr_ibody_header { + __le32 h_magic; /* magic number for identification */ +}; + +struct ext4_xattr_entry { + __u8 e_name_len; /* length of name */ + __u8 e_name_index; /* attribute name index */ + __le16 e_value_offs; /* offset in disk block of value */ + __le32 e_value_block; /* disk block attribute is stored on (n/i) */ + __le32 e_value_size; /* size of attribute value */ + __le32 e_hash; /* hash value of name and value */ + char e_name[0]; /* attribute name */ +}; + +#define EXT4_XATTR_PAD_BITS 2 +#define EXT4_XATTR_PAD (1<e_name_len))) +#define EXT4_XATTR_SIZE(size) \ + (((size) + EXT4_XATTR_ROUND) & ~EXT4_XATTR_ROUND) + +#define IHDR(inode, raw_inode) \ + ((struct ext4_xattr_ibody_header *) \ + ((void *)raw_inode + \ + EXT4_GOOD_OLD_INODE_SIZE + \ + EXT4_I(inode)->i_extra_isize)) +#define IFIRST(hdr) ((struct ext4_xattr_entry *)((hdr)+1)) + +# ifdef CONFIG_EXT4_FS_XATTR + +extern const struct xattr_handler ext4_xattr_user_handler; +extern const struct xattr_handler ext4_xattr_trusted_handler; +extern const struct xattr_handler ext4_xattr_acl_access_handler; +extern const struct xattr_handler ext4_xattr_acl_default_handler; +extern const struct xattr_handler 
ext4_xattr_security_handler; + +extern ssize_t ext4_listxattr(struct dentry *, char *, size_t); + +extern int ext4_xattr_get(struct inode *, int, const char *, void *, size_t); +extern int ext4_xattr_set(struct inode *, int, const char *, const void *, size_t, int); +extern int ext4_xattr_set_handle(handle_t *, struct inode *, int, const char *, const void *, size_t, int); + +extern void ext4_xattr_delete_inode(handle_t *, struct inode *); +extern void ext4_xattr_put_super(struct super_block *); + +extern int ext4_expand_extra_isize_ea(struct inode *inode, int new_extra_isize, + struct ext4_inode *raw_inode, handle_t *handle); + +extern int __init ext4_init_xattr(void); +extern void ext4_exit_xattr(void); + +extern const struct xattr_handler *ext4_xattr_handlers[]; + +# else /* CONFIG_EXT4_FS_XATTR */ + +static inline int +ext4_xattr_get(struct inode *inode, int name_index, const char *name, + void *buffer, size_t size, int flags) +{ + return -EOPNOTSUPP; +} + +static inline int +ext4_xattr_set(struct inode *inode, int name_index, const char *name, + const void *value, size_t size, int flags) +{ + return -EOPNOTSUPP; +} + +static inline int +ext4_xattr_set_handle(handle_t *handle, struct inode *inode, int name_index, + const char *name, const void *value, size_t size, int flags) +{ + return -EOPNOTSUPP; +} + +static inline void +ext4_xattr_delete_inode(handle_t *handle, struct inode *inode) +{ +} + +static inline void +ext4_xattr_put_super(struct super_block *sb) +{ +} + +static __init inline int +ext4_init_xattr(void) +{ + return 0; +} + +static inline void +ext4_exit_xattr(void) +{ +} + +static inline int +ext4_expand_extra_isize_ea(struct inode *inode, int new_extra_isize, + struct ext4_inode *raw_inode, handle_t *handle) +{ + return -EOPNOTSUPP; +} + +#define ext4_xattr_handlers NULL + +# endif /* CONFIG_EXT4_FS_XATTR */ + +#ifdef CONFIG_EXT4_FS_SECURITY +extern int ext4_init_security(handle_t *handle, struct inode *inode, + struct inode *dir, const struct qstr 
*qstr); +#else +static inline int ext4_init_security(handle_t *handle, struct inode *inode, + struct inode *dir, const struct qstr *qstr) +{ + return 0; +} +#endif diff --git a/fs/ext4/xattr_security.c b/fs/ext4/xattr_security.c new file mode 100644 index 00000000..d2a20062 --- /dev/null +++ b/fs/ext4/xattr_security.c @@ -0,0 +1,82 @@ +/* + * linux/fs/ext4/xattr_security.c + * Handler for storing security labels as extended attributes. + */ + +#include +#include +#include +#include +#include "ext4_jbd2.h" +#include "ext4.h" +#include "xattr.h" + +static size_t +ext4_xattr_security_list(struct dentry *dentry, char *list, size_t list_size, + const char *name, size_t name_len, int type) +{ + const size_t prefix_len = sizeof(XATTR_SECURITY_PREFIX)-1; + const size_t total_len = prefix_len + name_len + 1; + + + if (list && total_len <= list_size) { + memcpy(list, XATTR_SECURITY_PREFIX, prefix_len); + memcpy(list+prefix_len, name, name_len); + list[prefix_len + name_len] = '\0'; + } + return total_len; +} + +static int +ext4_xattr_security_get(struct dentry *dentry, const char *name, + void *buffer, size_t size, int type) +{ + if (strcmp(name, "") == 0) + return -EINVAL; + return ext4_xattr_get(dentry->d_inode, EXT4_XATTR_INDEX_SECURITY, + name, buffer, size); +} + +static int +ext4_xattr_security_set(struct dentry *dentry, const char *name, + const void *value, size_t size, int flags, int type) +{ + if (strcmp(name, "") == 0) + return -EINVAL; + return ext4_xattr_set(dentry->d_inode, EXT4_XATTR_INDEX_SECURITY, + name, value, size, flags); +} + +static int +ext4_initxattrs(struct inode *inode, const struct xattr *xattr_array, + void *fs_info) +{ + const struct xattr *xattr; + handle_t *handle = fs_info; + int err = 0; + + for (xattr = xattr_array; xattr->name != NULL; xattr++) { + err = ext4_xattr_set_handle(handle, inode, + EXT4_XATTR_INDEX_SECURITY, + xattr->name, xattr->value, + xattr->value_len, 0); + if (err < 0) + break; + } + return err; +} + +int 
+ext4_init_security(handle_t *handle, struct inode *inode, struct inode *dir, + const struct qstr *qstr) +{ + return security_inode_init_security(inode, dir, qstr, + &ext4_initxattrs, handle); +} + +const struct xattr_handler ext4_xattr_security_handler = { + .prefix = XATTR_SECURITY_PREFIX, + .list = ext4_xattr_security_list, + .get = ext4_xattr_security_get, + .set = ext4_xattr_security_set, +}; diff --git a/fs/ext4/xattr_trusted.c b/fs/ext4/xattr_trusted.c new file mode 100644 index 00000000..95f1f4ab --- /dev/null +++ b/fs/ext4/xattr_trusted.c @@ -0,0 +1,58 @@ +/* + * linux/fs/ext4/xattr_trusted.c + * Handler for trusted extended attributes. + * + * Copyright (C) 2003 by Andreas Gruenbacher, + */ + +#include +#include +#include +#include "ext4_jbd2.h" +#include "ext4.h" +#include "xattr.h" + +static size_t +ext4_xattr_trusted_list(struct dentry *dentry, char *list, size_t list_size, + const char *name, size_t name_len, int type) +{ + const size_t prefix_len = XATTR_TRUSTED_PREFIX_LEN; + const size_t total_len = prefix_len + name_len + 1; + + if (!capable(CAP_SYS_ADMIN)) + return 0; + + if (list && total_len <= list_size) { + memcpy(list, XATTR_TRUSTED_PREFIX, prefix_len); + memcpy(list+prefix_len, name, name_len); + list[prefix_len + name_len] = '\0'; + } + return total_len; +} + +static int +ext4_xattr_trusted_get(struct dentry *dentry, const char *name, void *buffer, + size_t size, int type) +{ + if (strcmp(name, "") == 0) + return -EINVAL; + return ext4_xattr_get(dentry->d_inode, EXT4_XATTR_INDEX_TRUSTED, + name, buffer, size); +} + +static int +ext4_xattr_trusted_set(struct dentry *dentry, const char *name, + const void *value, size_t size, int flags, int type) +{ + if (strcmp(name, "") == 0) + return -EINVAL; + return ext4_xattr_set(dentry->d_inode, EXT4_XATTR_INDEX_TRUSTED, + name, value, size, flags); +} + +const struct xattr_handler ext4_xattr_trusted_handler = { + .prefix = XATTR_TRUSTED_PREFIX, + .list = ext4_xattr_trusted_list, + .get = 
ext4_xattr_trusted_get, + .set = ext4_xattr_trusted_set, +}; diff --git a/fs/ext4/xattr_user.c b/fs/ext4/xattr_user.c new file mode 100644 index 00000000..0edb7611 --- /dev/null +++ b/fs/ext4/xattr_user.c @@ -0,0 +1,61 @@ +/* + * linux/fs/ext4/xattr_user.c + * Handler for extended user attributes. + * + * Copyright (C) 2001 by Andreas Gruenbacher, + */ + +#include +#include +#include "ext4_jbd2.h" +#include "ext4.h" +#include "xattr.h" + +static size_t +ext4_xattr_user_list(struct dentry *dentry, char *list, size_t list_size, + const char *name, size_t name_len, int type) +{ + const size_t prefix_len = XATTR_USER_PREFIX_LEN; + const size_t total_len = prefix_len + name_len + 1; + + if (!test_opt(dentry->d_sb, XATTR_USER)) + return 0; + + if (list && total_len <= list_size) { + memcpy(list, XATTR_USER_PREFIX, prefix_len); + memcpy(list+prefix_len, name, name_len); + list[prefix_len + name_len] = '\0'; + } + return total_len; +} + +static int +ext4_xattr_user_get(struct dentry *dentry, const char *name, + void *buffer, size_t size, int type) +{ + if (strcmp(name, "") == 0) + return -EINVAL; + if (!test_opt(dentry->d_sb, XATTR_USER)) + return -EOPNOTSUPP; + return ext4_xattr_get(dentry->d_inode, EXT4_XATTR_INDEX_USER, + name, buffer, size); +} + +static int +ext4_xattr_user_set(struct dentry *dentry, const char *name, + const void *value, size_t size, int flags, int type) +{ + if (strcmp(name, "") == 0) + return -EINVAL; + if (!test_opt(dentry->d_sb, XATTR_USER)) + return -EOPNOTSUPP; + return ext4_xattr_set(dentry->d_inode, EXT4_XATTR_INDEX_USER, + name, value, size, flags); +} + +const struct xattr_handler ext4_xattr_user_handler = { + .prefix = XATTR_USER_PREFIX, + .list = ext4_xattr_user_list, + .get = ext4_xattr_user_get, + .set = ext4_xattr_user_set, +}; -- cgit