summaryrefslogtreecommitdiff
path: root/fs/ext4
diff options
context:
space:
mode:
Diffstat (limited to 'fs/ext4')
-rw-r--r--fs/ext4/Kconfig85
-rw-r--r--fs/ext4/Makefile14
-rw-r--r--fs/ext4/acl.c439
-rw-r--r--fs/ext4/acl.h77
-rw-r--r--fs/ext4/balloc.c766
-rw-r--r--fs/ext4/bitmap.c31
-rw-r--r--fs/ext4/block_validity.c268
-rw-r--r--fs/ext4/dir.c667
-rw-r--r--fs/ext4/ext4.h2372
-rw-r--r--fs/ext4/ext4_extents.h296
-rw-r--r--fs/ext4/ext4_jbd2.c154
-rw-r--r--fs/ext4/ext4_jbd2.h399
-rw-r--r--fs/ext4/extents.c4866
-rw-r--r--fs/ext4/file.c262
-rw-r--r--fs/ext4/fsync.c271
-rw-r--r--fs/ext4/hash.c208
-rw-r--r--fs/ext4/ialloc.c1161
-rw-r--r--fs/ext4/indirect.c1502
-rw-r--r--fs/ext4/inode.c4676
-rw-r--r--fs/ext4/ioctl.c509
-rw-r--r--fs/ext4/mballoc.c5047
-rw-r--r--fs/ext4/mballoc.h222
-rw-r--r--fs/ext4/migrate.c604
-rw-r--r--fs/ext4/mmp.c353
-rw-r--r--fs/ext4/move_extent.c1423
-rw-r--r--fs/ext4/namei.c2607
-rw-r--r--fs/ext4/page-io.c433
-rw-r--r--fs/ext4/resize.c1689
-rw-r--r--fs/ext4/super.c4980
-rw-r--r--fs/ext4/symlink.c56
-rw-r--r--fs/ext4/truncate.h43
-rw-r--r--fs/ext4/xattr.c1608
-rw-r--r--fs/ext4/xattr.h155
-rw-r--r--fs/ext4/xattr_security.c82
-rw-r--r--fs/ext4/xattr_trusted.c58
-rw-r--r--fs/ext4/xattr_user.c61
36 files changed, 38444 insertions, 0 deletions
diff --git a/fs/ext4/Kconfig b/fs/ext4/Kconfig
new file mode 100644
index 00000000..9ed1bb1f
--- /dev/null
+++ b/fs/ext4/Kconfig
@@ -0,0 +1,85 @@
+config EXT4_FS
+ tristate "The Extended 4 (ext4) filesystem"
+ select JBD2
+ select CRC16
+ help
+ This is the next generation of the ext3 filesystem.
+
+ Unlike the change from ext2 filesystem to ext3 filesystem,
+ the on-disk format of ext4 is not forwards compatible with
+ ext3; it is based on extent maps and it supports 48-bit
+ physical block numbers. The ext4 filesystem also supports delayed
+ allocation, persistent preallocation, high resolution time stamps,
+ and a number of other features to improve performance and speed
+ up fsck time. For more information, please see the web pages at
+ http://ext4.wiki.kernel.org.
+
+ The ext4 filesystem will support mounting an ext3
+ filesystem; while there will be some performance gains from
+ the delayed allocation and inode table readahead, the best
+ performance gains will require enabling ext4 features in the
+ filesystem, or formatting a new filesystem as an ext4
+ filesystem initially.
+
+ To compile this file system support as a module, choose M here. The
+ module will be called ext4.
+
+ If unsure, say N.
+
+config EXT4_USE_FOR_EXT23
+ bool "Use ext4 for ext2/ext3 file systems"
+ depends on EXT4_FS
+ depends on EXT3_FS=n || EXT2_FS=n
+ default y
+ help
+ Allow the ext4 file system driver code to be used for ext2 or
+ ext3 file system mounts. This allows users to reduce their
+ compiled kernel size by using one file system driver for
+ ext2, ext3, and ext4 file systems.
+
+config EXT4_FS_XATTR
+ bool "Ext4 extended attributes"
+ depends on EXT4_FS
+ default y
+ help
+ Extended attributes are name:value pairs associated with inodes by
+ the kernel or by users (see the attr(5) manual page, or visit
+ <http://acl.bestbits.at/> for details).
+
+ If unsure, say N.
+
+ You need this for POSIX ACL support on ext4.
+
+config EXT4_FS_POSIX_ACL
+ bool "Ext4 POSIX Access Control Lists"
+ depends on EXT4_FS_XATTR
+ select FS_POSIX_ACL
+ help
+ POSIX Access Control Lists (ACLs) support permissions for users and
+ groups beyond the owner/group/world scheme.
+
+ To learn more about Access Control Lists, visit the POSIX ACLs for
+ Linux website <http://acl.bestbits.at/>.
+
+ If you don't know what Access Control Lists are, say N.
+
+config EXT4_FS_SECURITY
+ bool "Ext4 Security Labels"
+ depends on EXT4_FS_XATTR
+ help
+ Security labels support alternative access control models
+ implemented by security modules like SELinux. This option
+ enables an extended attribute handler for file security
+ labels in the ext4 filesystem.
+
+ If you are not using a security module that requires using
+ extended attributes for file security labels, say N.
+
+config EXT4_DEBUG
+ bool "EXT4 debugging support"
+ depends on EXT4_FS
+ help
+ Enables run-time debugging support for the ext4 filesystem.
+
+ If you select Y here, then you will be able to turn on debugging
+ with a command such as "echo 1 > /sys/kernel/debug/ext4/mballoc-debug"
diff --git a/fs/ext4/Makefile b/fs/ext4/Makefile
new file mode 100644
index 00000000..56fd8f86
--- /dev/null
+++ b/fs/ext4/Makefile
@@ -0,0 +1,14 @@
+#
+# Makefile for the linux ext4-filesystem routines.
+#
+
+obj-$(CONFIG_EXT4_FS) += ext4.o
+
+ext4-y := balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o page-io.o \
+ ioctl.o namei.o super.o symlink.o hash.o resize.o extents.o \
+ ext4_jbd2.o migrate.o mballoc.o block_validity.o move_extent.o \
+ mmp.o indirect.o
+
+ext4-$(CONFIG_EXT4_FS_XATTR) += xattr.o xattr_user.o xattr_trusted.o
+ext4-$(CONFIG_EXT4_FS_POSIX_ACL) += acl.o
+ext4-$(CONFIG_EXT4_FS_SECURITY) += xattr_security.o
diff --git a/fs/ext4/acl.c b/fs/ext4/acl.c
new file mode 100644
index 00000000..a5c29bb3
--- /dev/null
+++ b/fs/ext4/acl.c
@@ -0,0 +1,439 @@
+/*
+ * linux/fs/ext4/acl.c
+ *
+ * Copyright (C) 2001-2003 Andreas Gruenbacher, <agruen@suse.de>
+ */
+
+#include <linux/init.h>
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/capability.h>
+#include <linux/fs.h>
+#include "ext4_jbd2.h"
+#include "ext4.h"
+#include "xattr.h"
+#include "acl.h"
+
+/*
+ * Convert from filesystem to in-memory representation.
+ *
+ * @value: raw xattr payload (header followed by packed entries), may be NULL
+ * @size: length of @value in bytes
+ *
+ * Returns NULL when @value is NULL or the payload holds zero entries,
+ * ERR_PTR(-EINVAL) when the payload is too short, carries the wrong
+ * version, has an impossible length, contains an unknown tag or does not
+ * end exactly at @value + @size, and ERR_PTR(-ENOMEM) when the in-memory
+ * ACL cannot be allocated. Otherwise returns the newly allocated ACL.
+ */
+static struct posix_acl *
+ext4_acl_from_disk(const void *value, size_t size)
+{
+ const char *end = (char *)value + size;
+ int n, count;
+ struct posix_acl *acl;
+
+ if (!value)
+ return NULL;
+ if (size < sizeof(ext4_acl_header))
+ return ERR_PTR(-EINVAL);
+ if (((ext4_acl_header *)value)->a_version !=
+ cpu_to_le32(EXT4_ACL_VERSION))
+ return ERR_PTR(-EINVAL);
+ /* step over the header; value now walks the entry array */
+ value = (char *)value + sizeof(ext4_acl_header);
+ /* entry count is derived purely from the payload length */
+ count = ext4_acl_count(size);
+ if (count < 0)
+ return ERR_PTR(-EINVAL);
+ if (count == 0)
+ return NULL;
+ acl = posix_acl_alloc(count, GFP_NOFS);
+ if (!acl)
+ return ERR_PTR(-ENOMEM);
+ for (n = 0; n < count; n++) {
+ ext4_acl_entry *entry =
+ (ext4_acl_entry *)value;
+ /* tag and perm are read below, so at least a short entry must fit */
+ if ((char *)value + sizeof(ext4_acl_entry_short) > end)
+ goto fail;
+ acl->a_entries[n].e_tag = le16_to_cpu(entry->e_tag);
+ acl->a_entries[n].e_perm = le16_to_cpu(entry->e_perm);
+
+ switch (acl->a_entries[n].e_tag) {
+ /* these tags are stored without an id (short entry) */
+ case ACL_USER_OBJ:
+ case ACL_GROUP_OBJ:
+ case ACL_MASK:
+ case ACL_OTHER:
+ value = (char *)value +
+ sizeof(ext4_acl_entry_short);
+ acl->a_entries[n].e_id = ACL_UNDEFINED_ID;
+ break;
+
+ /* these tags carry an id (full entry); re-check the bound first */
+ case ACL_USER:
+ case ACL_GROUP:
+ value = (char *)value + sizeof(ext4_acl_entry);
+ if ((char *)value > end)
+ goto fail;
+ acl->a_entries[n].e_id =
+ le32_to_cpu(entry->e_id);
+ break;
+
+ default:
+ goto fail;
+ }
+ }
+ /* the entries must consume the payload exactly */
+ if (value != end)
+ goto fail;
+ return acl;
+
+fail:
+ posix_acl_release(acl);
+ return ERR_PTR(-EINVAL);
+}
+
+/*
+ * Serialise an in-memory ACL into the ext4 on-disk layout.
+ *
+ * *size is set to ext4_acl_size(acl->a_count) before the allocation is
+ * attempted.  Returns a kmalloc()ed buffer that the caller must kfree(),
+ * ERR_PTR(-ENOMEM) if the buffer cannot be allocated, or ERR_PTR(-EINVAL)
+ * if an entry carries an unknown tag.
+ */
+static void *
+ext4_acl_to_disk(const struct posix_acl *acl, size_t *size)
+{
+	ext4_acl_header *hdr;
+	char *cursor;
+	size_t i;
+
+	*size = ext4_acl_size(acl->a_count);
+
+	/* sized as if every entry were a full one */
+	hdr = kmalloc(sizeof(ext4_acl_header) +
+		      acl->a_count * sizeof(ext4_acl_entry), GFP_NOFS);
+	if (!hdr)
+		return ERR_PTR(-ENOMEM);
+
+	hdr->a_version = cpu_to_le32(EXT4_ACL_VERSION);
+	cursor = (char *)hdr + sizeof(ext4_acl_header);
+
+	for (i = 0; i < acl->a_count; i++) {
+		ext4_acl_entry *dst = (ext4_acl_entry *)cursor;
+
+		dst->e_tag = cpu_to_le16(acl->a_entries[i].e_tag);
+		dst->e_perm = cpu_to_le16(acl->a_entries[i].e_perm);
+
+		switch (acl->a_entries[i].e_tag) {
+		case ACL_USER_OBJ:
+		case ACL_GROUP_OBJ:
+		case ACL_MASK:
+		case ACL_OTHER:
+			/* no id on disk for these tags */
+			cursor += sizeof(ext4_acl_entry_short);
+			break;
+
+		case ACL_USER:
+		case ACL_GROUP:
+			dst->e_id = cpu_to_le32(acl->a_entries[i].e_id);
+			cursor += sizeof(ext4_acl_entry);
+			break;
+
+		default:
+			goto bad_tag;
+		}
+	}
+	return (char *)hdr;
+
+bad_tag:
+	kfree(hdr);
+	return ERR_PTR(-EINVAL);
+}
+
+/*
+ * Inode operation get_posix_acl().
+ *
+ * Returns NULL when the filesystem is mounted without POSIX_ACL or the
+ * inode has no ACL of the requested type, an ERR_PTR() on failure, and
+ * otherwise the ACL (taken from the inode's ACL cache when present there).
+ * Any non-error result, including NULL, is stored in the cache.
+ *
+ * inode->i_mutex: don't care
+ */
+struct posix_acl *
+ext4_get_acl(struct inode *inode, int type)
+{
+	int xattr_idx;
+	char *buf = NULL;
+	struct posix_acl *acl;
+	int len;
+
+	if (!test_opt(inode->i_sb, POSIX_ACL))
+		return NULL;
+
+	acl = get_cached_acl(inode, type);
+	if (acl != ACL_NOT_CACHED)
+		return acl;
+
+	switch (type) {
+	case ACL_TYPE_DEFAULT:
+		xattr_idx = EXT4_XATTR_INDEX_POSIX_ACL_DEFAULT;
+		break;
+	case ACL_TYPE_ACCESS:
+		xattr_idx = EXT4_XATTR_INDEX_POSIX_ACL_ACCESS;
+		break;
+	default:
+		BUG();
+	}
+
+	/* probe with a NULL buffer; a positive result sizes the allocation */
+	len = ext4_xattr_get(inode, xattr_idx, "", NULL, 0);
+	if (len > 0) {
+		buf = kmalloc(len, GFP_NOFS);
+		if (!buf)
+			return ERR_PTR(-ENOMEM);
+		len = ext4_xattr_get(inode, xattr_idx, "", buf, len);
+	}
+
+	if (len == -ENODATA || len == -ENOSYS)
+		acl = NULL;
+	else if (len <= 0)
+		acl = ERR_PTR(len);
+	else
+		acl = ext4_acl_from_disk(buf, len);
+	kfree(buf);
+
+	if (!IS_ERR(acl))
+		set_cached_acl(inode, type, acl);
+
+	return acl;
+}
+
+/*
+ * Set the access or default ACL of an inode.
+ *
+ * @handle: running journal handle the xattr update is made under
+ * @inode: inode whose ACL is changed
+ * @type: ACL_TYPE_ACCESS or ACL_TYPE_DEFAULT
+ * @acl: new ACL, or NULL
+ *
+ * Returns 0 on success or a negative errno: -EOPNOTSUPP for symlinks,
+ * -EACCES for a default ACL on a non-directory, -EINVAL for an unknown
+ * @type, or whatever the helpers called below report.
+ *
+ * inode->i_mutex: down unless called from ext4_new_inode
+ */
+static int
+ext4_set_acl(handle_t *handle, struct inode *inode, int type,
+ struct posix_acl *acl)
+{
+ int name_index;
+ void *value = NULL;
+ size_t size = 0;
+ int error;
+
+ if (S_ISLNK(inode->i_mode))
+ return -EOPNOTSUPP;
+
+ switch (type) {
+ case ACL_TYPE_ACCESS:
+ name_index = EXT4_XATTR_INDEX_POSIX_ACL_ACCESS;
+ if (acl) {
+ /* may rewrite inode->i_mode through the pointer passed in */
+ error = posix_acl_equiv_mode(acl, &inode->i_mode);
+ if (error < 0)
+ return error;
+ else {
+ inode->i_ctime = ext4_current_time(inode);
+ ext4_mark_inode_dirty(handle, inode);
+ /*
+ * A zero result drops the ACL here, so the xattr
+ * call below is made with a NULL value.
+ */
+ if (error == 0)
+ acl = NULL;
+ }
+ }
+ break;
+
+ case ACL_TYPE_DEFAULT:
+ name_index = EXT4_XATTR_INDEX_POSIX_ACL_DEFAULT;
+ /* only directories take a default ACL; clearing one elsewhere is a no-op */
+ if (!S_ISDIR(inode->i_mode))
+ return acl ? -EACCES : 0;
+ break;
+
+ default:
+ return -EINVAL;
+ }
+ if (acl) {
+ value = ext4_acl_to_disk(acl, &size);
+ if (IS_ERR(value))
+ return (int)PTR_ERR(value);
+ }
+
+ /* with acl == NULL this passes value == NULL and size == 0 */
+ error = ext4_xattr_set_handle(handle, inode, name_index, "",
+ value, size, 0);
+
+ kfree(value);
+ /* keep the inode's cached ACL in step with what was stored */
+ if (!error)
+ set_cached_acl(inode, type, acl);
+
+ return error;
+}
+
+/*
+ * Initialize the ACLs of a new inode. Called from ext4_new_inode.
+ *
+ * Looks up the default ACL of @dir. Without one, the new inode's mode is
+ * masked with the current umask. With one, it is stored as the default
+ * ACL of a new directory and then passed through posix_acl_create() to
+ * derive the access ACL and mode of the new inode.
+ *
+ * Returns 0 on success or a negative errno.
+ *
+ * dir->i_mutex: down
+ * inode->i_mutex: up (access to inode is still exclusive)
+ */
+int
+ext4_init_acl(handle_t *handle, struct inode *inode, struct inode *dir)
+{
+ struct posix_acl *acl = NULL;
+ int error = 0;
+
+ /* symlinks get neither an inherited ACL nor umask masking here */
+ if (!S_ISLNK(inode->i_mode)) {
+ if (test_opt(dir->i_sb, POSIX_ACL)) {
+ acl = ext4_get_acl(dir, ACL_TYPE_DEFAULT);
+ if (IS_ERR(acl))
+ return PTR_ERR(acl);
+ }
+ if (!acl)
+ inode->i_mode &= ~current_umask();
+ }
+ if (test_opt(inode->i_sb, POSIX_ACL) && acl) {
+ /* a new directory inherits the parent's default ACL as its own default */
+ if (S_ISDIR(inode->i_mode)) {
+ error = ext4_set_acl(handle, inode,
+ ACL_TYPE_DEFAULT, acl);
+ if (error)
+ goto cleanup;
+ }
+ /*
+ * NOTE(review): the error return below skips posix_acl_release();
+ * presumably posix_acl_create() disposes of the ACL itself when it
+ * fails - confirm against its implementation.
+ */
+ error = posix_acl_create(&acl, GFP_NOFS, &inode->i_mode);
+ if (error < 0)
+ return error;
+
+ if (error > 0) {
+ /* This is an extended ACL */
+ error = ext4_set_acl(handle, inode, ACL_TYPE_ACCESS, acl);
+ }
+ }
+cleanup:
+ posix_acl_release(acl);
+ return error;
+}
+
+/*
+ * Does chmod for an inode that may have an Access Control List. The
+ * inode->i_mode field must be updated to the desired value by the caller
+ * before calling this function.
+ * Returns 0 on success, or a negative error number.
+ *
+ * We change the ACL rather than storing some ACL entries in the file
+ * mode permission bits (which would be more efficient), because that
+ * would break once additional permissions (like ACL_APPEND, ACL_DELETE
+ * for directories) are added. There are no more bits available in the
+ * file mode.
+ *
+ * inode->i_mutex: down
+ */
+int
+ext4_acl_chmod(struct inode *inode)
+{
+ struct posix_acl *acl;
+ handle_t *handle;
+ int retries = 0;
+ int error;
+
+
+ if (S_ISLNK(inode->i_mode))
+ return -EOPNOTSUPP;
+ if (!test_opt(inode->i_sb, POSIX_ACL))
+ return 0;
+ acl = ext4_get_acl(inode, ACL_TYPE_ACCESS);
+ /* PTR_ERR(NULL) is 0, so an inode without an access ACL reports success */
+ if (IS_ERR(acl) || !acl)
+ return PTR_ERR(acl);
+ /*
+ * NOTE(review): the error return below does not release acl;
+ * presumably posix_acl_chmod() drops it on failure - confirm.
+ */
+ error = posix_acl_chmod(&acl, GFP_KERNEL, inode->i_mode);
+ if (error)
+ return error;
+retry:
+ handle = ext4_journal_start(inode,
+ EXT4_DATA_TRANS_BLOCKS(inode->i_sb));
+ if (IS_ERR(handle)) {
+ error = PTR_ERR(handle);
+ ext4_std_error(inode->i_sb, error);
+ goto out;
+ }
+ error = ext4_set_acl(handle, inode, ACL_TYPE_ACCESS, acl);
+ ext4_journal_stop(handle);
+ /* on ENOSPC, start a fresh transaction and try again if worthwhile */
+ if (error == -ENOSPC &&
+ ext4_should_retry_alloc(inode->i_sb, &retries))
+ goto retry;
+out:
+ posix_acl_release(acl);
+ return error;
+}
+
+/*
+ * Extended attribute handlers
+ */
+/*
+ * List callback for the access ACL attribute: reports the attribute name
+ * (including its terminating NUL) and copies it into @list when a buffer
+ * of sufficient length is supplied.  Returns 0 when the filesystem is
+ * mounted without POSIX_ACL, otherwise the number of bytes the name needs.
+ */
+static size_t
+ext4_xattr_list_acl_access(struct dentry *dentry, char *list, size_t list_len,
+			   const char *name, size_t name_len, int type)
+{
+	const size_t needed = sizeof(POSIX_ACL_XATTR_ACCESS);
+
+	if (!test_opt(dentry->d_sb, POSIX_ACL))
+		return 0;
+	/* size query, or buffer too small: report the length only */
+	if (!list || list_len < needed)
+		return needed;
+	memcpy(list, POSIX_ACL_XATTR_ACCESS, needed);
+	return needed;
+}
+
+/*
+ * List callback for the default ACL attribute: reports the attribute name
+ * (including its terminating NUL) and copies it into @list when a buffer
+ * of sufficient length is supplied.  Returns 0 when the filesystem is
+ * mounted without POSIX_ACL, otherwise the number of bytes the name needs.
+ */
+static size_t
+ext4_xattr_list_acl_default(struct dentry *dentry, char *list, size_t list_len,
+			    const char *name, size_t name_len, int type)
+{
+	const size_t needed = sizeof(POSIX_ACL_XATTR_DEFAULT);
+
+	if (!test_opt(dentry->d_sb, POSIX_ACL))
+		return 0;
+	/* size query, or buffer too small: report the length only */
+	if (!list || list_len < needed)
+		return needed;
+	memcpy(list, POSIX_ACL_XATTR_DEFAULT, needed);
+	return needed;
+}
+
+/*
+ * Get callback shared by both ACL handlers: fetch the ACL of the given
+ * @type and convert it to its xattr form in @buffer.
+ *
+ * Returns the result of posix_acl_to_xattr(), -EINVAL for a non-empty
+ * name suffix, -EOPNOTSUPP when the filesystem is mounted without
+ * POSIX_ACL, -ENODATA when the inode has no such ACL, or the error
+ * reported by ext4_get_acl().
+ */
+static int
+ext4_xattr_get_acl(struct dentry *dentry, const char *name, void *buffer,
+		   size_t size, int type)
+{
+	struct posix_acl *acl;
+	int ret;
+
+	if (strcmp(name, ""))
+		return -EINVAL;
+	if (!test_opt(dentry->d_sb, POSIX_ACL))
+		return -EOPNOTSUPP;
+
+	acl = ext4_get_acl(dentry->d_inode, type);
+	if (IS_ERR(acl))
+		return PTR_ERR(acl);
+	if (!acl)
+		return -ENODATA;
+
+	ret = posix_acl_to_xattr(acl, buffer, size);
+	posix_acl_release(acl);
+	return ret;
+}
+
+/*
+ * Set callback shared by both ACL handlers: parse the xattr @value into
+ * an ACL (a NULL @value means "no ACL"), validate it, and store it under
+ * a journal transaction, retrying on ENOSPC when that is worthwhile.
+ *
+ * Returns 0 on success, -EINVAL for a non-empty name suffix, -EOPNOTSUPP
+ * when the filesystem is mounted without POSIX_ACL, -EPERM when the
+ * caller is neither owner nor capable, or the error reported by the
+ * parsing, validation, journal or ext4_set_acl() steps.
+ *
+ * Every exit taken after the ACL has been built goes through
+ * release_and_out so that the ACL reference is always dropped.
+ */
+static int
+ext4_xattr_set_acl(struct dentry *dentry, const char *name, const void *value,
+		   size_t size, int flags, int type)
+{
+	struct inode *inode = dentry->d_inode;
+	handle_t *handle;
+	struct posix_acl *acl;
+	int error, retries = 0;
+
+	if (strcmp(name, "") != 0)
+		return -EINVAL;
+	if (!test_opt(inode->i_sb, POSIX_ACL))
+		return -EOPNOTSUPP;
+	if (!inode_owner_or_capable(inode))
+		return -EPERM;
+
+	if (value) {
+		acl = posix_acl_from_xattr(value, size);
+		if (IS_ERR(acl))
+			return PTR_ERR(acl);
+		else if (acl) {
+			error = posix_acl_valid(acl);
+			if (error)
+				goto release_and_out;
+		}
+	} else
+		acl = NULL;
+
+retry:
+	handle = ext4_journal_start(inode, EXT4_DATA_TRANS_BLOCKS(inode->i_sb));
+	if (IS_ERR(handle)) {
+		/* do not leak the ACL built by posix_acl_from_xattr() */
+		error = PTR_ERR(handle);
+		goto release_and_out;
+	}
+	error = ext4_set_acl(handle, inode, type, acl);
+	ext4_journal_stop(handle);
+	if (error == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries))
+		goto retry;
+
+release_and_out:
+	posix_acl_release(acl);
+	return error;
+}
+
+/*
+ * xattr handler for the POSIX access ACL attribute.
+ * NOTE(review): .flags presumably reaches the callbacks as their "type"
+ * argument - confirm against the generic xattr dispatch code.
+ */
+const struct xattr_handler ext4_xattr_acl_access_handler = {
+ .prefix = POSIX_ACL_XATTR_ACCESS,
+ .flags = ACL_TYPE_ACCESS,
+ .list = ext4_xattr_list_acl_access,
+ .get = ext4_xattr_get_acl,
+ .set = ext4_xattr_set_acl,
+};
+
+/*
+ * xattr handler for the POSIX default ACL attribute; shares its get/set
+ * callbacks with the access handler.
+ * NOTE(review): .flags presumably reaches the callbacks as their "type"
+ * argument - confirm against the generic xattr dispatch code.
+ */
+const struct xattr_handler ext4_xattr_acl_default_handler = {
+ .prefix = POSIX_ACL_XATTR_DEFAULT,
+ .flags = ACL_TYPE_DEFAULT,
+ .list = ext4_xattr_list_acl_default,
+ .get = ext4_xattr_get_acl,
+ .set = ext4_xattr_set_acl,
+};
diff --git a/fs/ext4/acl.h b/fs/ext4/acl.h
new file mode 100644
index 00000000..18cb39ed
--- /dev/null
+++ b/fs/ext4/acl.h
@@ -0,0 +1,77 @@
+/*
+ File: fs/ext4/acl.h
+
+ (C) 2001 Andreas Gruenbacher, <a.gruenbacher@computer.org>
+*/
+
+#include <linux/posix_acl_xattr.h>
+
+#define EXT4_ACL_VERSION 0x0001
+
+/* On-disk ACL entry that carries an id; all fields are little-endian. */
+typedef struct {
+ __le16 e_tag;
+ __le16 e_perm;
+ __le32 e_id;
+} ext4_acl_entry;
+
+/* On-disk ACL entry without an id: same leading fields as ext4_acl_entry. */
+typedef struct {
+ __le16 e_tag;
+ __le16 e_perm;
+} ext4_acl_entry_short;
+
+/* Header placed in front of the on-disk entries; holds the format version. */
+typedef struct {
+ __le32 a_version;
+} ext4_acl_header;
+
+/*
+ * On-disk size in bytes of an ACL with @count entries: the header, short
+ * entries for the first four, and full entries for every one after that.
+ */
+static inline size_t ext4_acl_size(int count)
+{
+	size_t bytes = sizeof(ext4_acl_header);
+
+	if (count > 4)
+		bytes += 4 * sizeof(ext4_acl_entry_short) +
+			 (count - 4) * sizeof(ext4_acl_entry);
+	else
+		bytes += count * sizeof(ext4_acl_entry_short);
+
+	return bytes;
+}
+
+/*
+ * Inverse of ext4_acl_size(): number of entries in an on-disk ACL of
+ * @size bytes (header included), or -1 if @size does not correspond to a
+ * whole number of entries.  Callers are expected to have checked that
+ * @size covers at least the header.
+ */
+static inline int ext4_acl_count(size_t size)
+{
+	const size_t short_sz = sizeof(ext4_acl_entry_short);
+	const size_t full_sz = sizeof(ext4_acl_entry);
+	ssize_t extra;
+
+	size -= sizeof(ext4_acl_header);
+	/* bytes left over once four short entries are accounted for */
+	extra = size - 4 * short_sz;
+
+	if (extra >= 0) {
+		if (extra % full_sz)
+			return -1;
+		return extra / full_sz + 4;
+	}
+
+	if (size % short_sz)
+		return -1;
+	return size / short_sz;
+}
+
+#ifdef CONFIG_EXT4_FS_POSIX_ACL
+
+/* acl.c */
+struct posix_acl *ext4_get_acl(struct inode *inode, int type);
+extern int ext4_acl_chmod(struct inode *);
+extern int ext4_init_acl(handle_t *, struct inode *, struct inode *);
+
+#else /* CONFIG_EXT4_FS_POSIX_ACL */
+#include <linux/sched.h>
+#define ext4_get_acl NULL
+
+/* Stub used when CONFIG_EXT4_FS_POSIX_ACL is not set: always returns 0. */
+static inline int
+ext4_acl_chmod(struct inode *inode)
+{
+ return 0;
+}
+
+/* Stub used when CONFIG_EXT4_FS_POSIX_ACL is not set: always returns 0. */
+static inline int
+ext4_init_acl(handle_t *handle, struct inode *inode, struct inode *dir)
+{
+ return 0;
+}
+#endif /* CONFIG_EXT4_FS_POSIX_ACL */
+
diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c
new file mode 100644
index 00000000..8da837be
--- /dev/null
+++ b/fs/ext4/balloc.c
@@ -0,0 +1,766 @@
+/*
+ * linux/fs/ext4/balloc.c
+ *
+ * Copyright (C) 1992, 1993, 1994, 1995
+ * Remy Card (card@masi.ibp.fr)
+ * Laboratoire MASI - Institut Blaise Pascal
+ * Universite Pierre et Marie Curie (Paris VI)
+ *
+ * Enhanced block allocation by Stephen Tweedie (sct@redhat.com), 1993
+ * Big-endian to little-endian byte-swapping/bitmaps by
+ * David S. Miller (davem@caip.rutgers.edu), 1995
+ */
+
+#include <linux/time.h>
+#include <linux/capability.h>
+#include <linux/fs.h>
+#include <linux/jbd2.h>
+#include <linux/quotaops.h>
+#include <linux/buffer_head.h>
+#include "ext4.h"
+#include "ext4_jbd2.h"
+#include "mballoc.h"
+
+#include <trace/events/ext4.h>
+
+static unsigned ext4_num_base_meta_clusters(struct super_block *sb,
+ ext4_group_t block_group);
+/*
+ * balloc.c contains the blocks allocation and deallocation routines
+ */
+
+/*
+ * Translate a filesystem block number into the block group it belongs
+ * to and its offset within that group's block/cluster allocation bitmap.
+ * Either output pointer may be NULL when the caller does not need it.
+ */
+void ext4_get_group_no_and_offset(struct super_block *sb, ext4_fsblk_t blocknr,
+		ext4_group_t *blockgrpp, ext4_grpblk_t *offsetp)
+{
+	struct ext4_sb_info *sbi = EXT4_SB(sb);
+	ext4_grpblk_t cluster_off;
+
+	blocknr -= le32_to_cpu(sbi->s_es->s_first_data_block);
+	/* do_div() leaves the quotient in blocknr and yields the remainder */
+	cluster_off = do_div(blocknr, EXT4_BLOCKS_PER_GROUP(sb)) >>
+		sbi->s_cluster_bits;
+
+	if (blockgrpp)
+		*blockgrpp = blocknr;
+	if (offsetp)
+		*offsetp = cluster_off;
+}
+
+/* Return 1 if @block lies inside block group @block_group, 0 otherwise. */
+static int ext4_block_in_group(struct super_block *sb, ext4_fsblk_t block,
+			       ext4_group_t block_group)
+{
+	ext4_group_t owner;
+
+	ext4_get_group_no_and_offset(sb, block, &owner, NULL);
+	return owner == block_group;
+}
+
+/* Return the number of clusters used for file system metadata; this
+ * represents the overhead needed by the file system.
+ *
+ * @sb: super block
+ * @block_group: group whose overhead is computed
+ * @gdp: group descriptor of @block_group
+ *
+ * The result is the base metadata cluster count plus every distinct
+ * cluster inside the group that holds the block bitmap, the inode bitmap
+ * or a block of the inode table.
+ */
+unsigned ext4_num_overhead_clusters(struct super_block *sb,
+ ext4_group_t block_group,
+ struct ext4_group_desc *gdp)
+{
+ unsigned num_clusters;
+ /* -1 means "not tracked separately" for each of these cluster slots */
+ int block_cluster = -1, inode_cluster = -1, itbl_cluster = -1, i, c;
+ ext4_fsblk_t start = ext4_group_first_block_no(sb, block_group);
+ ext4_fsblk_t itbl_blk;
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
+
+ /* This is the number of clusters used by the superblock,
+ * block group descriptors, and reserved block group
+ * descriptor blocks */
+ num_clusters = ext4_num_base_meta_clusters(sb, block_group);
+
+ /*
+ * For the allocation bitmaps and inode table, we first need
+ * to check to see if the block is in the block group. If it
+ * is, then check to see if the cluster is already accounted
+ * for in the clusters used for the base metadata cluster, or
+ * if we can increment the base metadata cluster to include
+ * that block. Otherwise, we will have to track the cluster
+ * used for the allocation bitmap or inode table explicitly.
+ * Normally all of these blocks are contiguous, so the special
+ * case handling shouldn't be necessary except for *very*
+ * unusual file system layouts.
+ */
+ if (ext4_block_in_group(sb, ext4_block_bitmap(sb, gdp), block_group)) {
+ block_cluster = EXT4_B2C(sbi,
+ ext4_block_bitmap(sb, gdp) - start);
+ if (block_cluster < num_clusters)
+ block_cluster = -1;
+ else if (block_cluster == num_clusters) {
+ num_clusters++;
+ block_cluster = -1;
+ }
+ }
+
+ if (ext4_block_in_group(sb, ext4_inode_bitmap(sb, gdp), block_group)) {
+ inode_cluster = EXT4_B2C(sbi,
+ ext4_inode_bitmap(sb, gdp) - start);
+ if (inode_cluster < num_clusters)
+ inode_cluster = -1;
+ else if (inode_cluster == num_clusters) {
+ num_clusters++;
+ inode_cluster = -1;
+ }
+ }
+
+ itbl_blk = ext4_inode_table(sb, gdp);
+ for (i = 0; i < sbi->s_itb_per_group; i++) {
+ if (ext4_block_in_group(sb, itbl_blk + i, block_group)) {
+ c = EXT4_B2C(sbi, itbl_blk + i - start);
+ /* cluster already counted: nothing to add */
+ if ((c < num_clusters) || (c == inode_cluster) ||
+ (c == block_cluster) || (c == itbl_cluster))
+ continue;
+ /* cluster directly extends the contiguous counted range */
+ if (c == num_clusters) {
+ num_clusters++;
+ continue;
+ }
+ /* detached cluster: count it and remember it as the last one seen */
+ num_clusters++;
+ itbl_cluster = c;
+ }
+ }
+
+ /* bitmap clusters still tracked separately were not counted above */
+ if (block_cluster != -1)
+ num_clusters++;
+ if (inode_cluster != -1)
+ num_clusters++;
+
+ return num_clusters;
+}
+
+/*
+ * Number of clusters covered by @block_group.  Every group spans
+ * EXT4_BLOCKS_PER_GROUP() blocks except the last one, which ends where
+ * the filesystem ends.
+ */
+static unsigned int num_clusters_in_group(struct super_block *sb,
+					  ext4_group_t block_group)
+{
+	unsigned int blocks = EXT4_BLOCKS_PER_GROUP(sb);
+
+	/*
+	 * Even though mke2fs always initializes the first and
+	 * last group, just in case some other tool was used,
+	 * we need to make sure we calculate the right free
+	 * blocks.
+	 */
+	if (block_group == ext4_get_groups_count(sb) - 1)
+		blocks = ext4_blocks_count(EXT4_SB(sb)->s_es) -
+			ext4_group_first_block_no(sb, block_group);
+
+	return EXT4_NUM_B2C(EXT4_SB(sb), blocks);
+}
+
+/*
+ * Initializes an uninitialized block bitmap
+ *
+ * @sb: super block
+ * @bh: locked buffer that receives the bitmap contents
+ * @block_group: group the bitmap belongs to
+ * @gdp: group descriptor of @block_group
+ *
+ * On a bad descriptor checksum the whole bitmap is filled with ones and
+ * the descriptor's free counts are zeroed. Otherwise the bitmap is
+ * cleared and bits are set for the base metadata, the bitmaps and inode
+ * table, and the tail beyond the last cluster of the group.
+ */
+void ext4_init_block_bitmap(struct super_block *sb, struct buffer_head *bh,
+ ext4_group_t block_group,
+ struct ext4_group_desc *gdp)
+{
+ unsigned int bit, bit_max;
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
+ ext4_fsblk_t start, tmp;
+ int flex_bg = 0;
+
+ J_ASSERT_BH(bh, buffer_locked(bh));
+
+ /* If checksum is bad mark all blocks used to prevent allocation
+ * essentially implementing a per-group read-only flag. */
+ if (!ext4_group_desc_csum_verify(sbi, block_group, gdp)) {
+ ext4_error(sb, "Checksum bad for group %u", block_group);
+ ext4_free_group_clusters_set(sb, gdp, 0);
+ ext4_free_inodes_set(sb, gdp, 0);
+ ext4_itable_unused_set(sb, gdp, 0);
+ memset(bh->b_data, 0xff, sb->s_blocksize);
+ return;
+ }
+ memset(bh->b_data, 0, sb->s_blocksize);
+
+ /* the base metadata occupies the first bit_max clusters of the group */
+ bit_max = ext4_num_base_meta_clusters(sb, block_group);
+ for (bit = 0; bit < bit_max; bit++)
+ ext4_set_bit(bit, bh->b_data);
+
+ start = ext4_group_first_block_no(sb, block_group);
+
+ if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_FLEX_BG))
+ flex_bg = 1;
+
+ /* Set bits for block and inode bitmaps, and inode table */
+ /* with flex_bg only blocks that really lie in this group are marked */
+ tmp = ext4_block_bitmap(sb, gdp);
+ if (!flex_bg || ext4_block_in_group(sb, tmp, block_group))
+ ext4_set_bit(EXT4_B2C(sbi, tmp - start), bh->b_data);
+
+ tmp = ext4_inode_bitmap(sb, gdp);
+ if (!flex_bg || ext4_block_in_group(sb, tmp, block_group))
+ ext4_set_bit(EXT4_B2C(sbi, tmp - start), bh->b_data);
+
+ tmp = ext4_inode_table(sb, gdp);
+ for (; tmp < ext4_inode_table(sb, gdp) +
+ sbi->s_itb_per_group; tmp++) {
+ if (!flex_bg || ext4_block_in_group(sb, tmp, block_group))
+ ext4_set_bit(EXT4_B2C(sbi, tmp - start), bh->b_data);
+ }
+
+ /*
+ * Also if the number of blocks within the group is less than
+ * the blocksize * 8 ( which is the size of bitmap ), set rest
+ * of the block bitmap to 1
+ */
+ ext4_mark_bitmap_end(num_clusters_in_group(sb, block_group),
+ sb->s_blocksize * 8, bh->b_data);
+}
+
+/*
+ * Return the number of free clusters in a block group.  It is used when
+ * the block bitmap is uninitialized, so we can't just count the bits
+ * in the bitmap: the answer is the group's cluster count minus its
+ * metadata overhead.
+ */
+unsigned ext4_free_clusters_after_init(struct super_block *sb,
+				       ext4_group_t block_group,
+				       struct ext4_group_desc *gdp)
+{
+	unsigned int total = num_clusters_in_group(sb, block_group);
+	unsigned overhead = ext4_num_overhead_clusters(sb, block_group, gdp);
+
+	return total - overhead;
+}
+
+/*
+ * The free blocks are managed by bitmaps. A file system contains several
+ * blocks groups. Each group contains 1 bitmap block for blocks, 1 bitmap
+ * block for inodes, N blocks for the inode table and data blocks.
+ *
+ * The file system contains group descriptors which are located after the
+ * super block. Each descriptor contains the number of the bitmap block and
+ * the free blocks count in the block. The descriptors are loaded in memory
+ * when a file system is mounted (see ext4_fill_super).
+ */
+
+/**
+ * ext4_get_group_desc() -- look up the in-memory group descriptor
+ * @sb:			super block
+ * @block_group:	given block group
+ * @bh:			if non-NULL, receives the buffer head that holds
+ *			the block group descriptor
+ *
+ * Returns a pointer to the descriptor inside the already loaded
+ * descriptor buffer, or NULL (after reporting an ext4 error) when the
+ * group number is out of range or its descriptor block is not loaded.
+ */
+struct ext4_group_desc * ext4_get_group_desc(struct super_block *sb,
+					     ext4_group_t block_group,
+					     struct buffer_head **bh)
+{
+	struct ext4_sb_info *sbi = EXT4_SB(sb);
+	ext4_group_t ngroups = ext4_get_groups_count(sb);
+	unsigned int desc_blk;
+	unsigned int desc_idx;
+	struct buffer_head *desc_bh;
+
+	if (block_group >= ngroups) {
+		ext4_error(sb, "block_group >= groups_count - block_group = %u,"
+			   " groups_count = %u", block_group, ngroups);
+
+		return NULL;
+	}
+
+	/* which descriptor block, and which slot inside that block */
+	desc_blk = block_group >> EXT4_DESC_PER_BLOCK_BITS(sb);
+	desc_idx = block_group & (EXT4_DESC_PER_BLOCK(sb) - 1);
+
+	desc_bh = sbi->s_group_desc[desc_blk];
+	if (!desc_bh) {
+		ext4_error(sb, "Group descriptor not loaded - "
+			   "block_group = %u, group_desc = %u, desc = %u",
+			   block_group, desc_blk, desc_idx);
+		return NULL;
+	}
+
+	if (bh)
+		*bh = desc_bh;
+
+	return (struct ext4_group_desc *)(
+		(__u8 *)desc_bh->b_data + desc_idx * EXT4_DESC_SIZE(sb));
+}
+
+/*
+ * Sanity-check a block bitmap that has been read from disk: the bits
+ * covering the group's own block bitmap, inode bitmap and inode table
+ * must all be set.
+ *
+ * Returns 1 if the bitmap passes (or if FLEX_BG is enabled, in which
+ * case no check is made), 0 after reporting an ext4 error otherwise.
+ */
+static int ext4_valid_block_bitmap(struct super_block *sb,
+ struct ext4_group_desc *desc,
+ unsigned int block_group,
+ struct buffer_head *bh)
+{
+ ext4_grpblk_t offset;
+ ext4_grpblk_t next_zero_bit;
+ ext4_fsblk_t bitmap_blk;
+ ext4_fsblk_t group_first_block;
+
+ if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_FLEX_BG)) {
+ /* with FLEX_BG, the inode/block bitmaps and itable
+ * blocks may not be in the group at all
+ * so the bitmap validation will be skipped for those groups
+ * or it has to also read the block group where the bitmaps
+ * are located to verify they are set.
+ */
+ return 1;
+ }
+ group_first_block = ext4_group_first_block_no(sb, block_group);
+
+ /* check whether block bitmap block number is set */
+ bitmap_blk = ext4_block_bitmap(sb, desc);
+ offset = bitmap_blk - group_first_block;
+ if (!ext4_test_bit(offset, bh->b_data))
+ /* bad block bitmap */
+ goto err_out;
+
+ /* check whether the inode bitmap block number is set */
+ bitmap_blk = ext4_inode_bitmap(sb, desc);
+ offset = bitmap_blk - group_first_block;
+ if (!ext4_test_bit(offset, bh->b_data))
+ /* bad block bitmap */
+ goto err_out;
+
+ /* check whether the inode table block number is set */
+ bitmap_blk = ext4_inode_table(sb, desc);
+ offset = bitmap_blk - group_first_block;
+ /* search the inode-table bit range for any clear bit */
+ next_zero_bit = ext4_find_next_zero_bit(bh->b_data,
+ offset + EXT4_SB(sb)->s_itb_per_group,
+ offset);
+ if (next_zero_bit >= offset + EXT4_SB(sb)->s_itb_per_group)
+ /* good bitmap for inode tables */
+ return 1;
+
+err_out:
+ /* bitmap_blk holds the block number of whichever check failed */
+ ext4_error(sb, "Invalid block bitmap - block_group = %d, block = %llu",
+ block_group, bitmap_blk);
+ return 0;
+}
+/**
+ * ext4_read_block_bitmap_nowait()
+ * @sb: super block
+ * @block_group: given block group
+ *
+ * Get the block bitmap buffer for a given block_group. If the bitmap is
+ * neither cached nor uninitialized, a read is submitted and the buffer is
+ * returned without waiting for it; such a buffer has its "new" flag set
+ * and must be passed to ext4_wait_block_bitmap() before it is used.
+ *
+ * Return buffer_head on success or NULL in case of failure.
+ */
+struct buffer_head *
+ext4_read_block_bitmap_nowait(struct super_block *sb, ext4_group_t block_group)
+{
+ struct ext4_group_desc *desc;
+ struct buffer_head *bh;
+ ext4_fsblk_t bitmap_blk;
+
+ desc = ext4_get_group_desc(sb, block_group, NULL);
+ if (!desc)
+ return NULL;
+ bitmap_blk = ext4_block_bitmap(sb, desc);
+ bh = sb_getblk(sb, bitmap_blk);
+ if (unlikely(!bh)) {
+ ext4_error(sb, "Cannot get buffer for block bitmap - "
+ "block_group = %u, block_bitmap = %llu",
+ block_group, bitmap_blk);
+ return NULL;
+ }
+
+ /* fast path: bitmap already valid, no locking needed */
+ if (bitmap_uptodate(bh))
+ return bh;
+
+ lock_buffer(bh);
+ /* re-check under the buffer lock */
+ if (bitmap_uptodate(bh)) {
+ unlock_buffer(bh);
+ return bh;
+ }
+ ext4_lock_group(sb, block_group);
+ /* uninitialized group: build the bitmap in memory instead of reading it */
+ if (desc->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) {
+ ext4_init_block_bitmap(sb, bh, block_group, desc);
+ set_bitmap_uptodate(bh);
+ set_buffer_uptodate(bh);
+ ext4_unlock_group(sb, block_group);
+ unlock_buffer(bh);
+ return bh;
+ }
+ ext4_unlock_group(sb, block_group);
+ if (buffer_uptodate(bh)) {
+ /*
+ * if not uninit if bh is uptodate,
+ * bitmap is also uptodate
+ */
+ set_bitmap_uptodate(bh);
+ unlock_buffer(bh);
+ return bh;
+ }
+ /*
+ * submit the buffer_head for reading
+ */
+ /* the "new" flag tells ext4_wait_block_bitmap() that a read was issued */
+ set_buffer_new(bh);
+ trace_ext4_read_block_bitmap_load(sb, block_group);
+ bh->b_end_io = ext4_end_bitmap_read;
+ get_bh(bh);
+ submit_bh(READ, bh);
+ return bh;
+}
+
+/*
+ * Wait for a block bitmap read issued by ext4_read_block_bitmap_nowait()
+ * to complete and run the bitmap sanity check on it.  A buffer whose
+ * "new" flag is not set had no read issued and is accepted as is.
+ *
+ * Returns 0 on success, 1 on error
+ */
+int ext4_wait_block_bitmap(struct super_block *sb, ext4_group_t block_group,
+			   struct buffer_head *bh)
+{
+	struct ext4_group_desc *gdp;
+
+	if (!buffer_new(bh))
+		return 0;
+
+	gdp = ext4_get_group_desc(sb, block_group, NULL);
+	if (gdp == NULL)
+		return 1;
+
+	wait_on_buffer(bh);
+	if (buffer_uptodate(bh)) {
+		clear_buffer_new(bh);
+		/* Panic or remount fs read-only if block bitmap is invalid */
+		ext4_valid_block_bitmap(sb, gdp, block_group, bh);
+		return 0;
+	}
+
+	ext4_error(sb, "Cannot read block bitmap - "
+		   "block_group = %u, block_bitmap = %llu",
+		   block_group, (unsigned long long) bh->b_blocknr);
+	return 1;
+}
+
+/*
+ * Read the block bitmap of @block_group and wait until it is usable.
+ *
+ * Returns the bitmap buffer_head, or NULL if the buffer could not be
+ * obtained or the read failed (the buffer reference is dropped in the
+ * latter case).
+ */
+struct buffer_head *
+ext4_read_block_bitmap(struct super_block *sb, ext4_group_t block_group)
+{
+	struct buffer_head *bh;
+
+	bh = ext4_read_block_bitmap_nowait(sb, block_group);
+	/*
+	 * The nowait variant returns NULL on failure; the wait helper
+	 * dereferences bh, so it must not be handed a NULL buffer.
+	 */
+	if (!bh)
+		return NULL;
+	if (ext4_wait_block_bitmap(sb, block_group, bh)) {
+		put_bh(bh);
+		return NULL;
+	}
+	return bh;
+}
+
+/**
+ * ext4_has_free_clusters()
+ * @sbi:	in-core super block structure.
+ * @nclusters:	number of needed clusters
+ * @flags:	flags from ext4_mb_new_blocks()
+ *
+ * Check if filesystem has nclusters free & available for allocation.
+ * Every quantity compared here is expressed in clusters.
+ * On success return 1, return 0 on failure.
+ */
+static int ext4_has_free_clusters(struct ext4_sb_info *sbi,
+				  s64 nclusters, unsigned int flags)
+{
+	s64 free_clusters, dirty_clusters, root_clusters;
+	struct percpu_counter *fcc = &sbi->s_freeclusters_counter;
+	struct percpu_counter *dcc = &sbi->s_dirtyclusters_counter;
+
+	free_clusters  = percpu_counter_read_positive(fcc);
+	dirty_clusters = percpu_counter_read_positive(dcc);
+	/*
+	 * The reserved count is kept in blocks.  Convert it with the
+	 * rounding-up helper so a partly reserved cluster still counts
+	 * as reserved.
+	 */
+	root_clusters = EXT4_NUM_B2C(sbi, ext4_r_blocks_count(sbi->s_es));
+
+	if (free_clusters - (nclusters + root_clusters + dirty_clusters) <
+					EXT4_FREECLUSTERS_WATERMARK) {
+		/*
+		 * Close to the limit: replace the cheap counter reads with
+		 * full sums.  The counter already holds clusters, so it
+		 * must not be scaled to blocks before the comparisons below.
+		 */
+		free_clusters  = percpu_counter_sum_positive(fcc);
+		dirty_clusters = percpu_counter_sum_positive(dcc);
+	}
+	/* Check whether we have space after accounting for current
+	 * dirty clusters & root reserved clusters.
+	 */
+	if (free_clusters >= ((root_clusters + nclusters) + dirty_clusters))
+		return 1;
+
+	/* Hm, nope.  Are (enough) root reserved clusters available? */
+	if (sbi->s_resuid == current_fsuid() ||
+	    ((sbi->s_resgid != 0) && in_group_p(sbi->s_resgid)) ||
+	    capable(CAP_SYS_RESOURCE) ||
+	    (flags & EXT4_MB_USE_ROOT_BLOCKS)) {
+
+		if (free_clusters >= (nclusters + dirty_clusters))
+			return 1;
+	}
+
+	return 0;
+}
+
+/*
+ * Reserve @nclusters by adding them to the dirty-clusters counter.
+ * Returns 0 on success, or -ENOSPC when ext4_has_free_clusters() reports
+ * that not enough clusters are available.
+ */
+int ext4_claim_free_clusters(struct ext4_sb_info *sbi,
+			     s64 nclusters, unsigned int flags)
+{
+	if (!ext4_has_free_clusters(sbi, nclusters, flags))
+		return -ENOSPC;
+
+	percpu_counter_add(&sbi->s_dirtyclusters_counter, nclusters);
+	return 0;
+}
+
+/**
+ * ext4_should_retry_alloc()
+ * @sb:			super block
+ * @retries:		number of attempts that have been made
+ *
+ * ext4_should_retry_alloc() is called when ENOSPC is returned, and if
+ * it is profitable to retry the operation, this function forces a
+ * commit of the journal and returns the result of that call.
+ *
+ * Returns 0 when no free cluster is available, when *retries (read
+ * before it is incremented) is greater than three, or when the
+ * filesystem has no journal.
+ */
+int ext4_should_retry_alloc(struct super_block *sb, int *retries)
+{
+	struct ext4_sb_info *sbi = EXT4_SB(sb);
+
+	if (!ext4_has_free_clusters(sbi, 1, 0))
+		return 0;
+	/* the attempt counter only advances once free space has been seen */
+	if ((*retries)++ > 3)
+		return 0;
+	if (!sbi->s_journal)
+		return 0;
+
+	jbd_debug(1, "%s: retrying operation after ENOSPC\n", sb->s_id);
+
+	return jbd2_journal_force_commit_nested(sbi->s_journal);
+}
+
+/*
+ * ext4_new_meta_blocks() -- allocate block for meta data (indexing) blocks
+ *
+ * @handle: handle to this transaction
+ * @inode: file inode
+ * @goal: given target block(filesystem wide)
+ * @flags: allocation flags, copied unchanged into the request (ar.flags)
+ * @count: pointer to total number of clusters needed; a NULL pointer
+ * requests a length of 1. When non-NULL it is overwritten with
+ * ar.len after the allocation attempt.
+ * @errp: error code
+ *
+ * Return 1st allocated block number on success, *count stores total account
+ * error stores in errp pointer
+ */
+ext4_fsblk_t ext4_new_meta_blocks(handle_t *handle, struct inode *inode,
+ ext4_fsblk_t goal, unsigned int flags,
+ unsigned long *count, int *errp)
+{
+ struct ext4_allocation_request ar;
+ ext4_fsblk_t ret;
+
+ memset(&ar, 0, sizeof(ar));
+ /* Fill with neighbour allocated blocks */
+ ar.inode = inode;
+ ar.goal = goal;
+ ar.len = count ? *count : 1;
+ ar.flags = flags;
+
+ ret = ext4_mb_new_blocks(handle, &ar, errp);
+ if (count)
+ *count = ar.len;
+ /*
+ * Account for the allocated meta blocks. We will never
+ * fail EDQUOT for metadata, but we do account for it.
+ * This is done only when the allocation reported no error and the
+ * inode has EXT4_STATE_DELALLOC_RESERVED set.
+ */
+ if (!(*errp) &&
+ ext4_test_inode_state(inode, EXT4_STATE_DELALLOC_RESERVED)) {
+ spin_lock(&EXT4_I(inode)->i_block_reservation_lock);
+ EXT4_I(inode)->i_allocated_meta_blocks += ar.len;
+ spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
+ /* quota is charged in blocks, hence the EXT4_C2B() conversion */
+ dquot_alloc_block_nofail(inode,
+ EXT4_C2B(EXT4_SB(inode->i_sb), ar.len));
+ }
+ return ret;
+}
+
+/**
+ * ext4_count_free_clusters() -- count filesystem free clusters
+ * @sb: superblock
+ *
+ * Adds up the number of free clusters from each block group.
+ *
+ * Groups whose descriptor cannot be obtained are skipped. Without
+ * EXT4FS_DEBUG the sum of the per-descriptor counts is returned. With
+ * EXT4FS_DEBUG each group's bitmap is also read and counted, both totals
+ * are printed, and the bitmap-derived total is returned instead.
+ */
+ext4_fsblk_t ext4_count_free_clusters(struct super_block *sb)
+{
+ ext4_fsblk_t desc_count;
+ struct ext4_group_desc *gdp;
+ ext4_group_t i;
+ ext4_group_t ngroups = ext4_get_groups_count(sb);
+#ifdef EXT4FS_DEBUG
+ struct ext4_super_block *es;
+ ext4_fsblk_t bitmap_count;
+ unsigned int x;
+ struct buffer_head *bitmap_bh = NULL;
+
+ es = EXT4_SB(sb)->s_es;
+ desc_count = 0;
+ bitmap_count = 0;
+ gdp = NULL;
+
+ for (i = 0; i < ngroups; i++) {
+ gdp = ext4_get_group_desc(sb, i, NULL);
+ if (!gdp)
+ continue;
+ desc_count += ext4_free_group_clusters(sb, gdp);
+ /* drop the previous group's bitmap before reading the next one */
+ brelse(bitmap_bh);
+ bitmap_bh = ext4_read_block_bitmap(sb, i);
+ if (bitmap_bh == NULL)
+ continue;
+
+ x = ext4_count_free(bitmap_bh, sb->s_blocksize);
+ printk(KERN_DEBUG "group %u: stored = %d, counted = %u\n",
+ i, ext4_free_group_clusters(sb, gdp), x);
+ bitmap_count += x;
+ }
+ /* release the bitmap left over from the final iteration */
+ brelse(bitmap_bh);
+ printk(KERN_DEBUG "ext4_count_free_clusters: stored = %llu"
+ ", computed = %llu, %llu\n",
+ EXT4_B2C(EXT4_SB(sb), ext4_free_blocks_count(es)),
+ desc_count, bitmap_count);
+ return bitmap_count;
+#else
+ desc_count = 0;
+ for (i = 0; i < ngroups; i++) {
+ gdp = ext4_get_group_desc(sb, i, NULL);
+ if (!gdp)
+ continue;
+ desc_count += ext4_free_group_clusters(sb, gdp);
+ }
+
+ return desc_count;
+#endif
+}
+
+/*
+ * Return 1 if @a is a positive integral power of @b (b, b*b, ...), and
+ * 0 otherwise.  Callers in this file pass b = 3, 5 or 7.
+ *
+ * The check divides @a down instead of multiplying a running product
+ * up: a product held in an int overflows for large group numbers, which
+ * is undefined behaviour and can keep the loop from terminating.
+ */
+static inline int test_root(ext4_group_t a, int b)
+{
+	while (1) {
+		if (a < b)
+			return 0;
+		if (a == b)
+			return 1;
+		/* not divisible by b, so it cannot be a power of b */
+		if ((a % b) != 0)
+			return 0;
+		a = a / b;
+	}
+}
+
+/*
+ * Return 1 for groups 0 and 1 and for any group number that is a power
+ * of 3, 5 or 7; return 0 for every other group.
+ */
+static int ext4_group_sparse(ext4_group_t group)
+{
+	if (group <= 1)
+		return 1;
+
+	/* powers of odd numbers are odd, so even groups never match */
+	if ((group & 1) == 0)
+		return 0;
+
+	if (test_root(group, 7))
+		return 1;
+	if (test_root(group, 5))
+		return 1;
+	return test_root(group, 3) ? 1 : 0;
+}
+
+/**
+ *	ext4_bg_has_super - number of blocks used by the superblock in group
+ *	@sb: superblock for filesystem
+ *	@group: group number to check
+ *
+ *	Return the number of blocks used by the superblock (primary or backup)
+ *	in this group.  Currently this will be only 0 or 1: without the
+ *	SPARSE_SUPER feature every group reports 1, with it only the groups
+ *	accepted by ext4_group_sparse() do.
+ */
+int ext4_bg_has_super(struct super_block *sb, ext4_group_t group)
+{
+	if (!EXT4_HAS_RO_COMPAT_FEATURE(sb,
+				EXT4_FEATURE_RO_COMPAT_SPARSE_SUPER))
+		return 1;
+
+	if (ext4_group_sparse(group))
+		return 1;
+
+	return 0;
+}
+
+/*
+ * Group-descriptor block count for @group under the META_BG layout:
+ * 1 for the first, second and last group of the meta group that
+ * contains @group, 0 for every other group.
+ */
+static unsigned long ext4_bg_num_gdb_meta(struct super_block *sb,
+					ext4_group_t group)
+{
+	unsigned long meta_idx = group / EXT4_DESC_PER_BLOCK(sb);
+	ext4_group_t grp_first = meta_idx * EXT4_DESC_PER_BLOCK(sb);
+	ext4_group_t grp_last = grp_first + EXT4_DESC_PER_BLOCK(sb) - 1;
+
+	if (group == grp_first)
+		return 1;
+	if (group == grp_first + 1)
+		return 1;
+	return (group == grp_last) ? 1 : 0;
+}
+
+/*
+ * Group-descriptor block count for @group outside the META_BG layout:
+ * 0 when the group holds no superblock copy; otherwise s_first_meta_bg
+ * if the META_BG feature is set, or s_gdb_count if it is not.
+ */
+static unsigned long ext4_bg_num_gdb_nometa(struct super_block *sb,
+					ext4_group_t group)
+{
+	if (!ext4_bg_has_super(sb, group))
+		return 0;
+
+	if (!EXT4_HAS_INCOMPAT_FEATURE(sb,EXT4_FEATURE_INCOMPAT_META_BG))
+		return EXT4_SB(sb)->s_gdb_count;
+
+	return le32_to_cpu(EXT4_SB(sb)->s_es->s_first_meta_bg);
+}
+
+/**
+ *	ext4_bg_num_gdb - number of blocks used by the group table in group
+ *	@sb: superblock for filesystem
+ *	@group: group number to check
+ *
+ *	Return the number of blocks used by the group descriptor table
+ *	(primary or backup) in this group.  Groups whose meta group index is
+ *	at or beyond s_first_meta_bg on a META_BG filesystem use the META_BG
+ *	rule; all other groups use the traditional rule.
+ */
+unsigned long ext4_bg_num_gdb(struct super_block *sb, ext4_group_t group)
+{
+	unsigned long first_meta_bg =
+			le32_to_cpu(EXT4_SB(sb)->s_es->s_first_meta_bg);
+	unsigned long metagroup = group / EXT4_DESC_PER_BLOCK(sb);
+
+	if (EXT4_HAS_INCOMPAT_FEATURE(sb,EXT4_FEATURE_INCOMPAT_META_BG) &&
+	    metagroup >= first_meta_bg)
+		return ext4_bg_num_gdb_meta(sb,group);
+
+	return ext4_bg_num_gdb_nometa(sb, group);
+}
+
+/*
+ * This function returns the number of file system metadata clusters at
+ * the beginning of a block group, including the reserved gdt blocks.
+ *
+ * The block count is built from the superblock copy (0 or 1) plus the
+ * group descriptor blocks, and is converted to clusters on return.
+ */
+static unsigned ext4_num_base_meta_clusters(struct super_block *sb,
+				     ext4_group_t block_group)
+{
+	struct ext4_sb_info *sbi = EXT4_SB(sb);
+	/* Check for superblock and gdt backups in this group */
+	unsigned nblocks = ext4_bg_has_super(sb, block_group);
+	int traditional;
+
+	traditional =
+		!EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_META_BG) ||
+		block_group < le32_to_cpu(sbi->s_es->s_first_meta_bg) *
+			      sbi->s_desc_per_block;
+
+	if (!traditional) {
+		/* For META_BG_BLOCK_GROUPS */
+		nblocks += ext4_bg_num_gdb(sb, block_group);
+	} else if (nblocks) {
+		/* descriptors and reserved gdt blocks follow a superblock */
+		nblocks += ext4_bg_num_gdb(sb, block_group);
+		nblocks += le16_to_cpu(sbi->s_es->s_reserved_gdt_blocks);
+	}
+
+	return EXT4_NUM_B2C(sbi, nblocks);
+}
+/**
+ * ext4_inode_to_goal_block - return a hint for block allocation
+ * @inode: inode for block allocation
+ *
+ * Return the ideal location to start allocating blocks for a
+ * newly created inode.
+ *
+ * The result is the first block of a chosen block group, plus (when the
+ * DELALLOC mount option is off) a "colour" offset derived from the
+ * calling task's pid.
+ */
+ext4_fsblk_t ext4_inode_to_goal_block(struct inode *inode)
+{
+ struct ext4_inode_info *ei = EXT4_I(inode);
+ ext4_group_t block_group;
+ ext4_grpblk_t colour;
+ int flex_size = ext4_flex_bg_size(EXT4_SB(inode->i_sb));
+ ext4_fsblk_t bg_start;
+ ext4_fsblk_t last_block;
+
+ block_group = ei->i_block_group;
+ if (flex_size >= EXT4_FLEX_SIZE_DIR_ALLOC_SCHEME) {
+ /*
+ * If there are at least EXT4_FLEX_SIZE_DIR_ALLOC_SCHEME
+ * block groups per flexgroup, reserve the first block
+ * group for directories and special files. Regular
+ * files will start at the second block group. This
+ * tends to speed up directory access and improves
+ * fsck times.
+ */
+ /* round down to the first group of the flex group */
+ block_group &= ~(flex_size-1);
+ if (S_ISREG(inode->i_mode))
+ block_group++;
+ }
+ bg_start = ext4_group_first_block_no(inode->i_sb, block_group);
+ last_block = ext4_blocks_count(EXT4_SB(inode->i_sb)->s_es) - 1;
+
+ /*
+ * If we are doing delayed allocation, we don't need take
+ * colour into account.
+ */
+ if (test_opt(inode->i_sb, DELALLOC))
+ return bg_start;
+
+ /*
+ * Pick one of 16 evenly spaced offsets based on the pid. A group
+ * that runs past the last block of the filesystem is divided by
+ * its actual remaining length instead of the nominal group size.
+ */
+ if (bg_start + EXT4_BLOCKS_PER_GROUP(inode->i_sb) <= last_block)
+ colour = (current->pid % 16) *
+ (EXT4_BLOCKS_PER_GROUP(inode->i_sb) / 16);
+ else
+ colour = (current->pid % 16) * ((last_block - bg_start) / 16);
+ return bg_start + colour;
+}
+
diff --git a/fs/ext4/bitmap.c b/fs/ext4/bitmap.c
new file mode 100644
index 00000000..fa3af81a
--- /dev/null
+++ b/fs/ext4/bitmap.c
@@ -0,0 +1,31 @@
+/*
+ * linux/fs/ext4/bitmap.c
+ *
+ * Copyright (C) 1992, 1993, 1994, 1995
+ * Remy Card (card@masi.ibp.fr)
+ * Laboratoire MASI - Institut Blaise Pascal
+ * Universite Pierre et Marie Curie (Paris VI)
+ */
+
+#include <linux/buffer_head.h>
+#include <linux/jbd2.h>
+#include "ext4.h"
+
+#ifdef EXT4FS_DEBUG
+
+/* nibblemap[v] is the number of zero bits in the 4-bit value v */
+static const int nibblemap[] = {4, 3, 3, 2, 3, 2, 2, 1, 3, 2, 2, 1, 2, 1, 1, 0};
+
+/*
+ * Count the zero bits in the first @numchars bytes of @map's data.
+ * A NULL @map counts as zero.
+ */
+unsigned int ext4_count_free(struct buffer_head *map, unsigned int numchars)
+{
+	unsigned int zero_bits = 0;
+	unsigned int pos;
+
+	if (map == NULL)
+		return 0;
+
+	for (pos = 0; pos < numchars; pos++) {
+		char byte = map->b_data[pos];
+
+		zero_bits += nibblemap[byte & 0xf];
+		zero_bits += nibblemap[(byte >> 4) & 0xf];
+	}
+	return zero_bits;
+}
+
+#endif /* EXT4FS_DEBUG */
+
diff --git a/fs/ext4/block_validity.c b/fs/ext4/block_validity.c
new file mode 100644
index 00000000..3f11656b
--- /dev/null
+++ b/fs/ext4/block_validity.c
@@ -0,0 +1,268 @@
+/*
+ * linux/fs/ext4/block_validity.c
+ *
+ * Copyright (C) 2009
+ * Theodore Ts'o (tytso@mit.edu)
+ *
+ * Track which blocks in the filesystem are metadata blocks that
+ * should never be used as data blocks by files or directories.
+ */
+
+#include <linux/time.h>
+#include <linux/fs.h>
+#include <linux/namei.h>
+#include <linux/quotaops.h>
+#include <linux/buffer_head.h>
+#include <linux/swap.h>
+#include <linux/pagemap.h>
+#include <linux/blkdev.h>
+#include <linux/mutex.h>
+#include <linux/slab.h>
+#include "ext4.h"
+
+/*
+ * One contiguous range of metadata blocks, kept in the per-filesystem
+ * rb-tree rooted at sbi->system_blks.
+ */
+struct ext4_system_zone {
+ /* linkage into the sbi->system_blks tree */
+ struct rb_node node;
+ /* first block of the range */
+ ext4_fsblk_t start_blk;
+ /* number of blocks in the range */
+ unsigned int count;
+};
+
+/* slab cache from which every struct ext4_system_zone is allocated */
+static struct kmem_cache *ext4_system_zone_cachep;
+
+/*
+ * Create the system-zone slab cache.
+ * Returns 0 on success, -ENOMEM if the cache could not be created.
+ */
+int __init ext4_init_system_zone(void)
+{
+	ext4_system_zone_cachep = KMEM_CACHE(ext4_system_zone, 0);
+
+	return ext4_system_zone_cachep ? 0 : -ENOMEM;
+}
+
+/* Destroy the slab cache created by ext4_init_system_zone(). */
+void ext4_exit_system_zone(void)
+{
+ kmem_cache_destroy(ext4_system_zone_cachep);
+}
+
+/*
+ * Return 1 when @entry2 begins exactly at the block following the end
+ * of @entry1, i.e. the two zones are adjacent in that order; else 0.
+ */
+static inline int can_merge(struct ext4_system_zone *entry1,
+		     struct ext4_system_zone *entry2)
+{
+	ext4_fsblk_t end_of_first = entry1->start_blk + entry1->count;
+
+	return end_of_first == entry2->start_blk;
+}
+
+/*
+ * Mark a range of blocks as belonging to the "system zone" --- that
+ * is, filesystem metadata blocks which should never be used by
+ * inodes.
+ *
+ * The range is either folded into an existing zone that contains
+ * start_blk, or inserted as a new tree node; afterwards the resulting
+ * zone is merged with directly adjacent neighbours.
+ *
+ * Returns 0 on success or -ENOMEM if a new node cannot be allocated.
+ */
+static int add_system_zone(struct ext4_sb_info *sbi,
+ ext4_fsblk_t start_blk,
+ unsigned int count)
+{
+ struct ext4_system_zone *new_entry = NULL, *entry;
+ struct rb_node **n = &sbi->system_blks.rb_node, *node;
+ struct rb_node *parent = NULL, *new_node = NULL;
+
+ /* Descend to the insertion point, or stop at a zone holding start_blk */
+ while (*n) {
+ parent = *n;
+ entry = rb_entry(parent, struct ext4_system_zone, node);
+ if (start_blk < entry->start_blk)
+ n = &(*n)->rb_left;
+ else if (start_blk >= (entry->start_blk + entry->count))
+ n = &(*n)->rb_right;
+ else {
+ /*
+ * start_blk lies inside this zone: grow the zone if the
+ * new range extends past its end, and reuse the node.
+ */
+ if (start_blk + count > (entry->start_blk +
+ entry->count))
+ entry->count = (start_blk + count -
+ entry->start_blk);
+ new_node = *n;
+ new_entry = rb_entry(new_node, struct ext4_system_zone,
+ node);
+ break;
+ }
+ }
+
+ /* No existing zone covered start_blk: allocate and link a new node */
+ if (!new_entry) {
+ new_entry = kmem_cache_alloc(ext4_system_zone_cachep,
+ GFP_KERNEL);
+ if (!new_entry)
+ return -ENOMEM;
+ new_entry->start_blk = start_blk;
+ new_entry->count = count;
+ new_node = &new_entry->node;
+
+ rb_link_node(new_node, parent, n);
+ rb_insert_color(new_node, &sbi->system_blks);
+ }
+
+ /* Can we merge to the left? */
+ node = rb_prev(new_node);
+ if (node) {
+ entry = rb_entry(node, struct ext4_system_zone, node);
+ if (can_merge(entry, new_entry)) {
+ /* absorb the predecessor, then drop its node */
+ new_entry->start_blk = entry->start_blk;
+ new_entry->count += entry->count;
+ rb_erase(node, &sbi->system_blks);
+ kmem_cache_free(ext4_system_zone_cachep, entry);
+ }
+ }
+
+ /* Can we merge to the right? */
+ node = rb_next(new_node);
+ if (node) {
+ entry = rb_entry(node, struct ext4_system_zone, node);
+ if (can_merge(new_entry, entry)) {
+ /* absorb the successor, then drop its node */
+ new_entry->count += entry->count;
+ rb_erase(node, &sbi->system_blks);
+ kmem_cache_free(ext4_system_zone_cachep, entry);
+ }
+ }
+ return 0;
+}
+
+/*
+ * Print every system zone, in tree order, as one "System zones: a-b, c-d"
+ * log line.  The pieces after the KERN_INFO prefix are emitted with
+ * KERN_CONT so that they are treated as continuations of that line
+ * rather than as separate messages.
+ */
+static void debug_print_tree(struct ext4_sb_info *sbi)
+{
+	struct rb_node *node;
+	struct ext4_system_zone *entry;
+	int first = 1;
+
+	printk(KERN_INFO "System zones: ");
+	node = rb_first(&sbi->system_blks);
+	while (node) {
+		entry = rb_entry(node, struct ext4_system_zone, node);
+		printk(KERN_CONT "%s%llu-%llu", first ? "" : ", ",
+		       entry->start_blk, entry->start_blk + entry->count - 1);
+		first = 0;
+		node = rb_next(node);
+	}
+	printk(KERN_CONT "\n");
+}
+
+/*
+ * Build the system-zone tree for @sb.
+ *
+ * When the BLOCK_VALIDITY mount option is off, any existing tree is
+ * released and 0 is returned.  When it is on and a tree already exists,
+ * nothing is done.  Otherwise, for every block group the block bitmap,
+ * inode bitmap and inode table are added, plus the superblock and group
+ * descriptor blocks for the groups selected below.
+ *
+ * Returns 0 on success or the error from add_system_zone().
+ */
+int ext4_setup_system_zone(struct super_block *sb)
+{
+	ext4_group_t ngroups = ext4_get_groups_count(sb);
+	struct ext4_sb_info *sbi = EXT4_SB(sb);
+	struct ext4_group_desc *gdp;
+	ext4_group_t i;
+	int flex_size = ext4_flex_bg_size(sbi);
+	int ret;
+
+	if (!test_opt(sb, BLOCK_VALIDITY)) {
+		if (EXT4_SB(sb)->system_blks.rb_node)
+			ext4_release_system_zone(sb);
+		return 0;
+	}
+	if (EXT4_SB(sb)->system_blks.rb_node)
+		return 0;
+
+	for (i=0; i < ngroups; i++) {
+		if (ext4_bg_has_super(sb, i) &&
+		    ((i < 5) || ((i % flex_size) == 0))) {
+			/*
+			 * Superblock copy plus its group descriptor blocks.
+			 * An allocation failure here is reported like the
+			 * ones below instead of being silently dropped.
+			 */
+			ret = add_system_zone(sbi,
+					ext4_group_first_block_no(sb, i),
+					ext4_bg_num_gdb(sb, i) + 1);
+			if (ret)
+				return ret;
+		}
+		gdp = ext4_get_group_desc(sb, i, NULL);
+		ret = add_system_zone(sbi, ext4_block_bitmap(sb, gdp), 1);
+		if (ret)
+			return ret;
+		ret = add_system_zone(sbi, ext4_inode_bitmap(sb, gdp), 1);
+		if (ret)
+			return ret;
+		ret = add_system_zone(sbi, ext4_inode_table(sb, gdp),
+				sbi->s_itb_per_group);
+		if (ret)
+			return ret;
+	}
+
+	if (test_opt(sb, DEBUG))
+		debug_print_tree(EXT4_SB(sb));
+	return 0;
+}
+
+/*
+ * Called when the filesystem is unmounted
+ *
+ * Frees every node of the system-zone tree without recursion: walk down
+ * to a leaf, free it, unlink it from its parent, and continue from the
+ * parent. The root is reset to RB_ROOT when done.
+ */
+void ext4_release_system_zone(struct super_block *sb)
+{
+ struct rb_node *n = EXT4_SB(sb)->system_blks.rb_node;
+ struct rb_node *parent;
+ struct ext4_system_zone *entry;
+
+ while (n) {
+ /* Do the node's children first */
+ if (n->rb_left) {
+ n = n->rb_left;
+ continue;
+ }
+ if (n->rb_right) {
+ n = n->rb_right;
+ continue;
+ }
+ /*
+ * The node has no children; free it, and then zero
+ * out parent's link to it. Finally go to the
+ * beginning of the loop and try to free the parent
+ * node.
+ */
+ /* read the parent before the node's memory is freed */
+ parent = rb_parent(n);
+ entry = rb_entry(n, struct ext4_system_zone, node);
+ kmem_cache_free(ext4_system_zone_cachep, entry);
+ /* n is only compared by address below, never dereferenced */
+ if (!parent)
+ EXT4_SB(sb)->system_blks = RB_ROOT;
+ else if (parent->rb_left == n)
+ parent->rb_left = NULL;
+ else if (parent->rb_right == n)
+ parent->rb_right = NULL;
+ n = parent;
+ }
+ EXT4_SB(sb)->system_blks = RB_ROOT;
+}
+
+/*
+ * Returns 1 if the passed-in block region (start_blk,
+ * start_blk+count) is valid; 0 if some part of the block region
+ * overlaps with filesystem metadata blocks.
+ *
+ * A region is also rejected when it starts at or before
+ * s_first_data_block, when start_blk + count wraps around, or when it
+ * extends past the filesystem's block count. On every rejection
+ * start_blk is recorded in the superblock's s_last_error_block.
+ */
+int ext4_data_block_valid(struct ext4_sb_info *sbi, ext4_fsblk_t start_blk,
+ unsigned int count)
+{
+ struct ext4_system_zone *entry;
+ struct rb_node *n = sbi->system_blks.rb_node;
+
+ /* basic range checks, independent of the system-zone tree */
+ if ((start_blk <= le32_to_cpu(sbi->s_es->s_first_data_block)) ||
+ (start_blk + count < start_blk) ||
+ (start_blk + count > ext4_blocks_count(sbi->s_es))) {
+ sbi->s_es->s_last_error_block = cpu_to_le64(start_blk);
+ return 0;
+ }
+ /* search the tree for any zone that intersects the region */
+ while (n) {
+ entry = rb_entry(n, struct ext4_system_zone, node);
+ if (start_blk + count - 1 < entry->start_blk)
+ n = n->rb_left;
+ else if (start_blk >= (entry->start_blk + entry->count))
+ n = n->rb_right;
+ else {
+ /* neither entirely before nor entirely after: overlap */
+ sbi->s_es->s_last_error_block = cpu_to_le64(start_blk);
+ return 0;
+ }
+ }
+ return 1;
+}
+
+/*
+ * Validate an array of @max little-endian 32-bit block references
+ * starting at @p.  Zero entries are skipped.  The first non-zero entry
+ * that ext4_data_block_valid() rejects is recorded in
+ * s_last_error_block, reported through ext4_error_inode() using the
+ * caller's @function and @line, and makes the function return -EIO.
+ * Returns 0 when every entry passes.
+ */
+int ext4_check_blockref(const char *function, unsigned int line,
+			struct inode *inode, __le32 *p, unsigned int max)
+{
+	struct ext4_super_block *es = EXT4_SB(inode->i_sb)->s_es;
+	__le32 *end = p + max;
+	__le32 *cur;
+
+	for (cur = p; cur < end; cur++) {
+		unsigned int blk = le32_to_cpu(*cur);
+
+		if (!blk)
+			continue;
+		if (unlikely(!ext4_data_block_valid(EXT4_SB(inode->i_sb),
+						    blk, 1))) {
+			es->s_last_error_block = cpu_to_le64(blk);
+			ext4_error_inode(inode, function, line, blk,
+					 "invalid block");
+			return -EIO;
+		}
+	}
+	return 0;
+}
+
diff --git a/fs/ext4/dir.c b/fs/ext4/dir.c
new file mode 100644
index 00000000..b8678620
--- /dev/null
+++ b/fs/ext4/dir.c
@@ -0,0 +1,667 @@
+/*
+ * linux/fs/ext4/dir.c
+ *
+ * Copyright (C) 1992, 1993, 1994, 1995
+ * Remy Card (card@masi.ibp.fr)
+ * Laboratoire MASI - Institut Blaise Pascal
+ * Universite Pierre et Marie Curie (Paris VI)
+ *
+ * from
+ *
+ * linux/fs/minix/dir.c
+ *
+ * Copyright (C) 1991, 1992 Linus Torvalds
+ *
+ * ext4 directory handling functions
+ *
+ * Big-endian to little-endian byte-swapping/bitmaps by
+ * David S. Miller (davem@caip.rutgers.edu), 1995
+ *
+ * Hash Tree Directory indexing (c) 2001 Daniel Phillips
+ *
+ */
+
+#include <linux/fs.h>
+#include <linux/jbd2.h>
+#include <linux/buffer_head.h>
+#include <linux/slab.h>
+#include <linux/rbtree.h>
+#include "ext4.h"
+
+/*
+ * Maps an on-disk directory entry file type, used as the index by
+ * get_dtype(), to the DT_* value handed to filldir.
+ */
+static unsigned char ext4_filetype_table[] = {
+ DT_UNKNOWN, DT_REG, DT_DIR, DT_CHR, DT_BLK, DT_FIFO, DT_SOCK, DT_LNK
+};
+
+static int ext4_dx_readdir(struct file *filp,
+ void *dirent, filldir_t filldir);
+
+/*
+ * Translate an on-disk file type into a DT_* value.  DT_UNKNOWN is
+ * returned when the filesystem lacks the FILETYPE feature or when
+ * @filetype is EXT4_FT_MAX or larger.
+ */
+static unsigned char get_dtype(struct super_block *sb, int filetype)
+{
+	if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_FILETYPE) &&
+	    filetype < EXT4_FT_MAX)
+		return ext4_filetype_table[filetype];
+
+	return DT_UNKNOWN;
+}
+
+/**
+ * Check if the given dir-inode refers to an htree-indexed directory
+ * (or a directory which could potentially get converted to use htree
+ * indexing).
+ *
+ * With the DIR_INDEX feature enabled, a directory qualifies if it has
+ * EXT4_INODE_INDEX set or if its size is exactly one block.
+ *
+ * Return 1 if it is a dx dir, 0 if not
+ */
+static int is_dx_dir(struct inode *inode)
+{
+	struct super_block *sb = inode->i_sb;
+
+	if (!EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_DIR_INDEX))
+		return 0;
+
+	if (ext4_test_inode_flag(inode, EXT4_INODE_INDEX))
+		return 1;
+
+	if ((inode->i_size >> sb->s_blocksize_bits) == 1)
+		return 1;
+
+	return 0;
+}
+
+/*
+ * Return 0 if the directory entry is OK, and 1 if there is a problem
+ *
+ * Note: this is the opposite of what ext2 and ext3 historically returned...
+ *
+ * @function, @line: caller location, forwarded to the error report
+ * @dir: directory inode the entry belongs to
+ * @filp: open file, or NULL; selects ext4_error_file() versus
+ * ext4_error_inode() for the report
+ * @de: the entry to check
+ * @bh: buffer containing @de
+ * @offset: offset of the entry, used only in the error message
+ *
+ * The checks, in order: rec_len at least the minimal record size,
+ * rec_len a multiple of 4, rec_len large enough for name_len, the
+ * entry not running past the end of the block, and the inode number
+ * not exceeding s_inodes_count.
+ */
+int __ext4_check_dir_entry(const char *function, unsigned int line,
+ struct inode *dir, struct file *filp,
+ struct ext4_dir_entry_2 *de,
+ struct buffer_head *bh,
+ unsigned int offset)
+{
+ const char *error_msg = NULL;
+ const int rlen = ext4_rec_len_from_disk(de->rec_len,
+ dir->i_sb->s_blocksize);
+
+ if (unlikely(rlen < EXT4_DIR_REC_LEN(1)))
+ error_msg = "rec_len is smaller than minimal";
+ else if (unlikely(rlen % 4 != 0))
+ error_msg = "rec_len % 4 != 0";
+ else if (unlikely(rlen < EXT4_DIR_REC_LEN(de->name_len)))
+ error_msg = "rec_len is too small for name_len";
+ else if (unlikely(((char *) de - bh->b_data) + rlen >
+ dir->i_sb->s_blocksize))
+ error_msg = "directory entry across blocks";
+ else if (unlikely(le32_to_cpu(de->inode) >
+ le32_to_cpu(EXT4_SB(dir->i_sb)->s_es->s_inodes_count)))
+ error_msg = "inode out of bounds";
+ else
+ return 0;
+
+ /* Some check failed: report it through whichever handle we have */
+ if (filp)
+ ext4_error_file(filp, function, line, bh->b_blocknr,
+ "bad entry in directory: %s - offset=%u(%u), "
+ "inode=%u, rec_len=%d, name_len=%d",
+ error_msg, (unsigned) (offset % bh->b_size),
+ offset, le32_to_cpu(de->inode),
+ rlen, de->name_len);
+ else
+ ext4_error_inode(dir, function, line, bh->b_blocknr,
+ "bad entry in directory: %s - offset=%u(%u), "
+ "inode=%u, rec_len=%d, name_len=%d",
+ error_msg, (unsigned) (offset % bh->b_size),
+ offset, le32_to_cpu(de->inode),
+ rlen, de->name_len);
+
+ return 1;
+}
+
+/*
+ * readdir handler for ext4 directories.
+ *
+ * A directory accepted by is_dx_dir() is first handed to
+ * ext4_dx_readdir(); only if that reports ERR_BAD_DX_DIR is the
+ * EXT4_INODE_INDEX flag cleared and the linear scan below used instead.
+ * The linear scan reads the directory one block at a time starting at
+ * filp->f_pos and passes every entry with a non-zero inode number to
+ * @filldir, stopping after the first block that yielded an entry.
+ *
+ * Returns the result of ext4_dx_readdir() on the htree path, the number
+ * of entries stored so far when a corrupt entry stops the scan, and 0
+ * otherwise.
+ */
+static int ext4_readdir(struct file *filp,
+ void *dirent, filldir_t filldir)
+{
+ int error = 0;
+ unsigned int offset;
+ int i, stored;
+ struct ext4_dir_entry_2 *de;
+ int err;
+ struct inode *inode = filp->f_path.dentry->d_inode;
+ struct super_block *sb = inode->i_sb;
+ int ret = 0;
+ int dir_has_error = 0;
+
+ if (is_dx_dir(inode)) {
+ err = ext4_dx_readdir(filp, dirent, filldir);
+ if (err != ERR_BAD_DX_DIR) {
+ ret = err;
+ goto out;
+ }
+ /*
+ * We don't set the inode dirty flag since it's not
+ * critical that it get flushed back to the disk.
+ */
+ ext4_clear_inode_flag(filp->f_path.dentry->d_inode,
+ EXT4_INODE_INDEX);
+ }
+ stored = 0;
+ /* byte offset of f_pos within its directory block */
+ offset = filp->f_pos & (sb->s_blocksize - 1);
+
+ while (!error && !stored && filp->f_pos < inode->i_size) {
+ struct ext4_map_blocks map;
+ struct buffer_head *bh = NULL;
+
+ map.m_lblk = filp->f_pos >> EXT4_BLOCK_SIZE_BITS(sb);
+ map.m_len = 1;
+ err = ext4_map_blocks(NULL, inode, &map, 0);
+ if (err > 0) {
+ /* block is mapped: start readahead if needed, then read it */
+ pgoff_t index = map.m_pblk >>
+ (PAGE_CACHE_SHIFT - inode->i_blkbits);
+ if (!ra_has_index(&filp->f_ra, index))
+ page_cache_sync_readahead(
+ sb->s_bdev->bd_inode->i_mapping,
+ &filp->f_ra, filp,
+ index, 1);
+ filp->f_ra.prev_pos = (loff_t)index << PAGE_CACHE_SHIFT;
+ bh = ext4_bread(NULL, inode, map.m_lblk, 0, &err);
+ }
+
+ /*
+ * We ignore I/O errors on directories so users have a chance
+ * of recovering data when there's a bad sector
+ */
+ if (!bh) {
+ /* report the problem only once per call */
+ if (!dir_has_error) {
+ EXT4_ERROR_FILE(filp, 0,
+ "directory contains a "
+ "hole at offset %llu",
+ (unsigned long long) filp->f_pos);
+ dir_has_error = 1;
+ }
+ /* corrupt size? Maybe no more blocks to read */
+ if (filp->f_pos > inode->i_blocks << 9)
+ break;
+ /* skip ahead to the start of the next block */
+ filp->f_pos += sb->s_blocksize - offset;
+ continue;
+ }
+
+revalidate:
+ /* If the dir block has changed since the last call to
+ * readdir(2), then we might be pointing to an invalid
+ * dirent right now. Scan from the start of the block
+ * to make sure. */
+ if (filp->f_version != inode->i_version) {
+ for (i = 0; i < sb->s_blocksize && i < offset; ) {
+ de = (struct ext4_dir_entry_2 *)
+ (bh->b_data + i);
+ /* It's too expensive to do a full
+ * dirent test each time round this
+ * loop, but we do have to test at
+ * least that it is non-zero. A
+ * failure will be detected in the
+ * dirent test below. */
+ if (ext4_rec_len_from_disk(de->rec_len,
+ sb->s_blocksize) < EXT4_DIR_REC_LEN(1))
+ break;
+ i += ext4_rec_len_from_disk(de->rec_len,
+ sb->s_blocksize);
+ }
+ /* resume at the entry boundary the scan stopped on */
+ offset = i;
+ filp->f_pos = (filp->f_pos & ~(sb->s_blocksize - 1))
+ | offset;
+ filp->f_version = inode->i_version;
+ }
+
+ while (!error && filp->f_pos < inode->i_size
+ && offset < sb->s_blocksize) {
+ de = (struct ext4_dir_entry_2 *) (bh->b_data + offset);
+ if (ext4_check_dir_entry(inode, filp, de,
+ bh, offset)) {
+ /*
+ * On error, skip the f_pos to the next block
+ */
+ filp->f_pos = (filp->f_pos |
+ (sb->s_blocksize - 1)) + 1;
+ brelse(bh);
+ ret = stored;
+ goto out;
+ }
+ offset += ext4_rec_len_from_disk(de->rec_len,
+ sb->s_blocksize);
+ /* entries with inode number 0 are skipped */
+ if (le32_to_cpu(de->inode)) {
+ /* We might block in the next section
+ * if the data destination is
+ * currently swapped out. So, use a
+ * version stamp to detect whether or
+ * not the directory has been modified
+ * during the copy operation.
+ */
+ u64 version = filp->f_version;
+
+ error = filldir(dirent, de->name,
+ de->name_len,
+ filp->f_pos,
+ le32_to_cpu(de->inode),
+ get_dtype(sb, de->file_type));
+ if (error)
+ break;
+ if (version != filp->f_version)
+ goto revalidate;
+ stored++;
+ }
+ filp->f_pos += ext4_rec_len_from_disk(de->rec_len,
+ sb->s_blocksize);
+ }
+ /* the next block, if any, is scanned from its beginning */
+ offset = 0;
+ brelse(bh);
+ }
+out:
+ return ret;
+}
+
+/*
+ * Report whether the caller should be treated as using a 32-bit API:
+ * the result of is_compat_task() when CONFIG_COMPAT is set, otherwise
+ * whether BITS_PER_LONG is 32.
+ */
+static inline int is_32bit_api(void)
+{
+#ifdef CONFIG_COMPAT
+ return is_compat_task();
+#else
+ return (BITS_PER_LONG == 32);
+#endif
+}
+
+/*
+ * These functions convert from the major/minor hash to an f_pos
+ * value for dx directories
+ *
+ * Upper layer (for example NFS) should specify FMODE_32BITHASH or
+ * FMODE_64BITHASH explicitly. On the other hand, we allow ext4 to be mounted
+ * directly on both 32-bit and 64-bit nodes, under such case, neither
+ * FMODE_32BITHASH nor FMODE_64BITHASH is specified.
+ *
+ * hash2pos() builds the position: major >> 1 alone for 32-bit users,
+ * or major >> 1 in the upper 32 bits with minor in the lower 32 bits.
+ */
+static inline loff_t hash2pos(struct file *filp, __u32 major, __u32 minor)
+{
+	/* 64-bit form unless 32-bit hashes are requested or implied */
+	if (!(filp->f_mode & FMODE_32BITHASH) &&
+	    ((filp->f_mode & FMODE_64BITHASH) || !is_32bit_api()))
+		return ((__u64)(major >> 1) << 32) | (__u64)minor;
+
+	return major >> 1;
+}
+
+/*
+ * Recover the major hash from an f_pos value: the position itself
+ * shifted left by one for 32-bit users, otherwise the upper 32 bits of
+ * the position shifted left by one.
+ */
+static inline __u32 pos2maj_hash(struct file *filp, loff_t pos)
+{
+	if (!(filp->f_mode & FMODE_32BITHASH) &&
+	    ((filp->f_mode & FMODE_64BITHASH) || !is_32bit_api()))
+		return ((pos >> 32) << 1) & 0xffffffff;
+
+	return (pos << 1) & 0xffffffff;
+}
+
+/*
+ * Recover the minor hash from an f_pos value: always 0 for 32-bit
+ * users, otherwise the lower 32 bits of the position.
+ */
+static inline __u32 pos2min_hash(struct file *filp, loff_t pos)
+{
+	if (!(filp->f_mode & FMODE_32BITHASH) &&
+	    ((filp->f_mode & FMODE_64BITHASH) || !is_32bit_api()))
+		return pos & 0xffffffff;
+
+	return 0;
+}
+
+/*
+ * Return 32- or 64-bit end-of-file for dx directories, chosen with the
+ * same 32-bit/64-bit rule as the hash/position helpers.
+ */
+static inline loff_t ext4_get_htree_eof(struct file *filp)
+{
+	if (!(filp->f_mode & FMODE_32BITHASH) &&
+	    ((filp->f_mode & FMODE_64BITHASH) || !is_32bit_api()))
+		return EXT4_HTREE_EOF_64BIT;
+
+	return EXT4_HTREE_EOF_32BIT;
+}
+
+
+/*
+ * ext4_dir_llseek() based on generic_file_llseek() to handle both
+ * non-htree and htree directories, where the "offset" is in terms
+ * of the filename hash value instead of the byte offset.
+ *
+ * NOTE: offsets obtained *before* ext4_set_inode_flag(dir, EXT4_INODE_INDEX)
+ * will be invalid once the directory was converted into a dx directory
+ *
+ * Returns the resulting position, or -EINVAL when the request is
+ * rejected. The whole operation runs under inode->i_mutex.
+ */
+loff_t ext4_dir_llseek(struct file *file, loff_t offset, int origin)
+{
+ struct inode *inode = file->f_mapping->host;
+ loff_t ret = -EINVAL;
+ int dx_dir = is_dx_dir(inode);
+
+ mutex_lock(&inode->i_mutex);
+
+ /* NOTE: relative offsets with dx directories might not work
+ * as expected, as it is difficult to figure out the
+ * correct offset between dx hashes */
+
+ /*
+ * Only SEEK_END and SEEK_CUR adjust the offset; for any other
+ * origin the offset is used exactly as passed in.
+ */
+ switch (origin) {
+ case SEEK_END:
+ if (unlikely(offset > 0))
+ goto out_err; /* not supported for directories */
+
+ /* so only negative offsets are left, does that have a
+ * meaning for directories at all? */
+ if (dx_dir)
+ offset += ext4_get_htree_eof(file);
+ else
+ offset += inode->i_size;
+ break;
+ case SEEK_CUR:
+ /*
+ * Here we special-case the lseek(fd, 0, SEEK_CUR)
+ * position-querying operation. Avoid rewriting the "same"
+ * f_pos value back to the file because a concurrent read(),
+ * write() or lseek() might have altered it
+ */
+ if (offset == 0) {
+ offset = file->f_pos;
+ goto out_ok;
+ }
+
+ offset += file->f_pos;
+ break;
+ }
+
+ if (unlikely(offset < 0))
+ goto out_err;
+
+ /* upper bound: s_maxbytes for linear dirs, the htree EOF for dx dirs */
+ if (!dx_dir) {
+ if (offset > inode->i_sb->s_maxbytes)
+ goto out_err;
+ } else if (offset > ext4_get_htree_eof(file))
+ goto out_err;
+
+ /* Special lock needed here? */
+ if (offset != file->f_pos) {
+ file->f_pos = offset;
+ /* reset the version whenever the position actually moves */
+ file->f_version = 0;
+ }
+
+out_ok:
+ ret = offset;
+out_err:
+ mutex_unlock(&inode->i_mutex);
+
+ return ret;
+}
+
+/*
+ * This structure holds the nodes of the red-black tree used to store
+ * the directory entry in hash order.
+ */
+struct fname {
+ /* major and minor hash; together they form the tree ordering key */
+ __u32 hash;
+ __u32 minor_hash;
+ struct rb_node rb_hash;
+ /* chain of further entries sharing the same hash and minor hash */
+ struct fname *next;
+ /* inode number, already converted to CPU byte order */
+ __u32 inode;
+ __u8 name_len;
+ __u8 file_type;
+ /* name_len bytes of name followed by a terminating NUL */
+ char name[0];
+};
+
+/*
+ * This function implements a non-recursive way of freeing all of the
+ * nodes in the red-black tree.
+ *
+ * Each tree node's whole collision chain (linked through ->next) is
+ * freed together with the node itself.
+ */
+static void free_rb_tree_fname(struct rb_root *root)
+{
+ struct rb_node *n = root->rb_node;
+ struct rb_node *parent;
+ struct fname *fname;
+
+ while (n) {
+ /* Do the node's children first */
+ if (n->rb_left) {
+ n = n->rb_left;
+ continue;
+ }
+ if (n->rb_right) {
+ n = n->rb_right;
+ continue;
+ }
+ /*
+ * The node has no children; free it, and then zero
+ * out parent's link to it. Finally go to the
+ * beginning of the loop and try to free the parent
+ * node.
+ */
+ /* read the parent before the node's memory is freed */
+ parent = rb_parent(n);
+ fname = rb_entry(n, struct fname, rb_hash);
+ /* free the tree node and every entry chained behind it */
+ while (fname) {
+ struct fname *old = fname;
+ fname = fname->next;
+ kfree(old);
+ }
+ if (!parent)
+ *root = RB_ROOT;
+ else if (parent->rb_left == n)
+ parent->rb_left = NULL;
+ else if (parent->rb_right == n)
+ parent->rb_right = NULL;
+ n = parent;
+ }
+}
+
+
+/*
+ * Allocate a zeroed dir_private_info and seed its current major and
+ * minor hash from @pos.  Returns NULL if the allocation fails.
+ */
+static struct dir_private_info *ext4_htree_create_dir_info(struct file *filp,
+							   loff_t pos)
+{
+	struct dir_private_info *info;
+
+	info = kzalloc(sizeof(struct dir_private_info), GFP_KERNEL);
+	if (info) {
+		info->curr_hash = pos2maj_hash(filp, pos);
+		info->curr_minor_hash = pos2min_hash(filp, pos);
+	}
+	return info;
+}
+
+/* Free every cached entry in @p's tree, then @p itself. */
+void ext4_htree_free_dir_info(struct dir_private_info *p)
+{
+ free_rb_tree_fname(&p->root);
+ kfree(p);
+}
+
+/*
+ * Given a directory entry, enter it into the fname rb tree.
+ *
+ * The tree lives in dir_file->private_data and is ordered by hash,
+ * then by minor hash. An entry whose hash and minor hash both match an
+ * existing node is chained behind that node instead of being inserted
+ * into the tree.
+ *
+ * Returns 0 on success or -ENOMEM if the copy cannot be allocated.
+ */
+int ext4_htree_store_dirent(struct file *dir_file, __u32 hash,
+ __u32 minor_hash,
+ struct ext4_dir_entry_2 *dirent)
+{
+ struct rb_node **p, *parent = NULL;
+ struct fname *fname, *new_fn;
+ struct dir_private_info *info;
+ int len;
+
+ info = dir_file->private_data;
+ p = &info->root.rb_node;
+
+ /* Create and allocate the fname structure */
+ /* room for the struct, the name bytes and a terminating NUL */
+ len = sizeof(struct fname) + dirent->name_len + 1;
+ new_fn = kzalloc(len, GFP_KERNEL);
+ if (!new_fn)
+ return -ENOMEM;
+ new_fn->hash = hash;
+ new_fn->minor_hash = minor_hash;
+ new_fn->inode = le32_to_cpu(dirent->inode);
+ new_fn->name_len = dirent->name_len;
+ new_fn->file_type = dirent->file_type;
+ memcpy(new_fn->name, dirent->name, dirent->name_len);
+ new_fn->name[dirent->name_len] = 0;
+
+ while (*p) {
+ parent = *p;
+ fname = rb_entry(parent, struct fname, rb_hash);
+
+ /*
+ * If the hash and minor hash match up, then we put
+ * them on a linked list. This rarely happens...
+ */
+ if ((new_fn->hash == fname->hash) &&
+ (new_fn->minor_hash == fname->minor_hash)) {
+ /* insert directly after the tree node, ahead of older chain entries */
+ new_fn->next = fname->next;
+ fname->next = new_fn;
+ return 0;
+ }
+
+ if (new_fn->hash < fname->hash)
+ p = &(*p)->rb_left;
+ else if (new_fn->hash > fname->hash)
+ p = &(*p)->rb_right;
+ else if (new_fn->minor_hash < fname->minor_hash)
+ p = &(*p)->rb_left;
+ else /* if (new_fn->minor_hash > fname->minor_hash) */
+ p = &(*p)->rb_right;
+ }
+
+ rb_link_node(&new_fn->rb_hash, parent, p);
+ rb_insert_color(&new_fn->rb_hash, &info->root);
+ return 0;
+}
+
+
+
+/*
+ * This is a helper function for ext4_dx_readdir. It calls filldir
+ * for all entries on the fname linked list. (Normally there is only
+ * one entry on the linked list, unless there are 62 bit hash collisions.)
+ *
+ * Returns 0 when the whole list was delivered (or when @fname is NULL,
+ * which is logged). If filldir fails, the failing entry is remembered
+ * in info->extra_fname, f_pos is set to the list's position, and the
+ * filldir error is returned.
+ */
+static int call_filldir(struct file *filp, void *dirent,
+ filldir_t filldir, struct fname *fname)
+{
+ struct dir_private_info *info = filp->private_data;
+ loff_t curr_pos;
+ struct inode *inode = filp->f_path.dentry->d_inode;
+ struct super_block *sb;
+ int error;
+
+ sb = inode->i_sb;
+
+ if (!fname) {
+ ext4_msg(sb, KERN_ERR, "%s:%d: inode #%lu: comm %s: "
+ "called with null fname?!?", __func__, __LINE__,
+ inode->i_ino, current->comm);
+ return 0;
+ }
+ /* every entry on the list shares this one position */
+ curr_pos = hash2pos(filp, fname->hash, fname->minor_hash);
+ while (fname) {
+ error = filldir(dirent, fname->name,
+ fname->name_len, curr_pos,
+ fname->inode,
+ get_dtype(sb, fname->file_type));
+ if (error) {
+ /* remember where to resume on the next call */
+ filp->f_pos = curr_pos;
+ info->extra_fname = fname;
+ return error;
+ }
+ fname = fname->next;
+ }
+ return 0;
+}
+
+/*
+ * readdir for htree directories.
+ *
+ * Entries are cached in an rb-tree hanging off filp->private_data,
+ * which is (re)filled by ext4_htree_fill_tree() starting at the current
+ * major/minor hash, and are delivered in tree order via call_filldir().
+ *
+ * Returns -ENOMEM if the private info cannot be allocated, a negative
+ * value from ext4_htree_fill_tree() if filling the tree fails, and 0
+ * otherwise (including when filldir stopped the iteration early).
+ */
+static int ext4_dx_readdir(struct file *filp,
+ void *dirent, filldir_t filldir)
+{
+ struct dir_private_info *info = filp->private_data;
+ struct inode *inode = filp->f_path.dentry->d_inode;
+ struct fname *fname;
+ int ret;
+
+ /* first call on this file: create the per-open state */
+ if (!info) {
+ info = ext4_htree_create_dir_info(filp, filp->f_pos);
+ if (!info)
+ return -ENOMEM;
+ filp->private_data = info;
+ }
+
+ if (filp->f_pos == ext4_get_htree_eof(filp))
+ return 0; /* EOF */
+
+ /* Some one has messed with f_pos; reset the world */
+ if (info->last_pos != filp->f_pos) {
+ free_rb_tree_fname(&info->root);
+ info->curr_node = NULL;
+ info->extra_fname = NULL;
+ info->curr_hash = pos2maj_hash(filp, filp->f_pos);
+ info->curr_minor_hash = pos2min_hash(filp, filp->f_pos);
+ }
+
+ /*
+ * If there are any leftover names on the hash collision
+ * chain, return them first.
+ */
+ if (info->extra_fname) {
+ if (call_filldir(filp, dirent, filldir, info->extra_fname))
+ goto finished;
+ info->extra_fname = NULL;
+ goto next_node;
+ } else if (!info->curr_node)
+ info->curr_node = rb_first(&info->root);
+
+ while (1) {
+ /*
+ * Fill the rbtree if we have no more entries,
+ * or the inode has changed since we last read in the
+ * cached entries.
+ */
+ if ((!info->curr_node) ||
+ (filp->f_version != inode->i_version)) {
+ info->curr_node = NULL;
+ free_rb_tree_fname(&info->root);
+ filp->f_version = inode->i_version;
+ ret = ext4_htree_fill_tree(filp, info->curr_hash,
+ info->curr_minor_hash,
+ &info->next_hash);
+ if (ret < 0)
+ return ret;
+ /* nothing was added to the tree: treat as end of directory */
+ if (ret == 0) {
+ filp->f_pos = ext4_get_htree_eof(filp);
+ break;
+ }
+ info->curr_node = rb_first(&info->root);
+ }
+
+ fname = rb_entry(info->curr_node, struct fname, rb_hash);
+ info->curr_hash = fname->hash;
+ info->curr_minor_hash = fname->minor_hash;
+ if (call_filldir(filp, dirent, filldir, fname))
+ break;
+ next_node:
+ info->curr_node = rb_next(info->curr_node);
+ if (info->curr_node) {
+ fname = rb_entry(info->curr_node, struct fname,
+ rb_hash);
+ info->curr_hash = fname->hash;
+ info->curr_minor_hash = fname->minor_hash;
+ } else {
+ /* cached tree exhausted: ~0 as next_hash marks the end */
+ if (info->next_hash == ~0) {
+ filp->f_pos = ext4_get_htree_eof(filp);
+ break;
+ }
+ info->curr_hash = info->next_hash;
+ info->curr_minor_hash = 0;
+ }
+ }
+finished:
+ /* remembered so the next call can detect an f_pos change */
+ info->last_pos = filp->f_pos;
+ return 0;
+}
+
+/*
+ * Release handler for directories: free the htree readdir state
+ * attached to the file, if there is any.  Always returns 0.
+ */
+static int ext4_release_dir(struct inode *inode, struct file *filp)
+{
+	struct dir_private_info *info = filp->private_data;
+
+	if (info)
+		ext4_htree_free_dir_info(info);
+
+	return 0;
+}
+
+/*
+ * File operations for ext4 directories. The compat ioctl entry is
+ * present only when CONFIG_COMPAT is set.
+ */
+const struct file_operations ext4_dir_operations = {
+ .llseek = ext4_dir_llseek,
+ .read = generic_read_dir,
+ .readdir = ext4_readdir,
+ .unlocked_ioctl = ext4_ioctl,
+#ifdef CONFIG_COMPAT
+ .compat_ioctl = ext4_compat_ioctl,
+#endif
+ .fsync = ext4_sync_file,
+ .release = ext4_release_dir,
+};
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
new file mode 100644
index 00000000..0e01e90a
--- /dev/null
+++ b/fs/ext4/ext4.h
@@ -0,0 +1,2372 @@
+/*
+ * ext4.h
+ *
+ * Copyright (C) 1992, 1993, 1994, 1995
+ * Remy Card (card@masi.ibp.fr)
+ * Laboratoire MASI - Institut Blaise Pascal
+ * Universite Pierre et Marie Curie (Paris VI)
+ *
+ * from
+ *
+ * linux/include/linux/minix_fs.h
+ *
+ * Copyright (C) 1991, 1992 Linus Torvalds
+ */
+
+#ifndef _EXT4_H
+#define _EXT4_H
+
+#include <linux/types.h>
+#include <linux/blkdev.h>
+#include <linux/magic.h>
+#include <linux/jbd2.h>
+#include <linux/quota.h>
+#include <linux/rwsem.h>
+#include <linux/rbtree.h>
+#include <linux/seqlock.h>
+#include <linux/mutex.h>
+#include <linux/timer.h>
+#include <linux/wait.h>
+#include <linux/blockgroup_lock.h>
+#include <linux/percpu_counter.h>
+#ifdef __KERNEL__
+#include <linux/compat.h>
+#endif
+
+/*
+ * The fourth extended filesystem constants/structures
+ */
+
+/*
+ * Define EXT4FS_DEBUG to produce debug messages
+ */
+#undef EXT4FS_DEBUG
+
+/*
+ * Debug code
+ */
+#ifdef EXT4FS_DEBUG
+#define ext4_debug(f, a...) \
+ do { \
+ printk(KERN_DEBUG "EXT4-fs DEBUG (%s, %d): %s:", \
+ __FILE__, __LINE__, __func__); \
+ printk(KERN_DEBUG f, ## a); \
+ } while (0)
+#else
+#define ext4_debug(fmt, ...) no_printk(fmt, ##__VA_ARGS__)
+#endif
+
+#define EXT4_ERROR_INODE(inode, fmt, a...) \
+ ext4_error_inode((inode), __func__, __LINE__, 0, (fmt), ## a)
+
+#define EXT4_ERROR_INODE_BLOCK(inode, block, fmt, a...) \
+ ext4_error_inode((inode), __func__, __LINE__, (block), (fmt), ## a)
+
+#define EXT4_ERROR_FILE(file, block, fmt, a...) \
+ ext4_error_file((file), __func__, __LINE__, (block), (fmt), ## a)
+
+/* data type for block offset of block group */
+typedef int ext4_grpblk_t;
+
+/* data type for filesystem-wide blocks number */
+typedef unsigned long long ext4_fsblk_t;
+
+/* data type for file logical block number */
+typedef __u32 ext4_lblk_t;
+
+/* data type for block group number */
+typedef unsigned int ext4_group_t;
+
+/*
+ * Flags used in mballoc's allocation_context flags field.
+ *
+ * Also used to show what's going on for debugging purposes when the
+ * flag field is exported via the traceport interface
+ */
+
+/* prefer goal again. length */
+#define EXT4_MB_HINT_MERGE 0x0001
+/* blocks already reserved */
+#define EXT4_MB_HINT_RESERVED 0x0002
+/* metadata is being allocated */
+#define EXT4_MB_HINT_METADATA 0x0004
+/* first blocks in the file */
+#define EXT4_MB_HINT_FIRST 0x0008
+/* search for the best chunk */
+#define EXT4_MB_HINT_BEST 0x0010
+/* data is being allocated */
+#define EXT4_MB_HINT_DATA 0x0020
+/* don't preallocate (for tails) */
+#define EXT4_MB_HINT_NOPREALLOC 0x0040
+/* allocate for locality group */
+#define EXT4_MB_HINT_GROUP_ALLOC 0x0080
+/* allocate goal blocks or none */
+#define EXT4_MB_HINT_GOAL_ONLY 0x0100
+/* goal is meaningful */
+#define EXT4_MB_HINT_TRY_GOAL 0x0200
+/* blocks already pre-reserved by delayed allocation */
+#define EXT4_MB_DELALLOC_RESERVED 0x0400
+/* We are doing stream allocation */
+#define EXT4_MB_STREAM_ALLOC 0x0800
+/* Use reserved root blocks if needed */
+#define EXT4_MB_USE_ROOT_BLOCKS 0x1000
+
+struct ext4_allocation_request {
+ /* target inode for block we're allocating */
+ struct inode *inode;
+ /* how many blocks we want to allocate */
+ unsigned int len;
+ /* logical block in target inode */
+ ext4_lblk_t logical;
+ /* the closest logical allocated block to the left */
+ ext4_lblk_t lleft;
+ /* the closest logical allocated block to the right */
+ ext4_lblk_t lright;
+ /* phys. target (a hint) */
+ ext4_fsblk_t goal;
+ /* phys. block for the closest logical allocated block to the left */
+ ext4_fsblk_t pleft;
+ /* phys. block for the closest logical allocated block to the right */
+ ext4_fsblk_t pright;
+ /* flags. see above EXT4_MB_HINT_* */
+ unsigned int flags;
+};
+
+/*
+ * Logical to physical block mapping, used by ext4_map_blocks()
+ *
+ * This structure is used to pass requests into ext4_map_blocks() as
+ * well as to store the information returned by ext4_map_blocks(). It
+ * takes less room on the stack than a struct buffer_head.
+ */
+#define EXT4_MAP_NEW (1 << BH_New)
+#define EXT4_MAP_MAPPED (1 << BH_Mapped)
+#define EXT4_MAP_UNWRITTEN (1 << BH_Unwritten)
+#define EXT4_MAP_BOUNDARY (1 << BH_Boundary)
+#define EXT4_MAP_UNINIT (1 << BH_Uninit)
+/* Sometimes (in the bigalloc case, from ext4_da_get_block_prep) the caller of
+ * ext4_map_blocks wants to know whether or not the underlying cluster has
+ * already been accounted for. EXT4_MAP_FROM_CLUSTER conveys to the caller that
+ * the requested mapping was from previously mapped (or delayed allocated)
+ * cluster. We use BH_AllocFromCluster only for this flag. BH_AllocFromCluster
+ * should never appear on buffer_head's state flags.
+ */
+#define EXT4_MAP_FROM_CLUSTER (1 << BH_AllocFromCluster)
+#define EXT4_MAP_FLAGS (EXT4_MAP_NEW | EXT4_MAP_MAPPED |\
+ EXT4_MAP_UNWRITTEN | EXT4_MAP_BOUNDARY |\
+ EXT4_MAP_UNINIT | EXT4_MAP_FROM_CLUSTER)
+
+struct ext4_map_blocks {
+ ext4_fsblk_t m_pblk;
+ ext4_lblk_t m_lblk;
+ unsigned int m_len;
+ unsigned int m_flags;
+};
+
+/*
+ * For delayed allocation tracking
+ */
+struct mpage_da_data {
+ struct inode *inode;
+ sector_t b_blocknr; /* start block number of extent */
+ size_t b_size; /* size of extent */
+ unsigned long b_state; /* state of the extent */
+ unsigned long first_page, next_page; /* extent of pages */
+ struct writeback_control *wbc;
+ int io_done;
+ int pages_written;
+ int retval;
+};
+
+/*
+ * Flags for ext4_io_end->flag
+ */
+#define EXT4_IO_END_UNWRITTEN 0x0001
+#define EXT4_IO_END_ERROR 0x0002
+#define EXT4_IO_END_QUEUED 0x0004
+#define EXT4_IO_END_DIRECT 0x0008
+#define EXT4_IO_END_IN_FSYNC 0x0010
+
+struct ext4_io_page {
+ struct page *p_page;
+ atomic_t p_count;
+};
+
+#define MAX_IO_PAGES 128
+
+/*
+ * For converting uninitialized extents on a work queue.
+ *
+ * 'page' is only used from the writepage() path; 'pages' is only used for
+ * buffered writes; they are used to keep page references until conversion
+ * takes place. For AIO/DIO, neither field is filled in.
+ */
+typedef struct ext4_io_end {
+ struct list_head list; /* per-file finished IO list */
+ struct inode *inode; /* file being written to */
+ unsigned int flag; /* unwritten or not */
+ struct page *page; /* for writepage() path */
+ loff_t offset; /* offset in the file */
+ ssize_t size; /* size of the extent */
+ struct work_struct work; /* data work queue */
+ struct kiocb *iocb; /* iocb struct for AIO */
+ int result; /* error value for AIO */
+ int num_io_pages; /* for writepages() */
+ struct ext4_io_page *pages[MAX_IO_PAGES]; /* for writepages() */
+} ext4_io_end_t;
+
+struct ext4_io_submit {
+ int io_op;
+ struct bio *io_bio;
+ ext4_io_end_t *io_end;
+ struct ext4_io_page *io_page;
+ sector_t io_next_block;
+};
+
+/*
+ * Special inodes numbers
+ */
+#define EXT4_BAD_INO 1 /* Bad blocks inode */
+#define EXT4_ROOT_INO 2 /* Root inode */
+#define EXT4_USR_QUOTA_INO 3 /* User quota inode */
+#define EXT4_GRP_QUOTA_INO 4 /* Group quota inode */
+#define EXT4_BOOT_LOADER_INO 5 /* Boot loader inode */
+#define EXT4_UNDEL_DIR_INO 6 /* Undelete directory inode */
+#define EXT4_RESIZE_INO 7 /* Reserved group descriptors inode */
+#define EXT4_JOURNAL_INO 8 /* Journal inode */
+
+/* First non-reserved inode for old ext4 filesystems */
+#define EXT4_GOOD_OLD_FIRST_INO 11
+
+/*
+ * Maximal count of links to a file
+ */
+#define EXT4_LINK_MAX 65000
+
+/*
+ * Macro-instructions used to manage several block sizes
+ */
+#define EXT4_MIN_BLOCK_SIZE 1024
+#define EXT4_MAX_BLOCK_SIZE 65536
+#define EXT4_MIN_BLOCK_LOG_SIZE 10
+#define EXT4_MAX_BLOCK_LOG_SIZE 16
+#ifdef __KERNEL__
+# define EXT4_BLOCK_SIZE(s) ((s)->s_blocksize)
+#else
+# define EXT4_BLOCK_SIZE(s) (EXT4_MIN_BLOCK_SIZE << (s)->s_log_block_size)
+#endif
+#define EXT4_ADDR_PER_BLOCK(s) (EXT4_BLOCK_SIZE(s) / sizeof(__u32))
+#define EXT4_CLUSTER_SIZE(s) (EXT4_BLOCK_SIZE(s) << \
+ EXT4_SB(s)->s_cluster_bits)
+#ifdef __KERNEL__
+# define EXT4_BLOCK_SIZE_BITS(s) ((s)->s_blocksize_bits)
+# define EXT4_CLUSTER_BITS(s) (EXT4_SB(s)->s_cluster_bits)
+#else
+# define EXT4_BLOCK_SIZE_BITS(s) ((s)->s_log_block_size + 10)
+#endif
+#ifdef __KERNEL__
+#define EXT4_ADDR_PER_BLOCK_BITS(s) (EXT4_SB(s)->s_addr_per_block_bits)
+#define EXT4_INODE_SIZE(s) (EXT4_SB(s)->s_inode_size)
+#define EXT4_FIRST_INO(s) (EXT4_SB(s)->s_first_ino)
+#else
+#define EXT4_INODE_SIZE(s) (((s)->s_rev_level == EXT4_GOOD_OLD_REV) ? \
+ EXT4_GOOD_OLD_INODE_SIZE : \
+ (s)->s_inode_size)
+#define EXT4_FIRST_INO(s) (((s)->s_rev_level == EXT4_GOOD_OLD_REV) ? \
+ EXT4_GOOD_OLD_FIRST_INO : \
+ (s)->s_first_ino)
+#endif
+#define EXT4_BLOCK_ALIGN(size, blkbits) ALIGN((size), (1 << (blkbits)))
+
+/* Translate a block number to a cluster number */
+#define EXT4_B2C(sbi, blk) ((blk) >> (sbi)->s_cluster_bits)
+/* Translate a cluster number to a block number */
+#define EXT4_C2B(sbi, cluster) ((cluster) << (sbi)->s_cluster_bits)
+/* Translate # of blks to # of clusters */
+#define EXT4_NUM_B2C(sbi, blks) (((blks) + (sbi)->s_cluster_ratio - 1) >> \
+ (sbi)->s_cluster_bits)
+
+/*
+ * Structure of a blocks group descriptor
+ */
+struct ext4_group_desc
+{
+ __le32 bg_block_bitmap_lo; /* Blocks bitmap block */
+ __le32 bg_inode_bitmap_lo; /* Inodes bitmap block */
+ __le32 bg_inode_table_lo; /* Inodes table block */
+ __le16 bg_free_blocks_count_lo;/* Free blocks count */
+ __le16 bg_free_inodes_count_lo;/* Free inodes count */
+ __le16 bg_used_dirs_count_lo; /* Directories count */
+ __le16 bg_flags; /* EXT4_BG_flags (INODE_UNINIT, etc) */
+ __u32 bg_reserved[2]; /* Likely block/inode bitmap checksum */
+ __le16 bg_itable_unused_lo; /* Unused inodes count */
+ __le16 bg_checksum; /* crc16(sb_uuid+group+desc) */
+ __le32 bg_block_bitmap_hi; /* Blocks bitmap block MSB */
+ __le32 bg_inode_bitmap_hi; /* Inodes bitmap block MSB */
+ __le32 bg_inode_table_hi; /* Inodes table block MSB */
+ __le16 bg_free_blocks_count_hi;/* Free blocks count MSB */
+ __le16 bg_free_inodes_count_hi;/* Free inodes count MSB */
+ __le16 bg_used_dirs_count_hi; /* Directories count MSB */
+ __le16 bg_itable_unused_hi; /* Unused inodes count MSB */
+ __u32 bg_reserved2[3];
+};
+
+/*
+ * Structure of a flex block group info
+ */
+
+struct flex_groups {
+ atomic_t free_inodes;
+ atomic_t free_clusters;
+ atomic_t used_dirs;
+};
+
+#define EXT4_BG_INODE_UNINIT 0x0001 /* Inode table/bitmap not in use */
+#define EXT4_BG_BLOCK_UNINIT 0x0002 /* Block bitmap not in use */
+#define EXT4_BG_INODE_ZEROED 0x0004 /* On-disk itable initialized to zero */
+
+/*
+ * Macro-instructions used to manage group descriptors
+ */
+#define EXT4_MIN_DESC_SIZE 32
+#define EXT4_MIN_DESC_SIZE_64BIT 64
+#define EXT4_MAX_DESC_SIZE EXT4_MIN_BLOCK_SIZE
+#define EXT4_DESC_SIZE(s) (EXT4_SB(s)->s_desc_size)
+#ifdef __KERNEL__
+# define EXT4_BLOCKS_PER_GROUP(s) (EXT4_SB(s)->s_blocks_per_group)
+# define EXT4_CLUSTERS_PER_GROUP(s) (EXT4_SB(s)->s_clusters_per_group)
+# define EXT4_DESC_PER_BLOCK(s) (EXT4_SB(s)->s_desc_per_block)
+# define EXT4_INODES_PER_GROUP(s) (EXT4_SB(s)->s_inodes_per_group)
+# define EXT4_DESC_PER_BLOCK_BITS(s) (EXT4_SB(s)->s_desc_per_block_bits)
+#else
+# define EXT4_BLOCKS_PER_GROUP(s) ((s)->s_blocks_per_group)
+# define EXT4_DESC_PER_BLOCK(s) (EXT4_BLOCK_SIZE(s) / EXT4_DESC_SIZE(s))
+# define EXT4_INODES_PER_GROUP(s) ((s)->s_inodes_per_group)
+#endif
+
+/*
+ * Constants relative to the data blocks
+ */
+#define EXT4_NDIR_BLOCKS 12
+#define EXT4_IND_BLOCK EXT4_NDIR_BLOCKS
+#define EXT4_DIND_BLOCK (EXT4_IND_BLOCK + 1)
+#define EXT4_TIND_BLOCK (EXT4_DIND_BLOCK + 1)
+#define EXT4_N_BLOCKS (EXT4_TIND_BLOCK + 1)
+
+/*
+ * Inode flags
+ */
+#define EXT4_SECRM_FL 0x00000001 /* Secure deletion */
+#define EXT4_UNRM_FL 0x00000002 /* Undelete */
+#define EXT4_COMPR_FL 0x00000004 /* Compress file */
+#define EXT4_SYNC_FL 0x00000008 /* Synchronous updates */
+#define EXT4_IMMUTABLE_FL 0x00000010 /* Immutable file */
+#define EXT4_APPEND_FL 0x00000020 /* writes to file may only append */
+#define EXT4_NODUMP_FL 0x00000040 /* do not dump file */
+#define EXT4_NOATIME_FL 0x00000080 /* do not update atime */
+/* Reserved for compression usage... */
+#define EXT4_DIRTY_FL 0x00000100
+#define EXT4_COMPRBLK_FL 0x00000200 /* One or more compressed clusters */
+#define EXT4_NOCOMPR_FL 0x00000400 /* Don't compress */
+#define EXT4_ECOMPR_FL 0x00000800 /* Compression error */
+/* End compression flags --- maybe not all used */
+#define EXT4_INDEX_FL 0x00001000 /* hash-indexed directory */
+#define EXT4_IMAGIC_FL 0x00002000 /* AFS directory */
+#define EXT4_JOURNAL_DATA_FL 0x00004000 /* file data should be journaled */
+#define EXT4_NOTAIL_FL 0x00008000 /* file tail should not be merged */
+#define EXT4_DIRSYNC_FL 0x00010000 /* dirsync behaviour (directories only) */
+#define EXT4_TOPDIR_FL 0x00020000 /* Top of directory hierarchies*/
+#define EXT4_HUGE_FILE_FL 0x00040000 /* Set to each huge file */
+#define EXT4_EXTENTS_FL 0x00080000 /* Inode uses extents */
+#define EXT4_EA_INODE_FL 0x00200000 /* Inode used for large EA */
+#define EXT4_EOFBLOCKS_FL 0x00400000 /* Blocks allocated beyond EOF */
+#define EXT4_RESERVED_FL 0x80000000 /* reserved for ext4 lib */
+
+#define EXT4_FL_USER_VISIBLE 0x004BDFFF /* User visible flags */
+#define EXT4_FL_USER_MODIFIABLE 0x004B80FF /* User modifiable flags */
+
+/* Flags that should be inherited by new inodes from their parent. */
+#define EXT4_FL_INHERITED (EXT4_SECRM_FL | EXT4_UNRM_FL | EXT4_COMPR_FL |\
+ EXT4_SYNC_FL | EXT4_NODUMP_FL | EXT4_NOATIME_FL |\
+ EXT4_NOCOMPR_FL | EXT4_JOURNAL_DATA_FL |\
+ EXT4_NOTAIL_FL | EXT4_DIRSYNC_FL)
+
+/* Flags that are appropriate for regular files (all but dir-specific ones). */
+#define EXT4_REG_FLMASK (~(EXT4_DIRSYNC_FL | EXT4_TOPDIR_FL))
+
+/* Flags that are appropriate for inodes that are neither directories nor regular files. */
+#define EXT4_OTHER_FLMASK (EXT4_NODUMP_FL | EXT4_NOATIME_FL)
+
+/*
+ * Mask out flags that are inappropriate for the given type of inode.
+ *
+ * Directories keep every flag, regular files keep the EXT4_REG_FLMASK
+ * subset, and every other inode type keeps only EXT4_OTHER_FLMASK.
+ * Returns the filtered flag word; @flags itself is not modified.
+ */
+static inline __u32 ext4_mask_flags(umode_t mode, __u32 flags)
+{
+	__u32 keep = ~(__u32)0;	/* directories: nothing is masked out */
+
+	if (!S_ISDIR(mode))
+		keep = S_ISREG(mode) ? EXT4_REG_FLMASK : EXT4_OTHER_FLMASK;
+
+	return flags & keep;
+}
+
+/*
+ * Inode flags used for atomic set/get
+ */
+enum {
+ EXT4_INODE_SECRM = 0, /* Secure deletion */
+ EXT4_INODE_UNRM = 1, /* Undelete */
+ EXT4_INODE_COMPR = 2, /* Compress file */
+ EXT4_INODE_SYNC = 3, /* Synchronous updates */
+ EXT4_INODE_IMMUTABLE = 4, /* Immutable file */
+ EXT4_INODE_APPEND = 5, /* writes to file may only append */
+ EXT4_INODE_NODUMP = 6, /* do not dump file */
+ EXT4_INODE_NOATIME = 7, /* do not update atime */
+/* Reserved for compression usage... */
+ EXT4_INODE_DIRTY = 8,
+ EXT4_INODE_COMPRBLK = 9, /* One or more compressed clusters */
+ EXT4_INODE_NOCOMPR = 10, /* Don't compress */
+ EXT4_INODE_ECOMPR = 11, /* Compression error */
+/* End compression flags --- maybe not all used */
+ EXT4_INODE_INDEX = 12, /* hash-indexed directory */
+ EXT4_INODE_IMAGIC = 13, /* AFS directory */
+ EXT4_INODE_JOURNAL_DATA = 14, /* file data should be journaled */
+ EXT4_INODE_NOTAIL = 15, /* file tail should not be merged */
+ EXT4_INODE_DIRSYNC = 16, /* dirsync behaviour (directories only) */
+ EXT4_INODE_TOPDIR = 17, /* Top of directory hierarchies*/
+ EXT4_INODE_HUGE_FILE = 18, /* Set to each huge file */
+ EXT4_INODE_EXTENTS = 19, /* Inode uses extents */
+ EXT4_INODE_EA_INODE = 21, /* Inode used for large EA */
+ EXT4_INODE_EOFBLOCKS = 22, /* Blocks allocated beyond EOF */
+ EXT4_INODE_RESERVED = 31, /* reserved for ext4 lib */
+};
+
+#define TEST_FLAG_VALUE(FLAG) (EXT4_##FLAG##_FL == (1 << EXT4_INODE_##FLAG))
+#define CHECK_FLAG_VALUE(FLAG) if (!TEST_FLAG_VALUE(FLAG)) { \
+ printk(KERN_EMERG "EXT4 flag fail: " #FLAG ": %d %d\n", \
+ EXT4_##FLAG##_FL, EXT4_INODE_##FLAG); BUG_ON(1); }
+
+/*
+ * Since it's pretty easy to mix up bit numbers and hex values, and we
+ * can't do a compile-time test for ENUM values, we use a run-time
+ * test to make sure that every EXT4_XXX_FL mask is consistent with its
+ * EXT4_INODE_XXX bit number.  If all is well the printk and BUG_ON will
+ * all drop out so it won't cost any extra space in the compiled kernel
+ * image.  But it's important that these values are the same, since we are
+ * using EXT4_INODE_XXX to test for the flag values, but EXT4_XXX_FL
+ * must be consistent with the values of FS_XXX_FL defined in
+ * include/linux/fs.h and the on-disk values found in ext2, ext3, and
+ * ext4 filesystems, and of course the values defined in e2fsprogs.
+ *
+ * Each CHECK_FLAG_VALUE() line below covers exactly one flag; a flag that
+ * is added to both the EXT4_*_FL defines and the EXT4_INODE_* enum should
+ * get a matching line here.
+ *
+ * It's not paranoia if Murphy's Law really *is* out to get you. :-)
+ */
+static inline void ext4_check_flag_values(void)
+{
+ CHECK_FLAG_VALUE(SECRM);
+ CHECK_FLAG_VALUE(UNRM);
+ CHECK_FLAG_VALUE(COMPR);
+ CHECK_FLAG_VALUE(SYNC);
+ CHECK_FLAG_VALUE(IMMUTABLE);
+ CHECK_FLAG_VALUE(APPEND);
+ CHECK_FLAG_VALUE(NODUMP);
+ CHECK_FLAG_VALUE(NOATIME);
+ CHECK_FLAG_VALUE(DIRTY);
+ CHECK_FLAG_VALUE(COMPRBLK);
+ CHECK_FLAG_VALUE(NOCOMPR);
+ CHECK_FLAG_VALUE(ECOMPR);
+ CHECK_FLAG_VALUE(INDEX);
+ CHECK_FLAG_VALUE(IMAGIC);
+ CHECK_FLAG_VALUE(JOURNAL_DATA);
+ CHECK_FLAG_VALUE(NOTAIL);
+ CHECK_FLAG_VALUE(DIRSYNC);
+ CHECK_FLAG_VALUE(TOPDIR);
+ CHECK_FLAG_VALUE(HUGE_FILE);
+ CHECK_FLAG_VALUE(EXTENTS);
+ CHECK_FLAG_VALUE(EA_INODE);
+ CHECK_FLAG_VALUE(EOFBLOCKS);
+ CHECK_FLAG_VALUE(RESERVED);
+}
+
+/* Used to pass group descriptor data when online resize is done */
+struct ext4_new_group_input {
+ __u32 group; /* Group number for this data */
+ __u64 block_bitmap; /* Absolute block number of block bitmap */
+ __u64 inode_bitmap; /* Absolute block number of inode bitmap */
+ __u64 inode_table; /* Absolute block number of inode table start */
+ __u32 blocks_count; /* Total number of blocks in this group */
+ __u16 reserved_blocks; /* Number of reserved blocks in this group */
+ __u16 unused;
+};
+
+#if defined(__KERNEL__) && defined(CONFIG_COMPAT)
+struct compat_ext4_new_group_input {
+ u32 group;
+ compat_u64 block_bitmap;
+ compat_u64 inode_bitmap;
+ compat_u64 inode_table;
+ u32 blocks_count;
+ u16 reserved_blocks;
+ u16 unused;
+};
+#endif
+
+/* The struct ext4_new_group_input in kernel space, with free_blocks_count */
+struct ext4_new_group_data {
+ __u32 group;
+ __u64 block_bitmap;
+ __u64 inode_bitmap;
+ __u64 inode_table;
+ __u32 blocks_count;
+ __u16 reserved_blocks;
+ __u16 unused;
+ __u32 free_blocks_count;
+};
+
+/* Indexes used to index group tables in ext4_new_group_data */
+enum {
+ BLOCK_BITMAP = 0, /* block bitmap */
+ INODE_BITMAP, /* inode bitmap */
+ INODE_TABLE, /* inode tables */
+ GROUP_TABLE_COUNT,
+};
+
+/*
+ * Flags used by ext4_map_blocks()
+ */
+ /* Allocate any needed blocks and/or convert an uninitialized
+ extent to be an initialized extent */
+#define EXT4_GET_BLOCKS_CREATE 0x0001
+ /* Request the creation of an uninitialized extent */
+#define EXT4_GET_BLOCKS_UNINIT_EXT 0x0002
+#define EXT4_GET_BLOCKS_CREATE_UNINIT_EXT (EXT4_GET_BLOCKS_UNINIT_EXT|\
+ EXT4_GET_BLOCKS_CREATE)
+ /* Caller is from the delayed allocation writeout path,
+ so set the magic i_delalloc_reserve_flag after taking the
+ inode allocation semaphore for */
+#define EXT4_GET_BLOCKS_DELALLOC_RESERVE 0x0004
+ /* Caller is from the direct IO path: request the creation of
+ uninitialized extents if the blocks are not yet allocated, or split
+ the uninitialized extent if the blocks have already been preallocated */
+#define EXT4_GET_BLOCKS_PRE_IO 0x0008
+#define EXT4_GET_BLOCKS_CONVERT 0x0010
+#define EXT4_GET_BLOCKS_IO_CREATE_EXT (EXT4_GET_BLOCKS_PRE_IO|\
+ EXT4_GET_BLOCKS_CREATE_UNINIT_EXT)
+ /* Convert extent to initialized after IO complete */
+#define EXT4_GET_BLOCKS_IO_CONVERT_EXT (EXT4_GET_BLOCKS_CONVERT|\
+ EXT4_GET_BLOCKS_CREATE_UNINIT_EXT)
+ /* Punch out blocks of an extent */
+#define EXT4_GET_BLOCKS_PUNCH_OUT_EXT 0x0020
+ /* Don't normalize allocation size (used for fallocate) */
+#define EXT4_GET_BLOCKS_NO_NORMALIZE 0x0040
+ /* Request will not result in inode size update (used for fallocate) */
+#define EXT4_GET_BLOCKS_KEEP_SIZE 0x0080
+
+/*
+ * Flags used by ext4_free_blocks
+ */
+#define EXT4_FREE_BLOCKS_METADATA 0x0001
+#define EXT4_FREE_BLOCKS_FORGET 0x0002
+#define EXT4_FREE_BLOCKS_VALIDATED 0x0004
+#define EXT4_FREE_BLOCKS_NO_QUOT_UPDATE 0x0008
+#define EXT4_FREE_BLOCKS_NOFREE_FIRST_CLUSTER 0x0010
+#define EXT4_FREE_BLOCKS_NOFREE_LAST_CLUSTER 0x0020
+
+/*
+ * Flags used by ext4_discard_partial_page_buffers
+ */
+#define EXT4_DISCARD_PARTIAL_PG_ZERO_UNMAPPED 0x0001
+
+/*
+ * ioctl commands
+ */
+#define EXT4_IOC_GETFLAGS FS_IOC_GETFLAGS
+#define EXT4_IOC_SETFLAGS FS_IOC_SETFLAGS
+#define EXT4_IOC_GETVERSION _IOR('f', 3, long)
+#define EXT4_IOC_SETVERSION _IOW('f', 4, long)
+#define EXT4_IOC_GETVERSION_OLD FS_IOC_GETVERSION
+#define EXT4_IOC_SETVERSION_OLD FS_IOC_SETVERSION
+#define EXT4_IOC_GETRSVSZ _IOR('f', 5, long)
+#define EXT4_IOC_SETRSVSZ _IOW('f', 6, long)
+#define EXT4_IOC_GROUP_EXTEND _IOW('f', 7, unsigned long)
+#define EXT4_IOC_GROUP_ADD _IOW('f', 8, struct ext4_new_group_input)
+#define EXT4_IOC_MIGRATE _IO('f', 9)
+ /* note ioctl 10 reserved for an early version of the FIEMAP ioctl */
+ /* note ioctl 11 reserved for filesystem-independent FIEMAP ioctl */
+#define EXT4_IOC_ALLOC_DA_BLKS _IO('f', 12)
+#define EXT4_IOC_MOVE_EXT _IOWR('f', 15, struct move_extent)
+#define EXT4_IOC_RESIZE_FS _IOW('f', 16, __u64)
+
+#if defined(__KERNEL__) && defined(CONFIG_COMPAT)
+/*
+ * ioctl commands in 32 bit emulation
+ */
+#define EXT4_IOC32_GETFLAGS FS_IOC32_GETFLAGS
+#define EXT4_IOC32_SETFLAGS FS_IOC32_SETFLAGS
+#define EXT4_IOC32_GETVERSION _IOR('f', 3, int)
+#define EXT4_IOC32_SETVERSION _IOW('f', 4, int)
+#define EXT4_IOC32_GETRSVSZ _IOR('f', 5, int)
+#define EXT4_IOC32_SETRSVSZ _IOW('f', 6, int)
+#define EXT4_IOC32_GROUP_EXTEND _IOW('f', 7, unsigned int)
+#define EXT4_IOC32_GROUP_ADD _IOW('f', 8, struct compat_ext4_new_group_input)
+#define EXT4_IOC32_GETVERSION_OLD FS_IOC32_GETVERSION
+#define EXT4_IOC32_SETVERSION_OLD FS_IOC32_SETVERSION
+#endif
+
+/* Max physical block we can address w/o extents */
+#define EXT4_MAX_BLOCK_FILE_PHYS 0xFFFFFFFF
+
+/*
+ * Structure of an inode on the disk
+ */
+struct ext4_inode {
+ __le16 i_mode; /* File mode */
+ __le16 i_uid; /* Low 16 bits of Owner Uid */
+ __le32 i_size_lo; /* Size in bytes */
+ __le32 i_atime; /* Access time */
+ __le32 i_ctime; /* Inode Change time */
+ __le32 i_mtime; /* Modification time */
+ __le32 i_dtime; /* Deletion Time */
+ __le16 i_gid; /* Low 16 bits of Group Id */
+ __le16 i_links_count; /* Links count */
+ __le32 i_blocks_lo; /* Blocks count */
+ __le32 i_flags; /* File flags */
+ union {
+ struct {
+ __le32 l_i_version;
+ } linux1;
+ struct {
+ __u32 h_i_translator;
+ } hurd1;
+ struct {
+ __u32 m_i_reserved1;
+ } masix1;
+ } osd1; /* OS dependent 1 */
+ __le32 i_block[EXT4_N_BLOCKS];/* Pointers to blocks */
+ __le32 i_generation; /* File version (for NFS) */
+ __le32 i_file_acl_lo; /* File ACL */
+ __le32 i_size_high;
+ __le32 i_obso_faddr; /* Obsoleted fragment address */
+ union {
+ struct {
+ __le16 l_i_blocks_high; /* were l_i_reserved1 */
+ __le16 l_i_file_acl_high;
+ __le16 l_i_uid_high; /* these 2 fields */
+ __le16 l_i_gid_high; /* were reserved2[0] */
+ __u32 l_i_reserved2;
+ } linux2;
+ struct {
+ __le16 h_i_reserved1; /* Obsoleted fragment number/size which are removed in ext4 */
+ __u16 h_i_mode_high;
+ __u16 h_i_uid_high;
+ __u16 h_i_gid_high;
+ __u32 h_i_author;
+ } hurd2;
+ struct {
+ __le16 h_i_reserved1; /* Obsoleted fragment number/size which are removed in ext4 */
+ __le16 m_i_file_acl_high;
+ __u32 m_i_reserved2[2];
+ } masix2;
+ } osd2; /* OS dependent 2 */
+ __le16 i_extra_isize;
+ __le16 i_pad1;
+ __le32 i_ctime_extra; /* extra Change time (nsec << 2 | epoch) */
+ __le32 i_mtime_extra; /* extra Modification time(nsec << 2 | epoch) */
+ __le32 i_atime_extra; /* extra Access time (nsec << 2 | epoch) */
+ __le32 i_crtime; /* File Creation time */
+ __le32 i_crtime_extra; /* extra FileCreationtime (nsec << 2 | epoch) */
+ __le32 i_version_hi; /* high 32 bits for 64-bit version */
+};
+
+struct move_extent {
+ __u32 reserved; /* should be zero */
+ __u32 donor_fd; /* donor file descriptor */
+ __u64 orig_start; /* logical start offset in block for orig */
+ __u64 donor_start; /* logical start offset in block for donor */
+ __u64 len; /* block length to be moved */
+ __u64 moved_len; /* moved block length */
+};
+
+#define EXT4_EPOCH_BITS 2
+#define EXT4_EPOCH_MASK ((1 << EXT4_EPOCH_BITS) - 1)
+#define EXT4_NSEC_MASK (~0UL << EXT4_EPOCH_BITS)
+
+/*
+ * Extended fields will fit into an inode if the filesystem was formatted
+ * with large inodes (-I 256 or larger) and there are not currently any EAs
+ * consuming all of the available space. For new inodes we always reserve
+ * enough space for the kernel's known extended fields, but for inodes
+ * created with an old kernel this might not have been the case. None of
+ * the extended inode fields is critical for correct filesystem operation.
+ * This macro checks if a certain field fits in the inode. Note that
+ * inode-size = GOOD_OLD_INODE_SIZE + i_extra_isize
+ *
+ * @ext4_inode: pointer whose pointed-to type supplies the field layout
+ * @einode:     pointer to an object carrying i_extra_isize
+ * @field:      member name to test
+ *
+ * Evaluates to true when the end offset of @field does not exceed
+ * EXT4_GOOD_OLD_INODE_SIZE + (einode)->i_extra_isize.
+ */
+#define EXT4_FITS_IN_INODE(ext4_inode, einode, field) \
+	((offsetof(typeof(*(ext4_inode)), field) + \
+	  sizeof((ext4_inode)->field)) \
+	 <= (EXT4_GOOD_OLD_INODE_SIZE + \
+	     (einode)->i_extra_isize))
+
+/*
+ * Build the on-disk 32-bit "extra" timestamp word for @time.
+ *
+ * Layout: the low EXT4_EPOCH_BITS bits extend the 32-bit seconds field,
+ * the remaining upper bits carry tv_nsec.
+ *
+ * The epoch bits store how many multiples of 2^32 must be ADDED to the
+ * sign-extended low 32 bits of tv_sec to get the full value back.  That is
+ * why the low word is subtracted (as a signed 32-bit quantity) before
+ * shifting: a plain "tv_sec >> 32" yields 1,1 for pre-1970 dates and the
+ * wrong epoch for any time whose low 32 bits have the top bit set
+ * (e.g. 2038-2106).
+ *
+ * When tv_sec is only 32 bits wide the epoch bits are written as zero.
+ * Returns the word in little-endian on-disk byte order.
+ */
+static inline __le32 ext4_encode_extra_time(struct timespec *time)
+{
+	__u32 epoch = sizeof(time->tv_sec) > 4 ?
+		((time->tv_sec - (__s32)time->tv_sec) >> 32) & EXT4_EPOCH_MASK : 0;
+
+	return cpu_to_le32(epoch |
+			   ((time->tv_nsec << EXT4_EPOCH_BITS) & EXT4_NSEC_MASK));
+}
+
+/*
+ * Apply the on-disk "extra" timestamp word to @time.
+ *
+ * On entry time->tv_sec must already hold the sign-extended 32-bit seconds
+ * field.  The epoch bits are added (not OR-ed) to it: OR-ing into a
+ * sign-extended negative value cannot change the upper bits, so such
+ * timestamps would always decode as pre-1970.
+ *
+ * Compatibility: the previous encoder wrote epoch bits 1,1 for pre-1970
+ * dates.  That pattern combined with a negative low word is therefore
+ * treated as epoch 0, at the cost of not being able to represent the
+ * (far-future) times that would genuinely use that combination.
+ *
+ * tv_nsec is always taken from the upper bits of @extra.
+ */
+static inline void ext4_decode_extra_time(struct timespec *time, __le32 extra)
+{
+	if (sizeof(time->tv_sec) > 4) {
+		__u64 epoch = le32_to_cpu(extra) & EXT4_EPOCH_MASK;
+
+		/* legacy encoding of pre-1970 dates: epoch bits 1,1 */
+		if (epoch == EXT4_EPOCH_MASK && time->tv_sec < 0)
+			epoch = 0;
+		time->tv_sec += epoch << 32;
+	}
+	time->tv_nsec = (le32_to_cpu(extra) & EXT4_NSEC_MASK) >> EXT4_EPOCH_BITS;
+}
+
+#define EXT4_INODE_SET_XTIME(xtime, inode, raw_inode) \
+do { \
+ (raw_inode)->xtime = cpu_to_le32((inode)->xtime.tv_sec); \
+ if (EXT4_FITS_IN_INODE(raw_inode, EXT4_I(inode), xtime ## _extra)) \
+ (raw_inode)->xtime ## _extra = \
+ ext4_encode_extra_time(&(inode)->xtime); \
+} while (0)
+
+#define EXT4_EINODE_SET_XTIME(xtime, einode, raw_inode) \
+do { \
+ if (EXT4_FITS_IN_INODE(raw_inode, einode, xtime)) \
+ (raw_inode)->xtime = cpu_to_le32((einode)->xtime.tv_sec); \
+ if (EXT4_FITS_IN_INODE(raw_inode, einode, xtime ## _extra)) \
+ (raw_inode)->xtime ## _extra = \
+ ext4_encode_extra_time(&(einode)->xtime); \
+} while (0)
+
+/*
+ * Load (inode)->xtime from the raw on-disk inode.
+ *
+ * tv_sec is taken from the 32-bit xtime field, converted through a
+ * (signed) cast.  If the matching xtime_extra field fits inside this inode
+ * (per EXT4_FITS_IN_INODE with EXT4_I(inode)), it is handed to
+ * ext4_decode_extra_time(); otherwise tv_nsec is set to 0.
+ *
+ * Every use of the raw_inode argument is parenthesised so that an
+ * expression argument expands safely.
+ */
+#define EXT4_INODE_GET_XTIME(xtime, inode, raw_inode) \
+do { \
+	(inode)->xtime.tv_sec = (signed)le32_to_cpu((raw_inode)->xtime); \
+	if (EXT4_FITS_IN_INODE(raw_inode, EXT4_I(inode), xtime ## _extra)) \
+		ext4_decode_extra_time(&(inode)->xtime, \
+				       (raw_inode)->xtime ## _extra); \
+	else \
+		(inode)->xtime.tv_nsec = 0; \
+} while (0)
+
+/*
+ * Load (einode)->xtime from the raw on-disk inode, for timestamps kept in
+ * the object that also carries i_extra_isize.
+ *
+ * tv_sec is only written when the xtime field itself fits inside this
+ * inode; otherwise it is left untouched.  If the xtime_extra field fits it
+ * is handed to ext4_decode_extra_time(); otherwise tv_nsec is set to 0.
+ *
+ * Every use of the raw_inode argument is parenthesised so that an
+ * expression argument expands safely.
+ */
+#define EXT4_EINODE_GET_XTIME(xtime, einode, raw_inode) \
+do { \
+	if (EXT4_FITS_IN_INODE(raw_inode, einode, xtime)) \
+		(einode)->xtime.tv_sec = \
+			(signed)le32_to_cpu((raw_inode)->xtime); \
+	if (EXT4_FITS_IN_INODE(raw_inode, einode, xtime ## _extra)) \
+		ext4_decode_extra_time(&(einode)->xtime, \
+				       (raw_inode)->xtime ## _extra); \
+	else \
+		(einode)->xtime.tv_nsec = 0; \
+} while (0)
+
+#define i_disk_version osd1.linux1.l_i_version
+
+#if defined(__KERNEL__) || defined(__linux__)
+#define i_reserved1 osd1.linux1.l_i_reserved1
+#define i_file_acl_high osd2.linux2.l_i_file_acl_high
+#define i_blocks_high osd2.linux2.l_i_blocks_high
+#define i_uid_low i_uid
+#define i_gid_low i_gid
+#define i_uid_high osd2.linux2.l_i_uid_high
+#define i_gid_high osd2.linux2.l_i_gid_high
+#define i_reserved2 osd2.linux2.l_i_reserved2
+
+#elif defined(__GNU__)
+
+#define i_translator osd1.hurd1.h_i_translator
+#define i_uid_high osd2.hurd2.h_i_uid_high
+#define i_gid_high osd2.hurd2.h_i_gid_high
+#define i_author osd2.hurd2.h_i_author
+
+#elif defined(__masix__)
+
+#define i_reserved1 osd1.masix1.m_i_reserved1
+#define i_file_acl_high osd2.masix2.m_i_file_acl_high
+#define i_reserved2 osd2.masix2.m_i_reserved2
+
+#endif /* defined(__KERNEL__) || defined(__linux__) */
+
+/*
+ * storage for cached extent
+ * If ec_len == 0, then the cache is invalid.
+ * If ec_start == 0, then the cache represents a gap (null mapping)
+ */
+struct ext4_ext_cache {
+ ext4_fsblk_t ec_start;
+ ext4_lblk_t ec_block;
+ __u32 ec_len; /* must be 32bit to return holes */
+};
+
+/*
+ * fourth extended file system inode data in memory
+ */
+struct ext4_inode_info {
+ __le32 i_data[15]; /* unconverted */
+ __u32 i_dtime;
+ ext4_fsblk_t i_file_acl;
+
+ /*
+ * i_block_group is the number of the block group which contains
+ * this file's inode. Constant across the lifetime of the inode,
+ * it is used for making block allocation decisions - we try to
+ * place a file's data blocks near its inode block, and new inodes
+ * near to their parent directory's inode.
+ */
+ ext4_group_t i_block_group;
+ ext4_lblk_t i_dir_start_lookup;
+#if (BITS_PER_LONG < 64)
+ unsigned long i_state_flags; /* Dynamic state flags */
+#endif
+ unsigned long i_flags;
+
+#ifdef CONFIG_EXT4_FS_XATTR
+ /*
+ * Extended attributes can be read independently of the main file
+ * data. Taking i_mutex even when reading would cause contention
+ * between readers of EAs and writers of regular file data, so
+ * instead we synchronize on xattr_sem when reading or changing
+ * EAs.
+ */
+ struct rw_semaphore xattr_sem;
+#endif
+
+ struct list_head i_orphan; /* unlinked but open inodes */
+
+ /*
+ * i_disksize keeps track of what the inode size is ON DISK, not
+ * in memory. During truncate, i_size is set to the new size by
+ * the VFS prior to calling ext4_truncate(), but the filesystem won't
+ * set i_disksize to 0 until the truncate is actually under way.
+ *
+ * The intent is that i_disksize always represents the blocks which
+ * are used by this file. This allows recovery to restart truncate
+ * on orphans if we crash during truncate. We actually write i_disksize
+ * into the on-disk inode when writing inodes out, instead of i_size.
+ *
+ * The only time when i_disksize and i_size may be different is when
+ * a truncate is in progress. The only things which change i_disksize
+ * are ext4_get_block (growth) and ext4_truncate (shrinkth).
+ */
+ loff_t i_disksize;
+
+ /*
+ * i_data_sem is for serialising ext4_truncate() against
+ * ext4_getblock(). In the 2.4 ext2 design, great chunks of inode's
+ * data tree are chopped off during truncate. We can't do that in
+ * ext4 because whenever we perform intermediate commits during
+ * truncate, the inode and all the metadata blocks *must* be in a
+ * consistent state which allows truncation of the orphans to restart
+ * during recovery. Hence we must fix the get_block-vs-truncate race
+ * by other means, so we have i_data_sem.
+ */
+ struct rw_semaphore i_data_sem;
+ struct inode vfs_inode;
+ struct jbd2_inode *jinode;
+
+ struct ext4_ext_cache i_cached_extent;
+ /*
+ * File creation time. Its function is same as that of
+ * struct timespec i_{a,c,m}time in the generic inode.
+ */
+ struct timespec i_crtime;
+
+ /* mballoc */
+ struct list_head i_prealloc_list;
+ spinlock_t i_prealloc_lock;
+
+ /* ialloc */
+ ext4_group_t i_last_alloc_group;
+
+ /* allocation reservation info for delalloc */
+ /* In case of bigalloc, these refer to clusters rather than blocks */
+ unsigned int i_reserved_data_blocks;
+ unsigned int i_reserved_meta_blocks;
+ unsigned int i_allocated_meta_blocks;
+ ext4_lblk_t i_da_metadata_calc_last_lblock;
+ int i_da_metadata_calc_len;
+
+ /* on-disk additional length */
+ __u16 i_extra_isize;
+
+#ifdef CONFIG_QUOTA
+ /* quota space reservation, managed internally by quota code */
+ qsize_t i_reserved_quota;
+#endif
+
+ /* completed IOs that might need unwritten extents handling */
+ struct list_head i_completed_io_list;
+ spinlock_t i_completed_io_lock;
+ atomic_t i_ioend_count; /* Number of outstanding io_end structs */
+ /* current io_end structure for async DIO write*/
+ ext4_io_end_t *cur_aio_dio;
+ atomic_t i_aiodio_unwritten; /* Nr. of inflight conversions pending */
+
+ spinlock_t i_block_reservation_lock;
+
+ /*
+ * Transactions that contain inode's metadata needed to complete
+ * fsync and fdatasync, respectively.
+ */
+ tid_t i_sync_tid;
+ tid_t i_datasync_tid;
+};
+
+/*
+ * File system states
+ *
+ * Independent bit flags (0x0001, 0x0002, 0x0004) that can be OR'ed
+ * together. NOTE(review): presumably the values kept in the on-disk
+ * s_state field and in s_mount_state -- confirm against super.c.
+ */
+#define EXT4_VALID_FS 0x0001 /* Unmounted cleanly */
+#define EXT4_ERROR_FS 0x0002 /* Errors detected */
+#define EXT4_ORPHAN_FS 0x0004 /* Orphans being recovered */
+
+/*
+ * Misc. filesystem flags
+ *
+ * These keep the historical EXT2_ prefix. NOTE(review): presumably
+ * stored in the superblock's s_flags ("Miscellaneous flags") field --
+ * confirm against the users of these constants.
+ */
+#define EXT2_FLAGS_SIGNED_HASH 0x0001 /* Signed dirhash in use */
+#define EXT2_FLAGS_UNSIGNED_HASH 0x0002 /* Unsigned dirhash in use */
+#define EXT2_FLAGS_TEST_FILESYS 0x0004 /* to test development code */
+
+/*
+ * Mount flags
+ *
+ * EXT4_MOUNT_* are bits of s_mount_opt and are accessed through
+ * test_opt()/set_opt()/clear_opt(); EXT4_MOUNT2_* are bits of
+ * s_mount_opt2 and are accessed through the *_opt2() variants.
+ *
+ * Two entries are multi-bit fields rather than single flags:
+ *  - EXT4_MOUNT_ERRORS_MASK (0x00070) covers the three ERRORS_* bits.
+ *  - EXT4_MOUNT_DATA_FLAGS (0x00C00) covers the data mode; note that
+ *    WRITEBACK_DATA has the same value as the mask itself, so the mode
+ *    must be compared against the masked value, not tested bit by bit.
+ */
+#define EXT4_MOUNT_GRPID 0x00004 /* Create files with directory's group */
+#define EXT4_MOUNT_DEBUG 0x00008 /* Some debugging messages */
+#define EXT4_MOUNT_ERRORS_CONT 0x00010 /* Continue on errors */
+#define EXT4_MOUNT_ERRORS_RO 0x00020 /* Remount fs ro on errors */
+#define EXT4_MOUNT_ERRORS_PANIC 0x00040 /* Panic on errors */
+#define EXT4_MOUNT_ERRORS_MASK 0x00070
+#define EXT4_MOUNT_MINIX_DF 0x00080 /* Mimics the Minix statfs */
+#define EXT4_MOUNT_NOLOAD 0x00100 /* Don't use existing journal*/
+#define EXT4_MOUNT_DATA_FLAGS 0x00C00 /* Mode for data writes: */
+#define EXT4_MOUNT_JOURNAL_DATA 0x00400 /* Write data to journal */
+#define EXT4_MOUNT_ORDERED_DATA 0x00800 /* Flush data before commit */
+#define EXT4_MOUNT_WRITEBACK_DATA 0x00C00 /* No data ordering */
+#define EXT4_MOUNT_UPDATE_JOURNAL 0x01000 /* Update the journal format */
+#define EXT4_MOUNT_NO_UID32 0x02000 /* Disable 32-bit UIDs */
+#define EXT4_MOUNT_XATTR_USER 0x04000 /* Extended user attributes */
+#define EXT4_MOUNT_POSIX_ACL 0x08000 /* POSIX Access Control Lists */
+#define EXT4_MOUNT_NO_AUTO_DA_ALLOC 0x10000 /* No auto delalloc mapping */
+#define EXT4_MOUNT_BARRIER 0x20000 /* Use block barriers */
+#define EXT4_MOUNT_QUOTA 0x80000 /* Some quota option set */
+#define EXT4_MOUNT_USRQUOTA 0x100000 /* "old" user quota */
+#define EXT4_MOUNT_GRPQUOTA 0x200000 /* "old" group quota */
+#define EXT4_MOUNT_DIOREAD_NOLOCK 0x400000 /* Enable support for dio read nolocking */
+#define EXT4_MOUNT_JOURNAL_CHECKSUM 0x800000 /* Journal checksums */
+#define EXT4_MOUNT_JOURNAL_ASYNC_COMMIT 0x1000000 /* Journal Async Commit */
+#define EXT4_MOUNT_MBLK_IO_SUBMIT 0x4000000 /* multi-block io submits */
+#define EXT4_MOUNT_DELALLOC 0x8000000 /* Delalloc support */
+#define EXT4_MOUNT_DATA_ERR_ABORT 0x10000000 /* Abort on file data write */
+#define EXT4_MOUNT_BLOCK_VALIDITY 0x20000000 /* Block validity checking */
+#define EXT4_MOUNT_DISCARD 0x40000000 /* Issue DISCARD requests */
+#define EXT4_MOUNT_INIT_INODE_TABLE 0x80000000 /* Initialize uninitialized itables */
+
+#define EXT4_MOUNT2_EXPLICIT_DELALLOC 0x00000001 /* User explicitly
+ specified delalloc */
+
+/*
+ * Mount option accessors.
+ *
+ * @sb:  super block pointer (anything EXT4_SB() accepts)
+ * @opt: option name without its EXT4_MOUNT_ / EXT4_MOUNT2_ prefix,
+ *       e.g. test_opt(sb, DELALLOC)
+ *
+ * clear_opt()/set_opt()/test_opt() operate on s_mount_opt; the *_opt2()
+ * variants operate on s_mount_opt2. test_opt*() yields the masked value
+ * (non-zero when the bit is set), not a normalised 0/1.
+ *
+ * Every expansion is fully parenthesised so that the modifying macros
+ * behave as a single expression wherever they are expanded.
+ */
+#define clear_opt(sb, opt) (EXT4_SB(sb)->s_mount_opt &= \
+ ~EXT4_MOUNT_##opt)
+#define set_opt(sb, opt) (EXT4_SB(sb)->s_mount_opt |= \
+ EXT4_MOUNT_##opt)
+#define test_opt(sb, opt) (EXT4_SB(sb)->s_mount_opt & \
+ EXT4_MOUNT_##opt)
+
+#define clear_opt2(sb, opt) (EXT4_SB(sb)->s_mount_opt2 &= \
+ ~EXT4_MOUNT2_##opt)
+#define set_opt2(sb, opt) (EXT4_SB(sb)->s_mount_opt2 |= \
+ EXT4_MOUNT2_##opt)
+#define test_opt2(sb, opt) (EXT4_SB(sb)->s_mount_opt2 & \
+ EXT4_MOUNT2_##opt)
+
+#define ext4_test_and_set_bit __test_and_set_bit_le /* ext4_* bitmap ops are plain aliases of the *_le bitops */
+#define ext4_set_bit __set_bit_le
+#define ext4_set_bit_atomic ext2_set_bit_atomic /* atomic variants still go through the ext2_* helpers */
+#define ext4_test_and_clear_bit __test_and_clear_bit_le
+#define ext4_clear_bit __clear_bit_le
+#define ext4_clear_bit_atomic ext2_clear_bit_atomic
+#define ext4_test_bit test_bit_le
+#define ext4_find_next_zero_bit find_next_zero_bit_le
+#define ext4_find_next_bit find_next_bit_le
+
+extern void ext4_set_bits(void *bm, int cur, int len);
+
+/*
+ * Maximal mount counts between two filesystem checks
+ */
+#define EXT4_DFL_MAX_MNT_COUNT 20 /* Allow 20 mounts */
+#define EXT4_DFL_CHECKINTERVAL 0 /* Don't use interval check */
+
+/*
+ * Behaviour when detecting errors
+ *
+ * These are small enumerated values (1..3), not bit flags; the default
+ * is to continue. NOTE(review): presumably the encoding of the on-disk
+ * s_errors field ("Behaviour when detecting errors") -- confirm.
+ */
+#define EXT4_ERRORS_CONTINUE 1 /* Continue execution */
+#define EXT4_ERRORS_RO 2 /* Remount fs read-only */
+#define EXT4_ERRORS_PANIC 3 /* Panic */
+#define EXT4_ERRORS_DEFAULT EXT4_ERRORS_CONTINUE
+
+/*
+ * Structure of the super block
+ *
+ * Multi-byte members use the little-endian __le16/__le32/__le64 types.
+ * The hex tags in the left margin are the byte offsets of the members
+ * they precede. The member order and sizes define the layout, so
+ * members must not be reordered or resized.
+ *
+ * EXT4_S_ERR_START and EXT4_S_ERR_END are the offsets of s_error_count
+ * and s_mount_opts; EXT4_S_ERR_LEN (below the struct) is therefore the
+ * byte length of the error-tracking members between them.
+ */
+struct ext4_super_block {
+/*00*/ __le32 s_inodes_count; /* Inodes count */
+ __le32 s_blocks_count_lo; /* Blocks count */
+ __le32 s_r_blocks_count_lo; /* Reserved blocks count */
+ __le32 s_free_blocks_count_lo; /* Free blocks count */
+/*10*/ __le32 s_free_inodes_count; /* Free inodes count */
+ __le32 s_first_data_block; /* First Data Block */
+ __le32 s_log_block_size; /* Block size */
+ __le32 s_log_cluster_size; /* Allocation cluster size */
+/*20*/ __le32 s_blocks_per_group; /* # Blocks per group */
+ __le32 s_clusters_per_group; /* # Clusters per group */
+ __le32 s_inodes_per_group; /* # Inodes per group */
+ __le32 s_mtime; /* Mount time */
+/*30*/ __le32 s_wtime; /* Write time */
+ __le16 s_mnt_count; /* Mount count */
+ __le16 s_max_mnt_count; /* Maximal mount count */
+ __le16 s_magic; /* Magic signature */
+ __le16 s_state; /* File system state */
+ __le16 s_errors; /* Behaviour when detecting errors */
+ __le16 s_minor_rev_level; /* minor revision level */
+/*40*/ __le32 s_lastcheck; /* time of last check */
+ __le32 s_checkinterval; /* max. time between checks */
+ __le32 s_creator_os; /* OS */
+ __le32 s_rev_level; /* Revision level */
+/*50*/ __le16 s_def_resuid; /* Default uid for reserved blocks */
+ __le16 s_def_resgid; /* Default gid for reserved blocks */
+ /*
+ * These fields are for EXT4_DYNAMIC_REV superblocks only.
+ *
+ * Note: the difference between the compatible feature set and
+ * the incompatible feature set is that if there is a bit set
+ * in the incompatible feature set that the kernel doesn't
+ * know about, it should refuse to mount the filesystem.
+ *
+ * e2fsck's requirements are more strict; if it doesn't know
+ * about a feature in either the compatible or incompatible
+ * feature set, it must abort and not try to meddle with
+ * things it doesn't understand...
+ */
+ __le32 s_first_ino; /* First non-reserved inode */
+ __le16 s_inode_size; /* size of inode structure */
+ __le16 s_block_group_nr; /* block group # of this superblock */
+ __le32 s_feature_compat; /* compatible feature set */
+/*60*/ __le32 s_feature_incompat; /* incompatible feature set */
+ __le32 s_feature_ro_compat; /* readonly-compatible feature set */
+/*68*/ __u8 s_uuid[16]; /* 128-bit uuid for volume */
+/*78*/ char s_volume_name[16]; /* volume name */
+/*88*/ char s_last_mounted[64]; /* directory where last mounted */
+/*C8*/ __le32 s_algorithm_usage_bitmap; /* For compression */
+ /*
+ * Performance hints. Directory preallocation should only
+ * happen if the EXT4_FEATURE_COMPAT_DIR_PREALLOC flag is on.
+ */
+ __u8 s_prealloc_blocks; /* Nr of blocks to try to preallocate*/
+ __u8 s_prealloc_dir_blocks; /* Nr to preallocate for dirs */
+ __le16 s_reserved_gdt_blocks; /* Per group desc for online growth */
+ /*
+ * Journaling support valid if EXT4_FEATURE_COMPAT_HAS_JOURNAL set.
+ */
+/*D0*/ __u8 s_journal_uuid[16]; /* uuid of journal superblock */
+/*E0*/ __le32 s_journal_inum; /* inode number of journal file */
+ __le32 s_journal_dev; /* device number of journal file */
+ __le32 s_last_orphan; /* start of list of inodes to delete */
+ __le32 s_hash_seed[4]; /* HTREE hash seed */
+ __u8 s_def_hash_version; /* Default hash version to use */
+ __u8 s_jnl_backup_type;
+ __le16 s_desc_size; /* size of group descriptor */
+/*100*/ __le32 s_default_mount_opts;
+ __le32 s_first_meta_bg; /* First metablock block group */
+ __le32 s_mkfs_time; /* When the filesystem was created */
+ __le32 s_jnl_blocks[17]; /* Backup of the journal inode */
+ /* 64bit support valid if EXT4_FEATURE_INCOMPAT_64BIT */
+/*150*/ __le32 s_blocks_count_hi; /* Blocks count */
+ __le32 s_r_blocks_count_hi; /* Reserved blocks count */
+ __le32 s_free_blocks_count_hi; /* Free blocks count */
+ __le16 s_min_extra_isize; /* All inodes have at least # bytes */
+ __le16 s_want_extra_isize; /* New inodes should reserve # bytes */
+ __le32 s_flags; /* Miscellaneous flags */
+ __le16 s_raid_stride; /* RAID stride */
+ __le16 s_mmp_update_interval; /* # seconds to wait in MMP checking */
+ __le64 s_mmp_block; /* Block for multi-mount protection */
+ __le32 s_raid_stripe_width; /* blocks on all data disks (N*stride)*/
+ __u8 s_log_groups_per_flex; /* FLEX_BG group size */
+ __u8 s_reserved_char_pad;
+ __le16 s_reserved_pad;
+ __le64 s_kbytes_written; /* nr of lifetime kilobytes written */
+ __le32 s_snapshot_inum; /* Inode number of active snapshot */
+ __le32 s_snapshot_id; /* sequential ID of active snapshot */
+ __le64 s_snapshot_r_blocks_count; /* reserved blocks for active
+ snapshot's future use */
+ __le32 s_snapshot_list; /* inode number of the head of the
+ on-disk snapshot list */
+#define EXT4_S_ERR_START offsetof(struct ext4_super_block, s_error_count)
+ __le32 s_error_count; /* number of fs errors */
+ __le32 s_first_error_time; /* first time an error happened */
+ __le32 s_first_error_ino; /* inode involved in first error */
+ __le64 s_first_error_block; /* block involved in first error */
+ __u8 s_first_error_func[32]; /* function where the error happened */
+ __le32 s_first_error_line; /* line number where error happened */
+ __le32 s_last_error_time; /* most recent time of an error */
+ __le32 s_last_error_ino; /* inode involved in last error */
+ __le32 s_last_error_line; /* line number where error happened */
+ __le64 s_last_error_block; /* block involved in last error */
+ __u8 s_last_error_func[32]; /* function where the error happened */
+#define EXT4_S_ERR_END offsetof(struct ext4_super_block, s_mount_opts)
+ __u8 s_mount_opts[64];
+ __le32 s_usr_quota_inum; /* inode for tracking user quota */
+ __le32 s_grp_quota_inum; /* inode for tracking group quota */
+ __le32 s_overhead_clusters; /* overhead blocks/clusters in fs */
+ __le32 s_reserved[109]; /* Padding to the end of the block */
+};
+
+#define EXT4_S_ERR_LEN (EXT4_S_ERR_END - EXT4_S_ERR_START)
+
+#ifdef __KERNEL__
+
+/*
+ * run-time mount flags
+ *
+ * NOTE(review): presumably bits of ext4_sb_info.s_mount_flags (as
+ * opposed to the EXT4_MOUNT_* option bits) -- confirm against users.
+ */
+#define EXT4_MF_MNTDIR_SAMPLED 0x0001
+#define EXT4_MF_FS_ABORTED 0x0002 /* Fatal error detected */
+
+/*
+ * fourth extended-fs super-block data in memory
+ *
+ * Reached from a VFS super_block through EXT4_SB(), which returns
+ * sb->s_fs_info. s_es points at the on-disk superblock image inside
+ * the s_sbh buffer. s_mount_opt and s_mount_opt2 hold the
+ * EXT4_MOUNT_* and EXT4_MOUNT2_* option bits respectively.
+ */
+struct ext4_sb_info {
+ unsigned long s_desc_size; /* Size of a group descriptor in bytes */
+ unsigned long s_inodes_per_block;/* Number of inodes per block */
+ unsigned long s_blocks_per_group;/* Number of blocks in a group */
+ unsigned long s_clusters_per_group; /* Number of clusters in a group */
+ unsigned long s_inodes_per_group;/* Number of inodes in a group */
+ unsigned long s_itb_per_group; /* Number of inode table blocks per group */
+ unsigned long s_gdb_count; /* Number of group descriptor blocks */
+ unsigned long s_desc_per_block; /* Number of group descriptors per block */
+ ext4_group_t s_groups_count; /* Number of groups in the fs */
+ ext4_group_t s_blockfile_groups;/* Groups acceptable for non-extent files */
+ unsigned long s_overhead_last; /* Last calculated overhead */
+ unsigned long s_blocks_last; /* Last seen block count */
+ unsigned int s_cluster_ratio; /* Number of blocks per cluster */
+ unsigned int s_cluster_bits; /* log2 of s_cluster_ratio */
+ loff_t s_bitmap_maxbytes; /* max bytes for bitmap files */
+ struct buffer_head * s_sbh; /* Buffer containing the super block */
+ struct ext4_super_block *s_es; /* Pointer to the super block in the buffer */
+ struct buffer_head **s_group_desc;
+ unsigned int s_mount_opt;
+ unsigned int s_mount_opt2;
+ unsigned int s_mount_flags;
+ unsigned int s_def_mount_opt;
+ ext4_fsblk_t s_sb_block;
+ uid_t s_resuid;
+ gid_t s_resgid;
+ unsigned short s_mount_state;
+ unsigned short s_pad;
+ int s_addr_per_block_bits;
+ int s_desc_per_block_bits;
+ int s_inode_size;
+ int s_first_ino;
+ unsigned int s_inode_readahead_blks;
+ unsigned int s_inode_goal;
+ spinlock_t s_next_gen_lock;
+ u32 s_next_generation;
+ u32 s_hash_seed[4];
+ int s_def_hash_version;
+ int s_hash_unsigned; /* 3 if hash should be unsigned, 0 if not. NOTE(review): the comment used to say "signed", contradicting the field name; 3 is the gap between DX_HASH_x and DX_HASH_x_UNSIGNED -- confirm in super.c */
+ struct percpu_counter s_freeclusters_counter;
+ struct percpu_counter s_freeinodes_counter;
+ struct percpu_counter s_dirs_counter;
+ struct percpu_counter s_dirtyclusters_counter;
+ struct blockgroup_lock *s_blockgroup_lock;
+ struct proc_dir_entry *s_proc;
+ struct kobject s_kobj;
+ struct completion s_kobj_unregister;
+
+ /* Journaling */
+ struct journal_s *s_journal;
+ struct list_head s_orphan;
+ struct mutex s_orphan_lock;
+ unsigned long s_resize_flags; /* Flags indicating if there
+ is a resizer */
+ unsigned long s_commit_interval;
+ u32 s_max_batch_time;
+ u32 s_min_batch_time;
+ struct block_device *journal_bdev;
+#ifdef CONFIG_QUOTA
+ char *s_qf_names[MAXQUOTAS]; /* Names of quota files with journalled quota */
+ int s_jquota_fmt; /* Format of quota to use */
+#endif
+ unsigned int s_want_extra_isize; /* New inodes should reserve # bytes */
+ struct rb_root system_blks;
+
+#ifdef EXTENTS_STATS
+ /* ext4 extents stats */
+ unsigned long s_ext_min;
+ unsigned long s_ext_max;
+ unsigned long s_depth_max;
+ spinlock_t s_ext_stats_lock;
+ unsigned long s_ext_blocks;
+ unsigned long s_ext_extents;
+#endif
+
+ /* for buddy allocator */
+ struct ext4_group_info ***s_group_info;
+ struct inode *s_buddy_cache;
+ spinlock_t s_md_lock;
+ unsigned short *s_mb_offsets;
+ unsigned int *s_mb_maxs;
+
+ /* tunables */
+ unsigned long s_stripe;
+ unsigned int s_mb_stream_request;
+ unsigned int s_mb_max_to_scan;
+ unsigned int s_mb_min_to_scan;
+ unsigned int s_mb_stats;
+ unsigned int s_mb_order2_reqs;
+ unsigned int s_mb_group_prealloc;
+ unsigned int s_max_writeback_mb_bump;
+ /* where last allocation was done - for stream allocation */
+ unsigned long s_mb_last_group;
+ unsigned long s_mb_last_start;
+
+ /* stats for buddy allocator */
+ atomic_t s_bal_reqs; /* number of reqs with len > 1 */
+ atomic_t s_bal_success; /* we found long enough chunks */
+ atomic_t s_bal_allocated; /* in blocks */
+ atomic_t s_bal_ex_scanned; /* total extents scanned */
+ atomic_t s_bal_goals; /* goal hits */
+ atomic_t s_bal_breaks; /* too long searches */
+ atomic_t s_bal_2orders; /* 2^order hits */
+ spinlock_t s_bal_lock;
+ unsigned long s_mb_buddies_generated;
+ unsigned long long s_mb_generation_time;
+ atomic_t s_mb_lost_chunks;
+ atomic_t s_mb_preallocated;
+ atomic_t s_mb_discarded;
+ atomic_t s_lock_busy;
+
+ /* locality groups */
+ struct ext4_locality_group __percpu *s_locality_groups;
+
+ /* for write statistics */
+ unsigned long s_sectors_written_start;
+ u64 s_kbytes_written;
+
+ unsigned int s_log_groups_per_flex;
+ struct flex_groups *s_flex_groups;
+
+ /* workqueue for dio unwritten */
+ struct workqueue_struct *dio_unwritten_wq;
+
+ /* timer for periodic error stats printing */
+ struct timer_list s_err_report;
+
+ /* Lazy inode table initialization info */
+ struct ext4_li_request *s_li_request;
+ /* Wait multiplier for lazy initialization thread */
+ unsigned int s_li_wait_mult;
+
+ /* Kernel thread for multiple mount protection */
+ struct task_struct *s_mmp_tsk;
+
+ /* record the last minlen when FITRIM is called. */
+ atomic_t s_last_trim_minblks;
+};
+
+/* Return the ext4 private superblock info stored in @sb->s_fs_info. */
+static inline struct ext4_sb_info *EXT4_SB(struct super_block *sb)
+{
+	struct ext4_sb_info *sbi = sb->s_fs_info;
+
+	return sbi;
+}
+/*
+ * Map a VFS inode back to the ext4_inode_info that embeds it as its
+ * vfs_inode member.
+ */
+static inline struct ext4_inode_info *EXT4_I(struct inode *inode)
+{
+	struct ext4_inode_info *ei;
+
+	ei = container_of(inode, struct ext4_inode_info, vfs_inode);
+	return ei;
+}
+
+/*
+ * Current time for stamping @inode: current_fs_time() when the
+ * superblock's s_time_gran is below NSEC_PER_SEC, otherwise
+ * CURRENT_TIME_SEC.
+ */
+static inline struct timespec ext4_current_time(struct inode *inode)
+{
+	struct super_block *sb = inode->i_sb;
+
+	if (sb->s_time_gran < NSEC_PER_SEC)
+		return current_fs_time(sb);
+
+	return CURRENT_TIME_SEC;
+}
+
+/*
+ * Return non-zero when @ino is acceptable for @sb: either one of the
+ * root, journal or resize inodes, or a number in the range
+ * [EXT4_FIRST_INO(sb), s_inodes_count].
+ */
+static inline int ext4_valid_inum(struct super_block *sb, unsigned long ino)
+{
+	if (ino == EXT4_ROOT_INO)
+		return 1;
+	if (ino == EXT4_JOURNAL_INO)
+		return 1;
+	if (ino == EXT4_RESIZE_INO)
+		return 1;
+	if (ino < EXT4_FIRST_INO(sb))
+		return 0;
+
+	return ino <= le32_to_cpu(EXT4_SB(sb)->s_es->s_inodes_count);
+}
+
+/*
+ * Mark @io_end with EXT4_IO_END_UNWRITTEN. The inode's
+ * i_aiodio_unwritten counter is bumped only on the transition from
+ * unset to set, so an already-flagged io_end is left untouched.
+ */
+static inline void ext4_set_io_unwritten_flag(struct inode *inode,
+					      struct ext4_io_end *io_end)
+{
+	if (io_end->flag & EXT4_IO_END_UNWRITTEN)
+		return;
+
+	io_end->flag |= EXT4_IO_END_UNWRITTEN;
+	atomic_inc(&EXT4_I(inode)->i_aiodio_unwritten);
+}
+
+/*
+ * Inode dynamic state flags
+ *
+ * These are bit numbers (0, 1, 2, ...), not masks: they are the "bit"
+ * argument of the ext4_{test,set,clear}_inode_state() helpers that
+ * EXT4_INODE_BIT_FNS() generates.
+ */
+enum {
+ EXT4_STATE_JDATA, /* journaled data exists */
+ EXT4_STATE_NEW, /* inode is newly created */
+ EXT4_STATE_XATTR, /* has in-inode xattrs */
+ EXT4_STATE_NO_EXPAND, /* No space for expansion */
+ EXT4_STATE_DA_ALLOC_CLOSE, /* Alloc DA blks on close */
+ EXT4_STATE_EXT_MIGRATE, /* Inode is migrating */
+ EXT4_STATE_DIO_UNWRITTEN, /* need convert on dio done*/
+ EXT4_STATE_NEWENTRY, /* File just added to dir */
+ EXT4_STATE_DELALLOC_RESERVED, /* blks already reserved for delalloc */
+};
+
+#define EXT4_INODE_BIT_FNS(name, field, offset) \
+static inline int ext4_test_inode_##name(struct inode *inode, int bit) \
+{ \
+ return test_bit(bit + (offset), &EXT4_I(inode)->i_##field); \
+} \
+static inline void ext4_set_inode_##name(struct inode *inode, int bit) \
+{ \
+ set_bit(bit + (offset), &EXT4_I(inode)->i_##field); \
+} \
+static inline void ext4_clear_inode_##name(struct inode *inode, int bit) \
+{ \
+ clear_bit(bit + (offset), &EXT4_I(inode)->i_##field); \
+} /* generates ext4_{test,set,clear}_inode_<name>(), acting on bit (bit + offset) of EXT4_I(inode)->i_<field> */
+
+EXT4_INODE_BIT_FNS(flag, flags, 0) /* ext4_*_inode_flag(): bits of i_flags, no offset */
+#if (BITS_PER_LONG < 64)
+EXT4_INODE_BIT_FNS(state, state_flags, 0) /* narrow longs: state bits get their own word, i_state_flags */
+
+static inline void ext4_clear_state_flags(struct ext4_inode_info *ei)
+{
+ (ei)->i_state_flags = 0;
+}
+#else
+EXT4_INODE_BIT_FNS(state, flags, 32) /* 64-bit longs: state bits share i_flags, starting at bit 32 */
+
+static inline void ext4_clear_state_flags(struct ext4_inode_info *ei)
+{
+ /* We depend on the fact that callers will set i_flags */
+}
+#endif
+#else
+/* Assume that user mode programs are passing in an ext4fs superblock, not
+ * a kernel struct super_block. This will allow us to call the feature-test
+ * macros from user land.
+ *
+ * This is the !__KERNEL__ counterpart of the inline EXT4_SB() above and
+ * simply yields its argument unchanged. */
+#define EXT4_SB(sb) (sb)
+#endif
+
+#define NEXT_ORPHAN(inode) EXT4_I(inode)->i_dtime /* expands to the i_dtime member itself, so it is usable as an lvalue; NOTE(review): presumably i_dtime doubles as the orphan-list link -- confirm */
+
+/*
+ * Codes for operating systems
+ *
+ * NOTE(review): presumably the values of the superblock's s_creator_os
+ * ("OS") field -- confirm.
+ */
+#define EXT4_OS_LINUX 0
+#define EXT4_OS_HURD 1
+#define EXT4_OS_MASIX 2
+#define EXT4_OS_FREEBSD 3
+#define EXT4_OS_LITES 4
+
+/*
+ * Revision levels
+ *
+ * Note that EXT4_CURRENT_REV is the *old* revision (0), while the
+ * highest supported one, EXT4_MAX_SUPP_REV, is EXT4_DYNAMIC_REV (1).
+ */
+#define EXT4_GOOD_OLD_REV 0 /* The good old (original) format */
+#define EXT4_DYNAMIC_REV 1 /* V2 format w/ dynamic inode sizes */
+
+#define EXT4_CURRENT_REV EXT4_GOOD_OLD_REV
+#define EXT4_MAX_SUPP_REV EXT4_DYNAMIC_REV
+
+#define EXT4_GOOD_OLD_INODE_SIZE 128
+
+/*
+ * Feature set definitions
+ *
+ * Accessors for the three feature words of the on-disk superblock
+ * (s_feature_compat, s_feature_ro_compat, s_feature_incompat), reached
+ * through EXT4_SB(sb)->s_es.
+ *
+ * @sb:   super block pointer (anything EXT4_SB() accepts)
+ * @mask: one or more EXT4_FEATURE_* bits in CPU byte order; the macros
+ *        convert it with cpu_to_le32() before touching the
+ *        little-endian on-disk word
+ *
+ * EXT4_HAS_*_FEATURE() evaluates to 1 if any bit of @mask is set and 0
+ * otherwise. EXT4_SET_*() and EXT4_CLEAR_*() modify the word in place;
+ * their expansions are parenthesised so that each behaves as a single
+ * expression wherever it is expanded.
+ */
+
+#define EXT4_HAS_COMPAT_FEATURE(sb,mask) \
+ ((EXT4_SB(sb)->s_es->s_feature_compat & cpu_to_le32(mask)) != 0)
+#define EXT4_HAS_RO_COMPAT_FEATURE(sb,mask) \
+ ((EXT4_SB(sb)->s_es->s_feature_ro_compat & cpu_to_le32(mask)) != 0)
+#define EXT4_HAS_INCOMPAT_FEATURE(sb,mask) \
+ ((EXT4_SB(sb)->s_es->s_feature_incompat & cpu_to_le32(mask)) != 0)
+#define EXT4_SET_COMPAT_FEATURE(sb,mask) \
+ (EXT4_SB(sb)->s_es->s_feature_compat |= cpu_to_le32(mask))
+#define EXT4_SET_RO_COMPAT_FEATURE(sb,mask) \
+ (EXT4_SB(sb)->s_es->s_feature_ro_compat |= cpu_to_le32(mask))
+#define EXT4_SET_INCOMPAT_FEATURE(sb,mask) \
+ (EXT4_SB(sb)->s_es->s_feature_incompat |= cpu_to_le32(mask))
+#define EXT4_CLEAR_COMPAT_FEATURE(sb,mask) \
+ (EXT4_SB(sb)->s_es->s_feature_compat &= ~cpu_to_le32(mask))
+#define EXT4_CLEAR_RO_COMPAT_FEATURE(sb,mask) \
+ (EXT4_SB(sb)->s_es->s_feature_ro_compat &= ~cpu_to_le32(mask))
+#define EXT4_CLEAR_INCOMPAT_FEATURE(sb,mask) \
+ (EXT4_SB(sb)->s_es->s_feature_incompat &= ~cpu_to_le32(mask))
+
+#define EXT4_FEATURE_COMPAT_DIR_PREALLOC 0x0001 /* COMPAT_* bits: masks for EXT4_*_COMPAT_FEATURE() (s_feature_compat) */
+#define EXT4_FEATURE_COMPAT_IMAGIC_INODES 0x0002
+#define EXT4_FEATURE_COMPAT_HAS_JOURNAL 0x0004
+#define EXT4_FEATURE_COMPAT_EXT_ATTR 0x0008
+#define EXT4_FEATURE_COMPAT_RESIZE_INODE 0x0010
+#define EXT4_FEATURE_COMPAT_DIR_INDEX 0x0020
+
+#define EXT4_FEATURE_RO_COMPAT_SPARSE_SUPER 0x0001 /* RO_COMPAT_* bits: masks for EXT4_*_RO_COMPAT_FEATURE() (s_feature_ro_compat) */
+#define EXT4_FEATURE_RO_COMPAT_LARGE_FILE 0x0002
+#define EXT4_FEATURE_RO_COMPAT_BTREE_DIR 0x0004
+#define EXT4_FEATURE_RO_COMPAT_HUGE_FILE 0x0008
+#define EXT4_FEATURE_RO_COMPAT_GDT_CSUM 0x0010
+#define EXT4_FEATURE_RO_COMPAT_DIR_NLINK 0x0020
+#define EXT4_FEATURE_RO_COMPAT_EXTRA_ISIZE 0x0040
+#define EXT4_FEATURE_RO_COMPAT_QUOTA 0x0100 /* 0x0080 is not defined in this list */
+#define EXT4_FEATURE_RO_COMPAT_BIGALLOC 0x0200
+#define EXT4_FEATURE_RO_COMPAT_METADATA_CSUM 0x0400
+
+#define EXT4_FEATURE_INCOMPAT_COMPRESSION 0x0001 /* INCOMPAT_* bits: masks for EXT4_*_INCOMPAT_FEATURE() (s_feature_incompat) */
+#define EXT4_FEATURE_INCOMPAT_FILETYPE 0x0002
+#define EXT4_FEATURE_INCOMPAT_RECOVER 0x0004 /* Needs recovery */
+#define EXT4_FEATURE_INCOMPAT_JOURNAL_DEV 0x0008 /* Journal device */
+#define EXT4_FEATURE_INCOMPAT_META_BG 0x0010
+#define EXT4_FEATURE_INCOMPAT_EXTENTS 0x0040 /* extents support */
+#define EXT4_FEATURE_INCOMPAT_64BIT 0x0080
+#define EXT4_FEATURE_INCOMPAT_MMP 0x0100
+#define EXT4_FEATURE_INCOMPAT_FLEX_BG 0x0200
+#define EXT4_FEATURE_INCOMPAT_EA_INODE 0x0400 /* EA in inode */
+#define EXT4_FEATURE_INCOMPAT_DIRDATA 0x1000 /* data in dirent */
+#define EXT4_FEATURE_INCOMPAT_BG_USE_META_CSUM 0x2000 /* use crc32c for bg */
+#define EXT4_FEATURE_INCOMPAT_LARGEDIR 0x4000 /* >2GB or 3-lvl htree */
+#define EXT4_FEATURE_INCOMPAT_INLINEDATA 0x8000 /* data in inode */
+
+/*
+ * Supported feature masks, one COMPAT / INCOMPAT / RO_COMPAT triple for
+ * each of the EXT2, EXT3 and EXT4 personalities. Each mask is the OR of
+ * the EXT4_FEATURE_* bits defined in this header that the personality
+ * accepts.
+ *
+ * All three COMPAT masks are built from EXT4_FEATURE_COMPAT_EXT_ATTR so
+ * that this header does not rely on an EXT2_-prefixed name that it does
+ * not define itself.
+ */
+#define EXT2_FEATURE_COMPAT_SUPP EXT4_FEATURE_COMPAT_EXT_ATTR
+#define EXT2_FEATURE_INCOMPAT_SUPP (EXT4_FEATURE_INCOMPAT_FILETYPE| \
+ EXT4_FEATURE_INCOMPAT_META_BG)
+#define EXT2_FEATURE_RO_COMPAT_SUPP (EXT4_FEATURE_RO_COMPAT_SPARSE_SUPER| \
+ EXT4_FEATURE_RO_COMPAT_LARGE_FILE| \
+ EXT4_FEATURE_RO_COMPAT_BTREE_DIR)
+
+#define EXT3_FEATURE_COMPAT_SUPP EXT4_FEATURE_COMPAT_EXT_ATTR
+#define EXT3_FEATURE_INCOMPAT_SUPP (EXT4_FEATURE_INCOMPAT_FILETYPE| \
+ EXT4_FEATURE_INCOMPAT_RECOVER| \
+ EXT4_FEATURE_INCOMPAT_META_BG)
+#define EXT3_FEATURE_RO_COMPAT_SUPP (EXT4_FEATURE_RO_COMPAT_SPARSE_SUPER| \
+ EXT4_FEATURE_RO_COMPAT_LARGE_FILE| \
+ EXT4_FEATURE_RO_COMPAT_BTREE_DIR)
+
+#define EXT4_FEATURE_COMPAT_SUPP EXT4_FEATURE_COMPAT_EXT_ATTR
+#define EXT4_FEATURE_INCOMPAT_SUPP (EXT4_FEATURE_INCOMPAT_FILETYPE| \
+ EXT4_FEATURE_INCOMPAT_RECOVER| \
+ EXT4_FEATURE_INCOMPAT_META_BG| \
+ EXT4_FEATURE_INCOMPAT_EXTENTS| \
+ EXT4_FEATURE_INCOMPAT_64BIT| \
+ EXT4_FEATURE_INCOMPAT_FLEX_BG| \
+ EXT4_FEATURE_INCOMPAT_MMP)
+#define EXT4_FEATURE_RO_COMPAT_SUPP (EXT4_FEATURE_RO_COMPAT_SPARSE_SUPER| \
+ EXT4_FEATURE_RO_COMPAT_LARGE_FILE| \
+ EXT4_FEATURE_RO_COMPAT_GDT_CSUM| \
+ EXT4_FEATURE_RO_COMPAT_DIR_NLINK | \
+ EXT4_FEATURE_RO_COMPAT_EXTRA_ISIZE | \
+ EXT4_FEATURE_RO_COMPAT_BTREE_DIR |\
+ EXT4_FEATURE_RO_COMPAT_HUGE_FILE |\
+ EXT4_FEATURE_RO_COMPAT_BIGALLOC)
+
+/*
+ * Default values for user and/or group using reserved blocks
+ * (both default to id 0)
+ */
+#define EXT4_DEF_RESUID 0
+#define EXT4_DEF_RESGID 0
+
+#define EXT4_DEF_INODE_READAHEAD_BLKS 32 /* NOTE(review): presumably the default for s_inode_readahead_blks -- confirm */
+
+/*
+ * Default mount options
+ *
+ * Bit flags, except for the journalling mode, which is a two-bit field:
+ * EXT4_DEFM_JMODE (0x0060) is the mask and JMODE_DATA / JMODE_ORDERED /
+ * JMODE_WBACK are its values. JMODE_WBACK equals the mask, so the mode
+ * must be compared after masking rather than tested bit by bit.
+ * NOTE(review): presumably the encoding of the superblock's
+ * s_default_mount_opts field -- confirm.
+ */
+#define EXT4_DEFM_DEBUG 0x0001
+#define EXT4_DEFM_BSDGROUPS 0x0002
+#define EXT4_DEFM_XATTR_USER 0x0004
+#define EXT4_DEFM_ACL 0x0008
+#define EXT4_DEFM_UID16 0x0010
+#define EXT4_DEFM_JMODE 0x0060
+#define EXT4_DEFM_JMODE_DATA 0x0020
+#define EXT4_DEFM_JMODE_ORDERED 0x0040
+#define EXT4_DEFM_JMODE_WBACK 0x0060
+#define EXT4_DEFM_NOBARRIER 0x0100
+#define EXT4_DEFM_BLOCK_VALIDITY 0x0200
+#define EXT4_DEFM_DISCARD 0x0400
+#define EXT4_DEFM_NODELALLOC 0x0800
+
+/*
+ * Default journal batch times
+ *
+ * The values are in microseconds (15000 is annotated as 15ms).
+ */
+#define EXT4_DEF_MIN_BATCH_TIME 0
+#define EXT4_DEF_MAX_BATCH_TIME 15000 /* 15ms */
+
+/*
+ * Minimum number of groups in a flexgroup before we separate out
+ * directories into the first block group of a flexgroup
+ */
+#define EXT4_FLEX_SIZE_DIR_ALLOC_SCHEME 4
+
+/*
+ * Structure of a directory entry
+ *
+ * Both variants start with an 8-byte fixed header (inode, rec_len and
+ * the name length / type bytes) followed by the name; EXT4_NAME_LEN is
+ * the declared size of the name array.
+ */
+#define EXT4_NAME_LEN 255
+
+struct ext4_dir_entry {
+ __le32 inode; /* Inode number */
+ __le16 rec_len; /* Directory entry length */
+ __le16 name_len; /* Name length */
+ char name[EXT4_NAME_LEN]; /* File name */
+};
+
+/*
+ * The new version of the directory entry. Since EXT4 structures are
+ * stored in intel byte order, and the name_len field could never be
+ * bigger than 255 chars, it's safe to reclaim the extra byte for the
+ * file_type field.
+ *
+ * In other words, the 16-bit name_len of struct ext4_dir_entry is split
+ * here into an 8-bit name_len followed by an 8-bit file_type, so the
+ * two layouts have the same size and member offsets.
+ */
+struct ext4_dir_entry_2 {
+ __le32 inode; /* Inode number */
+ __le16 rec_len; /* Directory entry length */
+ __u8 name_len; /* Name length */
+ __u8 file_type; /* one of the EXT4_FT_* values */
+ char name[EXT4_NAME_LEN]; /* File name */
+};
+
+/*
+ * Ext4 directory file types. Only the low 3 bits are used. The
+ * other bits are reserved for now.
+ *
+ * EXT4_FT_MAX is one past the largest defined type value.
+ */
+#define EXT4_FT_UNKNOWN 0
+#define EXT4_FT_REG_FILE 1
+#define EXT4_FT_DIR 2
+#define EXT4_FT_CHRDEV 3
+#define EXT4_FT_BLKDEV 4
+#define EXT4_FT_FIFO 5
+#define EXT4_FT_SOCK 6
+#define EXT4_FT_SYMLINK 7
+
+#define EXT4_FT_MAX 8
+
+/*
+ * EXT4_DIR_PAD defines the directory entries boundaries
+ *
+ * NOTE: It must be a multiple of 4
+ *
+ * EXT4_DIR_REC_LEN(name_len) is name_len plus 8 (the size of the
+ * members that precede the name in struct ext4_dir_entry_2), rounded
+ * up to a multiple of EXT4_DIR_PAD. The rounding uses a bit mask, so
+ * EXT4_DIR_PAD must also be a power of two.
+ * EXT4_MAX_REC_LEN is 65535, the largest value a 16-bit rec_len holds.
+ */
+#define EXT4_DIR_PAD 4
+#define EXT4_DIR_ROUND (EXT4_DIR_PAD - 1)
+#define EXT4_DIR_REC_LEN(name_len) (((name_len) + 8 + EXT4_DIR_ROUND) & \
+ ~EXT4_DIR_ROUND)
+#define EXT4_MAX_REC_LEN ((1<<16)-1)
+
+/*
+ * If we ever get support for fs block sizes > page_size, we'll need
+ * to remove the #if statements in the next two functions...
+ */
+/*
+ * Decode an on-disk rec_len.
+ *
+ * With page sizes below 64KiB the stored value is the length itself.
+ * Otherwise 0 and EXT4_MAX_REC_LEN both stand for "the whole block"
+ * (@blocksize), and for any other value the two low bits of the stored
+ * word carry bits 16-17 of the real length (lengths are multiples of 4,
+ * so those low bits are free).
+ */
+static inline unsigned int
+ext4_rec_len_from_disk(__le16 dlen, unsigned blocksize)
+{
+	unsigned rec_len = le16_to_cpu(dlen);
+
+#if (PAGE_CACHE_SIZE >= 65536)
+	if (rec_len == 0 || rec_len == EXT4_MAX_REC_LEN)
+		rec_len = blocksize;
+	else
+		rec_len = (rec_len & 65532) | ((rec_len & 3) << 16);
+#endif
+	return rec_len;
+}
+
+/*
+ * Encode a record length for storage in the 16-bit on-disk rec_len.
+ *
+ * BUG()s if @len exceeds @blocksize, is not a multiple of 4, or if
+ * @blocksize is larger than 256KiB. With page sizes below 64KiB the
+ * length is stored as is. Otherwise lengths of 64KiB and above are
+ * folded: a length equal to the block size becomes EXT4_MAX_REC_LEN for
+ * 64KiB blocks and 0 for larger ones, and any other length keeps its
+ * low 16 bits with bits 16-17 moved into the two (always zero) low bits.
+ */
+static inline __le16 ext4_rec_len_to_disk(unsigned len, unsigned blocksize)
+{
+	unsigned disk_len = len;
+
+	if ((len & 3) || (len > blocksize) || (blocksize > (1 << 18)))
+		BUG();
+#if (PAGE_CACHE_SIZE >= 65536)
+	if (len >= 65536) {
+		if (len != blocksize)
+			disk_len = (len & 65532) | ((len >> 16) & 3);
+		else if (blocksize == 65536)
+			disk_len = EXT4_MAX_REC_LEN;
+		else
+			disk_len = 0;
+	}
+#endif
+	return cpu_to_le16(disk_len);
+}
+
+/*
+ * Hash Tree Directory indexing
+ * (c) Daniel Phillips, 2001
+ *
+ * is_dx(dir): true when the filesystem has the DIR_INDEX compat
+ * feature and @dir has its EXT4_INODE_INDEX inode flag set.
+ * EXT4_DIR_LINK_MAX(dir): true when @dir is not indexed and its link
+ * count has reached EXT4_LINK_MAX.
+ * EXT4_DIR_LINK_EMPTY(dir): true when @dir's link count is 1 or 2.
+ *
+ * @dir is a struct inode pointer. All three macros evaluate it more
+ * than once, so do not pass an expression with side effects.
+ */
+
+#define is_dx(dir) (EXT4_HAS_COMPAT_FEATURE((dir)->i_sb, \
+ EXT4_FEATURE_COMPAT_DIR_INDEX) && \
+ ext4_test_inode_flag((dir), EXT4_INODE_INDEX))
+#define EXT4_DIR_LINK_MAX(dir) (!is_dx(dir) && (dir)->i_nlink >= EXT4_LINK_MAX)
+#define EXT4_DIR_LINK_EMPTY(dir) ((dir)->i_nlink == 2 || (dir)->i_nlink == 1)
+
+/* Legal values for the dx_root hash_version field: each *_UNSIGNED value is its signed counterpart plus 3 */
+
+#define DX_HASH_LEGACY 0
+#define DX_HASH_HALF_MD4 1
+#define DX_HASH_TEA 2
+#define DX_HASH_LEGACY_UNSIGNED 3
+#define DX_HASH_HALF_MD4_UNSIGNED 4
+#define DX_HASH_TEA_UNSIGNED 5
+
+#ifdef __KERNEL__
+
+/* hash info structure used by the directory hash (passed to ext4fs_dirhash()) */
+struct dx_hash_info
+{
+ u32 hash;
+ u32 minor_hash;
+ int hash_version; /* NOTE(review): presumably one of the DX_HASH_* values -- confirm */
+ u32 *seed; /* NOTE(review): presumably points at a 4-word seed such as s_hash_seed -- confirm */
+};
+
+
+/* 32 and 64 bit signed EOF for dx directories: the largest positive signed 32-bit and 64-bit values (2^31 - 1 and 2^63 - 1) */
+#define EXT4_HTREE_EOF_32BIT ((1UL << (32 - 1)) - 1)
+#define EXT4_HTREE_EOF_64BIT ((1ULL << (64 - 1)) - 1)
+
+
+/*
+ * Control parameters used by ext4_htree_next_block
+ */
+#define HASH_NB_ALWAYS 1
+
+
+/*
+ * Describe an inode's exact location on disk and in memory
+ *
+ * The raw inode is found at bh->b_data + offset; see ext4_raw_inode().
+ */
+struct ext4_iloc
+{
+ struct buffer_head *bh;
+ unsigned long offset; /* byte offset of the inode within bh->b_data */
+ ext4_group_t block_group;
+};
+
+/*
+ * Return the raw inode that @iloc describes, i.e. the data of
+ * @iloc->bh at byte offset @iloc->offset.
+ */
+static inline struct ext4_inode *ext4_raw_inode(struct ext4_iloc *iloc)
+{
+	char *raw = iloc->bh->b_data + iloc->offset;
+
+	return (struct ext4_inode *) raw;
+}
+
+/*
+ * This structure is stuffed into the struct file's private_data field
+ * for directories. It is where we put information so that we can do
+ * readdir operations in hash tree order.
+ *
+ * The rb_root / rb_node members show that the cached entries are kept
+ * in a red-black tree; it is released with ext4_htree_free_dir_info().
+ */
+struct dir_private_info {
+ struct rb_root root;
+ struct rb_node *curr_node;
+ struct fname *extra_fname;
+ loff_t last_pos;
+ __u32 curr_hash;
+ __u32 curr_minor_hash;
+ __u32 next_hash;
+};
+
+/*
+ * Return the number of the first block of group @group_no:
+ * group_no * blocks-per-group + s_first_data_block, with the
+ * multiplication carried out in ext4_fsblk_t.
+ */
+static inline ext4_fsblk_t
+ext4_group_first_block_no(struct super_block *sb, ext4_group_t group_no)
+{
+	ext4_fsblk_t blocks_per_group = (ext4_fsblk_t)EXT4_BLOCKS_PER_GROUP(sb);
+	ext4_fsblk_t first_block = group_no * blocks_per_group;
+
+	first_block += le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block);
+	return first_block;
+}
+
+/*
+ * Special error return code only used by dx_probe() and its callers.
+ *
+ * The negative constant is parenthesised so that it stays a single
+ * operand in whatever expression the macro is expanded into.
+ */
+#define ERR_BAD_DX_DIR (-75000)
+
+void ext4_get_group_no_and_offset(struct super_block *sb, ext4_fsblk_t blocknr,
+ ext4_group_t *blockgrpp, ext4_grpblk_t *offsetp);
+
+/*
+ * Timeout and state flag for lazy initialization inode thread.
+ *
+ * EXT4_LAZYINIT_QUIT and EXT4_LAZYINIT_RUNNING are bit flags.
+ * NOTE(review): presumably kept in ext4_lazy_init.li_state, with
+ * EXT4_DEF_LI_WAIT_MULT the default for s_li_wait_mult -- confirm.
+ */
+#define EXT4_DEF_LI_WAIT_MULT 10
+#define EXT4_DEF_LI_MAX_START_DELAY 5
+#define EXT4_LAZYINIT_QUIT 0x0001
+#define EXT4_LAZYINIT_RUNNING 0x0002
+
+/*
+ * Lazy inode table initialization info
+ *
+ * struct ext4_lazy_init carries a state word and a request list with
+ * its mutex; struct ext4_li_request is one request, linked through
+ * lr_request and pointing back at its super block and ext4_sb_info
+ * (which in turn refers to it via s_li_request).
+ */
+struct ext4_lazy_init {
+ unsigned long li_state;
+ struct list_head li_request_list;
+ struct mutex li_list_mtx; /* NOTE(review): presumably protects li_request_list -- confirm */
+};
+
+struct ext4_li_request {
+ struct super_block *lr_super;
+ struct ext4_sb_info *lr_sbi;
+ ext4_group_t lr_next_group;
+ struct list_head lr_request;
+ unsigned long lr_next_sched;
+ unsigned long lr_timeout;
+};
+
+struct ext4_features {
+ struct kobject f_kobj;
+ struct completion f_kobj_unregister; /* same kobject + completion pairing as s_kobj / s_kobj_unregister in ext4_sb_info */
+};
+
+/*
+ * This structure will be used for multiple mount protection. It will be
+ * written into the block number saved in the s_mmp_block field in the
+ * superblock. Programs that check MMP should assume that if
+ * SEQ_FSCK (or any unknown code above SEQ_MAX) is present then it is NOT safe
+ * to use the filesystem, regardless of how old the timestamp is.
+ *
+ * The members of struct mmp_struct add up to exactly 1024 bytes
+ * (4 + 4 + 8 + 64 + 32 + 2 + 2 + 227 * 4); the mmp_pad* members exist
+ * only to reach that size.
+ */
+#define EXT4_MMP_MAGIC 0x004D4D50U /* ASCII for MMP */
+#define EXT4_MMP_SEQ_CLEAN 0xFF4D4D50U /* mmp_seq value for clean unmount */
+#define EXT4_MMP_SEQ_FSCK 0xE24D4D50U /* mmp_seq value when being fscked */
+#define EXT4_MMP_SEQ_MAX 0xE24D4D4FU /* maximum valid mmp_seq value */
+
+struct mmp_struct {
+ __le32 mmp_magic; /* Magic number for MMP */
+ __le32 mmp_seq; /* Sequence no. updated periodically */
+
+ /*
+ * mmp_time, mmp_nodename & mmp_bdevname are only used for information
+ * purposes and do not affect the correctness of the algorithm
+ */
+ __le64 mmp_time; /* Time last updated */
+ char mmp_nodename[64]; /* Node which last updated MMP block */
+ char mmp_bdevname[32]; /* Bdev which last updated MMP block */
+
+ /*
+ * mmp_check_interval is used to verify if the MMP block has been
+ * updated on the block device. The value is updated based on the
+ * maximum time to write the MMP block during an update cycle.
+ */
+ __le16 mmp_check_interval;
+
+ __le16 mmp_pad1;
+ __le32 mmp_pad2[227];
+};
+
+/* arguments passed to the mmp thread (the task tracked by s_mmp_tsk in ext4_sb_info) */
+struct mmpd_data {
+ struct buffer_head *bh; /* bh from initial read_mmp_block() */
+ struct super_block *sb; /* super block of the fs */
+};
+
+/*
+ * Check interval multiplier
+ * The MMP block is written every update interval and initially checked every
+ * update interval x the multiplier (the value is then adapted based on the
+ * write latency). The reason is that writes can be delayed under load and we
+ * don't want readers to incorrectly assume that the filesystem is no longer
+ * in use.
+ */
+#define EXT4_MMP_CHECK_MULT 2UL
+
+/*
+ * Minimum interval for MMP checking in seconds.
+ */
+#define EXT4_MMP_MIN_CHECK_INTERVAL 5UL
+
+/*
+ * Maximum interval for MMP checking in seconds.
+ * (300 seconds, i.e. five minutes)
+ */
+#define EXT4_MMP_MAX_CHECK_INTERVAL 300UL
+
+/*
+ * Function prototypes
+ */
+
+/*
+ * Ok, these declarations are also in <linux/kernel.h> but none of the
+ * ext4 source programs needs to include it so they are duplicated here.
+ *
+ * NORET_TYPE expands to nothing; ATTRIB_NORET is the complete GCC
+ * noreturn attribute; NORET_AND is "noreturn," for use as the first
+ * entry of a longer attribute list.
+ */
+# define NORET_TYPE /**/
+# define ATTRIB_NORET __attribute__((noreturn))
+# define NORET_AND noreturn,
+
+/* bitmap.c */
+extern unsigned int ext4_count_free(struct buffer_head *, unsigned);
+
+/* balloc.c */
+extern unsigned int ext4_block_group(struct super_block *sb,
+ ext4_fsblk_t blocknr);
+extern ext4_grpblk_t ext4_block_group_offset(struct super_block *sb,
+ ext4_fsblk_t blocknr);
+extern int ext4_bg_has_super(struct super_block *sb, ext4_group_t group);
+extern unsigned long ext4_bg_num_gdb(struct super_block *sb,
+ ext4_group_t group);
+extern ext4_fsblk_t ext4_new_meta_blocks(handle_t *handle, struct inode *inode,
+ ext4_fsblk_t goal,
+ unsigned int flags,
+ unsigned long *count,
+ int *errp);
+extern int ext4_claim_free_clusters(struct ext4_sb_info *sbi,
+ s64 nclusters, unsigned int flags);
+extern ext4_fsblk_t ext4_count_free_clusters(struct super_block *);
+extern void ext4_check_blocks_bitmap(struct super_block *);
+extern struct ext4_group_desc * ext4_get_group_desc(struct super_block * sb,
+ ext4_group_t block_group,
+ struct buffer_head ** bh);
+extern int ext4_should_retry_alloc(struct super_block *sb, int *retries);
+
+extern struct buffer_head *ext4_read_block_bitmap_nowait(struct super_block *sb,
+ ext4_group_t block_group);
+extern int ext4_wait_block_bitmap(struct super_block *sb,
+ ext4_group_t block_group,
+ struct buffer_head *bh);
+extern struct buffer_head *ext4_read_block_bitmap(struct super_block *sb,
+ ext4_group_t block_group);
+extern void ext4_init_block_bitmap(struct super_block *sb,
+ struct buffer_head *bh,
+ ext4_group_t group,
+ struct ext4_group_desc *desc);
+extern unsigned ext4_free_clusters_after_init(struct super_block *sb,
+ ext4_group_t block_group,
+ struct ext4_group_desc *gdp);
+extern unsigned ext4_num_overhead_clusters(struct super_block *sb,
+ ext4_group_t block_group,
+ struct ext4_group_desc *gdp);
+ext4_fsblk_t ext4_inode_to_goal_block(struct inode *);
+
+/* dir.c */
+extern int __ext4_check_dir_entry(const char *, unsigned int, struct inode *,
+ struct file *,
+ struct ext4_dir_entry_2 *,
+ struct buffer_head *, unsigned int);
+#define ext4_check_dir_entry(dir, filp, de, bh, offset) \
+ unlikely(__ext4_check_dir_entry(__func__, __LINE__, (dir), (filp), \
+ (de), (bh), (offset)))
+extern int ext4_htree_store_dirent(struct file *dir_file, __u32 hash,
+ __u32 minor_hash,
+ struct ext4_dir_entry_2 *dirent);
+extern void ext4_htree_free_dir_info(struct dir_private_info *p);
+
+/* fsync.c */
+extern int ext4_sync_file(struct file *, loff_t, loff_t, int);
+extern int ext4_flush_completed_IO(struct inode *);
+
+/* hash.c */
+extern int ext4fs_dirhash(const char *name, int len, struct
+ dx_hash_info *hinfo);
+
+/* ialloc.c */
+extern struct inode *ext4_new_inode(handle_t *, struct inode *, umode_t,
+ const struct qstr *qstr, __u32 goal,
+ uid_t *owner);
+extern void ext4_free_inode(handle_t *, struct inode *);
+extern struct inode * ext4_orphan_get(struct super_block *, unsigned long);
+extern unsigned long ext4_count_free_inodes(struct super_block *);
+extern unsigned long ext4_count_dirs(struct super_block *);
+extern void ext4_check_inodes_bitmap(struct super_block *);
+extern void ext4_mark_bitmap_end(int start_bit, int end_bit, char *bitmap);
+extern int ext4_init_inode_table(struct super_block *sb,
+ ext4_group_t group, int barrier);
+extern void ext4_end_bitmap_read(struct buffer_head *bh, int uptodate);
+
+/* mballoc.c */
+extern long ext4_mb_stats;
+extern long ext4_mb_max_to_scan;
+extern int ext4_mb_init(struct super_block *, int);
+extern int ext4_mb_release(struct super_block *);
+extern ext4_fsblk_t ext4_mb_new_blocks(handle_t *,
+ struct ext4_allocation_request *, int *);
+extern int ext4_mb_reserve_blocks(struct super_block *, int);
+extern void ext4_discard_preallocations(struct inode *);
+extern int __init ext4_init_mballoc(void);
+extern void ext4_exit_mballoc(void);
+extern void ext4_free_blocks(handle_t *handle, struct inode *inode,
+ struct buffer_head *bh, ext4_fsblk_t block,
+ unsigned long count, int flags);
+extern int ext4_mb_add_groupinfo(struct super_block *sb,
+ ext4_group_t i, struct ext4_group_desc *desc);
+extern int ext4_group_add_blocks(handle_t *handle, struct super_block *sb,
+ ext4_fsblk_t block, unsigned long count);
+extern int ext4_trim_fs(struct super_block *, struct fstrim_range *);
+
+/* inode.c */
+struct buffer_head *ext4_getblk(handle_t *, struct inode *,
+ ext4_lblk_t, int, int *);
+struct buffer_head *ext4_bread(handle_t *, struct inode *,
+ ext4_lblk_t, int, int *);
+int ext4_get_block(struct inode *inode, sector_t iblock,
+ struct buffer_head *bh_result, int create);
+
+extern struct inode *ext4_iget(struct super_block *, unsigned long);
+extern int ext4_write_inode(struct inode *, struct writeback_control *);
+extern int ext4_setattr(struct dentry *, struct iattr *);
+extern int ext4_getattr(struct vfsmount *mnt, struct dentry *dentry,
+ struct kstat *stat);
+extern void ext4_evict_inode(struct inode *);
+extern void ext4_clear_inode(struct inode *);
+extern int ext4_sync_inode(handle_t *, struct inode *);
+extern void ext4_dirty_inode(struct inode *, int);
+extern int ext4_change_inode_journal_flag(struct inode *, int);
+extern int ext4_get_inode_loc(struct inode *, struct ext4_iloc *);
+extern int ext4_can_truncate(struct inode *inode);
+extern void ext4_truncate(struct inode *);
+extern int ext4_punch_hole(struct file *file, loff_t offset, loff_t length);
+extern int ext4_truncate_restart_trans(handle_t *, struct inode *, int nblocks);
+extern void ext4_set_inode_flags(struct inode *);
+extern void ext4_get_inode_flags(struct ext4_inode_info *);
+extern int ext4_alloc_da_blocks(struct inode *inode);
+extern void ext4_set_aops(struct inode *inode);
+extern int ext4_writepage_trans_blocks(struct inode *);
+extern int ext4_chunk_trans_blocks(struct inode *, int nrblocks);
+extern int ext4_discard_partial_page_buffers(handle_t *handle,
+ struct address_space *mapping, loff_t from,
+ loff_t length, int flags);
+extern int ext4_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf);
+extern qsize_t *ext4_get_reserved_space(struct inode *inode);
+extern void ext4_da_update_reserve_space(struct inode *inode,
+ int used, int quota_claim);
+
+/* indirect.c */
+extern int ext4_ind_map_blocks(handle_t *handle, struct inode *inode,
+ struct ext4_map_blocks *map, int flags);
+extern ssize_t ext4_ind_direct_IO(int rw, struct kiocb *iocb,
+ const struct iovec *iov, loff_t offset,
+ unsigned long nr_segs);
+extern int ext4_ind_calc_metadata_amount(struct inode *inode, sector_t lblock);
+extern int ext4_ind_trans_blocks(struct inode *inode, int nrblocks, int chunk);
+extern void ext4_ind_truncate(struct inode *inode);
+
+/* ioctl.c */
+extern long ext4_ioctl(struct file *, unsigned int, unsigned long);
+extern long ext4_compat_ioctl(struct file *, unsigned int, unsigned long);
+
+/* migrate.c */
+extern int ext4_ext_migrate(struct inode *);
+
+/* namei.c */
+extern int ext4_orphan_add(handle_t *, struct inode *);
+extern int ext4_orphan_del(handle_t *, struct inode *);
+extern int ext4_htree_fill_tree(struct file *dir_file, __u32 start_hash,
+ __u32 start_minor_hash, __u32 *next_hash);
+
+/* resize.c */
+extern int ext4_group_add(struct super_block *sb,
+ struct ext4_new_group_data *input);
+extern int ext4_group_extend(struct super_block *sb,
+ struct ext4_super_block *es,
+ ext4_fsblk_t n_blocks_count);
+extern int ext4_resize_fs(struct super_block *sb, ext4_fsblk_t n_blocks_count);
+
+/* super.c */
+extern void *ext4_kvmalloc(size_t size, gfp_t flags);
+extern void *ext4_kvzalloc(size_t size, gfp_t flags);
+extern void ext4_kvfree(void *ptr);
+extern __printf(4, 5)
+void __ext4_error(struct super_block *, const char *, unsigned int,
+ const char *, ...);
+#define ext4_error(sb, message...) __ext4_error(sb, __func__, \
+ __LINE__, ## message)
+extern __printf(5, 6)
+void ext4_error_inode(struct inode *, const char *, unsigned int, ext4_fsblk_t,
+ const char *, ...);
+extern __printf(5, 6)
+void ext4_error_file(struct file *, const char *, unsigned int, ext4_fsblk_t,
+ const char *, ...);
+extern void __ext4_std_error(struct super_block *, const char *,
+ unsigned int, int);
+extern __printf(4, 5)
+void __ext4_abort(struct super_block *, const char *, unsigned int,
+ const char *, ...);
+#define ext4_abort(sb, message...) __ext4_abort(sb, __func__, \
+ __LINE__, ## message)
+extern __printf(4, 5)
+void __ext4_warning(struct super_block *, const char *, unsigned int,
+ const char *, ...);
+#define ext4_warning(sb, message...) __ext4_warning(sb, __func__, \
+ __LINE__, ## message)
+extern __printf(3, 4)
+void ext4_msg(struct super_block *, const char *, const char *, ...);
+extern void __dump_mmp_msg(struct super_block *, struct mmp_struct *mmp,
+ const char *, unsigned int, const char *);
+#define dump_mmp_msg(sb, mmp, msg) __dump_mmp_msg(sb, mmp, __func__, \
+ __LINE__, msg)
+extern __printf(7, 8)
+void __ext4_grp_locked_error(const char *, unsigned int,
+ struct super_block *, ext4_group_t,
+ unsigned long, ext4_fsblk_t,
+ const char *, ...);
+#define ext4_grp_locked_error(sb, grp, message...) \
+ __ext4_grp_locked_error(__func__, __LINE__, (sb), (grp), ## message)
+extern void ext4_update_dynamic_rev(struct super_block *sb);
+extern int ext4_update_compat_feature(handle_t *handle, struct super_block *sb,
+ __u32 compat);
+extern int ext4_update_rocompat_feature(handle_t *handle,
+ struct super_block *sb, __u32 rocompat);
+extern int ext4_update_incompat_feature(handle_t *handle,
+ struct super_block *sb, __u32 incompat);
+extern ext4_fsblk_t ext4_block_bitmap(struct super_block *sb,
+ struct ext4_group_desc *bg);
+extern ext4_fsblk_t ext4_inode_bitmap(struct super_block *sb,
+ struct ext4_group_desc *bg);
+extern ext4_fsblk_t ext4_inode_table(struct super_block *sb,
+ struct ext4_group_desc *bg);
+extern __u32 ext4_free_group_clusters(struct super_block *sb,
+ struct ext4_group_desc *bg);
+extern __u32 ext4_free_inodes_count(struct super_block *sb,
+ struct ext4_group_desc *bg);
+extern __u32 ext4_used_dirs_count(struct super_block *sb,
+ struct ext4_group_desc *bg);
+extern __u32 ext4_itable_unused_count(struct super_block *sb,
+ struct ext4_group_desc *bg);
+extern void ext4_block_bitmap_set(struct super_block *sb,
+ struct ext4_group_desc *bg, ext4_fsblk_t blk);
+extern void ext4_inode_bitmap_set(struct super_block *sb,
+ struct ext4_group_desc *bg, ext4_fsblk_t blk);
+extern void ext4_inode_table_set(struct super_block *sb,
+ struct ext4_group_desc *bg, ext4_fsblk_t blk);
+extern void ext4_free_group_clusters_set(struct super_block *sb,
+ struct ext4_group_desc *bg,
+ __u32 count);
+extern void ext4_free_inodes_set(struct super_block *sb,
+ struct ext4_group_desc *bg, __u32 count);
+extern void ext4_used_dirs_set(struct super_block *sb,
+ struct ext4_group_desc *bg, __u32 count);
+extern void ext4_itable_unused_set(struct super_block *sb,
+ struct ext4_group_desc *bg, __u32 count);
+extern __le16 ext4_group_desc_csum(struct ext4_sb_info *sbi, __u32 group,
+ struct ext4_group_desc *gdp);
+extern int ext4_group_desc_csum_verify(struct ext4_sb_info *sbi, __u32 group,
+ struct ext4_group_desc *gdp);
+
+/*
+ * Assemble the 64-bit s_blocks_count value from the two little-endian
+ * 32-bit halves (hi/lo) stored in the superblock structure.
+ */
+static inline ext4_fsblk_t ext4_blocks_count(struct ext4_super_block *es)
+{
+	ext4_fsblk_t hi = le32_to_cpu(es->s_blocks_count_hi);
+	ext4_fsblk_t lo = le32_to_cpu(es->s_blocks_count_lo);
+
+	return (hi << 32) | lo;
+}
+
+/*
+ * Assemble the 64-bit s_r_blocks_count value (presumably the reserved
+ * block count) from its little-endian hi/lo superblock halves.
+ */
+static inline ext4_fsblk_t ext4_r_blocks_count(struct ext4_super_block *es)
+{
+	ext4_fsblk_t hi = le32_to_cpu(es->s_r_blocks_count_hi);
+	ext4_fsblk_t lo = le32_to_cpu(es->s_r_blocks_count_lo);
+
+	return (hi << 32) | lo;
+}
+
+/*
+ * Assemble the 64-bit s_free_blocks_count value from its little-endian
+ * hi/lo superblock halves.
+ */
+static inline ext4_fsblk_t ext4_free_blocks_count(struct ext4_super_block *es)
+{
+	ext4_fsblk_t hi = le32_to_cpu(es->s_free_blocks_count_hi);
+	ext4_fsblk_t lo = le32_to_cpu(es->s_free_blocks_count_lo);
+
+	return (hi << 32) | lo;
+}
+
+/*
+ * Store a 64-bit block count into the superblock as two little-endian
+ * 32-bit halves (s_blocks_count_lo / s_blocks_count_hi).
+ */
+static inline void ext4_blocks_count_set(struct ext4_super_block *es,
+					 ext4_fsblk_t blk)
+{
+	u32 lo = (u32)blk;
+	u32 hi = (u32)(blk >> 32);
+
+	es->s_blocks_count_lo = cpu_to_le32(lo);
+	es->s_blocks_count_hi = cpu_to_le32(hi);
+}
+
+/*
+ * Store a 64-bit free-block count into the superblock as two
+ * little-endian 32-bit halves (s_free_blocks_count_lo / _hi).
+ */
+static inline void ext4_free_blocks_count_set(struct ext4_super_block *es,
+					      ext4_fsblk_t blk)
+{
+	u32 lo = (u32)blk;
+	u32 hi = (u32)(blk >> 32);
+
+	es->s_free_blocks_count_lo = cpu_to_le32(lo);
+	es->s_free_blocks_count_hi = cpu_to_le32(hi);
+}
+
+/*
+ * Store a 64-bit value into the superblock's s_r_blocks_count field pair
+ * as two little-endian 32-bit halves.
+ */
+static inline void ext4_r_blocks_count_set(struct ext4_super_block *es,
+					   ext4_fsblk_t blk)
+{
+	u32 lo = (u32)blk;
+	u32 hi = (u32)(blk >> 32);
+
+	es->s_r_blocks_count_lo = cpu_to_le32(lo);
+	es->s_r_blocks_count_hi = cpu_to_le32(hi);
+}
+
+/*
+ * Return the size recorded in a raw inode.  The high 32 bits
+ * (i_size_high) are only taken into account when i_mode says the inode
+ * is a regular file; for every other mode only i_size_lo is used.
+ */
+static inline loff_t ext4_isize(struct ext4_inode *raw_inode)
+{
+	loff_t size = (loff_t)le32_to_cpu(raw_inode->i_size_lo);
+
+	if (S_ISREG(le16_to_cpu(raw_inode->i_mode)))
+		size |= (loff_t)le32_to_cpu(raw_inode->i_size_high) << 32;
+
+	return size;
+}
+
+/*
+ * Store i_size into a raw inode as two little-endian 32-bit halves.
+ * Both halves are written unconditionally, whatever the inode mode is.
+ */
+static inline void ext4_isize_set(struct ext4_inode *raw_inode, loff_t i_size)
+{
+	u32 lo = (u32)i_size;
+	u32 hi = (u32)(i_size >> 32);
+
+	raw_inode->i_size_lo = cpu_to_le32(lo);
+	raw_inode->i_size_high = cpu_to_le32(hi);
+}
+
+/*
+ * Look up the in-memory ext4_group_info for a block group.
+ *
+ * s_group_info is a two-level table: the group number is split into an
+ * outer index (group / descriptors-per-block) and an inner index
+ * (group % descriptors-per-block).  No range check is done on 'group'.
+ */
+static inline
+struct ext4_group_info *ext4_get_group_info(struct super_block *sb,
+					    ext4_group_t group)
+{
+	struct ext4_group_info ***table = EXT4_SB(sb)->s_group_info;
+	long outer = group >> (EXT4_DESC_PER_BLOCK_BITS(sb));
+	long inner = group & ((EXT4_DESC_PER_BLOCK(sb)) - 1);
+
+	return table[outer][inner];
+}
+
+/*
+ * Reading s_groups_count requires using smp_rmb() afterwards. See
+ * the locking protocol documented in the comments of ext4_group_add()
+ * in resize.c
+ *
+ * This helper packages that rule: it snapshots s_groups_count into a
+ * local, issues the read barrier, and only then returns the snapshot.
+ * The order of those steps is the whole point; do not rearrange them.
+ */
+static inline ext4_group_t ext4_get_groups_count(struct super_block *sb)
+{
+ ext4_group_t ngroups = EXT4_SB(sb)->s_groups_count;
+
+ smp_rmb(); /* must follow the s_groups_count load above */
+ return ngroups;
+}
+
+/*
+ * Map a block group number to the number of the flex group containing
+ * it, i.e. block_group / 2^s_log_groups_per_flex.
+ */
+static inline ext4_group_t ext4_flex_group(struct ext4_sb_info *sbi,
+					   ext4_group_t block_group)
+{
+	ext4_group_t flex = block_group >> sbi->s_log_groups_per_flex;
+
+	return flex;
+}
+
+/*
+ * Number of block groups in one flex group: 2^s_log_groups_per_flex.
+ *
+ * The shift is performed on an unsigned constant so that a log value of
+ * 31 does not shift a 1 into the sign bit of a signed int, which is
+ * undefined behaviour in C.  The return type is unsigned int, so all
+ * previously well-defined results are unchanged.
+ */
+static inline unsigned int ext4_flex_bg_size(struct ext4_sb_info *sbi)
+{
+	return 1U << sbi->s_log_groups_per_flex;
+}
+
+#define ext4_std_error(sb, errno) \
+do { /* reports via __ext4_std_error() only when errno is non-zero; note 'errno' is expanded twice, so pass a side-effect-free expression */ \
+ if ((errno)) \
+ __ext4_std_error((sb), __func__, __LINE__, (errno)); \
+} while (0)
+
+#ifdef CONFIG_SMP
+/* Each CPU can accumulate percpu_counter_batch clusters in their local
+ * counters. So we need to make sure we have free clusters more
+ * than percpu_counter_batch * nr_cpu_ids. Also add a window of 4 times.
+ */
+#define EXT4_FREECLUSTERS_WATERMARK (4 * (percpu_counter_batch * nr_cpu_ids))
+#else
+#define EXT4_FREECLUSTERS_WATERMARK 0
+#endif
+
+/*
+ * Raise the inode's i_disksize to 'newsize' if newsize is larger; the
+ * value is never lowered here.  The compare-and-store is done with
+ * i_data_sem held for writing.
+ */
+static inline void ext4_update_i_disksize(struct inode *inode, loff_t newsize)
+{
+	struct ext4_inode_info *ei = EXT4_I(inode);
+
+	/*
+	 * XXX: replace with spinlock if seen contended -bzzz
+	 */
+	down_write(&ei->i_data_sem);
+	if (ei->i_disksize < newsize)
+		ei->i_disksize = newsize;
+	up_write(&ei->i_data_sem);
+}
+
+struct ext4_group_info {
+ unsigned long bb_state; /* bit flags, see EXT4_GROUP_INFO_*_BIT */
+ struct rb_root bb_free_root; /* rb-tree root; presumably tracks freed extents -- confirm in mballoc.c */
+ ext4_grpblk_t bb_first_free; /* first free block */
+ ext4_grpblk_t bb_free; /* total free blocks */
+ ext4_grpblk_t bb_fragments; /* nr of freespace fragments */
+ ext4_grpblk_t bb_largest_free_order;/* order of largest frag in BG */
+ struct list_head bb_prealloc_list; /* list head; presumably this group's preallocations -- confirm */
+#ifdef DOUBLE_CHECK /* debug-only member below */
+ void *bb_bitmap; /* only present when DOUBLE_CHECK is defined */
+#endif
+ struct rw_semaphore alloc_sem; /* rw semaphore; locking rules are not visible in this header */
+ ext4_grpblk_t bb_counters[]; /* Nr of free power-of-two-block
+ * regions, index is order.
+ * bb_counters[3] = 5 means
+ * 5 free 8-block regions.
+ * Flexible array member: must
+ * stay last in the struct. */
+};
+
+#define EXT4_GROUP_INFO_NEED_INIT_BIT 0 /* bit number within ext4_group_info.bb_state */
+#define EXT4_GROUP_INFO_WAS_TRIMMED_BIT 1 /* bit number within ext4_group_info.bb_state */
+
+#define EXT4_MB_GRP_NEED_INIT(grp) \
+ (test_bit(EXT4_GROUP_INFO_NEED_INIT_BIT, &((grp)->bb_state)))
+
+#define EXT4_MB_GRP_WAS_TRIMMED(grp) \
+ (test_bit(EXT4_GROUP_INFO_WAS_TRIMMED_BIT, &((grp)->bb_state)))
+#define EXT4_MB_GRP_SET_TRIMMED(grp) \
+ (set_bit(EXT4_GROUP_INFO_WAS_TRIMMED_BIT, &((grp)->bb_state)))
+#define EXT4_MB_GRP_CLEAR_TRIMMED(grp) \
+ (clear_bit(EXT4_GROUP_INFO_WAS_TRIMMED_BIT, &((grp)->bb_state)))
+
+#define EXT4_MAX_CONTENTION 8
+#define EXT4_CONTENTION_THRESHOLD 2
+
+/*
+ * Return the spinlock that bgl_lock_ptr() selects for 'group' out of the
+ * superblock's s_blockgroup_lock.
+ */
+static inline spinlock_t *ext4_group_lock_ptr(struct super_block *sb,
+					       ext4_group_t group)
+{
+	struct ext4_sb_info *sbi = EXT4_SB(sb);
+
+	return bgl_lock_ptr(sbi->s_blockgroup_lock, group);
+}
+
+/*
+ * Returns true if the filesystem is busy enough that attempts to take
+ * the block group locks have run into contention, i.e. the s_lock_busy
+ * counter is above EXT4_CONTENTION_THRESHOLD.
+ */
+static inline int ext4_fs_is_busy(struct ext4_sb_info *sbi)
+{
+	int busy = atomic_read(&sbi->s_lock_busy);
+
+	return busy > EXT4_CONTENTION_THRESHOLD;
+}
+
+/*
+ * Take the spinlock for a block group while maintaining the
+ * s_lock_busy contention counter: an uncontended acquisition decrements
+ * the counter (never below 0), a contended one increments it (never
+ * above EXT4_MAX_CONTENTION) before spinning on the lock.
+ */
+static inline void ext4_lock_group(struct super_block *sb, ext4_group_t group)
+{
+	spinlock_t *lock = ext4_group_lock_ptr(sb, group);
+	struct ext4_sb_info *sbi = EXT4_SB(sb);
+
+	if (spin_trylock(lock)) {
+		/* Got the lock immediately: decay the contention counter. */
+		atomic_add_unless(&sbi->s_lock_busy, -1, 0);
+		return;
+	}
+
+	/* Lock is held elsewhere: record the contention, then wait. */
+	atomic_add_unless(&sbi->s_lock_busy, 1, EXT4_MAX_CONTENTION);
+	spin_lock(lock);
+}
+
+/*
+ * Release the block group spinlock looked up by ext4_group_lock_ptr().
+ * The contention counter is not touched on unlock.
+ */
+static inline void ext4_unlock_group(struct super_block *sb,
+				     ext4_group_t group)
+{
+	spinlock_t *lock = ext4_group_lock_ptr(sb, group);
+
+	spin_unlock(lock);
+}
+
+/*
+ * Set sb->s_dirt, but only when the filesystem has no journal
+ * (s_journal is NULL); with a journal present this is a no-op.
+ */
+static inline void ext4_mark_super_dirty(struct super_block *sb)
+{
+	if (EXT4_SB(sb)->s_journal)
+		return;
+
+	sb->s_dirt = 1;
+}
+
+/*
+ * Block validity checking
+ *
+ * Both macros are thin wrappers around ext4_check_blockref() that record
+ * the calling function and line:
+ *  - ext4_check_indirect_blockref() checks the __le32 array held in the
+ *    buffer's b_data, EXT4_ADDR_PER_BLOCK() entries long;
+ *  - ext4_ind_check_inode() checks the inode's own i_data array,
+ *    EXT4_NDIR_BLOCKS entries long.
+ */
+#define ext4_check_indirect_blockref(inode, bh) \
+ ext4_check_blockref(__func__, __LINE__, inode, \
+ (__le32 *)(bh)->b_data, \
+ EXT4_ADDR_PER_BLOCK((inode)->i_sb))
+
+#define ext4_ind_check_inode(inode) \
+ ext4_check_blockref(__func__, __LINE__, inode, \
+ EXT4_I(inode)->i_data, \
+ EXT4_NDIR_BLOCKS)
+
+/*
+ * Inodes and files operations
+ */
+
+/* dir.c */
+extern const struct file_operations ext4_dir_operations;
+
+/* file.c */
+extern const struct inode_operations ext4_file_inode_operations;
+extern const struct file_operations ext4_file_operations;
+extern loff_t ext4_llseek(struct file *file, loff_t offset, int origin);
+
+/* namei.c */
+extern const struct inode_operations ext4_dir_inode_operations;
+extern const struct inode_operations ext4_special_inode_operations;
+extern struct dentry *ext4_get_parent(struct dentry *child);
+
+/* symlink.c */
+extern const struct inode_operations ext4_symlink_inode_operations;
+extern const struct inode_operations ext4_fast_symlink_inode_operations;
+
+/* block_validity */
+extern void ext4_release_system_zone(struct super_block *sb);
+extern int ext4_setup_system_zone(struct super_block *sb);
+extern int __init ext4_init_system_zone(void);
+extern void ext4_exit_system_zone(void);
+extern int ext4_data_block_valid(struct ext4_sb_info *sbi,
+ ext4_fsblk_t start_blk,
+ unsigned int count);
+extern int ext4_check_blockref(const char *, unsigned int,
+ struct inode *, __le32 *, unsigned int);
+
+/* extents.c */
+extern int ext4_ext_tree_init(handle_t *handle, struct inode *);
+extern int ext4_ext_writepage_trans_blocks(struct inode *, int);
+extern int ext4_ext_index_trans_blocks(struct inode *inode, int nrblocks,
+ int chunk);
+extern int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
+ struct ext4_map_blocks *map, int flags);
+extern void ext4_ext_truncate(struct inode *);
+extern int ext4_ext_punch_hole(struct file *file, loff_t offset,
+ loff_t length);
+extern void ext4_ext_init(struct super_block *);
+extern void ext4_ext_release(struct super_block *);
+extern long ext4_fallocate(struct file *file, int mode, loff_t offset,
+ loff_t len);
+extern int ext4_convert_unwritten_extents(struct inode *inode, loff_t offset,
+ ssize_t len);
+extern int ext4_map_blocks(handle_t *handle, struct inode *inode,
+ struct ext4_map_blocks *map, int flags);
+extern int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
+ __u64 start, __u64 len);
+/* move_extent.c */
+extern int ext4_move_extents(struct file *o_filp, struct file *d_filp,
+ __u64 start_orig, __u64 start_donor,
+ __u64 len, __u64 *moved_len);
+
+/* page-io.c */
+extern int __init ext4_init_pageio(void);
+extern void ext4_exit_pageio(void);
+extern void ext4_ioend_wait(struct inode *);
+extern void ext4_free_io_end(ext4_io_end_t *io);
+extern ext4_io_end_t *ext4_init_io_end(struct inode *inode, gfp_t flags);
+extern int ext4_end_io_nolock(ext4_io_end_t *io);
+extern void ext4_io_submit(struct ext4_io_submit *io);
+extern int ext4_bio_write_page(struct ext4_io_submit *io,
+ struct page *page,
+ int len,
+ struct writeback_control *wbc);
+
+/* mmp.c */
+extern int ext4_multi_mount_protect(struct super_block *, ext4_fsblk_t);
+
+/* BH_Uninit flag: blocks are allocated but uninitialized on disk */
+enum ext4_state_bits {
+ BH_Uninit /* blocks are allocated but uninitialized on disk */
+ = BH_JBDPrivateStart,
+ BH_AllocFromCluster, /* allocated blocks were part of already
+ * allocated cluster. Note that this flag will
+ * never, ever appear in a buffer_head's state
+ * flag. See EXT4_MAP_FROM_CLUSTER to see where
+ * this is used. */
+ BH_Da_Mapped, /* Delayed allocated block that now has a mapping. This
+ * flag is set when ext4_map_blocks is called on a
+ * delayed allocated block to get its real mapping. */
+};
+
+BUFFER_FNS(Uninit, uninit)
+TAS_BUFFER_FNS(Uninit, uninit)
+BUFFER_FNS(Da_Mapped, da_mapped)
+
+/*
+ * Helpers to test whether block and inode bitmaps are properly
+ * initialized. With uninit_bg, reading the block from disk is not enough
+ * to mark the bitmap uptodate; the bitmap also needs to be zeroed out.
+ */
+#define BH_BITMAP_UPTODATE BH_JBDPrivateStart
+
+/*
+ * A bitmap buffer counts as up to date only if the buffer itself is
+ * uptodate AND its BH_BITMAP_UPTODATE state bit is set.
+ * Returns 1 when both hold, 0 otherwise.
+ */
+static inline int bitmap_uptodate(struct buffer_head *bh)
+{
+	if (!buffer_uptodate(bh))
+		return 0;
+
+	return test_bit(BH_BITMAP_UPTODATE, &bh->b_state) != 0;
+}
+/* Set the BH_BITMAP_UPTODATE bit in the buffer's b_state word. */
+static inline void set_bitmap_uptodate(struct buffer_head *bh)
+{
+	unsigned long *state = &bh->b_state;
+
+	set_bit(BH_BITMAP_UPTODATE, state);
+}
+
+#define in_range(b, first, len) ((b) >= (first) && (b) <= (first) + (len) - 1) /* true if b lies in the inclusive range [first, first + len - 1]; 'b' and 'first' are each expanded twice */
+
+/* For ioend & aio unwritten conversion wait queues */
+#define EXT4_WQ_HASH_SZ 37
+#define ext4_ioend_wq(v) (&ext4__ioend_wq[((unsigned long)(v)) %\
+ EXT4_WQ_HASH_SZ])
+#define ext4_aio_mutex(v) (&ext4__aio_mutex[((unsigned long)(v)) %\
+ EXT4_WQ_HASH_SZ])
+extern wait_queue_head_t ext4__ioend_wq[EXT4_WQ_HASH_SZ];
+extern struct mutex ext4__aio_mutex[EXT4_WQ_HASH_SZ];
+
+#define EXT4_RESIZING 0
+extern int ext4_resize_begin(struct super_block *sb);
+extern void ext4_resize_end(struct super_block *sb);
+
+#endif /* __KERNEL__ */
+
+#include "ext4_extents.h"
+
+#endif /* _EXT4_H */
diff --git a/fs/ext4/ext4_extents.h b/fs/ext4/ext4_extents.h
new file mode 100644
index 00000000..0f58b86e
--- /dev/null
+++ b/fs/ext4/ext4_extents.h
@@ -0,0 +1,296 @@
+/*
+ * Copyright (c) 2003-2006, Cluster File Systems, Inc, info@clusterfs.com
+ * Written by Alex Tomas <alex@clusterfs.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+
+#ifndef _EXT4_EXTENTS
+#define _EXT4_EXTENTS
+
+#include "ext4.h"
+
+/*
+ * With AGGRESSIVE_TEST defined, the capacity of index/leaf blocks
+ * becomes very small, so index split, in-depth growing and
+ * other hard changes happen much more often.
+ * This is for debug purposes only.
+ */
+#define AGGRESSIVE_TEST_
+
+/*
+ * With EXTENTS_STATS defined, the number of blocks and extents
+ * are collected in the truncate path. They'll be shown at
+ * umount time.
+ */
+#define EXTENTS_STATS__
+
+/*
+ * If CHECK_BINSEARCH is defined, then the results of the binary search
+ * will also be checked by linear search.
+ */
+#define CHECK_BINSEARCH__
+
+/*
+ * Turn on EXT_DEBUG to get lots of info about extents operations.
+ */
+#define EXT_DEBUG__
+#ifdef EXT_DEBUG
+#define ext_debug(fmt, ...) printk(fmt, ##__VA_ARGS__)
+#else
+#define ext_debug(fmt, ...) no_printk(fmt, ##__VA_ARGS__)
+#endif
+
+/*
+ * If EXT_STATS is defined then stats numbers are collected.
+ * These number will be displayed at umount time.
+ */
+#define EXT_STATS_
+
+
+/*
+ * ext4_inode has i_block array (60 bytes total).
+ * The first 12 bytes store ext4_extent_header;
+ * the remainder stores an array of ext4_extent.
+ */
+
+/*
+ * This is the extent on-disk structure.
+ * It's used at the bottom of the tree.
+ */
+struct ext4_extent {
+ __le32 ee_block; /* first logical block extent covers */
+ __le16 ee_len; /* number of blocks covered by extent */
+ __le16 ee_start_hi; /* high 16 bits of physical block */
+ __le32 ee_start_lo; /* low 32 bits of physical block */
+};
+
+/*
+ * This is the on-disk index structure.
+ * It's used at all the levels except the bottom.
+ */
+struct ext4_extent_idx {
+ __le32 ei_block; /* index covers logical blocks from 'block' */
+ __le32 ei_leaf_lo; /* low 32 bits of the physical block of the
+ * next level; a leaf or another index could be there */
+ __le16 ei_leaf_hi; /* high 16 bits of physical block */
+ __u16 ei_unused; /* not read or written by the helpers in this header */
+};
+
+/*
+ * Each block (leaves and indexes), even inode-stored has header.
+ * The entries (struct ext4_extent or struct ext4_extent_idx) start
+ * immediately after this header; see EXT_FIRST_EXTENT()/EXT_FIRST_INDEX().
+ */
+struct ext4_extent_header {
+ __le16 eh_magic; /* probably will support different formats; see EXT4_EXT_MAGIC */
+ __le16 eh_entries; /* number of valid entries */
+ __le16 eh_max; /* capacity of store in entries */
+ __le16 eh_depth; /* has tree real underlying blocks? (value returned by ext_depth() for the inode's root header) */
+ __le32 eh_generation; /* generation of the tree */
+};
+
+#define EXT4_EXT_MAGIC cpu_to_le16(0xf30a)
+
+/*
+ * Array of ext4_ext_path contains path to some extent.
+ * Creation/lookup routines use it for traversal/splitting/etc.
+ * Truncate uses it to simulate recursive walking.
+ *
+ * NOTE(review): the per-field notes below are inferred from the field
+ * types and names only; confirm against extents.c before relying on them.
+ */
+struct ext4_ext_path {
+ ext4_fsblk_t p_block; /* presumably the physical block number for this level */
+ __u16 p_depth; /* presumably the tree depth at this level */
+ struct ext4_extent *p_ext; /* extent entry; presumably only meaningful at the leaf level */
+ struct ext4_extent_idx *p_idx; /* index entry; presumably only meaningful above the leaf level */
+ struct ext4_extent_header *p_hdr; /* header of this level's node; read by EXT_HAS_FREE_INDEX() */
+ struct buffer_head *p_bh; /* presumably the buffer backing this node */
+};
+
+/*
+ * structure for external API
+ */
+
+/*
+ * to be called by ext4_ext_walk_space()
+ * negative retcode - error
+ * positive retcode - signal for ext4_ext_walk_space(), see below
+ * callback must return valid extent (passed or newly created)
+ */
+typedef int (*ext_prepare_callback)(struct inode *, ext4_lblk_t,
+ struct ext4_ext_cache *,
+ struct ext4_extent *, void *);
+
+#define EXT_CONTINUE 0
+#define EXT_BREAK 1
+#define EXT_REPEAT 2
+
+/*
+ * Maximum number of logical blocks in a file; ext4_extent's ee_block is
+ * __le32.
+ */
+#define EXT_MAX_BLOCKS 0xffffffff
+
+/*
+ * EXT_INIT_MAX_LEN is the maximum number of blocks we can have in an
+ * initialized extent. This is 2^15 and not (2^16 - 1), since we use the
+ * MSB of ee_len field in the extent datastructure to signify if this
+ * particular extent is an initialized extent or an uninitialized (i.e.
+ * preallocated).
+ * EXT_UNINIT_MAX_LEN is the maximum number of blocks we can have in an
+ * uninitialized extent.
+ * If ee_len is <= 0x8000, it is an initialized extent. Otherwise, it is an
+ * uninitialized one. In other words, if MSB of ee_len is set, it is an
+ * uninitialized extent with only one special scenario when ee_len = 0x8000.
+ * In this case we can not have an uninitialized extent of zero length and
+ * thus we make it as a special case of initialized extent with 0x8000 length.
+ * This way we get better extent-to-group alignment for initialized extents.
+ * Hence, the maximum number of blocks we can have in an *initialized*
+ * extent is 2^15 (32768) and in an *uninitialized* extent is 2^15-1 (32767).
+ */
+#define EXT_INIT_MAX_LEN (1UL << 15)
+#define EXT_UNINIT_MAX_LEN (EXT_INIT_MAX_LEN - 1)
+
+
+#define EXT_FIRST_EXTENT(__hdr__) \
+ ((struct ext4_extent *) (((char *) (__hdr__)) + \
+ sizeof(struct ext4_extent_header)))
+#define EXT_FIRST_INDEX(__hdr__) \
+ ((struct ext4_extent_idx *) (((char *) (__hdr__)) + \
+ sizeof(struct ext4_extent_header)))
+#define EXT_HAS_FREE_INDEX(__path__) \
+ (le16_to_cpu((__path__)->p_hdr->eh_entries) \
+ < le16_to_cpu((__path__)->p_hdr->eh_max))
+#define EXT_LAST_EXTENT(__hdr__) \
+ (EXT_FIRST_EXTENT((__hdr__)) + le16_to_cpu((__hdr__)->eh_entries) - 1)
+#define EXT_LAST_INDEX(__hdr__) \
+ (EXT_FIRST_INDEX((__hdr__)) + le16_to_cpu((__hdr__)->eh_entries) - 1)
+#define EXT_MAX_EXTENT(__hdr__) \
+ (EXT_FIRST_EXTENT((__hdr__)) + le16_to_cpu((__hdr__)->eh_max) - 1)
+#define EXT_MAX_INDEX(__hdr__) \
+ (EXT_FIRST_INDEX((__hdr__)) + le16_to_cpu((__hdr__)->eh_max) - 1)
+
+/*
+ * View the inode's in-memory i_data area as an extent header; this is
+ * the header of the extent tree stored inside the inode itself.
+ */
+static inline struct ext4_extent_header *ext_inode_hdr(struct inode *inode)
+{
+	struct ext4_inode_info *ei = EXT4_I(inode);
+
+	return (struct ext4_extent_header *) ei->i_data;
+}
+
+/* View the start of a buffer's data (b_data) as an extent header. */
+static inline struct ext4_extent_header *ext_block_hdr(struct buffer_head *bh)
+{
+	void *data = bh->b_data;
+
+	return (struct ext4_extent_header *) data;
+}
+
+/*
+ * Return eh_depth from the extent header stored in the inode,
+ * converted to CPU byte order.
+ */
+static inline unsigned short ext_depth(struct inode *inode)
+{
+	struct ext4_extent_header *hdr = ext_inode_hdr(inode);
+
+	return le16_to_cpu(hdr->eh_depth);
+}
+
+/*
+ * Invalidate the inode's single cached extent by zeroing its length
+ * (ec_len); the other cache fields are left untouched.
+ */
+static inline void
+ext4_ext_invalidate_cache(struct inode *inode)
+{
+	struct ext4_inode_info *ei = EXT4_I(inode);
+
+	ei->i_cached_extent.ec_len = 0;
+}
+
+/*
+ * Flag an extent as uninitialized by setting the MSB (0x8000) of ee_len.
+ * BUGs if no bit other than the MSB is set in the length, since an
+ * uninitialized extent of zero length cannot be represented.
+ */
+static inline void ext4_ext_mark_uninitialized(struct ext4_extent *ext)
+{
+	u16 raw_len = le16_to_cpu(ext->ee_len);
+
+	/* We can not have an uninitialized extent of zero length! */
+	BUG_ON(!(raw_len & ~EXT_INIT_MAX_LEN));
+	ext->ee_len |= cpu_to_le16(EXT_INIT_MAX_LEN);
+}
+
+/*
+ * Return non-zero if the extent is uninitialized, i.e. its raw ee_len
+ * is strictly greater than EXT_INIT_MAX_LEN (0x8000).
+ */
+static inline int ext4_ext_is_uninitialized(struct ext4_extent *ext)
+{
+	u16 raw_len = le16_to_cpu(ext->ee_len);
+
+	/* Extent with ee_len of 0x8000 is treated as an initialized extent */
+	return raw_len > EXT_INIT_MAX_LEN;
+}
+
+/*
+ * Return the number of blocks an extent covers, with the
+ * "uninitialized" marker removed: raw lengths up to and including
+ * EXT_INIT_MAX_LEN are returned as is, larger ones have
+ * EXT_INIT_MAX_LEN subtracted.
+ */
+static inline int ext4_ext_get_actual_len(struct ext4_extent *ext)
+{
+	u16 raw_len = le16_to_cpu(ext->ee_len);
+
+	if (raw_len <= EXT_INIT_MAX_LEN)
+		return raw_len;
+
+	return raw_len - EXT_INIT_MAX_LEN;
+}
+
+/*
+ * Turn the extent into an initialized one by rewriting ee_len with its
+ * actual length, as computed by ext4_ext_get_actual_len().
+ */
+static inline void ext4_ext_mark_initialized(struct ext4_extent *ext)
+{
+	int actual_len = ext4_ext_get_actual_len(ext);
+
+	ext->ee_len = cpu_to_le16(actual_len);
+}
+
+/*
+ * ext4_ext_pblock:
+ * build the extent's physical block number from its two on-disk parts:
+ * ee_start_lo supplies bits 0-31, ee_start_hi the bits above that.
+ */
+static inline ext4_fsblk_t ext4_ext_pblock(struct ext4_extent *ex)
+{
+	ext4_fsblk_t lo = le32_to_cpu(ex->ee_start_lo);
+	ext4_fsblk_t hi = le16_to_cpu(ex->ee_start_hi);
+
+	/* shift by 32 in two steps, exactly as the original expression */
+	return lo | ((hi << 31) << 1);
+}
+
+/*
+ * ext4_idx_pblock:
+ * build the physical block number an index entry points at from its
+ * two on-disk parts: ei_leaf_lo supplies bits 0-31, ei_leaf_hi the bits
+ * above that.
+ */
+static inline ext4_fsblk_t ext4_idx_pblock(struct ext4_extent_idx *ix)
+{
+	ext4_fsblk_t lo = le32_to_cpu(ix->ei_leaf_lo);
+	ext4_fsblk_t hi = le16_to_cpu(ix->ei_leaf_hi);
+
+	/* shift by 32 in two steps, exactly as the original expression */
+	return lo | ((hi << 31) << 1);
+}
+
+/*
+ * ext4_ext_store_pblock:
+ * split a physical block number across an extent's two on-disk fields:
+ * the low 32 bits go to ee_start_lo, the next 16 bits to ee_start_hi.
+ * Bits above bit 47 are dropped.
+ */
+static inline void ext4_ext_store_pblock(struct ext4_extent *ex,
+					 ext4_fsblk_t pb)
+{
+	u32 lo = (u32)(pb & 0xffffffff);
+	u16 hi = (u16)(((pb >> 31) >> 1) & 0xffff);
+
+	ex->ee_start_lo = cpu_to_le32(lo);
+	ex->ee_start_hi = cpu_to_le16(hi);
+}
+
+/*
+ * ext4_idx_store_pblock:
+ * split a physical block number across an index entry's two on-disk
+ * fields: the low 32 bits go to ei_leaf_lo, the next 16 bits to
+ * ei_leaf_hi.  Bits above bit 47 are dropped.
+ */
+static inline void ext4_idx_store_pblock(struct ext4_extent_idx *ix,
+					 ext4_fsblk_t pb)
+{
+	u32 lo = (u32)(pb & 0xffffffff);
+	u16 hi = (u16)(((pb >> 31) >> 1) & 0xffff);
+
+	ix->ei_leaf_lo = cpu_to_le32(lo);
+	ix->ei_leaf_hi = cpu_to_le16(hi);
+}
+
+extern int ext4_ext_calc_metadata_amount(struct inode *inode,
+ ext4_lblk_t lblocks);
+extern int ext4_extent_tree_init(handle_t *, struct inode *);
+extern int ext4_ext_calc_credits_for_single_extent(struct inode *inode,
+ int num,
+ struct ext4_ext_path *path);
+extern int ext4_can_extents_be_merged(struct inode *inode,
+ struct ext4_extent *ex1,
+ struct ext4_extent *ex2);
+extern int ext4_ext_insert_extent(handle_t *, struct inode *, struct ext4_ext_path *, struct ext4_extent *, int);
+extern struct ext4_ext_path *ext4_ext_find_extent(struct inode *, ext4_lblk_t,
+ struct ext4_ext_path *);
+extern void ext4_ext_drop_refs(struct ext4_ext_path *);
+extern int ext4_ext_check_inode(struct inode *inode);
+extern int ext4_find_delalloc_cluster(struct inode *inode, ext4_lblk_t lblk,
+ int search_hint_reverse);
+#endif /* _EXT4_EXTENTS */
+
diff --git a/fs/ext4/ext4_jbd2.c b/fs/ext4/ext4_jbd2.c
new file mode 100644
index 00000000..aca17901
--- /dev/null
+++ b/fs/ext4/ext4_jbd2.c
@@ -0,0 +1,154 @@
+/*
+ * Interface between ext4 and JBD
+ */
+
+#include "ext4_jbd2.h"
+
+#include <trace/events/ext4.h>
+
+/*
+ * Ask the journal for write access to @bh on behalf of @handle.
+ *
+ * Returns 0 without doing anything when @handle is not a valid journal
+ * handle.  Otherwise returns the result of jbd2_journal_get_write_access();
+ * a failure is reported through ext4_journal_abort_handle() together with
+ * the caller location given by @where and @line.
+ */
+int __ext4_journal_get_write_access(const char *where, unsigned int line,
+				    handle_t *handle, struct buffer_head *bh)
+{
+	int ret;
+
+	if (!ext4_handle_valid(handle))
+		return 0;
+
+	ret = jbd2_journal_get_write_access(handle, bh);
+	if (ret)
+		ext4_journal_abort_handle(where, line, __func__, bh,
+					  handle, ret);
+	return ret;
+}
+
+/*
+ * The ext4 forget function must perform a revoke if we are freeing data
+ * which has been journaled. Metadata (eg. indirect blocks) must be
+ * revoked in all cases.
+ *
+ * "bh" may be NULL: a metadata block may have been freed from memory
+ * but there may still be a record of it in the journal, and that record
+ * still needs to be revoked.
+ *
+ * If the handle isn't valid we're not journaling; in that case the buffer
+ * is simply handed to bforget() and 0 is returned.
+ *
+ * @where/@line identify the caller for error reports, @is_metadata tells
+ * whether @blocknr holds metadata, and @inode supplies the superblock and
+ * the per-inode journaling mode.
+ *
+ * Returns 0 on success or the error returned by jbd2_journal_forget() /
+ * jbd2_journal_revoke().
+ */
+int __ext4_forget(const char *where, unsigned int line, handle_t *handle,
+ int is_metadata, struct inode *inode,
+ struct buffer_head *bh, ext4_fsblk_t blocknr)
+{
+ int err;
+
+ might_sleep();
+
+ trace_ext4_forget(inode, is_metadata, blocknr);
+ BUFFER_TRACE(bh, "enter");
+
+ jbd_debug(4, "forgetting bh %p: is_metadata = %d, mode %o, "
+ "data mode %x\n",
+ bh, is_metadata, inode->i_mode,
+ test_opt(inode->i_sb, DATA_FLAGS));
+
+ /* In the no journal case, we can just do a bforget and return */
+ if (!ext4_handle_valid(handle)) {
+ bforget(bh);
+ return 0;
+ }
+
+ /* Never use the revoke function if we are doing full data
+ * journaling: there is no need to, and a V1 superblock won't
+ * support it. Otherwise, only skip the revoke on un-journaled
+ * data blocks. */
+
+ if (test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA ||
+ (!is_metadata && !ext4_should_journal_data(inode))) {
+ if (bh) {
+ BUFFER_TRACE(bh, "call jbd2_journal_forget");
+ err = jbd2_journal_forget(handle, bh);
+ if (err)
+ ext4_journal_abort_handle(where, line, __func__,
+ bh, handle, err);
+ return err;
+ }
+ /* no buffer was passed in, so there is nothing to forget */
+ return 0;
+ }
+
+ /*
+ * data!=journal && (is_metadata || should_journal_data(inode))
+ */
+ BUFFER_TRACE(bh, "call jbd2_journal_revoke");
+ err = jbd2_journal_revoke(handle, blocknr, bh);
+ if (err) {
+ /* a failed revoke aborts the handle and then calls __ext4_abort() */
+ ext4_journal_abort_handle(where, line, __func__,
+ bh, handle, err);
+ __ext4_abort(inode->i_sb, where, line,
+ "error %d when attempting revoke", err);
+ }
+ BUFFER_TRACE(bh, "exit");
+ return err;
+}
+
+/*
+ * Ask the journal for create access to @bh on behalf of @handle.
+ *
+ * Returns 0 without doing anything when @handle is not a valid journal
+ * handle.  Otherwise returns the result of jbd2_journal_get_create_access();
+ * a failure is reported through ext4_journal_abort_handle() together with
+ * the caller location given by @where and @line.
+ */
+int __ext4_journal_get_create_access(const char *where, unsigned int line,
+				     handle_t *handle, struct buffer_head *bh)
+{
+	int ret;
+
+	if (!ext4_handle_valid(handle))
+		return 0;
+
+	ret = jbd2_journal_get_create_access(handle, bh);
+	if (ret)
+		ext4_journal_abort_handle(where, line, __func__,
+					  bh, handle, ret);
+	return ret;
+}
+
+/*
+ * Mark the metadata buffer @bh dirty.
+ *
+ * With a valid journal handle the buffer is handed to
+ * jbd2_journal_dirty_metadata().  Without one it is dirtied directly
+ * (against @inode when one is given) and, if the inode requires
+ * synchronous updates, written out immediately.
+ *
+ * @where/@line: caller location used in error reports
+ * @inode: may be NULL
+ *
+ * Returns 0 on success, the jbd2 error in the journaled case, or -EIO when
+ * the synchronous write-out of the buffer failed.
+ */
+int __ext4_handle_dirty_metadata(const char *where, unsigned int line,
+				 handle_t *handle, struct inode *inode,
+				 struct buffer_head *bh)
+{
+	int err = 0;
+
+	if (ext4_handle_valid(handle)) {
+		err = jbd2_journal_dirty_metadata(handle, bh);
+		if (err) {
+			/*
+			 * Errors can only happen if there is a bug.
+			 *
+			 * Record the error and abort the handle, but do not
+			 * stop it here: the handle belongs to the caller,
+			 * which will stop it itself.  Stopping it in this
+			 * helper would leave the caller operating on (and
+			 * stopping again) a handle that was already released.
+			 */
+			handle->h_err = err;
+			ext4_journal_abort_handle(where, line, __func__,
+						  bh, handle, err);
+		}
+		return err;
+	}
+
+	/* no journal: dirty the buffer directly */
+	if (inode)
+		mark_buffer_dirty_inode(bh, inode);
+	else
+		mark_buffer_dirty(bh);
+
+	if (inode && inode_needs_sync(inode)) {
+		sync_dirty_buffer(bh);
+		/* the write was submitted but the buffer is not up to date */
+		if (buffer_req(bh) && !buffer_uptodate(bh)) {
+			struct ext4_super_block *es;
+
+			es = EXT4_SB(inode->i_sb)->s_es;
+			es->s_last_error_block =
+				cpu_to_le64(bh->b_blocknr);
+			ext4_error_inode(inode, where, line,
+					 bh->b_blocknr,
+					 "IO error syncing itable block");
+			err = -EIO;
+		}
+	}
+	return err;
+}
+
+/*
+ * Mark the superblock of @sb dirty.
+ *
+ * With a valid journal handle the superblock buffer is passed to
+ * jbd2_journal_dirty_metadata() and a failure is reported through
+ * ext4_journal_abort_handle(); without one only sb->s_dirt is set.
+ * Returns 0 or the jbd2 error.
+ */
+int __ext4_handle_dirty_super(const char *where, unsigned int line,
+			      handle_t *handle, struct super_block *sb)
+{
+	struct buffer_head *sbh = EXT4_SB(sb)->s_sbh;
+	int ret;
+
+	if (!ext4_handle_valid(handle)) {
+		sb->s_dirt = 1;
+		return 0;
+	}
+
+	ret = jbd2_journal_dirty_metadata(handle, sbh);
+	if (ret)
+		ext4_journal_abort_handle(where, line, __func__,
+					  sbh, handle, ret);
+	return ret;
+}
diff --git a/fs/ext4/ext4_jbd2.h b/fs/ext4/ext4_jbd2.h
new file mode 100644
index 00000000..83b20fcf
--- /dev/null
+++ b/fs/ext4/ext4_jbd2.h
@@ -0,0 +1,399 @@
+/*
+ * ext4_jbd2.h
+ *
+ * Written by Stephen C. Tweedie <sct@redhat.com>, 1999
+ *
+ * Copyright 1998--1999 Red Hat corp --- All Rights Reserved
+ *
+ * This file is part of the Linux kernel and is made available under
+ * the terms of the GNU General Public License, version 2, or at your
+ * option, any later version, incorporated herein by reference.
+ *
+ * Ext4-specific journaling extensions.
+ */
+
+#ifndef _EXT4_JBD2_H
+#define _EXT4_JBD2_H
+
+#include <linux/fs.h>
+#include <linux/jbd2.h>
+#include "ext4.h"
+
+#define EXT4_JOURNAL(inode) (EXT4_SB((inode)->i_sb)->s_journal)
+
+/* Define the number of blocks we need to account to a transaction to
+ * modify one block of data.
+ *
+ * We may have to touch one inode, one bitmap buffer, up to three
+ * indirection blocks, the group and superblock summaries, and the data
+ * block to complete the transaction.
+ *
+ * For extents-enabled fs we may have to allocate and modify up to
+ * 5 levels of tree + root which are stored in the inode. */
+
+#define EXT4_SINGLEDATA_TRANS_BLOCKS(sb) \
+ (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_EXTENTS) \
+ ? 27U : 8U)
+
+/* Extended attribute operations touch at most two data buffers,
+ * two bitmap buffers, and two group summaries, in addition to the inode
+ * and the superblock, which are already accounted for. */
+
+#define EXT4_XATTR_TRANS_BLOCKS 6U
+
+/* Define the minimum size for a transaction which modifies data. This
+ * needs to take into account the fact that we may end up modifying two
+ * quota files too (one for the group, one for the user quota). The
+ * superblock only gets updated once, of course, so don't bother
+ * counting that again for the quota updates. */
+
+#define EXT4_DATA_TRANS_BLOCKS(sb) (EXT4_SINGLEDATA_TRANS_BLOCKS(sb) + \
+ EXT4_XATTR_TRANS_BLOCKS - 2 + \
+ EXT4_MAXQUOTAS_TRANS_BLOCKS(sb))
+
+/*
+ * Define the number of metadata blocks we need to account to modify data.
+ *
+ * This includes the super block, inode block, quota blocks and xattr blocks.
+ */
+#define EXT4_META_TRANS_BLOCKS(sb) (EXT4_XATTR_TRANS_BLOCKS + \
+ EXT4_MAXQUOTAS_TRANS_BLOCKS(sb))
+
+/* Delete operations potentially hit one directory's namespace plus an
+ * entire inode, plus arbitrary amounts of bitmap/indirection data. Be
+ * generous. We can grow the delete transaction later if necessary. */
+
+#define EXT4_DELETE_TRANS_BLOCKS(sb) (2 * EXT4_DATA_TRANS_BLOCKS(sb) + 64)
+
+/* Define an arbitrary limit for the amount of data we will anticipate
+ * writing to any given transaction. For unbounded transactions such as
+ * write(2) and truncate(2) we can write more than this, but we always
+ * start off at the maximum transaction size and grow the transaction
+ * optimistically as we go. */
+
+#define EXT4_MAX_TRANS_DATA 64U
+
+/* We break up a large truncate or write transaction once the handle's
+ * buffer credits gets this low, we need either to extend the
+ * transaction or to start a new one. Reserve enough space here for
+ * inode, bitmap, superblock, group and indirection updates for at least
+ * one block, plus two quota updates. Quota allocations are not
+ * needed. */
+
+#define EXT4_RESERVE_TRANS_BLOCKS 12U
+
+#define EXT4_INDEX_EXTRA_TRANS_BLOCKS 8
+
+#ifdef CONFIG_QUOTA
+/* Amount of blocks needed for quota update - we know that the structure was
+ * allocated so we need to update only data block */
+#define EXT4_QUOTA_TRANS_BLOCKS(sb) (test_opt(sb, QUOTA) ? 1 : 0)
+/* Amount of blocks needed for quota insert/delete - we do some block writes
+ * but inode, sb and group updates are done only once */
+#define EXT4_QUOTA_INIT_BLOCKS(sb) (test_opt(sb, QUOTA) ? (DQUOT_INIT_ALLOC*\
+ (EXT4_SINGLEDATA_TRANS_BLOCKS(sb)-3)+3+DQUOT_INIT_REWRITE) : 0)
+
+#define EXT4_QUOTA_DEL_BLOCKS(sb) (test_opt(sb, QUOTA) ? (DQUOT_DEL_ALLOC*\
+ (EXT4_SINGLEDATA_TRANS_BLOCKS(sb)-3)+3+DQUOT_DEL_REWRITE) : 0)
+#else
+/* Built without CONFIG_QUOTA: quota never contributes any credits */
+#define EXT4_QUOTA_TRANS_BLOCKS(sb) 0
+#define EXT4_QUOTA_INIT_BLOCKS(sb) 0
+#define EXT4_QUOTA_DEL_BLOCKS(sb) 0
+#endif
+/* The per-quota-file amounts above, multiplied by MAXQUOTAS */
+#define EXT4_MAXQUOTAS_TRANS_BLOCKS(sb) (MAXQUOTAS*EXT4_QUOTA_TRANS_BLOCKS(sb))
+#define EXT4_MAXQUOTAS_INIT_BLOCKS(sb) (MAXQUOTAS*EXT4_QUOTA_INIT_BLOCKS(sb))
+#define EXT4_MAXQUOTAS_DEL_BLOCKS(sb) (MAXQUOTAS*EXT4_QUOTA_DEL_BLOCKS(sb))
+
+/**
+ * struct ext4_journal_cb_entry - Base structure for callback information.
+ *
+ * This struct is a 'seed' structure for use with your own callback
+ * structs. If you are using callbacks you must allocate one of these
+ * or another struct of your own definition which has this struct
+ * as its first element and pass it to ext4_journal_callback_add().
+ */
+struct ext4_journal_cb_entry {
+ /* list information for other callbacks attached to the same handle */
+ struct list_head jce_list;
+
+ /* Function to call with this callback structure */
+ void (*jce_func)(struct super_block *sb,
+ struct ext4_journal_cb_entry *jce, int error);
+
+ /* user data goes here */
+};
+
+/**
+ * ext4_journal_callback_add: add a function to call after transaction commit
+ * @handle: active journal transaction handle to register callback on
+ * @func: callback function to call after the transaction has committed;
+ *        it is called with the following arguments:
+ *          sb:  superblock of current filesystem for transaction
+ *          jce: the journal callback data that was registered (see @jce)
+ *          rc:  journal state at commit (0 = transaction committed properly)
+ * @jce: journal callback data (internal and function private data struct)
+ *
+ * The registered function will be called in the context of the journal thread
+ * after the transaction for which the handle was created has completed.
+ *
+ * No locks are held when the callback function is called, so it is safe to
+ * call blocking functions from within the callback, but the callback should
+ * not block or run for too long, or the filesystem will be blocked waiting for
+ * the next transaction to commit. No journaling functions can be used, or
+ * there is a risk of deadlock.
+ *
+ * There is no guaranteed calling order of multiple registered callbacks on
+ * the same transaction.
+ */
+static inline void ext4_journal_callback_add(handle_t *handle,
+			void (*func)(struct super_block *sb,
+				     struct ext4_journal_cb_entry *jce,
+				     int rc),
+			struct ext4_journal_cb_entry *jce)
+{
+	struct ext4_sb_info *sbi;
+
+	/* the journal's private pointer leads back to the ext4 sb info */
+	sbi = EXT4_SB(handle->h_transaction->t_journal->j_private);
+
+	jce->jce_func = func;
+
+	/* append the entry to the transaction's private list under s_md_lock */
+	spin_lock(&sbi->s_md_lock);
+	list_add_tail(&jce->jce_list, &handle->h_transaction->t_private_list);
+	spin_unlock(&sbi->s_md_lock);
+}
+
+/**
+ * ext4_journal_callback_del: delete a registered callback
+ * @handle: active journal transaction handle on which callback was registered
+ * @jce: registered journal callback entry to unregister
+ */
+static inline void ext4_journal_callback_del(handle_t *handle,
+					     struct ext4_journal_cb_entry *jce)
+{
+	struct ext4_sb_info *sbi;
+
+	/* @handle is only used to find the sb info that owns s_md_lock */
+	sbi = EXT4_SB(handle->h_transaction->t_journal->j_private);
+
+	/* unlink the entry and leave its list head re-initialised */
+	spin_lock(&sbi->s_md_lock);
+	list_del_init(&jce->jce_list);
+	spin_unlock(&sbi->s_md_lock);
+}
+
+int
+ext4_mark_iloc_dirty(handle_t *handle,
+ struct inode *inode,
+ struct ext4_iloc *iloc);
+
+/*
+ * On success, We end up with an outstanding reference count against
+ * iloc->bh. This _must_ be cleaned up later.
+ */
+
+int ext4_reserve_inode_write(handle_t *handle, struct inode *inode,
+ struct ext4_iloc *iloc);
+
+int ext4_mark_inode_dirty(handle_t *handle, struct inode *inode);
+
+/*
+ * Wrapper functions with which ext4 calls into JBD.
+ */
+void ext4_journal_abort_handle(const char *caller, unsigned int line,
+ const char *err_fn,
+ struct buffer_head *bh, handle_t *handle, int err);
+
+int __ext4_journal_get_write_access(const char *where, unsigned int line,
+ handle_t *handle, struct buffer_head *bh);
+
+int __ext4_forget(const char *where, unsigned int line, handle_t *handle,
+ int is_metadata, struct inode *inode,
+ struct buffer_head *bh, ext4_fsblk_t blocknr);
+
+int __ext4_journal_get_create_access(const char *where, unsigned int line,
+ handle_t *handle, struct buffer_head *bh);
+
+int __ext4_handle_dirty_metadata(const char *where, unsigned int line,
+ handle_t *handle, struct inode *inode,
+ struct buffer_head *bh);
+
+int __ext4_handle_dirty_super(const char *where, unsigned int line,
+ handle_t *handle, struct super_block *sb);
+
+/*
+ * Convenience wrappers around the __ext4_*() helpers declared above: each
+ * one passes the calling function's name (__func__) and source line
+ * (__LINE__) as the "where"/"line" arguments.
+ */
+#define ext4_journal_get_write_access(handle, bh) \
+ __ext4_journal_get_write_access(__func__, __LINE__, (handle), (bh))
+#define ext4_forget(handle, is_metadata, inode, bh, block_nr) \
+ __ext4_forget(__func__, __LINE__, (handle), (is_metadata), (inode), \
+ (bh), (block_nr))
+#define ext4_journal_get_create_access(handle, bh) \
+ __ext4_journal_get_create_access(__func__, __LINE__, (handle), (bh))
+#define ext4_handle_dirty_metadata(handle, inode, bh) \
+ __ext4_handle_dirty_metadata(__func__, __LINE__, (handle), (inode), \
+ (bh))
+#define ext4_handle_dirty_super(handle, sb) \
+ __ext4_handle_dirty_super(__func__, __LINE__, (handle), (sb))
+
+handle_t *ext4_journal_start_sb(struct super_block *sb, int nblocks);
+int __ext4_journal_stop(const char *where, unsigned int line, handle_t *handle);
+
+#define EXT4_NOJOURNAL_MAX_REF_COUNT ((unsigned long) 4096)
+
+/* Note: Do not use this for NULL handles. This is only to determine if
+ * a properly allocated handle is using a journal or not. */
+static inline int ext4_handle_valid(handle_t *handle)
+{
+	/*
+	 * Handle values below EXT4_NOJOURNAL_MAX_REF_COUNT are reported as
+	 * not valid (0); everything else is reported as valid (1).
+	 */
+	return (unsigned long)handle >= EXT4_NOJOURNAL_MAX_REF_COUNT;
+}
+
+/* Set h_sync on @handle; does nothing for a handle that is not valid. */
+static inline void ext4_handle_sync(handle_t *handle)
+{
+	if (!ext4_handle_valid(handle))
+		return;
+
+	handle->h_sync = 1;
+}
+
+/*
+ * Forward to jbd2_journal_release_buffer() when @handle is a valid journal
+ * handle; otherwise do nothing.
+ */
+static inline void ext4_handle_release_buffer(handle_t *handle,
+					      struct buffer_head *bh)
+{
+	if (!ext4_handle_valid(handle))
+		return;
+
+	jbd2_journal_release_buffer(handle, bh);
+}
+
+/*
+ * Return is_handle_aborted() for a valid journal handle, and 0 for a handle
+ * that is not valid.
+ */
+static inline int ext4_handle_is_aborted(handle_t *handle)
+{
+	return ext4_handle_valid(handle) ? is_handle_aborted(handle) : 0;
+}
+
+/*
+ * Return 1 when @handle has at least @needed buffer credits, or when it is
+ * not a valid journal handle at all; return 0 otherwise.
+ */
+static inline int ext4_handle_has_enough_credits(handle_t *handle, int needed)
+{
+	if (!ext4_handle_valid(handle))
+		return 1;
+
+	return handle->h_buffer_credits >= needed;
+}
+
+/* Start a handle via ext4_journal_start_sb() on the inode's superblock. */
+static inline handle_t *ext4_journal_start(struct inode *inode, int nblocks)
+{
+	struct super_block *sb = inode->i_sb;
+
+	return ext4_journal_start_sb(sb, nblocks);
+}
+
+#define ext4_journal_stop(handle) \
+ __ext4_journal_stop(__func__, __LINE__, (handle))
+
+/* Return whatever journal_current_handle() reports. */
+static inline handle_t *ext4_journal_current_handle(void)
+{
+	handle_t *current_handle = journal_current_handle();
+
+	return current_handle;
+}
+
+/*
+ * Forward to jbd2_journal_extend() for a valid journal handle; return 0
+ * for a handle that is not valid.
+ */
+static inline int ext4_journal_extend(handle_t *handle, int nblocks)
+{
+	if (!ext4_handle_valid(handle))
+		return 0;
+
+	return jbd2_journal_extend(handle, nblocks);
+}
+
+/*
+ * Forward to jbd2_journal_restart() for a valid journal handle; return 0
+ * for a handle that is not valid.
+ */
+static inline int ext4_journal_restart(handle_t *handle, int nblocks)
+{
+	if (!ext4_handle_valid(handle))
+		return 0;
+
+	return jbd2_journal_restart(handle, nblocks);
+}
+
+/*
+ * Return jbd2_journal_blocks_per_page() for @inode, or 0 when the inode's
+ * filesystem has no journal (EXT4_JOURNAL() is NULL).
+ */
+static inline int ext4_journal_blocks_per_page(struct inode *inode)
+{
+	if (EXT4_JOURNAL(inode) == NULL)
+		return 0;
+
+	return jbd2_journal_blocks_per_page(inode);
+}
+
+/* Forward to jbd2_journal_force_commit(); a NULL @journal yields 0. */
+static inline int ext4_journal_force_commit(journal_t *journal)
+{
+	return journal ? jbd2_journal_force_commit(journal) : 0;
+}
+
+/*
+ * Pass the inode's jinode to jbd2_journal_file_inode() for a valid journal
+ * handle; return 0 for a handle that is not valid.
+ */
+static inline int ext4_jbd2_file_inode(handle_t *handle, struct inode *inode)
+{
+	if (!ext4_handle_valid(handle))
+		return 0;
+
+	return jbd2_journal_file_inode(handle, EXT4_I(inode)->jinode);
+}
+
+/*
+ * Record the id of the handle's transaction in the inode: always in
+ * i_sync_tid and, when @datasync is non-zero, in i_datasync_tid as well.
+ * Nothing is recorded for a handle that is not valid.
+ */
+static inline void ext4_update_inode_fsync_trans(handle_t *handle,
+						 struct inode *inode,
+						 int datasync)
+{
+	struct ext4_inode_info *info;
+
+	if (!ext4_handle_valid(handle))
+		return;
+
+	info = EXT4_I(inode);
+	info->i_sync_tid = handle->h_transaction->t_tid;
+	if (datasync)
+		info->i_datasync_tid = handle->h_transaction->t_tid;
+}
+
+/* super.c */
+int ext4_force_commit(struct super_block *sb);
+
+/*
+ * Ext4 inode journal modes
+ */
+#define EXT4_INODE_JOURNAL_DATA_MODE 0x01 /* journal data mode */
+#define EXT4_INODE_ORDERED_DATA_MODE 0x02 /* ordered data mode */
+#define EXT4_INODE_WRITEBACK_DATA_MODE 0x04 /* writeback data mode */
+
+/*
+ * Work out which data journaling mode applies to @inode.
+ *
+ * Returns exactly one of the EXT4_INODE_*_DATA_MODE bits defined above.
+ * The tests are ordered, so the first one that matches wins; if the mount's
+ * DATA_FLAGS match none of the known values the function BUG()s.
+ */
+static inline int ext4_inode_journal_mode(struct inode *inode)
+{
+ /* no journal on this filesystem */
+ if (EXT4_JOURNAL(inode) == NULL)
+ return EXT4_INODE_WRITEBACK_DATA_MODE; /* writeback */
+ /*
+ * Anything that is not a regular file, and every inode when the mount's
+ * DATA_FLAGS select EXT4_MOUNT_JOURNAL_DATA, journals its data.
+ */
+ if (!S_ISREG(inode->i_mode) ||
+ test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA)
+ return EXT4_INODE_JOURNAL_DATA_MODE; /* journal data */
+ /*
+ * We do not support data journalling with delayed allocation, so the
+ * per-inode JOURNAL_DATA flag is honoured only when DELALLOC is off.
+ */
+ if (ext4_test_inode_flag(inode, EXT4_INODE_JOURNAL_DATA) &&
+ !test_opt(inode->i_sb, DELALLOC))
+ return EXT4_INODE_JOURNAL_DATA_MODE; /* journal data */
+ if (test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_ORDERED_DATA)
+ return EXT4_INODE_ORDERED_DATA_MODE; /* ordered */
+ if (test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_WRITEBACK_DATA)
+ return EXT4_INODE_WRITEBACK_DATA_MODE; /* writeback */
+ else
+ BUG();
+}
+
+/* Non-zero when ext4_inode_journal_mode() selects journal-data mode. */
+static inline int ext4_should_journal_data(struct inode *inode)
+{
+	int mode = ext4_inode_journal_mode(inode);
+
+	return mode & EXT4_INODE_JOURNAL_DATA_MODE;
+}
+
+/* Non-zero when ext4_inode_journal_mode() selects ordered-data mode. */
+static inline int ext4_should_order_data(struct inode *inode)
+{
+	int mode = ext4_inode_journal_mode(inode);
+
+	return mode & EXT4_INODE_ORDERED_DATA_MODE;
+}
+
+/* Non-zero when ext4_inode_journal_mode() selects writeback-data mode. */
+static inline int ext4_should_writeback_data(struct inode *inode)
+{
+	int mode = ext4_inode_journal_mode(inode);
+
+	return mode & EXT4_INODE_WRITEBACK_DATA_MODE;
+}
+
+/*
+ * This function controls whether or not we should try to go down the
+ * dioread_nolock code paths, which makes it safe to avoid taking
+ * i_mutex for direct I/O reads. This only works for extent-based
+ * files, and it doesn't work if data journaling is enabled, since the
+ * dioread_nolock code uses b_private to pass information back to the
+ * I/O completion handler, and this conflicts with the jbd's use of
+ * b_private.
+ */
+static inline int ext4_should_dioread_nolock(struct inode *inode)
+{
+	/*
+	 * All four conditions must hold, tested in this order: the mount
+	 * option is set, the inode is a regular file, it is extent-based,
+	 * and its data is not journaled.
+	 */
+	return test_opt(inode->i_sb, DIOREAD_NOLOCK) &&
+	       S_ISREG(inode->i_mode) &&
+	       ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS) &&
+	       !ext4_should_journal_data(inode);
+}
+
+#endif /* _EXT4_JBD2_H */
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
new file mode 100644
index 00000000..abcdeab6
--- /dev/null
+++ b/fs/ext4/extents.c
@@ -0,0 +1,4866 @@
+/*
+ * Copyright (c) 2003-2006, Cluster File Systems, Inc, info@clusterfs.com
+ * Written by Alex Tomas <alex@clusterfs.com>
+ *
+ * Architecture independence:
+ * Copyright (c) 2005, Bull S.A.
+ * Written by Pierre Peiffer <pierre.peiffer@bull.net>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+
+/*
+ * Extents support for EXT4
+ *
+ * TODO:
+ * - ext4*_error() should be used in some situations
+ * - analyze all BUG()/BUG_ON(), use -EIO where appropriate
+ * - smart tree reduction
+ */
+
+#include <linux/fs.h>
+#include <linux/time.h>
+#include <linux/jbd2.h>
+#include <linux/highuid.h>
+#include <linux/pagemap.h>
+#include <linux/quotaops.h>
+#include <linux/string.h>
+#include <linux/slab.h>
+#include <linux/falloc.h>
+#include <asm/uaccess.h>
+#include <linux/fiemap.h>
+#include "ext4_jbd2.h"
+
+#include <trace/events/ext4.h>
+
+/*
+ * used by extent splitting.
+ */
+#define EXT4_EXT_MAY_ZEROOUT 0x1 /* safe to zeroout if split fails \
+ due to ENOSPC */
+#define EXT4_EXT_MARK_UNINIT1 0x2 /* mark first half uninitialized */
+#define EXT4_EXT_MARK_UNINIT2 0x4 /* mark second half uninitialized */
+
+static int ext4_split_extent(handle_t *handle,
+ struct inode *inode,
+ struct ext4_ext_path *path,
+ struct ext4_map_blocks *map,
+ int split_flag,
+ int flags);
+
+static int ext4_split_extent_at(handle_t *handle,
+ struct inode *inode,
+ struct ext4_ext_path *path,
+ ext4_lblk_t split,
+ int split_flag,
+ int flags);
+
+/*
+ * Make sure @handle has more than @needed buffer credits.
+ *
+ * Returns 0 when the handle is not a valid journal handle, already has
+ * more than @needed credits, or ext4_journal_extend() returned 0.  A
+ * negative result from ext4_journal_extend() is returned as is.  When
+ * ext4_journal_extend() returns a positive value the transaction is
+ * restarted through ext4_truncate_restart_trans(): a successful restart is
+ * reported as -EAGAIN, a failed one by its error code.
+ */
+static int ext4_ext_truncate_extend_restart(handle_t *handle,
+					    struct inode *inode,
+					    int needed)
+{
+	int ret;
+
+	if (!ext4_handle_valid(handle) || handle->h_buffer_credits > needed)
+		return 0;
+
+	ret = ext4_journal_extend(handle, needed);
+	if (ret <= 0)
+		return ret;
+
+	ret = ext4_truncate_restart_trans(handle, inode, needed);
+	return ret ? ret : -EAGAIN;
+}
+
+/*
+ * could return:
+ * - EROFS
+ * - ENOMEM
+ */
+static int ext4_ext_get_access(handle_t *handle, struct inode *inode,
+			       struct ext4_ext_path *path)
+{
+	/*
+	 * Without a buffer head the path points to a leaf/index in the inode
+	 * body: that is in-core data and needs no protection.
+	 */
+	if (!path->p_bh)
+		return 0;
+
+	/* path points to a block: get journal write access to it */
+	return ext4_journal_get_write_access(handle, path->p_bh);
+}
+
+/*
+ * could return:
+ * - EROFS
+ * - ENOMEM
+ * - EIO
+ */
+#define ext4_ext_dirty(handle, inode, path) \
+ __ext4_ext_dirty(__func__, __LINE__, (handle), (inode), (path))
+static int __ext4_ext_dirty(const char *where, unsigned int line,
+			    handle_t *handle, struct inode *inode,
+			    struct ext4_ext_path *path)
+{
+	/* path points to leaf/index in inode body: dirty the inode itself */
+	if (!path->p_bh)
+		return ext4_mark_inode_dirty(handle, inode);
+
+	/* path points to block: dirty that metadata buffer */
+	return __ext4_handle_dirty_metadata(where, line, handle,
+					    inode, path->p_bh);
+}
+
+/*
+ * Pick a goal physical block for allocating logical block @block.
+ *
+ * Preference order: a block extrapolated from the extent the path points
+ * to, then the block number of the path's leaf buffer, and finally
+ * ext4_inode_to_goal_block() when no path is given or neither of the
+ * first two is available.
+ */
+static ext4_fsblk_t ext4_ext_find_goal(struct inode *inode,
+				       struct ext4_ext_path *path,
+				       ext4_lblk_t block)
+{
+	struct ext4_extent *ex;
+	int depth;
+
+	/* no path at all: use inode's group */
+	if (!path)
+		return ext4_inode_to_goal_block(inode);
+
+	depth = path->p_depth;
+
+	/*
+	 * Try to predict block placement assuming that we are filling in a
+	 * file which will eventually be non-sparse --- i.e., in the case of
+	 * libbfd writing an ELF object's sections out-of-order but in a way
+	 * that eventually results in a contiguous object or executable
+	 * file, or some database extending a table space file.  However,
+	 * this is actually somewhat non-ideal if we are writing a sparse
+	 * file such as qemu or KVM writing a raw image file that is going
+	 * to stay fairly sparse, since it will end up fragmenting the file
+	 * system's free space.  Maybe we should have some heuristics or
+	 * some way to allow userspace to pass a hint to the file system,
+	 * especially if the latter case turns out to be common.
+	 */
+	ex = path[depth].p_ext;
+	if (ex) {
+		ext4_fsblk_t ext_pblk = ext4_ext_pblock(ex);
+		ext4_lblk_t ext_block = le32_to_cpu(ex->ee_block);
+
+		/* keep the same logical-to-physical offset as the extent */
+		if (block > ext_block)
+			return ext_pblk + (block - ext_block);
+		return ext_pblk - (ext_block - block);
+	}
+
+	/*
+	 * it looks like index is empty;
+	 * try to find starting block from index itself
+	 */
+	if (path[depth].p_bh)
+		return path[depth].p_bh->b_blocknr;
+
+	/* OK. use inode's group */
+	return ext4_inode_to_goal_block(inode);
+}
+
+/*
+ * Allocation for a meta data block
+ */
+static ext4_fsblk_t
+ext4_ext_new_meta_block(handle_t *handle, struct inode *inode,
+			struct ext4_ext_path *path,
+			struct ext4_extent *ex, int *err, unsigned int flags)
+{
+	/* aim near the place where the extent's first logical block lives */
+	ext4_fsblk_t goal = ext4_ext_find_goal(inode, path,
+					       le32_to_cpu(ex->ee_block));
+
+	return ext4_new_meta_blocks(handle, inode, goal, flags, NULL, err);
+}
+
+/*
+ * Number of struct ext4_extent entries that fit in one filesystem block
+ * after the extent header.  With AGGRESSIVE_TEST defined and @check zero
+ * the result is capped at 6.
+ */
+static inline int ext4_ext_space_block(struct inode *inode, int check)
+{
+	int nr = (inode->i_sb->s_blocksize - sizeof(struct ext4_extent_header))
+			/ sizeof(struct ext4_extent);
+
+#ifdef AGGRESSIVE_TEST
+	if (nr > 6 && !check)
+		nr = 6;
+#endif
+	return nr;
+}
+
+/*
+ * Number of struct ext4_extent_idx entries that fit in one filesystem
+ * block after the extent header.  With AGGRESSIVE_TEST defined and @check
+ * zero the result is capped at 5.
+ */
+static inline int ext4_ext_space_block_idx(struct inode *inode, int check)
+{
+	int nr = (inode->i_sb->s_blocksize - sizeof(struct ext4_extent_header))
+			/ sizeof(struct ext4_extent_idx);
+
+#ifdef AGGRESSIVE_TEST
+	if (nr > 5 && !check)
+		nr = 5;
+#endif
+	return nr;
+}
+
+/*
+ * Number of struct ext4_extent entries that fit in the inode's i_data
+ * area after the extent header.  With AGGRESSIVE_TEST defined and @check
+ * zero the result is capped at 3.
+ */
+static inline int ext4_ext_space_root(struct inode *inode, int check)
+{
+	int nr = sizeof(EXT4_I(inode)->i_data);
+
+	nr = (nr - sizeof(struct ext4_extent_header))
+			/ sizeof(struct ext4_extent);
+#ifdef AGGRESSIVE_TEST
+	if (nr > 3 && !check)
+		nr = 3;
+#endif
+	return nr;
+}
+
+/*
+ * Number of struct ext4_extent_idx entries that fit in the inode's i_data
+ * area after the extent header.  With AGGRESSIVE_TEST defined and @check
+ * zero the result is capped at 4.
+ */
+static inline int ext4_ext_space_root_idx(struct inode *inode, int check)
+{
+	int nr = sizeof(EXT4_I(inode)->i_data);
+
+	nr = (nr - sizeof(struct ext4_extent_header))
+			/ sizeof(struct ext4_extent_idx);
+#ifdef AGGRESSIVE_TEST
+	if (nr > 4 && !check)
+		nr = 4;
+#endif
+	return nr;
+}
+
+/*
+ * Calculate the number of metadata blocks needed to allocate the
+ * delayed-allocation logical block @lblock.
+ * Worst case is one block per extent.
+ *
+ * The function keeps running state in the inode (i_da_metadata_calc_len and
+ * i_da_metadata_calc_last_lblock), so the result depends on the previous
+ * call: a block that directly follows the previously accounted one is
+ * charged 0..3 blocks, any other block is charged ext_depth(inode) + 1.
+ */
+int ext4_ext_calc_metadata_amount(struct inode *inode, ext4_lblk_t lblock)
+{
+ struct ext4_inode_info *ei = EXT4_I(inode);
+ int idxs;
+
+ /* number of index entries that fit in one block after the header */
+ idxs = ((inode->i_sb->s_blocksize - sizeof(struct ext4_extent_header))
+ / sizeof(struct ext4_extent_idx));
+
+ /*
+ * If the new delayed allocation block is contiguous with the
+ * previous da block, it can share index blocks with the
+ * previous block, so we only need to allocate a new index
+ * block every idxs leaf blocks. At idxs**2 blocks, we need
+ * an additional index block, and at idxs**3 blocks, yet
+ * another index block.
+ */
+ if (ei->i_da_metadata_calc_len &&
+ ei->i_da_metadata_calc_last_lblock+1 == lblock) {
+ int num = 0;
+
+ if ((ei->i_da_metadata_calc_len % idxs) == 0)
+ num++;
+ if ((ei->i_da_metadata_calc_len % (idxs*idxs)) == 0)
+ num++;
+ if ((ei->i_da_metadata_calc_len % (idxs*idxs*idxs)) == 0) {
+ num++;
+ /* restart the run length once idxs**3 blocks were counted */
+ ei->i_da_metadata_calc_len = 0;
+ } else
+ ei->i_da_metadata_calc_len++;
+ ei->i_da_metadata_calc_last_lblock++;
+ return num;
+ }
+
+ /*
+ * In the worst case we need a new set of index blocks at
+ * every level of the inode's extent tree.
+ */
+ ei->i_da_metadata_calc_len = 1;
+ ei->i_da_metadata_calc_last_lblock = lblock;
+ return ext_depth(inode) + 1;
+}
+
+/*
+ * Largest number of entries a node at @depth may hold.  A node whose depth
+ * equals the tree depth is sized like the root in the inode, any other
+ * node like a full block; depth 0 holds extents, other depths hold index
+ * entries.
+ */
+static int
+ext4_ext_max_entries(struct inode *inode, int depth)
+{
+	int in_inode = (depth == ext_depth(inode));
+
+	if (depth == 0)
+		return in_inode ? ext4_ext_space_root(inode, 1)
+				: ext4_ext_space_block(inode, 1);
+
+	return in_inode ? ext4_ext_space_root_idx(inode, 1)
+			: ext4_ext_space_block_idx(inode, 1);
+}
+
+/*
+ * Return non-zero when @ext has a non-zero length and
+ * ext4_data_block_valid() accepts its physical block range.
+ */
+static int ext4_valid_extent(struct inode *inode, struct ext4_extent *ext)
+{
+	ext4_fsblk_t start = ext4_ext_pblock(ext);
+	int count = ext4_ext_get_actual_len(ext);
+
+	/* an extent covering no blocks is never valid */
+	if (!count)
+		return 0;
+
+	return ext4_data_block_valid(EXT4_SB(inode->i_sb), start, count);
+}
+
+/*
+ * Return non-zero when ext4_data_block_valid() accepts the single block
+ * that @ext_idx points to.
+ */
+static int ext4_valid_extent_idx(struct inode *inode,
+				 struct ext4_extent_idx *ext_idx)
+{
+	return ext4_data_block_valid(EXT4_SB(inode->i_sb),
+				     ext4_idx_pblock(ext_idx), 1);
+}
+
+/*
+ * Validate every entry of the node described by @eh.  At @depth 0 the
+ * entries are extents, at any other depth they are index entries.
+ * Returns 1 when all entries are valid (or there are none), 0 otherwise.
+ */
+static int ext4_valid_extent_entries(struct inode *inode,
+				     struct ext4_extent_header *eh,
+				     int depth)
+{
+	unsigned short left = le16_to_cpu(eh->eh_entries);
+
+	/* an empty node has nothing that could be wrong */
+	if (!left)
+		return 1;
+
+	if (depth == 0) {
+		/* leaf entries */
+		struct ext4_extent *ext;
+
+		for (ext = EXT_FIRST_EXTENT(eh); left; left--, ext++) {
+			if (!ext4_valid_extent(inode, ext))
+				return 0;
+		}
+	} else {
+		/* index entries */
+		struct ext4_extent_idx *ext_idx;
+
+		for (ext_idx = EXT_FIRST_INDEX(eh); left; left--, ext_idx++) {
+			if (!ext4_valid_extent_idx(inode, ext_idx))
+				return 0;
+		}
+	}
+	return 1;
+}
+
+/*
+ * Sanity-check the extent node header @eh, which is expected to sit at
+ * @depth in the tree of @inode.
+ *
+ * The checks run in order and the first failure is reported through
+ * ext4_error_inode() using the caller location in @function/@line.
+ * Returns 0 when every check passes and -EIO otherwise.
+ */
+static int __ext4_ext_check(const char *function, unsigned int line,
+ struct inode *inode, struct ext4_extent_header *eh,
+ int depth)
+{
+ const char *error_msg;
+ /* stays 0 in the report if a check fails before max is computed */
+ int max = 0;
+
+ if (unlikely(eh->eh_magic != EXT4_EXT_MAGIC)) {
+ error_msg = "invalid magic";
+ goto corrupted;
+ }
+ if (unlikely(le16_to_cpu(eh->eh_depth) != depth)) {
+ error_msg = "unexpected eh_depth";
+ goto corrupted;
+ }
+ if (unlikely(eh->eh_max == 0)) {
+ error_msg = "invalid eh_max";
+ goto corrupted;
+ }
+ /* capacity of a node at this depth, used as the upper bound for eh_max */
+ max = ext4_ext_max_entries(inode, depth);
+ if (unlikely(le16_to_cpu(eh->eh_max) > max)) {
+ error_msg = "too large eh_max";
+ goto corrupted;
+ }
+ if (unlikely(le16_to_cpu(eh->eh_entries) > le16_to_cpu(eh->eh_max))) {
+ error_msg = "invalid eh_entries";
+ goto corrupted;
+ }
+ /* header is sane: now validate the block numbers of every entry */
+ if (!ext4_valid_extent_entries(inode, eh, depth)) {
+ error_msg = "invalid extent entries";
+ goto corrupted;
+ }
+ return 0;
+
+corrupted:
+ /* values in parentheses are the expected max and depth */
+ ext4_error_inode(inode, function, line, 0,
+ "bad header/extent: %s - magic %x, "
+ "entries %u, max %u(%u), depth %u(%u)",
+ error_msg, le16_to_cpu(eh->eh_magic),
+ le16_to_cpu(eh->eh_entries), le16_to_cpu(eh->eh_max),
+ max, le16_to_cpu(eh->eh_depth), depth);
+
+ return -EIO;
+}
+
+#define ext4_ext_check(inode, eh, depth) \
+ __ext4_ext_check(__func__, __LINE__, inode, eh, depth)
+
+/*
+ * Check the extent header stored in the inode itself, expecting it to be
+ * at the full depth of the tree.  Returns 0 or -EIO.
+ */
+int ext4_ext_check_inode(struct inode *inode)
+{
+	struct ext4_extent_header *root = ext_inode_hdr(inode);
+
+	return ext4_ext_check(inode, root, ext_depth(inode));
+}
+
+#ifdef EXT_DEBUG
+/*
+ * Debug helper: print one item per level of @path - the index entry if the
+ * level has one, else the extent, else "[]".
+ */
+static void ext4_ext_show_path(struct inode *inode, struct ext4_ext_path *path)
+{
+	int depth = path->p_depth;
+	int level;
+
+	ext_debug("path:");
+	for (level = 0; level <= depth; level++) {
+		struct ext4_ext_path *p = &path[level];
+
+		if (p->p_idx) {
+			ext_debug(" %d->%llu", le32_to_cpu(p->p_idx->ei_block),
+				  ext4_idx_pblock(p->p_idx));
+			continue;
+		}
+		if (p->p_ext) {
+			ext_debug(" %d:[%d]%d:%llu ",
+				  le32_to_cpu(p->p_ext->ee_block),
+				  ext4_ext_is_uninitialized(p->p_ext),
+				  ext4_ext_get_actual_len(p->p_ext),
+				  ext4_ext_pblock(p->p_ext));
+			continue;
+		}
+		ext_debug(" []");
+	}
+	ext_debug("\n");
+}
+
+/*
+ * Debug helper: print every extent stored in the leaf that @path ends in.
+ * A NULL @path prints nothing.
+ */
+static void ext4_ext_show_leaf(struct inode *inode, struct ext4_ext_path *path)
+{
+	struct ext4_extent_header *eh;
+	struct ext4_extent *ex;
+	int n;
+
+	if (!path)
+		return;
+
+	eh = path[ext_depth(inode)].p_hdr;
+
+	ext_debug("Displaying leaf extents for inode %lu\n", inode->i_ino);
+
+	ex = EXT_FIRST_EXTENT(eh);
+	for (n = 0; n < le16_to_cpu(eh->eh_entries); n++) {
+		ext_debug("%d:[%d]%d:%llu ", le32_to_cpu(ex->ee_block),
+			  ext4_ext_is_uninitialized(ex),
+			  ext4_ext_get_actual_len(ex), ext4_ext_pblock(ex));
+		ex++;
+	}
+	ext_debug("\n");
+}
+
+/*
+ * Debug helper: print the entries of @path at @level, from the current one
+ * up to the node's maximum slot, as being moved to @newblock.  Index
+ * entries are printed for every level except the leaf level, where extents
+ * are printed.
+ */
+static void ext4_ext_show_move(struct inode *inode, struct ext4_ext_path *path,
+			       ext4_fsblk_t newblock, int level)
+{
+	int depth = ext_depth(inode);
+
+	if (depth == level) {
+		/* leaf level: walk the extents */
+		struct ext4_extent *ex;
+
+		for (ex = path[depth].p_ext;
+		     ex <= EXT_MAX_EXTENT(path[depth].p_hdr); ex++) {
+			ext_debug("move %d:%llu:[%d]%d in new leaf %llu\n",
+				  le32_to_cpu(ex->ee_block),
+				  ext4_ext_pblock(ex),
+				  ext4_ext_is_uninitialized(ex),
+				  ext4_ext_get_actual_len(ex),
+				  newblock);
+		}
+	} else {
+		/* interior level: walk the index entries */
+		struct ext4_extent_idx *idx;
+
+		for (idx = path[level].p_idx;
+		     idx <= EXT_MAX_INDEX(path[level].p_hdr); idx++) {
+			ext_debug("%d: move %d:%llu in new index %llu\n", level,
+				  le32_to_cpu(idx->ei_block),
+				  ext4_idx_pblock(idx),
+				  newblock);
+		}
+	}
+}
+
+#else
+#define ext4_ext_show_path(inode, path)
+#define ext4_ext_show_leaf(inode, path)
+#define ext4_ext_show_move(inode, path, newblock, level)
+#endif
+
+/*
+ * Release the buffer head held at every level of @path (levels 0 through
+ * path->p_depth) and clear the pointers.  @path must not be NULL.
+ */
+void ext4_ext_drop_refs(struct ext4_ext_path *path)
+{
+	int depth = path->p_depth;
+	int level;
+
+	for (level = 0; level <= depth; level++) {
+		if (!path[level].p_bh)
+			continue;
+		brelse(path[level].p_bh);
+		path[level].p_bh = NULL;
+	}
+}
+
+/*
+ * ext4_ext_binsearch_idx:
+ * binary search for the closest index of the given block
+ * the header must be checked before calling this
+ *
+ * On return path->p_idx points to the last index entry whose ei_block is
+ * not greater than @block, or to the first index entry when there is no
+ * such entry.
+ */
+static void
+ext4_ext_binsearch_idx(struct inode *inode,
+ struct ext4_ext_path *path, ext4_lblk_t block)
+{
+ struct ext4_extent_header *eh = path->p_hdr;
+ struct ext4_extent_idx *r, *l, *m;
+
+
+ ext_debug("binsearch for %u(idx): ", block);
+
+ /*
+ * The search starts at the second entry: the result is "l - 1", so the
+ * first entry is what is returned when every later entry starts beyond
+ * @block.
+ */
+ l = EXT_FIRST_INDEX(eh) + 1;
+ r = EXT_LAST_INDEX(eh);
+ while (l <= r) {
+ m = l + (r - l) / 2;
+ if (block < le32_to_cpu(m->ei_block))
+ r = m - 1;
+ else
+ l = m + 1;
+ ext_debug("%p(%u):%p(%u):%p(%u) ", l, le32_to_cpu(l->ei_block),
+ m, le32_to_cpu(m->ei_block),
+ r, le32_to_cpu(r->ei_block));
+ }
+
+ /* l now points one past the last entry with ei_block <= block */
+ path->p_idx = l - 1;
+ ext_debug(" -> %d->%lld ", le32_to_cpu(path->p_idx->ei_block),
+ ext4_idx_pblock(path->p_idx));
+
+#ifdef CHECK_BINSEARCH
+ /*
+ * Self-test: redo the lookup with a linear scan, checking on the way
+ * that the ei_block values are strictly increasing, and BUG() if the
+ * scan disagrees with the binary search.
+ */
+ {
+ struct ext4_extent_idx *chix, *ix;
+ int k;
+
+ chix = ix = EXT_FIRST_INDEX(eh);
+ for (k = 0; k < le16_to_cpu(eh->eh_entries); k++, ix++) {
+ if (k != 0 &&
+ le32_to_cpu(ix->ei_block) <= le32_to_cpu(ix[-1].ei_block)) {
+ printk(KERN_DEBUG "k=%d, ix=0x%p, "
+ "first=0x%p\n", k,
+ ix, EXT_FIRST_INDEX(eh));
+ printk(KERN_DEBUG "%u <= %u\n",
+ le32_to_cpu(ix->ei_block),
+ le32_to_cpu(ix[-1].ei_block));
+ }
+ BUG_ON(k && le32_to_cpu(ix->ei_block)
+ <= le32_to_cpu(ix[-1].ei_block));
+ if (block < le32_to_cpu(ix->ei_block))
+ break;
+ chix = ix;
+ }
+ BUG_ON(chix != path->p_idx);
+ }
+#endif
+
+}
+
+/*
+ * ext4_ext_binsearch:
+ * binary search for closest extent of the given block
+ * the header must be checked before calling this
+ *
+ * Assuming the extents are sorted by ee_block, path->p_ext is set to the
+ * last extent whose ee_block is <= @block, or to the first extent of the
+ * leaf when @block is smaller than every ee_block in it.  For an empty
+ * leaf the function returns without touching path->p_ext.
+ */
+static void
+ext4_ext_binsearch(struct inode *inode,
+ struct ext4_ext_path *path, ext4_lblk_t block)
+{
+ struct ext4_extent_header *eh = path->p_hdr;
+ struct ext4_extent *r, *l, *m;
+
+ if (eh->eh_entries == 0) {
+ /*
+ * this leaf is empty:
+ * we get such a leaf in split/add case
+ */
+ return;
+ }
+
+ ext_debug("binsearch for %u: ", block);
+
+ /* start one past the first extent; the answer is l - 1 */
+ l = EXT_FIRST_EXTENT(eh) + 1;
+ r = EXT_LAST_EXTENT(eh);
+
+ while (l <= r) {
+ m = l + (r - l) / 2;
+ if (block < le32_to_cpu(m->ee_block))
+ r = m - 1;
+ else
+ l = m + 1;
+ ext_debug("%p(%u):%p(%u):%p(%u) ", l, le32_to_cpu(l->ee_block),
+ m, le32_to_cpu(m->ee_block),
+ r, le32_to_cpu(r->ee_block));
+ }
+
+ /* l now points just past the last extent with ee_block <= block */
+ path->p_ext = l - 1;
+ ext_debug(" -> %d:%llu:[%d]%d ",
+ le32_to_cpu(path->p_ext->ee_block),
+ ext4_ext_pblock(path->p_ext),
+ ext4_ext_is_uninitialized(path->p_ext),
+ ext4_ext_get_actual_len(path->p_ext));
+
+#ifdef CHECK_BINSEARCH
+ /*
+ * Cross-check: linear scan that BUGs if the extents are not strictly
+ * increasing or if it disagrees with the binary search result.
+ */
+ {
+ struct ext4_extent *chex, *ex;
+ int k;
+
+ chex = ex = EXT_FIRST_EXTENT(eh);
+ for (k = 0; k < le16_to_cpu(eh->eh_entries); k++, ex++) {
+ BUG_ON(k && le32_to_cpu(ex->ee_block)
+ <= le32_to_cpu(ex[-1].ee_block));
+ if (block < le32_to_cpu(ex->ee_block))
+ break;
+ chex = ex;
+ }
+ BUG_ON(chex != path->p_ext);
+ }
+#endif
+
+}
+
+/*
+ * ext4_ext_tree_init:
+ * resets the extent header stored in the inode to an empty tree of
+ * depth 0, marks the inode dirty and invalidates the extent cache.
+ * Always returns 0.
+ */
+int ext4_ext_tree_init(handle_t *handle, struct inode *inode)
+{
+	struct ext4_extent_header *root = ext_inode_hdr(inode);
+
+	root->eh_magic = EXT4_EXT_MAGIC;
+	root->eh_entries = 0;
+	root->eh_depth = 0;
+	root->eh_max = cpu_to_le16(ext4_ext_space_root(inode, 0));
+
+	ext4_mark_inode_dirty(handle, inode);
+	ext4_ext_invalidate_cache(inode);
+
+	return 0;
+}
+
+/*
+ * ext4_ext_find_extent:
+ * walks the extent tree from the root header stored in the inode down to
+ * the leaf, following at every index level the entry picked by
+ * ext4_ext_binsearch_idx() for @block, and records each visited node in
+ * the path array (path[0] is the root, path[depth] the leaf).
+ *
+ * If @path is NULL an array of depth + 2 entries is allocated here;
+ * otherwise the caller's array is filled in (its size is not checked
+ * here).
+ *
+ * Returns the path on success.  On failure returns ERR_PTR(-ENOMEM) if
+ * the array could not be allocated and ERR_PTR(-EIO) otherwise; in the
+ * latter case the buffer references gathered so far are dropped and the
+ * array is freed only if it was allocated here.
+ */
+struct ext4_ext_path *
+ext4_ext_find_extent(struct inode *inode, ext4_lblk_t block,
+ struct ext4_ext_path *path)
+{
+ struct ext4_extent_header *eh;
+ struct buffer_head *bh;
+ short int depth, i, ppos = 0, alloc = 0;
+
+ eh = ext_inode_hdr(inode);
+ depth = ext_depth(inode);
+
+ /* account possible depth increase */
+ if (!path) {
+ path = kzalloc(sizeof(struct ext4_ext_path) * (depth + 2),
+ GFP_NOFS);
+ if (!path)
+ return ERR_PTR(-ENOMEM);
+ alloc = 1;
+ }
+ /* the root lives in the inode, so it has no buffer head */
+ path[0].p_hdr = eh;
+ path[0].p_bh = NULL;
+
+ i = depth;
+ /* walk through the tree */
+ while (i) {
+ int need_to_validate = 0;
+
+ ext_debug("depth %d: num %d, max %d\n",
+ ppos, le16_to_cpu(eh->eh_entries), le16_to_cpu(eh->eh_max));
+
+ ext4_ext_binsearch_idx(inode, path + ppos, block);
+ path[ppos].p_block = ext4_idx_pblock(path[ppos].p_idx);
+ path[ppos].p_depth = i;
+ path[ppos].p_ext = NULL;
+
+ bh = sb_getblk(inode->i_sb, path[ppos].p_block);
+ if (unlikely(!bh))
+ goto err;
+ /* only blocks actually read from disk here get validated below */
+ if (!bh_uptodate_or_lock(bh)) {
+ trace_ext4_ext_load_extent(inode, block,
+ path[ppos].p_block);
+ if (bh_submit_read(bh) < 0) {
+ put_bh(bh);
+ goto err;
+ }
+ /* validate the extent entries */
+ need_to_validate = 1;
+ }
+ eh = ext_block_hdr(bh);
+ ppos++;
+ if (unlikely(ppos > depth)) {
+ put_bh(bh);
+ EXT4_ERROR_INODE(inode,
+ "ppos %d > depth %d", ppos, depth);
+ goto err;
+ }
+ /* from here on bh is owned by the path and released via drop_refs */
+ path[ppos].p_bh = bh;
+ path[ppos].p_hdr = eh;
+ i--;
+
+ if (need_to_validate && ext4_ext_check(inode, eh, i))
+ goto err;
+ }
+
+ /* leaf level: i is 0 here */
+ path[ppos].p_depth = i;
+ path[ppos].p_ext = NULL;
+ path[ppos].p_idx = NULL;
+
+ /* find extent */
+ ext4_ext_binsearch(inode, path + ppos, block);
+ /* if not an empty leaf */
+ if (path[ppos].p_ext)
+ path[ppos].p_block = ext4_ext_pblock(path[ppos].p_ext);
+
+ ext4_ext_show_path(inode, path);
+
+ return path;
+
+err:
+ ext4_ext_drop_refs(path);
+ if (alloc)
+ kfree(path);
+ return ERR_PTR(-EIO);
+}
+
+/*
+ * ext4_ext_insert_index:
+ * insert new index [@logical;@ptr] into the block at @curp;
+ * check where to insert: before @curp or after @curp
+ *
+ * The new entry goes right after curp->p_idx when @logical is larger than
+ * that entry's ei_block, otherwise in its place; later entries are shifted
+ * up by one slot.
+ *
+ * Returns 0 on success, -EIO when a sanity check fails (duplicate logical
+ * block, full node, slot out of range), or the error reported by
+ * ext4_ext_get_access() / ext4_ext_dirty().
+ */
+static int ext4_ext_insert_index(handle_t *handle, struct inode *inode,
+ struct ext4_ext_path *curp,
+ int logical, ext4_fsblk_t ptr)
+{
+ struct ext4_extent_idx *ix;
+ int len, err;
+
+ err = ext4_ext_get_access(handle, inode, curp);
+ if (err)
+ return err;
+
+ /* an index with the same starting block must not exist yet */
+ if (unlikely(logical == le32_to_cpu(curp->p_idx->ei_block))) {
+ EXT4_ERROR_INODE(inode,
+ "logical %d == ei_block %d!",
+ logical, le32_to_cpu(curp->p_idx->ei_block));
+ return -EIO;
+ }
+
+ /* the node needs a free slot */
+ if (unlikely(le16_to_cpu(curp->p_hdr->eh_entries)
+ >= le16_to_cpu(curp->p_hdr->eh_max))) {
+ EXT4_ERROR_INODE(inode,
+ "eh_entries %d >= eh_max %d!",
+ le16_to_cpu(curp->p_hdr->eh_entries),
+ le16_to_cpu(curp->p_hdr->eh_max));
+ return -EIO;
+ }
+
+ if (logical > le32_to_cpu(curp->p_idx->ei_block)) {
+ /* insert after */
+ ext_debug("insert new index %d after: %llu\n", logical, ptr);
+ ix = curp->p_idx + 1;
+ } else {
+ /* insert before */
+ ext_debug("insert new index %d before: %llu\n", logical, ptr);
+ ix = curp->p_idx;
+ }
+
+ /* number of existing entries at or after the insertion slot */
+ len = EXT_LAST_INDEX(curp->p_hdr) - ix + 1;
+ BUG_ON(len < 0);
+ if (len > 0) {
+ ext_debug("insert new index %d: "
+ "move %d indices from 0x%p to 0x%p\n",
+ logical, len, ix, ix + 1);
+ memmove(ix + 1, ix, len * sizeof(struct ext4_extent_idx));
+ }
+
+ if (unlikely(ix > EXT_MAX_INDEX(curp->p_hdr))) {
+ EXT4_ERROR_INODE(inode, "ix > EXT_MAX_INDEX!");
+ return -EIO;
+ }
+
+ ix->ei_block = cpu_to_le32(logical);
+ ext4_idx_store_pblock(ix, ptr);
+ le16_add_cpu(&curp->p_hdr->eh_entries, 1);
+
+ /* checked after the count was bumped, so LAST includes the new slot */
+ if (unlikely(ix > EXT_LAST_INDEX(curp->p_hdr))) {
+ EXT4_ERROR_INODE(inode, "ix > EXT_LAST_INDEX!");
+ return -EIO;
+ }
+
+ err = ext4_ext_dirty(handle, inode, curp);
+ ext4_std_error(inode->i_sb, err);
+
+ return err;
+}
+
+/*
+ * ext4_ext_split:
+ * inserts new subtree into the path, using free index entry
+ * at depth @at:
+ * - allocates all needed blocks (new leaf and all intermediate index blocks)
+ * - makes decision where to split
+ * - moves remaining extents and index entries (right to the split point)
+ *   into the newly allocated blocks
+ * - initializes subtree
+ *
+ * @newext supplies the border block when the current extent is the last
+ * slot of its leaf, and is handed to the metadata block allocator.
+ *
+ * Returns 0 on success or a negative errno.  On error every block that
+ * was recorded in the allocation array is freed again.
+ */
+static int ext4_ext_split(handle_t *handle, struct inode *inode,
+			  unsigned int flags,
+			  struct ext4_ext_path *path,
+			  struct ext4_extent *newext, int at)
+{
+	struct buffer_head *bh = NULL;
+	int depth = ext_depth(inode);
+	struct ext4_extent_header *neh;
+	struct ext4_extent_idx *fidx;
+	int i = at, k, m, a;
+	ext4_fsblk_t newblock, oldblock;
+	__le32 border;
+	ext4_fsblk_t *ablocks = NULL; /* array of allocated blocks */
+	int err = 0;
+
+	/* make decision: where to split? */
+	/* FIXME: now decision is simplest: at current extent */
+
+	/* if current leaf will be split, then we should use
+	 * border from split point */
+	if (unlikely(path[depth].p_ext > EXT_MAX_EXTENT(path[depth].p_hdr))) {
+		EXT4_ERROR_INODE(inode, "p_ext > EXT_MAX_EXTENT!");
+		return -EIO;
+	}
+	if (path[depth].p_ext != EXT_MAX_EXTENT(path[depth].p_hdr)) {
+		border = path[depth].p_ext[1].ee_block;
+		ext_debug("leaf will be split."
+				" next leaf starts at %d\n",
+				  le32_to_cpu(border));
+	} else {
+		border = newext->ee_block;
+		ext_debug("leaf will be added."
+				" next leaf starts at %d\n",
+				le32_to_cpu(border));
+	}
+
+	/*
+	 * If error occurs, then we break processing
+	 * and mark filesystem read-only. index won't
+	 * be inserted and tree will be in consistent
+	 * state. Next mount will repair buffers too.
+	 */
+
+	/*
+	 * Get array to track all allocated blocks.
+	 * We need this to handle errors and free blocks
+	 * upon them.
+	 */
+	ablocks = kzalloc(sizeof(ext4_fsblk_t) * depth, GFP_NOFS);
+	if (!ablocks)
+		return -ENOMEM;
+
+	/* allocate all needed blocks */
+	ext_debug("allocate %d blocks for indexes/leaf\n", depth - at);
+	for (a = 0; a < depth - at; a++) {
+		newblock = ext4_ext_new_meta_block(handle, inode, path,
+						   newext, &err, flags);
+		if (newblock == 0)
+			goto cleanup;
+		ablocks[a] = newblock;
+	}
+
+	/* initialize new leaf */
+	newblock = ablocks[--a];
+	if (unlikely(newblock == 0)) {
+		EXT4_ERROR_INODE(inode, "newblock == 0!");
+		err = -EIO;
+		goto cleanup;
+	}
+	bh = sb_getblk(inode->i_sb, newblock);
+	if (!bh) {
+		err = -EIO;
+		goto cleanup;
+	}
+	lock_buffer(bh);
+
+	/* on failure the cleanup path unlocks and releases bh */
+	err = ext4_journal_get_create_access(handle, bh);
+	if (err)
+		goto cleanup;
+
+	neh = ext_block_hdr(bh);
+	neh->eh_entries = 0;
+	neh->eh_max = cpu_to_le16(ext4_ext_space_block(inode, 0));
+	neh->eh_magic = EXT4_EXT_MAGIC;
+	neh->eh_depth = 0;
+
+	/* move remainder of path[depth] to the new leaf */
+	if (unlikely(path[depth].p_hdr->eh_entries !=
+		     path[depth].p_hdr->eh_max)) {
+		/* header fields are little-endian on disk: convert for output */
+		EXT4_ERROR_INODE(inode, "eh_entries %d != eh_max %d!",
+				 le16_to_cpu(path[depth].p_hdr->eh_entries),
+				 le16_to_cpu(path[depth].p_hdr->eh_max));
+		err = -EIO;
+		goto cleanup;
+	}
+	/* start copy from next extent */
+	m = EXT_MAX_EXTENT(path[depth].p_hdr) - path[depth].p_ext++;
+	ext4_ext_show_move(inode, path, newblock, depth);
+	if (m) {
+		struct ext4_extent *ex;
+		ex = EXT_FIRST_EXTENT(neh);
+		memmove(ex, path[depth].p_ext, sizeof(struct ext4_extent) * m);
+		le16_add_cpu(&neh->eh_entries, m);
+	}
+
+	set_buffer_uptodate(bh);
+	unlock_buffer(bh);
+
+	err = ext4_handle_dirty_metadata(handle, inode, bh);
+	if (err)
+		goto cleanup;
+	brelse(bh);
+	bh = NULL;
+
+	/* correct old leaf */
+	if (m) {
+		err = ext4_ext_get_access(handle, inode, path + depth);
+		if (err)
+			goto cleanup;
+		le16_add_cpu(&path[depth].p_hdr->eh_entries, -m);
+		err = ext4_ext_dirty(handle, inode, path + depth);
+		if (err)
+			goto cleanup;
+
+	}
+
+	/* create intermediate indexes */
+	k = depth - at - 1;
+	if (unlikely(k < 0)) {
+		EXT4_ERROR_INODE(inode, "k %d < 0!", k);
+		err = -EIO;
+		goto cleanup;
+	}
+	if (k)
+		ext_debug("create %d intermediate indices\n", k);
+	/* insert new index into current index block */
+	/* current depth stored in i var */
+	i = depth - 1;
+	while (k--) {
+		oldblock = newblock;
+		newblock = ablocks[--a];
+		bh = sb_getblk(inode->i_sb, newblock);
+		if (!bh) {
+			err = -EIO;
+			goto cleanup;
+		}
+		lock_buffer(bh);
+
+		err = ext4_journal_get_create_access(handle, bh);
+		if (err)
+			goto cleanup;
+
+		/* new index block starts with one entry: border -> oldblock */
+		neh = ext_block_hdr(bh);
+		neh->eh_entries = cpu_to_le16(1);
+		neh->eh_magic = EXT4_EXT_MAGIC;
+		neh->eh_max = cpu_to_le16(ext4_ext_space_block_idx(inode, 0));
+		neh->eh_depth = cpu_to_le16(depth - i);
+		fidx = EXT_FIRST_INDEX(neh);
+		fidx->ei_block = border;
+		ext4_idx_store_pblock(fidx, oldblock);
+
+		ext_debug("int.index at %d (block %llu): %u -> %llu\n",
+				i, newblock, le32_to_cpu(border), oldblock);
+
+		/* move remainder of path[i] to the new index block */
+		if (unlikely(EXT_MAX_INDEX(path[i].p_hdr) !=
+					EXT_LAST_INDEX(path[i].p_hdr))) {
+			/*
+			 * path[i] is an index level: its p_ext is NULL
+			 * (ext4_ext_find_extent() clears it), so report
+			 * the index entry instead of dereferencing p_ext.
+			 */
+			EXT4_ERROR_INODE(inode,
+					 "EXT_MAX_INDEX != EXT_LAST_INDEX ei_block %u!",
+					 le32_to_cpu(path[i].p_idx->ei_block));
+			err = -EIO;
+			goto cleanup;
+		}
+		/* start copy indexes */
+		m = EXT_MAX_INDEX(path[i].p_hdr) - path[i].p_idx++;
+		ext_debug("cur 0x%p, last 0x%p\n", path[i].p_idx,
+				EXT_MAX_INDEX(path[i].p_hdr));
+		ext4_ext_show_move(inode, path, newblock, i);
+		if (m) {
+			memmove(++fidx, path[i].p_idx,
+				sizeof(struct ext4_extent_idx) * m);
+			le16_add_cpu(&neh->eh_entries, m);
+		}
+		set_buffer_uptodate(bh);
+		unlock_buffer(bh);
+
+		err = ext4_handle_dirty_metadata(handle, inode, bh);
+		if (err)
+			goto cleanup;
+		brelse(bh);
+		bh = NULL;
+
+		/* correct old index */
+		if (m) {
+			err = ext4_ext_get_access(handle, inode, path + i);
+			if (err)
+				goto cleanup;
+			le16_add_cpu(&path[i].p_hdr->eh_entries, -m);
+			err = ext4_ext_dirty(handle, inode, path + i);
+			if (err)
+				goto cleanup;
+		}
+
+		i--;
+	}
+
+	/* insert new index */
+	err = ext4_ext_insert_index(handle, inode, path + at,
+				    le32_to_cpu(border), newblock);
+
+cleanup:
+	if (bh) {
+		if (buffer_locked(bh))
+			unlock_buffer(bh);
+		brelse(bh);
+	}
+
+	if (err) {
+		/* free all allocated blocks in error case */
+		for (i = 0; i < depth; i++) {
+			if (!ablocks[i])
+				continue;
+			ext4_free_blocks(handle, inode, NULL, ablocks[i], 1,
+					 EXT4_FREE_BLOCKS_METADATA);
+		}
+	}
+	kfree(ablocks);
+
+	return err;
+}
+
+/*
+ * ext4_ext_grow_indepth:
+ * implements tree growing procedure:
+ * - allocates new block
+ * - moves top-level data (index block or leaf) into the new block
+ * - initializes new top-level, creating index that points to the
+ * just created block
+ *
+ * Returns 0 on success or a negative errno.  Note that the error paths
+ * below release the buffer head but do not free the newly allocated block.
+ */
+static int ext4_ext_grow_indepth(handle_t *handle, struct inode *inode,
+ unsigned int flags,
+ struct ext4_extent *newext)
+{
+ struct ext4_extent_header *neh;
+ struct buffer_head *bh;
+ ext4_fsblk_t newblock;
+ int err = 0;
+
+ newblock = ext4_ext_new_meta_block(handle, inode, NULL,
+ newext, &err, flags);
+ if (newblock == 0)
+ return err;
+
+ bh = sb_getblk(inode->i_sb, newblock);
+ if (!bh) {
+ err = -EIO;
+ ext4_std_error(inode->i_sb, err);
+ return err;
+ }
+ lock_buffer(bh);
+
+ err = ext4_journal_get_create_access(handle, bh);
+ if (err) {
+ unlock_buffer(bh);
+ goto out;
+ }
+
+ /* move top-level index/leaf into new block */
+ memmove(bh->b_data, EXT4_I(inode)->i_data,
+ sizeof(EXT4_I(inode)->i_data));
+
+ /* set size of new block */
+ neh = ext_block_hdr(bh);
+ /* old root could have indexes or leaves
+ * so calculate e_max right way */
+ if (ext_depth(inode))
+ neh->eh_max = cpu_to_le16(ext4_ext_space_block_idx(inode, 0));
+ else
+ neh->eh_max = cpu_to_le16(ext4_ext_space_block(inode, 0));
+ neh->eh_magic = EXT4_EXT_MAGIC;
+ set_buffer_uptodate(bh);
+ unlock_buffer(bh);
+
+ err = ext4_handle_dirty_metadata(handle, inode, bh);
+ if (err)
+ goto out;
+
+ /* Update top-level index: num,max,pointer */
+ /* neh now refers to the root header inside the inode */
+ neh = ext_inode_hdr(inode);
+ neh->eh_entries = cpu_to_le16(1);
+ ext4_idx_store_pblock(EXT_FIRST_INDEX(neh), newblock);
+ if (neh->eh_depth == 0) {
+ /* Root extent block becomes index block */
+ neh->eh_max = cpu_to_le16(ext4_ext_space_root_idx(inode, 0));
+ EXT_FIRST_INDEX(neh)->ei_block =
+ EXT_FIRST_EXTENT(neh)->ee_block;
+ }
+ ext_debug("new root: num %d(%d), lblock %d, ptr %llu\n",
+ le16_to_cpu(neh->eh_entries), le16_to_cpu(neh->eh_max),
+ le32_to_cpu(EXT_FIRST_INDEX(neh)->ei_block),
+ ext4_idx_pblock(EXT_FIRST_INDEX(neh)));
+
+ /* the tree is now one level deeper */
+ neh->eh_depth = cpu_to_le16(le16_to_cpu(neh->eh_depth) + 1);
+ ext4_mark_inode_dirty(handle, inode);
+out:
+ brelse(bh);
+
+ return err;
+}
+
+/*
+ * ext4_ext_create_new_leaf:
+ * finds empty index and adds new leaf.
+ * if no free index is found, then it requests in-depth growing.
+ *
+ * After either operation the caller's @path array is refilled in place by
+ * ext4_ext_find_extent() for the start block of @newext.
+ * Returns 0 on success or a negative errno.
+ */
+static int ext4_ext_create_new_leaf(handle_t *handle, struct inode *inode,
+ unsigned int flags,
+ struct ext4_ext_path *path,
+ struct ext4_extent *newext)
+{
+ struct ext4_ext_path *curp;
+ int depth, i, err = 0;
+
+repeat:
+ i = depth = ext_depth(inode);
+
+ /* walk up to the tree and look for free index entry */
+ curp = path + depth;
+ while (i > 0 && !EXT_HAS_FREE_INDEX(curp)) {
+ i--;
+ curp--;
+ }
+
+ /* we use already allocated block for index block,
+ * so subsequent data blocks should be contiguous */
+ if (EXT_HAS_FREE_INDEX(curp)) {
+ /* if we found index with free entry, then use that
+ * entry: create all needed subtree and add new leaf */
+ err = ext4_ext_split(handle, inode, flags, path, newext, i);
+ if (err)
+ goto out;
+
+ /* refill path */
+ ext4_ext_drop_refs(path);
+ path = ext4_ext_find_extent(inode,
+ (ext4_lblk_t)le32_to_cpu(newext->ee_block),
+ path);
+ if (IS_ERR(path))
+ err = PTR_ERR(path);
+ } else {
+ /* tree is full, time to grow in depth */
+ err = ext4_ext_grow_indepth(handle, inode, flags, newext);
+ if (err)
+ goto out;
+
+ /* refill path */
+ ext4_ext_drop_refs(path);
+ path = ext4_ext_find_extent(inode,
+ (ext4_lblk_t)le32_to_cpu(newext->ee_block),
+ path);
+ if (IS_ERR(path)) {
+ err = PTR_ERR(path);
+ goto out;
+ }
+
+ /*
+ * only first (depth 0 -> 1) produces free space;
+ * in all other cases we have to split the grown tree
+ */
+ depth = ext_depth(inode);
+ if (path[depth].p_hdr->eh_entries == path[depth].p_hdr->eh_max) {
+ /* now we need to split */
+ goto repeat;
+ }
+ }
+
+out:
+ return err;
+}
+
+/*
+ * search the closest allocated block to the left for *logical
+ * and returns it at @logical + its physical address at @phys
+ * if *logical is the smallest allocated block, the function
+ * returns 0 at @phys
+ * return value contains 0 (success) or error code
+ *
+ * -EIO is returned when @path is NULL, when the path is not the leftmost
+ * one although *logical lies before its extent, or when *logical falls
+ * inside the extent found in the path.
+ */
+static int ext4_ext_search_left(struct inode *inode,
+ struct ext4_ext_path *path,
+ ext4_lblk_t *logical, ext4_fsblk_t *phys)
+{
+ struct ext4_extent_idx *ix;
+ struct ext4_extent *ex;
+ int depth, ee_len;
+
+ if (unlikely(path == NULL)) {
+ EXT4_ERROR_INODE(inode, "path == NULL *logical %d!", *logical);
+ return -EIO;
+ }
+ depth = path->p_depth;
+ *phys = 0;
+
+ /* empty tree: nothing allocated to the left */
+ if (depth == 0 && path->p_ext == NULL)
+ return 0;
+
+ /* usually extent in the path covers blocks smaller
+ * than *logical, but it can be that extent is the
+ * first one in the file */
+
+ ex = path[depth].p_ext;
+ ee_len = ext4_ext_get_actual_len(ex);
+ if (*logical < le32_to_cpu(ex->ee_block)) {
+ /*
+ * *logical precedes the found extent, so that extent has to be
+ * the first entry at every level of the path.
+ */
+ if (unlikely(EXT_FIRST_EXTENT(path[depth].p_hdr) != ex)) {
+ EXT4_ERROR_INODE(inode,
+ "EXT_FIRST_EXTENT != ex *logical %d ee_block %d!",
+ *logical, le32_to_cpu(ex->ee_block));
+ return -EIO;
+ }
+ while (--depth >= 0) {
+ ix = path[depth].p_idx;
+ if (unlikely(ix != EXT_FIRST_INDEX(path[depth].p_hdr))) {
+ EXT4_ERROR_INODE(inode,
+ "ix (%d) != EXT_FIRST_INDEX (%d) (depth %d)!",
+ ix != NULL ? le32_to_cpu(ix->ei_block) : 0,
+ EXT_FIRST_INDEX(path[depth].p_hdr) != NULL ?
+ le32_to_cpu(EXT_FIRST_INDEX(path[depth].p_hdr)->ei_block) : 0,
+ depth);
+ return -EIO;
+ }
+ }
+ return 0;
+ }
+
+ /* *logical must not lie inside the found extent */
+ if (unlikely(*logical < (le32_to_cpu(ex->ee_block) + ee_len))) {
+ EXT4_ERROR_INODE(inode,
+ "logical %d < ee_block %d + ee_len %d!",
+ *logical, le32_to_cpu(ex->ee_block), ee_len);
+ return -EIO;
+ }
+
+ /* report the last block of the extent to the left */
+ *logical = le32_to_cpu(ex->ee_block) + ee_len - 1;
+ *phys = ext4_ext_pblock(ex) + ee_len - 1;
+ return 0;
+}
+
+/*
+ * search the closest allocated block to the right for *logical
+ * and returns it at @logical + its physical address at @phys
+ * if *logical is the largest allocated block, the function
+ * returns 0 at @phys
+ * return value contains 0 (success) or error code
+ *
+ * When an extent to the right is found, *ret_ex is set to it; on the
+ * paths that return early with *phys == 0, *ret_ex is left untouched.
+ */
+static int ext4_ext_search_right(struct inode *inode,
+ struct ext4_ext_path *path,
+ ext4_lblk_t *logical, ext4_fsblk_t *phys,
+ struct ext4_extent **ret_ex)
+{
+ struct buffer_head *bh = NULL;
+ struct ext4_extent_header *eh;
+ struct ext4_extent_idx *ix;
+ struct ext4_extent *ex;
+ ext4_fsblk_t block;
+ int depth; /* Note, NOT eh_depth; depth from top of tree */
+ int ee_len;
+
+ if (unlikely(path == NULL)) {
+ EXT4_ERROR_INODE(inode, "path == NULL *logical %d!", *logical);
+ return -EIO;
+ }
+ depth = path->p_depth;
+ *phys = 0;
+
+ /* empty tree: nothing allocated to the right */
+ if (depth == 0 && path->p_ext == NULL)
+ return 0;
+
+ /* usually extent in the path covers blocks smaller
+ * than *logical, but it can be that extent is the
+ * first one in the file */
+
+ ex = path[depth].p_ext;
+ ee_len = ext4_ext_get_actual_len(ex);
+ if (*logical < le32_to_cpu(ex->ee_block)) {
+ /*
+ * *logical precedes the found extent, so that extent has to be
+ * the first entry at every level; it is itself the answer.
+ */
+ if (unlikely(EXT_FIRST_EXTENT(path[depth].p_hdr) != ex)) {
+ EXT4_ERROR_INODE(inode,
+ "first_extent(path[%d].p_hdr) != ex",
+ depth);
+ return -EIO;
+ }
+ while (--depth >= 0) {
+ ix = path[depth].p_idx;
+ if (unlikely(ix != EXT_FIRST_INDEX(path[depth].p_hdr))) {
+ EXT4_ERROR_INODE(inode,
+ "ix != EXT_FIRST_INDEX *logical %d!",
+ *logical);
+ return -EIO;
+ }
+ }
+ goto found_extent;
+ }
+
+ /* *logical must not lie inside the found extent */
+ if (unlikely(*logical < (le32_to_cpu(ex->ee_block) + ee_len))) {
+ EXT4_ERROR_INODE(inode,
+ "logical %d < ee_block %d + ee_len %d!",
+ *logical, le32_to_cpu(ex->ee_block), ee_len);
+ return -EIO;
+ }
+
+ if (ex != EXT_LAST_EXTENT(path[depth].p_hdr)) {
+ /* next allocated block in this leaf */
+ ex++;
+ goto found_extent;
+ }
+
+ /* go up and search for index to the right */
+ while (--depth >= 0) {
+ ix = path[depth].p_idx;
+ if (ix != EXT_LAST_INDEX(path[depth].p_hdr))
+ goto got_index;
+ }
+
+ /* we've gone up to the root and found no index to the right */
+ return 0;
+
+got_index:
+ /* we've found index to the right, let's
+ * follow it and find the closest allocated
+ * block to the right */
+ ix++;
+ block = ext4_idx_pblock(ix);
+ /* descend along the first index of each block until the leaf level */
+ while (++depth < path->p_depth) {
+ bh = sb_bread(inode->i_sb, block);
+ if (bh == NULL)
+ return -EIO;
+ eh = ext_block_hdr(bh);
+ /* subtract from p_depth to get proper eh_depth */
+ if (ext4_ext_check(inode, eh, path->p_depth - depth)) {
+ put_bh(bh);
+ return -EIO;
+ }
+ ix = EXT_FIRST_INDEX(eh);
+ block = ext4_idx_pblock(ix);
+ put_bh(bh);
+ }
+
+ /* depth == path->p_depth here, so the expected eh_depth is 0 (leaf) */
+ bh = sb_bread(inode->i_sb, block);
+ if (bh == NULL)
+ return -EIO;
+ eh = ext_block_hdr(bh);
+ if (ext4_ext_check(inode, eh, path->p_depth - depth)) {
+ put_bh(bh);
+ return -EIO;
+ }
+ ex = EXT_FIRST_EXTENT(eh);
+found_extent:
+ *logical = le32_to_cpu(ex->ee_block);
+ *phys = ext4_ext_pblock(ex);
+ /*
+ * NOTE(review): when the extent came from the leaf read above, *ret_ex
+ * points into bh's data and the reference is dropped right below --
+ * confirm what callers assume about its lifetime.
+ */
+ *ret_ex = ex;
+ if (bh)
+ put_bh(bh);
+ return 0;
+}
+
+/*
+ * ext4_ext_next_allocated_block:
+ * returns the logical start block of whatever follows the current path
+ * position: the next extent in the leaf if there is one, otherwise the
+ * next index entry of the closest ancestor that has one.  Returns
+ * EXT_MAX_BLOCKS when nothing follows (or the tree is empty).
+ * NOTE: the block number stored in an index entry is treated as an
+ * allocated block, so index entries have to be consistent with leaves.
+ */
+static ext4_lblk_t
+ext4_ext_next_allocated_block(struct ext4_ext_path *path)
+{
+	int leaf, lvl;
+
+	BUG_ON(path == NULL);
+	leaf = path->p_depth;
+
+	if (leaf == 0 && path->p_ext == NULL)
+		return EXT_MAX_BLOCKS;
+
+	/* bottom-up: leaf first, then each index level up to the root */
+	for (lvl = leaf; lvl >= 0; lvl--) {
+		struct ext4_ext_path *node = path + lvl;
+
+		if (lvl == path->p_depth) {
+			/* leaf */
+			if (!node->p_ext ||
+			    node->p_ext == EXT_LAST_EXTENT(node->p_hdr))
+				continue;
+			return le32_to_cpu(node->p_ext[1].ee_block);
+		}
+
+		/* index */
+		if (node->p_idx != EXT_LAST_INDEX(node->p_hdr))
+			return le32_to_cpu(node->p_idx[1].ei_block);
+	}
+
+	return EXT_MAX_BLOCKS;
+}
+
+/*
+ * ext4_ext_next_leaf_block:
+ * returns the logical block stored in the index entry that follows the
+ * current one at the lowest index level where such an entry exists, i.e.
+ * the first block of the next leaf; EXT_MAX_BLOCKS if there is none or
+ * the tree has depth 0.
+ */
+static ext4_lblk_t ext4_ext_next_leaf_block(struct ext4_ext_path *path)
+{
+	int depth, lvl;
+
+	BUG_ON(path == NULL);
+	depth = path->p_depth;
+
+	/*
+	 * Start at the index level right above the leaf; for a zero-depth
+	 * tree the loop body never runs.
+	 */
+	for (lvl = depth - 1; lvl >= 0; lvl--) {
+		if (path[lvl].p_idx == EXT_LAST_INDEX(path[lvl].p_hdr))
+			continue;
+		return (ext4_lblk_t)
+			le32_to_cpu(path[lvl].p_idx[1].ei_block);
+	}
+
+	return EXT_MAX_BLOCKS;
+}
+
+/*
+ * ext4_ext_correct_indexes:
+ * if leaf gets modified and modified extent is first in the leaf,
+ * then we have to correct all indexes above.
+ * TODO: do we need to correct tree in all cases?
+ *
+ * Returns 0 on success (including when nothing had to be corrected),
+ * -EIO if the leaf entry of @path has no header or extent, or the error
+ * from ext4_ext_get_access() / ext4_ext_dirty().
+ */
+static int ext4_ext_correct_indexes(handle_t *handle, struct inode *inode,
+ struct ext4_ext_path *path)
+{
+ struct ext4_extent_header *eh;
+ int depth = ext_depth(inode);
+ struct ext4_extent *ex;
+ __le32 border;
+ int k, err = 0;
+
+ eh = path[depth].p_hdr;
+ ex = path[depth].p_ext;
+
+ if (unlikely(ex == NULL || eh == NULL)) {
+ EXT4_ERROR_INODE(inode,
+ "ex %p == NULL or eh %p == NULL", ex, eh);
+ return -EIO;
+ }
+
+ if (depth == 0) {
+ /* there is no tree at all */
+ return 0;
+ }
+
+ if (ex != EXT_FIRST_EXTENT(eh)) {
+ /* we correct tree if first leaf got modified only */
+ return 0;
+ }
+
+ /*
+ * TODO: we need correction if border is smaller than current one
+ */
+ /* the index right above the leaf is always updated */
+ k = depth - 1;
+ border = path[depth].p_ext->ee_block;
+ err = ext4_ext_get_access(handle, inode, path + k);
+ if (err)
+ return err;
+ path[k].p_idx->ei_block = border;
+ err = ext4_ext_dirty(handle, inode, path + k);
+ if (err)
+ return err;
+
+ /*
+ * Higher levels are updated only while the entry one level below is
+ * the first one in its block.
+ */
+ while (k--) {
+ /* change all left-side indexes */
+ if (path[k+1].p_idx != EXT_FIRST_INDEX(path[k+1].p_hdr))
+ break;
+ err = ext4_ext_get_access(handle, inode, path + k);
+ if (err)
+ break;
+ path[k].p_idx->ei_block = border;
+ err = ext4_ext_dirty(handle, inode, path + k);
+ if (err)
+ break;
+ }
+
+ return err;
+}
+
+/*
+ * ext4_can_extents_be_merged:
+ * returns 1 if @ex2 can be appended to @ex1, 0 otherwise.  Both extents
+ * must have the same initialized/uninitialized state, @ex2 must start
+ * logically and physically right where @ex1 ends, and the combined
+ * length must not exceed the maximum for that state.
+ */
+int
+ext4_can_extents_be_merged(struct inode *inode, struct ext4_extent *ex1,
+				struct ext4_extent *ex2)
+{
+	unsigned short len1, len2, limit;
+
+	/* mixing an uninitialized extent with an initialized one is refused */
+	if (ext4_ext_is_uninitialized(ex1) ^ ext4_ext_is_uninitialized(ex2))
+		return 0;
+
+	limit = ext4_ext_is_uninitialized(ex1) ?
+			EXT_UNINIT_MAX_LEN : EXT_INIT_MAX_LEN;
+
+	len1 = ext4_ext_get_actual_len(ex1);
+	len2 = ext4_ext_get_actual_len(ex2);
+
+	/* logical ranges must be adjacent */
+	if (le32_to_cpu(ex1->ee_block) + len1 !=
+			le32_to_cpu(ex2->ee_block))
+		return 0;
+
+	/*
+	 * The merged length has to stay within the limit so that the top
+	 * bit of ee_len does not get set by the merge (this keeps room for
+	 * preallocated extents as an RO_COMPAT feature).
+	 */
+	if (len1 + len2 > limit)
+		return 0;
+#ifdef AGGRESSIVE_TEST
+	if (len1 >= 4)
+		return 0;
+#endif
+
+	/* finally, physical ranges must be adjacent as well */
+	return ext4_ext_pblock(ex1) + len1 == ext4_ext_pblock(ex2);
+}
+
+/*
+ * This function tries to merge the "ex" extent to the next extent in the tree.
+ * It always tries to merge towards right. If you want to merge towards
+ * left, pass "ex - 1" as argument instead of "ex".
+ * Returns 0 if the extents (ex and ex+1) were _not_ merged and returns
+ * 1 if they got merged.
+ *
+ * The merge is repeated for as long as the (grown) "ex" can also absorb
+ * its new right neighbour.  Only the in-memory leaf is modified here; no
+ * journal access or dirtying happens in this function.
+ */
+static int ext4_ext_try_to_merge_right(struct inode *inode,
+ struct ext4_ext_path *path,
+ struct ext4_extent *ex)
+{
+ struct ext4_extent_header *eh;
+ unsigned int depth, len;
+ int merge_done = 0;
+ int uninitialized = 0;
+
+ depth = ext_depth(inode);
+ BUG_ON(path[depth].p_hdr == NULL);
+ eh = path[depth].p_hdr;
+
+ while (ex < EXT_LAST_EXTENT(eh)) {
+ if (!ext4_can_extents_be_merged(inode, ex, ex + 1))
+ break;
+ /* merge with next extent! */
+ /* writing the summed length clears the uninit mark; restore it below */
+ if (ext4_ext_is_uninitialized(ex))
+ uninitialized = 1;
+ ex->ee_len = cpu_to_le16(ext4_ext_get_actual_len(ex)
+ + ext4_ext_get_actual_len(ex + 1));
+ if (uninitialized)
+ ext4_ext_mark_uninitialized(ex);
+
+ /* close the gap left by the absorbed extent (ex + 1) */
+ if (ex + 1 < EXT_LAST_EXTENT(eh)) {
+ len = (EXT_LAST_EXTENT(eh) - ex - 1)
+ * sizeof(struct ext4_extent);
+ memmove(ex + 1, ex + 2, len);
+ }
+ le16_add_cpu(&eh->eh_entries, -1);
+ merge_done = 1;
+ WARN_ON(eh->eh_entries == 0);
+ if (!eh->eh_entries)
+ EXT4_ERROR_INODE(inode, "eh->eh_entries = 0!");
+ }
+
+ return merge_done;
+}
+
+/*
+ * This function tries to merge the @ex extent with its neighbours in the
+ * leaf: first the left neighbour is asked to absorb @ex; only if that did
+ * not happen, @ex is asked to absorb its right neighbour.
+ * Returns 0 when the left merge happened; otherwise returns the result of
+ * the right merge (1 if merged, 0 if not).
+ */
+static int ext4_ext_try_to_merge(struct inode *inode,
+				  struct ext4_ext_path *path,
+				  struct ext4_extent *ex) {
+	unsigned int depth = ext_depth(inode);
+	struct ext4_extent_header *leaf_hdr = path[depth].p_hdr;
+
+	BUG_ON(leaf_hdr == NULL);
+
+	/* merging (ex - 1) to the right is the same as merging ex to the left */
+	if (ex > EXT_FIRST_EXTENT(leaf_hdr) &&
+	    ext4_ext_try_to_merge_right(inode, path, ex - 1))
+		return 0;
+
+	return ext4_ext_try_to_merge_right(inode, path, ex);
+}
+
+/*
+ * check if a portion of the "newext" extent overlaps with an
+ * existing extent.
+ *
+ * If there is an overlap discovered, it updates the length of the newext
+ * such that there will be no overlap, and then returns 1.
+ * If there is no overlap found, it returns 0.
+ *
+ * The length is also trimmed (and 1 returned) when the new extent would
+ * wrap past the end of the logical block space.
+ */
+static unsigned int ext4_ext_check_overlap(struct ext4_sb_info *sbi,
+ struct inode *inode,
+ struct ext4_extent *newext,
+ struct ext4_ext_path *path)
+{
+ ext4_lblk_t b1, b2;
+ unsigned int depth, len1;
+ unsigned int ret = 0;
+
+ b1 = le32_to_cpu(newext->ee_block);
+ len1 = ext4_ext_get_actual_len(newext);
+ depth = ext_depth(inode);
+ /* empty leaf: nothing to overlap with */
+ if (!path[depth].p_ext)
+ goto out;
+ b2 = le32_to_cpu(path[depth].p_ext->ee_block);
+ /* round b2 down using the cluster ratio (assumes it is a power of two) */
+ b2 &= ~(sbi->s_cluster_ratio - 1);
+
+ /*
+ * get the next allocated block if the extent in the path
+ * is before the requested block(s)
+ */
+ if (b2 < b1) {
+ b2 = ext4_ext_next_allocated_block(path);
+ if (b2 == EXT_MAX_BLOCKS)
+ goto out;
+ b2 &= ~(sbi->s_cluster_ratio - 1);
+ }
+
+ /* check for wrap through zero on extent logical start block*/
+ if (b1 + len1 < b1) {
+ len1 = EXT_MAX_BLOCKS - b1;
+ newext->ee_len = cpu_to_le16(len1);
+ ret = 1;
+ }
+
+ /* check for overlap */
+ if (b1 + len1 > b2) {
+ /* trim newext so that it ends right before b2 */
+ newext->ee_len = cpu_to_le16(b2 - b1);
+ ret = 1;
+ }
+out:
+ return ret;
+}
+
+/*
+ * ext4_ext_insert_extent:
+ * tries to merge requested extent into the existing extent or
+ * inserts requested extent as new one into the tree,
+ * creating new leaf in the no-space case.
+ *
+ * @path is the lookup result for the new extent's start block.  With
+ * EXT4_GET_BLOCKS_PRE_IO set in @flag no merging is attempted.
+ * Returns 0 on success or a negative errno; the extent cache of the inode
+ * is invalidated on every path that reaches the cleanup label.
+ */
+int ext4_ext_insert_extent(handle_t *handle, struct inode *inode,
+ struct ext4_ext_path *path,
+ struct ext4_extent *newext, int flag)
+{
+ struct ext4_extent_header *eh;
+ struct ext4_extent *ex, *fex;
+ struct ext4_extent *nearex; /* nearest extent */
+ struct ext4_ext_path *npath = NULL;
+ int depth, len, err;
+ ext4_lblk_t next;
+ unsigned uninitialized = 0;
+ int flags = 0;
+
+ if (unlikely(ext4_ext_get_actual_len(newext) == 0)) {
+ EXT4_ERROR_INODE(inode, "ext4_ext_get_actual_len(newext) == 0");
+ return -EIO;
+ }
+ depth = ext_depth(inode);
+ ex = path[depth].p_ext;
+ if (unlikely(path[depth].p_hdr == NULL)) {
+ EXT4_ERROR_INODE(inode, "path[%d].p_hdr == NULL", depth);
+ return -EIO;
+ }
+
+ /* try to insert block into found extent and return */
+ if (ex && !(flag & EXT4_GET_BLOCKS_PRE_IO)
+ && ext4_can_extents_be_merged(inode, ex, newext)) {
+ ext_debug("append [%d]%d block to %u:[%d]%d (from %llu)\n",
+ ext4_ext_is_uninitialized(newext),
+ ext4_ext_get_actual_len(newext),
+ le32_to_cpu(ex->ee_block),
+ ext4_ext_is_uninitialized(ex),
+ ext4_ext_get_actual_len(ex),
+ ext4_ext_pblock(ex));
+ err = ext4_ext_get_access(handle, inode, path + depth);
+ if (err)
+ return err;
+
+ /*
+ * ext4_can_extents_be_merged should have checked that either
+ * both extents are uninitialized, or both aren't. Thus we
+ * need to check only one of them here.
+ */
+ if (ext4_ext_is_uninitialized(ex))
+ uninitialized = 1;
+ ex->ee_len = cpu_to_le16(ext4_ext_get_actual_len(ex)
+ + ext4_ext_get_actual_len(newext));
+ if (uninitialized)
+ ext4_ext_mark_uninitialized(ex);
+ eh = path[depth].p_hdr;
+ nearex = ex;
+ goto merge;
+ }
+
+ depth = ext_depth(inode);
+ eh = path[depth].p_hdr;
+ if (le16_to_cpu(eh->eh_entries) < le16_to_cpu(eh->eh_max))
+ goto has_space;
+
+ /* probably next leaf has space for us? */
+ fex = EXT_LAST_EXTENT(eh);
+ next = EXT_MAX_BLOCKS;
+ /* the next leaf is only a candidate if newext sorts after this leaf */
+ if (le32_to_cpu(newext->ee_block) > le32_to_cpu(fex->ee_block))
+ next = ext4_ext_next_leaf_block(path);
+ if (next != EXT_MAX_BLOCKS) {
+ ext_debug("next leaf block - %u\n", next);
+ BUG_ON(npath != NULL);
+ npath = ext4_ext_find_extent(inode, next, NULL);
+ if (IS_ERR(npath))
+ return PTR_ERR(npath);
+ BUG_ON(npath->p_depth != path->p_depth);
+ eh = npath[depth].p_hdr;
+ if (le16_to_cpu(eh->eh_entries) < le16_to_cpu(eh->eh_max)) {
+ ext_debug("next leaf isn't full(%d)\n",
+ le16_to_cpu(eh->eh_entries));
+ /* continue with the neighbour's path; npath is freed at cleanup */
+ path = npath;
+ goto has_space;
+ }
+ ext_debug("next leaf has no free space(%d,%d)\n",
+ le16_to_cpu(eh->eh_entries), le16_to_cpu(eh->eh_max));
+ }
+
+ /*
+ * There is no free space in the found leaf.
+ * We're gonna add a new leaf in the tree.
+ */
+ if (flag & EXT4_GET_BLOCKS_PUNCH_OUT_EXT)
+ flags = EXT4_MB_USE_ROOT_BLOCKS;
+ err = ext4_ext_create_new_leaf(handle, inode, flags, path, newext);
+ if (err)
+ goto cleanup;
+ /* the tree may have grown: re-read depth and leaf header */
+ depth = ext_depth(inode);
+ eh = path[depth].p_hdr;
+
+has_space:
+ nearex = path[depth].p_ext;
+
+ err = ext4_ext_get_access(handle, inode, path + depth);
+ if (err)
+ goto cleanup;
+
+ if (!nearex) {
+ /* there is no extent in this leaf, create first one */
+ ext_debug("first extent in the leaf: %u:%llu:[%d]%d\n",
+ le32_to_cpu(newext->ee_block),
+ ext4_ext_pblock(newext),
+ ext4_ext_is_uninitialized(newext),
+ ext4_ext_get_actual_len(newext));
+ nearex = EXT_FIRST_EXTENT(eh);
+ } else {
+ /*
+ * NOTE(review): the "before"/"after" wording of the two debug
+ * messages below looks swapped relative to the branch comments.
+ */
+ if (le32_to_cpu(newext->ee_block)
+ > le32_to_cpu(nearex->ee_block)) {
+ /* Insert after */
+ ext_debug("insert %u:%llu:[%d]%d before: "
+ "nearest %p\n",
+ le32_to_cpu(newext->ee_block),
+ ext4_ext_pblock(newext),
+ ext4_ext_is_uninitialized(newext),
+ ext4_ext_get_actual_len(newext),
+ nearex);
+ nearex++;
+ } else {
+ /* Insert before */
+ BUG_ON(newext->ee_block == nearex->ee_block);
+ ext_debug("insert %u:%llu:[%d]%d after: "
+ "nearest %p\n",
+ le32_to_cpu(newext->ee_block),
+ ext4_ext_pblock(newext),
+ ext4_ext_is_uninitialized(newext),
+ ext4_ext_get_actual_len(newext),
+ nearex);
+ }
+ /* shift the extents at and after the insertion slot up by one */
+ len = EXT_LAST_EXTENT(eh) - nearex + 1;
+ if (len > 0) {
+ ext_debug("insert %u:%llu:[%d]%d: "
+ "move %d extents from 0x%p to 0x%p\n",
+ le32_to_cpu(newext->ee_block),
+ ext4_ext_pblock(newext),
+ ext4_ext_is_uninitialized(newext),
+ ext4_ext_get_actual_len(newext),
+ len, nearex, nearex + 1);
+ memmove(nearex + 1, nearex,
+ len * sizeof(struct ext4_extent));
+ }
+ }
+
+ /* fill the slot with a copy of newext */
+ le16_add_cpu(&eh->eh_entries, 1);
+ path[depth].p_ext = nearex;
+ nearex->ee_block = newext->ee_block;
+ ext4_ext_store_pblock(nearex, ext4_ext_pblock(newext));
+ nearex->ee_len = newext->ee_len;
+
+merge:
+ /* try to merge extents to the right */
+ if (!(flag & EXT4_GET_BLOCKS_PRE_IO))
+ ext4_ext_try_to_merge(inode, path, nearex);
+
+ /* try to merge extents to the left */
+
+ /* time to correct all indexes above */
+ err = ext4_ext_correct_indexes(handle, inode, path);
+ if (err)
+ goto cleanup;
+
+ err = ext4_ext_dirty(handle, inode, path + depth);
+
+cleanup:
+ if (npath) {
+ ext4_ext_drop_refs(npath);
+ kfree(npath);
+ }
+ ext4_ext_invalidate_cache(inode);
+ return err;
+}
+
+/*
+ * ext4_ext_walk_space:
+ * Walks the logical range [block, block + num) and calls @func once for
+ * every piece of it. A piece is either an existing extent (cbex describes
+ * the whole extent, ec_start is its physical block) or a gap that is not
+ * covered by any extent (cbex describes the gap, ec_start is 0).
+ *
+ * @func steers the walk through its return value: a negative value aborts
+ * the walk and is returned to the caller, EXT_REPEAT looks the same block
+ * up again, EXT_BREAK ends the walk with 0. Any other value advances to the
+ * block right after the range reported in cbex.
+ *
+ * Returns 0, a negative value from @func or the lookup, or -EIO when the
+ * path or the computed range is inconsistent.
+ */
+static int ext4_ext_walk_space(struct inode *inode, ext4_lblk_t block,
+ ext4_lblk_t num, ext_prepare_callback func,
+ void *cbdata)
+{
+ struct ext4_ext_path *path = NULL;
+ struct ext4_ext_cache cbex;
+ struct ext4_extent *ex;
+ ext4_lblk_t next, start = 0, end = 0;
+ ext4_lblk_t last = block + num;
+ int depth, exists, err = 0;
+
+ BUG_ON(func == NULL);
+ BUG_ON(inode == NULL);
+
+ while (block < last && block != EXT_MAX_BLOCKS) {
+ /* blocks still left to report */
+ num = last - block;
+ /* find extent for this block */
+ down_read(&EXT4_I(inode)->i_data_sem);
+ path = ext4_ext_find_extent(inode, block, path);
+ up_read(&EXT4_I(inode)->i_data_sem);
+ if (IS_ERR(path)) {
+ err = PTR_ERR(path);
+ /* nothing left for the cleanup at the end to release */
+ path = NULL;
+ break;
+ }
+
+ depth = ext_depth(inode);
+ if (unlikely(path[depth].p_hdr == NULL)) {
+ EXT4_ERROR_INODE(inode, "path[%d].p_hdr == NULL", depth);
+ err = -EIO;
+ break;
+ }
+ ex = path[depth].p_ext;
+ next = ext4_ext_next_allocated_block(path);
+
+ exists = 0;
+ if (!ex) {
+ /* there is no extent yet, so try to allocate
+ * all requested space */
+ start = block;
+ end = block + num;
+ } else if (le32_to_cpu(ex->ee_block) > block) {
+ /* need to allocate space before found extent */
+ start = block;
+ end = le32_to_cpu(ex->ee_block);
+ if (block + num < end)
+ end = block + num;
+ } else if (block >= le32_to_cpu(ex->ee_block)
+ + ext4_ext_get_actual_len(ex)) {
+ /* need to allocate space after found extent */
+ start = block;
+ end = block + num;
+ if (end >= next)
+ end = next;
+ } else if (block >= le32_to_cpu(ex->ee_block)) {
+ /*
+ * some part of requested space is covered
+ * by found extent
+ */
+ start = block;
+ end = le32_to_cpu(ex->ee_block)
+ + ext4_ext_get_actual_len(ex);
+ if (block + num < end)
+ end = block + num;
+ exists = 1;
+ } else {
+ BUG();
+ }
+ BUG_ON(end <= start);
+
+ if (!exists) {
+ /* a gap: report exactly [start, end), no physical block */
+ cbex.ec_block = start;
+ cbex.ec_len = end - start;
+ cbex.ec_start = 0;
+ } else {
+ /* an extent: report the whole extent, not only [start, end) */
+ cbex.ec_block = le32_to_cpu(ex->ee_block);
+ cbex.ec_len = ext4_ext_get_actual_len(ex);
+ cbex.ec_start = ext4_ext_pblock(ex);
+ }
+
+ if (unlikely(cbex.ec_len == 0)) {
+ EXT4_ERROR_INODE(inode, "cbex.ec_len == 0");
+ err = -EIO;
+ break;
+ }
+ err = func(inode, next, &cbex, ex, cbdata);
+ ext4_ext_drop_refs(path);
+
+ if (err < 0)
+ break;
+
+ if (err == EXT_REPEAT)
+ continue;
+ else if (err == EXT_BREAK) {
+ err = 0;
+ break;
+ }
+
+ if (ext_depth(inode) != depth) {
+ /* depth was changed. we have to realloc path */
+ kfree(path);
+ path = NULL;
+ }
+
+ /* continue right behind the range that was just reported */
+ block = cbex.ec_block + cbex.ec_len;
+ }
+
+ if (path) {
+ ext4_ext_drop_refs(path);
+ kfree(path);
+ }
+
+ return err;
+}
+
+/*
+ * ext4_ext_put_in_cache:
+ * Stores the range of @len blocks starting at logical block @block, mapped
+ * to physical block @start, in the inode's cached extent. @len must not be
+ * zero. The cache is updated under i_block_reservation_lock.
+ */
+static void
+ext4_ext_put_in_cache(struct inode *inode, ext4_lblk_t block,
+			__u32 len, ext4_fsblk_t start)
+{
+	struct ext4_ext_cache *cache_slot = &EXT4_I(inode)->i_cached_extent;
+
+	BUG_ON(len == 0);
+
+	spin_lock(&EXT4_I(inode)->i_block_reservation_lock);
+	trace_ext4_ext_put_in_cache(inode, block, len, start);
+	cache_slot->ec_block = block;
+	cache_slot->ec_len = len;
+	cache_slot->ec_start = start;
+	spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
+}
+
+/*
+ * ext4_ext_put_gap_in_cache:
+ * Works out the hole that @block falls into, using the extent that @path
+ * points at in the leaf, and stores that hole in the extent cache with a
+ * physical start of 0.
+ *
+ * Three situations are handled: the leaf has no extent at all (the hole
+ * covers the whole file), @block lies in front of the extent, or @block
+ * lies behind it. @block being inside the extent is a bug.
+ */
+static void
+ext4_ext_put_gap_in_cache(struct inode *inode, struct ext4_ext_path *path,
+				ext4_lblk_t block)
+{
+	struct ext4_extent *ex = path[ext_depth(inode)].p_ext;
+	ext4_lblk_t lblock;
+	unsigned long len;
+
+	if (ex == NULL) {
+		/* there is no extent yet, so gap is [0;-] */
+		lblock = 0;
+		len = EXT_MAX_BLOCKS;
+		ext_debug("cache gap(whole file):");
+	} else {
+		ext4_lblk_t ex_first = le32_to_cpu(ex->ee_block);
+		ext4_lblk_t ex_past = ex_first + ext4_ext_get_actual_len(ex);
+
+		if (block < ex_first) {
+			/* hole runs from @block up to the extent */
+			lblock = block;
+			len = ex_first - block;
+			ext_debug("cache gap(before): %u [%u:%u]",
+				  block,
+				  le32_to_cpu(ex->ee_block),
+				  ext4_ext_get_actual_len(ex));
+		} else if (block >= ex_past) {
+			/* hole runs from the extent's end to the next one */
+			ext4_lblk_t next;
+
+			lblock = ex_past;
+			next = ext4_ext_next_allocated_block(path);
+			ext_debug("cache gap(after): [%u:%u] %u",
+				  le32_to_cpu(ex->ee_block),
+				  ext4_ext_get_actual_len(ex),
+				  block);
+			BUG_ON(next == lblock);
+			len = next - lblock;
+		} else {
+			/* @block is mapped, so there is no gap to cache */
+			len = 0;
+			lblock = 0;
+			BUG();
+		}
+	}
+
+	ext_debug(" -> %u:%lu\n", lblock, len);
+	ext4_ext_put_in_cache(inode, lblock, len, 0);
+}
+
+/*
+ * ext4_ext_check_cache()
+ * Checks to see if the given block is in the cache.
+ * If it is, the cached extent is stored in the given
+ * cache extent pointer. If the cached extent is a hole,
+ * this routine should be used instead of
+ * ext4_ext_in_cache if the calling function needs to
+ * know the size of the hole.
+ *
+ * @inode: The files inode
+ * @block: The block to look for in the cache
+ * @ex: Pointer where the cached extent will be stored
+ * if it contains block
+ *
+ * Return 1 if @block lies inside the cached extent (which has then been
+ * copied to @ex); 0 if the cache is empty or does not cover @block, in
+ * which case @ex is left untouched.
+ */
+static int ext4_ext_check_cache(struct inode *inode, ext4_lblk_t block,
+				struct ext4_ext_cache *ex)
+{
+	struct ext4_ext_cache *cex;
+	int ret = 0;
+
+	/*
+	 * We borrow i_block_reservation_lock to protect i_cached_extent
+	 */
+	spin_lock(&EXT4_I(inode)->i_block_reservation_lock);
+	cex = &EXT4_I(inode)->i_cached_extent;
+
+	/* has cache valid data? (a length of 0 marks an empty cache) */
+	if (cex->ec_len == 0)
+		goto errout;
+
+	if (in_range(block, cex->ec_block, cex->ec_len)) {
+		/* copy while still holding the lock so @ex is consistent */
+		memcpy(ex, cex, sizeof(struct ext4_ext_cache));
+		ext_debug("%u cached by %u:%u:%llu\n",
+			  block,
+			  cex->ec_block, cex->ec_len, cex->ec_start);
+		ret = 1;
+	}
+errout:
+	trace_ext4_ext_in_cache(inode, block, ret);
+	spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
+	return ret;
+}
+
+/*
+ * ext4_ext_in_cache()
+ * Looks @block up in the extent cache and, on a hit, converts the cached
+ * entry into on-disk extent form in @ex (start block, physical block and
+ * length).
+ *
+ * @inode: The files inode
+ * @block: The block to look for in the cache
+ * @ex: Extent that receives the cached entry if it contains block
+ *
+ * Return 1 if @ex was filled in from the cache; 0 otherwise, in which
+ * case @ex is not modified.
+ */
+static int
+ext4_ext_in_cache(struct inode *inode, ext4_lblk_t block,
+			struct ext4_extent *ex)
+{
+	struct ext4_ext_cache hit;
+
+	if (!ext4_ext_check_cache(inode, block, &hit))
+		return 0;
+
+	ex->ee_block = cpu_to_le32(hit.ec_block);
+	ext4_ext_store_pblock(ex, hit.ec_start);
+	ex->ee_len = cpu_to_le16(hit.ec_len);
+
+	return 1;
+}
+
+
+
+/*
+ * ext4_ext_rm_idx:
+ * removes index from the index block.
+ *
+ * @path points at the level whose block has become empty. The index entry
+ * that refers to that block lives one level up, so the function steps to
+ * path - 1, deletes the entry there and frees the block the entry pointed
+ * to.
+ *
+ * Returns 0 on success, -EIO if the parent node claims to hold no entries,
+ * or the error from getting journal access to / dirtying the parent.
+ */
+static int ext4_ext_rm_idx(handle_t *handle, struct inode *inode,
+ struct ext4_ext_path *path)
+{
+ int err;
+ ext4_fsblk_t leaf;
+
+ /* step up to the parent and remember the block its index points to */
+ path--;
+ leaf = ext4_idx_pblock(path->p_idx);
+ if (unlikely(path->p_hdr->eh_entries == 0)) {
+ EXT4_ERROR_INODE(inode, "path->p_hdr->eh_entries == 0");
+ return -EIO;
+ }
+ err = ext4_ext_get_access(handle, inode, path);
+ if (err)
+ return err;
+
+ /* close the hole unless the removed index was the last one */
+ if (path->p_idx != EXT_LAST_INDEX(path->p_hdr)) {
+ int len = EXT_LAST_INDEX(path->p_hdr) - path->p_idx;
+ len *= sizeof(struct ext4_extent_idx);
+ memmove(path->p_idx, path->p_idx + 1, len);
+ }
+
+ le16_add_cpu(&path->p_hdr->eh_entries, -1);
+ err = ext4_ext_dirty(handle, inode, path);
+ if (err)
+ return err;
+ ext_debug("index is empty, remove it, free block %llu\n", leaf);
+ trace_ext4_ext_rm_idx(inode, leaf);
+
+ /* the block is released only after the parent has been updated */
+ ext4_free_blocks(handle, inode, NULL, leaf, 1,
+ EXT4_FREE_BLOCKS_METADATA | EXT4_FREE_BLOCKS_FORGET);
+ return err;
+}
+
+/*
+ * ext4_ext_calc_credits_for_single_extent:
+ * Returns the maximum number of journal credits needed to insert one
+ * extent into the extent tree.
+ *
+ * If @path is given and the leaf it leads to still has a free slot, no
+ * tree block has to be added and a small fixed estimate is returned.
+ * In every other case the generic estimate for @nrblocks is used.
+ * A caller that passes the actual path should calculate the credits
+ * under i_data_sem.
+ */
+int ext4_ext_calc_credits_for_single_extent(struct inode *inode, int nrblocks,
+						struct ext4_ext_path *path)
+{
+	if (path) {
+		int leaf_level = ext_depth(inode);
+		unsigned int used =
+			le16_to_cpu(path[leaf_level].p_hdr->eh_entries);
+		unsigned int capacity =
+			le16_to_cpu(path[leaf_level].p_hdr->eh_max);
+
+		/*
+		 * Room left in the leaf: no credit is needed for a new leaf
+		 * block, but 1 bitmap, 1 block group descriptor and the other
+		 * metadata blocks still have to be accounted.
+		 */
+		if (used < capacity)
+			return 2 + EXT4_META_TRANS_BLOCKS(inode->i_sb);
+	}
+
+	return ext4_chunk_trans_blocks(inode, nrblocks);
+}
+
+/*
+ * ext4_ext_index_trans_blocks:
+ * Estimates how many index/leaf blocks may have to be changed or
+ * allocated in order to modify @nrblocks blocks.
+ *
+ * With @chunk set (the blocks fit in a single extent) the worst case is
+ * that every tree level is touched and, because an insert may split the
+ * tree, the old node of every level as well: two blocks per level.
+ *
+ * Without @chunk the blocks are discontiguous and could split the tree
+ * more than once; that is really rare, so three blocks per level are
+ * assumed. @nrblocks itself does not enter the estimate.
+ */
+int ext4_ext_index_trans_blocks(struct inode *inode, int nrblocks, int chunk)
+{
+	int per_level = chunk ? 2 : 3;
+
+	return ext_depth(inode) * per_level;
+}
+
+/*
+ * ext4_remove_blocks:
+ * Frees the physical blocks that back logical blocks @from..@to of extent
+ * @ex. The extent itself is not modified here.
+ *
+ * @partial_cluster is both input and output. On entry it names a cluster
+ * that an earlier call left behind (0 if none); it is freed here if the
+ * last block of @ex does not belong to it. On return from the tail-removal
+ * branch it names the cluster of the first freed block when that block is
+ * not cluster aligned and the whole extent was removed, otherwise 0.
+ *
+ * Always returns 0; an unsupported range is only reported with printk().
+ */
+static int ext4_remove_blocks(handle_t *handle, struct inode *inode,
+ struct ext4_extent *ex,
+ ext4_fsblk_t *partial_cluster,
+ ext4_lblk_t from, ext4_lblk_t to)
+{
+ struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
+ unsigned short ee_len = ext4_ext_get_actual_len(ex);
+ ext4_fsblk_t pblk;
+ int flags = EXT4_FREE_BLOCKS_FORGET;
+
+ if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode))
+ flags |= EXT4_FREE_BLOCKS_METADATA;
+ /*
+ * For bigalloc file systems, we never free a partial cluster
+ * at the beginning of the extent. Instead, we make a note
+ * that we tried freeing the cluster, and check to see if we
+ * need to free it on a subsequent call to ext4_remove_blocks,
+ * or at the end of the ext4_truncate() operation.
+ */
+ flags |= EXT4_FREE_BLOCKS_NOFREE_FIRST_CLUSTER;
+
+ trace_ext4_remove_blocks(inode, ex, from, to, *partial_cluster);
+ /*
+ * If we have a partial cluster, and it's different from the
+ * cluster of the last block, we need to explicitly free the
+ * partial cluster here.
+ */
+ pblk = ext4_ext_pblock(ex) + ee_len - 1;
+ if (*partial_cluster && (EXT4_B2C(sbi, pblk) != *partial_cluster)) {
+ ext4_free_blocks(handle, inode, NULL,
+ EXT4_C2B(sbi, *partial_cluster),
+ sbi->s_cluster_ratio, flags);
+ *partial_cluster = 0;
+ }
+
+#ifdef EXTENTS_STATS
+ {
+ /* NOTE(review): this sbi shadows the identical one declared above */
+ struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
+ spin_lock(&sbi->s_ext_stats_lock);
+ sbi->s_ext_blocks += ee_len;
+ sbi->s_ext_extents++;
+ if (ee_len < sbi->s_ext_min)
+ sbi->s_ext_min = ee_len;
+ if (ee_len > sbi->s_ext_max)
+ sbi->s_ext_max = ee_len;
+ if (ext_depth(inode) > sbi->s_depth_max)
+ sbi->s_depth_max = ext_depth(inode);
+ spin_unlock(&sbi->s_ext_stats_lock);
+ }
+#endif
+ if (from >= le32_to_cpu(ex->ee_block)
+ && to == le32_to_cpu(ex->ee_block) + ee_len - 1) {
+ /* tail removal */
+ ext4_lblk_t num;
+
+ /* blocks from @from to the end of the extent */
+ num = le32_to_cpu(ex->ee_block) + ee_len - from;
+ pblk = ext4_ext_pblock(ex) + ee_len - num;
+ ext_debug("free last %u blocks starting %llu\n", num, pblk);
+ ext4_free_blocks(handle, inode, NULL, pblk, num, flags);
+ /*
+ * If the block range to be freed didn't start at the
+ * beginning of a cluster, and we removed the entire
+ * extent, save the partial cluster here, since we
+ * might need to delete if we determine that the
+ * truncate operation has removed all of the blocks in
+ * the cluster.
+ */
+ if (pblk & (sbi->s_cluster_ratio - 1) &&
+ (ee_len == num))
+ *partial_cluster = EXT4_B2C(sbi, pblk);
+ else
+ *partial_cluster = 0;
+ } else if (from == le32_to_cpu(ex->ee_block)
+ && to <= le32_to_cpu(ex->ee_block) + ee_len - 1) {
+ /* head removal */
+ ext4_lblk_t num;
+ ext4_fsblk_t start;
+
+ /*
+ * NOTE(review): @to is compared with the last block of the
+ * extent above, i.e. it looks inclusive, which would make the
+ * range to - from + 1 blocks long - confirm whether freeing
+ * only to - from blocks is intended.
+ */
+ num = to - from;
+ start = ext4_ext_pblock(ex);
+
+ ext_debug("free first %u blocks starting %llu\n", num, start);
+ ext4_free_blocks(handle, inode, NULL, start, num, flags);
+
+ } else {
+ printk(KERN_INFO "strange request: removal(2) "
+ "%u-%u from %u:%u\n",
+ from, to, le32_to_cpu(ex->ee_block), ee_len);
+ }
+ return 0;
+}
+
+
+/*
+ * ext4_ext_rm_leaf() Removes the extents associated with the
+ * blocks appearing between "start" and "end" from a single leaf.
+ *
+ * The extents of the leaf are visited from the last one backwards. An
+ * extent is either removed completely or loses its tail; a range that ends
+ * inside an extent is rejected with -EIO, so such an extent must have been
+ * split before this function is called.
+ *
+ * @handle: The journal handle
+ * @inode: The files inode
+ * @path: The path to the leaf
+ * @partial_cluster: In/out, passed on to ext4_remove_blocks(); the cluster
+ * that is remembered as only partially freed, 0 if none
+ * @start: The first block to remove
+ * @end: The last block to remove
+ *
+ * Returns 0 on success, -EIO on an inconsistent path or an unsupported
+ * range, or the error reported by one of the helpers.
+ */
+static int
+ext4_ext_rm_leaf(handle_t *handle, struct inode *inode,
+ struct ext4_ext_path *path, ext4_fsblk_t *partial_cluster,
+ ext4_lblk_t start, ext4_lblk_t end)
+{
+ struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
+ int err = 0, correct_index = 0;
+ int depth = ext_depth(inode), credits;
+ struct ext4_extent_header *eh;
+ ext4_lblk_t a, b;
+ unsigned num;
+ ext4_lblk_t ex_ee_block;
+ unsigned short ex_ee_len;
+ unsigned uninitialized = 0;
+ struct ext4_extent *ex;
+
+ /* the header must be checked already in ext4_ext_remove_space() */
+ ext_debug("truncate since %u in leaf to %u\n", start, end);
+ if (!path[depth].p_hdr)
+ path[depth].p_hdr = ext_block_hdr(path[depth].p_bh);
+ eh = path[depth].p_hdr;
+ if (unlikely(path[depth].p_hdr == NULL)) {
+ EXT4_ERROR_INODE(inode, "path[%d].p_hdr == NULL", depth);
+ return -EIO;
+ }
+ /* find where to start removing */
+ ex = EXT_LAST_EXTENT(eh);
+
+ ex_ee_block = le32_to_cpu(ex->ee_block);
+ ex_ee_len = ext4_ext_get_actual_len(ex);
+
+ trace_ext4_ext_rm_leaf(inode, start, ex, *partial_cluster);
+
+ /* walk backwards while the extent reaches past @start */
+ while (ex >= EXT_FIRST_EXTENT(eh) &&
+ ex_ee_block + ex_ee_len > start) {
+
+ if (ext4_ext_is_uninitialized(ex))
+ uninitialized = 1;
+ else
+ uninitialized = 0;
+
+ ext_debug("remove ext %u:[%d]%d\n", ex_ee_block,
+ uninitialized, ex_ee_len);
+ path[depth].p_ext = ex;
+
+ /* [a, b] is the part of this extent that lies in [start, end] */
+ a = ex_ee_block > start ? ex_ee_block : start;
+ b = ex_ee_block+ex_ee_len - 1 < end ?
+ ex_ee_block+ex_ee_len - 1 : end;
+
+ ext_debug(" border %u:%u\n", a, b);
+
+ /* If this extent is beyond the end of the hole, skip it */
+ if (end < ex_ee_block) {
+ /*
+ * ex may now point in front of the first extent; the
+ * values read below are not used in that case because
+ * the loop condition tests ex first.
+ */
+ ex--;
+ ex_ee_block = le32_to_cpu(ex->ee_block);
+ ex_ee_len = ext4_ext_get_actual_len(ex);
+ continue;
+ } else if (b != ex_ee_block + ex_ee_len - 1) {
+ /* the range ends inside the extent: not supported here */
+ EXT4_ERROR_INODE(inode,
+ "can not handle truncate %u:%u "
+ "on extent %u:%u",
+ start, end, ex_ee_block,
+ ex_ee_block + ex_ee_len - 1);
+ err = -EIO;
+ goto out;
+ } else if (a != ex_ee_block) {
+ /* remove tail of the extent */
+ num = a - ex_ee_block;
+ } else {
+ /* remove whole extent: excellent! */
+ num = 0;
+ }
+ /*
+ * 3 for leaf, sb, and inode plus 2 (bmap and group
+ * descriptor) for each block group; assume two block
+ * groups plus ex_ee_len/blocks_per_block_group for
+ * the worst case
+ */
+ credits = 7 + 2*(ex_ee_len/EXT4_BLOCKS_PER_GROUP(inode->i_sb));
+ if (ex == EXT_FIRST_EXTENT(eh)) {
+ /* the first extent changes, so the indexes above may too */
+ correct_index = 1;
+ credits += (ext_depth(inode)) + 1;
+ }
+ credits += EXT4_MAXQUOTAS_TRANS_BLOCKS(inode->i_sb);
+
+ err = ext4_ext_truncate_extend_restart(handle, inode, credits);
+ if (err)
+ goto out;
+
+ err = ext4_ext_get_access(handle, inode, path + depth);
+ if (err)
+ goto out;
+
+ err = ext4_remove_blocks(handle, inode, ex, partial_cluster,
+ a, b);
+ if (err)
+ goto out;
+
+ if (num == 0)
+ /* this extent is removed; mark slot entirely unused */
+ ext4_ext_store_pblock(ex, 0);
+
+ /* num is the length that remains in front of the removed part */
+ ex->ee_len = cpu_to_le16(num);
+ /*
+ * Do not mark uninitialized if all the blocks in the
+ * extent have been removed.
+ */
+ if (uninitialized && num)
+ ext4_ext_mark_uninitialized(ex);
+ /*
+ * If the extent was completely released,
+ * we need to remove it from the leaf
+ */
+ if (num == 0) {
+ if (end != EXT_MAX_BLOCKS - 1) {
+ /*
+ * For hole punching, we need to scoot all the
+ * extents up when an extent is removed so that
+ * we dont have blank extents in the middle
+ */
+ memmove(ex, ex+1, (EXT_LAST_EXTENT(eh) - ex) *
+ sizeof(struct ext4_extent));
+
+ /* Now get rid of the one at the end */
+ memset(EXT_LAST_EXTENT(eh), 0,
+ sizeof(struct ext4_extent));
+ }
+ le16_add_cpu(&eh->eh_entries, -1);
+ } else
+ *partial_cluster = 0;
+
+ err = ext4_ext_dirty(handle, inode, path + depth);
+ if (err)
+ goto out;
+
+ ext_debug("new extent: %u:%u:%llu\n", ex_ee_block, num,
+ ext4_ext_pblock(ex));
+ /* same remark as above: ex may step in front of the first extent */
+ ex--;
+ ex_ee_block = le32_to_cpu(ex->ee_block);
+ ex_ee_len = ext4_ext_get_actual_len(ex);
+ }
+
+ if (correct_index && eh->eh_entries)
+ err = ext4_ext_correct_indexes(handle, inode, path);
+
+ /*
+ * If there is still a entry in the leaf node, check to see if
+ * it references the partial cluster. This is the only place
+ * where it could; if it doesn't, we can free the cluster.
+ */
+ if (*partial_cluster && ex >= EXT_FIRST_EXTENT(eh) &&
+ (EXT4_B2C(sbi, ext4_ext_pblock(ex) + ex_ee_len - 1) !=
+ *partial_cluster)) {
+ int flags = EXT4_FREE_BLOCKS_FORGET;
+
+ if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode))
+ flags |= EXT4_FREE_BLOCKS_METADATA;
+
+ ext4_free_blocks(handle, inode, NULL,
+ EXT4_C2B(sbi, *partial_cluster),
+ sbi->s_cluster_ratio, flags);
+ *partial_cluster = 0;
+ }
+
+ /* if this leaf is free, then we should
+ * remove it from index block above */
+ if (err == 0 && eh->eh_entries == 0 && path[depth].p_bh != NULL)
+ err = ext4_ext_rm_idx(handle, inode, path + depth);
+
+out:
+ return err;
+}
+
+/*
+ * ext4_ext_more_to_rm:
+ * Tells the tree walk in ext4_ext_remove_space() whether the index that
+ * path->p_idx currently points at still has to be descended into.
+ *
+ * Returns 0 when p_idx has moved in front of the first index of the node,
+ * or when the number of entries in the node still equals the count saved
+ * in p_block; returns 1 otherwise.
+ */
+static int
+ext4_ext_more_to_rm(struct ext4_ext_path *path)
+{
+	BUG_ON(path->p_idx == NULL);
+
+	/* walked past the first index: this node is done */
+	if (path->p_idx < EXT_FIRST_INDEX(path->p_hdr))
+		return 0;
+
+	/*
+	 * An unchanged entry count means the level below was not removed
+	 * completely, so the walk must not go further left from here.
+	 */
+	return le16_to_cpu(path->p_hdr->eh_entries) != path->p_block;
+}
+
+/*
+ * ext4_ext_remove_space:
+ * Removes the logical blocks @start..@end (both inclusive) from the extent
+ * tree of @inode and frees the blocks behind them.
+ *
+ * When @end is not the last possible block this is a hole punch: the
+ * extent that contains @end is first split right behind @end so that
+ * ext4_ext_rm_leaf() only ever has to remove whole extents or tails.
+ * The tree is then walked from the rightmost leaf towards the left; leaves
+ * are processed by ext4_ext_rm_leaf() and index nodes that became empty
+ * are removed with ext4_ext_rm_idx().
+ *
+ * The function starts and stops its own journal handle. If a helper
+ * returns -EAGAIN the whole operation is started over.
+ *
+ * Returns 0 on success or a negative error code.
+ */
+static int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start,
+				 ext4_lblk_t end)
+{
+	struct super_block *sb = inode->i_sb;
+	int depth = ext_depth(inode);
+	struct ext4_ext_path *path;
+	ext4_fsblk_t partial_cluster = 0;
+	handle_t *handle;
+	int i, err;
+
+	ext_debug("truncate since %u to %u\n", start, end);
+
+	/* probably first extent we're gonna free will be last in block */
+	handle = ext4_journal_start(inode, depth + 1);
+	if (IS_ERR(handle))
+		return PTR_ERR(handle);
+
+again:
+	ext4_ext_invalidate_cache(inode);
+
+	trace_ext4_ext_remove_space(inode, start, depth);
+
+	/*
+	 * Check if we are removing extents inside the extent tree. If that
+	 * is the case, we are going to punch a hole inside the extent tree
+	 * so we have to check whether we need to split the extent covering
+	 * the last block to remove so we can easily remove the part of it
+	 * in ext4_ext_rm_leaf().
+	 */
+	if (end < EXT_MAX_BLOCKS - 1) {
+		struct ext4_extent *ex;
+		ext4_lblk_t ee_block;
+
+		/* find extent for this block */
+		path = ext4_ext_find_extent(inode, end, NULL);
+		if (IS_ERR(path)) {
+			ext4_journal_stop(handle);
+			return PTR_ERR(path);
+		}
+		depth = ext_depth(inode);
+		ex = path[depth].p_ext;
+		if (!ex) {
+			/*
+			 * Nothing to split. Release the lookup path before
+			 * leaving: the code at "cont" allocates a fresh one
+			 * and would otherwise leak this path and its buffers.
+			 */
+			ext4_ext_drop_refs(path);
+			kfree(path);
+			path = NULL;
+			goto cont;
+		}
+
+		ee_block = le32_to_cpu(ex->ee_block);
+
+		/*
+		 * See if the last block is inside the extent, if so split
+		 * the extent at 'end' block so we can easily remove the
+		 * tail of the first part of the split extent in
+		 * ext4_ext_rm_leaf().
+		 */
+		if (end >= ee_block &&
+		    end < ee_block + ext4_ext_get_actual_len(ex) - 1) {
+			int split_flag = 0;
+
+			if (ext4_ext_is_uninitialized(ex))
+				split_flag = EXT4_EXT_MARK_UNINIT1 |
+					     EXT4_EXT_MARK_UNINIT2;
+
+			/*
+			 * Split the extent in two so that 'end' is the last
+			 * block in the first new extent
+			 */
+			err = ext4_split_extent_at(handle, inode, path,
+						end + 1, split_flag,
+						EXT4_GET_BLOCKS_PRE_IO |
+						EXT4_GET_BLOCKS_PUNCH_OUT_EXT);
+
+			if (err < 0)
+				goto out;
+		}
+		ext4_ext_drop_refs(path);
+		kfree(path);
+	}
+cont:
+
+	/*
+	 * We start scanning from right side, freeing all the blocks
+	 * after i_size and walking into the tree depth-wise.
+	 */
+	depth = ext_depth(inode);
+	path = kzalloc(sizeof(struct ext4_ext_path) * (depth + 1), GFP_NOFS);
+	if (path == NULL) {
+		ext4_journal_stop(handle);
+		return -ENOMEM;
+	}
+	path[0].p_depth = depth;
+	path[0].p_hdr = ext_inode_hdr(inode);
+
+	if (ext4_ext_check(inode, path[0].p_hdr, depth)) {
+		err = -EIO;
+		goto out;
+	}
+	i = err = 0;
+
+	while (i >= 0 && err == 0) {
+		if (i == depth) {
+			/* this is leaf block */
+			err = ext4_ext_rm_leaf(handle, inode, path,
+					       &partial_cluster, start,
+					       end);
+			/* root level has p_bh == NULL, brelse() eats this */
+			brelse(path[i].p_bh);
+			path[i].p_bh = NULL;
+			i--;
+			continue;
+		}
+
+		/* this is index block */
+		if (!path[i].p_hdr) {
+			ext_debug("initialize header\n");
+			path[i].p_hdr = ext_block_hdr(path[i].p_bh);
+		}
+
+		if (!path[i].p_idx) {
+			/* this level hasn't been touched yet */
+			path[i].p_idx = EXT_LAST_INDEX(path[i].p_hdr);
+			path[i].p_block = le16_to_cpu(path[i].p_hdr->eh_entries)+1;
+			ext_debug("init index ptr: hdr 0x%p, num %d\n",
+				  path[i].p_hdr,
+				  le16_to_cpu(path[i].p_hdr->eh_entries));
+		} else {
+			/* we were already here, see at next index */
+			path[i].p_idx--;
+		}
+
+		ext_debug("level %d - index, first 0x%p, cur 0x%p\n",
+				i, EXT_FIRST_INDEX(path[i].p_hdr),
+				path[i].p_idx);
+		if (ext4_ext_more_to_rm(path + i)) {
+			struct buffer_head *bh;
+			/* go to the next level */
+			ext_debug("move to level %d (block %llu)\n",
+				  i + 1, ext4_idx_pblock(path[i].p_idx));
+			memset(path + i + 1, 0, sizeof(*path));
+			bh = sb_bread(sb, ext4_idx_pblock(path[i].p_idx));
+			if (!bh) {
+				/* should we reset i_size? */
+				err = -EIO;
+				break;
+			}
+			/*
+			 * bh is not yet stored in the path, so the cleanup at
+			 * "out" cannot see it: release it here on failure.
+			 */
+			if (WARN_ON(i + 1 > depth)) {
+				brelse(bh);
+				err = -EIO;
+				break;
+			}
+			if (ext4_ext_check(inode, ext_block_hdr(bh),
+							depth - i - 1)) {
+				brelse(bh);
+				err = -EIO;
+				break;
+			}
+			path[i + 1].p_bh = bh;
+
+			/* save actual number of indexes since this
+			 * number is changed at the next iteration */
+			path[i].p_block = le16_to_cpu(path[i].p_hdr->eh_entries);
+			i++;
+		} else {
+			/* we finished processing this index, go up */
+			if (path[i].p_hdr->eh_entries == 0 && i > 0) {
+				/* index is empty, remove it;
+				 * handle must be already prepared by the
+				 * truncatei_leaf() */
+				err = ext4_ext_rm_idx(handle, inode, path + i);
+			}
+			/* root level has p_bh == NULL, brelse() eats this */
+			brelse(path[i].p_bh);
+			path[i].p_bh = NULL;
+			i--;
+			ext_debug("return to level %d\n", i);
+		}
+	}
+
+	trace_ext4_ext_remove_space_done(inode, start, depth, partial_cluster,
+			path->p_hdr->eh_entries);
+
+	/* If we still have something in the partial cluster and we have removed
+	 * even the first extent, then we should free the blocks in the partial
+	 * cluster as well. */
+	if (partial_cluster && path->p_hdr->eh_entries == 0) {
+		int flags = EXT4_FREE_BLOCKS_FORGET;
+
+		if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode))
+			flags |= EXT4_FREE_BLOCKS_METADATA;
+
+		ext4_free_blocks(handle, inode, NULL,
+				 EXT4_C2B(EXT4_SB(sb), partial_cluster),
+				 EXT4_SB(sb)->s_cluster_ratio, flags);
+		partial_cluster = 0;
+	}
+
+	/* TODO: flexible tree reduction should be here */
+	if (path->p_hdr->eh_entries == 0) {
+		/*
+		 * truncate to zero freed all the tree,
+		 * so we need to correct eh_depth
+		 */
+		err = ext4_ext_get_access(handle, inode, path);
+		if (err == 0) {
+			ext_inode_hdr(inode)->eh_depth = 0;
+			ext_inode_hdr(inode)->eh_max =
+				cpu_to_le16(ext4_ext_space_root(inode, 0));
+			err = ext4_ext_dirty(handle, inode, path);
+		}
+	}
+out:
+	ext4_ext_drop_refs(path);
+	kfree(path);
+	/* a helper asked for a restart: redo the whole removal */
+	if (err == -EAGAIN)
+		goto again;
+	ext4_journal_stop(handle);
+
+	return err;
+}
+
+/*
+ * called at mount time
+ *
+ * Does nothing unless the file system has the extents feature. With it,
+ * a one-line banner listing the compiled-in debugging options is printed
+ * if any of AGGRESSIVE_TEST, CHECK_BINSEARCH or EXTENTS_STATS is defined,
+ * and the extent statistics are reset when EXTENTS_STATS is defined.
+ */
+void ext4_ext_init(struct super_block *sb)
+{
+ /*
+ * possible initialization would be here
+ */
+
+ if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_EXTENTS)) {
+#if defined(AGGRESSIVE_TEST) || defined(CHECK_BINSEARCH) || defined(EXTENTS_STATS)
+ /* the message is assembled from adjacent string literals */
+ printk(KERN_INFO "EXT4-fs: file extents enabled"
+#ifdef AGGRESSIVE_TEST
+ ", aggressive tests"
+#endif
+#ifdef CHECK_BINSEARCH
+ ", check binsearch"
+#endif
+#ifdef EXTENTS_STATS
+ ", stats"
+#endif
+ "\n");
+#endif
+#ifdef EXTENTS_STATS
+ spin_lock_init(&EXT4_SB(sb)->s_ext_stats_lock);
+ /* start the minimum high so the first removed extent lowers it */
+ EXT4_SB(sb)->s_ext_min = 1 << 30;
+ EXT4_SB(sb)->s_ext_max = 0;
+#endif
+ }
+}
+
+/*
+ * called at umount time
+ *
+ * Only has an effect when EXTENTS_STATS is defined and the file system
+ * uses extents: the statistics gathered while blocks were removed are
+ * printed, provided at least one block and one extent were counted.
+ */
+void ext4_ext_release(struct super_block *sb)
+{
+	if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_EXTENTS)) {
+#ifdef EXTENTS_STATS
+		struct ext4_sb_info *sbi = EXT4_SB(sb);
+
+		/* both counters must be non-zero, the average divides them */
+		if (sbi->s_ext_blocks && sbi->s_ext_extents) {
+			printk(KERN_ERR "EXT4-fs: %lu blocks in %lu extents (%lu ave)\n",
+				sbi->s_ext_blocks, sbi->s_ext_extents,
+				sbi->s_ext_blocks / sbi->s_ext_extents);
+			printk(KERN_ERR "EXT4-fs: extents: %lu min, %lu max, max depth %lu\n",
+				sbi->s_ext_min, sbi->s_ext_max, sbi->s_depth_max);
+		}
+#endif
+	}
+}
+
+/*
+ * ext4_ext_zeroout:
+ * Issues a zero-out for all blocks that extent @ex maps. Returns 0 on
+ * success (positive results of the block layer call are folded into 0) or
+ * the negative error code it reported.
+ *
+ * FIXME!! we need to try to merge to left or right after zero-out
+ */
+static int ext4_ext_zeroout(struct inode *inode, struct ext4_extent *ex)
+{
+	ext4_fsblk_t first_pblk = ext4_ext_pblock(ex);
+	unsigned int nr_blocks = ext4_ext_get_actual_len(ex);
+	int rc;
+
+	rc = sb_issue_zeroout(inode->i_sb, first_pblk, nr_blocks, GFP_NOFS);
+
+	return rc > 0 ? 0 : rc;
+}
+
+/*
+ * ext4_split_extent_at() splits an extent at given block.
+ *
+ * @handle: the journal handle
+ * @inode: the file inode
+ * @path: the path to the extent
+ * @split: the logical block where the extent is split.
+ * @split_flag: indicates whether the extent may be zeroed out if the split
+ * fails, and the states (init or uninit) of the new extents.
+ * @flags: flags used to insert new extent to extent tree.
+ *
+ *
+ * Splits extent [a, b] into two extents [a, @split) and [@split, b], states
+ * of which are determined by split_flag.
+ *
+ * There are two cases:
+ * a> the extent is split into two extents.
+ * b> split is not needed, and just mark the extent.
+ *
+ * return 0 on success.
+ */
+static int ext4_split_extent_at(handle_t *handle,
+ struct inode *inode,
+ struct ext4_ext_path *path,
+ ext4_lblk_t split,
+ int split_flag,
+ int flags)
+{
+ ext4_fsblk_t newblock;
+ ext4_lblk_t ee_block;
+ struct ext4_extent *ex, newex, orig_ex;
+ struct ext4_extent *ex2 = NULL;
+ unsigned int ee_len, depth;
+ int err = 0;
+
+ ext_debug("ext4_split_extents_at: inode %lu, logical"
+ "block %llu\n", inode->i_ino, (unsigned long long)split);
+
+ ext4_ext_show_leaf(inode, path);
+
+ depth = ext_depth(inode);
+ ex = path[depth].p_ext;
+ ee_block = le32_to_cpu(ex->ee_block);
+ ee_len = ext4_ext_get_actual_len(ex);
+ /* physical block that corresponds to logical block @split */
+ newblock = split - ee_block + ext4_ext_pblock(ex);
+
+ BUG_ON(split < ee_block || split >= (ee_block + ee_len));
+
+ err = ext4_ext_get_access(handle, inode, path + depth);
+ if (err)
+ goto out;
+
+ if (split == ee_block) {
+ /*
+ * case b: block @split is the block that the extent begins with
+ * then we just change the state of the extent, and splitting
+ * is not needed.
+ */
+ if (split_flag & EXT4_EXT_MARK_UNINIT2)
+ ext4_ext_mark_uninitialized(ex);
+ else
+ ext4_ext_mark_initialized(ex);
+
+ if (!(flags & EXT4_GET_BLOCKS_PRE_IO))
+ ext4_ext_try_to_merge(inode, path, ex);
+
+ err = ext4_ext_dirty(handle, inode, path + depth);
+ goto out;
+ }
+
+ /* case a */
+ /* keep a copy so the length can be restored if the split fails */
+ memcpy(&orig_ex, ex, sizeof(orig_ex));
+ ex->ee_len = cpu_to_le16(split - ee_block);
+ if (split_flag & EXT4_EXT_MARK_UNINIT1)
+ ext4_ext_mark_uninitialized(ex);
+
+ /*
+ * path may lead to new leaf, not to original leaf any more
+ * after ext4_ext_insert_extent() returns,
+ */
+ err = ext4_ext_dirty(handle, inode, path + depth);
+ if (err)
+ goto fix_extent_len;
+
+ /* build the second half [@split, b] and insert it as a new extent */
+ ex2 = &newex;
+ ex2->ee_block = cpu_to_le32(split);
+ ex2->ee_len = cpu_to_le16(ee_len - (split - ee_block));
+ ext4_ext_store_pblock(ex2, newblock);
+ if (split_flag & EXT4_EXT_MARK_UNINIT2)
+ ext4_ext_mark_uninitialized(ex2);
+
+ err = ext4_ext_insert_extent(handle, inode, path, &newex, flags);
+ if (err == -ENOSPC && (EXT4_EXT_MAY_ZEROOUT & split_flag)) {
+ /*
+ * No room for a second extent: zero out the blocks of the
+ * original extent instead and keep it in one piece.
+ */
+ err = ext4_ext_zeroout(inode, &orig_ex);
+ if (err)
+ goto fix_extent_len;
+ /* update the extent length and mark as initialized */
+ ex->ee_len = cpu_to_le16(ee_len);
+ ext4_ext_try_to_merge(inode, path, ex);
+ err = ext4_ext_dirty(handle, inode, path + depth);
+ goto out;
+ } else if (err)
+ goto fix_extent_len;
+
+out:
+ ext4_ext_show_leaf(inode, path);
+ return err;
+
+fix_extent_len:
+ /* undo the shortening of the first half; err is returned unchanged */
+ ex->ee_len = orig_ex.ee_len;
+ ext4_ext_dirty(handle, inode, path + depth);
+ return err;
+}
+
+/*
+ * ext4_split_extent() splits an extent and marks the part which is covered
+ * by @map as split_flag indicates
+ *
+ * It may result in splitting the extent into multiple extents (up to three)
+ * There are three possibilities:
+ * a> There is no split required
+ * b> Splits in two extents: Split is happening at either end of the extent
+ * c> Splits in three extents: Someone is splitting in middle of the extent
+ *
+ * Returns map->m_len on success or a negative error code.
+ */
+static int ext4_split_extent(handle_t *handle,
+ struct inode *inode,
+ struct ext4_ext_path *path,
+ struct ext4_map_blocks *map,
+ int split_flag,
+ int flags)
+{
+ ext4_lblk_t ee_block;
+ struct ext4_extent *ex;
+ unsigned int ee_len, depth;
+ int err = 0;
+ int uninitialized;
+ int split_flag1, flags1;
+
+ depth = ext_depth(inode);
+ ex = path[depth].p_ext;
+ ee_block = le32_to_cpu(ex->ee_block);
+ ee_len = ext4_ext_get_actual_len(ex);
+ uninitialized = ext4_ext_is_uninitialized(ex);
+
+ /* first cut: at the end of @map, if the extent reaches beyond it */
+ if (map->m_lblk + map->m_len < ee_block + ee_len) {
+ split_flag1 = split_flag & EXT4_EXT_MAY_ZEROOUT ?
+ EXT4_EXT_MAY_ZEROOUT : 0;
+ flags1 = flags | EXT4_GET_BLOCKS_PRE_IO;
+ /* both halves keep the state of the original extent */
+ if (uninitialized)
+ split_flag1 |= EXT4_EXT_MARK_UNINIT1 |
+ EXT4_EXT_MARK_UNINIT2;
+ err = ext4_split_extent_at(handle, inode, path,
+ map->m_lblk + map->m_len, split_flag1, flags1);
+ if (err)
+ goto out;
+ }
+
+ /* look the extent up again before the second cut */
+ ext4_ext_drop_refs(path);
+ path = ext4_ext_find_extent(inode, map->m_lblk, path);
+ if (IS_ERR(path))
+ return PTR_ERR(path);
+
+ /* second cut: at the start of @map */
+ if (map->m_lblk >= ee_block) {
+ split_flag1 = split_flag & EXT4_EXT_MAY_ZEROOUT ?
+ EXT4_EXT_MAY_ZEROOUT : 0;
+ if (uninitialized)
+ split_flag1 |= EXT4_EXT_MARK_UNINIT1;
+ if (split_flag & EXT4_EXT_MARK_UNINIT2)
+ split_flag1 |= EXT4_EXT_MARK_UNINIT2;
+ err = ext4_split_extent_at(handle, inode, path,
+ map->m_lblk, split_flag1, flags);
+ if (err)
+ goto out;
+ }
+
+ ext4_ext_show_leaf(inode, path);
+out:
+ return err ? err : map->m_len;
+}
+
+#define EXT4_EXT_ZERO_LEN 7
+/*
+ * This function is called by ext4_ext_map_blocks() if someone tries to write
+ * to an uninitialized extent. It may result in splitting the uninitialized
+ * extent into multiple extents (up to three - one initialized and two
+ * uninitialized).
+ * There are three possibilities:
+ *   a> There is no split required: Entire extent should be initialized
+ *   b> Splits in two extents: Write is happening at either end of the extent
+ *   c> Splits in three extents: Someone is writing in middle of the extent
+ *
+ * Pre-conditions:
+ *  - The extent pointed to by 'path' is uninitialized.
+ *  - The extent pointed to by 'path' contains a superset
+ *    of the logical span [map->m_lblk, map->m_lblk + map->m_len).
+ *
+ * Post-conditions on success:
+ *  - the returned value is the number of blocks beyond map->m_lblk
+ *    that are allocated and initialized.
+ *    It is guaranteed to be >= map->m_len.
+ *
+ * On failure the error code of the failing helper is returned instead.
+ */
+static int ext4_ext_convert_to_initialized(handle_t *handle,
+					   struct inode *inode,
+					   struct ext4_map_blocks *map,
+					   struct ext4_ext_path *path)
+{
+	struct ext4_extent_header *eh;
+	struct ext4_map_blocks split_map;
+	struct ext4_extent zero_ex;
+	struct ext4_extent *ex;
+	ext4_lblk_t ee_block, eof_block;
+	unsigned int ee_len, depth;
+	int allocated;
+	int err = 0;
+	int split_flag = 0;
+
+	ext_debug("ext4_ext_convert_to_initialized: inode %lu, logical"
+		"block %llu, max_blocks %u\n", inode->i_ino,
+		(unsigned long long)map->m_lblk, map->m_len);
+
+	/*
+	 * eof_block is the first block past i_size, or past the end of the
+	 * requested range if that reaches further.
+	 */
+	eof_block = (inode->i_size + inode->i_sb->s_blocksize - 1) >>
+		inode->i_sb->s_blocksize_bits;
+	if (eof_block < map->m_lblk + map->m_len)
+		eof_block = map->m_lblk + map->m_len;
+
+	depth = ext_depth(inode);
+	eh = path[depth].p_hdr;
+	ex = path[depth].p_ext;
+	ee_block = le32_to_cpu(ex->ee_block);
+	ee_len = ext4_ext_get_actual_len(ex);
+	/* Blocks of the extent from m_lblk up to the end of the extent */
+	allocated = ee_len - (map->m_lblk - ee_block);
+
+	trace_ext4_ext_convert_to_initialized_enter(inode, map, ex);
+
+	/* Pre-conditions */
+	BUG_ON(!ext4_ext_is_uninitialized(ex));
+	BUG_ON(!in_range(map->m_lblk, ee_block, ee_len));
+
+	/*
+	 * Attempt to transfer newly initialized blocks from the currently
+	 * uninitialized extent to its left neighbor. This is much cheaper
+	 * than an insertion followed by a merge as those involve costly
+	 * memmove() calls. This is the common case in steady state for
+	 * workloads doing fallocate(FALLOC_FL_KEEP_SIZE) followed by append
+	 * writes.
+	 *
+	 * Limitations of the current logic:
+	 *  - L1: we only deal with writes at the start of the extent.
+	 *    The approach could be extended to writes at the end
+	 *    of the extent but this scenario was deemed less common.
+	 *  - L2: we do not deal with writes covering the whole extent.
+	 *    This would require removing the extent if the transfer
+	 *    is possible.
+	 *  - L3: we only attempt to merge with an extent stored in the
+	 *    same extent tree node.
+	 */
+	if ((map->m_lblk == ee_block) &&	/*L1*/
+	    (map->m_len < ee_len) &&		/*L2*/
+	    (ex > EXT_FIRST_EXTENT(eh))) {	/*L3*/
+		struct ext4_extent *prev_ex;
+		ext4_lblk_t prev_lblk;
+		ext4_fsblk_t prev_pblk, ee_pblk;
+		unsigned int prev_len, write_len;
+
+		prev_ex = ex - 1;
+		prev_lblk = le32_to_cpu(prev_ex->ee_block);
+		prev_len = ext4_ext_get_actual_len(prev_ex);
+		prev_pblk = ext4_ext_pblock(prev_ex);
+		ee_pblk = ext4_ext_pblock(ex);
+		write_len = map->m_len;
+
+		/*
+		 * A transfer of blocks from 'ex' to 'prev_ex' is allowed
+		 * upon those conditions:
+		 * - C1: prev_ex is initialized,
+		 * - C2: prev_ex is logically abutting ex,
+		 * - C3: prev_ex is physically abutting ex,
+		 * - C4: prev_ex can receive the additional blocks without
+		 *   overflowing the (initialized) length limit.
+		 */
+		if ((!ext4_ext_is_uninitialized(prev_ex)) &&		/*C1*/
+		    ((prev_lblk + prev_len) == ee_block) &&		/*C2*/
+		    ((prev_pblk + prev_len) == ee_pblk) &&		/*C3*/
+		    (prev_len < (EXT_INIT_MAX_LEN - write_len))) {	/*C4*/
+			err = ext4_ext_get_access(handle, inode, path + depth);
+			if (err)
+				goto out;
+
+			trace_ext4_ext_convert_to_initialized_fastpath(inode,
+				map, ex, prev_ex);
+
+			/* Shift the start of ex by 'write_len' blocks */
+			ex->ee_block = cpu_to_le32(ee_block + write_len);
+			ext4_ext_store_pblock(ex, ee_pblk + write_len);
+			ex->ee_len = cpu_to_le16(ee_len - write_len);
+			ext4_ext_mark_uninitialized(ex); /* Restore the flag */
+
+			/* Extend prev_ex by 'write_len' blocks */
+			prev_ex->ee_len = cpu_to_le16(prev_len + write_len);
+
+			/*
+			 * Mark the block containing both extents as dirty.
+			 * The result must not be dropped: if dirtying fails
+			 * the caller has to see the error rather than a
+			 * successful block count.
+			 */
+			err = ext4_ext_dirty(handle, inode, path + depth);
+
+			/* Update path to point to the right extent */
+			path[depth].p_ext = prev_ex;
+
+			/* Result: number of initialized blocks past m_lblk */
+			allocated = write_len;
+			goto out;
+		}
+	}
+
+	WARN_ON(map->m_lblk < ee_block);
+	/*
+	 * It is safe to convert extent to initialized via explicit
+	 * zeroout only if extent is fully inside i_size or new_size.
+	 */
+	split_flag |= ee_block + ee_len <= eof_block ? EXT4_EXT_MAY_ZEROOUT : 0;
+
+	/* If extent has at most 2*EXT4_EXT_ZERO_LEN blocks, zeroout directly */
+	if (ee_len <= 2*EXT4_EXT_ZERO_LEN &&
+	    (EXT4_EXT_MAY_ZEROOUT & split_flag)) {
+		err = ext4_ext_zeroout(inode, ex);
+		if (err)
+			goto out;
+
+		err = ext4_ext_get_access(handle, inode, path + depth);
+		if (err)
+			goto out;
+		ext4_ext_mark_initialized(ex);
+		ext4_ext_try_to_merge(inode, path, ex);
+		err = ext4_ext_dirty(handle, inode, path + depth);
+		goto out;
+	}
+
+	/*
+	 * four cases:
+	 * 1. split the extent into three extents.
+	 * 2. split the extent into two extents, zeroout the first half.
+	 * 3. split the extent into two extents, zeroout the second half.
+	 * 4. split the extent into two extents without zeroout.
+	 */
+	split_map.m_lblk = map->m_lblk;
+	split_map.m_len = map->m_len;
+
+	if (allocated > map->m_len) {
+		if (allocated <= EXT4_EXT_ZERO_LEN &&
+		    (EXT4_EXT_MAY_ZEROOUT & split_flag)) {
+			/* case 3: zero from m_lblk to the end of the extent */
+			zero_ex.ee_block =
+					 cpu_to_le32(map->m_lblk);
+			zero_ex.ee_len = cpu_to_le16(allocated);
+			ext4_ext_store_pblock(&zero_ex,
+				ext4_ext_pblock(ex) + map->m_lblk - ee_block);
+			err = ext4_ext_zeroout(inode, &zero_ex);
+			if (err)
+				goto out;
+			split_map.m_lblk = map->m_lblk;
+			split_map.m_len = allocated;
+		} else if ((map->m_lblk - ee_block + map->m_len <
+			   EXT4_EXT_ZERO_LEN) &&
+			   (EXT4_EXT_MAY_ZEROOUT & split_flag)) {
+			/* case 2: zero from the extent start up to m_lblk */
+			if (map->m_lblk != ee_block) {
+				zero_ex.ee_block = ex->ee_block;
+				zero_ex.ee_len = cpu_to_le16(map->m_lblk -
+							ee_block);
+				ext4_ext_store_pblock(&zero_ex,
+						      ext4_ext_pblock(ex));
+				err = ext4_ext_zeroout(inode, &zero_ex);
+				if (err)
+					goto out;
+			}
+
+			split_map.m_lblk = ee_block;
+			split_map.m_len = map->m_lblk - ee_block + map->m_len;
+			allocated = map->m_len;
+		}
+	}
+
+	allocated = ext4_split_extent(handle, inode, path,
+				       &split_map, split_flag, 0);
+	if (allocated < 0)
+		err = allocated;
+
+out:
+	return err ? err : allocated;
+}
+
+/*
+ * Called by ext4_ext_map_blocks() (from ext4_get_blocks_dio_write()) when
+ * direct IO is about to write into an uninitialized extent.
+ *
+ * The extent found via 'path' is split around the range described by 'map'
+ * before the IO is submitted; depending on where the write falls this
+ * leaves one, two or at most three uninitialized extents:
+ *   a> no split needed: the write covers the whole extent
+ *   b> two extents: the write touches either end of the extent
+ *   c> three extents: the write lands in the middle of the extent
+ *
+ * Splitting may grow the extent tree and so need new index blocks. Doing
+ * the split up front keeps ENOSPC from showing up at IO completion time.
+ * Once the IO completes, the written part is converted to initialized by
+ * the end_io callback via ext4_convert_unwritten_extents().
+ *
+ * Returns whatever ext4_split_extent() returns: the length of the range to
+ * be written on success, an error code otherwise.
+ */
+static int ext4_split_unwritten_extents(handle_t *handle,
+					struct inode *inode,
+					struct ext4_map_blocks *map,
+					struct ext4_ext_path *path,
+					int flags)
+{
+	struct ext4_extent *ex;
+	ext4_lblk_t ee_block, eof_block;
+	unsigned int ee_len;
+	int depth;
+	/* The piece split off for this request always stays uninitialized. */
+	int split_flag = EXT4_EXT_MARK_UNINIT2;
+
+	ext_debug("ext4_split_unwritten_extents: inode %lu, logical"
+		"block %llu, max_blocks %u\n", inode->i_ino,
+		(unsigned long long)map->m_lblk, map->m_len);
+
+	/* First block past i_size, or past the request if that is further. */
+	eof_block = (inode->i_size + inode->i_sb->s_blocksize - 1) >>
+		inode->i_sb->s_blocksize_bits;
+	if (eof_block < map->m_lblk + map->m_len)
+		eof_block = map->m_lblk + map->m_len;
+
+	depth = ext_depth(inode);
+	ex = path[depth].p_ext;
+	ee_block = le32_to_cpu(ex->ee_block);
+	ee_len = ext4_ext_get_actual_len(ex);
+
+	/*
+	 * Converting via explicit zeroout is only safe when the extent lies
+	 * fully inside i_size or new_size.
+	 */
+	if (ee_block + ee_len <= eof_block)
+		split_flag |= EXT4_EXT_MAY_ZEROOUT;
+
+	return ext4_split_extent(handle, inode, path, map, split_flag,
+				 flags | EXT4_GET_BLOCKS_PRE_IO);
+}
+
+/*
+ * Mark the extent that 'path' points to as initialized, try to merge it
+ * with its neighbours and dirty the node that holds it.
+ *
+ * Returns 0 on success or the error reported by ext4_ext_get_access() /
+ * ext4_ext_dirty().
+ */
+static int ext4_convert_unwritten_extents_endio(handle_t *handle,
+					      struct inode *inode,
+					      struct ext4_ext_path *path)
+{
+	int depth = ext_depth(inode);
+	struct ext4_extent *ex = path[depth].p_ext;
+	struct ext4_ext_path *leaf = path + depth;
+	int err;
+
+	ext_debug("ext4_convert_unwritten_extents_endio: inode %lu, logical"
+		"block %llu, max_blocks %u\n", inode->i_ino,
+		(unsigned long long)le32_to_cpu(ex->ee_block),
+		ext4_ext_get_actual_len(ex));
+
+	err = ext4_ext_get_access(handle, inode, leaf);
+	if (!err) {
+		/* Flip the extent to initialized first ... */
+		ext4_ext_mark_initialized(ex);
+
+		/*
+		 * ... then attempt a merge. ext4_ext_correct_indexes() is
+		 * not needed because the extent borders do not change.
+		 */
+		ext4_ext_try_to_merge(inode, path, ex);
+
+		/* Finally record the modification. */
+		err = ext4_ext_dirty(handle, inode, leaf);
+	}
+
+	ext4_ext_show_leaf(inode, path);
+	return err;
+}
+
+/*
+ * Call unmap_underlying_metadata() for each of the 'count' consecutive
+ * blocks of 'bdev' starting at 'block'.
+ */
+static void unmap_underlying_metadata_blocks(struct block_device *bdev,
+					     sector_t block, int count)
+{
+	int done = 0;
+
+	while (done < count) {
+		unmap_underlying_metadata(bdev, block + done);
+		done++;
+	}
+}
+
+/*
+ * Clear EXT4_INODE_EOFBLOCKS on 'inode' if the range [lblk, lblk + len)
+ * reaches the end of the last extent of the file.
+ *
+ * Returns 0 when the flag is not set or has to stay; otherwise the flag is
+ * cleared and the result of ext4_mark_inode_dirty() is returned.
+ */
+static int check_eofblocks_fl(handle_t *handle, struct inode *inode,
+			      ext4_lblk_t lblk,
+			      struct ext4_ext_path *path,
+			      unsigned int len)
+{
+	struct ext4_extent_header *eh;
+	struct ext4_extent *last_ex;
+	int depth, level;
+
+	/* Nothing to do unless the flag is currently set. */
+	if (!ext4_test_inode_flag(inode, EXT4_INODE_EOFBLOCKS))
+		return 0;
+
+	depth = ext_depth(inode);
+	eh = path[depth].p_hdr;
+
+	/*
+	 * EOFBLOCKS_FL is going to be removed entirely in the future, so the
+	 * no-extents case gets no special care: simply drop the flag.
+	 */
+	if (unlikely(!eh->eh_entries))
+		goto clear_flag;
+
+	/*
+	 * The flag may only go away when the caller is interested in the
+	 * last block of the last extent in this leaf, or in a block beyond
+	 * it. If the request stops short of that, bail out right away.
+	 */
+	last_ex = EXT_LAST_EXTENT(eh);
+	if (lblk + len < le32_to_cpu(last_ex->ee_block) +
+			 ext4_ext_get_actual_len(last_ex))
+		return 0;
+
+	/*
+	 * The request reaches the end of this leaf's last extent. That
+	 * extent is the last one in the file only if the leaf was reached
+	 * through the rightmost index at every level of the tree.
+	 */
+	level = depth;
+	while (--level >= 0) {
+		if (path[level].p_idx != EXT_LAST_INDEX(path[level].p_hdr))
+			return 0;
+	}
+
+clear_flag:
+	ext4_clear_inode_flag(inode, EXT4_INODE_EOFBLOCKS);
+	return ext4_mark_inode_dirty(handle, inode);
+}
+
+/**
+ * ext4_find_delalloc_range: find delayed allocated block in the given range.
+ * @inode: inode whose page cache (inode->i_mapping) is searched
+ * @lblk_start: first logical block of the range (inclusive)
+ * @lblk_end: last logical block of the range (inclusive)
+ * @search_hint_reverse: non-zero to search from lblk_end down to lblk_start
+ *
+ * Goes through the buffer heads in the range [lblk_start, lblk_end] and returns
+ * whether there are any buffers marked for delayed allocation. It returns '1'
+ * on the first delalloc'ed buffer head found. If no buffer head in the given
+ * range is marked for delalloc, it returns 0. It also returns 0 right away
+ * when the filesystem is not mounted with the DELALLOC option.
+ * lblk_start should always be <= lblk_end.
+ * search_hint_reverse is to indicate that searching in reverse from lblk_end to
+ * lblk_start might be more efficient (i.e., we will likely hit the delalloc'ed
+ * block sooner). This is useful when blocks are truncated sequentially from
+ * lblk_start towards lblk_end. The hint is ignored when the fs block size is
+ * smaller than the page size.
+ */
+static int ext4_find_delalloc_range(struct inode *inode,
+ ext4_lblk_t lblk_start,
+ ext4_lblk_t lblk_end,
+ int search_hint_reverse)
+{
+ struct address_space *mapping = inode->i_mapping;
+ struct buffer_head *head, *bh = NULL;
+ struct page *page;
+ ext4_lblk_t i, pg_lblk;
+ pgoff_t index;
+
+ if (!test_opt(inode->i_sb, DELALLOC))
+ return 0;
+
+ /* reverse search won't work if fs block size is less than page size */
+ if (inode->i_blkbits < PAGE_CACHE_SHIFT)
+ search_hint_reverse = 0;
+
+ if (search_hint_reverse)
+ i = lblk_end;
+ else
+ i = lblk_start;
+
+ index = i >> (PAGE_CACHE_SHIFT - inode->i_blkbits);
+
+ while ((i >= lblk_start) && (i <= lblk_end)) {
+ page = find_get_page(mapping, index);
+ if (!page)
+ goto nextpage;
+
+ if (!page_has_buffers(page))
+ goto nextpage;
+
+ head = page_buffers(page);
+ if (!head)
+ goto nextpage;
+
+ bh = head;
+ pg_lblk = index << (PAGE_CACHE_SHIFT -
+ inode->i_blkbits);
+ do {
+ if (unlikely(pg_lblk < lblk_start)) {
+ /*
+ * This is possible when fs block size is less
+ * than page size and our cluster starts/ends in
+ * middle of the page. So we need to skip the
+ * initial few blocks till we reach the 'lblk'.
+ * Note that 'continue' jumps to the loop
+ * condition below, which moves 'bh' on to the
+ * next buffer of the page; 'i' is left alone.
+ */
+ pg_lblk++;
+ continue;
+ }
+
+ /* Check if the buffer is delayed allocated and that it
+ * is not yet mapped. (when da-buffers are mapped during
+ * their writeout, their da_mapped bit is set.)
+ * The page reference taken by find_get_page() is
+ * dropped before returning.
+ */
+ if (buffer_delay(bh) && !buffer_da_mapped(bh)) {
+ page_cache_release(page);
+ trace_ext4_find_delalloc_range(inode,
+ lblk_start, lblk_end,
+ search_hint_reverse,
+ 1, i);
+ return 1;
+ }
+ if (search_hint_reverse)
+ i--;
+ else
+ i++;
+ } while ((i >= lblk_start) && (i <= lblk_end) &&
+ ((bh = bh->b_this_page) != head));
+nextpage:
+ if (page)
+ page_cache_release(page);
+ /*
+ * Move to next page. 'i' will be the first lblk in the next
+ * page. (In a reverse search 'index' is decremented instead,
+ * so this is the preceding page.)
+ */
+ if (search_hint_reverse)
+ index--;
+ else
+ index++;
+ i = index << (PAGE_CACHE_SHIFT - inode->i_blkbits);
+ }
+
+ trace_ext4_find_delalloc_range(inode, lblk_start, lblk_end,
+ search_hint_reverse, 0, 0);
+ return 0;
+}
+
+/*
+ * Check whether the cluster containing 'lblk' has any delayed allocated
+ * block: the block number is widened to the first and last block of its
+ * cluster and that range is handed to ext4_find_delalloc_range(), whose
+ * result is returned unchanged.
+ */
+int ext4_find_delalloc_cluster(struct inode *inode, ext4_lblk_t lblk,
+			       int search_hint_reverse)
+{
+	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
+	ext4_lblk_t cluster_first, cluster_last;
+
+	/* Round down to the cluster boundary, then span one whole cluster. */
+	cluster_first = lblk & (~(sbi->s_cluster_ratio - 1));
+	cluster_last = cluster_first + sbi->s_cluster_ratio - 1;
+
+	return ext4_find_delalloc_range(inode, cluster_first, cluster_last,
+					search_hint_reverse);
+}
+
+/**
+ * Determines how many complete clusters (out of those specified by the 'map')
+ * are under delalloc and were reserved quota for.
+ * This function is called when we are writing out the blocks that were
+ * originally written with their allocation delayed, but then the space was
+ * allocated using fallocate() before the delayed allocation could be resolved.
+ * The cases to look for are:
+ * ('=' indicated delayed allocated blocks
+ * '-' indicates non-delayed allocated blocks)
+ * (a) partial clusters towards beginning and/or end outside of allocated range
+ * are not delalloc'ed.
+ * Ex:
+ * |----c---=|====c====|====c====|===-c----|
+ * |++++++ allocated ++++++|
+ * ==> 4 complete clusters in above example
+ *
+ * (b) partial cluster (outside of allocated range) towards either end is
+ * marked for delayed allocation. In this case, we will exclude that
+ * cluster.
+ * Ex:
+ * |----====c========|========c========|
+ * |++++++ allocated ++++++|
+ * ==> 1 complete clusters in above example
+ *
+ * Ex:
+ * |================c================|
+ * |++++++ allocated ++++++|
+ * ==> 0 complete clusters in above example
+ *
+ * The ext4_da_update_reserve_space will be called only if we
+ * determine here that there were some "entire" clusters that span
+ * this 'allocated' range.
+ * In the non-bigalloc case, this function will just end up returning num_blks
+ * without ever calling ext4_find_delalloc_range.
+ */
+static unsigned int
+get_reserved_cluster_alloc(struct inode *inode, ext4_lblk_t lblk_start,
+			   unsigned int num_blks)
+{
+	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
+	ext4_lblk_t first_cluster, last_cluster;
+	ext4_lblk_t range_lo, range_hi, in_cluster;
+	unsigned int nr_clusters = 0;
+
+	first_cluster = EXT4_B2C(sbi, lblk_start);
+	last_cluster = EXT4_B2C(sbi, lblk_start + num_blks - 1);
+
+	/* Upper bound: every cluster the allocated range touches. */
+	nr_clusters = last_cluster - first_cluster + 1;
+
+	trace_ext4_get_reserved_cluster_alloc(inode, lblk_start, num_blks);
+
+	/*
+	 * Left edge: if the range starts in the middle of a cluster, look at
+	 * the blocks of that cluster in front of the range. A delalloc block
+	 * there means the first cluster is not counted.
+	 */
+	in_cluster = lblk_start & (sbi->s_cluster_ratio - 1);
+	if (in_cluster) {
+		range_lo = lblk_start & (~(sbi->s_cluster_ratio - 1));
+		range_hi = range_lo + in_cluster - 1;
+
+		if (ext4_find_delalloc_range(inode, range_lo, range_hi, 0))
+			nr_clusters--;
+	}
+
+	/*
+	 * Right edge: same check for the blocks between the end of the range
+	 * and the end of its last cluster, skipped once nothing is left to
+	 * subtract from.
+	 */
+	in_cluster = (lblk_start + num_blks) & (sbi->s_cluster_ratio - 1);
+	if (nr_clusters && in_cluster) {
+		range_lo = lblk_start + num_blks;
+		range_hi = range_lo + (sbi->s_cluster_ratio - in_cluster) - 1;
+
+		if (ext4_find_delalloc_range(inode, range_lo, range_hi, 0))
+			nr_clusters--;
+	}
+
+	return nr_clusters;
+}
+
+/*
+ * Handle a mapping request from ext4_ext_map_blocks() that landed inside an
+ * uninitialized extent. What happens depends on 'flags':
+ *  - EXT4_GET_BLOCKS_PRE_IO: split the extent around the request before the
+ *    IO is submitted and flag the IO (or the inode) for later conversion;
+ *  - EXT4_GET_BLOCKS_CONVERT: the IO has completed, mark the extent
+ *    initialized;
+ *  - EXT4_GET_BLOCKS_UNINIT_EXT: repeated fallocate request, the unwritten
+ *    extent is already there, just report it as mapped;
+ *  - no EXT4_GET_BLOCKS_CREATE: plain lookup, report the range with
+ *    EXT4_MAP_UNWRITTEN set;
+ *  - otherwise (buffered write): convert the range to initialized.
+ *
+ * 'allocated' is the number of blocks from map->m_lblk to the end of the
+ * extent and 'newblock' the physical block that map->m_lblk maps to.
+ *
+ * 'path' is consumed: its references are dropped and it is freed before
+ * returning.
+ *
+ * Returns the number of blocks mapped, or an error code.
+ */
+static int
+ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode,
+			struct ext4_map_blocks *map,
+			struct ext4_ext_path *path, int flags,
+			unsigned int allocated, ext4_fsblk_t newblock)
+{
+	int ret = 0;
+	int err = 0;
+	ext4_io_end_t *io = EXT4_I(inode)->cur_aio_dio;
+
+	ext_debug("ext4_ext_handle_uninitialized_extents: inode %lu, logical "
+		  "block %llu, max_blocks %u, flags %x, allocated %u\n",
+		  inode->i_ino, (unsigned long long)map->m_lblk, map->m_len,
+		  flags, allocated);
+	ext4_ext_show_leaf(inode, path);
+
+	trace_ext4_ext_handle_uninitialized_extents(inode, map, allocated,
+						    newblock);
+
+	/* get_block() before submit the IO, split the extent */
+	if ((flags & EXT4_GET_BLOCKS_PRE_IO)) {
+		ret = ext4_split_unwritten_extents(handle, inode, map,
+						   path, flags);
+		/*
+		 * If the split did not succeed no IO will be issued against
+		 * this range, so neither the io_end nor the inode may be
+		 * flagged as needing an unwritten->written conversion.
+		 */
+		if (ret <= 0)
+			goto out;
+		/*
+		 * Flag the inode(non aio case) or end_io struct (aio case)
+		 * that this IO needs to conversion to written when IO is
+		 * completed
+		 */
+		if (io)
+			ext4_set_io_unwritten_flag(inode, io);
+		else
+			ext4_set_inode_state(inode, EXT4_STATE_DIO_UNWRITTEN);
+		if (ext4_should_dioread_nolock(inode))
+			map->m_flags |= EXT4_MAP_UNINIT;
+		goto out;
+	}
+	/* IO end_io complete, convert the filled extent to written */
+	if ((flags & EXT4_GET_BLOCKS_CONVERT)) {
+		ret = ext4_convert_unwritten_extents_endio(handle, inode,
+							path);
+		if (ret >= 0) {
+			ext4_update_inode_fsync_trans(handle, inode, 1);
+			err = check_eofblocks_fl(handle, inode, map->m_lblk,
+						 path, map->m_len);
+		} else
+			err = ret;
+		goto out2;
+	}
+	/* buffered IO case */
+	/*
+	 * repeat fallocate creation request
+	 * we already have an unwritten extent
+	 */
+	if (flags & EXT4_GET_BLOCKS_UNINIT_EXT)
+		goto map_out;
+
+	/* buffered READ or buffered write_begin() lookup */
+	if ((flags & EXT4_GET_BLOCKS_CREATE) == 0) {
+		/*
+		 * We have blocks reserved already.  We
+		 * return allocated blocks so that delalloc
+		 * won't do block reservation for us.  But
+		 * the buffer head will be unmapped so that
+		 * a read from the block returns 0s.
+		 */
+		map->m_flags |= EXT4_MAP_UNWRITTEN;
+		goto out1;
+	}
+
+	/* buffered write, writepage time, convert*/
+	ret = ext4_ext_convert_to_initialized(handle, inode, map, path);
+	if (ret >= 0)
+		ext4_update_inode_fsync_trans(handle, inode, 1);
+out:
+	/* ret <= 0: nothing was mapped, hand the result back via err */
+	if (ret <= 0) {
+		err = ret;
+		goto out2;
+	} else
+		allocated = ret;
+	map->m_flags |= EXT4_MAP_NEW;
+	/*
+	 * if we allocated more blocks than requested
+	 * we need to make sure we unmap the extra block
+	 * allocated. The actual needed block will get
+	 * unmapped later when we find the buffer_head marked
+	 * new.
+	 */
+	if (allocated > map->m_len) {
+		unmap_underlying_metadata_blocks(inode->i_sb->s_bdev,
+					newblock + map->m_len,
+					allocated - map->m_len);
+		allocated = map->m_len;
+	}
+
+	/*
+	 * If we have done fallocate with the offset that is already
+	 * delayed allocated, we would have block reservation
+	 * and quota reservation done in the delayed write path.
+	 * But fallocate would have already updated quota and block
+	 * count for this offset. So cancel these reservation
+	 */
+	if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) {
+		unsigned int reserved_clusters;
+		reserved_clusters = get_reserved_cluster_alloc(inode,
+				map->m_lblk, map->m_len);
+		if (reserved_clusters)
+			ext4_da_update_reserve_space(inode,
+						     reserved_clusters,
+						     0);
+	}
+
+map_out:
+	map->m_flags |= EXT4_MAP_MAPPED;
+	if ((flags & EXT4_GET_BLOCKS_KEEP_SIZE) == 0) {
+		err = check_eofblocks_fl(handle, inode, map->m_lblk, path,
+					 map->m_len);
+		if (err < 0)
+			goto out2;
+	}
+out1:
+	/* Never report more than the caller asked for */
+	if (allocated > map->m_len)
+		allocated = map->m_len;
+	ext4_ext_show_leaf(inode, path);
+	map->m_pblk = newblock;
+	map->m_len = allocated;
+out2:
+	/* The path is owned by this function from here on: release it */
+	if (path) {
+		ext4_ext_drop_refs(path);
+		kfree(path);
+	}
+	return err ? err : allocated;
+}
+
+/*
+ * get_implied_cluster_alloc - check to see if the requested
+ * allocation (in the map structure) overlaps with a cluster already
+ * allocated in an extent.
+ * @sb The filesystem superblock structure
+ * @map The requested lblk->pblk mapping
+ * @ex The extent structure which might contain an implied
+ * cluster allocation
+ * @path The extent tree path; handed to ext4_ext_next_allocated_block()
+ * to find the next allocated block to the right of the request
+ *
+ * This function is called by ext4_ext_map_blocks() after we failed to
+ * find blocks that were already in the inode's extent tree. Hence,
+ * we know that the beginning of the requested region cannot overlap
+ * the extent from the inode's extent tree. There are three cases we
+ * want to catch. The first is this case:
+ *
+ * |--- cluster # N--|
+ * |--- extent ---| |---- requested region ---|
+ * |==========|
+ *
+ * The second case that we need to test for is this one:
+ *
+ * |--------- cluster # N ----------------|
+ * |--- requested region --| |------- extent ----|
+ * |=======================|
+ *
+ * The third case is when the requested region lies between two extents
+ * within the same cluster:
+ * |------------- cluster # N-------------|
+ * |----- ex -----| |---- ex_right ----|
+ * |------ requested region ------|
+ * |================|
+ *
+ * In each of the above cases, we need to set the map->m_pblk and
+ * map->m_len so that they correspond to the extent labelled as
+ * "|====|" from cluster #N, since it is already in use for data in
+ * cluster EXT4_B2C(sbi, map->m_lblk). We will then return 1 to
+ * signal to ext4_ext_map_blocks() that map->m_pblk should be treated
+ * as a new "allocated" block region. Otherwise, we will return 0 and
+ * ext4_ext_map_blocks() will then allocate one or more new clusters
+ * by calling ext4_mb_new_blocks(). 'map' is only modified when 1 is
+ * returned.
+ */
+static int get_implied_cluster_alloc(struct super_block *sb,
+ struct ext4_map_blocks *map,
+ struct ext4_extent *ex,
+ struct ext4_ext_path *path)
+{
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
+ ext4_lblk_t c_offset = map->m_lblk & (sbi->s_cluster_ratio-1);
+ ext4_lblk_t ex_cluster_start, ex_cluster_end;
+ ext4_lblk_t rr_cluster_start;
+ ext4_lblk_t ee_block = le32_to_cpu(ex->ee_block);
+ ext4_fsblk_t ee_start = ext4_ext_pblock(ex);
+ unsigned short ee_len = ext4_ext_get_actual_len(ex);
+
+ /* The extent passed in that we are trying to match */
+ ex_cluster_start = EXT4_B2C(sbi, ee_block);
+ ex_cluster_end = EXT4_B2C(sbi, ee_block + ee_len - 1);
+
+ /* The requested region passed into ext4_map_blocks() */
+ rr_cluster_start = EXT4_B2C(sbi, map->m_lblk);
+
+ if ((rr_cluster_start == ex_cluster_end) ||
+ (rr_cluster_start == ex_cluster_start)) {
+ if (rr_cluster_start == ex_cluster_end)
+ ee_start += ee_len - 1;
+ map->m_pblk = (ee_start & ~(sbi->s_cluster_ratio - 1)) +
+ c_offset;
+ map->m_len = min(map->m_len,
+ (unsigned) sbi->s_cluster_ratio - c_offset);
+ /*
+ * Check for and handle this case:
+ *
+ * |--------- cluster # N-------------|
+ * |------- extent ----|
+ * |--- requested region ---|
+ * |===========|
+ *
+ * i.e. the request starts in front of the extent, so it is
+ * cut off where the extent begins.
+ */
+
+ if (map->m_lblk < ee_block)
+ map->m_len = min(map->m_len, ee_block - map->m_lblk);
+
+ /*
+ * Check for the case where there is already another allocated
+ * block to the right of 'ex' but before the end of the cluster.
+ *
+ * |------------- cluster # N-------------|
+ * |----- ex -----| |---- ex_right ----|
+ * |------ requested region ------|
+ * |================|
+ *
+ * Here the request is cut off at the next allocated block
+ * reported by ext4_ext_next_allocated_block().
+ */
+ if (map->m_lblk > ee_block) {
+ ext4_lblk_t next = ext4_ext_next_allocated_block(path);
+ map->m_len = min(map->m_len, next - map->m_lblk);
+ }
+
+ trace_ext4_get_implied_cluster_alloc_exit(sb, map, 1);
+ return 1;
+ }
+
+ trace_ext4_get_implied_cluster_alloc_exit(sb, map, 0);
+ return 0;
+}
+
+
+/*
+ * Block allocation/map/preallocation routine for extents based files
+ *
+ *
+ * Need to be called with
+ * down_read(&EXT4_I(inode)->i_data_sem) if not allocating file system block
+ * (ie, create is zero). Otherwise down_write(&EXT4_I(inode)->i_data_sem)
+ *
+ * return > 0, number of blocks already mapped/allocated
+ * if create == 0 and these are pre-allocated blocks
+ * buffer head is unmapped
+ * otherwise blocks are mapped
+ *
+ * return = 0, if plain look up failed (blocks have not been allocated)
+ * buffer head is unmapped
+ *
+ * return < 0, error case.
+ */
+int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
+ struct ext4_map_blocks *map, int flags)
+{
+ struct ext4_ext_path *path = NULL;
+ struct ext4_extent newex, *ex, *ex2;
+ struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
+ ext4_fsblk_t newblock = 0;
+ int free_on_err = 0, err = 0, depth, ret;
+ unsigned int allocated = 0, offset = 0;
+ unsigned int allocated_clusters = 0;
+ struct ext4_allocation_request ar;
+ ext4_io_end_t *io = EXT4_I(inode)->cur_aio_dio;
+ ext4_lblk_t cluster_offset;
+
+ ext_debug("blocks %u/%u requested for inode %lu\n",
+ map->m_lblk, map->m_len, inode->i_ino);
+ trace_ext4_ext_map_blocks_enter(inode, map->m_lblk, map->m_len, flags);
+
+ /* check in cache */
+ if (ext4_ext_in_cache(inode, map->m_lblk, &newex)) {
+ if (!newex.ee_start_lo && !newex.ee_start_hi) {
+ if ((sbi->s_cluster_ratio > 1) &&
+ ext4_find_delalloc_cluster(inode, map->m_lblk, 0))
+ map->m_flags |= EXT4_MAP_FROM_CLUSTER;
+
+ if ((flags & EXT4_GET_BLOCKS_CREATE) == 0) {
+ /*
+ * block isn't allocated yet and
+ * user doesn't want to allocate it
+ */
+ goto out2;
+ }
+ /* we should allocate requested block */
+ } else {
+ /* block is already allocated */
+ if (sbi->s_cluster_ratio > 1)
+ map->m_flags |= EXT4_MAP_FROM_CLUSTER;
+ newblock = map->m_lblk
+ - le32_to_cpu(newex.ee_block)
+ + ext4_ext_pblock(&newex);
+ /* number of remaining blocks in the extent */
+ allocated = ext4_ext_get_actual_len(&newex) -
+ (map->m_lblk - le32_to_cpu(newex.ee_block));
+ goto out;
+ }
+ }
+
+ /* find extent for this block */
+ path = ext4_ext_find_extent(inode, map->m_lblk, NULL);
+ if (IS_ERR(path)) {
+ err = PTR_ERR(path);
+ path = NULL;
+ goto out2;
+ }
+
+ depth = ext_depth(inode);
+
+ /*
+ * consistent leaf must not be empty;
+ * this situation is possible, though, _during_ tree modification;
+ * this is why assert can't be put in ext4_ext_find_extent()
+ */
+ if (unlikely(path[depth].p_ext == NULL && depth != 0)) {
+ EXT4_ERROR_INODE(inode, "bad extent address "
+ "lblock: %lu, depth: %d pblock %lld",
+ (unsigned long) map->m_lblk, depth,
+ path[depth].p_block);
+ err = -EIO;
+ goto out2;
+ }
+
+ ex = path[depth].p_ext;
+ if (ex) {
+ ext4_lblk_t ee_block = le32_to_cpu(ex->ee_block);
+ ext4_fsblk_t ee_start = ext4_ext_pblock(ex);
+ unsigned short ee_len;
+
+ /*
+ * Uninitialized extents are treated as holes, except that
+ * we split out initialized portions during a write.
+ */
+ ee_len = ext4_ext_get_actual_len(ex);
+
+ trace_ext4_ext_show_extent(inode, ee_block, ee_start, ee_len);
+
+ /* if found extent covers block, simply return it */
+ if (in_range(map->m_lblk, ee_block, ee_len)) {
+ newblock = map->m_lblk - ee_block + ee_start;
+ /* number of remaining blocks in the extent */
+ allocated = ee_len - (map->m_lblk - ee_block);
+ ext_debug("%u fit into %u:%d -> %llu\n", map->m_lblk,
+ ee_block, ee_len, newblock);
+
+ /*
+ * Do not put uninitialized extent
+ * in the cache
+ */
+ if (!ext4_ext_is_uninitialized(ex)) {
+ ext4_ext_put_in_cache(inode, ee_block,
+ ee_len, ee_start);
+ goto out;
+ }
+ ret = ext4_ext_handle_uninitialized_extents(
+ handle, inode, map, path, flags,
+ allocated, newblock);
+ return ret;
+ }
+ }
+
+ if ((sbi->s_cluster_ratio > 1) &&
+ ext4_find_delalloc_cluster(inode, map->m_lblk, 0))
+ map->m_flags |= EXT4_MAP_FROM_CLUSTER;
+
+ /*
+ * requested block isn't allocated yet;
+ * we couldn't try to create block if create flag is zero
+ */
+ if ((flags & EXT4_GET_BLOCKS_CREATE) == 0) {
+ /*
+ * put just found gap into cache to speed up
+ * subsequent requests
+ */
+ ext4_ext_put_gap_in_cache(inode, path, map->m_lblk);
+ goto out2;
+ }
+
+ /*
+ * Okay, we need to do block allocation.
+ */
+ map->m_flags &= ~EXT4_MAP_FROM_CLUSTER;
+ newex.ee_block = cpu_to_le32(map->m_lblk);
+ cluster_offset = map->m_lblk & (sbi->s_cluster_ratio-1);
+
+ /*
+ * If we are doing bigalloc, check to see if the extent returned
+ * by ext4_ext_find_extent() implies a cluster we can use.
+ */
+ if (cluster_offset && ex &&
+ get_implied_cluster_alloc(inode->i_sb, map, ex, path)) {
+ ar.len = allocated = map->m_len;
+ newblock = map->m_pblk;
+ map->m_flags |= EXT4_MAP_FROM_CLUSTER;
+ goto got_allocated_blocks;
+ }
+
+ /* find neighbour allocated blocks */
+ ar.lleft = map->m_lblk;
+ err = ext4_ext_search_left(inode, path, &ar.lleft, &ar.pleft);
+ if (err)
+ goto out2;
+ ar.lright = map->m_lblk;
+ ex2 = NULL;
+ err = ext4_ext_search_right(inode, path, &ar.lright, &ar.pright, &ex2);
+ if (err)
+ goto out2;
+
+ /* Check if the extent after searching to the right implies a
+ * cluster we can use. */
+ if ((sbi->s_cluster_ratio > 1) && ex2 &&
+ get_implied_cluster_alloc(inode->i_sb, map, ex2, path)) {
+ ar.len = allocated = map->m_len;
+ newblock = map->m_pblk;
+ map->m_flags |= EXT4_MAP_FROM_CLUSTER;
+ goto got_allocated_blocks;
+ }
+
+ /*
+ * See if request is beyond maximum number of blocks we can have in
+ * a single extent. For an initialized extent this limit is
+ * EXT_INIT_MAX_LEN and for an uninitialized extent this limit is
+ * EXT_UNINIT_MAX_LEN.
+ */
+ if (map->m_len > EXT_INIT_MAX_LEN &&
+ !(flags & EXT4_GET_BLOCKS_UNINIT_EXT))
+ map->m_len = EXT_INIT_MAX_LEN;
+ else if (map->m_len > EXT_UNINIT_MAX_LEN &&
+ (flags & EXT4_GET_BLOCKS_UNINIT_EXT))
+ map->m_len = EXT_UNINIT_MAX_LEN;
+
+ /* Check if we can really insert (m_lblk)::(m_lblk + m_len) extent */
+ newex.ee_len = cpu_to_le16(map->m_len);
+ err = ext4_ext_check_overlap(sbi, inode, &newex, path);
+ if (err)
+ allocated = ext4_ext_get_actual_len(&newex);
+ else
+ allocated = map->m_len;
+
+ /* allocate new block */
+ ar.inode = inode;
+ ar.goal = ext4_ext_find_goal(inode, path, map->m_lblk);
+ ar.logical = map->m_lblk;
+ /*
+ * We calculate the offset from the beginning of the cluster
+ * for the logical block number, since when we allocate a
+ * physical cluster, the physical block should start at the
+ * same offset from the beginning of the cluster. This is
+ * needed so that future calls to get_implied_cluster_alloc()
+ * work correctly.
+ */
+ offset = map->m_lblk & (sbi->s_cluster_ratio - 1);
+ ar.len = EXT4_NUM_B2C(sbi, offset+allocated);
+ ar.goal -= offset;
+ ar.logical -= offset;
+ if (S_ISREG(inode->i_mode))
+ ar.flags = EXT4_MB_HINT_DATA;
+ else
+ /* disable in-core preallocation for non-regular files */
+ ar.flags = 0;
+ if (flags & EXT4_GET_BLOCKS_NO_NORMALIZE)
+ ar.flags |= EXT4_MB_HINT_NOPREALLOC;
+ newblock = ext4_mb_new_blocks(handle, &ar, &err);
+ if (!newblock)
+ goto out2;
+ ext_debug("allocate new block: goal %llu, found %llu/%u\n",
+ ar.goal, newblock, allocated);
+ free_on_err = 1;
+ allocated_clusters = ar.len;
+ ar.len = EXT4_C2B(sbi, ar.len) - offset;
+ if (ar.len > allocated)
+ ar.len = allocated;
+
+got_allocated_blocks:
+ /* try to insert new extent into found leaf and return */
+ ext4_ext_store_pblock(&newex, newblock + offset);
+ newex.ee_len = cpu_to_le16(ar.len);
+ /* Mark uninitialized */
+ if (flags & EXT4_GET_BLOCKS_UNINIT_EXT){
+ ext4_ext_mark_uninitialized(&newex);
+ /*
+ * io_end structure was created for every IO write to an
+ * uninitialized extent. To avoid unnecessary conversion,
+ * here we flag the IO that really needs the conversion.
+ * For non async direct IO case, flag the inode state
+ * that we need to perform conversion when IO is done.
+ */
+ if ((flags & EXT4_GET_BLOCKS_PRE_IO)) {
+ if (io)
+ ext4_set_io_unwritten_flag(inode, io);
+ else
+ ext4_set_inode_state(inode,
+ EXT4_STATE_DIO_UNWRITTEN);
+ }
+ if (ext4_should_dioread_nolock(inode))
+ map->m_flags |= EXT4_MAP_UNINIT;
+ }
+
+ err = 0;
+ if ((flags & EXT4_GET_BLOCKS_KEEP_SIZE) == 0)
+ err = check_eofblocks_fl(handle, inode, map->m_lblk,
+ path, ar.len);
+ if (!err)
+ err = ext4_ext_insert_extent(handle, inode, path,
+ &newex, flags);
+ if (err && free_on_err) {
+ int fb_flags = flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE ?
+ EXT4_FREE_BLOCKS_NO_QUOT_UPDATE : 0;
+ /* free data blocks we just allocated */
+ /* not a good idea to call discard here directly,
+ * but otherwise we'd need to call it every free() */
+ ext4_discard_preallocations(inode);
+ ext4_free_blocks(handle, inode, NULL, ext4_ext_pblock(&newex),
+ ext4_ext_get_actual_len(&newex), fb_flags);
+ goto out2;
+ }
+
+ /* previous routine could use block we allocated */
+ newblock = ext4_ext_pblock(&newex);
+ allocated = ext4_ext_get_actual_len(&newex);
+ if (allocated > map->m_len)
+ allocated = map->m_len;
+ map->m_flags |= EXT4_MAP_NEW;
+
+ /*
+ * Update reserved blocks/metadata blocks after successful
+ * block allocation which had been deferred till now.
+ */
+ if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) {
+ unsigned int reserved_clusters;
+ /*
+ * Check how many clusters we had reserved this allocated range
+ */
+ reserved_clusters = get_reserved_cluster_alloc(inode,
+ map->m_lblk, allocated);
+ if (map->m_flags & EXT4_MAP_FROM_CLUSTER) {
+ if (reserved_clusters) {
+ /*
+ * We have clusters reserved for this range.
+ * But since we are not doing actual allocation
+ * and are simply using blocks from previously
+ * allocated cluster, we should release the
+ * reservation and not claim quota.
+ */
+ ext4_da_update_reserve_space(inode,
+ reserved_clusters, 0);
+ }
+ } else {
+ BUG_ON(allocated_clusters < reserved_clusters);
+ /* We will claim quota for all newly allocated blocks.*/
+ ext4_da_update_reserve_space(inode, allocated_clusters,
+ 1);
+ if (reserved_clusters < allocated_clusters) {
+ struct ext4_inode_info *ei = EXT4_I(inode);
+ int reservation = allocated_clusters -
+ reserved_clusters;
+ /*
+ * It seems we claimed few clusters outside of
+ * the range of this allocation. We should give
+ * it back to the reservation pool. This can
+ * happen in the following case:
+ *
+ * * Suppose s_cluster_ratio is 4 (i.e., each
+ * cluster has 4 blocks. Thus, the clusters
+ * are [0-3],[4-7],[8-11]...
+ * * First comes delayed allocation write for
+ * logical blocks 10 & 11. Since there were no
+ * previous delayed allocated blocks in the
+ * range [8-11], we would reserve 1 cluster
+ * for this write.
+ * * Next comes write for logical blocks 3 to 8.
+ * In this case, we will reserve 2 clusters
+ * (for [0-3] and [4-7]; and not for [8-11] as
+ * that range has a delayed allocated blocks.
+ * Thus total reserved clusters now becomes 3.
+ * * Now, during the delayed allocation writeout
+ * time, we will first write blocks [3-8] and
+ * allocate 3 clusters for writing these
+ * blocks. Also, we would claim all these
+ * three clusters above.
+ * * Now when we come here to writeout the
+ * blocks [10-11], we would expect to claim
+ * the reservation of 1 cluster we had made
+ * (and we would claim it since there are no
+ * more delayed allocated blocks in the range
+ * [8-11]. But our reserved cluster count had
+ * already gone to 0.
+ *
+ * Thus, at the step 4 above when we determine
+ * that there are still some unwritten delayed
+ * allocated blocks outside of our current
+ * block range, we should increment the
+ * reserved clusters count so that when the
+ * remaining blocks finally gets written, we
+ * could claim them.
+ */
+ dquot_reserve_block(inode,
+ EXT4_C2B(sbi, reservation));
+ spin_lock(&ei->i_block_reservation_lock);
+ ei->i_reserved_data_blocks += reservation;
+ spin_unlock(&ei->i_block_reservation_lock);
+ }
+ }
+ }
+
+ /*
+ * Cache the extent and update transaction to commit on fdatasync only
+ * when it is _not_ an uninitialized extent.
+ */
+ if ((flags & EXT4_GET_BLOCKS_UNINIT_EXT) == 0) {
+ ext4_ext_put_in_cache(inode, map->m_lblk, allocated, newblock);
+ ext4_update_inode_fsync_trans(handle, inode, 1);
+ } else
+ ext4_update_inode_fsync_trans(handle, inode, 0);
+out:
+ if (allocated > map->m_len)
+ allocated = map->m_len;
+ ext4_ext_show_leaf(inode, path);
+ map->m_flags |= EXT4_MAP_MAPPED;
+ map->m_pblk = newblock;
+ map->m_len = allocated;
+out2:
+ if (path) {
+ ext4_ext_drop_refs(path);
+ kfree(path);
+ }
+
+ trace_ext4_ext_map_blocks_exit(inode, map->m_lblk,
+ newblock, map->m_len, err ? err : allocated);
+
+ return err ? err : allocated;
+}
+
+/*
+ * ext4_ext_truncate - release the extents that lie beyond i_size
+ * @inode: inode being truncated; inode->i_size is read as the new size
+ *
+ * Flushes completed end_io work, opens a journal handle, calls
+ * ext4_discard_partial_page_buffers() on the tail of the page containing
+ * i_size when i_size is not page aligned, puts the inode on the orphan list
+ * and then, with i_data_sem held for writing, copies i_size into i_disksize
+ * and removes the block range that starts at i_size rounded up to a block
+ * boundary and ends at EXT_MAX_BLOCKS - 1.
+ *
+ * Nothing is returned: the function gives up silently when the journal
+ * handle cannot be started, and the value that ext4_ext_remove_space()
+ * stores in err is not examined afterwards.
+ */
+void ext4_ext_truncate(struct inode *inode)
+{
+ struct address_space *mapping = inode->i_mapping;
+ struct super_block *sb = inode->i_sb;
+ ext4_lblk_t last_block;
+ handle_t *handle;
+ loff_t page_len;
+ int err = 0;
+
+ /*
+ * finish any pending end_io work so we won't run the risk of
+ * converting any truncated blocks to initialized later
+ */
+ ext4_flush_completed_IO(inode);
+
+ /*
+ * probably first extent we're gonna free will be last in block
+ */
+ /* err is reused here to carry the credit count for the handle */
+ err = ext4_writepage_trans_blocks(inode);
+ handle = ext4_journal_start(inode, err);
+ if (IS_ERR(handle))
+ return;
+
+ if (inode->i_size % PAGE_CACHE_SIZE != 0) {
+ /* bytes from i_size up to the end of the page that contains it */
+ page_len = PAGE_CACHE_SIZE -
+ (inode->i_size & (PAGE_CACHE_SIZE - 1));
+
+ err = ext4_discard_partial_page_buffers(handle,
+ mapping, inode->i_size, page_len, 0);
+
+ if (err)
+ goto out_stop;
+ }
+
+ if (ext4_orphan_add(handle, inode))
+ goto out_stop;
+
+ down_write(&EXT4_I(inode)->i_data_sem);
+ ext4_ext_invalidate_cache(inode);
+
+ ext4_discard_preallocations(inode);
+
+ /*
+ * TODO: optimization is possible here.
+ * Probably we need not scan at all,
+ * because page truncation is enough.
+ */
+
+ /* we have to know where to truncate from in crash case */
+ EXT4_I(inode)->i_disksize = inode->i_size;
+ ext4_mark_inode_dirty(handle, inode);
+
+ /* first block index at or past i_size (i_size rounded up to a block) */
+ last_block = (inode->i_size + sb->s_blocksize - 1)
+ >> EXT4_BLOCK_SIZE_BITS(sb);
+ err = ext4_ext_remove_space(inode, last_block, EXT_MAX_BLOCKS - 1);
+
+ /* In a multi-transaction truncate, we only make the final
+ * transaction synchronous.
+ */
+ if (IS_SYNC(inode))
+ ext4_handle_sync(handle);
+
+ up_write(&EXT4_I(inode)->i_data_sem);
+
+out_stop:
+ /*
+ * If this was a simple ftruncate() and the file will remain alive,
+ * then we need to clear up the orphan record which we created above.
+ * However, if this was a real unlink then we were called by
+ * ext4_delete_inode(), and we allow that function to clean up the
+ * orphan info for us.
+ */
+ if (inode->i_nlink)
+ ext4_orphan_del(handle, inode);
+
+ inode->i_mtime = inode->i_ctime = ext4_current_time(inode);
+ ext4_mark_inode_dirty(handle, inode);
+ ext4_journal_stop(handle);
+}
+
+/*
+ * Bring the in-core inode up to date after one chunk of a fallocate request
+ * has been mapped.
+ *
+ * @inode:        inode that was preallocated into
+ * @mode:         fallocate mode bits; only FALLOC_FL_KEEP_SIZE is examined
+ * @new_size:     byte offset just past the range mapped so far
+ * @update_ctime: non-zero to refresh i_ctime
+ */
+static void ext4_falloc_update_inode(struct inode *inode,
+				int mode, loff_t new_size, int update_ctime)
+{
+	if (update_ctime) {
+		struct timespec stamp = current_fs_time(inode->i_sb);
+
+		if (!timespec_equal(&inode->i_ctime, &stamp))
+			inode->i_ctime = stamp;
+	}
+
+	if (mode & FALLOC_FL_KEEP_SIZE) {
+		/*
+		 * The size stays put, so just remember that blocks now exist
+		 * past EOF; that lets a later truncate proceed even when the
+		 * requested size equals i_size.
+		 */
+		if (new_size > i_size_read(inode))
+			ext4_set_inode_flag(inode, EXT4_INODE_EOFBLOCKS);
+		return;
+	}
+
+	/* Grow the sizes only when the preallocation reached beyond them. */
+	if (new_size > i_size_read(inode))
+		i_size_write(inode, new_size);
+	if (new_size > EXT4_I(inode)->i_disksize)
+		ext4_update_i_disksize(inode, new_size);
+}
+
+/*
+ * preallocate space for a file. This implements ext4's fallocate file
+ * operation, which gets called from sys_fallocate system call.
+ * For block-mapped files, posix_fallocate should fall back to the method
+ * of writing zeroes to the required new blocks (the same behavior which is
+ * expected for file systems which do not support fallocate() system call).
+ *
+ * @file:   open file to preallocate into
+ * @mode:   FALLOC_FL_KEEP_SIZE and/or FALLOC_FL_PUNCH_HOLE; any other bit
+ *          is rejected
+ * @offset: first byte of the range
+ * @len:    length of the range in bytes
+ *
+ * Returns -EOPNOTSUPP for inodes without EXT4_INODE_EXTENTS or for
+ * unsupported mode bits, the result of ext4_punch_hole() when
+ * FALLOC_FL_PUNCH_HOLE is set, the error from inode_newsize_ok(), and
+ * otherwise ret2 (the last ext4_journal_stop() result) when the final
+ * mapping call returned a positive count, or that call's ret if not.
+ */
+long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len)
+{
+ struct inode *inode = file->f_path.dentry->d_inode;
+ handle_t *handle;
+ loff_t new_size;
+ unsigned int max_blocks;
+ int ret = 0;
+ int ret2 = 0;
+ int retries = 0;
+ int flags;
+ struct ext4_map_blocks map;
+ unsigned int credits, blkbits = inode->i_blkbits;
+
+ /*
+ * currently supporting (pre)allocate mode for extent-based
+ * files _only_
+ */
+ if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)))
+ return -EOPNOTSUPP;
+
+ /* Return error if mode is not supported */
+ if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE))
+ return -EOPNOTSUPP;
+
+ if (mode & FALLOC_FL_PUNCH_HOLE)
+ return ext4_punch_hole(file, offset, len);
+
+ trace_ext4_fallocate_enter(inode, offset, len, mode);
+ map.m_lblk = offset >> blkbits;
+ /*
+ * We can't just convert len to max_blocks because the range may
+ * straddle a block boundary: with blocksize = 4096, offset = 3072
+ * and len = 2048 the request touches two blocks although len is
+ * smaller than one block.
+ */
+ max_blocks = (EXT4_BLOCK_ALIGN(len + offset, blkbits) >> blkbits)
+ - map.m_lblk;
+ /*
+ * credits to insert 1 extent into extent tree
+ */
+ credits = ext4_chunk_trans_blocks(inode, max_blocks);
+ mutex_lock(&inode->i_mutex);
+ ret = inode_newsize_ok(inode, (len + offset));
+ if (ret) {
+ mutex_unlock(&inode->i_mutex);
+ trace_ext4_fallocate_exit(inode, offset, max_blocks, ret);
+ return ret;
+ }
+ flags = EXT4_GET_BLOCKS_CREATE_UNINIT_EXT;
+ if (mode & FALLOC_FL_KEEP_SIZE)
+ flags |= EXT4_GET_BLOCKS_KEEP_SIZE;
+ /*
+ * Don't normalize the request if it can fit in one extent so
+ * that it doesn't get unnecessarily split into multiple
+ * extents.
+ */
+ if (len <= EXT_UNINIT_MAX_LEN << blkbits)
+ flags |= EXT4_GET_BLOCKS_NO_NORMALIZE;
+retry:
+ /*
+ * Each pass maps as much of the remaining range as one call allows,
+ * inside its own journal handle; ret is the number of blocks the
+ * previous pass mapped and is used to advance the window.
+ */
+ while (ret >= 0 && ret < max_blocks) {
+ map.m_lblk = map.m_lblk + ret;
+ map.m_len = max_blocks = max_blocks - ret;
+ handle = ext4_journal_start(inode, credits);
+ if (IS_ERR(handle)) {
+ ret = PTR_ERR(handle);
+ break;
+ }
+ ret = ext4_map_blocks(handle, inode, &map, flags);
+ if (ret <= 0) {
+#ifdef EXT4FS_DEBUG
+ WARN_ON(ret <= 0);
+ printk(KERN_ERR "%s: ext4_ext_map_blocks "
+ "returned error inode#%lu, block=%u, "
+ "max_blocks=%u", __func__,
+ inode->i_ino, map.m_lblk, max_blocks);
+#endif
+ ext4_mark_inode_dirty(handle, inode);
+ ret2 = ext4_journal_stop(handle);
+ break;
+ }
+ /*
+ * new_size is the byte just past what has been mapped so far,
+ * capped at offset + len once the last requested block is reached.
+ */
+ if ((map.m_lblk + ret) >= (EXT4_BLOCK_ALIGN(offset + len,
+ blkbits) >> blkbits))
+ new_size = offset + len;
+ else
+ new_size = ((loff_t) map.m_lblk + ret) << blkbits;
+
+ ext4_falloc_update_inode(inode, mode, new_size,
+ (map.m_flags & EXT4_MAP_NEW));
+ ext4_mark_inode_dirty(handle, inode);
+ ret2 = ext4_journal_stop(handle);
+ if (ret2)
+ break;
+ }
+ /* on ENOSPC, restart the loop if ext4_should_retry_alloc() agrees */
+ if (ret == -ENOSPC &&
+ ext4_should_retry_alloc(inode->i_sb, &retries)) {
+ ret = 0;
+ goto retry;
+ }
+ mutex_unlock(&inode->i_mutex);
+ trace_ext4_fallocate_exit(inode, offset, max_blocks,
+ ret > 0 ? ret2 : ret);
+ return ret > 0 ? ret2 : ret;
+}
+
+/*
+ * This function converts a range of blocks to written extents.
+ * The caller of this function will pass the start offset and the size.
+ * All unwritten extents within this range will be converted to
+ * written extents.
+ *
+ * This function is called from the direct IO end io call back
+ * function, to convert the fallocated extents after IO is completed.
+ *
+ * @inode:  inode owning the range
+ * @offset: first byte of the range
+ * @len:    length of the range in bytes
+ *
+ * Returns 0 on success. On failure it returns the non-positive value from
+ * ext4_map_blocks(), the error from ext4_journal_start(), or the result of
+ * the last ext4_journal_stop().
+ */
+int ext4_convert_unwritten_extents(struct inode *inode, loff_t offset,
+ ssize_t len)
+{
+ handle_t *handle;
+ unsigned int max_blocks;
+ int ret = 0;
+ int ret2 = 0;
+ struct ext4_map_blocks map;
+ unsigned int credits, blkbits = inode->i_blkbits;
+
+ map.m_lblk = offset >> blkbits;
+ /*
+ * We can't just convert len to max_blocks because the range may
+ * straddle a block boundary: with blocksize = 4096, offset = 3072
+ * and len = 2048 the request touches two blocks although len is
+ * smaller than one block.
+ */
+ max_blocks = ((EXT4_BLOCK_ALIGN(len + offset, blkbits) >> blkbits) -
+ map.m_lblk);
+ /*
+ * credits to insert 1 extent into extent tree
+ */
+ credits = ext4_chunk_trans_blocks(inode, max_blocks);
+ /*
+ * ret holds the block count handled by the previous pass and is used
+ * to advance the window; every pass runs in its own journal handle.
+ */
+ while (ret >= 0 && ret < max_blocks) {
+ map.m_lblk += ret;
+ map.m_len = (max_blocks -= ret);
+ handle = ext4_journal_start(inode, credits);
+ if (IS_ERR(handle)) {
+ ret = PTR_ERR(handle);
+ break;
+ }
+ ret = ext4_map_blocks(handle, inode, &map,
+ EXT4_GET_BLOCKS_IO_CONVERT_EXT);
+ if (ret <= 0) {
+ WARN_ON(ret <= 0);
+ ext4_msg(inode->i_sb, KERN_ERR,
+ "%s:%d: inode #%lu: block %u: len %u: "
+ "ext4_ext_map_blocks returned %d",
+ __func__, __LINE__, inode->i_ino, map.m_lblk,
+ map.m_len, ret);
+ }
+ /* the handle is closed on both the success and the failure path */
+ ext4_mark_inode_dirty(handle, inode);
+ ret2 = ext4_journal_stop(handle);
+ if (ret <= 0 || ret2 )
+ break;
+ }
+ return ret > 0 ? ret2 : ret;
+}
+
+/*
+ * Callback function called for each extent to gather FIEMAP information.
+ *
+ * @inode: inode being mapped
+ * @next:  compared with EXT_MAX_BLOCKS to decide whether the reported
+ *         extent gets FIEMAP_EXTENT_LAST
+ * @newex: description of the current range: ec_block and ec_len are the
+ *         logical start and length in blocks, ec_start the physical start.
+ *         ec_start == 0 means no extent covers the range; the page cache
+ *         is then searched for delayed buffers and ec_block / ec_len may
+ *         be rewritten to describe the delayed extent that was found.
+ * @ex:    extent tree entry, may be NULL; only tested for the
+ *         uninitialized state (FIEMAP_EXTENT_UNWRITTEN)
+ * @data:  the struct fiemap_extent_info to fill
+ *
+ * Returns EXT_CONTINUE to keep walking (also for a plain hole, which is
+ * not reported), EXT_BREAK when fiemap_fill_next_extent() returns 1,
+ * -ENOMEM when the page pointer array cannot be allocated, or the negative
+ * error from fiemap_fill_next_extent().
+ */
+static int ext4_ext_fiemap_cb(struct inode *inode, ext4_lblk_t next,
+ struct ext4_ext_cache *newex, struct ext4_extent *ex,
+ void *data)
+{
+ __u64 logical;
+ __u64 physical;
+ __u64 length;
+ __u32 flags = 0;
+ int ret = 0;
+ struct fiemap_extent_info *fieinfo = data;
+ unsigned char blksize_bits;
+
+ blksize_bits = inode->i_sb->s_blocksize_bits;
+ logical = (__u64)newex->ec_block << blksize_bits;
+
+ if (newex->ec_start == 0) {
+ /*
+ * No extent in extent-tree contains block @newex->ec_block,
+ * then the block may stay in 1)a hole or 2)delayed-extent.
+ *
+ * Holes or delayed-extents are processed as follows.
+ * 1. lookup dirty pages with specified range in pagecache.
+ * If no page is got, then there is no delayed-extent and
+ * return with EXT_CONTINUE.
+ * 2. find the 1st mapped buffer,
+ * 3. check if the mapped buffer is both in the request range
+ * and a delayed buffer. If not, there is no delayed-extent,
+ * then return.
+ * 4. a delayed-extent is found, the extent will be collected.
+ */
+ ext4_lblk_t end = 0;
+ pgoff_t last_offset;
+ pgoff_t offset;
+ pgoff_t index;
+ pgoff_t start_index = 0;
+ struct page **pages = NULL;
+ struct buffer_head *bh = NULL;
+ struct buffer_head *head = NULL;
+ /* one page worth of page pointers is looked up per batch */
+ unsigned int nr_pages = PAGE_SIZE / sizeof(struct page *);
+
+ pages = kmalloc(PAGE_SIZE, GFP_KERNEL);
+ if (pages == NULL)
+ return -ENOMEM;
+
+ offset = logical >> PAGE_SHIFT;
+repeat:
+ last_offset = offset;
+ head = NULL;
+ ret = find_get_pages_tag(inode->i_mapping, &offset,
+ PAGECACHE_TAG_DIRTY, nr_pages, pages);
+
+ /*
+ * FIEMAP_EXTENT_DELALLOC in flags doubles as the "a delayed extent
+ * has already been started" marker across repeat passes.
+ */
+ if (!(flags & FIEMAP_EXTENT_DELALLOC)) {
+ /* First time, try to find a mapped buffer. */
+ if (ret == 0) {
+out:
+ /* common exit for "nothing to report": drop page refs and continue */
+ for (index = 0; index < ret; index++)
+ page_cache_release(pages[index]);
+ /* just a hole. */
+ kfree(pages);
+ return EXT_CONTINUE;
+ }
+ index = 0;
+
+next_page:
+ /* Try to find the 1st mapped buffer. */
+ /* end = logical block number of the first block in this page */
+ end = ((__u64)pages[index]->index << PAGE_SHIFT) >>
+ blksize_bits;
+ if (!page_has_buffers(pages[index]))
+ goto out;
+ head = page_buffers(pages[index]);
+ if (!head)
+ goto out;
+
+ index++;
+ bh = head;
+ do {
+ if (end >= newex->ec_block +
+ newex->ec_len)
+ /* The buffer is out of
+ * the request range.
+ */
+ goto out;
+
+ if (buffer_mapped(bh) &&
+ end >= newex->ec_block) {
+ start_index = index - 1;
+ /* get the 1st mapped buffer. */
+ goto found_mapped_buffer;
+ }
+
+ bh = bh->b_this_page;
+ end++;
+ } while (bh != head);
+
+ /* No mapped buffer in the range found in this page,
+ * We need to look up next page.
+ */
+ if (index >= ret) {
+ /* There is no page left, but we need to limit
+ * newex->ec_len.
+ */
+ newex->ec_len = end - newex->ec_block;
+ goto out;
+ }
+ goto next_page;
+ } else {
+ /*Find contiguous delayed buffers. */
+ /* only continue if the batch starts exactly where the last one ended */
+ if (ret > 0 && pages[0]->index == last_offset)
+ head = page_buffers(pages[0]);
+ bh = head;
+ index = 1;
+ start_index = 0;
+ }
+
+found_mapped_buffer:
+ if (bh != NULL && buffer_delay(bh)) {
+ /* 1st or contiguous delayed buffer found. */
+ if (!(flags & FIEMAP_EXTENT_DELALLOC)) {
+ /*
+ * 1st delayed buffer found, record
+ * the start of extent.
+ */
+ flags |= FIEMAP_EXTENT_DELALLOC;
+ newex->ec_block = end;
+ logical = (__u64)end << blksize_bits;
+ }
+ /* Find contiguous delayed buffers. */
+ do {
+ if (!buffer_delay(bh))
+ goto found_delayed_extent;
+ bh = bh->b_this_page;
+ end++;
+ } while (bh != head);
+
+ /* extend across the following pages of this batch */
+ for (; index < ret; index++) {
+ if (!page_has_buffers(pages[index])) {
+ bh = NULL;
+ break;
+ }
+ head = page_buffers(pages[index]);
+ if (!head) {
+ bh = NULL;
+ break;
+ }
+
+ if (pages[index]->index !=
+ pages[start_index]->index + index
+ - start_index) {
+ /* Blocks are not contiguous. */
+ bh = NULL;
+ break;
+ }
+ bh = head;
+ do {
+ if (!buffer_delay(bh))
+ /* Delayed-extent ends. */
+ goto found_delayed_extent;
+ bh = bh->b_this_page;
+ end++;
+ } while (bh != head);
+ }
+ } else if (!(flags & FIEMAP_EXTENT_DELALLOC))
+ /* a hole found. */
+ goto out;
+
+found_delayed_extent:
+ newex->ec_len = min(end - newex->ec_block,
+ (ext4_lblk_t)EXT_INIT_MAX_LEN);
+ /*
+ * A full batch that ended on a delayed buffer, with room left
+ * below EXT_INIT_MAX_LEN, may continue in the next batch.
+ */
+ if (ret == nr_pages && bh != NULL &&
+ newex->ec_len < EXT_INIT_MAX_LEN &&
+ buffer_delay(bh)) {
+ /* Have not collected an extent and continue. */
+ for (index = 0; index < ret; index++)
+ page_cache_release(pages[index]);
+ goto repeat;
+ }
+
+ for (index = 0; index < ret; index++)
+ page_cache_release(pages[index]);
+ kfree(pages);
+ }
+
+ physical = (__u64)newex->ec_start << blksize_bits;
+ length = (__u64)newex->ec_len << blksize_bits;
+
+ if (ex && ext4_ext_is_uninitialized(ex))
+ flags |= FIEMAP_EXTENT_UNWRITTEN;
+
+ if (next == EXT_MAX_BLOCKS)
+ flags |= FIEMAP_EXTENT_LAST;
+
+ ret = fiemap_fill_next_extent(fieinfo, logical, physical,
+ length, flags);
+ if (ret < 0)
+ return ret;
+ if (ret == 1)
+ return EXT_BREAK;
+ return EXT_CONTINUE;
+}
+/*
+ * fiemap request flags we can handle; ext4_fiemap() passes this mask to
+ * fiemap_check_flags() and fails with -EBADR when that check fails.
+ */
+#define EXT4_FIEMAP_FLAGS (FIEMAP_FLAG_SYNC|FIEMAP_FLAG_XATTR)
+
+/*
+ * ext4_xattr_fiemap - report where the inode's extended attributes live
+ * @inode:   inode whose xattr storage is being described
+ * @fieinfo: fiemap cursor that receives at most one extent
+ *
+ * When EXT4_STATE_XATTR is set the extent is flagged
+ * FIEMAP_EXTENT_DATA_INLINE and its address is the byte address of the block
+ * holding the inode plus EXT4_GOOD_OLD_INODE_SIZE + i_extra_isize. Otherwise
+ * the single block named by i_file_acl is reported. Nothing is reported when
+ * the computed physical address is 0.
+ *
+ * Returns 0 on success, or the negative error from ext4_get_inode_loc() or
+ * fiemap_fill_next_extent().
+ */
+static int ext4_xattr_fiemap(struct inode *inode,
+				struct fiemap_extent_info *fieinfo)
+{
+	__u64 physical = 0;
+	__u64 length;
+	__u32 flags = FIEMAP_EXTENT_LAST;
+	int blockbits = inode->i_sb->s_blocksize_bits;
+	int error = 0;
+
+	/* in-inode? */
+	if (ext4_test_inode_state(inode, EXT4_STATE_XATTR)) {
+		struct ext4_iloc iloc;
+		int offset; /* offset of xattr in inode */
+
+		error = ext4_get_inode_loc(inode, &iloc);
+		if (error)
+			return error;
+		/*
+		 * Widen to 64 bits before shifting: b_blocknr is a sector_t,
+		 * which is only 32 bits wide on 32-bit kernels built without
+		 * large block device support, so shifting first would drop
+		 * the high bits of the byte address.
+		 */
+		physical = (__u64)iloc.bh->b_blocknr << blockbits;
+		/*
+		 * NOTE(review): the inode's own offset inside this block is
+		 * not added here; confirm whether that is intended.
+		 */
+		offset = EXT4_GOOD_OLD_INODE_SIZE +
+				EXT4_I(inode)->i_extra_isize;
+		physical += offset;
+		length = EXT4_SB(inode->i_sb)->s_inode_size - offset;
+		flags |= FIEMAP_EXTENT_DATA_INLINE;
+		brelse(iloc.bh);
+	} else { /* external block */
+		/* explicit widening, for symmetry with the in-inode case */
+		physical = (__u64)EXT4_I(inode)->i_file_acl << blockbits;
+		length = inode->i_sb->s_blocksize;
+	}
+
+	if (physical)
+		error = fiemap_fill_next_extent(fieinfo, 0, physical,
+						length, flags);
+	/* a positive return is not an error, so it is reported as success */
+	return (error < 0 ? error : 0);
+}
+
+/*
+ * ext4_ext_punch_hole
+ *
+ * Punches a hole of "length" bytes in a file starting
+ * at byte "offset"
+ *
+ * @file: The open file whose inode gets the hole
+ * @offset: The starting byte offset of the hole
+ * @length: The length of the hole
+ *
+ * Returns 0 when offset is at or beyond i_size or when everything
+ * succeeded, otherwise the error reported by the helper that failed
+ * (writeback, journal start, orphan add, partial page discard or
+ * ext4_ext_remove_space()).
+ */
+int ext4_ext_punch_hole(struct file *file, loff_t offset, loff_t length)
+{
+ struct inode *inode = file->f_path.dentry->d_inode;
+ struct super_block *sb = inode->i_sb;
+ ext4_lblk_t first_block, stop_block;
+ struct address_space *mapping = inode->i_mapping;
+ handle_t *handle;
+ loff_t first_page, last_page, page_len;
+ loff_t first_page_offset, last_page_offset;
+ int credits, err = 0;
+
+ /* No need to punch hole beyond i_size */
+ if (offset >= inode->i_size)
+ return 0;
+
+ /*
+ * If the hole extends beyond i_size, set the hole
+ * to end after the page that contains i_size
+ */
+ if (offset + length > inode->i_size) {
+ length = inode->i_size +
+ PAGE_CACHE_SIZE - (inode->i_size & (PAGE_CACHE_SIZE - 1)) -
+ offset;
+ }
+
+ /*
+ * first_page is rounded up and last_page rounded down, so
+ * [first_page, last_page) are the pages lying entirely inside the hole.
+ */
+ first_page = (offset + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
+ last_page = (offset + length) >> PAGE_CACHE_SHIFT;
+
+ first_page_offset = first_page << PAGE_CACHE_SHIFT;
+ last_page_offset = last_page << PAGE_CACHE_SHIFT;
+
+ /*
+ * Write out all dirty pages to avoid race conditions
+ * Then release them.
+ */
+ if (mapping->nrpages && mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) {
+ err = filemap_write_and_wait_range(mapping,
+ offset, offset + length - 1);
+
+ if (err)
+ return err;
+ }
+
+ /* Now release the pages */
+ if (last_page_offset > first_page_offset) {
+ truncate_inode_pages_range(mapping, first_page_offset,
+ last_page_offset-1);
+ }
+
+ /* finish any pending end_io work */
+ ext4_flush_completed_IO(inode);
+
+ credits = ext4_writepage_trans_blocks(inode);
+ handle = ext4_journal_start(inode, credits);
+ if (IS_ERR(handle))
+ return PTR_ERR(handle);
+
+ err = ext4_orphan_add(handle, inode);
+ if (err)
+ goto out;
+
+ /*
+ * Now we need to zero out the non-page-aligned data in the
+ * pages at the start and tail of the hole, and unmap the buffer
+ * heads for the block aligned regions of the page that were
+ * completely zeroed.
+ */
+ if (first_page > last_page) {
+ /*
+ * If the file space being truncated is contained within a page
+ * just zero out and unmap the middle of that page
+ */
+ err = ext4_discard_partial_page_buffers(handle,
+ mapping, offset, length, 0);
+
+ if (err)
+ goto out;
+ } else {
+ /*
+ * zero out and unmap the partial page that contains
+ * the start of the hole
+ */
+ page_len = first_page_offset - offset;
+ if (page_len > 0) {
+ err = ext4_discard_partial_page_buffers(handle, mapping,
+ offset, page_len, 0);
+ if (err)
+ goto out;
+ }
+
+ /*
+ * zero out and unmap the partial page that contains
+ * the end of the hole
+ */
+ page_len = offset + length - last_page_offset;
+ if (page_len > 0) {
+ err = ext4_discard_partial_page_buffers(handle, mapping,
+ last_page_offset, page_len, 0);
+ if (err)
+ goto out;
+ }
+ }
+
+ /*
+ * If i_size is contained in the last page, we need to
+ * unmap and zero the partial page after i_size
+ */
+ if (inode->i_size >> PAGE_CACHE_SHIFT == last_page &&
+ inode->i_size % PAGE_CACHE_SIZE != 0) {
+
+ page_len = PAGE_CACHE_SIZE -
+ (inode->i_size & (PAGE_CACHE_SIZE - 1));
+
+ if (page_len > 0) {
+ err = ext4_discard_partial_page_buffers(handle,
+ mapping, inode->i_size, page_len, 0);
+
+ if (err)
+ goto out;
+ }
+ }
+
+ /*
+ * Same rounding as for pages: only blocks lying entirely inside the
+ * hole, [first_block, stop_block), are removed from the extent tree.
+ */
+ first_block = (offset + sb->s_blocksize - 1) >>
+ EXT4_BLOCK_SIZE_BITS(sb);
+ stop_block = (offset + length) >> EXT4_BLOCK_SIZE_BITS(sb);
+
+ /* If there are no blocks to remove, return now */
+ if (first_block >= stop_block)
+ goto out;
+
+ down_write(&EXT4_I(inode)->i_data_sem);
+ ext4_ext_invalidate_cache(inode);
+ ext4_discard_preallocations(inode);
+
+ err = ext4_ext_remove_space(inode, first_block, stop_block - 1);
+
+ ext4_ext_invalidate_cache(inode);
+ ext4_discard_preallocations(inode);
+
+ if (IS_SYNC(inode))
+ ext4_handle_sync(handle);
+
+ up_write(&EXT4_I(inode)->i_data_sem);
+
+out:
+ /* reached on success and on every failure after the handle was started */
+ ext4_orphan_del(handle, inode);
+ inode->i_mtime = inode->i_ctime = ext4_current_time(inode);
+ ext4_mark_inode_dirty(handle, inode);
+ ext4_journal_stop(handle);
+ return err;
+}
+/*
+ * ext4_fiemap - fiemap inode operation
+ * @inode:   inode to describe
+ * @fieinfo: fiemap request/result cursor
+ * @start:   first byte of the queried range
+ * @len:     length of the queried range in bytes
+ *
+ * Inodes without EXT4_INODE_EXTENTS are handed to generic_block_fiemap().
+ * For the rest, unsupported request flags give -EBADR, FIEMAP_FLAG_XATTR is
+ * answered by ext4_xattr_fiemap(), and everything else walks the extent
+ * tree with ext4_ext_fiemap_cb() pushing the extents back to the caller.
+ */
+int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
+		__u64 start, __u64 len)
+{
+	const unsigned int bits = inode->i_sb->s_blocksize_bits;
+	ext4_lblk_t first_blk, nr_blks;
+	__u64 final_blk;
+
+	/* fallback to generic here if not in extents fmt */
+	if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
+		return generic_block_fiemap(inode, fieinfo, start, len,
+			ext4_get_block);
+
+	if (fiemap_check_flags(fieinfo, EXT4_FIEMAP_FLAGS))
+		return -EBADR;
+
+	if (fieinfo->fi_flags & FIEMAP_FLAG_XATTR)
+		return ext4_xattr_fiemap(inode, fieinfo);
+
+	/* Translate the byte range into a clamped logical block range. */
+	first_blk = start >> bits;
+	final_blk = (start + len - 1) >> bits;
+	if (final_blk >= EXT_MAX_BLOCKS)
+		final_blk = EXT_MAX_BLOCKS-1;
+	nr_blks = ((ext4_lblk_t) final_blk) - first_blk + 1;
+
+	return ext4_ext_walk_space(inode, first_blk, nr_blks,
+				   ext4_ext_fiemap_cb, fieinfo);
+}
diff --git a/fs/ext4/file.c b/fs/ext4/file.c
new file mode 100644
index 00000000..cb70f181
--- /dev/null
+++ b/fs/ext4/file.c
@@ -0,0 +1,262 @@
+/*
+ * linux/fs/ext4/file.c
+ *
+ * Copyright (C) 1992, 1993, 1994, 1995
+ * Remy Card (card@masi.ibp.fr)
+ * Laboratoire MASI - Institut Blaise Pascal
+ * Universite Pierre et Marie Curie (Paris VI)
+ *
+ * from
+ *
+ * linux/fs/minix/file.c
+ *
+ * Copyright (C) 1991, 1992 Linus Torvalds
+ *
+ * ext4 fs regular file handling primitives
+ *
+ * 64-bit file support on 64-bit platforms by Jakub Jelinek
+ * (jj@sunsite.ms.mff.cuni.cz)
+ */
+
+#include <linux/time.h>
+#include <linux/fs.h>
+#include <linux/jbd2.h>
+#include <linux/mount.h>
+#include <linux/path.h>
+#include <linux/quotaops.h>
+#include "ext4.h"
+#include "ext4_jbd2.h"
+#include "xattr.h"
+#include "acl.h"
+
+/*
+ * Release hook for an ext4 inode. Unlike ext4_file_open, which runs on
+ * every open, release runs only once /all/ the files are closed.
+ *
+ * Always returns 0.
+ */
+static int ext4_release_file(struct inode *inode, struct file *filp)
+{
+	struct ext4_inode_info *ei = EXT4_I(inode);
+	int last_writer;
+
+	if (ext4_test_inode_state(inode, EXT4_STATE_DA_ALLOC_CLOSE)) {
+		ext4_alloc_da_blocks(inode);
+		ext4_clear_inode_state(inode, EXT4_STATE_DA_ALLOC_CLOSE);
+	}
+
+	/* if we are the last writer on the inode, drop the block reservation */
+	last_writer = (filp->f_mode & FMODE_WRITE) &&
+		      atomic_read(&inode->i_writecount) == 1;
+	if (last_writer && !ei->i_reserved_data_blocks) {
+		down_write(&ei->i_data_sem);
+		ext4_discard_preallocations(inode);
+		up_write(&ei->i_data_sem);
+	}
+
+	if (is_dx(inode) && filp->private_data)
+		ext4_htree_free_dir_info(filp->private_data);
+
+	return 0;
+}
+
+/* Block until the inode's i_aiodio_unwritten counter has dropped to zero. */
+static void ext4_aiodio_wait(struct inode *inode)
+{
+	wait_queue_head_t *ioend_wq = ext4_ioend_wq(inode);
+	struct ext4_inode_info *ei = EXT4_I(inode);
+
+	wait_event(*ioend_wq, atomic_read(&ei->i_aiodio_unwritten) == 0);
+}
+
+/*
+ * Decide whether an IO request is block-aligned.
+ *
+ * Ext4 fills holes under direct IO with unwritten extents and converts them
+ * to written only once the IO has completed. Until then the blocks look like
+ * holes, so dio_zero_block() assumes it must zero parts of the first and/or
+ * last block. Two AIO threads working on the same unwritten block therefore
+ * have to be synchronized, or one will zero the other's data and corrupt it.
+ *
+ * Returns 1 when the request starts below i_size and either end of it is
+ * not on a block boundary, 0 otherwise.
+ */
+static int
+ext4_unaligned_aio(struct inode *inode, const struct iovec *iov,
+		   unsigned long nr_segs, loff_t pos)
+{
+	int mask = inode->i_sb->s_blocksize - 1;
+	loff_t end = pos + iov_length(iov, nr_segs);
+
+	if (pos >= inode->i_size)
+		return 0;
+
+	/* a low bit set in either boundary means that boundary is unaligned */
+	return ((pos | end) & mask) ? 1 : 0;
+}
+
+/*
+ * ext4_file_write - aio_write entry point for ext4 files
+ * @iocb:    kiocb describing the write
+ * @iov:     source segments
+ * @nr_segs: number of entries in @iov
+ * @pos:     file offset of the write
+ *
+ * For inodes without EXT4_INODE_EXTENTS the write is limited to
+ * s_bitmap_maxbytes: it fails with -EFBIG when it starts at or beyond that
+ * limit and is shortened when it would cross it. For extent-mapped inodes
+ * an asynchronous O_DIRECT write that ext4_unaligned_aio() reports as
+ * unaligned is serialized on ext4_aio_mutex() and waits in
+ * ext4_aiodio_wait() first.
+ *
+ * Returns whatever generic_file_aio_write() returns, or -EFBIG.
+ */
+static ssize_t
+ext4_file_write(struct kiocb *iocb, const struct iovec *iov,
+		unsigned long nr_segs, loff_t pos)
+{
+	struct inode *inode = iocb->ki_filp->f_path.dentry->d_inode;
+	int unaligned_aio = 0;
+	/* ssize_t matches both the callee's and this function's return type */
+	ssize_t ret;
+
+	/*
+	 * If we have encountered a bitmap-format file, the size limit
+	 * is smaller than s_maxbytes, which is for extent-mapped files.
+	 */
+
+	if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) {
+		struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
+		size_t length = iov_length(iov, nr_segs);
+
+		if ((pos > sbi->s_bitmap_maxbytes ||
+		    (pos == sbi->s_bitmap_maxbytes && length > 0)))
+			return -EFBIG;
+
+		if (pos + length > sbi->s_bitmap_maxbytes) {
+			nr_segs = iov_shorten((struct iovec *)iov, nr_segs,
+					      sbi->s_bitmap_maxbytes - pos);
+		}
+	} else if (unlikely((iocb->ki_filp->f_flags & O_DIRECT) &&
+		   !is_sync_kiocb(iocb))) {
+		unaligned_aio = ext4_unaligned_aio(inode, iov, nr_segs, pos);
+	}
+
+	/*
+	 * Unaligned direct AIO must be serialized; see the comment on
+	 * ext4_unaligned_aio()
+	 */
+	if (unaligned_aio) {
+		static unsigned long unaligned_warn_time;
+
+		/* Warn about this once per day */
+		if (printk_timed_ratelimit(&unaligned_warn_time, 60*60*24*HZ))
+			/* i_ino is an unsigned long, hence %lu rather than %ld */
+			ext4_msg(inode->i_sb, KERN_WARNING,
+				 "Unaligned AIO/DIO on inode %lu by %s; "
+				 "performance will be poor.",
+				 inode->i_ino, current->comm);
+		mutex_lock(ext4_aio_mutex(inode));
+		ext4_aiodio_wait(inode);
+	}
+
+	ret = generic_file_aio_write(iocb, iov, nr_segs, pos);
+
+	if (unaligned_aio)
+		mutex_unlock(ext4_aio_mutex(inode));
+
+	return ret;
+}
+
+/* VM operations that ext4_file_mmap() installs on a mapping. */
+static const struct vm_operations_struct ext4_file_vm_ops = {
+ .fault = filemap_fault,
+ .page_mkwrite = ext4_page_mkwrite,
+};
+
+/*
+ * mmap file operation: refuses with -ENOEXEC when the mapping's address
+ * space operations have no readpage method, otherwise records the access,
+ * installs ext4_file_vm_ops and sets VM_CAN_NONLINEAR. Returns 0 on success.
+ */
+static int ext4_file_mmap(struct file *file, struct vm_area_struct *vma)
+{
+	if (file->f_mapping->a_ops->readpage == NULL)
+		return -ENOEXEC;
+
+	file_accessed(file);
+	vma->vm_flags |= VM_CAN_NONLINEAR;
+	vma->vm_ops = &ext4_file_vm_ops;
+	return 0;
+}
+
+/*
+ * open file operation.
+ *
+ * On the first open of a filesystem that is not mounted read-only (tracked
+ * by EXT4_MF_MNTDIR_SAMPLED) the mount point path is copied into the
+ * superblock's s_last_mounted field. When the file is opened for writing on
+ * a filesystem with a journal and the inode has no jbd2_inode yet, one is
+ * allocated and attached.
+ *
+ * Returns -ENOMEM when the jbd2_inode is needed but could not be allocated,
+ * otherwise the result of dquot_file_open().
+ */
+static int ext4_file_open(struct inode * inode, struct file * filp)
+{
+ struct super_block *sb = inode->i_sb;
+ struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
+ struct ext4_inode_info *ei = EXT4_I(inode);
+ struct vfsmount *mnt = filp->f_path.mnt;
+ struct path path;
+ char buf[64], *cp;
+
+ if (unlikely(!(sbi->s_mount_flags & EXT4_MF_MNTDIR_SAMPLED) &&
+ !(sb->s_flags & MS_RDONLY))) {
+ sbi->s_mount_flags |= EXT4_MF_MNTDIR_SAMPLED;
+ /*
+ * Sample where the filesystem has been mounted and
+ * store it in the superblock for sysadmin convenience
+ * when trying to sort through large numbers of block
+ * devices or filesystem images.
+ */
+ memset(buf, 0, sizeof(buf));
+ path.mnt = mnt;
+ path.dentry = mnt->mnt_root;
+ cp = d_path(&path, buf, sizeof(buf));
+ /* a d_path() failure just leaves s_last_mounted untouched */
+ if (!IS_ERR(cp)) {
+ strlcpy(sbi->s_es->s_last_mounted, cp,
+ sizeof(sbi->s_es->s_last_mounted));
+ ext4_mark_super_dirty(sb);
+ }
+ }
+ /*
+ * Set up the jbd2_inode if we are opening the inode for
+ * writing and the journal is present
+ */
+ if (sbi->s_journal && !ei->jinode && (filp->f_mode & FMODE_WRITE)) {
+ /* allocate before taking i_lock, then recheck ei->jinode under it */
+ struct jbd2_inode *jinode = jbd2_alloc_inode(GFP_KERNEL);
+
+ spin_lock(&inode->i_lock);
+ if (!ei->jinode) {
+ if (!jinode) {
+ spin_unlock(&inode->i_lock);
+ return -ENOMEM;
+ }
+ ei->jinode = jinode;
+ jbd2_journal_init_jbd_inode(ei->jinode, inode);
+ jinode = NULL;
+ }
+ spin_unlock(&inode->i_lock);
+ /* ei->jinode was already set under the lock: drop our spare copy */
+ if (unlikely(jinode != NULL))
+ jbd2_free_inode(jinode);
+ }
+ return dquot_file_open(inode, filp);
+}
+
+/*
+ * ext4_llseek() exists because block-mapped and extent-mapped files have
+ * different maxbytes values: it picks the limit that applies to this inode
+ * and passes it to generic_file_llseek_size(). Apart from that it should
+ * behave exactly like generic_file_llseek().
+ */
+loff_t ext4_llseek(struct file *file, loff_t offset, int origin)
+{
+	struct inode *inode = file->f_mapping->host;
+	loff_t limit = ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS) ?
+			inode->i_sb->s_maxbytes :
+			EXT4_SB(inode->i_sb)->s_bitmap_maxbytes;
+
+	return generic_file_llseek_size(file, offset, origin, limit);
+}
+
+/*
+ * File operations table for regular ext4 files. It points at the
+ * ext4-specific handlers (llseek, aio_write, ioctl, mmap, open, release,
+ * fsync, fallocate) and otherwise at the generic helpers.
+ */
+const struct file_operations ext4_file_operations = {
+ .llseek = ext4_llseek,
+ .read = do_sync_read,
+ .write = do_sync_write,
+ .aio_read = generic_file_aio_read,
+ .aio_write = ext4_file_write,
+ .unlocked_ioctl = ext4_ioctl,
+#ifdef CONFIG_COMPAT
+ /* only present when CONFIG_COMPAT is enabled */
+ .compat_ioctl = ext4_compat_ioctl,
+#endif
+ .mmap = ext4_file_mmap,
+ .open = ext4_file_open,
+ .release = ext4_release_file,
+ .fsync = ext4_sync_file,
+ .splice_read = generic_file_splice_read,
+ .splice_write = generic_file_splice_write,
+ .fallocate = ext4_fallocate,
+};
+
+/*
+ * Inode operations table for regular ext4 files; the xattr hooks are only
+ * compiled in when CONFIG_EXT4_FS_XATTR is set.
+ */
+const struct inode_operations ext4_file_inode_operations = {
+ .setattr = ext4_setattr,
+ .getattr = ext4_getattr,
+#ifdef CONFIG_EXT4_FS_XATTR
+ .setxattr = generic_setxattr,
+ .getxattr = generic_getxattr,
+ .listxattr = ext4_listxattr,
+ .removexattr = generic_removexattr,
+#endif
+ .get_acl = ext4_get_acl,
+ .fiemap = ext4_fiemap,
+};
+
diff --git a/fs/ext4/fsync.c b/fs/ext4/fsync.c
new file mode 100644
index 00000000..bb6c7d81
--- /dev/null
+++ b/fs/ext4/fsync.c
@@ -0,0 +1,271 @@
+/*
+ * linux/fs/ext4/fsync.c
+ *
+ * Copyright (C) 1993 Stephen Tweedie (sct@redhat.com)
+ * from
+ * Copyright (C) 1992 Remy Card (card@masi.ibp.fr)
+ * Laboratoire MASI - Institut Blaise Pascal
+ * Universite Pierre et Marie Curie (Paris VI)
+ * from
+ * linux/fs/minix/truncate.c Copyright (C) 1991, 1992 Linus Torvalds
+ *
+ * ext4fs fsync primitive
+ *
+ * Big-endian to little-endian byte-swapping/bitmaps by
+ * David S. Miller (davem@caip.rutgers.edu), 1995
+ *
+ * Removed unnecessary code duplication for little endian machines
+ * and excessive __inline__s.
+ * Andi Kleen, 1997
+ *
+ * Major simplifications and cleanup - we only need to do the metadata, because
+ * we can depend on generic_block_fdatasync() to sync the data blocks.
+ */
+
+#include <linux/time.h>
+#include <linux/fs.h>
+#include <linux/sched.h>
+#include <linux/writeback.h>
+#include <linux/jbd2.h>
+#include <linux/blkdev.h>
+
+#include "ext4.h"
+#include "ext4_jbd2.h"
+
+#include <trace/events/ext4.h>
+
+/*
+ * Debug helper: when EXT4FS_DEBUG is defined, print every entry on the
+ * inode's completed-IO list together with its list neighbours. Compiles
+ * to an empty function otherwise.
+ */
+static void dump_completed_IO(struct inode * inode)
+{
+#ifdef EXT4FS_DEBUG
+	struct ext4_inode_info *ei = EXT4_I(inode);
+	ext4_io_end_t *io;
+	unsigned long flags;
+
+	if (list_empty(&ei->i_completed_io_list)) {
+		ext4_debug("inode %lu completed_io list is empty\n", inode->i_ino);
+		return;
+	}
+
+	ext4_debug("Dump inode %lu completed_io list \n", inode->i_ino);
+	spin_lock_irqsave(&ei->i_completed_io_lock, flags);
+	list_for_each_entry(io, &ei->i_completed_io_list, list) {
+		/* Neighbours are reported as if they were io_end entries too. */
+		ext4_io_end_t *prev_io = container_of(io->list.prev,
+						      ext4_io_end_t, list);
+		ext4_io_end_t *next_io = container_of(io->list.next,
+						      ext4_io_end_t, list);
+
+		ext4_debug("io 0x%p from inode %lu,prev 0x%p,next 0x%p\n",
+			   io, inode->i_ino, prev_io, next_io);
+	}
+	spin_unlock_irqrestore(&ei->i_completed_io_lock, flags);
+#endif
+}
+
+/*
+ * This function is called from ext4_sync_file().
+ *
+ * When IO is completed, the work to convert unwritten extents to
+ * written is queued on workqueue but may not get immediately
+ * scheduled. When fsync is called, we need to ensure the
+ * conversion is complete before fsync returns.
+ * The inode keeps track of a list of pending/completed IO that
+ * might need the conversion. This function walks through
+ * the list and converts the related unwritten extents for completed IO
+ * to written.
+ * The function returns 0 on success, or a negative error code if any of
+ * the conversions failed.
+ */
+int ext4_flush_completed_IO(struct inode *inode)
+{
+ ext4_io_end_t *io;
+ struct ext4_inode_info *ei = EXT4_I(inode);
+ unsigned long flags;
+ int ret = 0;
+ int ret2 = 0; /* most recent negative result from ext4_end_io_nolock() */
+
+ dump_completed_IO(inode);
+ spin_lock_irqsave(&ei->i_completed_io_lock, flags);
+ /* Pop entries one at a time; the list lock is dropped around each conversion. */
+ while (!list_empty(&ei->i_completed_io_list)){
+ io = list_entry(ei->i_completed_io_list.next,
+ ext4_io_end_t, list);
+ list_del_init(&io->list);
+ io->flag |= EXT4_IO_END_IN_FSYNC;
+ /*
+ * Calling ext4_end_io_nolock() to convert completed
+ * IO to written.
+ *
+ * When ext4_sync_file() is called, run_queue() may already
+ * be about to flush the work corresponding to this io structure.
+ * It will be upset if it finds that the io structure related
+ * to the work to be scheduled has been freed.
+ *
+ * Thus we need to keep the io structure still valid here after
+ * conversion finished. The io structure has a flag to
+ * avoid double converting from both fsync and background work
+ * queue work.
+ */
+ spin_unlock_irqrestore(&ei->i_completed_io_lock, flags);
+ ret = ext4_end_io_nolock(io);
+ if (ret < 0)
+ ret2 = ret;
+ spin_lock_irqsave(&ei->i_completed_io_lock, flags);
+ io->flag &= ~EXT4_IO_END_IN_FSYNC;
+ }
+ spin_unlock_irqrestore(&ei->i_completed_io_lock, flags);
+ /* Positive results are not propagated: callers see 0 or an error. */
+ return (ret2 < 0) ? ret2 : 0;
+}
+
+/*
+ * If we're not journaling and this is a just-created file, we have to
+ * sync our parent directory (if it was freshly created) since
+ * otherwise it will only be written by writeback, leaving a huge
+ * window during which a crash may lose the file. This may apply for
+ * the parent directory's parent as well, and so on recursively, if
+ * they are also freshly created.
+ */
+static int ext4_sync_parent(struct inode *inode)
+{
+ struct writeback_control wbc;
+ struct dentry *dentry = NULL;
+ struct inode *next;
+ int ret = 0;
+
+ if (!ext4_test_inode_state(inode, EXT4_STATE_NEWENTRY))
+ return 0;
+ /*
+ * Hold our own reference; it is dropped by the iput() calls below.
+ * NOTE(review): the igrab() result is used without a NULL check -
+ * presumably safe because the caller holds the file open; confirm.
+ */
+ inode = igrab(inode);
+ /* Walk upwards while each inode still carries the NEWENTRY state. */
+ while (ext4_test_inode_state(inode, EXT4_STATE_NEWENTRY)) {
+ ext4_clear_inode_state(inode, EXT4_STATE_NEWENTRY);
+ dentry = NULL;
+ /* Pin the first dentry alias of this inode, if there is one. */
+ spin_lock(&inode->i_lock);
+ if (!list_empty(&inode->i_dentry)) {
+ dentry = list_first_entry(&inode->i_dentry,
+ struct dentry, d_alias);
+ dget(dentry);
+ }
+ spin_unlock(&inode->i_lock);
+ if (!dentry)
+ break;
+ /* Take a reference on the parent's inode before releasing the dentry. */
+ next = igrab(dentry->d_parent->d_inode);
+ dput(dentry);
+ if (!next)
+ break;
+ /* Swap the held reference from the child to the parent. */
+ iput(inode);
+ inode = next;
+ ret = sync_mapping_buffers(inode->i_mapping);
+ if (ret)
+ break;
+ memset(&wbc, 0, sizeof(wbc));
+ wbc.sync_mode = WB_SYNC_ALL;
+ wbc.nr_to_write = 0; /* only write out the inode */
+ ret = sync_inode(inode, &wbc);
+ if (ret)
+ break;
+ }
+ /* Drop whichever inode reference is still held. */
+ iput(inode);
+ return ret;
+}
+
+/**
+ * __sync_inode - generic_file_fsync without the locking and filemap_write
+ * @inode: inode to sync
+ * @datasync: only sync essential metadata if true
+ *
+ * This is just generic_file_fsync without the locking. This is needed for
+ * nojournal mode to make sure this inode's data/metadata makes it to disk
+ * properly. The i_mutex should be held already.
+ */
+static int __sync_inode(struct inode *inode, int datasync)
+{
+	int buf_err = sync_mapping_buffers(inode->i_mapping);
+	int meta_err;
+
+	/* Inode itself is clean: the buffer sync result is the final answer. */
+	if (!(inode->i_state & I_DIRTY))
+		return buf_err;
+	/* For datasync, skip the inode unless I_DIRTY_DATASYNC is set. */
+	if (datasync && !(inode->i_state & I_DIRTY_DATASYNC))
+		return buf_err;
+
+	/* Write the inode synchronously; the first error seen wins. */
+	meta_err = sync_inode_metadata(inode, 1);
+	return buf_err ? buf_err : meta_err;
+}
+
+/*
+ * akpm: A new design for ext4_sync_file().
+ *
+ * This is only called from sys_fsync(), sys_fdatasync() and sys_msync().
+ * There cannot be a transaction open by this task.
+ * Another task could have dirtied this inode. Its data can be in any
+ * state in the journalling system.
+ *
+ * What we do is just kick off a commit and wait on it. This will snapshot the
+ * inode to disk.
+ *
+ * i_mutex is taken and released inside this function, after the data range
+ * [start, end] has been written back.
+ *
+ * @file:     open file whose inode is synced
+ * @start:    first byte of the range to write back
+ * @end:      last byte of the range to write back
+ * @datasync: non-zero to wait only for the datasync transaction
+ *
+ * Returns 0 on success or a negative error code. A failure of the explicit
+ * cache flush is reported when nothing else failed earlier.
+ */
+
+int ext4_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
+{
+	struct inode *inode = file->f_mapping->host;
+	struct ext4_inode_info *ei = EXT4_I(inode);
+	journal_t *journal = EXT4_SB(inode->i_sb)->s_journal;
+	int ret, err;
+	tid_t commit_tid;
+	bool needs_barrier = false;
+
+	J_ASSERT(ext4_journal_current_handle() == NULL);
+
+	trace_ext4_sync_file_enter(file, datasync);
+
+	ret = filemap_write_and_wait_range(inode->i_mapping, start, end);
+	if (ret)
+		return ret;
+	mutex_lock(&inode->i_mutex);
+
+	/* Nothing to commit on a read-only mount; ret is still 0 here. */
+	if (inode->i_sb->s_flags & MS_RDONLY)
+		goto out;
+
+	/* Finish pending unwritten-extent conversions before syncing metadata. */
+	ret = ext4_flush_completed_IO(inode);
+	if (ret < 0)
+		goto out;
+
+	if (!journal) {
+		/* No journal: write the inode directly, then new parent dirs. */
+		ret = __sync_inode(inode, datasync);
+		if (!ret && !list_empty(&inode->i_dentry))
+			ret = ext4_sync_parent(inode);
+		goto out;
+	}
+
+	/*
+	 * data=writeback,ordered:
+	 *  The caller's filemap_fdatawrite()/wait will sync the data.
+	 *  Metadata is in the journal, we wait for proper transaction to
+	 *  commit here.
+	 *
+	 * data=journal:
+	 *  filemap_fdatawrite won't do anything (the buffers are clean).
+	 *  ext4_force_commit will write the file data into the journal and
+	 *  will wait on that.
+	 *  filemap_fdatawait() will encounter a ton of newly-dirtied pages
+	 *  (they were dirtied by commit).  But that's OK - the blocks are
+	 *  safe in-journal, which is all fsync() needs to ensure.
+	 */
+	if (ext4_should_journal_data(inode)) {
+		ret = ext4_force_commit(inode->i_sb);
+		goto out;
+	}
+
+	commit_tid = datasync ? ei->i_datasync_tid : ei->i_sync_tid;
+	/*
+	 * If barriers are enabled but the commit of this transaction will
+	 * not send a data barrier itself, issue an explicit flush afterwards.
+	 */
+	if (journal->j_flags & JBD2_BARRIER &&
+	    !jbd2_trans_will_send_data_barrier(journal, commit_tid))
+		needs_barrier = true;
+	jbd2_log_start_commit(journal, commit_tid);
+	ret = jbd2_log_wait_commit(journal, commit_tid);
+	if (needs_barrier) {
+		/*
+		 * Do not drop a flush failure: fsync must not report success
+		 * if the device cache could not be flushed. An earlier commit
+		 * error takes precedence.
+		 */
+		err = blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, NULL);
+		if (!ret)
+			ret = err;
+	}
+ out:
+	mutex_unlock(&inode->i_mutex);
+	trace_ext4_sync_file_exit(inode, ret);
+	return ret;
+}
diff --git a/fs/ext4/hash.c b/fs/ext4/hash.c
new file mode 100644
index 00000000..fa8e4911
--- /dev/null
+++ b/fs/ext4/hash.c
@@ -0,0 +1,208 @@
+/*
+ * linux/fs/ext4/hash.c
+ *
+ * Copyright (C) 2002 by Theodore Ts'o
+ *
+ * This file is released under the GPL v2.
+ *
+ * This file may be redistributed under the terms of the GNU Public
+ * License.
+ */
+
+#include <linux/fs.h>
+#include <linux/jbd2.h>
+#include <linux/cryptohash.h>
+#include "ext4.h"
+
+#define DELTA 0x9E3779B9
+
+/*
+ * Run 16 rounds of the TEA-style mixing step over the first two words of
+ * @buf, keyed by the four words of @in, and add the result back into
+ * buf[0] and buf[1]. buf[2] and buf[3] are not touched.
+ */
+static void TEA_transform(__u32 buf[4], __u32 const in[])
+{
+	__u32 k0 = in[0], k1 = in[1], k2 = in[2], k3 = in[3];
+	__u32 x = buf[0], y = buf[1];
+	__u32 sum = 0;
+	int round;
+
+	for (round = 0; round < 16; round++) {
+		sum += DELTA;
+		x += ((y << 4)+k0) ^ (y+sum) ^ ((y >> 5)+k1);
+		y += ((x << 4)+k2) ^ (x+sum) ^ ((x >> 5)+k3);
+	}
+
+	buf[0] += x;
+	buf[1] += y;
+}
+
+
+/* The old legacy hash */
+/*
+ * Legacy directory hash, treating each name byte as unsigned.
+ * Returns the final accumulator shifted left by one bit.
+ */
+static __u32 dx_hack_hash_unsigned(const char *name, int len)
+{
+	const unsigned char *cur = (const unsigned char *) name;
+	__u32 prev = 0x37abe8f9;
+	__u32 acc = 0x12a3fe2d;
+	__u32 next;
+
+	for (; len != 0; len--, cur++) {
+		next = prev + (acc ^ (((int) *cur) * 7152373));
+
+		/* Fold the value back when the top bit gets set. */
+		if (next & 0x80000000)
+			next -= 0x7fffffff;
+		prev = acc;
+		acc = next;
+	}
+	return acc << 1;
+}
+
+/*
+ * Legacy directory hash, treating each name byte as signed (bytes >= 0x80
+ * are sign-extended before the multiply).
+ * Returns the final accumulator shifted left by one bit.
+ */
+static __u32 dx_hack_hash_signed(const char *name, int len)
+{
+	const signed char *cur = (const signed char *) name;
+	__u32 prev = 0x37abe8f9;
+	__u32 acc = 0x12a3fe2d;
+	__u32 next;
+
+	for (; len != 0; len--, cur++) {
+		next = prev + (acc ^ (((int) *cur) * 7152373));
+
+		/* Fold the value back when the top bit gets set. */
+		if (next & 0x80000000)
+			next -= 0x7fffffff;
+		prev = acc;
+		acc = next;
+	}
+	return acc << 1;
+}
+
+/*
+ * Pack up to num*4 bytes of @msg into @num 32-bit words in @buf, reading
+ * the bytes as signed chars. Words (or parts of words) not covered by the
+ * input are filled from a pad value derived from the original @len.
+ * For num > 0, exactly @num words are written.
+ */
+static void str2hashbuf_signed(const char *msg, int len, __u32 *buf, int num)
+{
+ __u32 pad, val;
+ int i;
+ const signed char *scp = (const signed char *) msg;
+
+ /* Replicate len into each byte position of the pad word. */
+ pad = (__u32)len | ((__u32)len << 8);
+ pad |= pad << 16;
+
+ val = pad;
+ /* Never consume more input than fits in num words. */
+ if (len > num*4)
+ len = num * 4;
+ for (i = 0; i < len; i++) {
+ if ((i % 4) == 0)
+ val = pad;
+ /* The (int) cast sign-extends bytes >= 0x80 before the add. */
+ val = ((int) scp[i]) + (val << 8);
+ if ((i % 4) == 3) {
+ *buf++ = val;
+ val = pad;
+ num--;
+ }
+ }
+ /* Emit the partially filled word (or one pad word) if room remains. */
+ if (--num >= 0)
+ *buf++ = val;
+ /* Fill the rest of the output with pad words. */
+ while (--num >= 0)
+ *buf++ = pad;
+}
+
+/*
+ * Pack up to num*4 bytes of @msg into @num 32-bit words in @buf, reading
+ * the bytes as unsigned chars. Words (or parts of words) not covered by
+ * the input are filled from a pad value derived from the original @len.
+ * For num > 0, exactly @num words are written.
+ */
+static void str2hashbuf_unsigned(const char *msg, int len, __u32 *buf, int num)
+{
+ __u32 pad, val;
+ int i;
+ const unsigned char *ucp = (const unsigned char *) msg;
+
+ /* Replicate len into each byte position of the pad word. */
+ pad = (__u32)len | ((__u32)len << 8);
+ pad |= pad << 16;
+
+ val = pad;
+ /* Never consume more input than fits in num words. */
+ if (len > num*4)
+ len = num * 4;
+ for (i = 0; i < len; i++) {
+ if ((i % 4) == 0)
+ val = pad;
+ val = ((int) ucp[i]) + (val << 8);
+ if ((i % 4) == 3) {
+ *buf++ = val;
+ val = pad;
+ num--;
+ }
+ }
+ /* Emit the partially filled word (or one pad word) if room remains. */
+ if (--num >= 0)
+ *buf++ = val;
+ /* Fill the rest of the output with pad words. */
+ while (--num >= 0)
+ *buf++ = pad;
+}
+
+/*
+ * Returns the hash of a filename. If len is 0 and name is NULL, then
+ * this function can be used to test whether or not a hash version is
+ * supported.
+ *
+ * The seed is a 4-longword (32 bits each) "secret" which can be used to
+ * uniquify a hash. If the seed is all zeros, then a default seed
+ * is used.
+ *
+ * A particular hash version specifies whether or not the seed is
+ * represented, and whether or not the returned hash is 32 bits or 64
+ * bits. 32 bit hashes will return 0 for the minor hash.
+ */
+int ext4fs_dirhash(const char *name, int len, struct dx_hash_info *hinfo)
+{
+ __u32 hash;
+ __u32 minor_hash = 0;
+ const char *p;
+ int i;
+ __u32 in[8], buf[4];
+ /* Signed packing is the default; the *_UNSIGNED versions switch it below. */
+ void (*str2hashbuf)(const char *, int, __u32 *, int) =
+ str2hashbuf_signed;
+
+ /* Initialize the default seed for the hash checksum functions */
+ buf[0] = 0x67452301;
+ buf[1] = 0xefcdab89;
+ buf[2] = 0x98badcfe;
+ buf[3] = 0x10325476;
+
+ /* Check to see if the seed is all zero's */
+ if (hinfo->seed) {
+ for (i = 0; i < 4; i++) {
+ if (hinfo->seed[i])
+ break;
+ }
+ /* Only a seed with at least one non-zero word replaces the default. */
+ if (i < 4)
+ memcpy(buf, hinfo->seed, sizeof(buf));
+ }
+
+ switch (hinfo->hash_version) {
+ case DX_HASH_LEGACY_UNSIGNED:
+ hash = dx_hack_hash_unsigned(name, len);
+ break;
+ case DX_HASH_LEGACY:
+ hash = dx_hack_hash_signed(name, len);
+ break;
+ case DX_HASH_HALF_MD4_UNSIGNED:
+ str2hashbuf = str2hashbuf_unsigned;
+ /* fall through */
+ case DX_HASH_HALF_MD4:
+ p = name;
+ /* Consume the name in 32-byte chunks (8 words per transform). */
+ while (len > 0) {
+ (*str2hashbuf)(p, len, in, 8);
+ half_md4_transform(buf, in);
+ len -= 32;
+ p += 32;
+ }
+ minor_hash = buf[2];
+ hash = buf[1];
+ break;
+ case DX_HASH_TEA_UNSIGNED:
+ str2hashbuf = str2hashbuf_unsigned;
+ /* fall through */
+ case DX_HASH_TEA:
+ p = name;
+ /* Consume the name in 16-byte chunks (4 words per transform). */
+ while (len > 0) {
+ (*str2hashbuf)(p, len, in, 4);
+ TEA_transform(buf, in);
+ len -= 16;
+ p += 16;
+ }
+ hash = buf[0];
+ minor_hash = buf[1];
+ break;
+ default:
+ /* Unknown hash version: only hinfo->hash is reset before failing. */
+ hinfo->hash = 0;
+ return -1;
+ }
+ /* Clear the low bit and never return the value EXT4_HTREE_EOF_32BIT << 1. */
+ hash = hash & ~1;
+ if (hash == (EXT4_HTREE_EOF_32BIT << 1))
+ hash = (EXT4_HTREE_EOF_32BIT - 1) << 1;
+ hinfo->hash = hash;
+ hinfo->minor_hash = minor_hash;
+ return 0;
+}
diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c
new file mode 100644
index 00000000..b4a7dd56
--- /dev/null
+++ b/fs/ext4/ialloc.c
@@ -0,0 +1,1161 @@
+/*
+ * linux/fs/ext4/ialloc.c
+ *
+ * Copyright (C) 1992, 1993, 1994, 1995
+ * Remy Card (card@masi.ibp.fr)
+ * Laboratoire MASI - Institut Blaise Pascal
+ * Universite Pierre et Marie Curie (Paris VI)
+ *
+ * BSD ufs-inspired inode and directory allocation by
+ * Stephen Tweedie (sct@redhat.com), 1993
+ * Big-endian to little-endian byte-swapping/bitmaps by
+ * David S. Miller (davem@caip.rutgers.edu), 1995
+ */
+
+#include <linux/time.h>
+#include <linux/fs.h>
+#include <linux/jbd2.h>
+#include <linux/stat.h>
+#include <linux/string.h>
+#include <linux/quotaops.h>
+#include <linux/buffer_head.h>
+#include <linux/random.h>
+#include <linux/bitops.h>
+#include <linux/blkdev.h>
+#include <asm/byteorder.h>
+
+#include "ext4.h"
+#include "ext4_jbd2.h"
+#include "xattr.h"
+#include "acl.h"
+
+#include <trace/events/ext4.h>
+
+/*
+ * ialloc.c contains the inodes allocation and deallocation routines
+ */
+
+/*
+ * The free inodes are managed by bitmaps. A file system contains several
+ * block groups. Each group contains 1 bitmap block for blocks, 1 bitmap
+ * block for inodes, N blocks for the inode table and data blocks.
+ *
+ * The file system contains group descriptors which are located after the
+ * super block. Each descriptor contains the number of the bitmap block and
+ * the free blocks count in the group.
+ */
+
+/*
+ * To avoid calling the atomic setbit hundreds or thousands of times, we only
+ * need to use it within a single byte (to ensure we get endianness right).
+ * We can use memset for the rest of the bitmap as there are no other users.
+ */
+void ext4_mark_bitmap_end(int start_bit, int end_bit, char *bitmap)
+{
+	int bit;
+
+	if (end_bit <= start_bit)
+		return;
+
+	ext4_debug("mark end bits +%d through +%d used\n", start_bit, end_bit);
+
+	/* Set single bits from start_bit up to the next byte boundary. */
+	bit = start_bit;
+	while (bit < ((start_bit + 7) & ~7UL)) {
+		ext4_set_bit(bit, bitmap);
+		bit++;
+	}
+
+	/* Fill the whole bytes remaining before end_bit with 0xff. */
+	if (bit < end_bit)
+		memset(bitmap + (bit >> 3), 0xff, (end_bit - bit) >> 3);
+}
+
+/* Initializes an uninitialized inode bitmap */
+static unsigned ext4_init_inode_bitmap(struct super_block *sb,
+				       struct buffer_head *bh,
+				       ext4_group_t block_group,
+				       struct ext4_group_desc *gdp)
+{
+	struct ext4_sb_info *sbi = EXT4_SB(sb);
+
+	J_ASSERT_BH(bh, buffer_locked(bh));
+
+	if (ext4_group_desc_csum_verify(sbi, block_group, gdp)) {
+		/*
+		 * Good descriptor: every inode in the group is free, and the
+		 * bits past the last inode up to the end of the block are
+		 * marked in use.
+		 */
+		memset(bh->b_data, 0, (EXT4_INODES_PER_GROUP(sb) + 7) / 8);
+		ext4_mark_bitmap_end(EXT4_INODES_PER_GROUP(sb),
+				     sb->s_blocksize * 8, bh->b_data);
+
+		return EXT4_INODES_PER_GROUP(sb);
+	}
+
+	/*
+	 * If the checksum is bad, mark all blocks and inodes in use to
+	 * prevent allocation, essentially implementing a per-group
+	 * read-only flag.
+	 */
+	ext4_error(sb, "Checksum bad for group %u", block_group);
+	ext4_free_group_clusters_set(sb, gdp, 0);
+	ext4_free_inodes_set(sb, gdp, 0);
+	ext4_itable_unused_set(sb, gdp, 0);
+	memset(bh->b_data, 0xff, sb->s_blocksize);
+	return 0;
+}
+
+/*
+ * Completion handler installed as bh->b_end_io for bitmap reads.
+ * On success it marks both the buffer and the bitmap up to date; in all
+ * cases it unlocks the buffer and drops one buffer reference.
+ */
+void ext4_end_bitmap_read(struct buffer_head *bh, int uptodate)
+{
+ if (uptodate) {
+ set_buffer_uptodate(bh);
+ set_bitmap_uptodate(bh);
+ }
+ unlock_buffer(bh);
+ put_bh(bh);
+}
+
+/*
+ * Read the inode allocation bitmap for a given block_group, reading
+ * into the specified slot in the superblock's bitmap cache.
+ *
+ * Return buffer_head of bitmap on success or NULL.
+ */
+static struct buffer_head *
+ext4_read_inode_bitmap(struct super_block *sb, ext4_group_t block_group)
+{
+ struct ext4_group_desc *desc;
+ struct buffer_head *bh = NULL;
+ ext4_fsblk_t bitmap_blk;
+
+ desc = ext4_get_group_desc(sb, block_group, NULL);
+ if (!desc)
+ return NULL;
+
+ bitmap_blk = ext4_inode_bitmap(sb, desc);
+ bh = sb_getblk(sb, bitmap_blk);
+ if (unlikely(!bh)) {
+ ext4_error(sb, "Cannot read inode bitmap - "
+ "block_group = %u, inode_bitmap = %llu",
+ block_group, bitmap_blk);
+ return NULL;
+ }
+ /* Fast path: bitmap already valid, no buffer lock needed. */
+ if (bitmap_uptodate(bh))
+ return bh;
+
+ lock_buffer(bh);
+ /* Re-check under the buffer lock: it may have become valid meanwhile. */
+ if (bitmap_uptodate(bh)) {
+ unlock_buffer(bh);
+ return bh;
+ }
+
+ ext4_lock_group(sb, block_group);
+ if (desc->bg_flags & cpu_to_le16(EXT4_BG_INODE_UNINIT)) {
+ /* Group flagged INODE_UNINIT: build the bitmap in memory, no disk read. */
+ ext4_init_inode_bitmap(sb, bh, block_group, desc);
+ set_bitmap_uptodate(bh);
+ set_buffer_uptodate(bh);
+ ext4_unlock_group(sb, block_group);
+ unlock_buffer(bh);
+ return bh;
+ }
+ ext4_unlock_group(sb, block_group);
+
+ if (buffer_uptodate(bh)) {
+ /*
+ * if not uninit if bh is uptodate,
+ * bitmap is also uptodate
+ */
+ set_bitmap_uptodate(bh);
+ unlock_buffer(bh);
+ return bh;
+ }
+ /*
+ * submit the buffer_head for reading
+ */
+ trace_ext4_load_inode_bitmap(sb, block_group);
+ bh->b_end_io = ext4_end_bitmap_read;
+ /* Extra reference for the I/O; ext4_end_bitmap_read() drops it. */
+ get_bh(bh);
+ submit_bh(READ, bh);
+ wait_on_buffer(bh);
+ if (!buffer_uptodate(bh)) {
+ /* Read failed: release the reference taken by sb_getblk(). */
+ put_bh(bh);
+ ext4_error(sb, "Cannot read inode bitmap - "
+ "block_group = %u, inode_bitmap = %llu",
+ block_group, bitmap_blk);
+ return NULL;
+ }
+ return bh;
+}
+
+/*
+ * NOTE! When we get the inode, we're the only people
+ * that have access to it, and as such there are no
+ * race conditions we have to worry about. The inode
+ * is not on the hash-lists, and it cannot be reached
+ * through the filesystem because the directory entry
+ * has been deleted earlier.
+ *
+ * HOWEVER: we must make sure that we get no aliases,
+ * which means that we have to call "clear_inode()"
+ * _before_ we mark the inode not in use in the inode
+ * bitmaps. Otherwise a newly created file might use
+ * the same inode number (not actually the same pointer
+ * though), and then we'd have two inodes sharing the
+ * same inode number and space on the harddisk.
+ */
+void ext4_free_inode(handle_t *handle, struct inode *inode)
+{
+ struct super_block *sb = inode->i_sb;
+ int is_directory;
+ unsigned long ino;
+ struct buffer_head *bitmap_bh = NULL;
+ struct buffer_head *bh2;
+ ext4_group_t block_group;
+ unsigned long bit;
+ struct ext4_group_desc *gdp;
+ struct ext4_super_block *es;
+ struct ext4_sb_info *sbi;
+ int fatal = 0, err, count, cleared;
+
+ /* Sanity checks: refuse to free an inode that still looks in use. */
+ if (!sb) {
+ printk(KERN_ERR "EXT4-fs: %s:%d: inode on "
+ "nonexistent device\n", __func__, __LINE__);
+ return;
+ }
+ if (atomic_read(&inode->i_count) > 1) {
+ ext4_msg(sb, KERN_ERR, "%s:%d: inode #%lu: count=%d",
+ __func__, __LINE__, inode->i_ino,
+ atomic_read(&inode->i_count));
+ return;
+ }
+ if (inode->i_nlink) {
+ ext4_msg(sb, KERN_ERR, "%s:%d: inode #%lu: nlink=%d\n",
+ __func__, __LINE__, inode->i_ino, inode->i_nlink);
+ return;
+ }
+ sbi = EXT4_SB(sb);
+
+ ino = inode->i_ino;
+ ext4_debug("freeing inode %lu\n", ino);
+ trace_ext4_free_inode(inode);
+
+ /*
+ * Note: we must free any quota before locking the superblock,
+ * as writing the quota to disk may need the lock as well.
+ */
+ dquot_initialize(inode);
+ ext4_xattr_delete_inode(handle, inode);
+ dquot_free_inode(inode);
+ dquot_drop(inode);
+
+ /* Sample the mode now, before ext4_clear_inode() is called. */
+ is_directory = S_ISDIR(inode->i_mode);
+
+ /* Do this BEFORE marking the inode not in use or returning an error */
+ ext4_clear_inode(inode);
+
+ es = EXT4_SB(sb)->s_es;
+ if (ino < EXT4_FIRST_INO(sb) || ino > le32_to_cpu(es->s_inodes_count)) {
+ ext4_error(sb, "reserved or nonexistent inode %lu", ino);
+ goto error_return;
+ }
+ /* Inode numbers are 1-based; locate the group and the bit within it. */
+ block_group = (ino - 1) / EXT4_INODES_PER_GROUP(sb);
+ bit = (ino - 1) % EXT4_INODES_PER_GROUP(sb);
+ bitmap_bh = ext4_read_inode_bitmap(sb, block_group);
+ if (!bitmap_bh)
+ goto error_return;
+
+ BUFFER_TRACE(bitmap_bh, "get_write_access");
+ fatal = ext4_journal_get_write_access(handle, bitmap_bh);
+ if (fatal)
+ goto error_return;
+
+ /* fatal stays -ESRCH if the group descriptor cannot be obtained. */
+ fatal = -ESRCH;
+ gdp = ext4_get_group_desc(sb, block_group, &bh2);
+ if (gdp) {
+ BUFFER_TRACE(bh2, "get_write_access");
+ fatal = ext4_journal_get_write_access(handle, bh2);
+ }
+ ext4_lock_group(sb, block_group);
+ /* The bitmap bit is cleared even when fatal is set; see the out: label. */
+ cleared = ext4_test_and_clear_bit(bit, bitmap_bh->b_data);
+ if (fatal || !cleared) {
+ ext4_unlock_group(sb, block_group);
+ goto out;
+ }
+
+ /* Update the per-group counters and checksum under the group lock. */
+ count = ext4_free_inodes_count(sb, gdp) + 1;
+ ext4_free_inodes_set(sb, gdp, count);
+ if (is_directory) {
+ count = ext4_used_dirs_count(sb, gdp) - 1;
+ ext4_used_dirs_set(sb, gdp, count);
+ percpu_counter_dec(&sbi->s_dirs_counter);
+ }
+ gdp->bg_checksum = ext4_group_desc_csum(sbi, block_group, gdp);
+ ext4_unlock_group(sb, block_group);
+
+ /* Filesystem-wide and flex-group counters are updated outside the lock. */
+ percpu_counter_inc(&sbi->s_freeinodes_counter);
+ if (sbi->s_log_groups_per_flex) {
+ ext4_group_t f = ext4_flex_group(sbi, block_group);
+
+ atomic_inc(&sbi->s_flex_groups[f].free_inodes);
+ if (is_directory)
+ atomic_dec(&sbi->s_flex_groups[f].used_dirs);
+ }
+ BUFFER_TRACE(bh2, "call ext4_handle_dirty_metadata");
+ fatal = ext4_handle_dirty_metadata(handle, NULL, bh2);
+out:
+ if (cleared) {
+ BUFFER_TRACE(bitmap_bh, "call ext4_handle_dirty_metadata");
+ err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh);
+ /* Keep the first error seen. */
+ if (!fatal)
+ fatal = err;
+ ext4_mark_super_dirty(sb);
+ } else
+ ext4_error(sb, "bit already cleared for inode %lu", ino);
+
+error_return:
+ brelse(bitmap_bh);
+ ext4_std_error(sb, fatal);
+}
+/*
+ * Per-group (or per-flex-group) counters filled in by get_orlov_stats().
+ */
+struct orlov_stats {
+ __u32 free_inodes; /* free inode count */
+ __u32 free_clusters; /* free cluster count */
+ __u32 used_dirs; /* number of directories in use */
+};
+
+/*
+ * Helper function for Orlov's allocator; returns critical information
+ * for a particular block group or flex_bg. If flex_size is 1, then g
+ * is a block group number; otherwise it is flex_bg number.
+ */
+static void get_orlov_stats(struct super_block *sb, ext4_group_t g,
+			    int flex_size, struct orlov_stats *stats)
+{
+	struct ext4_group_desc *desc;
+
+	/* Flex-group mode: take the aggregated in-memory counters. */
+	if (flex_size > 1) {
+		struct flex_groups *fg = &EXT4_SB(sb)->s_flex_groups[g];
+
+		stats->free_inodes = atomic_read(&fg->free_inodes);
+		stats->free_clusters = atomic_read(&fg->free_clusters);
+		stats->used_dirs = atomic_read(&fg->used_dirs);
+		return;
+	}
+
+	/* Plain block group: read the counts from its descriptor. */
+	desc = ext4_get_group_desc(sb, g, NULL);
+	if (!desc) {
+		/* No descriptor available: report the group as empty. */
+		stats->free_inodes = 0;
+		stats->free_clusters = 0;
+		stats->used_dirs = 0;
+		return;
+	}
+
+	stats->free_inodes = ext4_free_inodes_count(sb, desc);
+	stats->free_clusters = ext4_free_group_clusters(sb, desc);
+	stats->used_dirs = ext4_used_dirs_count(sb, desc);
+}
+
+/*
+ * Orlov's allocator for directories.
+ *
+ * We always try to spread first-level directories.
+ *
+ * If there are blockgroups with both free inodes and free blocks counts
+ * not worse than average we return one with smallest directory count.
+ * Otherwise we simply return a random group.
+ *
+ * For all other directories the rules are:
+ *
+ * It's OK to put a directory into a group unless
+ * it has too many directories already (max_dirs) or
+ * it has too few free inodes left (min_inodes) or
+ * it has too few free clusters left (min_clusters).
+ * The parent's group is preferred; if it doesn't satisfy these
+ * conditions we search cyclically through the rest. If none
+ * of the groups look good we just look for a group with at least
+ * the average number of free inodes (starting at the parent's group).
+ */
+
+/*
+ * Pick a block group for a new inode under @parent and store it in @group.
+ * Returns 0 when a group was chosen, -1 when no group has a free inode.
+ * @qstr, when non-NULL, is hashed to choose the starting group for
+ * top-level directories; otherwise a random start is used.
+ */
+static int find_group_orlov(struct super_block *sb, struct inode *parent,
+ ext4_group_t *group, umode_t mode,
+ const struct qstr *qstr)
+{
+ ext4_group_t parent_group = EXT4_I(parent)->i_block_group;
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
+ ext4_group_t real_ngroups = ext4_get_groups_count(sb);
+ int inodes_per_group = EXT4_INODES_PER_GROUP(sb);
+ unsigned int freei, avefreei, grp_free;
+ ext4_fsblk_t freeb, avefreec;
+ unsigned int ndirs;
+ int max_dirs, min_inodes;
+ ext4_grpblk_t min_clusters;
+ ext4_group_t i, grp, g, ngroups;
+ struct ext4_group_desc *desc;
+ struct orlov_stats stats;
+ int flex_size = ext4_flex_bg_size(sbi);
+ struct dx_hash_info hinfo;
+
+ /*
+ * With flex_bg the search below works in units of flex groups:
+ * ngroups becomes the flex group count and parent_group the index
+ * of the parent's flex group.
+ */
+ ngroups = real_ngroups;
+ if (flex_size > 1) {
+ ngroups = (real_ngroups + flex_size - 1) >>
+ sbi->s_log_groups_per_flex;
+ parent_group >>= sbi->s_log_groups_per_flex;
+ }
+
+ /* Averages per (flex) group, from the approximate per-cpu counters. */
+ freei = percpu_counter_read_positive(&sbi->s_freeinodes_counter);
+ avefreei = freei / ngroups;
+ freeb = EXT4_C2B(sbi,
+ percpu_counter_read_positive(&sbi->s_freeclusters_counter));
+ avefreec = freeb;
+ do_div(avefreec, ngroups);
+ ndirs = percpu_counter_read_positive(&sbi->s_dirs_counter);
+
+ /*
+ * Top-level directory (parent is the root or has EXT4_INODE_TOPDIR):
+ * start from a hashed or random group and keep the candidate with
+ * the fewest directories among those not below average.
+ */
+ if (S_ISDIR(mode) &&
+ ((parent == sb->s_root->d_inode) ||
+ (ext4_test_inode_flag(parent, EXT4_INODE_TOPDIR)))) {
+ int best_ndir = inodes_per_group;
+ int ret = -1;
+
+ if (qstr) {
+ hinfo.hash_version = DX_HASH_HALF_MD4;
+ hinfo.seed = sbi->s_hash_seed;
+ ext4fs_dirhash(qstr->name, qstr->len, &hinfo);
+ grp = hinfo.hash;
+ } else
+ get_random_bytes(&grp, sizeof(grp));
+ parent_group = (unsigned)grp % ngroups;
+ for (i = 0; i < ngroups; i++) {
+ g = (parent_group + i) % ngroups;
+ get_orlov_stats(sb, g, flex_size, &stats);
+ if (!stats.free_inodes)
+ continue;
+ if (stats.used_dirs >= best_ndir)
+ continue;
+ if (stats.free_inodes < avefreei)
+ continue;
+ if (stats.free_clusters < avefreec)
+ continue;
+ grp = g;
+ ret = 0;
+ best_ndir = stats.used_dirs;
+ }
+ if (ret)
+ goto fallback;
+ /* Also reached by goto from the non-topdir search loop below. */
+ found_flex_bg:
+ if (flex_size == 1) {
+ *group = grp;
+ return 0;
+ }
+
+ /*
+ * We pack inodes at the beginning of the flexgroup's
+ * inode tables. Block allocation decisions will do
+ * something similar, although regular files will
+ * start at 2nd block group of the flexgroup. See
+ * ext4_ext_find_goal() and ext4_find_near().
+ */
+ grp *= flex_size;
+ for (i = 0; i < flex_size; i++) {
+ if (grp+i >= real_ngroups)
+ break;
+ desc = ext4_get_group_desc(sb, grp+i, NULL);
+ if (desc && ext4_free_inodes_count(sb, desc)) {
+ *group = grp+i;
+ return 0;
+ }
+ }
+ goto fallback;
+ }
+
+ /* Thresholds for the non-topdir search. */
+ max_dirs = ndirs / ngroups + inodes_per_group / 16;
+ min_inodes = avefreei - inodes_per_group*flex_size / 4;
+ if (min_inodes < 1)
+ min_inodes = 1;
+ min_clusters = avefreec - EXT4_CLUSTERS_PER_GROUP(sb)*flex_size / 4;
+
+ /*
+ * Start looking in the flex group where we last allocated an
+ * inode for this parent directory
+ */
+ if (EXT4_I(parent)->i_last_alloc_group != ~0) {
+ parent_group = EXT4_I(parent)->i_last_alloc_group;
+ if (flex_size > 1)
+ parent_group >>= sbi->s_log_groups_per_flex;
+ }
+
+ /* Take the first (flex) group, cyclically, that passes all thresholds. */
+ for (i = 0; i < ngroups; i++) {
+ grp = (parent_group + i) % ngroups;
+ get_orlov_stats(sb, grp, flex_size, &stats);
+ if (stats.used_dirs >= max_dirs)
+ continue;
+ if (stats.free_inodes < min_inodes)
+ continue;
+ if (stats.free_clusters < min_clusters)
+ continue;
+ goto found_flex_bg;
+ }
+
+ /* Fallback works on real block groups again, not flex groups. */
+fallback:
+ ngroups = real_ngroups;
+ avefreei = freei / ngroups;
+fallback_retry:
+ parent_group = EXT4_I(parent)->i_block_group;
+ for (i = 0; i < ngroups; i++) {
+ grp = (parent_group + i) % ngroups;
+ desc = ext4_get_group_desc(sb, grp, NULL);
+ if (desc) {
+ grp_free = ext4_free_inodes_count(sb, desc);
+ if (grp_free && grp_free >= avefreei) {
+ *group = grp;
+ return 0;
+ }
+ }
+ }
+
+ if (avefreei) {
+ /*
+ * The free-inodes counter is approximate, and for really small
+ * filesystems the above test can fail to find any blockgroups
+ */
+ avefreei = 0;
+ goto fallback_retry;
+ }
+
+ return -1;
+}
+
+/*
+ * Pick a block group for a new non-directory inode under @parent and store
+ * it in @group. Returns 0 when a group was chosen and -1 when none was
+ * found; with flex_bg the result may come from find_group_orlov().
+ */
+static int find_group_other(struct super_block *sb, struct inode *parent,
+ ext4_group_t *group, umode_t mode)
+{
+ ext4_group_t parent_group = EXT4_I(parent)->i_block_group;
+ ext4_group_t i, last, ngroups = ext4_get_groups_count(sb);
+ struct ext4_group_desc *desc;
+ int flex_size = ext4_flex_bg_size(EXT4_SB(sb));
+
+ /*
+ * Try to place the inode in the same flex group as its
+ * parent. If we can't find space, use the Orlov algorithm to
+ * find another flex group, and store that information in the
+ * parent directory's inode information so that we use that flex
+ * group for future allocations.
+ */
+ if (flex_size > 1) {
+ int retry = 0;
+
+ try_again:
+ /* Round down to the first group of the flex group. */
+ parent_group &= ~(flex_size-1);
+ last = parent_group + flex_size;
+ if (last > ngroups)
+ last = ngroups;
+ for (i = parent_group; i < last; i++) {
+ desc = ext4_get_group_desc(sb, i, NULL);
+ if (desc && ext4_free_inodes_count(sb, desc)) {
+ *group = i;
+ return 0;
+ }
+ }
+ /* One retry, using the flex group of the last allocation. */
+ if (!retry && EXT4_I(parent)->i_last_alloc_group != ~0) {
+ retry = 1;
+ parent_group = EXT4_I(parent)->i_last_alloc_group;
+ goto try_again;
+ }
+ /*
+ * If this didn't work, use the Orlov search algorithm
+ * to find a new flex group; we pass in the mode to
+ * avoid the topdir algorithms.
+ */
+ /*
+ * NOTE(review): *group == ngroups is not reset by the check
+ * below; find_group_orlov() as visible in this file only
+ * writes *group, so the value appears unused - confirm.
+ */
+ *group = parent_group + flex_size;
+ if (*group > ngroups)
+ *group = 0;
+ return find_group_orlov(sb, parent, group, mode, NULL);
+ }
+
+ /*
+ * Try to place the inode in its parent directory
+ */
+ *group = parent_group;
+ desc = ext4_get_group_desc(sb, *group, NULL);
+ if (desc && ext4_free_inodes_count(sb, desc) &&
+ ext4_free_group_clusters(sb, desc))
+ return 0;
+
+ /*
+ * We're going to place this inode in a different blockgroup from its
+ * parent. We want to cause files in a common directory to all land in
+ * the same blockgroup. But we want files which are in a different
+ * directory which shares a blockgroup with our parent to land in a
+ * different blockgroup.
+ *
+ * So add our directory's i_ino into the starting point for the hash.
+ */
+ *group = (*group + parent->i_ino) % ngroups;
+
+ /*
+ * Use a quadratic hash to find a group with a free inode and some free
+ * blocks.
+ */
+ for (i = 1; i < ngroups; i <<= 1) {
+ *group += i;
+ if (*group >= ngroups)
+ *group -= ngroups;
+ desc = ext4_get_group_desc(sb, *group, NULL);
+ if (desc && ext4_free_inodes_count(sb, desc) &&
+ ext4_free_group_clusters(sb, desc))
+ return 0;
+ }
+
+ /*
+ * That failed: try linear search for a free inode, even if that group
+ * has no free blocks.
+ */
+ *group = parent_group;
+ for (i = 0; i < ngroups; i++) {
+ if (++*group >= ngroups)
+ *group = 0;
+ desc = ext4_get_group_desc(sb, *group, NULL);
+ if (desc && ext4_free_inodes_count(sb, desc))
+ return 0;
+ }
+
+ return -1;
+}
+
+/*
+ * ext4_new_inode - allocate an inode number and set up the in-core inode.
+ * @handle: journal handle the allocation is performed under
+ * @dir:    parent directory; must be non-NULL and still linked
+ * @mode:   file mode of the new inode (S_ISDIR selects the Orlov policy)
+ * @qstr:   name being created, passed on to the Orlov allocator and to
+ *          ext4_init_security()
+ * @goal:   preferred inode number; 0 falls back to sbi->s_inode_goal.  A
+ *          goal larger than s_inodes_count is ignored.
+ * @owner:  optional {uid, gid} pair; when NULL ownership is derived from
+ *          the GRPID mount option or inode_init_owner()
+ *
+ * There are two policies for allocating an inode.  If the new inode is
+ * a directory, then a forward search is made for a block group with both
+ * free space and a low directory-to-inode ratio; if that fails, then of
+ * the groups with above-average free space, that group with the fewest
+ * directories already is chosen.
+ *
+ * For other inodes, search forward from the parent directory's block
+ * group to find a free inode.
+ *
+ * Returns the new, locked inode on success or an ERR_PTR():
+ *   -EPERM  @dir is NULL or has been unlinked
+ *   -ENOMEM new_inode() failed
+ *   -ENOSPC no group with a free inode could be found
+ *   -EIO    a group descriptor or bitmap could not be read, or the
+ *           inode number turned out to be in use already
+ * or whatever error the journal, quota, ACL or security calls reported.
+ */
+struct inode *ext4_new_inode(handle_t *handle, struct inode *dir, umode_t mode,
+			     const struct qstr *qstr, __u32 goal, uid_t *owner)
+{
+	struct super_block *sb;
+	struct buffer_head *inode_bitmap_bh = NULL;
+	struct buffer_head *group_desc_bh;
+	ext4_group_t ngroups, group = 0;
+	unsigned long ino = 0;
+	struct inode *inode;
+	struct ext4_group_desc *gdp = NULL;
+	struct ext4_inode_info *ei;
+	struct ext4_sb_info *sbi;
+	int ret2, err = 0;
+	struct inode *ret;
+	ext4_group_t i;
+	ext4_group_t flex_group;
+
+	/* Cannot create files in a deleted directory */
+	if (!dir || !dir->i_nlink)
+		return ERR_PTR(-EPERM);
+
+	sb = dir->i_sb;
+	ngroups = ext4_get_groups_count(sb);
+	trace_ext4_request_inode(dir, mode);
+	inode = new_inode(sb);
+	if (!inode)
+		return ERR_PTR(-ENOMEM);
+	ei = EXT4_I(inode);
+	sbi = EXT4_SB(sb);
+
+	if (!goal)
+		goal = sbi->s_inode_goal;
+
+	/* A usable goal pins both the starting group and the starting bit. */
+	if (goal && goal <= le32_to_cpu(sbi->s_es->s_inodes_count)) {
+		group = (goal - 1) / EXT4_INODES_PER_GROUP(sb);
+		ino = (goal - 1) % EXT4_INODES_PER_GROUP(sb);
+		ret2 = 0;
+		goto got_group;
+	}
+
+	if (S_ISDIR(mode))
+		ret2 = find_group_orlov(sb, dir, &group, mode, qstr);
+	else
+		ret2 = find_group_other(sb, dir, &group, mode);
+
+got_group:
+	EXT4_I(dir)->i_last_alloc_group = group;
+	err = -ENOSPC;
+	if (ret2 == -1)
+		goto out;
+
+	/*
+	 * Normally we will only go through one pass of this loop,
+	 * unless we get unlucky and it turns out the group we selected
+	 * had its last inode grabbed by someone else.
+	 */
+	for (i = 0; i < ngroups; i++, ino = 0) {
+		err = -EIO;
+
+		gdp = ext4_get_group_desc(sb, group, &group_desc_bh);
+		if (!gdp)
+			goto fail;
+
+		brelse(inode_bitmap_bh);
+		inode_bitmap_bh = ext4_read_inode_bitmap(sb, group);
+		if (!inode_bitmap_bh)
+			goto fail;
+
+repeat_in_this_group:
+		ino = ext4_find_next_zero_bit((unsigned long *)
+					      inode_bitmap_bh->b_data,
+					      EXT4_INODES_PER_GROUP(sb), ino);
+		if (ino >= EXT4_INODES_PER_GROUP(sb)) {
+			/* Group is full: move on to the next one, wrapping. */
+			if (++group == ngroups)
+				group = 0;
+			continue;
+		}
+		if (group == 0 && (ino+1) < EXT4_FIRST_INO(sb)) {
+			ext4_error(sb, "reserved inode found cleared - "
+				   "inode=%lu", ino + 1);
+			continue;
+		}
+		ext4_lock_group(sb, group);
+		ret2 = ext4_test_and_set_bit(ino, inode_bitmap_bh->b_data);
+		ext4_unlock_group(sb, group);
+		ino++;		/* the inode bitmap is zero-based */
+		if (!ret2)
+			goto got; /* we grabbed the inode! */
+		/* Lost the race for this bit; keep scanning the same group. */
+		if (ino < EXT4_INODES_PER_GROUP(sb))
+			goto repeat_in_this_group;
+	}
+	err = -ENOSPC;
+	goto out;
+
+got:
+	/* We may have to initialize the block bitmap if it isn't already */
+	if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_GDT_CSUM) &&
+	    gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) {
+		struct buffer_head *block_bitmap_bh;
+
+		block_bitmap_bh = ext4_read_block_bitmap(sb, group);
+		/*
+		 * The bitmap read can fail; do not hand a NULL buffer to the
+		 * journalling helpers below.  Treated like the inode bitmap
+		 * read failure above.
+		 */
+		if (!block_bitmap_bh) {
+			err = -EIO;
+			goto fail;
+		}
+		BUFFER_TRACE(block_bitmap_bh, "get block bitmap access");
+		err = ext4_journal_get_write_access(handle, block_bitmap_bh);
+		if (err) {
+			brelse(block_bitmap_bh);
+			goto fail;
+		}
+
+		BUFFER_TRACE(block_bitmap_bh, "dirty block bitmap");
+		err = ext4_handle_dirty_metadata(handle, NULL, block_bitmap_bh);
+		brelse(block_bitmap_bh);
+
+		/* recheck and clear flag under lock if we still need to */
+		ext4_lock_group(sb, group);
+		if (gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) {
+			gdp->bg_flags &= cpu_to_le16(~EXT4_BG_BLOCK_UNINIT);
+			ext4_free_group_clusters_set(sb, gdp,
+				ext4_free_clusters_after_init(sb, group, gdp));
+			gdp->bg_checksum = ext4_group_desc_csum(sbi, group,
+								gdp);
+		}
+		ext4_unlock_group(sb, group);
+
+		/* err still holds the result of the dirty_metadata call. */
+		if (err)
+			goto fail;
+	}
+
+	BUFFER_TRACE(inode_bitmap_bh, "get_write_access");
+	err = ext4_journal_get_write_access(handle, inode_bitmap_bh);
+	if (err)
+		goto fail;
+
+	BUFFER_TRACE(group_desc_bh, "get_write_access");
+	err = ext4_journal_get_write_access(handle, group_desc_bh);
+	if (err)
+		goto fail;
+
+	/*
+	 * Update the relevant bg descriptor fields.  Both branches leave the
+	 * group locked; it is released after the checksum update below.
+	 */
+	if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_GDT_CSUM)) {
+		int free;
+		struct ext4_group_info *grp = ext4_get_group_info(sb, group);
+
+		down_read(&grp->alloc_sem); /* protect vs itable lazyinit */
+		ext4_lock_group(sb, group); /* while we modify the bg desc */
+		free = EXT4_INODES_PER_GROUP(sb) -
+			ext4_itable_unused_count(sb, gdp);
+		if (gdp->bg_flags & cpu_to_le16(EXT4_BG_INODE_UNINIT)) {
+			gdp->bg_flags &= cpu_to_le16(~EXT4_BG_INODE_UNINIT);
+			free = 0;
+		}
+		/*
+		 * Check the relative inode number against the last used
+		 * relative inode number in this group. if it is greater
+		 * we need to update the bg_itable_unused count
+		 */
+		if (ino > free)
+			ext4_itable_unused_set(sb, gdp,
+					(EXT4_INODES_PER_GROUP(sb) - ino));
+		up_read(&grp->alloc_sem);
+	} else {
+		ext4_lock_group(sb, group);
+	}
+
+	ext4_free_inodes_set(sb, gdp, ext4_free_inodes_count(sb, gdp) - 1);
+	if (S_ISDIR(mode)) {
+		ext4_used_dirs_set(sb, gdp, ext4_used_dirs_count(sb, gdp) + 1);
+		if (sbi->s_log_groups_per_flex) {
+			ext4_group_t f = ext4_flex_group(sbi, group);
+
+			atomic_inc(&sbi->s_flex_groups[f].used_dirs);
+		}
+	}
+	if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_GDT_CSUM)) {
+		gdp->bg_checksum = ext4_group_desc_csum(sbi, group, gdp);
+	}
+	ext4_unlock_group(sb, group);
+
+	BUFFER_TRACE(inode_bitmap_bh, "call ext4_handle_dirty_metadata");
+	err = ext4_handle_dirty_metadata(handle, NULL, inode_bitmap_bh);
+	if (err)
+		goto fail;
+
+	BUFFER_TRACE(group_desc_bh, "call ext4_handle_dirty_metadata");
+	err = ext4_handle_dirty_metadata(handle, NULL, group_desc_bh);
+	if (err)
+		goto fail;
+
+	/* Filesystem-wide and per-flex-group accounting. */
+	percpu_counter_dec(&sbi->s_freeinodes_counter);
+	if (S_ISDIR(mode))
+		percpu_counter_inc(&sbi->s_dirs_counter);
+	ext4_mark_super_dirty(sb);
+
+	if (sbi->s_log_groups_per_flex) {
+		flex_group = ext4_flex_group(sbi, group);
+		atomic_dec(&sbi->s_flex_groups[flex_group].free_inodes);
+	}
+	if (owner) {
+		inode->i_mode = mode;
+		inode->i_uid = owner[0];
+		inode->i_gid = owner[1];
+	} else if (test_opt(sb, GRPID)) {
+		inode->i_mode = mode;
+		inode->i_uid = current_fsuid();
+		inode->i_gid = dir->i_gid;
+	} else
+		inode_init_owner(inode, dir, mode);
+
+	/* ino is 1-based within the group at this point. */
+	inode->i_ino = ino + group * EXT4_INODES_PER_GROUP(sb);
+	/* A freshly allocated inode owns no blocks yet. */
+	inode->i_blocks = 0;
+	inode->i_mtime = inode->i_atime = inode->i_ctime = ei->i_crtime =
+						       ext4_current_time(inode);
+
+	memset(ei->i_data, 0, sizeof(ei->i_data));
+	ei->i_dir_start_lookup = 0;
+	ei->i_disksize = 0;
+
+	/* Don't inherit extent flag from directory, amongst others. */
+	ei->i_flags =
+		ext4_mask_flags(mode, EXT4_I(dir)->i_flags & EXT4_FL_INHERITED);
+	ei->i_file_acl = 0;
+	ei->i_dtime = 0;
+	ei->i_block_group = group;
+	ei->i_last_alloc_group = ~0;
+
+	ext4_set_inode_flags(inode);
+	if (IS_DIRSYNC(inode))
+		ext4_handle_sync(handle);
+	if (insert_inode_locked(inode) < 0) {
+		/*
+		 * Likely a bitmap corruption causing inode to be allocated
+		 * twice.
+		 */
+		err = -EIO;
+		goto fail;
+	}
+	spin_lock(&sbi->s_next_gen_lock);
+	inode->i_generation = sbi->s_next_generation++;
+	spin_unlock(&sbi->s_next_gen_lock);
+
+	ext4_clear_state_flags(ei); /* Only relevant on 32-bit archs */
+	ext4_set_inode_state(inode, EXT4_STATE_NEW);
+
+	ei->i_extra_isize = EXT4_SB(sb)->s_want_extra_isize;
+
+	ret = inode;
+	dquot_initialize(inode);
+	err = dquot_alloc_inode(inode);
+	if (err)
+		goto fail_drop;
+
+	err = ext4_init_acl(handle, inode, dir);
+	if (err)
+		goto fail_free_drop;
+
+	err = ext4_init_security(handle, inode, dir, qstr);
+	if (err)
+		goto fail_free_drop;
+
+	if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_EXTENTS)) {
+		/* set extent flag only for directory, file and normal symlink*/
+		if (S_ISDIR(mode) || S_ISREG(mode) || S_ISLNK(mode)) {
+			ext4_set_inode_flag(inode, EXT4_INODE_EXTENTS);
+			ext4_ext_tree_init(handle, inode);
+		}
+	}
+
+	if (ext4_handle_valid(handle)) {
+		ei->i_sync_tid = handle->h_transaction->t_tid;
+		ei->i_datasync_tid = handle->h_transaction->t_tid;
+	}
+
+	err = ext4_mark_inode_dirty(handle, inode);
+	if (err) {
+		ext4_std_error(sb, err);
+		goto fail_free_drop;
+	}
+
+	ext4_debug("allocating inode %lu\n", inode->i_ino);
+	trace_ext4_allocate_inode(inode, dir, mode);
+	goto really_out;
+fail:
+	ext4_std_error(sb, err);
+out:
+	iput(inode);
+	ret = ERR_PTR(err);
+really_out:
+	brelse(inode_bitmap_bh);
+	return ret;
+
+fail_free_drop:
+	dquot_free_inode(inode);
+
+fail_drop:
+	/* The inode was hashed and locked; undo that before dropping it. */
+	dquot_drop(inode);
+	inode->i_flags |= S_NOQUOTA;
+	clear_nlink(inode);
+	unlock_new_inode(inode);
+	iput(inode);
+	brelse(inode_bitmap_bh);
+	return ERR_PTR(err);
+}
+
+/*
+ * ext4_orphan_get - load an inode named on the on-disk orphan list.
+ * @sb:  superblock of the filesystem being processed
+ * @ino: inode number taken from the orphan list
+ *
+ * Verify that we are loading a valid orphan from disk: the number must lie
+ * between the first non-reserved inode and s_inodes_count, its bit must be
+ * set in the inode bitmap, a linked inode must be truncatable, and its
+ * next-orphan link must itself be in range.
+ *
+ * Returns the inode (with a reference held) or an ERR_PTR(); the error is
+ * -EIO unless ext4_iget() reported something more specific.
+ */
+struct inode *ext4_orphan_get(struct super_block *sb, unsigned long ino)
+{
+	unsigned long max_ino = le32_to_cpu(EXT4_SB(sb)->s_es->s_inodes_count);
+	ext4_group_t block_group;
+	int bit;
+	struct buffer_head *bitmap_bh;
+	struct inode *inode = NULL;
+	long err = -EIO;
+
+	/*
+	 * Error cases - e2fsck has already cleaned up for us.
+	 *
+	 * Reject numbers below the first regular inode as well: ino == 0
+	 * would make (ino - 1) wrap around below, and reserved inodes never
+	 * belong on the orphan list.
+	 */
+	if (ino < EXT4_FIRST_INO(sb) || ino > max_ino) {
+		ext4_warning(sb, "bad orphan ino %lu! e2fsck was run?", ino);
+		goto error;
+	}
+
+	block_group = (ino - 1) / EXT4_INODES_PER_GROUP(sb);
+	bit = (ino - 1) % EXT4_INODES_PER_GROUP(sb);
+	bitmap_bh = ext4_read_inode_bitmap(sb, block_group);
+	if (!bitmap_bh) {
+		ext4_warning(sb, "inode bitmap error for orphan %lu", ino);
+		goto error;
+	}
+
+	/* Having the inode bit set should be a 100% indicator that this
+	 * is a valid orphan (no e2fsck run on fs).  Orphans also include
+	 * inodes that were being truncated, so we can't check i_nlink==0.
+	 */
+	if (!ext4_test_bit(bit, bitmap_bh->b_data))
+		goto bad_orphan;
+
+	inode = ext4_iget(sb, ino);
+	if (IS_ERR(inode))
+		goto iget_failed;
+
+	/*
+	 * If the orphans has i_nlinks > 0 then it should be able to be
+	 * truncated, otherwise it won't be removed from the orphan list
+	 * during processing and an infinite loop will result.
+	 */
+	if (inode->i_nlink && !ext4_can_truncate(inode))
+		goto bad_orphan;
+
+	if (NEXT_ORPHAN(inode) > max_ino)
+		goto bad_orphan;
+	brelse(bitmap_bh);
+	return inode;
+
+iget_failed:
+	err = PTR_ERR(inode);
+	inode = NULL;	/* so the diagnostics below do not touch an ERR_PTR */
+bad_orphan:
+	ext4_warning(sb, "bad orphan inode %lu! e2fsck was run?", ino);
+	printk(KERN_NOTICE "ext4_test_bit(bit=%d, block=%llu) = %d\n",
+	       bit, (unsigned long long)bitmap_bh->b_blocknr,
+	       ext4_test_bit(bit, bitmap_bh->b_data));
+	printk(KERN_NOTICE "inode=%p\n", inode);
+	if (inode) {
+		printk(KERN_NOTICE "is_bad_inode(inode)=%d\n",
+		       is_bad_inode(inode));
+		printk(KERN_NOTICE "NEXT_ORPHAN(inode)=%u\n",
+		       NEXT_ORPHAN(inode));
+		printk(KERN_NOTICE "max_ino=%lu\n", max_ino);
+		printk(KERN_NOTICE "i_nlink=%u\n", inode->i_nlink);
+		/* Avoid freeing blocks if we got a bad deleted inode */
+		if (inode->i_nlink == 0)
+			inode->i_blocks = 0;
+		iput(inode);
+	}
+	brelse(bitmap_bh);
+error:
+	return ERR_PTR(err);
+}
+
+/*
+ * Sum the free-inode counts recorded in every readable group descriptor.
+ * With EXT4FS_DEBUG the inode bitmaps are counted as well and both totals
+ * are logged next to the superblock value; the descriptor total is what
+ * gets returned in either configuration.
+ */
+unsigned long ext4_count_free_inodes(struct super_block *sb)
+{
+	unsigned long free_total = 0;
+	struct ext4_group_desc *desc;
+	ext4_group_t grp, ngroups = ext4_get_groups_count(sb);
+#ifdef EXT4FS_DEBUG
+	struct ext4_super_block *es = EXT4_SB(sb)->s_es;
+	unsigned long bitmap_total = 0, counted;
+	struct buffer_head *bitmap_bh = NULL;
+
+	for (grp = 0; grp < ngroups; grp++) {
+		desc = ext4_get_group_desc(sb, grp, NULL);
+		if (!desc)
+			continue;
+		free_total += ext4_free_inodes_count(sb, desc);
+
+		/* Only one bitmap buffer is held at a time. */
+		brelse(bitmap_bh);
+		bitmap_bh = ext4_read_inode_bitmap(sb, grp);
+		if (bitmap_bh) {
+			counted = ext4_count_free(bitmap_bh,
+					EXT4_INODES_PER_GROUP(sb) / 8);
+			printk(KERN_DEBUG "group %lu: stored = %d, counted = %lu\n",
+			       (unsigned long) grp,
+			       ext4_free_inodes_count(sb, desc), counted);
+			bitmap_total += counted;
+		}
+	}
+	brelse(bitmap_bh);
+	printk(KERN_DEBUG "ext4_count_free_inodes: "
+	       "stored = %u, computed = %lu, %lu\n",
+	       le32_to_cpu(es->s_free_inodes_count), free_total, bitmap_total);
+#else
+	for (grp = 0; grp < ngroups; grp++) {
+		desc = ext4_get_group_desc(sb, grp, NULL);
+		if (desc) {
+			free_total += ext4_free_inodes_count(sb, desc);
+			cond_resched();
+		}
+	}
+#endif
+	return free_total;
+}
+
+/*
+ * Total number of directories according to the group descriptors; groups
+ * whose descriptor cannot be obtained are skipped.
+ * Called at mount-time, super-block is locked.
+ */
+unsigned long ext4_count_dirs(struct super_block * sb)
+{
+	ext4_group_t ngroups = ext4_get_groups_count(sb);
+	unsigned long total = 0;
+	ext4_group_t grp;
+
+	for (grp = 0; grp < ngroups; grp++) {
+		struct ext4_group_desc *desc;
+
+		desc = ext4_get_group_desc(sb, grp, NULL);
+		if (desc)
+			total += ext4_used_dirs_count(sb, desc);
+	}
+	return total;
+}
+
+/*
+ * ext4_init_inode_table - zero the not-yet-used part of a group's inode table.
+ * @sb:      superblock
+ * @group:   block group whose inode table is to be zeroed
+ * @barrier: if non-zero, issue a block-device flush after the zeroout
+ *
+ * Zeroes not yet zeroed inode table - just write zeroes through the whole
+ * inode table. Must be called without any spinlock held. The only place
+ * where it is called from on active part of filesystem is ext4lazyinit
+ * thread, so we do not need any special locks, however we have to prevent
+ * inode allocation from the current group, so we take alloc_sem lock, to
+ * block ext4_new_inode() until we are finished.
+ *
+ * Returns 0 when the table was zeroed, was already marked zeroed, or the
+ * group descriptor could not be obtained; 1 when the filesystem is read-only
+ * or the descriptor's unused-inode count is inconsistent; otherwise the
+ * error reported by the journal or by sb_issue_zeroout().
+ */
+int ext4_init_inode_table(struct super_block *sb, ext4_group_t group,
+ int barrier)
+{
+ struct ext4_group_info *grp = ext4_get_group_info(sb, group);
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
+ struct ext4_group_desc *gdp = NULL;
+ struct buffer_head *group_desc_bh;
+ handle_t *handle;
+ ext4_fsblk_t blk;
+ int num, ret = 0, used_blks = 0;
+
+ /* This should not happen, but just to be sure check this */
+ if (sb->s_flags & MS_RDONLY) {
+ ret = 1;
+ goto out;
+ }
+
+ /* ret is still 0 here, so a missing descriptor is reported as success. */
+ gdp = ext4_get_group_desc(sb, group, &group_desc_bh);
+ if (!gdp)
+ goto out;
+
+ /*
+ * We do not need to lock this, because we are the only one
+ * handling this flag.
+ */
+ if (gdp->bg_flags & cpu_to_le16(EXT4_BG_INODE_ZEROED))
+ goto out;
+
+ handle = ext4_journal_start_sb(sb, 1);
+ if (IS_ERR(handle)) {
+ ret = PTR_ERR(handle);
+ goto out;
+ }
+
+ /* Held for writing until err_out; ext4_new_inode() takes it for reading. */
+ down_write(&grp->alloc_sem);
+ /*
+ * If inode bitmap was already initialized there may be some
+ * used inodes so we need to skip blocks with used inodes in
+ * inode table.
+ */
+ if (!(gdp->bg_flags & cpu_to_le16(EXT4_BG_INODE_UNINIT)))
+ used_blks = DIV_ROUND_UP((EXT4_INODES_PER_GROUP(sb) -
+ ext4_itable_unused_count(sb, gdp)),
+ sbi->s_inodes_per_block);
+
+ /* Sanity check: the in-use part must fit inside the inode table. */
+ if ((used_blks < 0) || (used_blks > sbi->s_itb_per_group)) {
+ ext4_error(sb, "Something is wrong with group %u: "
+ "used itable blocks: %d; "
+ "itable unused count: %u",
+ group, used_blks,
+ ext4_itable_unused_count(sb, gdp));
+ ret = 1;
+ goto err_out;
+ }
+
+ /* First block to zero and number of blocks left after the used ones. */
+ blk = ext4_inode_table(sb, gdp) + used_blks;
+ num = sbi->s_itb_per_group - used_blks;
+
+ BUFFER_TRACE(group_desc_bh, "get_write_access");
+ ret = ext4_journal_get_write_access(handle,
+ group_desc_bh);
+ if (ret)
+ goto err_out;
+
+ /*
+ * Skip zeroout if the inode table is full. But we set the ZEROED
+ * flag anyway, because obviously, when it is full it does not need
+ * further zeroing.
+ */
+ if (unlikely(num == 0))
+ goto skip_zeroout;
+
+ ext4_debug("going to zero out inode table in group %d\n",
+ group);
+ ret = sb_issue_zeroout(sb, blk, num, GFP_NOFS);
+ if (ret < 0)
+ goto err_out;
+ /* The result of the flush is not checked. */
+ if (barrier)
+ blkdev_issue_flush(sb->s_bdev, GFP_NOFS, NULL);
+
+skip_zeroout:
+ /* Flag and checksum are updated together under the group lock. */
+ ext4_lock_group(sb, group);
+ gdp->bg_flags |= cpu_to_le16(EXT4_BG_INODE_ZEROED);
+ gdp->bg_checksum = ext4_group_desc_csum(sbi, group, gdp);
+ ext4_unlock_group(sb, group);
+
+ BUFFER_TRACE(group_desc_bh,
+ "call ext4_handle_dirty_metadata");
+ ret = ext4_handle_dirty_metadata(handle, NULL,
+ group_desc_bh);
+
+err_out:
+ up_write(&grp->alloc_sem);
+ ext4_journal_stop(handle);
+out:
+ return ret;
+}
diff --git a/fs/ext4/indirect.c b/fs/ext4/indirect.c
new file mode 100644
index 00000000..830e1b2b
--- /dev/null
+++ b/fs/ext4/indirect.c
@@ -0,0 +1,1502 @@
+/*
+ * linux/fs/ext4/indirect.c
+ *
+ * from
+ *
+ * linux/fs/ext4/inode.c
+ *
+ * Copyright (C) 1992, 1993, 1994, 1995
+ * Remy Card (card@masi.ibp.fr)
+ * Laboratoire MASI - Institut Blaise Pascal
+ * Universite Pierre et Marie Curie (Paris VI)
+ *
+ * from
+ *
+ * linux/fs/minix/inode.c
+ *
+ * Copyright (C) 1991, 1992 Linus Torvalds
+ *
+ * Goal-directed block allocation by Stephen Tweedie
+ * (sct@redhat.com), 1993, 1998
+ */
+
+#include "ext4_jbd2.h"
+#include "truncate.h"
+
+#include <trace/events/ext4.h>
+
+/*
+ * One step on the path from the inode to a data block: where a block
+ * pointer lives, the value read from it, and the buffer that hosts it.
+ */
+typedef struct {
+ __le32 *p; /* address the block number was read from */
+ __le32 key; /* the block number as stored there (little-endian) */
+ struct buffer_head *bh; /* buffer containing *p; NULL when p points into the inode */
+} Indirect;
+
+/*
+ * Fill in one chain entry: remember the slot @v, snapshot the block number
+ * currently stored in it, and record the buffer @bh that hosts the slot.
+ */
+static inline void add_chain(Indirect *p, struct buffer_head *bh, __le32 *v)
+{
+	p->p = v;
+	p->key = *v;
+	p->bh = bh;
+}
+
+/**
+ *	ext4_block_to_path - parse the block number into array of offsets
+ *	@inode: inode in question (we are only interested in its superblock)
+ *	@i_block: block number to be parsed
+ *	@offsets: array to store the offsets in
+ *	@boundary: set this non-zero if the referred-to block is likely to be
+ *	       followed (on disk) by an indirect block.
+ *
+ *	ext4 keeps the locations of a file's data in the classic UNIX
+ *	structure: a tree of pointers rooted in the inode, indirect blocks
+ *	as interior nodes and data blocks as leaves.  This function turns a
+ *	logical block number into the path through that tree.  The return
+ *	value is the path length and @offsets[n] is the index, inside the
+ *	nth node, of the pointer to the (n+1)th node.  When @i_block is
+ *	beyond what a triple-indirect tree can address, a warning is printed
+ *	and zero is returned.
+ *
+ *	No node addresses are looked up, so no IO is needed; only the
+ *	capacity of an indirect block (taken from inode->i_sb) matters.
+ */
+
+/*
+ * Portability note: the last comparison (check that we fit into triple
+ * indirect block) is spelled differently, because otherwise on an
+ * architecture with 32-bit longs and 8Kb pages we might get into trouble
+ * if our filesystem had 8Kb blocks. We might use long long, but that would
+ * kill us on x86. Oh, well, at least the sign propagation does not matter -
+ * i_block would have to be negative in the very beginning, so we would not
+ * get there at all.
+ */
+
+static int ext4_block_to_path(struct inode *inode,
+			      ext4_lblk_t i_block,
+			      ext4_lblk_t offsets[4], int *boundary)
+{
+	int per_blk = EXT4_ADDR_PER_BLOCK(inode->i_sb);
+	int per_blk_bits = EXT4_ADDR_PER_BLOCK_BITS(inode->i_sb);
+	const long nr_direct = EXT4_NDIR_BLOCKS,
+		nr_indirect = per_blk,
+		nr_double = (1 << (per_blk_bits * 2));
+	int depth = 0;
+	int span = 0;
+
+	/* Each stage strips the range covered by the previous tree level. */
+	if (i_block < nr_direct) {
+		offsets[depth++] = i_block;
+		span = nr_direct;
+		goto done;
+	}
+
+	i_block -= nr_direct;
+	if (i_block < nr_indirect) {
+		offsets[depth++] = EXT4_IND_BLOCK;
+		offsets[depth++] = i_block;
+		span = per_blk;
+		goto done;
+	}
+
+	i_block -= nr_indirect;
+	if (i_block < nr_double) {
+		offsets[depth++] = EXT4_DIND_BLOCK;
+		offsets[depth++] = i_block >> per_blk_bits;
+		offsets[depth++] = i_block & (per_blk - 1);
+		span = per_blk;
+		goto done;
+	}
+
+	i_block -= nr_double;
+	if ((i_block >> (per_blk_bits * 2)) < per_blk) {
+		offsets[depth++] = EXT4_TIND_BLOCK;
+		offsets[depth++] = i_block >> (per_blk_bits * 2);
+		offsets[depth++] = (i_block >> per_blk_bits) & (per_blk - 1);
+		offsets[depth++] = i_block & (per_blk - 1);
+		span = per_blk;
+		goto done;
+	}
+
+	/* Out of range: depth stays 0, which is what the caller sees. */
+	ext4_warning(inode->i_sb, "block %lu > max in inode %lu",
+		     i_block + nr_direct +
+		     nr_indirect + nr_double, inode->i_ino);
+done:
+	if (boundary)
+		*boundary = span - 1 - (i_block & (per_blk - 1));
+	return depth;
+}
+
+/**
+ *	ext4_get_branch - read the chain of indirect blocks leading to data
+ *	@inode: inode in question
+ *	@depth: depth of the chain (1 - direct pointer, etc.)
+ *	@offsets: offsets of pointers in inode/indirect blocks
+ *	@chain: place to store the result
+ *	@err: here we store the error value
+ *
+ *	Fills @chain with <key, p, bh> triples.  Returns %NULL when the
+ *	whole chain down to the data block was found, otherwise a pointer
+ *	to the last triple that was filled in (the incomplete one).
+ *
+ *	On return chain[i].key holds the number of the (i+1)-th block of
+ *	the chain exactly as stored (little-endian 32-bit), chain[i].p the
+ *	address that number was read from (inside the inode for i == 0,
+ *	inside bh->b_data for i > 0), and chain[i].bh the buffer_head of
+ *	the i-th indirect block (%NULL for i == 0).  Together they record
+ *	the block numbers, where they came from (so the chain can later be
+ *	re-verified), and the buffers that host them.
+ *
+ *	The walk stops
+ *	  - at a zero pointer (absent block): last triple returned,
+ *	    *@err == 0;
+ *	  - when an indirect block cannot be obtained, read or validated:
+ *	    last triple returned, *@err == -EIO;
+ *	  - after all @depth-1 indirect blocks were read and the chain is
+ *	    complete: %NULL returned, *@err == 0.
+ *
+ *      Need to be called with
+ *      down_read(&EXT4_I(inode)->i_data_sem)
+ */
+static Indirect *ext4_get_branch(struct inode *inode, int depth,
+				 ext4_lblk_t *offsets,
+				 Indirect chain[4], int *err)
+{
+	struct super_block *sb = inode->i_sb;
+	Indirect *cur = chain;
+	struct buffer_head *ind_bh;
+
+	*err = 0;
+	/* i_data is not going away, no lock needed */
+	add_chain(chain, NULL, EXT4_I(inode)->i_data + *offsets);
+	if (!cur->key)
+		return cur;
+
+	while (--depth) {
+		ind_bh = sb_getblk(sb, le32_to_cpu(cur->key));
+		if (unlikely(!ind_bh))
+			goto io_error;
+
+		/*
+		 * Only a block that actually had to be read from disk is
+		 * validated; one already up to date is used as is.
+		 */
+		if (!bh_uptodate_or_lock(ind_bh)) {
+			if (bh_submit_read(ind_bh) < 0 ||
+			    ext4_check_indirect_blockref(inode, ind_bh)) {
+				put_bh(ind_bh);
+				goto io_error;
+			}
+		}
+
+		cur++;
+		offsets++;
+		add_chain(cur, ind_bh, (__le32 *)ind_bh->b_data + *offsets);
+		/* Reader: end */
+		if (!cur->key)
+			return cur;
+	}
+	return NULL;
+
+io_error:
+	*err = -EIO;
+	return cur;
+}
+
+/**
+ *	ext4_find_near - find a place for allocation with sufficient locality
+ *	@inode: owner
+ *	@ind: descriptor of indirect block.
+ *
+ *	Returns the preferred place for block allocation; used when the
+ *	sequential-allocation heuristic does not apply.  In order of
+ *	preference:
+ *	  + a block is mapped to the left of our position - allocate near it;
+ *	  + the pointer will live in an indirect block - allocate near that
+ *	    block;
+ *	  + the pointer will live in the inode - use the inode's goal block
+ *	    (same cylinder group).
+ *
+ *	In the last case the starting block is coloured by the caller's PID
+ *	so that concurrent allocations for different inodes in the same
+ *	block group do not clash, while functionally related files still
+ *	end up close together on disk.
+ *
+ *	Caller must make sure that @ind is valid and will stay that way.
+ */
+static ext4_fsblk_t ext4_find_near(struct inode *inode, Indirect *ind)
+{
+	struct ext4_inode_info *ei = EXT4_I(inode);
+	__le32 *first;
+	__le32 *slot = ind->p;
+
+	/* The pointer array is either the indirect block or i_data itself. */
+	if (ind->bh)
+		first = (__le32 *) ind->bh->b_data;
+	else
+		first = ei->i_data;
+
+	/* Walk left from our slot looking for the nearest mapped block. */
+	while (slot > first) {
+		slot--;
+		if (*slot)
+			return le32_to_cpu(*slot);
+	}
+
+	/* Nothing to the left; fall back to the indirect block's location. */
+	if (ind->bh)
+		return ind->bh->b_blocknr;
+
+	/*
+	 * The pointer lives in the inode itself, so just aim for the same
+	 * cylinder group.
+	 */
+	return ext4_inode_to_goal_block(inode);
+}
+
+/**
+ *	ext4_find_goal - find a preferred place for allocation.
+ *	@inode: owner
+ *	@block:  block we want (currently not consulted)
+ *	@partial: pointer to the last triple within a chain
+ *
+ *	Returns the preferred physical block for the allocation, as chosen
+ *	by ext4_find_near().  Because this is only used for non-extent
+ *	files, the result is masked with EXT4_MAX_BLOCK_FILE_PHYS to limit
+ *	the block nr to 32 bits.
+ */
+static ext4_fsblk_t ext4_find_goal(struct inode *inode, ext4_lblk_t block,
+				   Indirect *partial)
+{
+	/*
+	 * XXX need to get goal block from mballoc's data structures
+	 */
+	ext4_fsblk_t near = ext4_find_near(inode, partial);
+
+	return near & EXT4_MAX_BLOCK_FILE_PHYS;
+}
+
+/**
+ *	ext4_blks_to_allocate - Look up the block map and count the number
+ *	of direct blocks need to be allocated for the given branch.
+ *
+ *	@branch: chain of indirect blocks
+ *	@k: number of blocks need for indirect blocks
+ *	@blks: number of data blocks to be mapped.
+ *	@blocks_to_boundary:  the offset in the indirect block
+ *
+ *	Returns how many direct blocks to allocate in one go: at most
+ *	@blks, never past the end of the current indirect block, and, when
+ *	the indirect blocks already exist, only as far as the following
+ *	pointers are still zero.
+ */
+static int ext4_blks_to_allocate(Indirect *branch, int k, unsigned int blks,
+				 int blocks_to_boundary)
+{
+	unsigned int nr;
+
+	/*
+	 * Simple case: some [t,d]indirect block is still missing, so no
+	 * block on that path can be allocated yet.  Cross-boundary
+	 * allocation is not handled, hence the clamp.
+	 */
+	if (k > 0)
+		return (blks < blocks_to_boundary + 1) ?
+			blks : blocks_to_boundary + 1;
+
+	/* The first block is always needed; extend over following holes. */
+	for (nr = 1; nr < blks && nr <= blocks_to_boundary; nr++) {
+		if (le32_to_cpu(branch[0].p[nr]) != 0)
+			break;
+	}
+	return nr;
+}
+
+/**
+ * ext4_alloc_blocks: multiple allocate blocks needed for a branch
+ * @handle: handle for this transaction
+ * @inode: inode which needs allocated blocks
+ * @iblock: the logical block to start allocated at
+ * @goal: preferred physical block of allocation
+ * @indirect_blks: the number of blocks need to allocate for indirect
+ * blocks
+ * @blks: number of desired blocks
+ * @new_blocks: on return it will store the new block numbers for
+ * the indirect blocks(if needed) and the first direct block,
+ * @err: on return it will store the error code
+ *
+ * This function will return the number of blocks allocated as
+ * requested by the passed-in parameters.
+ *
+ * On success *@err is 0 and the return value is the number of direct (data)
+ * blocks obtained; new_blocks[0..@indirect_blks-1] hold the indirect blocks
+ * and new_blocks[@indirect_blks] the first direct block. On failure *@err
+ * is set, 0 is returned and the blocks recorded in new_blocks[] so far are
+ * freed again.
+ */
+static int ext4_alloc_blocks(handle_t *handle, struct inode *inode,
+ ext4_lblk_t iblock, ext4_fsblk_t goal,
+ int indirect_blks, int blks,
+ ext4_fsblk_t new_blocks[4], int *err)
+{
+ struct ext4_allocation_request ar;
+ int target, i;
+ unsigned long count = 0, blk_allocated = 0;
+ int index = 0;
+ ext4_fsblk_t current_block = 0;
+ int ret = 0;
+
+ /*
+ * Here we try to allocate the requested multiple blocks at once,
+ * on a best-effort basis.
+ * To build a branch, we should allocate blocks for
+ * the indirect blocks(if not allocated yet), and at least
+ * the first direct block of this branch. That's the
+ * minimum number of blocks need to allocate(required)
+ */
+ /* first we try to allocate the indirect blocks */
+ target = indirect_blks;
+ while (target > 0) {
+ count = target;
+ /* allocating blocks for indirect blocks and direct blocks */
+ current_block = ext4_new_meta_blocks(handle, inode, goal,
+ 0, &count, err);
+ if (*err)
+ goto failed_out;
+
+ /*
+ * NOTE(review): the blocks just returned are not yet recorded in
+ * new_blocks[], so failed_out does not free them on this path.
+ * NOTE(review): the last format argument is printed with %d; confirm
+ * that matches the type of EXT4_MAX_BLOCK_FILE_PHYS.
+ */
+ if (unlikely(current_block + count > EXT4_MAX_BLOCK_FILE_PHYS)) {
+ EXT4_ERROR_INODE(inode,
+ "current_block %llu + count %lu > %d!",
+ current_block, count,
+ EXT4_MAX_BLOCK_FILE_PHYS);
+ *err = -EIO;
+ goto failed_out;
+ }
+
+ target -= count;
+ /* allocate blocks for indirect blocks */
+ while (index < indirect_blks && count) {
+ new_blocks[index++] = current_block++;
+ count--;
+ }
+ /*
+ * A non-zero count here means the allocator handed back more
+ * blocks than were asked for; the surplus is used as direct blocks.
+ */
+ if (count > 0) {
+ /*
+ * save the new block number
+ * for the first direct block
+ */
+ new_blocks[index] = current_block;
+ printk(KERN_INFO "%s returned more blocks than "
+ "requested\n", __func__);
+ WARN_ON(1);
+ break;
+ }
+ }
+
+ /* count is the surplus from above (normally 0) and already counts as data. */
+ target = blks - count ;
+ blk_allocated = count;
+ if (!target)
+ goto allocated;
+ /* Now allocate data blocks */
+ memset(&ar, 0, sizeof(ar));
+ ar.inode = inode;
+ ar.goal = goal;
+ ar.len = target;
+ ar.logical = iblock;
+ if (S_ISREG(inode->i_mode))
+ /* enable in-core preallocation only for regular files */
+ ar.flags = EXT4_MB_HINT_DATA;
+
+ current_block = ext4_mb_new_blocks(handle, &ar, err);
+ /*
+ * NOTE(review): as above, data blocks that trip this limit check are
+ * not freed by failed_out, which only walks new_blocks[0..index).
+ */
+ if (unlikely(current_block + ar.len > EXT4_MAX_BLOCK_FILE_PHYS)) {
+ EXT4_ERROR_INODE(inode,
+ "current_block %llu + ar.len %d > %d!",
+ current_block, ar.len,
+ EXT4_MAX_BLOCK_FILE_PHYS);
+ *err = -EIO;
+ goto failed_out;
+ }
+
+ if (*err && (target == blks)) {
+ /*
+ * if the allocation failed and we didn't allocate
+ * any blocks before
+ */
+ goto failed_out;
+ }
+ /*
+ * If the data allocation failed but surplus blocks exist (target != blks),
+ * execution falls through and the error is cleared at "allocated".
+ */
+ if (!*err) {
+ if (target == blks) {
+ /*
+ * save the new block number
+ * for the first direct block
+ */
+ new_blocks[index] = current_block;
+ }
+ blk_allocated += ar.len;
+ }
+allocated:
+ /* total number of blocks allocated for direct blocks */
+ ret = blk_allocated;
+ *err = 0;
+ return ret;
+failed_out:
+ /* Give back the indirect blocks recorded so far; ret is still 0. */
+ for (i = 0; i < index; i++)
+ ext4_free_blocks(handle, inode, NULL, new_blocks[i], 1, 0);
+ return ret;
+}
+
+/**
+ *	ext4_alloc_branch - allocate and set up a chain of blocks.
+ *	@handle: handle for this transaction
+ *	@inode: owner
+ *	@iblock: logical block the branch starts at
+ *	@indirect_blks: number of allocated indirect blocks
+ *	@blks: in: number of direct blocks wanted; out: number allocated
+ *	@goal: preferred place for allocation
+ *	@offsets: offsets (in the blocks) to store the pointers to next.
+ *	@branch: place to store the chain in.
+ *
+ *	This function allocates blocks, zeroes out all but the last one,
+ *	links them into chain and (if we are synchronous) writes them to disk.
+ *	In other words, it prepares a branch that can be spliced onto the
+ *	inode.  It stores the information about that chain in the branch[], in
+ *	the same format as ext4_get_branch() would do.  We are calling it after
+ *	we had read the existing part of chain and partial points to the last
+ *	triple of that (one with zero ->key).  Upon the exit we have the same
+ *	picture as after the successful ext4_get_block(), except that in one
+ *	place chain is disconnected - *branch->p is still zero (we did not
+ *	set the last link), but branch->key contains the number that should
+ *	be placed into *branch->p to fill that gap.
+ *
+ *	If allocation fails we free all blocks we've allocated and return
+ *	the error value from the failed step (normally -ENOSPC from the
+ *	allocator).  Otherwise we set the chain as described above and
+ *	return 0.
+ */
+static int ext4_alloc_branch(handle_t *handle, struct inode *inode,
+			     ext4_lblk_t iblock, int indirect_blks,
+			     int *blks, ext4_fsblk_t goal,
+			     ext4_lblk_t *offsets, Indirect *branch)
+{
+	int blocksize = inode->i_sb->s_blocksize;
+	int i, n = 0;
+	int err = 0;
+	struct buffer_head *bh;
+	int num;
+	ext4_fsblk_t new_blocks[4];
+	ext4_fsblk_t current_block;
+
+	/*
+	 * new_blocks[0..indirect_blks-1] are the indirect blocks and
+	 * new_blocks[indirect_blks] is the first of @num data blocks.
+	 */
+	num = ext4_alloc_blocks(handle, inode, iblock, goal, indirect_blks,
+				*blks, new_blocks, &err);
+	if (err)
+		return err;
+
+	branch[0].key = cpu_to_le32(new_blocks[0]);
+	/*
+	 * metadata blocks and data blocks are allocated.
+	 */
+	for (n = 1; n <= indirect_blks;  n++) {
+		/*
+		 * Get buffer_head for parent block, zero it out
+		 * and set the pointer to new one, then send
+		 * parent to disk.
+		 */
+		bh = sb_getblk(inode->i_sb, new_blocks[n-1]);
+		if (unlikely(!bh)) {
+			err = -EIO;
+			goto failed;
+		}
+
+		branch[n].bh = bh;
+		lock_buffer(bh);
+		BUFFER_TRACE(bh, "call get_create_access");
+		err = ext4_journal_get_create_access(handle, bh);
+		if (err) {
+			/* The buffer is dealt with in the failure path. */
+			unlock_buffer(bh);
+			goto failed;
+		}
+
+		memset(bh->b_data, 0, blocksize);
+		branch[n].p = (__le32 *) bh->b_data + offsets[n];
+		branch[n].key = cpu_to_le32(new_blocks[n]);
+		*branch[n].p = branch[n].key;
+		if (n == indirect_blks) {
+			current_block = new_blocks[n];
+			/*
+			 * End of chain, update the last new metablock of
+			 * the chain to point to the new allocated
+			 * data blocks numbers
+			 */
+			for (i = 1; i < num; i++)
+				*(branch[n].p + i) = cpu_to_le32(++current_block);
+		}
+		BUFFER_TRACE(bh, "marking uptodate");
+		set_buffer_uptodate(bh);
+		unlock_buffer(bh);
+
+		BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata");
+		err = ext4_handle_dirty_metadata(handle, inode, bh);
+		if (err)
+			goto failed;
+	}
+	*blks = num;
+	return err;
+failed:
+	/*
+	 * Allocation failed, free what we already allocated.
+	 *
+	 * branch[k].bh is the buffer of new_blocks[k - 1], so when we bail
+	 * out of iteration n the indirect blocks that may own a buffer are
+	 * new_blocks[0..n-1]; those are freed with FORGET.  They are newly
+	 * allocated, so there is no need to revoke them, which is why
+	 * EXT4_FREE_BLOCKS_METADATA is not set.  The remaining indirect
+	 * blocks were never touched and are simply freed.
+	 *
+	 * NOTE(review): the references taken by sb_getblk() and stored in
+	 * branch[].bh are not dropped explicitly here; confirm whether the
+	 * FORGET path or the caller releases them.
+	 */
+	for (i = 0; i < indirect_blks; i++)
+		ext4_free_blocks(handle, inode, NULL, new_blocks[i], 1,
+				 i < n ? EXT4_FREE_BLOCKS_FORGET : 0);
+
+	/* The data blocks always start at new_blocks[indirect_blks]. */
+	ext4_free_blocks(handle, inode, NULL, new_blocks[indirect_blks],
+			 num, 0);
+
+	return err;
+}
+
+/**
+ * ext4_splice_branch - splice the allocated branch onto inode.
+ * @handle: handle for this transaction
+ * @inode: owner
+ * @block: (logical) number of block we are adding
+ * @where: location of missing link; where[1..@num] describe the newly
+ *	   allocated part of the chain (see ext4_alloc_branch)
+ * @num:   number of indirect blocks we are adding
+ * @blks:  number of direct blocks we are adding
+ *
+ * Fills in the missing link and marks whatever hosts it (an indirect
+ * block or the inode) dirty.  Returns 0 with the chain to the new block
+ * complete, or the journal error after freeing the new branch again.
+ */
+static int ext4_splice_branch(handle_t *handle, struct inode *inode,
+			      ext4_lblk_t block, Indirect *where, int num,
+			      int blks)
+{
+	int idx;
+	int err = 0;
+	ext4_fsblk_t next_blk;
+
+	/*
+	 * When the link lives in a [td]indirect block rather than in the
+	 * inode, that block needs journal write access before it is
+	 * modified.
+	 */
+	if (where->bh) {
+		BUFFER_TRACE(where->bh, "get_write_access");
+		err = ext4_journal_get_write_access(handle, where->bh);
+		if (err)
+			goto undo;
+	}
+
+	/* That's it */
+	*where->p = where->key;
+
+	/*
+	 * With no new indirect block in between, the extra direct blocks
+	 * are consecutive and go into the slots right after the link.
+	 */
+	if (num == 0 && blks > 1) {
+		next_blk = le32_to_cpu(where->key) + 1;
+		for (idx = 1; idx < blks; idx++) {
+			where->p[idx] = cpu_to_le32(next_blk);
+			next_blk++;
+		}
+	}
+
+	/* We are done with atomic stuff, now do the rest of housekeeping */
+	if (!where->bh) {
+		/*
+		 * OK, we spliced it into the inode itself on a direct block.
+		 */
+		ext4_mark_inode_dirty(handle, inode);
+		jbd_debug(5, "splicing direct\n");
+		return err;
+	}
+
+	/*
+	 * Spliced onto an indirect block, so the inode itself is unchanged.
+	 * If the file is growing at its very end the inode *will* change to
+	 * reflect the new i_size, but that is not done here - it is done in
+	 * generic_commit_write->__mark_inode_dirty->ext4_dirty_inode.
+	 */
+	jbd_debug(5, "splicing indirect only\n");
+	BUFFER_TRACE(where->bh, "call ext4_handle_dirty_metadata");
+	err = ext4_handle_dirty_metadata(handle, inode, where->bh);
+	if (!err)
+		return err;
+
+undo:
+	for (idx = 1; idx <= num; idx++) {
+		/*
+		 * branch[i].bh is newly allocated, so there is no
+		 * need to revoke the block, which is why we don't
+		 * need to set EXT4_FREE_BLOCKS_METADATA.
+		 */
+		ext4_free_blocks(handle, inode, where[idx].bh, 0, 1,
+				 EXT4_FREE_BLOCKS_FORGET);
+	}
+	ext4_free_blocks(handle, inode, NULL, le32_to_cpu(where[num].key),
+			 blks, 0);
+
+	return err;
+}
+
+/*
+ * The ext4_ind_map_blocks() function handles non-extents inodes
+ * (i.e., using the traditional indirect/double-indirect i_blocks
+ * scheme) for ext4_map_blocks().
+ *
+ * Allocation strategy is simple: if we have to allocate something, we will
+ * have to go the whole way to leaf. So let's do it before attaching anything
+ * to tree, set linkage between the newborn blocks, write them if sync is
+ * required, recheck the path, free and repeat if check fails, otherwise
+ * set the last missing link (that will protect us from any truncate-generated
+ * removals - all blocks on the path are immune now) and possibly force the
+ * write on the parent block.
+ * That has a nice additional property: no special recovery from the failed
+ * allocations is needed - we simply release blocks and do not touch anything
+ * reachable from inode.
+ *
+ * `handle' can be NULL if create == 0.
+ *
+ * On success map->m_pblk, map->m_len and map->m_flags are filled in
+ * (EXT4_MAP_MAPPED always, EXT4_MAP_NEW when blocks were allocated here,
+ * EXT4_MAP_BOUNDARY when the mapping runs past blocks_to_boundary).
+ *
+ * return > 0, # of blocks mapped or allocated.
+ * return = 0, if plain lookup failed.
+ * return < 0, error case.
+ *
+ * Every exit taken after ext4_get_branch() goes through the cleanup label so
+ * that the buffer_heads held in the chain are released and the exit
+ * tracepoint is emitted.
+ *
+ * The ext4_ind_map_blocks() function should be called with
+ * down_write(&EXT4_I(inode)->i_data_sem) if allocating filesystem
+ * blocks (i.e., flags has EXT4_GET_BLOCKS_CREATE set) or
+ * down_read(&EXT4_I(inode)->i_data_sem) if not allocating file system
+ * blocks.
+ */
+int ext4_ind_map_blocks(handle_t *handle, struct inode *inode,
+			struct ext4_map_blocks *map,
+			int flags)
+{
+	int err = -EIO;
+	ext4_lblk_t offsets[4];
+	Indirect chain[4];
+	Indirect *partial;
+	ext4_fsblk_t goal;
+	int indirect_blks;
+	int blocks_to_boundary = 0;
+	int depth;
+	int count = 0;
+	ext4_fsblk_t first_block = 0;
+
+	trace_ext4_ind_map_blocks_enter(inode, map->m_lblk, map->m_len, flags);
+	J_ASSERT(!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)));
+	J_ASSERT(handle != NULL || (flags & EXT4_GET_BLOCKS_CREATE) == 0);
+	depth = ext4_block_to_path(inode, map->m_lblk, offsets,
+				   &blocks_to_boundary);
+
+	/* No path for this logical block: report -EIO */
+	if (depth == 0)
+		goto out;
+
+	partial = ext4_get_branch(inode, depth, offsets, chain, &err);
+
+	/* Simplest case - block found, no allocation needed */
+	if (!partial) {
+		first_block = le32_to_cpu(chain[depth - 1].key);
+		count++;
+		/* map more blocks as long as they are physically contiguous */
+		while (count < map->m_len && count <= blocks_to_boundary) {
+			ext4_fsblk_t blk;
+
+			blk = le32_to_cpu(*(chain[depth-1].p + count));
+
+			if (blk == first_block + count)
+				count++;
+			else
+				break;
+		}
+		goto got_it;
+	}
+
+	/* Next simple case - plain lookup or failed read of indirect block */
+	if ((flags & EXT4_GET_BLOCKS_CREATE) == 0 || err == -EIO)
+		goto cleanup;
+
+	/*
+	 * Okay, we need to do block allocation.
+	 */
+	if (EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb,
+				       EXT4_FEATURE_RO_COMPAT_BIGALLOC)) {
+		EXT4_ERROR_INODE(inode, "Can't allocate blocks for "
+				 "non-extent mapped inodes with bigalloc");
+		/*
+		 * Do not return directly: the partial chain still holds
+		 * buffer_head references that cleanup has to drop.
+		 */
+		err = -ENOSPC;
+		goto cleanup;
+	}
+
+	goal = ext4_find_goal(inode, map->m_lblk, partial);
+
+	/* the number of blocks need to allocate for [d,t]indirect blocks */
+	indirect_blks = (chain + depth) - partial - 1;
+
+	/*
+	 * Next look up the indirect map to count the total number of
+	 * direct blocks to allocate for this branch.
+	 */
+	count = ext4_blks_to_allocate(partial, indirect_blks,
+				      map->m_len, blocks_to_boundary);
+	/*
+	 * Block out ext4_truncate while we alter the tree
+	 */
+	err = ext4_alloc_branch(handle, inode, map->m_lblk, indirect_blks,
+				&count, goal,
+				offsets + (partial - chain), partial);
+
+	/*
+	 * The ext4_splice_branch call will free and forget any buffers
+	 * on the new chain if there is a failure, but that risks using
+	 * up transaction credits, especially for bitmaps where the
+	 * credits cannot be returned. Can we handle this somehow? We
+	 * may need to return -EAGAIN upwards in the worst case. --sct
+	 */
+	if (!err)
+		err = ext4_splice_branch(handle, inode, map->m_lblk,
+					 partial, indirect_blks, count);
+	if (err)
+		goto cleanup;
+
+	map->m_flags |= EXT4_MAP_NEW;
+
+	ext4_update_inode_fsync_trans(handle, inode, 1);
+got_it:
+	map->m_flags |= EXT4_MAP_MAPPED;
+	map->m_pblk = le32_to_cpu(chain[depth-1].key);
+	map->m_len = count;
+	if (count > blocks_to_boundary)
+		map->m_flags |= EXT4_MAP_BOUNDARY;
+	err = count;
+	/* Clean up and exit */
+	partial = chain + depth - 1;	/* the whole chain */
+cleanup:
+	while (partial > chain) {
+		BUFFER_TRACE(partial->bh, "call brelse");
+		brelse(partial->bh);
+		partial--;
+	}
+out:
+	trace_ext4_ind_map_blocks_exit(inode, map->m_lblk,
+				       map->m_pblk, map->m_len, err);
+	return err;
+}
+
+/*
+ * O_DIRECT for ext3 (or indirect map) based files
+ *
+ * If the O_DIRECT write will extend the file then add this inode to the
+ * orphan list. So recovery will truncate it back to the original size
+ * if the machine crashes during the write.
+ *
+ * If the O_DIRECT write is instantiating holes inside i_size and the machine
+ * crashes then stale disk data _may_ be exposed inside the file. But current
+ * VFS code falls back into buffered path in that case so we are safe.
+ *
+ * Once the I/O has been issued the orphan entry is removed again (provided
+ * the inode is still linked) and, if bytes were written past i_size, both
+ * i_size and i_disksize are moved up to the end of the written range. An
+ * -ENOSPC result is retried for as long as ext4_should_retry_alloc() agrees.
+ *
+ * Returns the result of the direct I/O helper, or a negative errno when
+ * starting a journal handle or adding the orphan entry failed. When the
+ * orphan entry was in use, a zero result is replaced by the status of the
+ * final ext4_journal_stop().
+ */
+ssize_t ext4_ind_direct_IO(int rw, struct kiocb *iocb,
+ const struct iovec *iov, loff_t offset,
+ unsigned long nr_segs)
+{
+ struct file *file = iocb->ki_filp;
+ struct inode *inode = file->f_mapping->host;
+ struct ext4_inode_info *ei = EXT4_I(inode);
+ handle_t *handle;
+ ssize_t ret;
+ int orphan = 0;
+ size_t count = iov_length(iov, nr_segs);
+ int retries = 0;
+
+ if (rw == WRITE) {
+ loff_t final_size = offset + count;
+
+ if (final_size > inode->i_size) {
+ /* Credits for sb + inode write */
+ handle = ext4_journal_start(inode, 2);
+ if (IS_ERR(handle)) {
+ ret = PTR_ERR(handle);
+ goto out;
+ }
+ ret = ext4_orphan_add(handle, inode);
+ if (ret) {
+ ext4_journal_stop(handle);
+ goto out;
+ }
+ orphan = 1;
+ ei->i_disksize = inode->i_size;
+ ext4_journal_stop(handle);
+ }
+ }
+
+retry:
+ if (rw == READ && ext4_should_dioread_nolock(inode)) {
+ if (unlikely(!list_empty(&ei->i_completed_io_list))) {
+ mutex_lock(&inode->i_mutex);
+ ext4_flush_completed_IO(inode);
+ mutex_unlock(&inode->i_mutex);
+ }
+ ret = __blockdev_direct_IO(rw, iocb, inode,
+ inode->i_sb->s_bdev, iov,
+ offset, nr_segs,
+ ext4_get_block, NULL, NULL, 0);
+ } else {
+ ret = blockdev_direct_IO(rw, iocb, inode, iov,
+ offset, nr_segs, ext4_get_block);
+
+ if (unlikely((rw & WRITE) && ret < 0)) {
+ loff_t isize = i_size_read(inode);
+ loff_t end = offset + iov_length(iov, nr_segs);
+
+ if (end > isize)
+ ext4_truncate_failed_write(inode);
+ }
+ }
+ if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries))
+ goto retry;
+
+ if (orphan) {
+ int err;
+
+ /* Credits for sb + inode write */
+ handle = ext4_journal_start(inode, 2);
+ if (IS_ERR(handle)) {
+ /* This is really bad luck. We've written the data
+ * but cannot extend i_size. Bail out and pretend
+ * the write failed... */
+ ret = PTR_ERR(handle);
+ if (inode->i_nlink)
+ ext4_orphan_del(NULL, inode);
+
+ goto out;
+ }
+ if (inode->i_nlink)
+ ext4_orphan_del(handle, inode);
+ if (ret > 0) {
+ loff_t end = offset + ret;
+ if (end > inode->i_size) {
+ ei->i_disksize = end;
+ i_size_write(inode, end);
+ /*
+ * We're going to return a positive `ret'
+ * here due to non-zero-length I/O, so there's
+ * no way of reporting error returns from
+ * ext4_mark_inode_dirty() to userspace. So
+ * ignore it.
+ */
+ ext4_mark_inode_dirty(handle, inode);
+ }
+ }
+ err = ext4_journal_stop(handle);
+ if (ret == 0)
+ ret = err;
+ }
+out:
+ return ret;
+}
+
+/*
+ * Work out how many metadata blocks have to be reserved in order to allocate
+ * a new block at logical offset @lblock of a non-extent based file.
+ *
+ * Blocks below EXT4_NDIR_BLOCKS need none. For the rest, the mask-aligned
+ * start of the range that was charged last is cached in the inode info;
+ * a call that lands in the same range only bumps the cached length and
+ * reserves nothing more.
+ */
+int ext4_ind_calc_metadata_amount(struct inode *inode, sector_t lblock)
+{
+	struct ext4_inode_info *ei = EXT4_I(inode);
+	sector_t range_mask = ~((sector_t)EXT4_ADDR_PER_BLOCK(inode->i_sb) - 1);
+	sector_t range_start;
+	int bits;
+
+	if (lblock < EXT4_NDIR_BLOCKS)
+		return 0;
+
+	/* From here on work with the offset past the direct blocks */
+	lblock -= EXT4_NDIR_BLOCKS;
+	range_start = lblock & range_mask;
+
+	if (ei->i_da_metadata_calc_len &&
+	    range_start == ei->i_da_metadata_calc_last_lblock) {
+		ei->i_da_metadata_calc_len++;
+		return 0;
+	}
+
+	/* A new range: remember it and derive the amount from the offset */
+	ei->i_da_metadata_calc_last_lblock = range_start;
+	ei->i_da_metadata_calc_len = 1;
+	bits = order_base_2(lblock);
+	return (bits / EXT4_ADDR_PER_BLOCK_BITS(inode->i_sb)) + 1;
+}
+
+/*
+ * Upper bound on the number of indirect-map blocks involved in mapping
+ * @nrblocks data blocks. @chunk is non-zero when the data blocks are
+ * contiguous.
+ */
+int ext4_ind_trans_blocks(struct inode *inode, int nrblocks, int chunk)
+{
+	if (!chunk) {
+		/*
+		 * Non-contiguous blocks: worst case, each block touches an
+		 * indirect block and each indirect block touches a double
+		 * indirect block, plus one triple indirect block.
+		 */
+		return nrblocks * 2 + 1;
+	}
+
+	/*
+	 * With N contiguous data blocks, we need at most
+	 * N/EXT4_ADDR_PER_BLOCK(inode->i_sb) + 1 indirect blocks,
+	 * 2 dindirect blocks, and 1 tindirect block.
+	 */
+	return DIV_ROUND_UP(nrblocks, EXT4_ADDR_PER_BLOCK(inode->i_sb)) + 4;
+}
+
+/*
+ * Truncate transactions can be complex and absolutely huge. So we need to
+ * be able to restart the transaction at a convenient checkpoint to make
+ * sure we don't overflow the journal.
+ *
+ * start_transaction gets us a new handle for a truncate transaction,
+ * and extend_transaction tries to extend the existing one a bit. If
+ * extend fails, we need to propagate the failure up and restart the
+ * transaction in the top-level truncate loop. --sct
+ *
+ * The handle is returned as obtained from ext4_journal_start(); when it is
+ * an error pointer the error is reported through ext4_std_error() first.
+ */
+static handle_t *start_transaction(struct inode *inode)
+{
+	handle_t *handle;
+
+	handle = ext4_journal_start(inode, ext4_blocks_for_truncate(inode));
+	if (IS_ERR(handle))
+		ext4_std_error(inode->i_sb, PTR_ERR(handle));
+
+	return handle;
+}
+
+/*
+ * Try to extend this transaction for the purposes of truncation.
+ *
+ * Returns 0 if there is (or now is) enough room: the handle is not a valid
+ * journal handle, it already has enough credits, or the extension worked.
+ * Returns 1 if no room could be created and the transaction must be
+ * restarted.
+ */
+static int try_to_extend_transaction(handle_t *handle, struct inode *inode)
+{
+	if (!ext4_handle_valid(handle) ||
+	    ext4_handle_has_enough_credits(handle, EXT4_RESERVE_TRANS_BLOCKS+1))
+		return 0;
+
+	return ext4_journal_extend(handle,
+				   ext4_blocks_for_truncate(inode)) ? 1 : 0;
+}
+
+/*
+ * Returns 1 when every 32-bit word in [p, q) is zero (also for an empty
+ * range), 0 as soon as a non-zero word is seen.
+ *
+ * Probably it should be a library function... search for first non-zero word
+ * or memcmp with zero_page, whatever is better for particular architecture.
+ * Linus?
+ */
+static inline int all_zeroes(__le32 *p, __le32 *q)
+{
+	__le32 *cur;
+
+	for (cur = p; cur < q; cur++) {
+		if (*cur)
+			return 0;
+	}
+	return 1;
+}
+
+/**
+ * ext4_find_shared - find the indirect blocks for partial truncation.
+ * @inode: inode in question
+ * @depth: depth of the affected branch
+ * @offsets: offsets of pointers in that branch (see ext4_block_to_path)
+ * @chain: place to store the pointers to partial indirect blocks
+ * @top: place to store the (detached) top of branch
+ *
+ * This is a helper function used by ext4_truncate().
+ *
+ * When we do truncate() we may have to clean the ends of several
+ * indirect blocks but leave the blocks themselves alive. Block is
+ * partially truncated if some data below the new i_size is referred
+ * from it (and it is on the path to the first completely truncated
+ * data block, indeed). We have to free the top of that path along
+ * with everything to the right of the path. Since no allocation
+ * past the truncation point is possible until ext4_truncate()
+ * finishes, we may safely do the latter, but top of branch may
+ * require special attention - pageout below the truncation point
+ * might try to populate it.
+ *
+ * We atomically detach the top of branch from the tree, store the
+ * block number of its root in *@top, pointers to buffer_heads of
+ * partially truncated blocks - in @chain[].bh and pointers to
+ * their last elements that should not be removed - in
+ * @chain[].p. Return value is the pointer to last filled element
+ * of @chain.
+ *
+ * Note that in this ext4 version the pointer to the top of branch is
+ * only copied into *@top; the slot in the tree itself is left intact
+ * (the zeroing is compiled out below). *@top stays 0 when there is no
+ * top to free.
+ *
+ * The work left to caller to do the actual freeing of subtrees:
+ * a) free the subtree starting from *@top
+ * b) free the subtrees whose roots are stored in
+ * (@chain[i].p+1 .. end of @chain[i].bh->b_data)
+ * c) free the subtrees growing from the inode past the @chain[0].
+ * (no partially truncated stuff there).
+ *
+ * Buffers of chain elements deeper than the returned one are released
+ * here; the caller releases the rest. The error reported by
+ * ext4_get_branch() is stored in a local and not examined further. */
+
+static Indirect *ext4_find_shared(struct inode *inode, int depth,
+ ext4_lblk_t offsets[4], Indirect chain[4],
+ __le32 *top)
+{
+ Indirect *partial, *p;
+ int k, err;
+
+ *top = 0;
+ /* Make k index the deepest non-null offset + 1 */
+ for (k = depth; k > 1 && !offsets[k-1]; k--)
+ ;
+ partial = ext4_get_branch(inode, k, offsets, chain, &err);
+ /* Writer: pointers */
+ if (!partial)
+ partial = chain + k-1;
+ /*
+ * If the branch acquired continuation since we've looked at it -
+ * fine, it should all survive and (new) top doesn't belong to us.
+ */
+ if (!partial->key && *partial->p)
+ /* Writer: end */
+ goto no_top;
+ for (p = partial; (p > chain) && all_zeroes((__le32 *) p->bh->b_data, p->p); p--)
+ ;
+ /*
+ * OK, we've found the last block that must survive. The rest of our
+ * branch should be detached before unlocking. However, if that rest
+ * of branch is all ours and does not grow immediately from the inode
+ * it's easier to cheat and just decrement partial->p.
+ */
+ if (p == chain + k - 1 && p > chain) {
+ p->p--;
+ } else {
+ *top = *p->p;
+ /* Nope, don't do this in ext4. Must leave the tree intact */
+#if 0
+ *p->p = 0;
+#endif
+ }
+ /* Writer: end */
+
+ while (partial > p) {
+ brelse(partial->bh);
+ partial--;
+ }
+no_top:
+ return partial;
+}
+
+/*
+ * Zero a number of block pointers in either an inode or an indirect block.
+ * If we restart the transaction we must again get write access to the
+ * indirect block for further modification.
+ *
+ * We release `count' blocks on disk, but (last - first) may be greater
+ * than `count' because there can be holes in there.
+ *
+ * `bh' is the indirect block buffer holding the pointers and may be NULL,
+ * in which case no buffer is dirtied or re-acquired around a transaction
+ * restart. The slots in [first, last) are zeroed and the run of `count'
+ * blocks starting at block_to_free is handed to ext4_free_blocks(). The
+ * EXT4_FREE_BLOCKS_METADATA flag is added for directories and symlinks.
+ *
+ * The block range is validated before anything is modified; on an invalid
+ * range an error is logged against the inode and nothing is zeroed or freed.
+ *
+ * Return 0 on success, 1 on invalid block range
+ * and < 0 on fatal error.
+ */
+static int ext4_clear_blocks(handle_t *handle, struct inode *inode,
+ struct buffer_head *bh,
+ ext4_fsblk_t block_to_free,
+ unsigned long count, __le32 *first,
+ __le32 *last)
+{
+ __le32 *p;
+ int flags = EXT4_FREE_BLOCKS_FORGET | EXT4_FREE_BLOCKS_VALIDATED;
+ int err;
+
+ if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode))
+ flags |= EXT4_FREE_BLOCKS_METADATA;
+
+ if (!ext4_data_block_valid(EXT4_SB(inode->i_sb), block_to_free,
+ count)) {
+ EXT4_ERROR_INODE(inode, "attempt to clear invalid "
+ "blocks %llu len %lu",
+ (unsigned long long) block_to_free, count);
+ return 1;
+ }
+
+ if (try_to_extend_transaction(handle, inode)) {
+ if (bh) {
+ BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata");
+ err = ext4_handle_dirty_metadata(handle, inode, bh);
+ if (unlikely(err))
+ goto out_err;
+ }
+ err = ext4_mark_inode_dirty(handle, inode);
+ if (unlikely(err))
+ goto out_err;
+ err = ext4_truncate_restart_trans(handle, inode,
+ ext4_blocks_for_truncate(inode));
+ if (unlikely(err))
+ goto out_err;
+ if (bh) {
+ BUFFER_TRACE(bh, "retaking write access");
+ err = ext4_journal_get_write_access(handle, bh);
+ if (unlikely(err))
+ goto out_err;
+ }
+ }
+
+ for (p = first; p < last; p++)
+ *p = 0;
+
+ ext4_free_blocks(handle, inode, NULL, block_to_free, count, flags);
+ return 0;
+out_err:
+ ext4_std_error(inode->i_sb, err);
+ return err;
+}
+
+/**
+ * ext4_free_data - free a list of data blocks
+ * @handle: handle for this transaction
+ * @inode: inode we are dealing with
+ * @this_bh: indirect buffer_head which contains *@first and *@last
+ * @first: array of block numbers
+ * @last: points immediately past the end of array
+ *
+ * We are freeing all blocks referred from that array (numbers are stored as
+ * little-endian 32-bit) and updating @inode->i_blocks appropriately.
+ *
+ * We accumulate contiguous runs of blocks to free. Conveniently, if these
+ * blocks are contiguous then releasing them at one time will only affect one
+ * or two bitmap blocks (+ group descriptor(s) and superblock) and we won't
+ * actually use a lot of journal space.
+ *
+ * @this_bh will be %NULL if @first and @last point into the inode's direct
+ * block pointers.
+ *
+ * Zero entries (holes) are skipped. Each run is released through
+ * ext4_clear_blocks(); a non-zero return from it stops the scan, and a
+ * negative one additionally skips the final dirtying of @this_bh. If
+ * journal write access to @this_bh cannot be obtained nothing is freed.
+ * No status is returned to the caller.
+ */
+static void ext4_free_data(handle_t *handle, struct inode *inode,
+ struct buffer_head *this_bh,
+ __le32 *first, __le32 *last)
+{
+ ext4_fsblk_t block_to_free = 0; /* Starting block # of a run */
+ unsigned long count = 0; /* Number of blocks in the run */
+ __le32 *block_to_free_p = NULL; /* Pointer into inode/ind
+ corresponding to
+ block_to_free */
+ ext4_fsblk_t nr; /* Current block # */
+ __le32 *p; /* Pointer into inode/ind
+ for current block */
+ int err = 0;
+
+ if (this_bh) { /* For indirect block */
+ BUFFER_TRACE(this_bh, "get_write_access");
+ err = ext4_journal_get_write_access(handle, this_bh);
+ /* Important: if we can't update the indirect pointers
+ * to the blocks, we can't free them. */
+ if (err)
+ return;
+ }
+
+ for (p = first; p < last; p++) {
+ nr = le32_to_cpu(*p);
+ if (nr) {
+ /* accumulate blocks to free if they're contiguous */
+ if (count == 0) {
+ block_to_free = nr;
+ block_to_free_p = p;
+ count = 1;
+ } else if (nr == block_to_free + count) {
+ count++;
+ } else {
+ err = ext4_clear_blocks(handle, inode, this_bh,
+ block_to_free, count,
+ block_to_free_p, p);
+ if (err)
+ break;
+ block_to_free = nr;
+ block_to_free_p = p;
+ count = 1;
+ }
+ }
+ }
+
+ if (!err && count > 0)
+ err = ext4_clear_blocks(handle, inode, this_bh, block_to_free,
+ count, block_to_free_p, p);
+ if (err < 0)
+ /* fatal error */
+ return;
+
+ if (this_bh) {
+ BUFFER_TRACE(this_bh, "call ext4_handle_dirty_metadata");
+
+ /*
+ * The buffer head should have an attached journal head at this
+ * point. However, if the data is corrupted and an indirect
+ * block pointed to itself, it would have been detached when
+ * the block was cleared. Check for this instead of OOPSing.
+ */
+ if ((EXT4_JOURNAL(inode) == NULL) || bh2jh(this_bh))
+ ext4_handle_dirty_metadata(handle, inode, this_bh);
+ else
+ EXT4_ERROR_INODE(inode,
+ "circular indirect block detected at "
+ "block %llu",
+ (unsigned long long) this_bh->b_blocknr);
+ }
+}
+
+/**
+ * ext4_free_branches - free an array of branches
+ * @handle: JBD handle for this transaction
+ * @inode: inode we are dealing with
+ * @parent_bh: the buffer_head which contains *@first and *@last
+ * (may be NULL, in which case the slots are not zeroed here)
+ * @first: array of block numbers
+ * @last: pointer immediately past the end of array
+ * @depth: depth of the branches to free
+ *
+ * We are freeing all blocks referred from these branches (numbers are
+ * stored as little-endian 32-bit) and updating @inode->i_blocks
+ * appropriately.
+ *
+ * With @depth == 0 the array holds data block numbers and the work is
+ * passed on to ext4_free_data(). Otherwise the array is walked from the
+ * last entry down to the first; each non-zero entry names an indirect
+ * block which is read, emptied by a recursive call with @depth - 1, and
+ * then freed itself. An invalid block number ends the walk, a failed read
+ * skips that entry. Nothing is done once the handle has been aborted.
+ */
+static void ext4_free_branches(handle_t *handle, struct inode *inode,
+ struct buffer_head *parent_bh,
+ __le32 *first, __le32 *last, int depth)
+{
+ ext4_fsblk_t nr;
+ __le32 *p;
+
+ if (ext4_handle_is_aborted(handle))
+ return;
+
+ if (depth--) {
+ struct buffer_head *bh;
+ int addr_per_block = EXT4_ADDR_PER_BLOCK(inode->i_sb);
+ p = last;
+ while (--p >= first) {
+ nr = le32_to_cpu(*p);
+ if (!nr)
+ continue; /* A hole */
+
+ if (!ext4_data_block_valid(EXT4_SB(inode->i_sb),
+ nr, 1)) {
+ EXT4_ERROR_INODE(inode,
+ "invalid indirect mapped "
+ "block %lu (level %d)",
+ (unsigned long) nr, depth);
+ break;
+ }
+
+ /* Go read the buffer for the next level down */
+ bh = sb_bread(inode->i_sb, nr);
+
+ /*
+ * A read failure? Report error and move on to the
+ * next slot; this slot is left as it is
+ * (should be rare).
+ */
+ if (!bh) {
+ EXT4_ERROR_INODE_BLOCK(inode, nr,
+ "Read failure");
+ continue;
+ }
+
+ /* This zaps the entire block. Bottom up. */
+ BUFFER_TRACE(bh, "free child branches");
+ ext4_free_branches(handle, inode, bh,
+ (__le32 *) bh->b_data,
+ (__le32 *) bh->b_data + addr_per_block,
+ depth);
+ brelse(bh);
+
+ /*
+ * Everything below this pointer has been
+ * released. Now let this top-of-subtree go.
+ *
+ * We want the freeing of this indirect block to be
+ * atomic in the journal with the updating of the
+ * bitmap block which owns it. So make some room in
+ * the journal.
+ *
+ * We zero the parent pointer *after* freeing its
+ * pointee in the bitmaps, so if extend_transaction()
+ * for some reason fails to put the bitmap changes and
+ * the release into the same transaction, recovery
+ * will merely complain about releasing a free block,
+ * rather than leaking blocks.
+ *
+ * The return values of ext4_mark_inode_dirty() and
+ * ext4_truncate_restart_trans() are not checked here.
+ */
+ if (ext4_handle_is_aborted(handle))
+ return;
+ if (try_to_extend_transaction(handle, inode)) {
+ ext4_mark_inode_dirty(handle, inode);
+ ext4_truncate_restart_trans(handle, inode,
+ ext4_blocks_for_truncate(inode));
+ }
+
+ /*
+ * The forget flag here is critical because if
+ * we are journaling (and not doing data
+ * journaling), we have to make sure a revoke
+ * record is written to prevent the journal
+ * replay from overwriting the (former)
+ * indirect block if it gets reallocated as a
+ * data block. This must happen in the same
+ * transaction where the data blocks are
+ * actually freed.
+ */
+ ext4_free_blocks(handle, inode, NULL, nr, 1,
+ EXT4_FREE_BLOCKS_METADATA|
+ EXT4_FREE_BLOCKS_FORGET);
+
+ if (parent_bh) {
+ /*
+ * The block which we have just freed is
+ * pointed to by an indirect block: journal it.
+ * If write access is refused the slot is left
+ * untouched.
+ */
+ BUFFER_TRACE(parent_bh, "get_write_access");
+ if (!ext4_journal_get_write_access(handle,
+ parent_bh)){
+ *p = 0;
+ BUFFER_TRACE(parent_bh,
+ "call ext4_handle_dirty_metadata");
+ ext4_handle_dirty_metadata(handle,
+ inode,
+ parent_bh);
+ }
+ }
+ }
+ } else {
+ /* We have reached the bottom of the tree. */
+ BUFFER_TRACE(parent_bh, "free data blocks");
+ ext4_free_data(handle, inode, parent_bh, first, last);
+ }
+}
+
+/*
+ * Truncate an indirect-mapped inode so that no blocks remain beyond the
+ * current inode->i_size.
+ *
+ * The sequence is: start a truncate transaction, discard the buffers of the
+ * partial page past i_size, put the inode on the orphan list, take
+ * i_data_sem for writing, copy i_size into i_disksize, then free the direct
+ * blocks, the shared branch and finally all whole indirect subtrees that lie
+ * past the new end. Afterwards mtime/ctime are updated, the orphan entry is
+ * dropped if the inode is still linked, and the handle is stopped.
+ *
+ * Nothing is returned; if the transaction cannot be started the function
+ * simply returns.
+ */
+void ext4_ind_truncate(struct inode *inode)
+{
+ handle_t *handle;
+ struct ext4_inode_info *ei = EXT4_I(inode);
+ __le32 *i_data = ei->i_data;
+ int addr_per_block = EXT4_ADDR_PER_BLOCK(inode->i_sb);
+ struct address_space *mapping = inode->i_mapping;
+ ext4_lblk_t offsets[4];
+ Indirect chain[4];
+ Indirect *partial;
+ __le32 nr = 0;
+ int n = 0;
+ ext4_lblk_t last_block, max_block;
+ loff_t page_len;
+ unsigned blocksize = inode->i_sb->s_blocksize;
+ int err;
+
+ handle = start_transaction(inode);
+ if (IS_ERR(handle))
+ return; /* AKPM: return what? */
+
+ /* First block past the new end of file, and the indirect map limit */
+ last_block = (inode->i_size + blocksize-1)
+ >> EXT4_BLOCK_SIZE_BITS(inode->i_sb);
+ max_block = (EXT4_SB(inode->i_sb)->s_bitmap_maxbytes + blocksize-1)
+ >> EXT4_BLOCK_SIZE_BITS(inode->i_sb);
+
+ if (inode->i_size % PAGE_CACHE_SIZE != 0) {
+ page_len = PAGE_CACHE_SIZE -
+ (inode->i_size & (PAGE_CACHE_SIZE - 1));
+
+ err = ext4_discard_partial_page_buffers(handle,
+ mapping, inode->i_size, page_len, 0);
+
+ if (err)
+ goto out_stop;
+ }
+
+ if (last_block != max_block) {
+ n = ext4_block_to_path(inode, last_block, offsets, NULL);
+ if (n == 0)
+ goto out_stop; /* error */
+ }
+
+ /*
+ * OK. This truncate is going to happen. We add the inode to the
+ * orphan list, so that if this truncate spans multiple transactions,
+ * and we crash, we will resume the truncate when the filesystem
+ * recovers. It also marks the inode dirty, to catch the new size.
+ *
+ * Implication: the file must always be in a sane, consistent
+ * truncatable state while each transaction commits.
+ */
+ if (ext4_orphan_add(handle, inode))
+ goto out_stop;
+
+ /*
+ * From here we block out all ext4_get_block() callers who want to
+ * modify the block allocation tree.
+ */
+ down_write(&ei->i_data_sem);
+
+ ext4_discard_preallocations(inode);
+
+ /*
+ * The orphan list entry will now protect us from any crash which
+ * occurs before the truncate completes, so it is now safe to propagate
+ * the new, shorter inode size (held for now in i_size) into the
+ * on-disk inode. We do this via i_disksize, which is the value which
+ * ext4 *really* writes onto the disk inode.
+ */
+ ei->i_disksize = inode->i_size;
+
+ if (last_block == max_block) {
+ /*
+ * It is unnecessary to free any data blocks if last_block is
+ * equal to the indirect block limit.
+ */
+ goto out_unlock;
+ } else if (n == 1) { /* direct blocks */
+ ext4_free_data(handle, inode, NULL, i_data+offsets[0],
+ i_data + EXT4_NDIR_BLOCKS);
+ goto do_indirects;
+ }
+
+ partial = ext4_find_shared(inode, n, offsets, chain, &nr);
+ /* Kill the top of shared branch (not detached) */
+ if (nr) {
+ if (partial == chain) {
+ /* Shared branch grows from the inode */
+ ext4_free_branches(handle, inode, NULL,
+ &nr, &nr+1, (chain+n-1) - partial);
+ *partial->p = 0;
+ /*
+ * We mark the inode dirty prior to restart,
+ * and prior to stop. No need for it here.
+ */
+ } else {
+ /* Shared branch grows from an indirect block */
+ BUFFER_TRACE(partial->bh, "get_write_access");
+ ext4_free_branches(handle, inode, partial->bh,
+ partial->p,
+ partial->p+1, (chain+n-1) - partial);
+ }
+ }
+ /* Clear the ends of indirect blocks on the shared branch */
+ while (partial > chain) {
+ ext4_free_branches(handle, inode, partial->bh, partial->p + 1,
+ (__le32*)partial->bh->b_data+addr_per_block,
+ (chain+n-1) - partial);
+ BUFFER_TRACE(partial->bh, "call brelse");
+ brelse(partial->bh);
+ partial--;
+ }
+do_indirects:
+ /*
+ * Kill the remaining (whole) subtrees. The cases deliberately fall
+ * through: starting from the level offsets[0] points into, every
+ * higher level of indirection is freed as well.
+ */
+ switch (offsets[0]) {
+ default:
+ nr = i_data[EXT4_IND_BLOCK];
+ if (nr) {
+ ext4_free_branches(handle, inode, NULL, &nr, &nr+1, 1);
+ i_data[EXT4_IND_BLOCK] = 0;
+ }
+ /* fall through */
+ case EXT4_IND_BLOCK:
+ nr = i_data[EXT4_DIND_BLOCK];
+ if (nr) {
+ ext4_free_branches(handle, inode, NULL, &nr, &nr+1, 2);
+ i_data[EXT4_DIND_BLOCK] = 0;
+ }
+ /* fall through */
+ case EXT4_DIND_BLOCK:
+ nr = i_data[EXT4_TIND_BLOCK];
+ if (nr) {
+ ext4_free_branches(handle, inode, NULL, &nr, &nr+1, 3);
+ i_data[EXT4_TIND_BLOCK] = 0;
+ }
+ /* fall through */
+ case EXT4_TIND_BLOCK:
+ ;
+ }
+
+out_unlock:
+ up_write(&ei->i_data_sem);
+ inode->i_mtime = inode->i_ctime = ext4_current_time(inode);
+ ext4_mark_inode_dirty(handle, inode);
+
+ /*
+ * In a multi-transaction truncate, we only make the final transaction
+ * synchronous
+ */
+ if (IS_SYNC(inode))
+ ext4_handle_sync(handle);
+out_stop:
+ /*
+ * If this was a simple ftruncate(), and the file will remain alive
+ * then we need to clear up the orphan record which we created above.
+ * However, if this was a real unlink then we were called by
+ * ext4_delete_inode(), and we allow that function to clean up the
+ * orphan info for us.
+ */
+ if (inode->i_nlink)
+ ext4_orphan_del(handle, inode);
+
+ ext4_journal_stop(handle);
+ trace_ext4_truncate_exit(inode);
+}
+
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
new file mode 100644
index 00000000..c77b0bd2
--- /dev/null
+++ b/fs/ext4/inode.c
@@ -0,0 +1,4676 @@
+/*
+ * linux/fs/ext4/inode.c
+ *
+ * Copyright (C) 1992, 1993, 1994, 1995
+ * Remy Card (card@masi.ibp.fr)
+ * Laboratoire MASI - Institut Blaise Pascal
+ * Universite Pierre et Marie Curie (Paris VI)
+ *
+ * from
+ *
+ * linux/fs/minix/inode.c
+ *
+ * Copyright (C) 1991, 1992 Linus Torvalds
+ *
+ * 64-bit file support on 64-bit platforms by Jakub Jelinek
+ * (jj@sunsite.ms.mff.cuni.cz)
+ *
+ * Assorted race fixes, rewrite of ext4_get_block() by Al Viro, 2000
+ */
+
+#include <linux/fs.h>
+#include <linux/time.h>
+#include <linux/jbd2.h>
+#include <linux/highuid.h>
+#include <linux/pagemap.h>
+#include <linux/quotaops.h>
+#include <linux/string.h>
+#include <linux/buffer_head.h>
+#include <linux/writeback.h>
+#include <linux/pagevec.h>
+#include <linux/mpage.h>
+#include <linux/namei.h>
+#include <linux/uio.h>
+#include <linux/bio.h>
+#include <linux/workqueue.h>
+#include <linux/kernel.h>
+#include <linux/printk.h>
+#include <linux/slab.h>
+#include <linux/ratelimit.h>
+
+#include "ext4_jbd2.h"
+#include "xattr.h"
+#include "acl.h"
+#include "truncate.h"
+
+#include <trace/events/ext4.h>
+
+#define MPAGE_DA_EXTENT_TAIL 0x01 /* NOTE(review): no user is visible in
+ * this part of the file; presumably a
+ * flag for the delayed-allocation
+ * writeback code - confirm */
+
+/*
+ * Announce to jbd2 that @inode is about to be truncated to @new_size.
+ * Returns 0 immediately when the inode has no jinode attached, otherwise
+ * whatever jbd2_journal_begin_ordered_truncate() returns.
+ */
+static inline int ext4_begin_ordered_truncate(struct inode *inode,
+					      loff_t new_size)
+{
+	struct ext4_inode_info *ei = EXT4_I(inode);
+
+	trace_ext4_begin_ordered_truncate(inode, new_size);
+
+	/*
+	 * A NULL jinode means the file was never opened for writing, so
+	 * there are no outstanding writes that would have to be flushed.
+	 */
+	if (ei->jinode)
+		return jbd2_journal_begin_ordered_truncate(EXT4_JOURNAL(inode),
+							   ei->jinode,
+							   new_size);
+	return 0;
+}
+
+static void ext4_invalidatepage(struct page *page, unsigned long offset);
+static int noalloc_get_block_write(struct inode *inode, sector_t iblock,
+ struct buffer_head *bh_result, int create);
+static int ext4_set_bh_endio(struct buffer_head *bh, struct inode *inode);
+static void ext4_end_io_buffer_write(struct buffer_head *bh, int uptodate);
+static int __ext4_journalled_writepage(struct page *page, unsigned int len);
+static int ext4_bh_delay_or_unwritten(handle_t *handle, struct buffer_head *bh);
+static int ext4_discard_partial_page_buffers_no_lock(handle_t *handle,
+ struct inode *inode, struct page *page, loff_t from,
+ loff_t length, int flags);
+
+/*
+ * Return nonzero when @inode is a fast symlink, i.e. a symlink whose
+ * i_blocks count is fully explained by its extended-attribute block (if
+ * i_file_acl is set) and nothing else.
+ */
+static int ext4_inode_is_fast_symlink(struct inode *inode)
+{
+	int ea_blocks = 0;
+
+	if (!S_ISLNK(inode->i_mode))
+		return 0;
+
+	/* One filesystem block converted to 512-byte units (>> 9). */
+	if (EXT4_I(inode)->i_file_acl)
+		ea_blocks = inode->i_sb->s_blocksize >> 9;
+
+	return inode->i_blocks - ea_blocks == 0;
+}
+
+/*
+ * Restart the transaction behind *handle, asking for @nblocks credits.
+ * A restart implies a commit, so everything must already be consistently
+ * dirtied against the current transaction before calling this.
+ *
+ * Returns the result of ext4_journal_restart().
+ */
+int ext4_truncate_restart_trans(handle_t *handle, struct inode *inode,
+				 int nblocks)
+{
+	struct ext4_inode_info *ei = EXT4_I(inode);
+	int err;
+
+	BUG_ON(EXT4_JOURNAL(inode) == NULL);
+	jbd_debug(2, "restarting handle %p\n", handle);
+
+	/*
+	 * i_data_sem is released across the restart to avoid a deadlock with
+	 * ext4_map_blocks.  That is safe at this point: the page cache has
+	 * already been dropped and writes are blocked by i_mutex, so
+	 * get_block can only be called for blocks inside i_size.
+	 */
+	up_write(&ei->i_data_sem);
+	err = ext4_journal_restart(handle, nblocks);
+	down_write(&ei->i_data_sem);
+
+	ext4_discard_preallocations(inode);
+	return err;
+}
+
+/*
+ * Called at the last iput(); the on-disk inode is only deleted when
+ * i_nlink is zero.
+ *
+ * If the inode still has links, its page cache is dropped (after forcing
+ * journalled data to disk for regular files and symlinks in data=journal
+ * mode) and only the in-core inode is cleared.
+ *
+ * Otherwise the inode is truncated to size zero, removed from the orphan
+ * list, stamped with a deletion time and freed.  Bad inodes, and inodes for
+ * which the journal handle cannot be started or extended, leave through
+ * no_delete and are only cleared in core.
+ */
+void ext4_evict_inode(struct inode *inode)
+{
+ handle_t *handle;
+ int err;
+
+ trace_ext4_evict_inode(inode);
+
+ ext4_ioend_wait(inode);
+
+ if (inode->i_nlink) {
+ /*
+ * When journalling data dirty buffers are tracked only in the
+ * journal. So although mm thinks everything is clean and
+ * ready for reaping the inode might still have some pages to
+ * write in the running transaction or waiting to be
+ * checkpointed. Thus calling jbd2_journal_invalidatepage()
+ * (via truncate_inode_pages()) to discard these buffers can
+ * cause data loss. Also even if we did not discard these
+ * buffers, we would have no way to find them after the inode
+ * is reaped and thus user could see stale data if he tries to
+ * read them before the transaction is checkpointed. So be
+ * careful and force everything to disk here... We use
+ * ei->i_datasync_tid to store the newest transaction
+ * containing inode's data.
+ *
+ * Note that directories do not have this problem because they
+ * don't use page cache.
+ */
+ if (ext4_should_journal_data(inode) &&
+ (S_ISLNK(inode->i_mode) || S_ISREG(inode->i_mode))) {
+ journal_t *journal = EXT4_SB(inode->i_sb)->s_journal;
+ tid_t commit_tid = EXT4_I(inode)->i_datasync_tid;
+
+ jbd2_log_start_commit(journal, commit_tid);
+ jbd2_log_wait_commit(journal, commit_tid);
+ filemap_write_and_wait(&inode->i_data);
+ }
+ truncate_inode_pages(&inode->i_data, 0);
+ goto no_delete; /* still linked: nothing is deleted on disk */
+ }
+
+ if (!is_bad_inode(inode))
+ dquot_initialize(inode);
+
+ if (ext4_should_order_data(inode))
+ ext4_begin_ordered_truncate(inode, 0);
+ truncate_inode_pages(&inode->i_data, 0);
+
+ if (is_bad_inode(inode))
+ goto no_delete;
+
+ handle = ext4_journal_start(inode, ext4_blocks_for_truncate(inode)+3);
+ if (IS_ERR(handle)) {
+ ext4_std_error(inode->i_sb, PTR_ERR(handle));
+ /*
+ * If we're going to skip the normal cleanup, we still need to
+ * make sure that the in-core orphan linked list is properly
+ * cleaned up.
+ */
+ ext4_orphan_del(NULL, inode);
+ goto no_delete;
+ }
+
+ if (IS_SYNC(inode))
+ ext4_handle_sync(handle);
+ inode->i_size = 0; /* the whole file is being removed */
+ err = ext4_mark_inode_dirty(handle, inode);
+ if (err) {
+ ext4_warning(inode->i_sb,
+ "couldn't mark inode dirty (err %d)", err);
+ goto stop_handle; /* label lives inside the credit check below */
+ }
+ if (inode->i_blocks)
+ ext4_truncate(inode);
+
+ /*
+ * ext4_ext_truncate() doesn't reserve any slop when it
+ * restarts journal transactions; therefore there may not be
+ * enough credits left in the handle to remove the inode from
+ * the orphan list and set the dtime field.
+ */
+ if (!ext4_handle_has_enough_credits(handle, 3)) {
+ err = ext4_journal_extend(handle, 3);
+ if (err > 0) /* could not extend in place: restart instead */
+ err = ext4_journal_restart(handle, 3);
+ if (err != 0) {
+ ext4_warning(inode->i_sb,
+ "couldn't extend journal (err %d)", err);
+ stop_handle: /* shared bail-out: stop handle, fix in-core orphan list */
+ ext4_journal_stop(handle);
+ ext4_orphan_del(NULL, inode);
+ goto no_delete;
+ }
+ }
+
+ /*
+ * Kill off the orphan record which ext4_truncate created.
+ * AKPM: I think this can be inside the above `if'.
+ * Note that ext4_orphan_del() has to be able to cope with the
+ * deletion of a non-existent orphan - this is because we don't
+ * know if ext4_truncate() actually created an orphan record.
+ * (Well, we could do this if we need to, but heck - it works)
+ */
+ ext4_orphan_del(handle, inode);
+ EXT4_I(inode)->i_dtime = get_seconds();
+
+ /*
+ * One subtle ordering requirement: if anything has gone wrong
+ * (transaction abort, IO errors, whatever), then we can still
+ * do these next steps (the fs will already have been marked as
+ * having errors), but we can't free the inode if the mark_dirty
+ * fails.
+ */
+ if (ext4_mark_inode_dirty(handle, inode))
+ /* If that failed, just do the required in-core inode clear. */
+ ext4_clear_inode(inode);
+ else
+ ext4_free_inode(handle, inode);
+ ext4_journal_stop(handle);
+ return;
+no_delete:
+ ext4_clear_inode(inode); /* We must guarantee clearing of inode... */
+}
+
+#ifdef CONFIG_QUOTA
+/* Hand back the address of the inode's i_reserved_quota counter. */
+qsize_t *ext4_get_reserved_space(struct inode *inode)
+{
+	struct ext4_inode_info *ei = EXT4_I(inode);
+
+	return &ei->i_reserved_quota;
+}
+#endif
+
+/*
+ * Work out how many metadata blocks must be reserved in order to allocate
+ * the block located at @lblock.  Extent-mapped inodes use the extent
+ * estimator, all other inodes the indirect-block estimator.
+ */
+static int ext4_calc_metadata_amount(struct inode *inode, ext4_lblk_t lblock)
+{
+	int uses_extents = ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS);
+
+	return uses_extents ? ext4_ext_calc_metadata_amount(inode, lblock) :
+			      ext4_ind_calc_metadata_amount(inode, lblock);
+}
+
+/*
+ * Called with i_data_sem down, which is important since we can call
+ * ext4_discard_preallocations() from here.
+ *
+ * Convert part of the inode's delayed-allocation reservation into real
+ * usage: subtract @used from i_reserved_data_blocks, drop the metadata
+ * that was actually allocated from i_reserved_meta_blocks, and lower the
+ * superblock's dirty-clusters counter to match.  Once no data reservation
+ * is left, the remaining metadata reservation is released as well.
+ *
+ * @used:        amount of reserved data consumed; clamped (with a warning)
+ *               to i_reserved_data_blocks.  It is passed through
+ *               EXT4_C2B() for quota, so presumably counted in clusters -
+ *               TODO confirm.
+ * @quota_claim: nonzero to turn the quota reservation into a claim via
+ *               dquot_claim_block(); zero to just release the reservation.
+ */
+void ext4_da_update_reserve_space(struct inode *inode,
+ int used, int quota_claim)
+{
+ struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
+ struct ext4_inode_info *ei = EXT4_I(inode);
+
+ spin_lock(&ei->i_block_reservation_lock);
+ trace_ext4_da_update_reserve_space(inode, used, quota_claim);
+ if (unlikely(used > ei->i_reserved_data_blocks)) {
+ ext4_msg(inode->i_sb, KERN_NOTICE, "%s: ino %lu, used %d "
+ "with only %d reserved data blocks",
+ __func__, inode->i_ino, used,
+ ei->i_reserved_data_blocks);
+ WARN_ON(1);
+ used = ei->i_reserved_data_blocks; /* never release more than is reserved */
+ }
+
+ /* Update per-inode reservations */
+ ei->i_reserved_data_blocks -= used;
+ ei->i_reserved_meta_blocks -= ei->i_allocated_meta_blocks;
+ percpu_counter_sub(&sbi->s_dirtyclusters_counter,
+ used + ei->i_allocated_meta_blocks);
+ ei->i_allocated_meta_blocks = 0;
+
+ if (ei->i_reserved_data_blocks == 0) {
+ /*
+ * We can release all of the reserved metadata blocks
+ * only when we have written all of the delayed
+ * allocation blocks.
+ */
+ percpu_counter_sub(&sbi->s_dirtyclusters_counter,
+ ei->i_reserved_meta_blocks);
+ ei->i_reserved_meta_blocks = 0;
+ ei->i_da_metadata_calc_len = 0;
+ }
+ spin_unlock(&EXT4_I(inode)->i_block_reservation_lock); /* same lock as ei-> above */
+
+ /* Update quota subsystem for data blocks */
+ if (quota_claim)
+ dquot_claim_block(inode, EXT4_C2B(sbi, used));
+ else {
+ /*
+ * We did fallocate with an offset that is already delayed
+ * allocated. So on delayed allocated writeback we should
+ * not re-claim the quota for fallocated blocks.
+ */
+ dquot_release_reservation_block(inode, EXT4_C2B(sbi, used));
+ }
+
+ /*
+ * If we have done all the pending block allocations and if
+ * there aren't any writers on the inode, we can discard the
+ * inode's preallocations.
+ * (i_reserved_data_blocks is read here after the reservation lock
+ * has been dropped.)
+ */
+ if ((ei->i_reserved_data_blocks == 0) &&
+ (atomic_read(&inode->i_writecount) == 0))
+ ext4_discard_preallocations(inode);
+}
+
+/*
+ * Verify that the physical range recorded in @map (m_pblk, m_len) passes
+ * ext4_data_block_valid().  Returns 0 when it does; otherwise reports an
+ * inode error attributed to @func/@line and returns -EIO.
+ */
+static int __check_block_validity(struct inode *inode, const char *func,
+				   unsigned int line,
+				   struct ext4_map_blocks *map)
+{
+	if (ext4_data_block_valid(EXT4_SB(inode->i_sb), map->m_pblk,
+				  map->m_len))
+		return 0;
+
+	ext4_error_inode(inode, func, line, map->m_pblk,
+			 "lblock %lu mapped to illegal pblock "
+			 "(length %d)", (unsigned long) map->m_lblk,
+			 map->m_len);
+	return -EIO;
+}
+
+/* Wrapper that records the calling function and line for the error report. */
+#define check_block_validity(inode, map)	\
+	__check_block_validity((inode), __func__, __LINE__, (map))
+
+/*
+ * Return the number of contiguous dirty pages in a given inode
+ * starting at page frame idx.
+ *
+ * The count stops at the first page that no longer belongs to the inode's
+ * mapping, is not dirty, is under writeback, is not at the next expected
+ * index, or has a buffer that is neither delayed nor unwritten.  At most
+ * @max_pages pages are counted, and 0 is returned when @max_pages is 0.
+ * Each candidate page is locked while it is being examined.
+ */
+static pgoff_t ext4_num_dirty_pages(struct inode *inode, pgoff_t idx,
+ unsigned int max_pages)
+{
+ struct address_space *mapping = inode->i_mapping;
+ pgoff_t index;
+ struct pagevec pvec;
+ pgoff_t num = 0;
+ int i, nr_pages, done = 0;
+
+ if (max_pages == 0)
+ return 0;
+ pagevec_init(&pvec, 0);
+ while (!done) {
+ index = idx; /* resume the lookup right after the last counted page */
+ nr_pages = pagevec_lookup_tag(&pvec, mapping, &index,
+ PAGECACHE_TAG_DIRTY,
+ (pgoff_t)PAGEVEC_SIZE);
+ if (nr_pages == 0)
+ break;
+ for (i = 0; i < nr_pages; i++) {
+ struct page *page = pvec.pages[i];
+ struct buffer_head *bh, *head;
+
+ lock_page(page);
+ if (unlikely(page->mapping != mapping) ||
+ !PageDirty(page) ||
+ PageWriteback(page) ||
+ page->index != idx) { /* gap or unusable page ends the run */
+ done = 1;
+ unlock_page(page);
+ break;
+ }
+ if (page_has_buffers(page)) {
+ bh = head = page_buffers(page);
+ do {
+ if (!buffer_delay(bh) &&
+ !buffer_unwritten(bh))
+ done = 1; /* this page is not counted */
+ bh = bh->b_this_page;
+ } while (!done && (bh != head));
+ }
+ unlock_page(page);
+ if (done)
+ break;
+ idx++;
+ num++;
+ if (num >= max_pages) {
+ done = 1;
+ break;
+ }
+ }
+ pagevec_release(&pvec);
+ }
+ return num;
+}
+
+/*
+ * Sets the BH_Da_Mapped bit on the buffer heads corresponding to the given map.
+ *
+ * Walks the page-cache pages that cover logical blocks
+ * [map->m_lblk, map->m_lblk + map->m_len) and marks every buffer of each
+ * dirty page found there.  All buffers of a visited page are marked, not
+ * just the ones that fall inside the mapped range.
+ *
+ * NOTE(review): when the inner loop hits a page that is not dirty or no
+ * longer belongs to the mapping it breaks without advancing index, so the
+ * next lookup starts from the same index again.  index is also advanced by
+ * one per page visited rather than taken from page->index.  Presumably the
+ * caller guarantees a contiguous run of dirty pages here - confirm.
+ */
+static void set_buffers_da_mapped(struct inode *inode,
+ struct ext4_map_blocks *map)
+{
+ struct address_space *mapping = inode->i_mapping;
+ struct pagevec pvec;
+ int i, nr_pages;
+ pgoff_t index, end;
+
+ index = map->m_lblk >> (PAGE_CACHE_SHIFT - inode->i_blkbits); /* first page of the range */
+ end = (map->m_lblk + map->m_len - 1) >>
+ (PAGE_CACHE_SHIFT - inode->i_blkbits); /* last page of the range, inclusive */
+
+ pagevec_init(&pvec, 0);
+ while (index <= end) {
+ nr_pages = pagevec_lookup(&pvec, mapping, index,
+ min(end - index + 1,
+ (pgoff_t)PAGEVEC_SIZE));
+ if (nr_pages == 0)
+ break;
+ for (i = 0; i < nr_pages; i++) {
+ struct page *page = pvec.pages[i];
+ struct buffer_head *bh, *head;
+
+ if (unlikely(page->mapping != mapping) ||
+ !PageDirty(page))
+ break;
+
+ if (page_has_buffers(page)) {
+ bh = head = page_buffers(page);
+ do {
+ set_buffer_da_mapped(bh);
+ bh = bh->b_this_page;
+ } while (bh != head);
+ }
+ index++;
+ }
+ pagevec_release(&pvec);
+ }
+}
+
+/*
+ * The ext4_map_blocks() function tries to look up the requested blocks,
+ * and returns if the blocks are already mapped.
+ *
+ * Otherwise it takes the write lock of the i_data_sem and allocates blocks,
+ * stores the allocated blocks in the result map and marks it mapped.
+ *
+ * If the file is extent based, it will call ext4_ext_map_blocks();
+ * otherwise it calls ext4_ind_map_blocks() to handle indirect mapping
+ * based files.
+ *
+ * @handle: journal handle, passed through to the mapping helpers
+ * @inode:  inode whose blocks are being mapped
+ * @map:    in: m_lblk and m_len describe the request; out: m_pblk, m_len
+ *          and m_flags describe the result (m_flags is reset on entry)
+ * @flags:  EXT4_GET_BLOCKS_* flags; without EXT4_GET_BLOCKS_CREATE only
+ *          the read-locked lookup is performed
+ *
+ * On success, it returns the number of blocks being mapped or allocated.
+ * If create==0 and the blocks are pre-allocated and uninitialized,
+ * the result is left unmapped. If create==1, it will make sure
+ * the result is mapped.
+ *
+ * It returns 0 if plain look up failed (blocks have not been allocated), in
+ * that case, the result is unmapped.
+ *
+ * It returns the error in case of allocation failure, and -EIO (from
+ * check_block_validity()) when a mapped range fails the validity check.
+ */
+int ext4_map_blocks(handle_t *handle, struct inode *inode,
+ struct ext4_map_blocks *map, int flags)
+{
+ int retval;
+
+ map->m_flags = 0;
+ ext_debug("ext4_map_blocks(): inode %lu, flag %d, max_blocks %u,"
+ "logical block %lu\n", inode->i_ino, flags, map->m_len,
+ (unsigned long) map->m_lblk);
+ /*
+ * Try to see if we can get the block without requesting a new
+ * file system block.
+ * Only EXT4_GET_BLOCKS_KEEP_SIZE is passed down for this lookup.
+ */
+ down_read((&EXT4_I(inode)->i_data_sem));
+ if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) {
+ retval = ext4_ext_map_blocks(handle, inode, map, flags &
+ EXT4_GET_BLOCKS_KEEP_SIZE);
+ } else {
+ retval = ext4_ind_map_blocks(handle, inode, map, flags &
+ EXT4_GET_BLOCKS_KEEP_SIZE);
+ }
+ up_read((&EXT4_I(inode)->i_data_sem));
+
+ if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED) {
+ int ret = check_block_validity(inode, map);
+ if (ret != 0)
+ return ret;
+ }
+
+ /* If it is only a block(s) look up */
+ if ((flags & EXT4_GET_BLOCKS_CREATE) == 0)
+ return retval;
+
+ /*
+ * Returns if the blocks have already allocated
+ *
+ * Note that if blocks have been preallocated
+ * ext4_ext_get_block() returns the create = 0
+ * with buffer head unmapped.
+ */
+ if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED)
+ return retval;
+
+ /*
+ * When we call get_blocks without the create flag, the
+ * BH_Unwritten flag could have gotten set if the blocks
+ * requested were part of a uninitialized extent. We need to
+ * clear this flag now that we are committed to convert all or
+ * part of the uninitialized extent to be an initialized
+ * extent. This is because we need to avoid the combination
+ * of BH_Unwritten and BH_Mapped flags being simultaneously
+ * set on the buffer_head.
+ */
+ map->m_flags &= ~EXT4_MAP_UNWRITTEN;
+
+ /*
+ * New blocks allocate and/or writing to uninitialized extent
+ * will possibly result in updating i_data, so we take
+ * the write lock of i_data_sem, and call get_blocks()
+ * with create == 1 flag.
+ */
+ down_write((&EXT4_I(inode)->i_data_sem));
+
+ /*
+ * if the caller is from delayed allocation writeout path
+ * we have already reserved fs blocks for allocation
+ * let the underlying get_block() function know to
+ * avoid double accounting
+ */
+ if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE)
+ ext4_set_inode_state(inode, EXT4_STATE_DELALLOC_RESERVED);
+ /*
+ * We need to check for EXT4 here because migrate
+ * could have changed the inode type in between
+ */
+ if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) {
+ retval = ext4_ext_map_blocks(handle, inode, map, flags);
+ } else {
+ retval = ext4_ind_map_blocks(handle, inode, map, flags);
+
+ if (retval > 0 && map->m_flags & EXT4_MAP_NEW) {
+ /*
+ * We allocated new blocks which will result in
+ * i_data's format changing. Force the migrate
+ * to fail by clearing migrate flags
+ */
+ ext4_clear_inode_state(inode, EXT4_STATE_EXT_MIGRATE);
+ }
+
+ /*
+ * Update reserved blocks/metadata blocks after successful
+ * block allocation which had been deferred till now. We don't
+ * support fallocate for non extent files. So we can update
+ * reserve space here.
+ */
+ if ((retval > 0) &&
+ (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE))
+ ext4_da_update_reserve_space(inode, retval, 1);
+ }
+ if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) {
+ ext4_clear_inode_state(inode, EXT4_STATE_DELALLOC_RESERVED);
+
+ /* If we have successfully mapped the delayed allocated blocks,
+ * set the BH_Da_Mapped bit on them. Its important to do this
+ * under the protection of i_data_sem.
+ */
+ if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED)
+ set_buffers_da_mapped(inode, map);
+ }
+
+ up_write((&EXT4_I(inode)->i_data_sem));
+ if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED) { /* re-validate the newly mapped range */
+ int ret = check_block_validity(inode, map);
+ if (ret != 0)
+ return ret;
+ }
+ return retval;
+}
+
+/* Maximum number of blocks we map for direct IO at once. */
+#define DIO_MAX_BLOCKS 4096
+
+/*
+ * Map the range described by @bh (b_size bytes starting at logical block
+ * @iblock) through ext4_map_blocks() using @flags, and record the result
+ * in @bh.
+ *
+ * When @flags is nonzero and no transaction is running, a handle is
+ * started for the duration of the call and the request is capped at
+ * DIO_MAX_BLOCKS blocks.
+ *
+ * Returns 0 when blocks were mapped (ext4_map_blocks() > 0) or when nothing
+ * was found, and a negative error otherwise.
+ */
+static int _ext4_get_block(struct inode *inode, sector_t iblock,
+			   struct buffer_head *bh, int flags)
+{
+	handle_t *handle = ext4_journal_current_handle();
+	struct ext4_map_blocks map;
+	int own_handle = 0;
+	int ret;
+
+	map.m_lblk = iblock;
+	map.m_len = bh->b_size >> inode->i_blkbits;
+
+	if (flags && !handle) {
+		/* Direct IO write... */
+		int dio_credits;
+
+		if (map.m_len > DIO_MAX_BLOCKS)
+			map.m_len = DIO_MAX_BLOCKS;
+		dio_credits = ext4_chunk_trans_blocks(inode, map.m_len);
+		handle = ext4_journal_start(inode, dio_credits);
+		if (IS_ERR(handle))
+			return PTR_ERR(handle);
+		own_handle = 1;
+	}
+
+	ret = ext4_map_blocks(handle, inode, &map, flags);
+	if (ret > 0) {
+		map_bh(bh, inode->i_sb, map.m_pblk);
+		bh->b_state = (bh->b_state & ~EXT4_MAP_FLAGS) | map.m_flags;
+		bh->b_size = inode->i_sb->s_blocksize * map.m_len;
+		ret = 0;
+	}
+
+	/* Only stop a handle that was started here. */
+	if (own_handle)
+		ext4_journal_stop(handle);
+	return ret;
+}
+
+/*
+ * get_block callback: translate the boolean @create into
+ * EXT4_GET_BLOCKS_CREATE (or no flags) and defer to _ext4_get_block().
+ */
+int ext4_get_block(struct inode *inode, sector_t iblock,
+		   struct buffer_head *bh, int create)
+{
+	int flags = 0;
+
+	if (create)
+		flags = EXT4_GET_BLOCKS_CREATE;
+	return _ext4_get_block(inode, iblock, bh, flags);
+}
+
+/*
+ * Look up, and when @create is nonzero allocate, the single block at
+ * logical offset @block of @inode and return a buffer_head for it.
+ *
+ * `handle' can be NULL if create is zero
+ *
+ * A newly allocated block is zero-filled (unless already uptodate), marked
+ * uptodate and dirtied as journalled metadata.
+ *
+ * Returns the buffer on success with *errp == 0.  Returns NULL with *errp
+ * set to a negative error on failure, and NULL with *errp == 0 when the
+ * lookup found no block (ext4_map_blocks() returned 0).
+ */
+struct buffer_head *ext4_getblk(handle_t *handle, struct inode *inode,
+				ext4_lblk_t block, int create, int *errp)
+{
+	struct ext4_map_blocks map;
+	struct buffer_head *bh;
+	int fatal = 0, err;
+
+	J_ASSERT(handle != NULL || create == 0);
+
+	/*
+	 * Give *errp a defined value on every path.  Previously it was left
+	 * untouched when the lookup returned 0, so a caller that inspected
+	 * it after a NULL return could read an indeterminate value.
+	 */
+	*errp = 0;
+
+	map.m_lblk = block;
+	map.m_len = 1;
+	err = ext4_map_blocks(handle, inode, &map,
+			      create ? EXT4_GET_BLOCKS_CREATE : 0);
+
+	if (err < 0)
+		*errp = err;
+	if (err <= 0)
+		return NULL;
+
+	bh = sb_getblk(inode->i_sb, map.m_pblk);
+	if (!bh) {
+		*errp = -EIO;
+		return NULL;
+	}
+	if (map.m_flags & EXT4_MAP_NEW) {
+		J_ASSERT(create != 0);
+		J_ASSERT(handle != NULL);
+
+		/*
+		 * Now that we do not always journal data, we should
+		 * keep in mind whether this should always journal the
+		 * new buffer as metadata.  For now, regular file
+		 * writes use ext4_get_block instead, so it's not a
+		 * problem.
+		 */
+		lock_buffer(bh);
+		BUFFER_TRACE(bh, "call get_create_access");
+		fatal = ext4_journal_get_create_access(handle, bh);
+		if (!fatal && !buffer_uptodate(bh)) {
+			memset(bh->b_data, 0, inode->i_sb->s_blocksize);
+			set_buffer_uptodate(bh);
+		}
+		unlock_buffer(bh);
+		BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata");
+		err = ext4_handle_dirty_metadata(handle, inode, bh);
+		/* The first failure wins; a later error must not mask it. */
+		if (!fatal)
+			fatal = err;
+	} else {
+		BUFFER_TRACE(bh, "not a new buffer");
+	}
+	if (fatal) {
+		*errp = fatal;
+		brelse(bh);
+		bh = NULL;
+	}
+	return bh;
+}
+
+/*
+ * Like ext4_getblk(), but additionally make sure the returned buffer is
+ * uptodate by reading it in when necessary.
+ *
+ * Returns the buffer, or NULL.  *err is whatever ext4_getblk() stored when
+ * that call returned NULL, and -EIO when the read did not leave the buffer
+ * uptodate.
+ */
+struct buffer_head *ext4_bread(handle_t *handle, struct inode *inode,
+			       ext4_lblk_t block, int create, int *err)
+{
+	struct buffer_head *bh = ext4_getblk(handle, inode, block, create, err);
+
+	/* Nothing to read: either no buffer at all or it is already valid. */
+	if (!bh || buffer_uptodate(bh))
+		return bh;
+
+	ll_rw_block(READ | REQ_META | REQ_PRIO, 1, &bh);
+	wait_on_buffer(bh);
+	if (!buffer_uptodate(bh)) {
+		put_bh(bh);
+		*err = -EIO;
+		return NULL;
+	}
+	return bh;
+}
+
+/*
+ * Walk the circular buffer list starting at @head and call @fn on every
+ * buffer whose byte range within the page overlaps [@from, @to).  The walk
+ * stops after the first nonzero return from @fn, which is then returned;
+ * otherwise 0 is returned.
+ *
+ * For buffers outside the range, *@partial (if @partial is non-NULL) is set
+ * to 1 when such a buffer is not uptodate.
+ */
+static int walk_page_buffers(handle_t *handle,
+			     struct buffer_head *head,
+			     unsigned from,
+			     unsigned to,
+			     int *partial,
+			     int (*fn)(handle_t *handle,
+				       struct buffer_head *bh))
+{
+	unsigned blocksize = head->b_size;
+	struct buffer_head *cur, *following;
+	unsigned start, end;
+	int ret = 0;
+
+	for (cur = head, start = 0;
+	     ret == 0 && (cur != head || !start);
+	     start = end, cur = following) {
+		/* Fetch the successor before @fn gets to see the buffer. */
+		following = cur->b_this_page;
+		end = start + blocksize;
+
+		if (end > from && start < to) {
+			ret = fn(handle, cur);
+		} else if (partial && !buffer_uptodate(cur)) {
+			*partial = 1;
+		}
+	}
+	return ret;
+}
+
+/*
+ * To preserve ordering, it is essential that the hole instantiation and
+ * the data write be encapsulated in a single transaction. We cannot
+ * close off a transaction and start a new one between the ext4_get_block()
+ * and the commit_write(). So doing the jbd2_journal_start at the start of
+ * prepare_write() is the right place.
+ *
+ * Also, this function can nest inside ext4_writepage() ->
+ * block_write_full_page(). In that case, we *know* that ext4_writepage()
+ * has generated enough buffer credits to do the whole page. So we won't
+ * block on the journal in that case, which is good, because the caller may
+ * be PF_MEMALLOC.
+ *
+ * By accident, ext4 can be reentered when a transaction is open via
+ * quota file writes. If we were to commit the transaction while thus
+ * reentered, there can be a deadlock - we would be holding a quota
+ * lock, and the commit would never complete if another thread had a
+ * transaction open and was blocking on the quota lock - a ranking
+ * violation.
+ *
+ * So what we do is to rely on the fact that jbd2_journal_stop/journal_start
+ * will _not_ run commit under these circumstances because handle->h_ref
+ * is elevated. We'll still have enough credits for the tiny quotafile
+ * write.
+ */
+/*
+ * Per-buffer callback: obtain journal write access to @bh and, if the
+ * buffer was dirty on entry, dirty it again as journalled metadata.
+ * Buffers that are unmapped or freed are skipped.  Returns 0 or the first
+ * error from the journal helpers.
+ */
+static int do_journal_get_write_access(handle_t *handle,
+				       struct buffer_head *bh)
+{
+	int was_dirty = buffer_dirty(bh);
+	int err;
+
+	if (!buffer_mapped(bh) || buffer_freed(bh))
+		return 0;
+
+	/*
+	 * __block_write_begin() may have dirtied some buffers.  The dirty bit
+	 * is cleared first because jbd2_journal_get_write_access() could
+	 * otherwise complain about fs integrity issues.  This is harmless:
+	 * the bit is cleared before the page lock is released, so writeback
+	 * can never write the buffer in between.
+	 */
+	if (was_dirty)
+		clear_buffer_dirty(bh);
+
+	err = ext4_journal_get_write_access(handle, bh);
+	if (err || !was_dirty)
+		return err;
+	return ext4_handle_dirty_metadata(handle, NULL, bh);
+}
+
+static int ext4_get_block_write(struct inode *inode, sector_t iblock,
+		   struct buffer_head *bh_result, int create);
+
+/*
+ * write_begin: start a journal handle, grab the page-cache page covering
+ * @pos, and map/allocate the buffers for [@pos, @pos + @len).  In
+ * data=journal mode journal write access is also taken on those buffers.
+ *
+ * On success returns 0 with the page stored in *@pagep and the handle left
+ * running.  On failure the page is released, the handle is stopped, any
+ * blocks instantiated beyond i_size are trimmed again, and a negative error
+ * is returned.  An -ENOSPC failure is retried for as long as
+ * ext4_should_retry_alloc() allows.
+ */
+static int ext4_write_begin(struct file *file, struct address_space *mapping,
+			    loff_t pos, unsigned len, unsigned flags,
+			    struct page **pagep, void **fsdata)
+{
+	struct inode *inode = mapping->host;
+	pgoff_t index = pos >> PAGE_CACHE_SHIFT;
+	unsigned from = pos & (PAGE_CACHE_SIZE - 1);
+	unsigned to = from + len;
+	int needed_blocks, retries = 0;
+	handle_t *handle;
+	struct page *page;
+	int ret;
+
+	trace_ext4_write_begin(inode, pos, len, flags);
+
+	/*
+	 * One block more than the page needs: the inode may have to go on
+	 * the orphan list if blocks get allocated but the write then fails.
+	 */
+	needed_blocks = ext4_writepage_trans_blocks(inode) + 1;
+
+	do {
+		handle = ext4_journal_start(inode, needed_blocks);
+		if (IS_ERR(handle))
+			return PTR_ERR(handle);
+
+		/*
+		 * With the transaction already started we must not recurse
+		 * into the filesystem.
+		 */
+		flags |= AOP_FLAG_NOFS;
+
+		page = grab_cache_page_write_begin(mapping, index, flags);
+		if (!page) {
+			ext4_journal_stop(handle);
+			return -ENOMEM;
+		}
+		*pagep = page;
+
+		ret = __block_write_begin(page, pos, len,
+					  ext4_should_dioread_nolock(inode) ?
+					  ext4_get_block_write :
+					  ext4_get_block);
+
+		if (!ret && ext4_should_journal_data(inode)) {
+			ret = walk_page_buffers(handle, page_buffers(page),
+						from, to, NULL,
+						do_journal_get_write_access);
+		}
+
+		if (ret) {
+			unlock_page(page);
+			page_cache_release(page);
+			/*
+			 * __block_write_begin may have instantiated a few
+			 * blocks outside i_size.  Trim these off again.
+			 * i_size_read is not needed because we hold i_mutex.
+			 *
+			 * The inode goes on the orphan list first in case we
+			 * crash before the truncate finishes.
+			 */
+			if (pos + len > inode->i_size &&
+			    ext4_can_truncate(inode))
+				ext4_orphan_add(handle, inode);
+
+			ext4_journal_stop(handle);
+			if (pos + len > inode->i_size) {
+				ext4_truncate_failed_write(inode);
+				/*
+				 * If truncate failed early the inode might
+				 * still be on the orphan list; make sure it
+				 * is removed from there in that case.
+				 */
+				if (inode->i_nlink)
+					ext4_orphan_del(NULL, inode);
+			}
+		}
+	} while (ret == -ENOSPC &&
+		 ext4_should_retry_alloc(inode->i_sb, &retries));
+
+	return ret;
+}
+
+/*
+ * For write_end() in data=journal mode: mark a mapped, non-freed buffer
+ * uptodate and dirty it as journalled metadata.  Other buffers are skipped
+ * and 0 is returned for them.
+ */
+static int write_end_fn(handle_t *handle, struct buffer_head *bh)
+{
+	if (buffer_mapped(bh) && !buffer_freed(bh)) {
+		set_buffer_uptodate(bh);
+		return ext4_handle_dirty_metadata(handle, NULL, bh);
+	}
+	return 0;
+}
+
+/*
+ * Common tail of the write_end paths: finish the buffer write with
+ * block_write_end(), grow i_size and i_disksize to cover the copied data
+ * if needed, release the page, and dirty the inode when either size moved.
+ *
+ * Returns the number of bytes block_write_end() reports as copied.
+ */
+static int ext4_generic_write_end(struct file *file,
+				  struct address_space *mapping,
+				  loff_t pos, unsigned len, unsigned copied,
+				  struct page *page, void *fsdata)
+{
+	struct inode *inode = mapping->host;
+	handle_t *handle = ext4_journal_current_handle();
+	int size_changed = 0;
+	loff_t end;
+
+	copied = block_write_end(file, mapping, pos, len, copied, page, fsdata);
+	end = pos + copied;
+
+	/*
+	 * i_size cannot change under us because we hold i_mutex, so plain
+	 * reads are fine.  The update must happen while the page is still
+	 * locked, though: otherwise page writeout could come in and zero
+	 * beyond i_size.
+	 */
+	if (end > inode->i_size) {
+		i_size_write(inode, end);
+		size_changed = 1;
+	}
+
+	/*
+	 * The inode has to be dirtied even when the new end is below i_size
+	 * but above i_disksize (hint: delalloc).
+	 */
+	if (end > EXT4_I(inode)->i_disksize) {
+		ext4_update_i_disksize(inode, end);
+		size_changed = 1;
+	}
+
+	unlock_page(page);
+	page_cache_release(page);
+
+	/*
+	 * The inode is dirtied only after the page lock is dropped: doing it
+	 * under the lock would lengthen the hold time for no reason and
+	 * would force a lock ordering between page lock and transaction
+	 * start for journaling filesystems.
+	 */
+	if (size_changed)
+		ext4_mark_inode_dirty(handle, inode);
+
+	return copied;
+}
+
+/*
+ * We need to pick up the new inode size which generic_commit_write gave us
+ * `file' can be NULL - eg, when called from page_symlink().
+ *
+ * ext4 never places buffers on inode->i_mapping->private_list. metadata
+ * buffers are managed internally.
+ *
+ * Files the inode with the journal through ext4_jbd2_file_inode(), then
+ * completes the write with ext4_generic_write_end().  If filing fails the
+ * page is simply unlocked and released.  The current handle is stopped in
+ * either case, and blocks left beyond i_size are trimmed afterwards.
+ *
+ * Returns the number of bytes copied, or a negative error.
+ */
+static int ext4_ordered_write_end(struct file *file,
+ struct address_space *mapping,
+ loff_t pos, unsigned len, unsigned copied,
+ struct page *page, void *fsdata)
+{
+ handle_t *handle = ext4_journal_current_handle();
+ struct inode *inode = mapping->host;
+ int ret = 0, ret2;
+
+ trace_ext4_ordered_write_end(inode, pos, len, copied);
+ ret = ext4_jbd2_file_inode(handle, inode);
+
+ if (ret == 0) {
+ ret2 = ext4_generic_write_end(file, mapping, pos, len, copied,
+ page, fsdata);
+ copied = ret2;
+ if (pos + len > inode->i_size && ext4_can_truncate(inode))
+ /* if we have allocated more blocks and copied
+ * less. We will have blocks allocated outside
+ * inode->i_size. So truncate them
+ */
+ ext4_orphan_add(handle, inode);
+ if (ret2 < 0)
+ ret = ret2;
+ } else {
+ unlock_page(page); /* the generic write_end would have done this */
+ page_cache_release(page);
+ }
+
+ ret2 = ext4_journal_stop(handle);
+ if (!ret) /* keep the first error */
+ ret = ret2;
+
+ if (pos + len > inode->i_size) {
+ ext4_truncate_failed_write(inode);
+ /*
+ * If truncate failed early the inode might still be
+ * on the orphan list; we need to make sure the inode
+ * is removed from the orphan list in that case.
+ */
+ if (inode->i_nlink)
+ ext4_orphan_del(NULL, inode);
+ }
+
+
+ return ret ? ret : copied;
+}
+
+/*
+ * write_end variant that goes straight to ext4_generic_write_end() without
+ * filing the inode with the journal first.  The current handle is stopped
+ * and any blocks left beyond i_size are trimmed afterwards.
+ *
+ * Returns the number of bytes copied, or a negative error.
+ */
+static int ext4_writeback_write_end(struct file *file,
+				    struct address_space *mapping,
+				    loff_t pos, unsigned len, unsigned copied,
+				    struct page *page, void *fsdata)
+{
+	handle_t *handle = ext4_journal_current_handle();
+	struct inode *inode = mapping->host;
+	int written, stop_err, ret;
+
+	trace_ext4_writeback_write_end(inode, pos, len, copied);
+
+	written = ext4_generic_write_end(file, mapping, pos, len, copied,
+					 page, fsdata);
+	copied = written;
+	ret = written < 0 ? written : 0;
+
+	/*
+	 * More blocks may have been allocated than data was copied, leaving
+	 * blocks outside inode->i_size.  Put the inode on the orphan list so
+	 * the truncate below survives a crash.
+	 */
+	if (pos + len > inode->i_size && ext4_can_truncate(inode))
+		ext4_orphan_add(handle, inode);
+
+	stop_err = ext4_journal_stop(handle);
+	if (ret == 0)
+		ret = stop_err;
+
+	if (pos + len > inode->i_size) {
+		ext4_truncate_failed_write(inode);
+		/*
+		 * If truncate failed early the inode might still be on the
+		 * orphan list; make sure it is removed from there in that
+		 * case.
+		 */
+		if (inode->i_nlink)
+			ext4_orphan_del(NULL, inode);
+	}
+
+	if (ret)
+		return ret;
+	return copied;
+}
+
+/*
+ * write_end variant that journals the data: every buffer touched by the
+ * write is dirtied as journalled metadata through write_end_fn(), the inode
+ * is flagged EXT4_STATE_JDATA, and the transaction id is remembered in
+ * i_datasync_tid.  Requires a valid journal handle.
+ *
+ * Returns the number of bytes copied, or a negative error.
+ */
+static int ext4_journalled_write_end(struct file *file,
+				     struct address_space *mapping,
+				     loff_t pos, unsigned len, unsigned copied,
+				     struct page *page, void *fsdata)
+{
+	handle_t *handle = ext4_journal_current_handle();
+	struct inode *inode = mapping->host;
+	struct ext4_inode_info *ei = EXT4_I(inode);
+	unsigned from = pos & (PAGE_CACHE_SIZE - 1);
+	unsigned to = from + len;
+	int partial = 0;
+	int ret, err;
+	loff_t new_i_size;
+
+	trace_ext4_journalled_write_end(inode, pos, len, copied);
+
+	BUG_ON(!ext4_handle_valid(handle));
+
+	/* Short copy: zero the new buffers that did not receive data. */
+	if (copied < len) {
+		if (!PageUptodate(page))
+			copied = 0;
+		page_zero_new_buffers(page, from + copied, to);
+	}
+
+	ret = walk_page_buffers(handle, page_buffers(page), from,
+				to, &partial, write_end_fn);
+	if (!partial)
+		SetPageUptodate(page);
+
+	new_i_size = pos + copied;
+	if (new_i_size > inode->i_size)
+		i_size_write(inode, new_i_size);
+	ext4_set_inode_state(inode, EXT4_STATE_JDATA);
+	ei->i_datasync_tid = handle->h_transaction->t_tid;
+	if (new_i_size > ei->i_disksize) {
+		ext4_update_i_disksize(inode, new_i_size);
+		err = ext4_mark_inode_dirty(handle, inode);
+		if (!ret)
+			ret = err;
+	}
+
+	unlock_page(page);
+	page_cache_release(page);
+
+	/*
+	 * More blocks may have been allocated than data was copied, leaving
+	 * blocks outside inode->i_size.  Put the inode on the orphan list so
+	 * the truncate below survives a crash.
+	 */
+	if (pos + len > inode->i_size && ext4_can_truncate(inode))
+		ext4_orphan_add(handle, inode);
+
+	err = ext4_journal_stop(handle);
+	if (!ret)
+		ret = err;
+
+	if (pos + len > inode->i_size) {
+		ext4_truncate_failed_write(inode);
+		/*
+		 * If truncate failed early the inode might still be on the
+		 * orphan list; make sure it is removed from there in that
+		 * case.
+		 */
+		if (inode->i_nlink)
+			ext4_orphan_del(NULL, inode);
+	}
+
+	return ret ? ret : copied;
+}
+
+/*
+ * Reserve a single cluster located at lblock
+ *
+ * Reserves quota for one cluster of data, then claims that cluster plus
+ * the estimated metadata from the filesystem's free clusters, and finally
+ * records both in the inode's reservation counters.
+ *
+ * Returns 0 on success, the error from dquot_reserve_block(), or -ENOSPC
+ * when the clusters cannot be claimed and ext4_should_retry_alloc()
+ * declines another attempt.
+ */
+static int ext4_da_reserve_space(struct inode *inode, ext4_lblk_t lblock)
+{
+ int retries = 0;
+ struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
+ struct ext4_inode_info *ei = EXT4_I(inode);
+ unsigned int md_needed;
+ int ret;
+
+ /*
+ * recalculate the amount of metadata blocks to reserve
+ * in order to allocate the block at lblock
+ * worst case is one extent per block
+ */
+repeat:
+ spin_lock(&ei->i_block_reservation_lock);
+ md_needed = EXT4_NUM_B2C(sbi,
+ ext4_calc_metadata_amount(inode, lblock));
+ trace_ext4_da_reserve_space(inode, md_needed);
+ spin_unlock(&ei->i_block_reservation_lock);
+
+ /*
+ * We will charge metadata quota at writeout time; this saves
+ * us from metadata over-estimation, though we may go over by
+ * a small amount in the end. Here we just reserve for data.
+ */
+ ret = dquot_reserve_block(inode, EXT4_C2B(sbi, 1));
+ if (ret)
+ return ret;
+ /*
+ * We do still charge estimated metadata to the sb though;
+ * we cannot afford to run out of free blocks.
+ */
+ if (ext4_claim_free_clusters(sbi, md_needed + 1, 0)) {
+ dquot_release_reservation_block(inode, EXT4_C2B(sbi, 1)); /* undo the quota reservation made above */
+ if (ext4_should_retry_alloc(inode->i_sb, &retries)) {
+ yield();
+ goto repeat; /* md_needed is recomputed on each attempt */
+ }
+ return -ENOSPC;
+ }
+ spin_lock(&ei->i_block_reservation_lock);
+ ei->i_reserved_data_blocks++;
+ ei->i_reserved_meta_blocks += md_needed;
+ spin_unlock(&ei->i_block_reservation_lock);
+
+ return 0; /* success */
+}
+
+/*
+ * ext4_da_release_space - give back delayed-allocation reservations
+ * @inode:   inode whose reservation is reduced
+ * @to_free: number of reserved data blocks to release (clusters in the
+ *           bigalloc case, see below); 0 is a no-op
+ *
+ * If @to_free exceeds what is reserved, a notice and a warning are emitted
+ * and only the reserved amount is released. Once no data reservation is
+ * left, the whole metadata reservation is dropped as well.
+ */
+static void ext4_da_release_space(struct inode *inode, int to_free)
+{
+	struct ext4_inode_info *ei = EXT4_I(inode);
+	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
+
+	if (to_free == 0)
+		return;
+
+	spin_lock(&ei->i_block_reservation_lock);
+	trace_ext4_da_release_space(inode, to_free);
+
+	if (unlikely(to_free > ei->i_reserved_data_blocks)) {
+		/*
+		 * Not enough reserved blocks means the counter got messed
+		 * up somewhere. This function is called from invalidate
+		 * page, so clamping and carrying on is harmless.
+		 */
+		ext4_msg(inode->i_sb, KERN_NOTICE, "ext4_da_release_space: "
+			 "ino %lu, to_free %d with only %d reserved "
+			 "data blocks", inode->i_ino, to_free,
+			 ei->i_reserved_data_blocks);
+		WARN_ON(1);
+		to_free = ei->i_reserved_data_blocks;
+	}
+	ei->i_reserved_data_blocks -= to_free;
+
+	if (!ei->i_reserved_data_blocks) {
+		/*
+		 * All delayed allocation blocks have been written, so the
+		 * reserved metadata blocks can be released in one go.
+		 * With bigalloc, i_reserved_meta_blocks,
+		 * i_reserved_data_blocks, etc. count clusters.
+		 */
+		percpu_counter_sub(&sbi->s_dirtyclusters_counter,
+				   ei->i_reserved_meta_blocks);
+		ei->i_reserved_meta_blocks = 0;
+		ei->i_da_metadata_calc_len = 0;
+	}
+
+	/* account the released data in the fs dirty counter */
+	percpu_counter_sub(&sbi->s_dirtyclusters_counter, to_free);
+
+	spin_unlock(&ei->i_block_reservation_lock);
+
+	dquot_release_reservation_block(inode, EXT4_C2B(sbi, to_free));
+}
+
+/*
+ * ext4_da_page_release_reservation - release delalloc reservations of a page
+ * @page:   page whose buffers are examined
+ * @offset: byte offset within the page; only buffers starting at or after
+ *          this offset are considered
+ *
+ * Clears the delay and da_mapped bits of every delayed buffer at or past
+ * @offset, then releases one reserved cluster per affected cluster unless
+ * ext4_find_delalloc_cluster() reports the cluster is still in use.
+ */
+static void ext4_da_page_release_reservation(struct page *page,
+					     unsigned long offset)
+{
+	struct inode *inode = page->mapping->host;
+	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
+	struct buffer_head *first, *bh;
+	unsigned int curr_off = 0;
+	int to_release = 0;
+	int num_clusters;
+
+	first = page_buffers(page);
+	bh = first;
+	do {
+		if (offset <= curr_off && buffer_delay(bh)) {
+			to_release++;
+			clear_buffer_delay(bh);
+			clear_buffer_da_mapped(bh);
+		}
+		curr_off += bh->b_size;
+		bh = bh->b_this_page;
+	} while (bh != first);
+
+	/*
+	 * Once all the blocks belonging to a cluster have been released,
+	 * the space reserved for that cluster has to be released too.
+	 */
+	for (num_clusters = EXT4_NUM_B2C(sbi, to_release);
+	     num_clusters > 0; num_clusters--) {
+		ext4_fsblk_t lblk;
+
+		lblk = (page->index << (PAGE_CACHE_SHIFT - inode->i_blkbits)) +
+			((num_clusters - 1) << sbi->s_cluster_bits);
+		if (sbi->s_cluster_ratio == 1 ||
+		    !ext4_find_delalloc_cluster(inode, lblk, 1))
+			ext4_da_release_space(inode, 1);
+	}
+}
+
+/*
+ * Delayed allocation stuff
+ */
+
+/*
+ * mpage_da_submit_io - walks through extent of pages and try to write
+ * them with writepage() call back
+ *
+ * @mpd->inode: inode
+ * @mpd->first_page: first page of the extent
+ * @mpd->next_page: page after the last page of the extent
+ * @map: block mapping to apply to the buffers of the extent, or NULL when
+ * there is no new mapping to apply
+ *
+ * By the time mpage_da_submit_io() is called we expect all blocks
+ * to be allocated. this may be wrong if allocation failed.
+ *
+ * As pages are already locked by write_cache_pages(), we can't use it
+ *
+ * A page is unlocked and skipped when its buffers cannot be created or
+ * when one of its dirty buffers is still delayed or unwritten.
+ *
+ * Returns 0, or the first non-zero value returned by a page write call.
+ */
+static int mpage_da_submit_io(struct mpage_da_data *mpd,
+ struct ext4_map_blocks *map)
+{
+ struct pagevec pvec;
+ unsigned long index, end;
+ int ret = 0, err, nr_pages, i;
+ struct inode *inode = mpd->inode;
+ struct address_space *mapping = inode->i_mapping;
+ loff_t size = i_size_read(inode);
+ unsigned int len, block_start;
+ struct buffer_head *bh, *page_bufs = NULL;
+ int journal_data = ext4_should_journal_data(inode);
+ sector_t pblock = 0, cur_logical = 0;
+ struct ext4_io_submit io_submit;
+
+ BUG_ON(mpd->next_page <= mpd->first_page);
+ memset(&io_submit, 0, sizeof(io_submit));
+ /*
+ * We need to start from the first_page to the next_page - 1
+ * to make sure we also write the mapped dirty buffer_heads.
+ * If we look at mpd->b_blocknr we would only be looking
+ * at the currently mapped buffer_heads.
+ */
+ index = mpd->first_page;
+ end = mpd->next_page - 1;
+
+ pagevec_init(&pvec, 0);
+ while (index <= end) {
+ nr_pages = pagevec_lookup(&pvec, mapping, index, PAGEVEC_SIZE);
+ if (nr_pages == 0)
+ break;
+ for (i = 0; i < nr_pages; i++) {
+ int commit_write = 0, skip_page = 0;
+ struct page *page = pvec.pages[i];
+
+ index = page->index;
+ if (index > end)
+ break;
+
+ /* the page containing i_size is only written up to i_size */
+ if (index == size >> PAGE_CACHE_SHIFT)
+ len = size & ~PAGE_CACHE_MASK;
+ else
+ len = PAGE_CACHE_SIZE;
+ if (map) {
+ /* first logical block of this page and its physical block */
+ cur_logical = index << (PAGE_CACHE_SHIFT -
+ inode->i_blkbits);
+ pblock = map->m_pblk + (cur_logical -
+ map->m_lblk);
+ }
+ index++;
+
+ BUG_ON(!PageLocked(page));
+ BUG_ON(PageWriteback(page));
+
+ /*
+ * If the page does not have buffers (for
+ * whatever reason), try to create them using
+ * __block_write_begin. If this fails,
+ * skip the page and move on.
+ */
+ if (!page_has_buffers(page)) {
+ if (__block_write_begin(page, 0, len,
+ noalloc_get_block_write)) {
+ skip_page:
+ unlock_page(page);
+ continue;
+ }
+ commit_write = 1;
+ }
+
+ bh = page_bufs = page_buffers(page);
+ block_start = 0;
+ do {
+ if (!bh)
+ goto skip_page;
+ /* only buffers inside the mapped range take the new mapping */
+ if (map && (cur_logical >= map->m_lblk) &&
+ (cur_logical <= (map->m_lblk +
+ (map->m_len - 1)))) {
+ if (buffer_delay(bh)) {
+ clear_buffer_delay(bh);
+ bh->b_blocknr = pblock;
+ }
+ if (buffer_da_mapped(bh))
+ clear_buffer_da_mapped(bh);
+ if (buffer_unwritten(bh) ||
+ buffer_mapped(bh))
+ BUG_ON(bh->b_blocknr != pblock);
+ if (map->m_flags & EXT4_MAP_UNINIT)
+ set_buffer_uninit(bh);
+ clear_buffer_unwritten(bh);
+ }
+
+ /*
+ * skip page if block allocation undone and
+ * block is dirty
+ */
+ if (ext4_bh_delay_or_unwritten(NULL, bh))
+ skip_page = 1;
+ bh = bh->b_this_page;
+ /* NOTE(review): block_start is accumulated but never read here */
+ block_start += bh->b_size;
+ cur_logical++;
+ pblock++;
+ } while (bh != page_bufs);
+
+ if (skip_page)
+ goto skip_page;
+
+ if (commit_write)
+ /* mark the buffer_heads as dirty & uptodate */
+ block_commit_write(page, 0, len);
+
+ clear_page_dirty_for_io(page);
+ /*
+ * Delalloc doesn't support data journalling,
+ * but eventually maybe we'll lift this
+ * restriction.
+ */
+ if (unlikely(journal_data && PageChecked(page)))
+ err = __ext4_journalled_writepage(page, len);
+ else if (test_opt(inode->i_sb, MBLK_IO_SUBMIT))
+ err = ext4_bio_write_page(&io_submit, page,
+ len, mpd->wbc);
+ else if (buffer_uninit(page_bufs)) {
+ ext4_set_bh_endio(page_bufs, inode);
+ err = block_write_full_page_endio(page,
+ noalloc_get_block_write,
+ mpd->wbc, ext4_end_io_buffer_write);
+ } else
+ err = block_write_full_page(page,
+ noalloc_get_block_write, mpd->wbc);
+
+ if (!err)
+ mpd->pages_written++;
+ /*
+ * In error case, we have to continue because
+ * remaining pages are still locked
+ */
+ if (ret == 0)
+ ret = err;
+ }
+ pagevec_release(&pvec);
+ }
+ ext4_io_submit(&io_submit);
+ return ret;
+}
+
+/*
+ * ext4_da_block_invalidatepages - invalidate the pages of an extent
+ * @mpd: describes the page range [first_page, next_page - 1] to discard
+ *
+ * Every page found in the range must be locked and must not be under
+ * writeback (both enforced with BUG_ON). Each page is invalidated, marked
+ * not uptodate and unlocked.
+ */
+static void ext4_da_block_invalidatepages(struct mpage_da_data *mpd)
+{
+	int nr_pages, i;
+	pgoff_t index, end;
+	struct pagevec pvec;
+	struct inode *inode = mpd->inode;
+	struct address_space *mapping = inode->i_mapping;
+
+	index = mpd->first_page;
+	end   = mpd->next_page - 1;
+
+	/*
+	 * The pagevec lives on the stack: initialise it before the first
+	 * lookup so that pagevec_release() never sees uninitialised fields.
+	 */
+	pagevec_init(&pvec, 0);
+	while (index <= end) {
+		nr_pages = pagevec_lookup(&pvec, mapping, index, PAGEVEC_SIZE);
+		if (nr_pages == 0)
+			break;
+		for (i = 0; i < nr_pages; i++) {
+			struct page *page = pvec.pages[i];
+
+			if (page->index > end)
+				break;
+			BUG_ON(!PageLocked(page));
+			BUG_ON(PageWriteback(page));
+			block_invalidatepage(page, 0);
+			ClearPageUptodate(page);
+			unlock_page(page);
+		}
+		/* continue after the last page returned by this lookup */
+		index = pvec.pages[nr_pages - 1]->index + 1;
+		pagevec_release(&pvec);
+	}
+}
+
+/*
+ * ext4_print_free_blocks - log free/dirty block counters at KERN_CRIT
+ * @inode: inode whose superblock counters and reservations are reported
+ *
+ * Cluster counters are converted to blocks with EXT4_C2B() before being
+ * printed; the per-inode reservation counters are printed as stored.
+ */
+static void ext4_print_free_blocks(struct inode *inode)
+{
+	struct super_block *sb = inode->i_sb;
+	struct ext4_sb_info *sbi = EXT4_SB(sb);
+	struct ext4_inode_info *ei = EXT4_I(inode);
+
+	ext4_msg(sb, KERN_CRIT, "Total free blocks count %lld",
+		 EXT4_C2B(sbi, ext4_count_free_clusters(sb)));
+
+	ext4_msg(sb, KERN_CRIT, "Free/Dirty block details");
+	ext4_msg(sb, KERN_CRIT, "free_blocks=%lld",
+		 (long long) EXT4_C2B(sbi,
+			percpu_counter_sum(&sbi->s_freeclusters_counter)));
+	ext4_msg(sb, KERN_CRIT, "dirty_blocks=%lld",
+		 (long long) EXT4_C2B(sbi,
+			percpu_counter_sum(&sbi->s_dirtyclusters_counter)));
+
+	ext4_msg(sb, KERN_CRIT, "Block reservation details");
+	ext4_msg(sb, KERN_CRIT, "i_reserved_data_blocks=%u",
+		 ei->i_reserved_data_blocks);
+	ext4_msg(sb, KERN_CRIT, "i_reserved_meta_blocks=%u",
+		 ei->i_reserved_meta_blocks);
+}
+
+/*
+ * mpage_da_map_and_submit - go through given space, map them
+ * if necessary, and then submit them for I/O
+ *
+ * @mpd - extent accumulated so far (b_blocknr, b_size, b_state) together
+ * with the page range it covers
+ *
+ * The function skips space we know is already mapped to disk blocks.
+ *
+ * mpd->io_done is set to 1 on every return path. When mapping fails with
+ * anything other than -EAGAIN, or -ENOSPC while free clusters remain, the
+ * pages of the extent are invalidated instead of being submitted.
+ */
+static void mpage_da_map_and_submit(struct mpage_da_data *mpd)
+{
+ int err, blks, get_blocks_flags;
+ struct ext4_map_blocks map, *mapp = NULL;
+ sector_t next = mpd->b_blocknr;
+ unsigned max_blocks = mpd->b_size >> mpd->inode->i_blkbits;
+ loff_t disksize = EXT4_I(mpd->inode)->i_disksize;
+ handle_t *handle = NULL;
+
+ /*
+ * If the blocks are mapped already, or we couldn't accumulate
+ * any blocks, then proceed immediately to the submission stage.
+ */
+ if ((mpd->b_size == 0) ||
+ ((mpd->b_state & (1 << BH_Mapped)) &&
+ !(mpd->b_state & (1 << BH_Delay)) &&
+ !(mpd->b_state & (1 << BH_Unwritten))))
+ goto submit_io;
+
+ handle = ext4_journal_current_handle();
+ BUG_ON(!handle);
+
+ /*
+ * Call ext4_map_blocks() to allocate any delayed allocation
+ * blocks, or to convert an uninitialized extent to be
+ * initialized (in the case where we have written into
+ * one or more preallocated blocks).
+ *
+ * We pass in the magic EXT4_GET_BLOCKS_DELALLOC_RESERVE to
+ * indicate that we are on the delayed allocation path. This
+ * affects functions in many different parts of the allocation
+ * call path. This flag exists primarily because we don't
+ * want to change *many* call functions, so ext4_map_blocks()
+ * will set the EXT4_STATE_DELALLOC_RESERVED flag once the
+ * inode's allocation semaphore is taken.
+ *
+ * If the blocks in questions were delalloc blocks, set
+ * EXT4_GET_BLOCKS_DELALLOC_RESERVE so the delalloc accounting
+ * variables are updated after the blocks have been allocated.
+ */
+ map.m_lblk = next;
+ map.m_len = max_blocks;
+ get_blocks_flags = EXT4_GET_BLOCKS_CREATE;
+ if (ext4_should_dioread_nolock(mpd->inode))
+ get_blocks_flags |= EXT4_GET_BLOCKS_IO_CREATE_EXT;
+ if (mpd->b_state & (1 << BH_Delay))
+ get_blocks_flags |= EXT4_GET_BLOCKS_DELALLOC_RESERVE;
+
+ blks = ext4_map_blocks(handle, mpd->inode, &map, get_blocks_flags);
+ if (blks < 0) {
+ struct super_block *sb = mpd->inode->i_sb;
+
+ err = blks;
+ /*
+ * If get block returns EAGAIN or ENOSPC and there
+ * appears to be free blocks we will just let
+ * mpage_da_submit_io() unlock all of the pages.
+ */
+ if (err == -EAGAIN)
+ goto submit_io;
+
+ if (err == -ENOSPC && ext4_count_free_clusters(sb)) {
+ mpd->retval = err;
+ goto submit_io;
+ }
+
+ /*
+ * get block failure will cause us to loop in
+ * writepages, because a_ops->writepage won't be able
+ * to make progress. The page will be redirtied by
+ * writepage and writepages will again try to write
+ * the same.
+ */
+ if (!(EXT4_SB(sb)->s_mount_flags & EXT4_MF_FS_ABORTED)) {
+ ext4_msg(sb, KERN_CRIT,
+ "delayed block allocation failed for inode %lu "
+ "at logical offset %llu with max blocks %zd "
+ "with error %d", mpd->inode->i_ino,
+ (unsigned long long) next,
+ mpd->b_size >> mpd->inode->i_blkbits, err);
+ ext4_msg(sb, KERN_CRIT,
+ "This should not happen!! Data will be lost\n");
+ if (err == -ENOSPC)
+ ext4_print_free_blocks(mpd->inode);
+ }
+ /* invalidate all the pages */
+ ext4_da_block_invalidatepages(mpd);
+
+ /* Mark this page range as having been completed */
+ mpd->io_done = 1;
+ return;
+ }
+ BUG_ON(blks == 0);
+
+ /* mapping succeeded: hand it to mpage_da_submit_io() below */
+ mapp = &map;
+ if (map.m_flags & EXT4_MAP_NEW) {
+ struct block_device *bdev = mpd->inode->i_sb->s_bdev;
+ int i;
+
+ for (i = 0; i < map.m_len; i++)
+ unmap_underlying_metadata(bdev, map.m_pblk + i);
+
+ if (ext4_should_order_data(mpd->inode)) {
+ err = ext4_jbd2_file_inode(handle, mpd->inode);
+ if (err) {
+ /* Only if the journal is aborted */
+ mpd->retval = err;
+ goto submit_io;
+ }
+ }
+ }
+
+ /*
+ * Update on-disk size along with block allocation.
+ */
+ disksize = ((loff_t) next + blks) << mpd->inode->i_blkbits;
+ if (disksize > i_size_read(mpd->inode))
+ disksize = i_size_read(mpd->inode);
+ if (disksize > EXT4_I(mpd->inode)->i_disksize) {
+ ext4_update_i_disksize(mpd->inode, disksize);
+ err = ext4_mark_inode_dirty(handle, mpd->inode);
+ if (err)
+ ext4_error(mpd->inode->i_sb,
+ "Failed to mark inode %lu dirty",
+ mpd->inode->i_ino);
+ }
+
+submit_io:
+ /* mapp is NULL here unless ext4_map_blocks() succeeded above */
+ mpage_da_submit_io(mpd, mapp);
+ mpd->io_done = 1;
+}
+
+/*
+ * Buffer state bits kept in mpd->b_state and compared when deciding
+ * whether a block can be merged into the current extent.
+ */
+#define BH_FLAGS ((1 << BH_Uptodate) | (1 << BH_Mapped) | \
+ (1 << BH_Delay) | (1 << BH_Unwritten))
+
+/*
+ * mpage_add_bh_to_extent - try to add one more block to extent of blocks
+ *
+ * @mpd:     accumulates the extent in b_blocknr, b_size and b_state
+ * @logical: logical number of the block in the file
+ * @b_size:  size in bytes of the range being added
+ * @b_state: buffer state bits of the range; only BH_FLAGS are kept/compared
+ *
+ * Collects contiguous blocks that are in the same state. When the range
+ * cannot be merged, or a size limit has been reached, the accumulated
+ * extent is flushed through mpage_da_map_and_submit().
+ */
+static void mpage_add_bh_to_extent(struct mpage_da_data *mpd,
+				   sector_t logical, size_t b_size,
+				   unsigned long b_state)
+{
+	struct inode *inode = mpd->inode;
+	int nrblocks = mpd->b_size >> inode->i_blkbits;
+	sector_t next;
+
+	/*
+	 * XXX Don't go larger than mballoc is willing to allocate.
+	 * This is a stopgap solution. We eventually need to fold
+	 * mpage_da_submit_io() into this function and then call
+	 * ext4_map_blocks() multiple times in a loop.
+	 */
+	if (nrblocks >= 8*1024*1024/inode->i_sb->s_blocksize) {
+		mpage_da_map_and_submit(mpd);
+		return;
+	}
+
+	/* check if the reserved journal credits might overflow */
+	if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) {
+		/*
+		 * With non-extent format we are limited by the journal
+		 * credit available. The credit needed to insert nrblocks
+		 * contiguous blocks depends on nrblocks, so cap it.
+		 */
+		if (nrblocks >= EXT4_MAX_TRANS_DATA) {
+			mpage_da_map_and_submit(mpd);
+			return;
+		}
+
+		/*
+		 * Adding the whole range would cross the limit for which
+		 * journal credit is reserved, so trim the new b_size; the
+		 * flush then happens on the next call.
+		 */
+		if ((nrblocks + (b_size >> inode->i_blkbits)) >
+		    EXT4_MAX_TRANS_DATA)
+			b_size = (EXT4_MAX_TRANS_DATA - nrblocks) <<
+				 inode->i_blkbits;
+	}
+
+	/* First block in the extent: just record it. */
+	if (mpd->b_size == 0) {
+		mpd->b_blocknr = logical;
+		mpd->b_size = b_size;
+		mpd->b_state = b_state & BH_FLAGS;
+		return;
+	}
+
+	/* Contiguous and in the same state: merge into the extent. */
+	next = mpd->b_blocknr + nrblocks;
+	if (logical == next && (b_state & BH_FLAGS) == mpd->b_state) {
+		mpd->b_size += b_size;
+		return;
+	}
+
+	/*
+	 * The block could not be merged, so the current extent has to be
+	 * flushed before a new one can be started.
+	 */
+	mpage_da_map_and_submit(mpd);
+}
+
+/*
+ * Returns non-zero when @bh is dirty and is either delayed or unwritten.
+ * @handle is not used; the signature matches the callbacks passed to
+ * walk_page_buffers().
+ */
+static int ext4_bh_delay_or_unwritten(handle_t *handle, struct buffer_head *bh)
+{
+	if (!buffer_dirty(bh))
+		return 0;
+
+	return buffer_delay(bh) || buffer_unwritten(bh);
+}
+
+/*
+ * ext4_da_map_blocks - look up a block at delayed-write time
+ * @inode:  inode being written
+ * @iblock: logical block, used for the space reservation
+ * @map:    lookup request; m_flags is reset here before the lookup
+ * @bh:     buffer head that is marked mapped/new/delay when nothing is found
+ *
+ * Performs the lookup done at the very beginning of ext4_map_blocks(), but
+ * assumes the caller is on the delayed write path. The lookup and the
+ * setting of the buffer delay bit both happen under i_data_sem (read).
+ *
+ * Returns the lookup result. When that result is 0, space is reserved
+ * (unless the block comes from an already allocated cluster) and the return
+ * value is 0 or the error from ext4_da_reserve_space().
+ */
+static int ext4_da_map_blocks(struct inode *inode, sector_t iblock,
+			      struct ext4_map_blocks *map,
+			      struct buffer_head *bh)
+{
+	struct ext4_inode_info *ei = EXT4_I(inode);
+	sector_t invalid_block = ~((sector_t) 0xffff);
+	int retval;
+
+	if (invalid_block < ext4_blocks_count(EXT4_SB(inode->i_sb)->s_es))
+		invalid_block = ~0;
+
+	map->m_flags = 0;
+	ext_debug("ext4_da_map_blocks(): inode %lu, max_blocks %u,"
+		  "logical block %lu\n", inode->i_ino, map->m_len,
+		  (unsigned long) map->m_lblk);
+
+	/*
+	 * First see whether the block can be had without requesting a new
+	 * file system block.
+	 */
+	down_read(&ei->i_data_sem);
+	retval = ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS) ?
+		ext4_ext_map_blocks(NULL, inode, map, 0) :
+		ext4_ind_map_blocks(NULL, inode, map, 0);
+	if (retval != 0)
+		goto out_unlock;
+
+	/*
+	 * XXX: __block_prepare_write() unmaps passed block,
+	 * is it OK?
+	 */
+
+	/*
+	 * A block coming from a previously allocated cluster does not need
+	 * to be reserved again.
+	 */
+	if (!(map->m_flags & EXT4_MAP_FROM_CLUSTER)) {
+		retval = ext4_da_reserve_space(inode, iblock);
+		if (retval)
+			goto out_unlock;	/* not enough space to reserve */
+	}
+
+	/*
+	 * EXT4_MAP_FROM_CLUSTER has served its purpose and must not show up
+	 * in bh->b_state.
+	 */
+	map->m_flags &= ~EXT4_MAP_FROM_CLUSTER;
+
+	map_bh(bh, inode->i_sb, invalid_block);
+	set_buffer_new(bh);
+	set_buffer_delay(bh);
+
+out_unlock:
+	up_read(&ei->i_data_sem);
+
+	return retval;
+}
+
+/*
+ * ext4_da_get_block_prep - get_block callback used by ext4_da_write_begin()
+ * @inode:  inode being written
+ * @iblock: logical block to look up
+ * @bh:     buffer head to fill in; must describe exactly one block
+ * @create: must be non-zero
+ *
+ * Either returns a mapped block or reserves space for a single block.
+ *
+ * For delayed buffer_head we have BH_Mapped, BH_New, BH_Delay set.
+ * We also have b_blocknr = -1 and b_bdev initialized properly.
+ *
+ * For unwritten buffer_head we have BH_Mapped, BH_New, BH_Unwritten set.
+ * We also have b_blocknr = physicalblock mapping unwritten extent and b_bdev
+ * initialized properly.
+ *
+ * Returns 0 on success or the non-positive result of ext4_da_map_blocks().
+ */
+static int ext4_da_get_block_prep(struct inode *inode, sector_t iblock,
+				  struct buffer_head *bh, int create)
+{
+	struct ext4_map_blocks map;
+	int ret;
+
+	BUG_ON(create == 0);
+	BUG_ON(bh->b_size != inode->i_sb->s_blocksize);
+
+	map.m_lblk = iblock;
+	map.m_len = 1;
+
+	/*
+	 * Find out whether the block is allocated already. Preallocated
+	 * blocks are unmapped but should be treated the same as allocated
+	 * blocks.
+	 */
+	ret = ext4_da_map_blocks(inode, iblock, &map, bh);
+	if (ret <= 0)
+		return ret;
+
+	map_bh(bh, inode->i_sb, map.m_pblk);
+	bh->b_state = (bh->b_state & ~EXT4_MAP_FLAGS) | map.m_flags;
+
+	if (!buffer_unwritten(bh))
+		return 0;
+
+	/*
+	 * A delayed write to an unwritten bh is marked new and mapped.
+	 * Mapped avoids calling get_block again for the same offset; new
+	 * makes sure a partial write gets properly zeroed out.
+	 */
+	set_buffer_new(bh);
+	set_buffer_mapped(bh);
+	return 0;
+}
+
+/*
+ * This function is used as a standard get_block_t callback function
+ * when there is no desire to allocate any blocks. It is used as a
+ * callback function for block_write_begin() and block_write_full_page().
+ * These functions should only try to map a single block at a time.
+ *
+ * Since this function doesn't do block allocations even if the caller
+ * requests it by passing in create=1, it is critically important that
+ * any caller checks to make sure that any buffer heads are returned
+ * by this function are either all already mapped or marked for
+ * delayed allocation before calling block_write_full_page(). Otherwise,
+ * b_blocknr could be left uninitialized, and the page write functions will
+ * be taken by surprise.
+ */
+static int noalloc_get_block_write(struct inode *inode, sector_t iblock,
+ struct buffer_head *bh_result, int create)
+{
+ BUG_ON(bh_result->b_size != inode->i_sb->s_blocksize);
+ /* @create is deliberately ignored: 0 is always passed down */
+ return _ext4_get_block(inode, iblock, bh_result, 0);
+}
+
+/*
+ * walk_page_buffers() callback: take a reference on @bh.
+ * @handle is unused. Always returns 0.
+ */
+static int bget_one(handle_t *handle, struct buffer_head *bh)
+{
+ get_bh(bh);
+ return 0;
+}
+
+/*
+ * walk_page_buffers() callback: drop a reference on @bh.
+ * @handle is unused. Always returns 0.
+ */
+static int bput_one(handle_t *handle, struct buffer_head *bh)
+{
+ put_bh(bh);
+ return 0;
+}
+
+/*
+ * __ext4_journalled_writepage - write a page's buffers through the journal
+ * @page: locked page with buffers attached; it is unlocked by this function
+ * @len:  number of bytes of the page, starting at offset 0, to journal
+ *
+ * A reference is taken on every buffer in [0, @len) before the page is
+ * unlocked, because the page may go away once it is unlocked. Those
+ * references are dropped again on every return path, including the one
+ * where starting the transaction fails.
+ *
+ * Returns 0 on success, otherwise the first error reported by
+ * ext4_journal_start(), the buffer walks or ext4_journal_stop().
+ */
+static int __ext4_journalled_writepage(struct page *page,
+				       unsigned int len)
+{
+	struct address_space *mapping = page->mapping;
+	struct inode *inode = mapping->host;
+	struct buffer_head *page_bufs;
+	handle_t *handle = NULL;
+	int ret = 0;
+	int err;
+
+	ClearPageChecked(page);
+	page_bufs = page_buffers(page);
+	BUG_ON(!page_bufs);
+	walk_page_buffers(handle, page_bufs, 0, len, NULL, bget_one);
+	/* As soon as we unlock the page, it can go away, but we have
+	 * references to buffers so we are safe */
+	unlock_page(page);
+
+	handle = ext4_journal_start(inode, ext4_writepage_trans_blocks(inode));
+	if (IS_ERR(handle)) {
+		ret = PTR_ERR(handle);
+		/* still have to drop the buffer references taken above */
+		goto out_put;
+	}
+
+	BUG_ON(!ext4_handle_valid(handle));
+
+	ret = walk_page_buffers(handle, page_bufs, 0, len, NULL,
+				do_journal_get_write_access);
+
+	err = walk_page_buffers(handle, page_bufs, 0, len, NULL,
+				write_end_fn);
+	if (ret == 0)
+		ret = err;
+	EXT4_I(inode)->i_datasync_tid = handle->h_transaction->t_tid;
+	err = ext4_journal_stop(handle);
+	if (!ret)
+		ret = err;
+
+	ext4_set_inode_state(inode, EXT4_STATE_JDATA);
+out_put:
+	/* bput_one() ignores the handle, so none needs to be passed */
+	walk_page_buffers(NULL, page_bufs, 0, len, NULL, bput_one);
+	return ret;
+}
+
+/* Forward declarations; both are used by ext4_writepage() below. */
+static int ext4_set_bh_endio(struct buffer_head *bh, struct inode *inode);
+static void ext4_end_io_buffer_write(struct buffer_head *bh, int uptodate);
+
+/*
+ * Note that we don't need to start a transaction unless we're journaling data
+ * because we should have holes filled from ext4_page_mkwrite(). We even don't
+ * need to file the inode to the transaction's list in ordered mode because if
+ * we are writing back data added by write(), the inode is already there and if
+ * we are writing back data modified via mmap(), no one guarantees in which
+ * transaction the data will hit the disk. In case we are journaling data, we
+ * cannot start transaction directly because transaction start ranks above page
+ * lock so we have to do some magic.
+ *
+ * This function can get called via...
+ * - ext4_da_writepages after taking page lock (have journal handle)
+ * - journal_submit_inode_data_buffers (no journal handle)
+ * - shrink_page_list via pdflush (no journal handle)
+ * - grab_page_cache when doing write_begin (have journal handle)
+ *
+ * We don't do any block allocation in this function. If we have page with
+ * multiple blocks we need to write those buffer_heads that are mapped. This
+ * is important for mmaped based write. So if we do with blocksize 1K
+ * truncate(f, 1024);
+ * a = mmap(f, 0, 4096);
+ * a[0] = 'a';
+ * truncate(f, 4096);
+ * we have in the page first buffer_head mapped via page_mkwrite call back
+ * but other buffer_heads would be unmapped but dirty (dirty done via the
+ * do_wp_page). So writepage should write the first block. If we modify
+ * the mmap area beyond 1024 we will again get a page_fault and the
+ * page_mkwrite callback will do the block allocation and mark the
+ * buffer_heads mapped.
+ *
+ * We redirty the page if we have any buffer_heads that is either delay or
+ * unwritten in the page.
+ *
+ * We can get recursively called as shown below.
+ *
+ * ext4_writepage() -> kmalloc() -> __alloc_pages() -> page_launder() ->
+ * ext4_writepage()
+ *
+ * But since we don't do any block allocation we should not deadlock.
+ * The page also has the dirty flag cleared so we don't get recursive page_lock.
+ */
+static int ext4_writepage(struct page *page,
+ struct writeback_control *wbc)
+{
+ int ret = 0, commit_write = 0;
+ loff_t size;
+ unsigned int len;
+ struct buffer_head *page_bufs = NULL;
+ struct inode *inode = page->mapping->host;
+
+ trace_ext4_writepage(page);
+ size = i_size_read(inode);
+ /* the page containing i_size is only written up to i_size */
+ if (page->index == size >> PAGE_CACHE_SHIFT)
+ len = size & ~PAGE_CACHE_MASK;
+ else
+ len = PAGE_CACHE_SIZE;
+
+ /*
+ * If the page does not have buffers (for whatever reason),
+ * try to create them using __block_write_begin. If this
+ * fails, redirty the page and move on.
+ */
+ if (!page_has_buffers(page)) {
+ if (__block_write_begin(page, 0, len,
+ noalloc_get_block_write)) {
+ /* also reached by goto from the delay/unwritten check below */
+ redirty_page:
+ redirty_page_for_writepage(wbc, page);
+ unlock_page(page);
+ return 0;
+ }
+ commit_write = 1;
+ }
+ page_bufs = page_buffers(page);
+ if (walk_page_buffers(NULL, page_bufs, 0, len, NULL,
+ ext4_bh_delay_or_unwritten)) {
+ /*
+ * We don't want to do block allocation, so redirty
+ * the page and return. We may reach here when we do
+ * a journal commit via journal_submit_inode_data_buffers.
+ * We can also reach here via shrink_page_list but it
+ * should never be for direct reclaim so warn if that
+ * happens
+ */
+ WARN_ON_ONCE((current->flags & (PF_MEMALLOC|PF_KSWAPD)) ==
+ PF_MEMALLOC);
+ goto redirty_page;
+ }
+ if (commit_write)
+ /* now mark the buffer_heads as dirty and uptodate */
+ block_commit_write(page, 0, len);
+
+ if (PageChecked(page) && ext4_should_journal_data(inode))
+ /*
+ * It's mmapped pagecache. Add buffers and journal it. There
+ * doesn't seem much point in redirtying the page here.
+ */
+ return __ext4_journalled_writepage(page, len);
+
+ /* a first buffer flagged uninit gets the ext4 end_io handler */
+ if (buffer_uninit(page_bufs)) {
+ ext4_set_bh_endio(page_bufs, inode);
+ ret = block_write_full_page_endio(page, noalloc_get_block_write,
+ wbc, ext4_end_io_buffer_write);
+ } else
+ ret = block_write_full_page(page, noalloc_get_block_write,
+ wbc);
+
+ return ret;
+}
+
+/*
+ * ext4_da_writepages_trans_blocks - credits for one extent allocation
+ * @inode: inode being written back
+ *
+ * Called via ext4_da_writepages() to calculate the total number of credits
+ * to reserve so that a single extent allocation fits into a single
+ * transaction. ext4_da_writepages() will loop calling this before the
+ * block allocation.
+ */
+
+static int ext4_da_writepages_trans_blocks(struct inode *inode)
+{
+	int max_blocks = EXT4_I(inode)->i_reserved_data_blocks;
+
+	/*
+	 * With non-extent format the journal credit needed to insert
+	 * contiguous blocks depends on how many there are, so the count
+	 * is limited to a sane value.
+	 */
+	if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) {
+		if (max_blocks > EXT4_MAX_TRANS_DATA)
+			max_blocks = EXT4_MAX_TRANS_DATA;
+	}
+
+	return ext4_chunk_trans_blocks(inode, max_blocks);
+}
+
+/*
+ * write_cache_pages_da - walk the list of dirty pages of the given
+ * address space and accumulate pages that need writing, and call
+ * mpage_da_map_and_submit to map a single contiguous memory region
+ * and then write them.
+ *
+ * @mapping: address space to scan
+ * @wbc: writeback control; supplies the range, sync mode and nr_to_write
+ * @mpd: extent accumulator; zeroed and re-initialised on entry
+ * @done_index: set to the start index, then advanced to one past each
+ * page examined
+ *
+ * Returns MPAGE_DA_EXTENT_TAIL when an extent was handed to
+ * mpage_da_map_and_submit() before the scan finished, 0 otherwise.
+ */
+static int write_cache_pages_da(struct address_space *mapping,
+ struct writeback_control *wbc,
+ struct mpage_da_data *mpd,
+ pgoff_t *done_index)
+{
+ struct buffer_head *bh, *head;
+ struct inode *inode = mapping->host;
+ struct pagevec pvec;
+ unsigned int nr_pages;
+ sector_t logical;
+ pgoff_t index, end;
+ long nr_to_write = wbc->nr_to_write;
+ int i, tag, ret = 0;
+
+ memset(mpd, 0, sizeof(struct mpage_da_data));
+ mpd->wbc = wbc;
+ mpd->inode = inode;
+ pagevec_init(&pvec, 0);
+ index = wbc->range_start >> PAGE_CACHE_SHIFT;
+ end = wbc->range_end >> PAGE_CACHE_SHIFT;
+
+ if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages)
+ tag = PAGECACHE_TAG_TOWRITE;
+ else
+ tag = PAGECACHE_TAG_DIRTY;
+
+ *done_index = index;
+ while (index <= end) {
+ nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, tag,
+ min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1);
+ if (nr_pages == 0)
+ return 0;
+
+ for (i = 0; i < nr_pages; i++) {
+ struct page *page = pvec.pages[i];
+
+ /*
+ * At this point, the page may be truncated or
+ * invalidated (changing page->mapping to NULL), or
+ * even swizzled back from swapper_space to tmpfs file
+ * mapping. However, page->index will not change
+ * because we have a reference on the page.
+ */
+ if (page->index > end)
+ goto out;
+
+ *done_index = page->index + 1;
+
+ /*
+ * If we can't merge this page, and we have
+ * accumulated an contiguous region, write it
+ */
+ if ((mpd->next_page != page->index) &&
+ (mpd->next_page != mpd->first_page)) {
+ mpage_da_map_and_submit(mpd);
+ goto ret_extent_tail;
+ }
+
+ lock_page(page);
+
+ /*
+ * If the page is no longer dirty, or its
+ * mapping no longer corresponds to inode we
+ * are writing (which means it has been
+ * truncated or invalidated), or the page is
+ * already under writeback and we are not
+ * doing a data integrity writeback, skip the page
+ */
+ if (!PageDirty(page) ||
+ (PageWriteback(page) &&
+ (wbc->sync_mode == WB_SYNC_NONE)) ||
+ unlikely(page->mapping != mapping)) {
+ unlock_page(page);
+ continue;
+ }
+
+ wait_on_page_writeback(page);
+ BUG_ON(PageWriteback(page));
+
+ /* start a new page range unless this page extends the current one */
+ if (mpd->next_page != page->index)
+ mpd->first_page = page->index;
+ mpd->next_page = page->index + 1;
+ logical = (sector_t) page->index <<
+ (PAGE_CACHE_SHIFT - inode->i_blkbits);
+
+ if (!page_has_buffers(page)) {
+ mpage_add_bh_to_extent(mpd, logical,
+ PAGE_CACHE_SIZE,
+ (1 << BH_Dirty) | (1 << BH_Uptodate));
+ if (mpd->io_done)
+ goto ret_extent_tail;
+ } else {
+ /*
+ * Page with regular buffer heads,
+ * just add all dirty ones
+ */
+ head = page_buffers(page);
+ bh = head;
+ do {
+ BUG_ON(buffer_locked(bh));
+ /*
+ * We need to try to allocate
+ * unmapped blocks in the same page.
+ * Otherwise we won't make progress
+ * with the page in ext4_writepage
+ */
+ if (ext4_bh_delay_or_unwritten(NULL, bh)) {
+ mpage_add_bh_to_extent(mpd, logical,
+ bh->b_size,
+ bh->b_state);
+ if (mpd->io_done)
+ goto ret_extent_tail;
+ } else if (buffer_dirty(bh) && (buffer_mapped(bh))) {
+ /*
+ * mapped dirty buffer. We need
+ * to update the b_state
+ * because we look at b_state
+ * in mpage_da_map_and_submit. We
+ * don't update b_size because
+ * if we find an unmapped
+ * buffer_head later we need to
+ * use the b_state flag of that
+ * buffer_head.
+ */
+ if (mpd->b_size == 0)
+ mpd->b_state = bh->b_state & BH_FLAGS;
+ }
+ logical++;
+ } while ((bh = bh->b_this_page) != head);
+ }
+
+ if (nr_to_write > 0) {
+ nr_to_write--;
+ if (nr_to_write == 0 &&
+ wbc->sync_mode == WB_SYNC_NONE)
+ /*
+ * We stop writing back only if we are
+ * not doing integrity sync. In case of
+ * integrity sync we have to keep going
+ * because someone may be concurrently
+ * dirtying pages, and we might have
+ * synced a lot of newly appeared dirty
+ * pages, but have not synced all of the
+ * old dirty pages.
+ */
+ goto out;
+ }
+ }
+ pagevec_release(&pvec);
+ cond_resched();
+ }
+ return 0;
+ret_extent_tail:
+ ret = MPAGE_DA_EXTENT_TAIL;
+out:
+ pagevec_release(&pvec);
+ cond_resched();
+ return ret;
+}
+
+/*
+ * ext4_da_writepages() - ->writepages implementation used by ext4_da_aops.
+ *
+ * @mapping: address space whose dirty pages should be written back
+ * @wbc: writeback control; nr_to_write, range_start, range_end and
+ * range_cyclic are modified while this function runs
+ *
+ * In a loop, one journal handle is started per iteration,
+ * write_cache_pages_da() is asked for the next extent of pages, and the
+ * extent is mapped and submitted with mpage_da_map_and_submit() if
+ * write_cache_pages_da() did not already do the I/O. The loop runs while
+ * ret is zero and wbc->nr_to_write is positive.
+ *
+ * wbc->nr_to_write may be raised on entry (see the comment on max_pages
+ * below); the amount added is subtracted again at out_writepages, and
+ * wbc->range_start is restored there as well.
+ *
+ * Returns 0 right away when the mapping has no pages or no dirty tag, and
+ * -EROFS when the filesystem carries EXT4_MF_FS_ABORTED. Otherwise returns
+ * the last value of ret: the error from ext4_journal_start(), mpd.retval
+ * after an extent was handled, or the value left by write_cache_pages_da().
+ *
+ * NOTE(review): wbc->range_end is overwritten in range_cyclic mode and in
+ * the wrap-around path but is never restored, and the ext4_journal_start()
+ * failure path jumps past the statement that restores wbc->range_cyclic.
+ * Confirm that callers do not depend on those fields afterwards.
+ */
+static int ext4_da_writepages(struct address_space *mapping,
+ struct writeback_control *wbc)
+{
+ pgoff_t index;
+ int range_whole = 0;
+ handle_t *handle = NULL;
+ struct mpage_da_data mpd;
+ struct inode *inode = mapping->host;
+ int pages_written = 0;
+ unsigned int max_pages;
+ int range_cyclic, cycled = 1, io_done = 0;
+ int needed_blocks, ret = 0;
+ long desired_nr_to_write, nr_to_writebump = 0; /* bump is undone at out_writepages */
+ loff_t range_start = wbc->range_start; /* saved for restore at out_writepages */
+ struct ext4_sb_info *sbi = EXT4_SB(mapping->host->i_sb);
+ pgoff_t done_index = 0;
+ pgoff_t end;
+ struct blk_plug plug;
+
+ trace_ext4_da_writepages(inode, wbc);
+
+ /*
+ * No pages to write? This is mainly a kludge to avoid starting
+ * a transaction for special inodes like journal inode on last iput()
+ * because that could violate lock ordering on umount
+ */
+ if (!mapping->nrpages || !mapping_tagged(mapping, PAGECACHE_TAG_DIRTY))
+ return 0;
+
+ /*
+ * If the filesystem has aborted, it is read-only, so return
+ * right away instead of dumping stack traces later on that
+ * will obscure the real source of the problem. We test
+ * EXT4_MF_FS_ABORTED instead of sb->s_flag's MS_RDONLY because
+ * the latter could be true if the filesystem is mounted
+ * read-only, and in that case, ext4_da_writepages should
+ * *never* be called, so if that ever happens, we would want
+ * the stack trace.
+ */
+ if (unlikely(sbi->s_mount_flags & EXT4_MF_FS_ABORTED))
+ return -EROFS;
+
+ if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX)
+ range_whole = 1;
+
+ range_cyclic = wbc->range_cyclic; /* remembered; wbc->range_cyclic is cleared below */
+ if (wbc->range_cyclic) {
+ index = mapping->writeback_index;
+ if (index)
+ cycled = 0; /* started mid-file: allow one wrap-around pass */
+ wbc->range_start = index << PAGE_CACHE_SHIFT;
+ wbc->range_end = LLONG_MAX;
+ wbc->range_cyclic = 0;
+ end = -1;
+ } else {
+ index = wbc->range_start >> PAGE_CACHE_SHIFT;
+ end = wbc->range_end >> PAGE_CACHE_SHIFT;
+ }
+
+ /*
+ * This works around two forms of stupidity. The first is in
+ * the writeback code, which caps the maximum number of pages
+ * written to be 1024 pages. This is wrong on multiple
+ * levels; different architectures have a different page size,
+ * which changes the maximum amount of data which gets
+ * written. Secondly, 4 megabytes is way too small. XFS
+ * forces this value to be 16 megabytes by multiplying
+ * nr_to_write parameter by four, and then relies on its
+ * allocator to allocate larger extents to make them
+ * contiguous. Unfortunately this brings us to the second
+ * stupidity, which is that ext4's mballoc code only allocates
+ * at most 2048 blocks. So we force contiguous writes up to
+ * the number of dirty blocks in the inode, or
+ * sbi->s_max_writeback_mb_bump whichever is smaller.
+ */
+ max_pages = sbi->s_max_writeback_mb_bump << (20 - PAGE_CACHE_SHIFT);
+ if (!range_cyclic && range_whole) {
+ if (wbc->nr_to_write == LONG_MAX)
+ desired_nr_to_write = wbc->nr_to_write;
+ else
+ desired_nr_to_write = wbc->nr_to_write * 8;
+ } else
+ desired_nr_to_write = ext4_num_dirty_pages(inode, index,
+ max_pages);
+ if (desired_nr_to_write > max_pages)
+ desired_nr_to_write = max_pages;
+
+ if (wbc->nr_to_write < desired_nr_to_write) {
+ nr_to_writebump = desired_nr_to_write - wbc->nr_to_write;
+ wbc->nr_to_write = desired_nr_to_write;
+ }
+
+retry:
+ if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages)
+ tag_pages_for_writeback(mapping, index, end); /* NOTE(review): end is not recomputed when we arrive here via goto retry */
+
+ blk_start_plug(&plug);
+ while (!ret && wbc->nr_to_write > 0) {
+
+ /*
+ * we insert one extent at a time. So we need
+ * credit needed for single extent allocation.
+ * journalled mode is currently not supported
+ * by delalloc
+ */
+ BUG_ON(ext4_should_journal_data(inode));
+ needed_blocks = ext4_da_writepages_trans_blocks(inode);
+
+ /* start a new transaction */
+ handle = ext4_journal_start(inode, needed_blocks);
+ if (IS_ERR(handle)) {
+ ret = PTR_ERR(handle);
+ ext4_msg(inode->i_sb, KERN_CRIT, "%s: jbd2_start: "
+ "%ld pages, ino %lu; err %d", __func__,
+ wbc->nr_to_write, inode->i_ino, ret);
+ blk_finish_plug(&plug);
+ goto out_writepages;
+ }
+
+ /*
+ * Now call write_cache_pages_da() to find the next
+ * contiguous region of logical blocks that need
+ * blocks to be allocated by ext4 and submit them.
+ */
+ ret = write_cache_pages_da(mapping, wbc, &mpd, &done_index);
+ /*
+ * If we have a contiguous extent of pages and we
+ * haven't done the I/O yet, map the blocks and submit
+ * them for I/O.
+ */
+ if (!mpd.io_done && mpd.next_page != mpd.first_page) {
+ mpage_da_map_and_submit(&mpd);
+ ret = MPAGE_DA_EXTENT_TAIL;
+ }
+ trace_ext4_da_write_pages(inode, &mpd);
+ wbc->nr_to_write -= mpd.pages_written;
+
+ ext4_journal_stop(handle);
+
+ if ((mpd.retval == -ENOSPC) && sbi->s_journal) {
+ /* commit the transaction which would
+ * free blocks released in the transaction
+ * and try again
+ */
+ jbd2_journal_force_commit_nested(sbi->s_journal);
+ ret = 0;
+ } else if (ret == MPAGE_DA_EXTENT_TAIL) {
+ /*
+ * Got one extent now try with rest of the pages.
+ * If mpd.retval is set -EIO, journal is aborted.
+ * So we don't need to write any more.
+ */
+ pages_written += mpd.pages_written;
+ ret = mpd.retval;
+ io_done = 1;
+ } else if (wbc->nr_to_write)
+ /*
+ * There is no more writeout needed
+ * or we requested for a non-blocking writeout
+ * and we found the device congested
+ */
+ break;
+ }
+ blk_finish_plug(&plug);
+ if (!io_done && !cycled) { /* nothing submitted from writeback_index on: wrap to page 0 once */
+ cycled = 1;
+ index = 0;
+ wbc->range_start = index << PAGE_CACHE_SHIFT;
+ wbc->range_end = mapping->writeback_index - 1; /* NOTE(review): a page index is stored here, but range_end is shifted by PAGE_CACHE_SHIFT above as a byte offset */
+ goto retry;
+ }
+
+ /* Update index */
+ wbc->range_cyclic = range_cyclic;
+ if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0))
+ /*
+ * set the writeback_index so that range_cyclic
+ * mode will write it back later
+ */
+ mapping->writeback_index = done_index;
+
+out_writepages:
+ wbc->nr_to_write -= nr_to_writebump;
+ wbc->range_start = range_start;
+ trace_ext4_da_writepages_result(inode, wbc, ret, pages_written);
+ return ret;
+}
+
+/* Stored in *fsdata by ext4_da_write_begin() when it used the non-delalloc path */
+#define FALL_BACK_TO_NONDELALLOC 1
+
+/*
+ * ext4_nonda_switch() - decide whether a write should bypass delayed
+ * allocation because free space is running low.
+ *
+ * @sb: super block of the filesystem being written to
+ *
+ * The free and dirty counters are approximate percpu counters (each CPU may
+ * hold up to percpu_counter_batch of unflushed updates), while delalloc
+ * needs accurate free space accounting. So delalloc is abandoned once the
+ * free count gets close to the error range.
+ *
+ * Both counters and EXT4_FREECLUSTERS_WATERMARK are compared in clusters.
+ * Converting only the free count to blocks, as was done before, made the
+ * thresholds far too lenient when a cluster holds more than one block.
+ *
+ * Returns 1 when the caller should fall back to non-delalloc writes,
+ * 0 otherwise. When returning 0 but half or more of the free clusters are
+ * already dirty, writeback is started via writeback_inodes_sb_if_idle().
+ */
+static int ext4_nonda_switch(struct super_block *sb)
+{
+	s64 free_clusters, dirty_clusters;
+	struct ext4_sb_info *sbi = EXT4_SB(sb);
+
+	free_clusters =
+		percpu_counter_read_positive(&sbi->s_freeclusters_counter);
+	dirty_clusters =
+		percpu_counter_read_positive(&sbi->s_dirtyclusters_counter);
+
+	/*
+	 * Free space is below 150% of the dirty space, or below the dirty
+	 * space plus the watermark: stop using delalloc.
+	 */
+	if (2 * free_clusters < 3 * dirty_clusters ||
+	    free_clusters < (dirty_clusters + EXT4_FREECLUSTERS_WATERMARK))
+		return 1;
+
+	/*
+	 * Even if we don't switch but are nearing capacity,
+	 * start pushing delalloc when 1/2 of free space is dirty.
+	 */
+	if (free_clusters < 2 * dirty_clusters)
+		writeback_inodes_sb_if_idle(sb, WB_REASON_FS_FREE_SPACE);
+
+	return 0;
+}
+
+/*
+ * ext4_da_write_begin() - ->write_begin for delayed-allocation mappings.
+ *
+ * @file, @mapping, @pos, @len, @flags: as passed by the VFS
+ * @pagep: on success, set to the locked page covering @pos
+ * @fsdata: set to FALL_BACK_TO_NONDELALLOC when the write was handed to
+ *          ext4_write_begin(), to 0 otherwise; ext4_da_write_end() reads it
+ *
+ * When ext4_nonda_switch() reports low free space the whole request is
+ * delegated to ext4_write_begin(). Otherwise a one-credit journal handle is
+ * started, the page is grabbed and its buffers are prepared with
+ * ext4_da_get_block_prep. On success the handle stays open.
+ *
+ * *pagep is written only on success, so a caller never sees a pointer to a
+ * page whose reference has already been dropped on the error path.
+ *
+ * Returns 0 on success or a negative errno. -ENOSPC is retried for as long
+ * as ext4_should_retry_alloc() permits.
+ */
+static int ext4_da_write_begin(struct file *file, struct address_space *mapping,
+			       loff_t pos, unsigned len, unsigned flags,
+			       struct page **pagep, void **fsdata)
+{
+	int ret, retries = 0;
+	struct page *page;
+	pgoff_t index;
+	struct inode *inode = mapping->host;
+	handle_t *handle;
+
+	index = pos >> PAGE_CACHE_SHIFT;
+
+	if (ext4_nonda_switch(inode->i_sb)) {
+		*fsdata = (void *)FALL_BACK_TO_NONDELALLOC;
+		return ext4_write_begin(file, mapping, pos,
+					len, flags, pagep, fsdata);
+	}
+	*fsdata = (void *)0;
+	trace_ext4_da_write_begin(inode, pos, len, flags);
+retry:
+	/*
+	 * With delayed allocation, we don't log the i_disksize update
+	 * if there is delayed block allocation. But we still need
+	 * to journal the i_disksize update if the write goes to the end
+	 * of a file which has an already mapped buffer.
+	 */
+	handle = ext4_journal_start(inode, 1);
+	if (IS_ERR(handle)) {
+		ret = PTR_ERR(handle);
+		goto out;
+	}
+	/* We cannot recurse into the filesystem as the transaction is already
+	 * started */
+	flags |= AOP_FLAG_NOFS;
+
+	page = grab_cache_page_write_begin(mapping, index, flags);
+	if (!page) {
+		ext4_journal_stop(handle);
+		ret = -ENOMEM;
+		goto out;
+	}
+
+	ret = __block_write_begin(page, pos, len, ext4_da_get_block_prep);
+	if (ret < 0) {
+		unlock_page(page);
+		ext4_journal_stop(handle);
+		page_cache_release(page);
+		/*
+		 * block_write_begin may have instantiated a few blocks
+		 * outside i_size.  Trim these off again. Don't need
+		 * i_size_read because we hold i_mutex.
+		 */
+		if (pos + len > inode->i_size)
+			ext4_truncate_failed_write(inode);
+
+		if (ret == -ENOSPC &&
+		    ext4_should_retry_alloc(inode->i_sb, &retries))
+			goto retry;
+		goto out;
+	}
+
+	/* Hand the page to the caller only once it is fully prepared */
+	*pagep = page;
+out:
+	return ret;
+}
+
+/*
+ * ext4_da_should_update_i_disksize() - tell whether a write that reaches
+ * past i_disksize landed in a block that needs no allocation.
+ *
+ * @page: page that was written; must have buffers attached
+ * @offset: byte offset inside @page of the last byte written
+ *
+ * Walks to the buffer head covering @offset and returns 1 when that buffer
+ * is mapped and is neither delayed nor unwritten, 0 otherwise.
+ */
+static int ext4_da_should_update_i_disksize(struct page *page,
+					    unsigned long offset)
+{
+	struct inode *inode = page->mapping->host;
+	struct buffer_head *bh = page_buffers(page);
+	unsigned int nr_skip = offset >> inode->i_blkbits;
+
+	/* step over the buffers that precede the one holding @offset */
+	while (nr_skip--)
+		bh = bh->b_this_page;
+
+	return buffer_mapped(bh) && !buffer_delay(bh) &&
+	       !buffer_unwritten(bh);
+}
+
+/*
+ * ext4_da_write_end() - ->write_end counterpart of ext4_da_write_begin().
+ *
+ * @fsdata carries the mode chosen by ext4_da_write_begin(). When it is
+ * FALL_BACK_TO_NONDELALLOC the call is forwarded to the ordered or
+ * writeback write_end handler according to the inode's journal mode.
+ *
+ * Otherwise, if the copy extends past i_disksize and the last written
+ * block is already mapped (so no allocation is pending for it),
+ * i_disksize is advanced under i_data_sem and the inode is marked dirty.
+ * generic_write_end() then finishes the page, and the journal handle
+ * opened in write_begin is stopped.
+ *
+ * Returns the number of bytes accepted by generic_write_end(), or the
+ * first error encountered.
+ */
+static int ext4_da_write_end(struct file *file,
+			     struct address_space *mapping,
+			     loff_t pos, unsigned len, unsigned copied,
+			     struct page *page, void *fsdata)
+{
+	struct inode *inode = mapping->host;
+	struct ext4_inode_info *ei = EXT4_I(inode);
+	handle_t *handle = ext4_journal_current_handle();
+	int write_mode = (int)(unsigned long)fsdata;
+	unsigned long first_byte, last_byte;
+	loff_t new_i_size;
+	int err = 0, rc;
+
+	if (write_mode == FALL_BACK_TO_NONDELALLOC) {
+		int jmode = ext4_inode_journal_mode(inode);
+
+		if (jmode == EXT4_INODE_ORDERED_DATA_MODE)
+			return ext4_ordered_write_end(file, mapping, pos,
+					len, copied, page, fsdata);
+		if (jmode == EXT4_INODE_WRITEBACK_DATA_MODE)
+			return ext4_writeback_write_end(file, mapping, pos,
+					len, copied, page, fsdata);
+		BUG();
+	}
+
+	trace_ext4_da_write_end(inode, pos, len, copied);
+	first_byte = pos & (PAGE_CACHE_SIZE - 1);
+	last_byte = first_byte + copied - 1;
+
+	/*
+	 * generic_write_end() runs mark_inode_dirty() when i_size changes,
+	 * so the i_disksize update is done just ahead of it.
+	 */
+	new_i_size = pos + copied;
+	if (copied && new_i_size > ei->i_disksize &&
+	    ext4_da_should_update_i_disksize(page, last_byte)) {
+		down_write(&ei->i_data_sem);
+		/* re-check now that i_data_sem is held */
+		if (new_i_size > ei->i_disksize) {
+			/* extending the file without any block allocation */
+			if (ext4_should_order_data(inode))
+				err = ext4_jbd2_file_inode(handle, inode);
+
+			ei->i_disksize = new_i_size;
+		}
+		up_write(&ei->i_data_sem);
+		/*
+		 * The inode must be marked dirty even if new_i_size is
+		 * less than inode->i_size but greater than i_disksize
+		 * (hint: delalloc).
+		 */
+		ext4_mark_inode_dirty(handle, inode);
+	}
+
+	rc = generic_write_end(file, mapping, pos, len, copied,
+			       page, fsdata);
+	copied = rc;
+	if (rc < 0)
+		err = rc;
+
+	rc = ext4_journal_stop(handle);
+	if (!err)
+		err = rc;
+
+	return err ? err : copied;
+}
+
+/*
+ * ext4_da_invalidatepage() - ->invalidatepage for delalloc mappings.
+ *
+ * Releases the block reservation held by the page's buffers from @offset
+ * on (when the page has buffers), then performs the regular
+ * ext4_invalidatepage() work. The page must be locked.
+ */
+static void ext4_da_invalidatepage(struct page *page, unsigned long offset)
+{
+	BUG_ON(!PageLocked(page));
+
+	/* drop reserved blocks, if there are buffers that could hold any */
+	if (page_has_buffers(page))
+		ext4_da_page_release_reservation(page, offset);
+
+	ext4_invalidatepage(page, offset);
+}
+
+/*
+ * ext4_alloc_da_blocks() - force all delayed-allocation blocks of @inode
+ * to be allocated.
+ *
+ * Returns 0 without doing anything when the inode holds no reserved data
+ * or metadata blocks; otherwise returns the result of filemap_flush().
+ */
+int ext4_alloc_da_blocks(struct inode *inode)
+{
+	struct ext4_inode_info *ei;
+
+	trace_ext4_alloc_da_blocks(inode);
+
+	ei = EXT4_I(inode);
+	if (ei->i_reserved_data_blocks || ei->i_reserved_meta_blocks) {
+		/*
+		 * Keep it simple for now. filemap_flush() maps the blocks
+		 * and also starts writing the data, without waiting for
+		 * the I/O to finish. Starting the write is not strictly
+		 * necessary (and for laptop_mode users not even
+		 * desirable), but allocating without it would mean
+		 * replicating the code paths in
+		 *
+		 * ext4_da_writepages() ->
+		 *    write_cache_pages() ---> (via passed in callback)
+		 *        __mpage_da_writepage() -->
+		 *           mpage_add_bh_to_extent()
+		 *           mpage_da_map_blocks()
+		 *
+		 * because write_cache_pages() in mm/page-writeback.c marks
+		 * pages clean in preparation for I/O, which is unwanted
+		 * when no I/O is planned. Calling it and then redirtying
+		 * every page with redirty_page_for_writepage() would be
+		 * ugly in the extreme. The proper alternative is a
+		 * simplified copy of those functions that only collects
+		 * contiguous logical extents, calls the multi-block
+		 * allocator and updates the buffer heads with the result.
+		 */
+		return filemap_flush(inode->i_mapping);
+	}
+
+	return 0;
+}
+
+/*
+ * ext4_bmap() - ->bmap: map a file block number to an on-disk block.
+ *
+ * bmap() is special. Applications such as lilo, and the swapper, use it to
+ * find the on-disk block of a specific piece of data. That is dangerous
+ * while the block is still in the journal: somebody who creates a swapfile
+ * on a data-journaling filesystem and enables swap may see swapped-out data
+ * overwritten by the zeroes that were written to the journal earlier and
+ * are still awaiting writeback.
+ *
+ * So for a modified, data-journaled file the journal is flushed first, and
+ * for a delalloc mapping with dirty pages the file is written out so that
+ * its blocks are allocated.
+ *
+ * Returns the result of generic_block_bmap(), or 0 when the journal flush
+ * fails.
+ */
+static sector_t ext4_bmap(struct address_space *mapping, sector_t block)
+{
+	struct inode *inode = mapping->host;
+	journal_t *journal;
+	int err;
+
+	/* delalloc: sync the file so that its blocks get allocated */
+	if (mapping_tagged(mapping, PAGECACHE_TAG_DIRTY) &&
+	    test_opt(inode->i_sb, DELALLOC))
+		filemap_write_and_wait(mapping);
+
+	if (!EXT4_JOURNAL(inode) ||
+	    !ext4_test_inode_state(inode, EXT4_STATE_JDATA))
+		goto map;
+
+	/*
+	 * This is a REALLY heavyweight approach, but bmap on dirty files is
+	 * expected to be extremely rare: only lilo or swapon on a freshly
+	 * made file should get here.
+	 *
+	 * (bmap requires CAP_SYS_RAWIO, so this is not an unprivileged
+	 * denial-of-service vector; we would be in trouble if mortal users
+	 * could trigger this path at will.)
+	 *
+	 * NB. EXT4_STATE_JDATA is not set on anything but regular files.
+	 * Whoever bmaps a directory or symlink and gets confused because
+	 * the buffer has not reached the disk yet deserves what they get.
+	 */
+	ext4_clear_inode_state(inode, EXT4_STATE_JDATA);
+	journal = EXT4_JOURNAL(inode);
+	jbd2_journal_lock_updates(journal);
+	err = jbd2_journal_flush(journal);
+	jbd2_journal_unlock_updates(journal);
+	if (err)
+		return 0;
+
+map:
+	return generic_block_bmap(mapping, block, ext4_get_block);
+}
+
+/*
+ * ext4_readpage() - ->readpage: read one page through mpage_readpage(),
+ * mapping blocks with ext4_get_block. @file is not used.
+ */
+static int ext4_readpage(struct file *file, struct page *page)
+{
+	int ret;
+
+	trace_ext4_readpage(page);
+	ret = mpage_readpage(page, ext4_get_block);
+
+	return ret;
+}
+
+/*
+ * ext4_readpages() - ->readpages: read a list of pages through
+ * mpage_readpages(), mapping blocks with ext4_get_block. @file is not used.
+ */
+static int ext4_readpages(struct file *file, struct address_space *mapping,
+			  struct list_head *pages, unsigned nr_pages)
+{
+	int ret = mpage_readpages(mapping, pages, nr_pages, ext4_get_block);
+
+	return ret;
+}
+
+/*
+ * ext4_invalidatepage_free_endio() - release io_end structures attached to
+ * buffers that are about to be invalidated.
+ *
+ * For every buffer of @page that starts at or after @offset, and that had
+ * its uninit flag set with an io_end in b_private, the io_end is freed and
+ * b_private / b_end_io are cleared. Does nothing for a page without
+ * buffers.
+ */
+static void ext4_invalidatepage_free_endio(struct page *page, unsigned long offset)
+{
+	struct buffer_head *first, *cur;
+	unsigned int cur_off = 0;
+
+	if (!page_has_buffers(page))
+		return;
+
+	first = page_buffers(page);
+	cur = first;
+	do {
+		/* the uninit flag is tested (and cleared) only inside the range */
+		if (cur_off >= offset) {
+			if (test_clear_buffer_uninit(cur) && cur->b_private) {
+				ext4_free_io_end(cur->b_private);
+				cur->b_private = NULL;
+				cur->b_end_io = NULL;
+			}
+		}
+		cur_off += cur->b_size;
+		cur = cur->b_this_page;
+	} while (cur != first);
+}
+
+/*
+ * ext4_invalidatepage() - ->invalidatepage for the non-delalloc tables;
+ * also the final step of ext4_da_invalidatepage().
+ *
+ * Frees io_end structures of discarded buffers when the inode uses
+ * dioread_nolock, clears the "pending dirty" (Checked) page flag on a full
+ * invalidation, then lets jbd2 or the generic block layer invalidate the
+ * buffers, depending on whether the filesystem has a journal.
+ */
+static void ext4_invalidatepage(struct page *page, unsigned long offset)
+{
+	struct inode *inode = page->mapping->host;
+	journal_t *journal = EXT4_JOURNAL(inode);
+
+	trace_ext4_invalidatepage(page, offset);
+
+	/* free any io_end structure allocated for buffers to be discarded */
+	if (ext4_should_dioread_nolock(inode))
+		ext4_invalidatepage_free_endio(page, offset);
+
+	/* a full truncate simply forgets about the pending dirtying */
+	if (!offset)
+		ClearPageChecked(page);
+
+	if (!journal) {
+		block_invalidatepage(page, offset);
+		return;
+	}
+	jbd2_journal_invalidatepage(journal, page, offset);
+}
+
+/*
+ * ext4_releasepage() - ->releasepage: try to drop the buffers of @page.
+ *
+ * Warns if the page still carries the Checked flag. Returns 0 when the page
+ * has no buffers; otherwise returns what the journal (if there is one) or
+ * try_to_free_buffers() reports.
+ */
+static int ext4_releasepage(struct page *page, gfp_t wait)
+{
+	journal_t *journal = EXT4_JOURNAL(page->mapping->host);
+
+	trace_ext4_releasepage(page);
+
+	WARN_ON(PageChecked(page));
+	if (!page_has_buffers(page))
+		return 0;
+
+	if (!journal)
+		return try_to_free_buffers(page);
+
+	return jbd2_journal_try_to_free_buffers(journal, page, wait);
+}
+
+/*
+ * ext4_get_block_write() - get_block callback used when preparing a DIO
+ * write or a buffered write.
+ *
+ * Blocks that are not allocated yet are allocated as an uninitialized
+ * extent; the extent is converted to initialized once the I/O completes.
+ * @create is only reported in the debug message: the lookup always runs
+ * with EXT4_GET_BLOCKS_IO_CREATE_EXT.
+ */
+static int ext4_get_block_write(struct inode *inode, sector_t iblock,
+				struct buffer_head *bh_result, int create)
+{
+	int ret;
+
+	ext4_debug("ext4_get_block_write: inode %lu, create flag %d\n",
+		   inode->i_ino, create);
+
+	ret = _ext4_get_block(inode, iblock, bh_result,
+			      EXT4_GET_BLOCKS_IO_CREATE_EXT);
+	return ret;
+}
+/*
+ * ext4_end_io_dio() - end_io callback that ext4_ext_direct_IO() passes to
+ * __blockdev_direct_IO().
+ *
+ * @iocb: the kiocb of the request; iocb->private may hold an io_end
+ * @offset: file offset of the I/O, recorded in the io_end
+ * @size: number of bytes transferred, recorded in the io_end
+ * @private: not used by this function
+ * @ret: result handed on to aio_complete() or stored in the io_end
+ * @is_async: true when the AIO request still has to be completed
+ *
+ * Three cases:
+ * - no io_end, or @size is 0: nothing is freed and iocb->private is left
+ * untouched; only the common completion at "out" runs.
+ * - io_end without EXT4_IO_END_UNWRITTEN: the io_end is freed, then the
+ * common completion at "out" runs.
+ * - io_end with EXT4_IO_END_UNWRITTEN: the io_end is filled in, appended to
+ * the inode's i_completed_io_list and its work item is queued on
+ * dio_unwritten_wq. aio_complete() and inode_dio_done() are not called
+ * here on this path (presumably the queued work does that -- confirm
+ * against the work handler).
+ */
+static void ext4_end_io_dio(struct kiocb *iocb, loff_t offset,
+ ssize_t size, void *private, int ret,
+ bool is_async)
+{
+ struct inode *inode = iocb->ki_filp->f_path.dentry->d_inode;
+ ext4_io_end_t *io_end = iocb->private;
+ struct workqueue_struct *wq;
+ unsigned long flags;
+ struct ext4_inode_info *ei;
+
+ /* if not async direct IO or dio with 0 bytes write, just return */
+ if (!io_end || !size)
+ goto out;
+
+ ext_debug("ext4_end_io_dio(): io_end 0x%p "
+ "for inode %lu, iocb 0x%p, offset %llu, size %zd\n",
+ iocb->private, io_end->inode->i_ino, iocb, offset,
+ size);
+
+ iocb->private = NULL;
+
+ /*
+ * if not aio dio with unwritten extents, just free io and return.
+ * The "out" label sits inside this block so that the early exit
+ * above shares the completion code without freeing anything.
+ */
+ if (!(io_end->flag & EXT4_IO_END_UNWRITTEN)) {
+ ext4_free_io_end(io_end);
+out:
+ if (is_async)
+ aio_complete(iocb, ret, 0);
+ inode_dio_done(inode);
+ return;
+ }
+
+ io_end->offset = offset;
+ io_end->size = size;
+ if (is_async) {
+ io_end->iocb = iocb;
+ io_end->result = ret;
+ }
+ wq = EXT4_SB(io_end->inode->i_sb)->dio_unwritten_wq;
+
+ /* Add the io_end to per-inode completed aio dio list */
+ ei = EXT4_I(io_end->inode);
+ spin_lock_irqsave(&ei->i_completed_io_lock, flags);
+ list_add_tail(&io_end->list, &ei->i_completed_io_list);
+ spin_unlock_irqrestore(&ei->i_completed_io_lock, flags);
+
+ /* queue the work to convert unwritten extents to written */
+ queue_work(wq, &io_end->work);
+}
+/*
+ * ext4_end_io_buffer_write() - b_end_io handler installed on a buffer head
+ * by ext4_set_bh_endio().
+ *
+ * @bh: buffer whose write finished; bh->b_private may hold an io_end
+ * @uptodate: completion status, passed on to end_buffer_async_write()
+ *
+ * If the buffer had its uninit flag set and carries an io_end, the io_end
+ * is either freed (super block no longer MS_ACTIVE) or flagged via
+ * ext4_set_io_unwritten_flag(), appended to the inode's
+ * i_completed_io_list and queued on dio_unwritten_wq.
+ *
+ * In every case b_private and b_end_io are reset, the uninit flag is
+ * cleared and the buffer is completed with end_buffer_async_write().
+ */
+static void ext4_end_io_buffer_write(struct buffer_head *bh, int uptodate)
+{
+ ext4_io_end_t *io_end = bh->b_private;
+ struct workqueue_struct *wq;
+ struct inode *inode;
+ unsigned long flags;
+
+ if (!test_clear_buffer_uninit(bh) || !io_end)
+ goto out;
+
+ if (!(io_end->inode->i_sb->s_flags & MS_ACTIVE)) {
+ ext4_msg(io_end->inode->i_sb, KERN_INFO,
+ "sb umounted, discard end_io request for inode %lu",
+ io_end->inode->i_ino);
+ ext4_free_io_end(io_end);
+ goto out;
+ }
+
+ /*
+ * It may be over-defensive here to check EXT4_IO_END_UNWRITTEN now,
+ * but being more careful is always safe for the future change.
+ */
+ inode = io_end->inode;
+ ext4_set_io_unwritten_flag(inode, io_end);
+
+ /* Add the io_end to per-inode completed io list */
+ spin_lock_irqsave(&EXT4_I(inode)->i_completed_io_lock, flags);
+ list_add_tail(&io_end->list, &EXT4_I(inode)->i_completed_io_list);
+ spin_unlock_irqrestore(&EXT4_I(inode)->i_completed_io_lock, flags);
+
+ wq = EXT4_SB(inode->i_sb)->dio_unwritten_wq;
+ /* queue the work to convert unwritten extents to written */
+ queue_work(wq, &io_end->work);
+out:
+ bh->b_private = NULL;
+ bh->b_end_io = NULL;
+ clear_buffer_uninit(bh); /* already cleared above unless that test was short-circuited */
+ end_buffer_async_write(bh, uptodate);
+}
+
+/*
+ * ext4_set_bh_endio() - attach a freshly allocated io_end to @bh and route
+ * its completion through ext4_end_io_buffer_write().
+ *
+ * The io_end is allocated with GFP_ATOMIC; on failure a rate-limited
+ * warning is printed and the allocation is retried after schedule(), so
+ * this function does not return until it succeeds. Always returns 0.
+ *
+ * The io_end records the file offset of the page that holds @bh and the
+ * size of @bh, and takes a reference on the page.
+ */
+static int ext4_set_bh_endio(struct buffer_head *bh, struct inode *inode)
+{
+	struct page *page = bh->b_page;
+	loff_t offset = (sector_t)page->index << PAGE_CACHE_SHIFT;
+	size_t size = bh->b_size;
+	ext4_io_end_t *io_end;
+
+	for (;;) {
+		io_end = ext4_init_io_end(inode, GFP_ATOMIC);
+		if (io_end)
+			break;
+		pr_warn_ratelimited("%s: allocation fail\n", __func__);
+		schedule();
+	}
+
+	io_end->offset = offset;
+	io_end->size = size;
+
+	/*
+	 * Hold a reference to the page so that it cannot be evicted before
+	 * ext4_end_io_work() has had a chance to convert the extent from
+	 * unwritten to written.
+	 */
+	get_page(page);
+	io_end->page = page;
+
+	bh->b_private = io_end;
+	bh->b_end_io = ext4_end_io_buffer_write;
+	return 0;
+}
+
+/*
+ * For ext4 extent files, ext4 will do direct-io write to holes,
+ * preallocated extents, and those write extend the file, no need to
+ * fall back to buffered IO.
+ *
+ * For holes, we fallocate those blocks, mark them as uninitialized
+ * If those blocks were preallocated, we make sure they are split, but
+ * still keep the range to write as uninitialized.
+ *
+ * The unwritten extents will be converted to written when DIO is completed.
+ * For async direct IO, since the IO may still pending when return, we
+ * set up an end_io call back function, which will do the conversion
+ * when async direct IO completed.
+ *
+ * If the O_DIRECT write will extend the file then add this inode to the
+ * orphan list. So recovery will truncate it back to the original size
+ * if the machine crashes during the write.
+ *
+ * As implemented below, only a WRITE that ends at or before i_size takes
+ * the unwritten-extent path in this function; reads and writes that end
+ * beyond i_size are handed to ext4_ind_direct_IO().
+ *
+ * Returns the result of __blockdev_direct_IO() (or of
+ * ext4_ind_direct_IO() on the fallback path), -ENOMEM when no io_end can
+ * be allocated for an async request, or the negative error from
+ * ext4_convert_unwritten_extents().
+ */
+static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb,
+ const struct iovec *iov, loff_t offset,
+ unsigned long nr_segs)
+{
+ struct file *file = iocb->ki_filp;
+ struct inode *inode = file->f_mapping->host;
+ ssize_t ret;
+ size_t count = iov_length(iov, nr_segs);
+
+ loff_t final_size = offset + count;
+ if (rw == WRITE && final_size <= inode->i_size) {
+ /*
+ * We could direct write to holes and fallocate.
+ *
+ * Allocated blocks to fill the hole are marked as uninitialized
+ * to prevent parallel buffered read to expose the stale data
+ * before DIO complete the data IO.
+ *
+ * As to previously fallocated extents, ext4 get_block
+ * will just simply mark the buffer mapped but still
+ * keep the extents uninitialized.
+ *
+ * for non AIO case, we will convert those unwritten extents
+ * to written after return back from blockdev_direct_IO.
+ *
+ * for async DIO, the conversion needs to be deferred when
+ * the IO is completed. The ext4 end_io callback function
+ * will be called to take care of the conversion work.
+ * Here for async case, we allocate an io_end structure to
+ * hook to the iocb.
+ */
+ iocb->private = NULL;
+ EXT4_I(inode)->cur_aio_dio = NULL;
+ if (!is_sync_kiocb(iocb)) {
+ ext4_io_end_t *io_end =
+ ext4_init_io_end(inode, GFP_NOFS);
+ if (!io_end)
+ return -ENOMEM;
+ io_end->flag |= EXT4_IO_END_DIRECT;
+ iocb->private = io_end;
+ /*
+ * we save the io structure for current async
+ * direct IO, so that later ext4_map_blocks()
+ * could flag the io structure whether there
+ * is a unwritten extents needs to be converted
+ * when IO is completed.
+ */
+ EXT4_I(inode)->cur_aio_dio = iocb->private;
+ }
+
+ ret = __blockdev_direct_IO(rw, iocb, inode,
+ inode->i_sb->s_bdev, iov,
+ offset, nr_segs,
+ ext4_get_block_write,
+ ext4_end_io_dio,
+ NULL,
+ DIO_LOCKING);
+ if (iocb->private)
+ EXT4_I(inode)->cur_aio_dio = NULL;
+ /*
+ * The io_end structure takes a reference to the inode,
+ * that structure needs to be destroyed and the
+ * reference to the inode need to be dropped, when IO is
+ * complete, even with 0 byte write, or failed.
+ *
+ * In the successful AIO DIO case, the io_end structure will be
+ * destroyed and the reference to the inode will be dropped
+ * after the end_io call back function is called.
+ *
+ * In the case there is 0 byte write, or error case, since
+ * VFS direct IO won't invoke the end_io call back function,
+ * we need to free the end_io structure here.
+ */
+ if (ret != -EIOCBQUEUED && ret <= 0 && iocb->private) {
+ ext4_free_io_end(iocb->private);
+ iocb->private = NULL;
+ } else if (ret > 0 && ext4_test_inode_state(inode,
+ EXT4_STATE_DIO_UNWRITTEN)) {
+ int err;
+ /*
+ * for non AIO case, since the IO is already
+ * completed, we could do the conversion right here
+ */
+ err = ext4_convert_unwritten_extents(inode,
+ offset, ret);
+ if (err < 0)
+ ret = err;
+ ext4_clear_inode_state(inode, EXT4_STATE_DIO_UNWRITTEN);
+ }
+ return ret;
+ }
+
+ /*
+ * every other request (a read, or a write ending beyond i_size)
+ * falls back to the old way
+ */
+ return ext4_ind_direct_IO(rw, iocb, iov, offset, nr_segs);
+}
+
+/*
+ * ext4_direct_IO() - ->direct_IO entry point shared by all ext4 aops
+ * tables.
+ *
+ * Returns 0 without doing any I/O for inodes that journal their data
+ * (O_DIRECT is not supported there). Otherwise dispatches to the extent or
+ * the indirect-block implementation according to the inode's
+ * EXT4_INODE_EXTENTS flag and returns that implementation's result.
+ */
+static ssize_t ext4_direct_IO(int rw, struct kiocb *iocb,
+			      const struct iovec *iov, loff_t offset,
+			      unsigned long nr_segs)
+{
+	struct inode *inode = iocb->ki_filp->f_mapping->host;
+	ssize_t ret;
+
+	/* data journalling does not support O_DIRECT */
+	if (ext4_should_journal_data(inode))
+		return 0;
+
+	trace_ext4_direct_IO_enter(inode, offset, iov_length(iov, nr_segs), rw);
+
+	if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
+		ret = ext4_ind_direct_IO(rw, iocb, iov, offset, nr_segs);
+	else
+		ret = ext4_ext_direct_IO(rw, iocb, iov, offset, nr_segs);
+
+	trace_ext4_direct_IO_exit(inode, offset,
+				  iov_length(iov, nr_segs), rw, ret);
+	return ret;
+}
+
+/*
+ * ext4_journalled_set_page_dirty() - ->set_page_dirty for data=journal.
+ *
+ * Pages can be dirtied completely asynchronously from ext4's journalling
+ * activity (filemap_sync_pte(), try_to_unmap_one(), ...). Not much can be
+ * done here: ->set_page_dirty runs under VFS locks and the page is not
+ * necessarily locked.
+ *
+ * Dirtying the page while leaving its buffers clean is not an option,
+ * because the buffers' dirty state is "definitive"; setting the buffers
+ * dirty or jbddirty is not an option either, because the journalling code
+ * would explode.
+ *
+ * So the page is only marked "pending dirty" (Checked), and the next
+ * writepage call propagates that into the buffers appropriately.
+ */
+static int ext4_journalled_set_page_dirty(struct page *page)
+{
+	int ret;
+
+	SetPageChecked(page);
+	ret = __set_page_dirty_nobuffers(page);
+
+	return ret;
+}
+/*
+ * Address space operation tables, one per data mode; ext4_set_aops()
+ * selects among them. All four share the read, writepage, bmap,
+ * releasepage and direct_IO handlers.
+ *
+ * ext4_ordered_aops and ext4_writeback_aops differ only in .write_end.
+ * ext4_journalled_aops is the only table with .set_page_dirty and the
+ * only one without .migratepage.
+ * ext4_da_aops is the only table with .writepages and uses the ext4_da_*
+ * write_begin / write_end / invalidatepage handlers.
+ */
+static const struct address_space_operations ext4_ordered_aops = {
+ .readpage = ext4_readpage,
+ .readpages = ext4_readpages,
+ .writepage = ext4_writepage,
+ .write_begin = ext4_write_begin,
+ .write_end = ext4_ordered_write_end,
+ .bmap = ext4_bmap,
+ .invalidatepage = ext4_invalidatepage,
+ .releasepage = ext4_releasepage,
+ .direct_IO = ext4_direct_IO,
+ .migratepage = buffer_migrate_page,
+ .is_partially_uptodate = block_is_partially_uptodate,
+ .error_remove_page = generic_error_remove_page,
+};
+
+static const struct address_space_operations ext4_writeback_aops = {
+ .readpage = ext4_readpage,
+ .readpages = ext4_readpages,
+ .writepage = ext4_writepage,
+ .write_begin = ext4_write_begin,
+ .write_end = ext4_writeback_write_end,
+ .bmap = ext4_bmap,
+ .invalidatepage = ext4_invalidatepage,
+ .releasepage = ext4_releasepage,
+ .direct_IO = ext4_direct_IO,
+ .migratepage = buffer_migrate_page,
+ .is_partially_uptodate = block_is_partially_uptodate,
+ .error_remove_page = generic_error_remove_page,
+};
+
+static const struct address_space_operations ext4_journalled_aops = {
+ .readpage = ext4_readpage,
+ .readpages = ext4_readpages,
+ .writepage = ext4_writepage,
+ .write_begin = ext4_write_begin,
+ .write_end = ext4_journalled_write_end,
+ .set_page_dirty = ext4_journalled_set_page_dirty, /* marks the page "pending dirty" */
+ .bmap = ext4_bmap,
+ .invalidatepage = ext4_invalidatepage,
+ .releasepage = ext4_releasepage,
+ .direct_IO = ext4_direct_IO, /* returns 0 for inodes that journal data */
+ .is_partially_uptodate = block_is_partially_uptodate,
+ .error_remove_page = generic_error_remove_page,
+};
+
+static const struct address_space_operations ext4_da_aops = {
+ .readpage = ext4_readpage,
+ .readpages = ext4_readpages,
+ .writepage = ext4_writepage,
+ .writepages = ext4_da_writepages,
+ .write_begin = ext4_da_write_begin, /* may fall back to ext4_write_begin() */
+ .write_end = ext4_da_write_end,
+ .bmap = ext4_bmap,
+ .invalidatepage = ext4_da_invalidatepage, /* also releases block reservations */
+ .releasepage = ext4_releasepage,
+ .direct_IO = ext4_direct_IO,
+ .migratepage = buffer_migrate_page,
+ .is_partially_uptodate = block_is_partially_uptodate,
+ .error_remove_page = generic_error_remove_page,
+};
+
+/*
+ * ext4_set_aops() - install the address space operations that match the
+ * inode's data journalling mode.
+ *
+ * Ordered and writeback inodes get ext4_da_aops when the filesystem is
+ * mounted with DELALLOC and their mode-specific table otherwise.
+ * Data-journalled inodes always get ext4_journalled_aops. Any other mode
+ * is a BUG().
+ */
+void ext4_set_aops(struct inode *inode)
+{
+	const struct address_space_operations *aops;
+
+	switch (ext4_inode_journal_mode(inode)) {
+	case EXT4_INODE_ORDERED_DATA_MODE:
+		aops = &ext4_ordered_aops;
+		break;
+	case EXT4_INODE_WRITEBACK_DATA_MODE:
+		aops = &ext4_writeback_aops;
+		break;
+	case EXT4_INODE_JOURNAL_DATA_MODE:
+		/* delalloc is never used together with data journalling */
+		inode->i_mapping->a_ops = &ext4_journalled_aops;
+		return;
+	default:
+		BUG();
+		return;
+	}
+
+	if (test_opt(inode->i_sb, DELALLOC))
+		aops = &ext4_da_aops;
+
+	inode->i_mapping->a_ops = aops;
+}
+
+
+/*
+ * ext4_discard_partial_page_buffers()
+ * Locking wrapper around ext4_discard_partial_page_buffers_no_lock().
+ * Finds (or creates) and locks the page that contains the byte offset
+ * "from", passes it to the _no_lock variant, then unlocks and releases it.
+ * Callers that already hold the page lock should call
+ * ext4_discard_partial_page_buffers_no_lock() directly.
+ *
+ * Returns -ENOMEM when the page cannot be obtained, otherwise the result
+ * of the _no_lock variant.
+ */
+int ext4_discard_partial_page_buffers(handle_t *handle,
+		struct address_space *mapping, loff_t from,
+		loff_t length, int flags)
+{
+	struct inode *inode = mapping->host;
+	gfp_t gfp = mapping_gfp_mask(mapping) & ~__GFP_FS;
+	struct page *page;
+	int err;
+
+	page = find_or_create_page(mapping, from >> PAGE_CACHE_SHIFT, gfp);
+	if (page == NULL)
+		return -ENOMEM;
+
+	err = ext4_discard_partial_page_buffers_no_lock(handle, inode, page,
+							from, length, flags);
+	unlock_page(page);
+	page_cache_release(page);
+
+	return err;
+}
+
+/*
+ * ext4_discard_partial_page_buffers_no_lock()
+ * Zeros a page range of length 'length' starting from offset 'from'.
+ * Buffer heads that correspond to the block aligned regions of the
+ * zeroed range will be unmapped.  Regions that are not block aligned
+ * will have the corresponding buffer head mapped if needed so that
+ * that region of the page can be updated with the partial zero out.
+ *
+ * This function assumes that the page has already been locked.
+ * The range to be discarded must be contained within the given page.
+ * If the specified range exceeds the end of the page it will be shortened
+ * to the end of the page that corresponds to 'from'.  This function is
+ * appropriate for updating a page and its buffer heads to be unmapped and
+ * zeroed for blocks that have been either released, or are going to be
+ * released.
+ *
+ * handle: The journal handle
+ * inode:  The file's inode
+ * page:   A locked page that contains the offset "from"
+ * from:   The starting byte offset (from the beginning of the file)
+ *         to begin discarding
+ * length: The number of bytes to discard
+ * flags:  Optional flags that may be used:
+ *
+ *         EXT4_DISCARD_PARTIAL_PG_ZERO_UNMAPPED
+ *         Only zero the regions of the page whose buffer heads
+ *         have already been unmapped.  This flag is appropriate
+ *         for updating the contents of a page whose blocks may
+ *         have already been released, and we only want to zero
+ *         out the regions that correspond to those released blocks.
+ *
+ * Returns zero on success or negative on failure.  -EINVAL is returned
+ * when 'from' does not fall into 'page'.  All buffers in the range are
+ * processed even after a failure; the first error seen is the one
+ * returned, so a failure on an early block is no longer hidden by a
+ * later block that succeeds.
+ */
+static int ext4_discard_partial_page_buffers_no_lock(handle_t *handle,
+		struct inode *inode, struct page *page, loff_t from,
+		loff_t length, int flags)
+{
+	ext4_fsblk_t index = from >> PAGE_CACHE_SHIFT;
+	unsigned int offset = from & (PAGE_CACHE_SIZE-1);
+	unsigned int blocksize, max, pos;
+	ext4_lblk_t iblock;
+	struct buffer_head *bh;
+	int err = 0;
+	int first_err = 0;
+
+	blocksize = inode->i_sb->s_blocksize;
+	max = PAGE_CACHE_SIZE - offset;
+
+	if (index != page->index)
+		return -EINVAL;
+
+	/*
+	 * correct length if it does not fall between
+	 * 'from' and the end of the page
+	 */
+	if (length > max || length < 0)
+		length = max;
+
+	iblock = index << (PAGE_CACHE_SHIFT - inode->i_sb->s_blocksize_bits);
+
+	if (!page_has_buffers(page))
+		create_empty_buffers(page, blocksize, 0);
+
+	/* Find the buffer that contains "offset" */
+	bh = page_buffers(page);
+	pos = blocksize;
+	while (offset >= pos) {
+		bh = bh->b_this_page;
+		iblock++;
+		pos += blocksize;
+	}
+
+	pos = offset;
+	while (pos < offset + length) {
+		unsigned int end_of_block, range_to_discard;
+
+		/* status of this buffer only; folded into first_err at next: */
+		err = 0;
+
+		/* The length of space left to zero and unmap */
+		range_to_discard = offset + length - pos;
+
+		/* The length of space until the end of the block */
+		end_of_block = blocksize - (pos & (blocksize-1));
+
+		/*
+		 * Do not unmap or zero past end of block
+		 * for this buffer head
+		 */
+		if (range_to_discard > end_of_block)
+			range_to_discard = end_of_block;
+
+		/*
+		 * Skip this buffer head if we are only zeroing unmapped
+		 * regions of the page
+		 */
+		if (flags & EXT4_DISCARD_PARTIAL_PG_ZERO_UNMAPPED &&
+			buffer_mapped(bh))
+				goto next;
+
+		/* If the range is block aligned, unmap */
+		if (range_to_discard == blocksize) {
+			clear_buffer_dirty(bh);
+			bh->b_bdev = NULL;
+			clear_buffer_mapped(bh);
+			clear_buffer_req(bh);
+			clear_buffer_new(bh);
+			clear_buffer_delay(bh);
+			clear_buffer_unwritten(bh);
+			clear_buffer_uptodate(bh);
+			zero_user(page, pos, range_to_discard);
+			BUFFER_TRACE(bh, "Buffer discarded");
+			goto next;
+		}
+
+		/*
+		 * If this block is not completely contained in the range
+		 * to be discarded, then it is not going to be released. Because
+		 * we need to keep this block, we need to make sure this part
+		 * of the page is uptodate before we modify it by writing
+		 * partial zeros on it.
+		 */
+		if (!buffer_mapped(bh)) {
+			/*
+			 * Buffer head must be mapped before we can read
+			 * from the block.  The return value is not checked;
+			 * the mapped state of bh is what is tested below.
+			 */
+			BUFFER_TRACE(bh, "unmapped");
+			ext4_get_block(inode, iblock, bh, 0);
+			/* unmapped? It's a hole - nothing to do */
+			if (!buffer_mapped(bh)) {
+				BUFFER_TRACE(bh, "still unmapped");
+				goto next;
+			}
+		}
+
+		/* Ok, it's mapped. Make sure it's up-to-date */
+		if (PageUptodate(page))
+			set_buffer_uptodate(bh);
+
+		if (!buffer_uptodate(bh)) {
+			err = -EIO;
+			ll_rw_block(READ, 1, &bh);
+			wait_on_buffer(bh);
+			/* Uhhuh. Read error. Complain and punt.*/
+			if (!buffer_uptodate(bh))
+				goto next;
+		}
+
+		if (ext4_should_journal_data(inode)) {
+			BUFFER_TRACE(bh, "get write access");
+			err = ext4_journal_get_write_access(handle, bh);
+			if (err)
+				goto next;
+		}
+
+		zero_user(page, pos, range_to_discard);
+
+		err = 0;
+		if (ext4_should_journal_data(inode)) {
+			err = ext4_handle_dirty_metadata(handle, inode, bh);
+		} else
+			mark_buffer_dirty(bh);
+
+		BUFFER_TRACE(bh, "Partial buffer zeroed");
+next:
+		/* remember the first failure; keep going with the rest */
+		if (err && !first_err)
+			first_err = err;
+		bh = bh->b_this_page;
+		iblock++;
+		pos += range_to_discard;
+	}
+
+	return first_err;
+}
+
+/*
+ * ext4_can_truncate: tell whether an inode of this type may be truncated.
+ *
+ * Returns 1 for regular files, directories and symlinks that are not
+ * fast symlinks; returns 0 for every other inode type.
+ */
+int ext4_can_truncate(struct inode *inode)
+{
+	/* A symlink qualifies only when it is not a fast symlink. */
+	if (S_ISLNK(inode->i_mode))
+		return !ext4_inode_is_fast_symlink(inode);
+
+	return S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode);
+}
+
+/*
+ * ext4_punch_hole: punches a hole in a file by releasing the blocks
+ * associated with the given offset and length
+ *
+ * @file:   File whose inode is to be punched
+ * @offset: The offset where the hole will begin
+ * @length: The length of the hole
+ *
+ * Returns: 0 on success or negative on failure.  -EOPNOTSUPP is returned
+ * for anything other than a regular, extent-mapped file on a filesystem
+ * whose cluster ratio is 1.
+ */
+
+int ext4_punch_hole(struct file *file, loff_t offset, loff_t length)
+{
+	struct inode *inode = file->f_path.dentry->d_inode;
+
+	/*
+	 * TODO: Add support for non extent hole punching
+	 * TODO: Add support for bigalloc file systems
+	 */
+	if (!S_ISREG(inode->i_mode) ||
+	    !ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS) ||
+	    EXT4_SB(inode->i_sb)->s_cluster_ratio > 1)
+		return -EOPNOTSUPP;
+
+	return ext4_ext_punch_hole(file, offset, length);
+}
+
+/*
+ * ext4_truncate()
+ *
+ * We block out ext4_get_block() block instantiations across the entire
+ * transaction, and VFS/VM ensures that ext4_truncate() cannot run
+ * simultaneously on behalf of the same inode.
+ *
+ * As we work through the truncate and commit bits of it to the journal there
+ * is one core, guiding principle: the file's tree must always be consistent on
+ * disk. We must be able to restart the truncate after a crash.
+ *
+ * The file's tree may be transiently inconsistent in memory (although it
+ * probably isn't), but whenever we close off and commit a journal transaction,
+ * the contents of (the filesystem + the journal) must be consistent and
+ * restartable. It's pretty simple, really: bottom up, right to left (although
+ * left-to-right works OK too).
+ *
+ * Note that at recovery time, journal replay occurs *before* the restart of
+ * truncate against the orphan inode list.
+ *
+ * The committed inode has the new, desired i_size (which is the same as
+ * i_disksize in this case). After a crash, ext4_orphan_cleanup() will see
+ * that this inode's truncate did not complete and it will again call
+ * ext4_truncate() to have another go. So there will be instantiated blocks
+ * to the right of the truncation point in a crashed ext4 filesystem. But
+ * that's fine - as long as they are linked from the inode, the post-crash
+ * ext4_truncate() run will find them and release them.
+ */
+void ext4_truncate(struct inode *inode)
+{
+	trace_ext4_truncate_enter(inode);
+
+	/* Inode types rejected by ext4_can_truncate() are left alone. */
+	if (!ext4_can_truncate(inode))
+		return;
+
+	ext4_clear_inode_flag(inode, EXT4_INODE_EOFBLOCKS);
+
+	/* Truncation to zero length, unless the mount option opts out. */
+	if (!inode->i_size && !test_opt(inode->i_sb, NO_AUTO_DA_ALLOC))
+		ext4_set_inode_state(inode, EXT4_STATE_DA_ALLOC_CLOSE);
+
+	/* Dispatch on the inode's block-mapping scheme. */
+	if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
+		ext4_ind_truncate(inode);
+	else
+		ext4_ext_truncate(inode);
+
+	trace_ext4_truncate_exit(inode);
+}
+
+/*
+ * ext4_get_inode_loc returns with an extra refcount against the inode's
+ * underlying buffer_head on success. If 'in_mem' is true, we have all
+ * data in memory that is needed to recreate the on-disk version of this
+ * inode.
+ *
+ * On success 0 is returned and iloc->block_group, iloc->offset and
+ * iloc->bh describe where the raw inode lives. Every failure path
+ * returns -EIO and leaves iloc->bh set to NULL.
+ */
+static int __ext4_get_inode_loc(struct inode *inode,
+ struct ext4_iloc *iloc, int in_mem)
+{
+ struct ext4_group_desc *gdp;
+ struct buffer_head *bh;
+ struct super_block *sb = inode->i_sb;
+ ext4_fsblk_t block;
+ int inodes_per_block, inode_offset;
+
+ iloc->bh = NULL;
+ if (!ext4_valid_inum(sb, inode->i_ino))
+ return -EIO;
+
+ iloc->block_group = (inode->i_ino - 1) / EXT4_INODES_PER_GROUP(sb);
+ gdp = ext4_get_group_desc(sb, iloc->block_group, NULL);
+ if (!gdp)
+ return -EIO;
+
+ /*
+ * Figure out the offset within the block group inode table:
+ * inode_offset is the inode's index inside its group, block is the
+ * inode-table block holding it and iloc->offset is the byte offset
+ * of the raw inode inside that block.
+ */
+ inodes_per_block = EXT4_SB(sb)->s_inodes_per_block;
+ inode_offset = ((inode->i_ino - 1) %
+ EXT4_INODES_PER_GROUP(sb));
+ block = ext4_inode_table(sb, gdp) + (inode_offset / inodes_per_block);
+ iloc->offset = (inode_offset % inodes_per_block) * EXT4_INODE_SIZE(sb);
+
+ bh = sb_getblk(sb, block);
+ if (!bh) {
+ EXT4_ERROR_INODE_BLOCK(inode, block,
+ "unable to read itable block");
+ return -EIO;
+ }
+ if (!buffer_uptodate(bh)) {
+ lock_buffer(bh);
+
+ /*
+ * If the buffer has the write error flag, we have failed
+ * to write out another inode in the same block. In this
+ * case, we don't have to read the block because we may
+ * read the old inode data successfully.
+ */
+ if (buffer_write_io_error(bh) && !buffer_uptodate(bh))
+ set_buffer_uptodate(bh);
+
+ if (buffer_uptodate(bh)) {
+ /* someone brought it uptodate while we waited */
+ unlock_buffer(bh);
+ goto has_buffer;
+ }
+
+ /*
+ * If we have all information of the inode in memory and this
+ * is the only valid inode in the block, we need not read the
+ * block.
+ */
+ if (in_mem) {
+ struct buffer_head *bitmap_bh;
+ int i, start;
+
+ start = inode_offset & ~(inodes_per_block - 1); /* NOTE(review): mask assumes inodes_per_block is a power of 2 */
+
+ /* Is the inode bitmap in cache? */
+ bitmap_bh = sb_getblk(sb, ext4_inode_bitmap(sb, gdp));
+ if (!bitmap_bh)
+ goto make_io;
+
+ /*
+ * If the inode bitmap isn't in cache then the
+ * optimisation may end up performing two reads instead
+ * of one, so skip it.
+ */
+ if (!buffer_uptodate(bitmap_bh)) {
+ brelse(bitmap_bh);
+ goto make_io;
+ }
+ for (i = start; i < start + inodes_per_block; i++) {
+ if (i == inode_offset)
+ continue;
+ if (ext4_test_bit(i, bitmap_bh->b_data))
+ break;
+ }
+ brelse(bitmap_bh);
+ if (i == start + inodes_per_block) {
+ /* all other inodes are free, so skip I/O */
+ memset(bh->b_data, 0, bh->b_size);
+ set_buffer_uptodate(bh);
+ unlock_buffer(bh);
+ goto has_buffer;
+ }
+ }
+
+make_io:
+ /*
+ * If we need to do any I/O, try to pre-readahead extra
+ * blocks from the inode table. The window is aligned to
+ * s_inode_readahead_blks and clamped to the inode table.
+ */
+ if (EXT4_SB(sb)->s_inode_readahead_blks) {
+ ext4_fsblk_t b, end, table;
+ unsigned num;
+
+ table = ext4_inode_table(sb, gdp);
+ /* s_inode_readahead_blks is always a power of 2 */
+ b = block & ~(EXT4_SB(sb)->s_inode_readahead_blks-1);
+ if (table > b)
+ b = table;
+ end = b + EXT4_SB(sb)->s_inode_readahead_blks;
+ num = EXT4_INODES_PER_GROUP(sb);
+ if (EXT4_HAS_RO_COMPAT_FEATURE(sb,
+ EXT4_FEATURE_RO_COMPAT_GDT_CSUM))
+ num -= ext4_itable_unused_count(sb, gdp);
+ table += num / inodes_per_block;
+ if (end > table)
+ end = table;
+ while (b <= end) /* NOTE(review): inclusive bound, block 'end' is read ahead too */
+ sb_breadahead(sb, b++);
+ }
+
+ /*
+ * There are other valid inodes in the buffer, this inode
+ * has in-inode xattrs, or we don't have this inode in memory.
+ * Read the block from disk.
+ */
+ trace_ext4_load_inode(inode);
+ get_bh(bh); /* extra reference for the read; presumably dropped by end_buffer_read_sync */
+ bh->b_end_io = end_buffer_read_sync;
+ submit_bh(READ | REQ_META | REQ_PRIO, bh);
+ wait_on_buffer(bh);
+ if (!buffer_uptodate(bh)) {
+ EXT4_ERROR_INODE_BLOCK(inode, block,
+ "unable to read itable block");
+ brelse(bh);
+ return -EIO;
+ }
+ }
+has_buffer:
+ iloc->bh = bh; /* the sb_getblk() reference is handed to the caller */
+ return 0;
+}
+
+/*
+ * ext4_get_inode_loc: locate the buffer holding the inode's raw on-disk copy.
+ *
+ * Wrapper around __ext4_get_inode_loc() that derives 'in_mem' from the
+ * inode state.  Returns whatever __ext4_get_inode_loc() returns.
+ */
+int ext4_get_inode_loc(struct inode *inode, struct ext4_iloc *iloc)
+{
+	/* Everything except the xattrs is available in memory here. */
+	int in_mem = !ext4_test_inode_state(inode, EXT4_STATE_XATTR);
+
+	return __ext4_get_inode_loc(inode, iloc, in_mem);
+}
+
+/* Propagate flags from EXT4_I(inode)->i_flags to the VFS inode->i_flags */
+void ext4_set_inode_flags(struct inode *inode)
+{
+	unsigned int ext4_fl = EXT4_I(inode)->i_flags;
+	unsigned int vfs_fl = 0;
+
+	/* Translate each ext4 flag into its VFS counterpart. */
+	if (ext4_fl & EXT4_SYNC_FL)
+		vfs_fl |= S_SYNC;
+	if (ext4_fl & EXT4_APPEND_FL)
+		vfs_fl |= S_APPEND;
+	if (ext4_fl & EXT4_IMMUTABLE_FL)
+		vfs_fl |= S_IMMUTABLE;
+	if (ext4_fl & EXT4_NOATIME_FL)
+		vfs_fl |= S_NOATIME;
+	if (ext4_fl & EXT4_DIRSYNC_FL)
+		vfs_fl |= S_DIRSYNC;
+
+	/* Drop the stale copies first, then install the translated set. */
+	inode->i_flags &= ~(S_SYNC|S_APPEND|S_IMMUTABLE|S_NOATIME|S_DIRSYNC);
+	inode->i_flags |= vfs_fl;
+}
+
+/*
+ * Propagate flags from i_flags to EXT4_I(inode)->i_flags.
+ *
+ * The update is retried with cmpxchg() until ei->i_flags is replaced
+ * without having changed underneath us.
+ */
+void ext4_get_inode_flags(struct ext4_inode_info *ei)
+{
+	const unsigned long mirrored = EXT4_SYNC_FL | EXT4_APPEND_FL |
+				       EXT4_IMMUTABLE_FL | EXT4_NOATIME_FL |
+				       EXT4_DIRSYNC_FL;
+	unsigned long seen, wanted;
+	unsigned int vfs_fl;
+
+	for (;;) {
+		vfs_fl = ei->vfs_inode.i_flags;
+		seen = ei->i_flags;
+
+		/* Start from the current value minus the mirrored bits. */
+		wanted = seen & ~mirrored;
+		if (vfs_fl & S_SYNC)
+			wanted |= EXT4_SYNC_FL;
+		if (vfs_fl & S_APPEND)
+			wanted |= EXT4_APPEND_FL;
+		if (vfs_fl & S_IMMUTABLE)
+			wanted |= EXT4_IMMUTABLE_FL;
+		if (vfs_fl & S_NOATIME)
+			wanted |= EXT4_NOATIME_FL;
+		if (vfs_fl & S_DIRSYNC)
+			wanted |= EXT4_DIRSYNC_FL;
+
+		if (cmpxchg(&ei->i_flags, seen, wanted) == seen)
+			break;
+	}
+}
+
+/*
+ * ext4_inode_blocks: decode the block count stored in a raw inode.
+ *
+ * Without the HUGE_FILE feature only i_blocks_lo is used.  With it, the
+ * 48-bit combination of i_blocks_high and i_blocks_lo is used, and is
+ * further scaled by (i_blkbits - 9) when the inode carries the
+ * EXT4_INODE_HUGE_FILE flag.
+ */
+static blkcnt_t ext4_inode_blocks(struct ext4_inode *raw_inode,
+				  struct ext4_inode_info *ei)
+{
+	struct inode *inode = &ei->vfs_inode;
+	struct super_block *sb = inode->i_sb;
+	blkcnt_t nblocks;
+
+	if (!EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_HUGE_FILE))
+		return le32_to_cpu(raw_inode->i_blocks_lo);
+
+	/* we are using combined 48 bit field */
+	nblocks = ((u64)le16_to_cpu(raw_inode->i_blocks_high)) << 32 |
+		  le32_to_cpu(raw_inode->i_blocks_lo);
+
+	/* i_blocks represent file system block size */
+	if (ext4_test_inode_flag(inode, EXT4_INODE_HUGE_FILE))
+		nblocks <<= inode->i_blkbits - 9;
+
+	return nblocks;
+}
+
+struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
+{
+ struct ext4_iloc iloc;
+ struct ext4_inode *raw_inode;
+ struct ext4_inode_info *ei;
+ struct inode *inode;
+ journal_t *journal = EXT4_SB(sb)->s_journal;
+ long ret;
+ int block;
+
+ inode = iget_locked(sb, ino); /* look up (or allocate) the in-core inode for 'ino' */
+ if (!inode)
+ return ERR_PTR(-ENOMEM);
+ if (!(inode->i_state & I_NEW))
+ return inode; /* not I_NEW: returned as is, nothing is read from disk */
+
+ ei = EXT4_I(inode);
+ iloc.bh = NULL;
+
+ ret = __ext4_get_inode_loc(inode, &iloc, 0); /* in_mem = 0: nothing is in memory yet */
+ if (ret < 0)
+ goto bad_inode;
+ raw_inode = ext4_raw_inode(&iloc);
+ inode->i_mode = le16_to_cpu(raw_inode->i_mode);
+ inode->i_uid = (uid_t)le16_to_cpu(raw_inode->i_uid_low);
+ inode->i_gid = (gid_t)le16_to_cpu(raw_inode->i_gid_low);
+ if (!(test_opt(inode->i_sb, NO_UID32))) {
+ inode->i_uid |= le16_to_cpu(raw_inode->i_uid_high) << 16;
+ inode->i_gid |= le16_to_cpu(raw_inode->i_gid_high) << 16;
+ }
+ set_nlink(inode, le16_to_cpu(raw_inode->i_links_count));
+
+ ext4_clear_state_flags(ei); /* Only relevant on 32-bit archs */
+ ei->i_dir_start_lookup = 0;
+ ei->i_dtime = le32_to_cpu(raw_inode->i_dtime);
+ /* We now have enough fields to check if the inode was active or not.
+ * This is needed because nfsd might try to access dead inodes
+ * the test is that same one that e2fsck uses
+ * NeilBrown 1999oct15
+ */
+ if (inode->i_nlink == 0) {
+ if (inode->i_mode == 0 ||
+ !(EXT4_SB(inode->i_sb)->s_mount_state & EXT4_ORPHAN_FS)) {
+ /* this inode is deleted */
+ ret = -ESTALE;
+ goto bad_inode;
+ }
+ /* The only unlinked inodes we let through here have
+ * valid i_mode and are being read by the orphan
+ * recovery code: that's fine, we're about to complete
+ * the process of deleting those. */
+ }
+ ei->i_flags = le32_to_cpu(raw_inode->i_flags);
+ inode->i_blocks = ext4_inode_blocks(raw_inode, ei);
+ ei->i_file_acl = le32_to_cpu(raw_inode->i_file_acl_lo);
+ if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_64BIT))
+ ei->i_file_acl |=
+ ((__u64)le16_to_cpu(raw_inode->i_file_acl_high)) << 32;
+ inode->i_size = ext4_isize(raw_inode);
+ ei->i_disksize = inode->i_size;
+#ifdef CONFIG_QUOTA
+ ei->i_reserved_quota = 0;
+#endif
+ inode->i_generation = le32_to_cpu(raw_inode->i_generation);
+ ei->i_block_group = iloc.block_group;
+ ei->i_last_alloc_group = ~0;
+ /*
+ * NOTE! The in-memory inode i_data array is in little-endian order
+ * even on big-endian machines: we do NOT byteswap the block numbers!
+ */
+ for (block = 0; block < EXT4_N_BLOCKS; block++)
+ ei->i_data[block] = raw_inode->i_block[block];
+ INIT_LIST_HEAD(&ei->i_orphan);
+
+ /*
+ * Set transaction id's of transactions that have to be committed
+ * to finish f[data]sync. We set them to currently running transaction
+ * as we cannot be sure that the inode or some of its metadata isn't
+ * part of the transaction - the inode could have been reclaimed and
+ * now it is reread from disk.
+ *
+ * The tid is taken from the running transaction, else from the
+ * committing one, else from j_commit_sequence.
+ */
+ if (journal) {
+ transaction_t *transaction;
+ tid_t tid;
+
+ read_lock(&journal->j_state_lock);
+ if (journal->j_running_transaction)
+ transaction = journal->j_running_transaction;
+ else
+ transaction = journal->j_committing_transaction;
+ if (transaction)
+ tid = transaction->t_tid;
+ else
+ tid = journal->j_commit_sequence;
+ read_unlock(&journal->j_state_lock);
+ ei->i_sync_tid = tid;
+ ei->i_datasync_tid = tid;
+ }
+
+ if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE) {
+ ei->i_extra_isize = le16_to_cpu(raw_inode->i_extra_isize);
+ if (EXT4_GOOD_OLD_INODE_SIZE + ei->i_extra_isize >
+ EXT4_INODE_SIZE(inode->i_sb)) {
+ ret = -EIO; /* extra fields would run past the on-disk inode */
+ goto bad_inode;
+ }
+ if (ei->i_extra_isize == 0) {
+ /* The extra space is currently unused. Use it. */
+ ei->i_extra_isize = sizeof(struct ext4_inode) -
+ EXT4_GOOD_OLD_INODE_SIZE;
+ } else {
+ __le32 *magic = (void *)raw_inode +
+ EXT4_GOOD_OLD_INODE_SIZE +
+ ei->i_extra_isize;
+ if (*magic == cpu_to_le32(EXT4_XATTR_MAGIC))
+ ext4_set_inode_state(inode, EXT4_STATE_XATTR);
+ }
+ } else
+ ei->i_extra_isize = 0;
+
+ EXT4_INODE_GET_XTIME(i_ctime, inode, raw_inode);
+ EXT4_INODE_GET_XTIME(i_mtime, inode, raw_inode);
+ EXT4_INODE_GET_XTIME(i_atime, inode, raw_inode);
+ EXT4_EINODE_GET_XTIME(i_crtime, ei, raw_inode);
+
+ inode->i_version = le32_to_cpu(raw_inode->i_disk_version);
+ if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE) {
+ if (EXT4_FITS_IN_INODE(raw_inode, ei, i_version_hi))
+ inode->i_version |=
+ (__u64)(le32_to_cpu(raw_inode->i_version_hi)) << 32;
+ }
+
+ ret = 0;
+ if (ei->i_file_acl &&
+ !ext4_data_block_valid(EXT4_SB(sb), ei->i_file_acl, 1)) {
+ EXT4_ERROR_INODE(inode, "bad extended attribute block %llu",
+ ei->i_file_acl);
+ ret = -EIO;
+ goto bad_inode;
+ } else if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) {
+ if (S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) ||
+ (S_ISLNK(inode->i_mode) &&
+ !ext4_inode_is_fast_symlink(inode)))
+ /* Validate extent which is part of inode */
+ ret = ext4_ext_check_inode(inode);
+ } else if (S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) ||
+ (S_ISLNK(inode->i_mode) &&
+ !ext4_inode_is_fast_symlink(inode))) {
+ /* Validate block references which are part of inode */
+ ret = ext4_ind_check_inode(inode);
+ }
+ if (ret)
+ goto bad_inode;
+
+ if (S_ISREG(inode->i_mode)) {
+ inode->i_op = &ext4_file_inode_operations;
+ inode->i_fop = &ext4_file_operations;
+ ext4_set_aops(inode);
+ } else if (S_ISDIR(inode->i_mode)) {
+ inode->i_op = &ext4_dir_inode_operations;
+ inode->i_fop = &ext4_dir_operations;
+ } else if (S_ISLNK(inode->i_mode)) {
+ if (ext4_inode_is_fast_symlink(inode)) {
+ inode->i_op = &ext4_fast_symlink_inode_operations;
+ nd_terminate_link(ei->i_data, inode->i_size,
+ sizeof(ei->i_data) - 1);
+ } else {
+ inode->i_op = &ext4_symlink_inode_operations;
+ ext4_set_aops(inode);
+ }
+ } else if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode) ||
+ S_ISFIFO(inode->i_mode) || S_ISSOCK(inode->i_mode)) {
+ inode->i_op = &ext4_special_inode_operations;
+ if (raw_inode->i_block[0]) /* old-style device number in i_block[0], else new-style in i_block[1] */
+ init_special_inode(inode, inode->i_mode,
+ old_decode_dev(le32_to_cpu(raw_inode->i_block[0])));
+ else
+ init_special_inode(inode, inode->i_mode,
+ new_decode_dev(le32_to_cpu(raw_inode->i_block[1])));
+ } else {
+ ret = -EIO;
+ EXT4_ERROR_INODE(inode, "bogus i_mode (%o)", inode->i_mode);
+ goto bad_inode;
+ }
+ brelse(iloc.bh);
+ ext4_set_inode_flags(inode);
+ unlock_new_inode(inode);
+ return inode;
+
+bad_inode:
+ brelse(iloc.bh); /* iloc.bh is NULL when the location lookup itself failed */
+ iget_failed(inode);
+ return ERR_PTR(ret);
+}
+
+/*
+ * ext4_inode_blocks_set: encode inode->i_blocks into the raw inode.
+ *
+ * Counts that fit in 32 bits need no feature flag.  Larger counts need
+ * the HUGE_FILE feature: up to 48 bits are stored as 512-byte units,
+ * anything beyond that is stored in units of the file system block size
+ * and flagged with EXT4_INODE_HUGE_FILE.
+ *
+ * Returns 0 on success, or -EFBIG when the count does not fit in 32 bits
+ * and the file system lacks the HUGE_FILE feature.
+ */
+static int ext4_inode_blocks_set(handle_t *handle,
+				struct ext4_inode *raw_inode,
+				struct ext4_inode_info *ei)
+{
+	struct inode *inode = &ei->vfs_inode;
+	struct super_block *sb = inode->i_sb;
+	u64 nblocks = inode->i_blocks;
+
+	if (nblocks > ~0U &&
+	    !EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_HUGE_FILE))
+		return -EFBIG;
+
+	if (nblocks > 0xffffffffffffULL) {
+		/* too big for 48 bits: store in file system block units */
+		ext4_set_inode_flag(inode, EXT4_INODE_HUGE_FILE);
+		nblocks >>= inode->i_blkbits - 9;
+	} else {
+		/* representable as a multiple of 512 bytes */
+		ext4_clear_inode_flag(inode, EXT4_INODE_HUGE_FILE);
+	}
+
+	/* The high half is zero whenever the count fits in 32 bits. */
+	raw_inode->i_blocks_lo = cpu_to_le32(nblocks);
+	raw_inode->i_blocks_high = cpu_to_le16(nblocks >> 32);
+	return 0;
+}
+
+/*
+ * Post the struct inode info into an on-disk inode location in the
+ * buffer-cache.  This gobbles the caller's reference to the
+ * buffer_head in the inode location struct.
+ *
+ * The caller must have write access to iloc->bh.
+ *
+ * @handle: journal handle handed to the journalling helpers
+ * @inode:  in-memory inode to serialise
+ * @iloc:   location of the raw inode; iloc->bh is released before returning
+ *
+ * Returns 0 on success or a negative errno.  A failure to encode i_blocks
+ * (ext4_inode_blocks_set(), e.g. -EFBIG) is returned to the caller and
+ * reported through ext4_std_error() instead of being dropped.
+ */
+static int ext4_do_update_inode(handle_t *handle,
+				struct inode *inode,
+				struct ext4_iloc *iloc)
+{
+	struct ext4_inode *raw_inode = ext4_raw_inode(iloc);
+	struct ext4_inode_info *ei = EXT4_I(inode);
+	struct buffer_head *bh = iloc->bh;
+	int err = 0, rc, block;
+
+	/* For fields not tracked in the in-memory inode,
+	 * initialise them to zero for new inodes. */
+	if (ext4_test_inode_state(inode, EXT4_STATE_NEW))
+		memset(raw_inode, 0, EXT4_SB(inode->i_sb)->s_inode_size);
+
+	ext4_get_inode_flags(ei);
+	raw_inode->i_mode = cpu_to_le16(inode->i_mode);
+	if (!(test_opt(inode->i_sb, NO_UID32))) {
+		raw_inode->i_uid_low = cpu_to_le16(low_16_bits(inode->i_uid));
+		raw_inode->i_gid_low = cpu_to_le16(low_16_bits(inode->i_gid));
+		/*
+		 * Fix up interoperability with old kernels. Otherwise, old
+		 * inodes get re-used with the upper 16 bits of the uid/gid
+		 * intact
+		 */
+		if (!ei->i_dtime) {
+			raw_inode->i_uid_high =
+				cpu_to_le16(high_16_bits(inode->i_uid));
+			raw_inode->i_gid_high =
+				cpu_to_le16(high_16_bits(inode->i_gid));
+		} else {
+			raw_inode->i_uid_high = 0;
+			raw_inode->i_gid_high = 0;
+		}
+	} else {
+		raw_inode->i_uid_low =
+			cpu_to_le16(fs_high2lowuid(inode->i_uid));
+		raw_inode->i_gid_low =
+			cpu_to_le16(fs_high2lowgid(inode->i_gid));
+		raw_inode->i_uid_high = 0;
+		raw_inode->i_gid_high = 0;
+	}
+	raw_inode->i_links_count = cpu_to_le16(inode->i_nlink);
+
+	EXT4_INODE_SET_XTIME(i_ctime, inode, raw_inode);
+	EXT4_INODE_SET_XTIME(i_mtime, inode, raw_inode);
+	EXT4_INODE_SET_XTIME(i_atime, inode, raw_inode);
+	EXT4_EINODE_SET_XTIME(i_crtime, ei, raw_inode);
+
+	/*
+	 * Keep the error code: bailing out with err still 0 would make the
+	 * caller believe the inode had been written.
+	 */
+	err = ext4_inode_blocks_set(handle, raw_inode, ei);
+	if (err)
+		goto out_brelse;
+	raw_inode->i_dtime = cpu_to_le32(ei->i_dtime);
+	raw_inode->i_flags = cpu_to_le32(ei->i_flags & 0xFFFFFFFF);
+	if (EXT4_SB(inode->i_sb)->s_es->s_creator_os !=
+	    cpu_to_le32(EXT4_OS_HURD))
+		raw_inode->i_file_acl_high =
+			cpu_to_le16(ei->i_file_acl >> 32);
+	raw_inode->i_file_acl_lo = cpu_to_le32(ei->i_file_acl);
+	ext4_isize_set(raw_inode, ei->i_disksize);
+	if (ei->i_disksize > 0x7fffffffULL) {
+		struct super_block *sb = inode->i_sb;
+		if (!EXT4_HAS_RO_COMPAT_FEATURE(sb,
+				EXT4_FEATURE_RO_COMPAT_LARGE_FILE) ||
+				EXT4_SB(sb)->s_es->s_rev_level ==
+				cpu_to_le32(EXT4_GOOD_OLD_REV)) {
+			/* If this is the first large file
+			 * created, add a flag to the superblock.
+			 */
+			err = ext4_journal_get_write_access(handle,
+					EXT4_SB(sb)->s_sbh);
+			if (err)
+				goto out_brelse;
+			ext4_update_dynamic_rev(sb);
+			EXT4_SET_RO_COMPAT_FEATURE(sb,
+					EXT4_FEATURE_RO_COMPAT_LARGE_FILE);
+			ext4_handle_sync(handle);
+			err = ext4_handle_dirty_super(handle, sb);
+		}
+	}
+	raw_inode->i_generation = cpu_to_le32(inode->i_generation);
+	if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode)) {
+		/* Device numbers live in i_block[0] (old) or i_block[1] (new) */
+		if (old_valid_dev(inode->i_rdev)) {
+			raw_inode->i_block[0] =
+				cpu_to_le32(old_encode_dev(inode->i_rdev));
+			raw_inode->i_block[1] = 0;
+		} else {
+			raw_inode->i_block[0] = 0;
+			raw_inode->i_block[1] =
+				cpu_to_le32(new_encode_dev(inode->i_rdev));
+			raw_inode->i_block[2] = 0;
+		}
+	} else
+		for (block = 0; block < EXT4_N_BLOCKS; block++)
+			raw_inode->i_block[block] = ei->i_data[block];
+
+	raw_inode->i_disk_version = cpu_to_le32(inode->i_version);
+	if (ei->i_extra_isize) {
+		if (EXT4_FITS_IN_INODE(raw_inode, ei, i_version_hi))
+			raw_inode->i_version_hi =
+			cpu_to_le32(inode->i_version >> 32);
+		raw_inode->i_extra_isize = cpu_to_le16(ei->i_extra_isize);
+	}
+
+	BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata");
+	rc = ext4_handle_dirty_metadata(handle, NULL, bh);
+	/* An earlier superblock error takes precedence over rc. */
+	if (!err)
+		err = rc;
+	ext4_clear_inode_state(inode, EXT4_STATE_NEW);
+
+	ext4_update_inode_fsync_trans(handle, inode, 0);
+out_brelse:
+	brelse(bh);
+	ext4_std_error(inode->i_sb, err);
+	return err;
+}
+
+/*
+ * ext4_write_inode()
+ *
+ * We are called from a few places:
+ *
+ * - Within generic_file_write() for O_SYNC files.
+ * Here, there will be no transaction running. We wait for any running
+ * transaction to commit.
+ *
+ * - Within sys_sync(), kupdate and such.
+ * We wait on commit, if told to.
+ *
+ * - Within prune_icache() (PF_MEMALLOC == true)
+ * Here we simply return. We can't afford to block kswapd on the
+ * journal commit.
+ *
+ * In all cases it is actually safe for us to return without doing anything,
+ * because the inode has been copied into a raw inode buffer in
+ * ext4_mark_inode_dirty(). This is a correctness thing for O_SYNC and for
+ * knfsd.
+ *
+ * Note that we are absolutely dependent upon all inode dirtiers doing the
+ * right thing: they *must* call mark_inode_dirty() after dirtying info in
+ * which we are interested.
+ *
+ * It would be a bug for them to not do this. The code:
+ *
+ * mark_inode_dirty(inode)
+ * stuff();
+ * inode->i_size = expr;
+ *
+ * is in error because a kswapd-driven write_inode() could occur while
+ * `stuff()' is running, and the new i_size will be lost. Plus the inode
+ * will no longer be on the superblock's dirty inode list.
+ */
+int ext4_write_inode(struct inode *inode, struct writeback_control *wbc)
+{
+	struct ext4_iloc iloc;
+	int err;
+
+	/* Memory reclaim must not be made to wait here. */
+	if (current->flags & PF_MEMALLOC)
+		return 0;
+
+	if (!EXT4_SB(inode->i_sb)->s_journal) {
+		/* No journal: write the inode table buffer out directly. */
+		err = __ext4_get_inode_loc(inode, &iloc, 0);
+		if (err)
+			return err;
+		if (wbc->sync_mode == WB_SYNC_ALL)
+			sync_dirty_buffer(iloc.bh);
+		if (buffer_req(iloc.bh) && !buffer_uptodate(iloc.bh)) {
+			EXT4_ERROR_INODE_BLOCK(inode, iloc.bh->b_blocknr,
+					 "IO error syncing inode");
+			err = -EIO;
+		}
+		brelse(iloc.bh);
+		return err;
+	}
+
+	/* Journalled: being inside a handle here is unexpected. */
+	if (ext4_journal_current_handle()) {
+		jbd_debug(1, "called recursively, non-PF_MEMALLOC!\n");
+		dump_stack();
+		return -EIO;
+	}
+
+	/* Only a synchronous writeback forces a commit. */
+	if (wbc->sync_mode != WB_SYNC_ALL)
+		return 0;
+
+	return ext4_force_commit(inode->i_sb);
+}
+
+/*
+ * ext4_setattr()
+ *
+ * Called from notify_change.
+ *
+ * We want to trap VFS attempts to truncate the file as soon as
+ * possible. In particular, we want to make sure that when the VFS
+ * shrinks i_size, we put the inode on the orphan list and modify
+ * i_disksize immediately, so that during the subsequent flushing of
+ * dirty pages and freeing of disk blocks, we can guarantee that any
+ * commit will leave the blocks being flushed in an unused state on
+ * disk. (On recovery, the inode will get truncated and the blocks will
+ * be freed, so we have a strong guarantee that no future commit will
+ * leave these blocks visible to the user.)
+ *
+ * Another thing we have to assure is that if we are in ordered mode
+ * and inode is still attached to the committing transaction, we must
+ * start writeout of all the dirty pages which are being truncated.
+ * This way we are sure that all the data written in the previous
+ * transaction are already on disk (truncate waits for pages under
+ * writeback).
+ *
+ * Called with inode->i_mutex down.
+ *
+ * Returns 0 on success or a negative errno.
+ */
+int ext4_setattr(struct dentry *dentry, struct iattr *attr)
+{
+ struct inode *inode = dentry->d_inode;
+ int error, rc = 0;
+ int orphan = 0; /* set once the inode has been put on the orphan list */
+ const unsigned int ia_valid = attr->ia_valid;
+
+ error = inode_change_ok(inode, attr);
+ if (error)
+ return error;
+
+ if (is_quota_modification(inode, attr))
+ dquot_initialize(inode);
+ if ((ia_valid & ATTR_UID && attr->ia_uid != inode->i_uid) ||
+ (ia_valid & ATTR_GID && attr->ia_gid != inode->i_gid)) {
+ handle_t *handle;
+
+ /* (user+group)*(old+new) structure, inode write (sb,
+ * inode block, ? - but truncate inode update has it) */
+ handle = ext4_journal_start(inode, (EXT4_MAXQUOTAS_INIT_BLOCKS(inode->i_sb)+
+ EXT4_MAXQUOTAS_DEL_BLOCKS(inode->i_sb))+3);
+ if (IS_ERR(handle)) {
+ error = PTR_ERR(handle);
+ goto err_out;
+ }
+ error = dquot_transfer(inode, attr);
+ if (error) {
+ ext4_journal_stop(handle);
+ return error;
+ }
+ /* Update corresponding info in inode so that everything is in
+ * one transaction */
+ if (attr->ia_valid & ATTR_UID)
+ inode->i_uid = attr->ia_uid;
+ if (attr->ia_valid & ATTR_GID)
+ inode->i_gid = attr->ia_gid;
+ error = ext4_mark_inode_dirty(handle, inode); /* NOTE(review): may be overwritten by later assignments to 'error' */
+ ext4_journal_stop(handle);
+ }
+
+ if (attr->ia_valid & ATTR_SIZE) {
+ inode_dio_wait(inode);
+
+ if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) {
+ struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
+
+ if (attr->ia_size > sbi->s_bitmap_maxbytes)
+ return -EFBIG; /* size limit for inodes without the extents flag */
+ }
+ }
+
+ if (S_ISREG(inode->i_mode) &&
+ attr->ia_valid & ATTR_SIZE &&
+ (attr->ia_size < inode->i_size)) {
+ handle_t *handle;
+
+ handle = ext4_journal_start(inode, 3);
+ if (IS_ERR(handle)) {
+ error = PTR_ERR(handle);
+ goto err_out;
+ }
+ if (ext4_handle_valid(handle)) {
+ error = ext4_orphan_add(handle, inode);
+ orphan = 1;
+ }
+ EXT4_I(inode)->i_disksize = attr->ia_size;
+ rc = ext4_mark_inode_dirty(handle, inode);
+ if (!error)
+ error = rc;
+ ext4_journal_stop(handle);
+
+ if (ext4_should_order_data(inode)) {
+ error = ext4_begin_ordered_truncate(inode,
+ attr->ia_size);
+ if (error) {
+ /* Do as much error cleanup as possible */
+ handle = ext4_journal_start(inode, 3);
+ if (IS_ERR(handle)) {
+ ext4_orphan_del(NULL, inode);
+ goto err_out;
+ }
+ ext4_orphan_del(handle, inode);
+ orphan = 0;
+ ext4_journal_stop(handle);
+ goto err_out;
+ }
+ }
+ }
+
+ if (attr->ia_valid & ATTR_SIZE) {
+ if (attr->ia_size != i_size_read(inode))
+ truncate_setsize(inode, attr->ia_size);
+ ext4_truncate(inode);
+ }
+
+ if (!rc) {
+ setattr_copy(inode, attr);
+ mark_inode_dirty(inode);
+ }
+
+ /*
+ * If the call to ext4_truncate failed to get a transaction handle at
+ * all, we need to clean up the in-core orphan list manually.
+ */
+ if (orphan && inode->i_nlink)
+ ext4_orphan_del(NULL, inode);
+
+ if (!rc && (ia_valid & ATTR_MODE))
+ rc = ext4_acl_chmod(inode);
+
+err_out:
+ ext4_std_error(inode->i_sb, error);
+ if (!error)
+ error = rc; /* 'error' takes precedence; rc is reported only when it is 0 */
+ return error;
+}
+
+/*
+ * ext4_getattr: fill in @stat for the inode behind @dentry.
+ *
+ * Starts from generic_fillattr() and then adds the inode's reserved
+ * (delayed allocation) data blocks to stat->blocks.  Always returns 0.
+ */
+int ext4_getattr(struct vfsmount *mnt, struct dentry *dentry,
+		 struct kstat *stat)
+{
+	struct inode *inode;
+	unsigned long long delalloc_blocks;
+
+	inode = dentry->d_inode;
+	generic_fillattr(inode, stat);
+
+	/*
+	 * We can't update i_blocks if the block allocation is delayed
+	 * otherwise in the case of system crash before the real block
+	 * allocation is done, we will have i_blocks inconsistent with
+	 * on-disk file blocks.
+	 * We always keep i_blocks updated together with real
+	 * allocation. But so as not to confuse the user, stat
+	 * will return the blocks that include the delayed allocation
+	 * blocks for this file.
+	 */
+	delalloc_blocks = EXT4_I(inode)->i_reserved_data_blocks;
+
+	/*
+	 * Convert file system blocks to 512-byte units with one shift in a
+	 * 64-bit type.  Shifting left by s_blocksize_bits in an unsigned
+	 * long first can overflow on 32-bit architectures.
+	 */
+	stat->blocks += delalloc_blocks << (inode->i_sb->s_blocksize_bits - 9);
+	return 0;
+}
+
+/*
+ * Number of index blocks that may be touched when modifying @nrblocks
+ * blocks, as computed by the helper matching the inode's mapping scheme.
+ */
+static int ext4_index_trans_blocks(struct inode *inode, int nrblocks, int chunk)
+{
+	if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
+		return ext4_ext_index_trans_blocks(inode, nrblocks, chunk);
+
+	return ext4_ind_trans_blocks(inode, nrblocks, chunk);
+}
+
+/*
+ * Account for index blocks, block group bitmaps and block group
+ * descriptor blocks when data blocks and index blocks are modified.
+ * In the worst case the index blocks are spread over different block
+ * groups.
+ *
+ * If the data blocks are discontiguous they may be spread over
+ * different block groups too.  If they are contiguous, with flexbg,
+ * they could still cross a block group boundary.
+ *
+ * Also account for superblock, inode, quota and xattr blocks.
+ *
+ * @chunk says whether the @nrblocks blocks are physically contiguous on
+ * disk.  Direct IO and fallocate call get_block to allocate one single
+ * extent at a time, so they can set it.
+ */
+static int ext4_meta_trans_blocks(struct inode *inode, int nrblocks, int chunk)
+{
+	ext4_group_t ngroups = ext4_get_groups_count(inode->i_sb);
+	ext4_group_t groups;
+	int idxblocks, gdpblocks;
+	int credits;
+
+	/* How many index blocks need touching to modify nrblocks? */
+	idxblocks = ext4_index_trans_blocks(inode, nrblocks, chunk);
+	credits = idxblocks;
+
+	/*
+	 * Group bitmaps and group descriptors: one group per index block,
+	 * plus one group for a contiguous chunk or one per data block.
+	 */
+	groups = idxblocks;
+	groups += chunk ? 1 : nrblocks;
+
+	/* Neither count can exceed what the file system actually has. */
+	gdpblocks = groups;
+	if (groups > ngroups)
+		groups = ngroups;
+	if (groups > EXT4_SB(inode->i_sb)->s_gdb_count)
+		gdpblocks = EXT4_SB(inode->i_sb)->s_gdb_count;
+
+	/* bitmaps and block group descriptor blocks */
+	credits += groups + gdpblocks;
+
+	/* Blocks for super block, inode, quota and xattr blocks */
+	credits += EXT4_META_TRANS_BLOCKS(inode->i_sb);
+
+	return credits;
+}
+
+/*
+ * Calculate the total number of credits to reserve to fit
+ * the modification of a single page into a single transaction,
+ * which may include multiple chunks of block allocations.
+ *
+ * This could be called via ext4_write_begin()
+ *
+ * We need to consider the worst case: one new block per extent.
+ */
+int ext4_writepage_trans_blocks(struct inode *inode)
+{
+	int bpp = ext4_journal_blocks_per_page(inode);
+	int credits = ext4_meta_trans_blocks(inode, bpp, 0);
+
+	/* Account for data blocks for journalled mode */
+	return ext4_should_journal_data(inode) ? credits + bpp : credits;
+}
+
+/*
+ * Calculate the journal credits for a chunk of data modification.
+ *
+ * This is called from DIO, fallocate or whoever is calling
+ * ext4_map_blocks() to map/allocate a chunk of contiguous disk blocks.
+ *
+ * Journal buffers for data blocks are not included here, as DIO
+ * and fallocate do not need to journal data buffers.
+ */
+int ext4_chunk_trans_blocks(struct inode *inode, int nrblocks)
+{
+	const int contiguous = 1;
+
+	return ext4_meta_trans_blocks(inode, nrblocks, contiguous);
+}
+
+/*
+ * The caller must have previously called ext4_reserve_inode_write().
+ * Given this, we know that the caller already has write access to iloc->bh.
+ *
+ * Returns the result of ext4_do_update_inode().
+ */
+int ext4_mark_iloc_dirty(handle_t *handle,
+			 struct inode *inode, struct ext4_iloc *iloc)
+{
+	int ret;
+
+	if (IS_I_VERSION(inode))
+		inode_inc_iversion(inode);
+
+	/*
+	 * ext4_do_update_inode() consumes one bh->b_count, so take an
+	 * extra reference around the call.  It is also the one doing
+	 * jbd2_journal_dirty_metadata.
+	 */
+	get_bh(iloc->bh);
+	ret = ext4_do_update_inode(handle, inode, iloc);
+	put_bh(iloc->bh);
+
+	return ret;
+}
+
+/*
+ * Look up the inode's location and obtain journal write access to the
+ * buffer that holds it.
+ *
+ * On success the caller is left with an outstanding reference count
+ * against iloc->bh, which _must_ be cleaned up later.  If write access
+ * cannot be obtained the buffer is released and iloc->bh is set to NULL.
+ */
+int
+ext4_reserve_inode_write(handle_t *handle, struct inode *inode,
+			 struct ext4_iloc *iloc)
+{
+	int rc = ext4_get_inode_loc(inode, iloc);
+
+	if (rc)
+		goto report;
+
+	BUFFER_TRACE(iloc->bh, "get_write_access");
+	rc = ext4_journal_get_write_access(handle, iloc->bh);
+	if (rc) {
+		/* Do not leave a reference the caller cannot use */
+		brelse(iloc->bh);
+		iloc->bh = NULL;
+	}
+
+report:
+	ext4_std_error(inode->i_sb, rc);
+	return rc;
+}
+
+/*
+ * Grow the extra inode space of @inode to new_extra_isize bytes.
+ * Returns 0 on success or negative error number on failure.
+ */
+static int ext4_expand_extra_isize(struct inode *inode,
+				   unsigned int new_extra_isize,
+				   struct ext4_iloc iloc,
+				   handle_t *handle)
+{
+	struct ext4_inode *raw;
+	struct ext4_xattr_ibody_header *hdr;
+	int has_eas;
+
+	/* Already at least as large as requested: nothing to do */
+	if (new_extra_isize <= EXT4_I(inode)->i_extra_isize)
+		return 0;
+
+	raw = ext4_raw_inode(&iloc);
+	hdr = IHDR(inode, raw);
+
+	has_eas = ext4_test_inode_state(inode, EXT4_STATE_XATTR) &&
+		  hdr->h_magic == cpu_to_le32(EXT4_XATTR_MAGIC);
+
+	/* Extended attributes present: try to expand around them */
+	if (has_eas)
+		return ext4_expand_extra_isize_ea(inode, new_extra_isize,
+						  raw, handle);
+
+	/* No extended attributes: zero the area and record the new size */
+	memset((void *)raw + EXT4_GOOD_OLD_INODE_SIZE, 0, new_extra_isize);
+	EXT4_I(inode)->i_extra_isize = new_extra_isize;
+	return 0;
+}
+
+/*
+ * What we do here is to mark the in-core inode as clean with respect to inode
+ * dirtiness (it may still be data-dirty).
+ * This means that the in-core inode may be reaped by prune_icache
+ * without having to perform any I/O.  This is a very good thing,
+ * because *any* task may call prune_icache - even ones which
+ * have a transaction open against a different journal.
+ *
+ * Is this cheating?  Not really.  Sure, we haven't written the
+ * inode out, but prune_icache isn't a user-visible syncing function.
+ * Whenever the user wants stuff synced (sys_sync, sys_msync, sys_fsync)
+ * we start and wait on commits.
+ *
+ * Is this efficient/effective?  Well, we're being nice to the system
+ * by cleaning up our inodes proactively so they can be reaped
+ * without I/O.  But we are potentially leaving up to five seconds'
+ * worth of inodes floating about which prune_icache wants us to
+ * write out.  One way to fix that would be to get prune_icache()
+ * to do a write_super() to free up some memory.  It has the desired
+ * effect.
+ *
+ * Returns 0 on success, or the error from ext4_reserve_inode_write() or
+ * ext4_mark_iloc_dirty().  A failure to expand the inode's extra space is
+ * not reported to the caller; it only sets EXT4_STATE_NO_EXPAND and may
+ * log a warning.
+ */
+int ext4_mark_inode_dirty(handle_t *handle, struct inode *inode)
+{
+	struct ext4_iloc iloc;
+	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
+	static unsigned int mnt_count;
+	int err, ret;
+
+	might_sleep();
+	trace_ext4_mark_inode_dirty(inode, _RET_IP_);
+	err = ext4_reserve_inode_write(handle, inode, &iloc);
+	/*
+	 * When the reservation fails iloc.bh is not usable (it is NULL after
+	 * a failed get_write_access), so the expansion code below, which
+	 * reads the raw inode through iloc, must not run.
+	 */
+	if (err)
+		return err;
+
+	if (ext4_handle_valid(handle) &&
+	    EXT4_I(inode)->i_extra_isize < sbi->s_want_extra_isize &&
+	    !ext4_test_inode_state(inode, EXT4_STATE_NO_EXPAND)) {
+		/*
+		 * We need extra buffer credits since we may write into EA block
+		 * with this same handle. If journal_extend fails, then it will
+		 * only result in a minor loss of functionality for that inode.
+		 * If this is felt to be critical, then e2fsck should be run to
+		 * force a large enough s_min_extra_isize.
+		 */
+		if ((jbd2_journal_extend(handle,
+			     EXT4_DATA_TRANS_BLOCKS(inode->i_sb))) == 0) {
+			ret = ext4_expand_extra_isize(inode,
+						      sbi->s_want_extra_isize,
+						      iloc, handle);
+			if (ret) {
+				/* Do not retry the expansion on this inode */
+				ext4_set_inode_state(inode,
+						     EXT4_STATE_NO_EXPAND);
+				/* Warn only when s_mnt_count has changed */
+				if (mnt_count !=
+					le16_to_cpu(sbi->s_es->s_mnt_count)) {
+					ext4_warning(inode->i_sb,
+					"Unable to expand inode %lu. Delete"
+					" some EAs or run e2fsck.",
+					inode->i_ino);
+					mnt_count =
+					  le16_to_cpu(sbi->s_es->s_mnt_count);
+				}
+			}
+		}
+	}
+
+	return ext4_mark_iloc_dirty(handle, inode, &iloc);
+}
+
+/*
+ * ext4_dirty_inode() is called from __mark_inode_dirty()
+ *
+ * The interesting case is a file being extended: i_size has been
+ * changed by generic_commit_write(), so the updated inode has to be
+ * included in the current transaction.
+ *
+ * dquot_alloc_block() will also always dirty the inode when blocks are
+ * allocated to the file.
+ *
+ * A synchronous inode is not honoured here - doing so would cause a
+ * commit on atime updates, which we don't bother doing.  Synchronous
+ * inodes are handled at the highest possible level.
+ *
+ * Failures (to start the handle or to mark the inode dirty) are not
+ * reported: the function returns void.
+ */
+void ext4_dirty_inode(struct inode *inode, int flags)
+{
+	handle_t *handle = ext4_journal_start(inode, 2);
+
+	if (IS_ERR(handle))
+		return;
+
+	ext4_mark_inode_dirty(handle, inode);
+	ext4_journal_stop(handle);
+}
+
+#if 0
+/*
+ * Bind an inode's backing buffer_head into this transaction, to prevent
+ * it from being flushed to disk early.  Unlike
+ * ext4_reserve_inode_write, this leaves behind no bh reference and
+ * returns no iloc structure, so the caller needs to repeat the iloc
+ * lookup to mark the inode dirty later.
+ *
+ * Compiled out (#if 0); kept for reference only.
+ */
+static int ext4_pin_inode(handle_t *handle, struct inode *inode)
+{
+	struct ext4_iloc iloc;
+	int rc = 0;
+
+	/* Without a handle there is nothing to bind the buffer to */
+	if (!handle)
+		goto report;
+
+	rc = ext4_get_inode_loc(inode, &iloc);
+	if (rc)
+		goto report;
+
+	BUFFER_TRACE(iloc.bh, "get_write_access");
+	rc = jbd2_journal_get_write_access(handle, iloc.bh);
+	if (!rc)
+		rc = ext4_handle_dirty_metadata(handle,
+						NULL,
+						iloc.bh);
+	brelse(iloc.bh);
+
+report:
+	ext4_std_error(inode->i_sb, rc);
+	return rc;
+}
+#endif
+
+/*
+ * Switch the per-inode data-journalling flag (EXT4_INODE_JOURNAL_DATA)
+ * on (@val non-zero) or off (@val zero) and mark the inode dirty.
+ *
+ * Returns 0 on success (or when the inode has no journal), -EROFS if the
+ * journal is aborted, or a negative error from allocating delalloc
+ * blocks, flushing the journal, starting the handle or dirtying the
+ * inode.
+ */
+int ext4_change_inode_journal_flag(struct inode *inode, int val)
+{
+	journal_t *journal;
+	handle_t *handle;
+	int err;
+
+	/*
+	 * We have to be very careful here: changing a data block's
+	 * journaling status dynamically is dangerous.  If we write a
+	 * data block to the journal, change the status and then delete
+	 * that block, we risk forgetting to revoke the old log record
+	 * from the journal and so a subsequent replay can corrupt data.
+	 * So, first we make sure that the journal is empty and that
+	 * nobody is changing anything.
+	 */
+
+	journal = EXT4_JOURNAL(inode);
+	if (!journal)
+		return 0;
+	if (is_journal_aborted(journal))
+		return -EROFS;
+	/* We have to allocate physical blocks for delalloc blocks
+	 * before flushing journal. otherwise delalloc blocks can not
+	 * be allocated any more. even more truncate on delalloc blocks
+	 * could trigger BUG by flushing delalloc blocks in journal.
+	 * There is no delalloc block in non-journal data mode.
+	 */
+	if (val && test_opt(inode->i_sb, DELALLOC)) {
+		err = ext4_alloc_da_blocks(inode);
+		if (err < 0)
+			return err;
+	}
+
+	jbd2_journal_lock_updates(journal);
+
+	/*
+	 * OK, there are no updates running now, and all cached data is
+	 * synced to disk.  We are now in a completely consistent state
+	 * which doesn't have anything in the journal, and we know that
+	 * no filesystem updates are running, so it is safe to modify
+	 * the inode's in-core data-journaling state flag now.
+	 */
+
+	if (val)
+		ext4_set_inode_flag(inode, EXT4_INODE_JOURNAL_DATA);
+	else {
+		/*
+		 * The flag may only be cleared once the journal really is
+		 * empty; if the flush fails, leave the flag alone and
+		 * report the error instead of silently carrying on.
+		 */
+		err = jbd2_journal_flush(journal);
+		if (err < 0) {
+			jbd2_journal_unlock_updates(journal);
+			return err;
+		}
+		ext4_clear_inode_flag(inode, EXT4_INODE_JOURNAL_DATA);
+	}
+	ext4_set_aops(inode);
+
+	jbd2_journal_unlock_updates(journal);
+
+	/* Finally we can mark the inode as dirty. */
+
+	handle = ext4_journal_start(inode, 1);
+	if (IS_ERR(handle))
+		return PTR_ERR(handle);
+
+	err = ext4_mark_inode_dirty(handle, inode);
+	ext4_handle_sync(handle);
+	ext4_journal_stop(handle);
+	ext4_std_error(inode->i_sb, err);
+
+	return err;
+}
+
+/*
+ * Callback for walk_page_buffers(): returns 1 for a buffer that is not
+ * mapped and 0 for a mapped one.  @handle is not used.
+ */
+static int ext4_bh_unmapped(handle_t *handle, struct buffer_head *bh)
+{
+	if (buffer_mapped(bh))
+		return 0;
+	return 1;
+}
+
+/*
+ * Prepare vmf->page of the file mapped by @vma for writing: make sure the
+ * buffers covering the in-file part of the page are mapped, going through
+ * __block_page_mkwrite() (with a journal handle in the non-delalloc case)
+ * when some of them are not.
+ *
+ * Returns a VM_FAULT_* code: VM_FAULT_NOPAGE if the page was truncated
+ * under us, VM_FAULT_LOCKED if every buffer was already mapped (the page
+ * is returned locked), VM_FAULT_SIGBUS if the journal handle could not be
+ * started or write access to the buffers failed, and otherwise whatever
+ * block_page_mkwrite_return() makes of the __block_page_mkwrite() result.
+ */
+int ext4_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
+{
+ struct page *page = vmf->page;
+ loff_t size;
+ unsigned long len;
+ int ret;
+ struct file *file = vma->vm_file;
+ struct inode *inode = file->f_path.dentry->d_inode;
+ struct address_space *mapping = inode->i_mapping;
+ handle_t *handle;
+ get_block_t *get_block;
+ int retries = 0;
+
+ /*
+ * This check is racy but catches the common case. We rely on
+ * __block_page_mkwrite() to do a reliable check.
+ */
+ vfs_check_frozen(inode->i_sb, SB_FREEZE_WRITE);
+ /* Delalloc case is easy... */
+ if (test_opt(inode->i_sb, DELALLOC) &&
+ !ext4_should_journal_data(inode) &&
+ !ext4_nonda_switch(inode->i_sb)) {
+ /* No handle is started here; retry on -ENOSPC while allowed */
+ do {
+ ret = __block_page_mkwrite(vma, vmf,
+ ext4_da_get_block_prep);
+ } while (ret == -ENOSPC &&
+ ext4_should_retry_alloc(inode->i_sb, &retries));
+ goto out_ret;
+ }
+
+ lock_page(page);
+ size = i_size_read(inode);
+ /* Page got truncated from under us? */
+ if (page->mapping != mapping || page_offset(page) > size) {
+ unlock_page(page);
+ ret = VM_FAULT_NOPAGE;
+ goto out;
+ }
+
+ /* Only the part of the page below i_size has to be checked */
+ if (page->index == size >> PAGE_CACHE_SHIFT)
+ len = size & ~PAGE_CACHE_MASK;
+ else
+ len = PAGE_CACHE_SIZE;
+ /*
+ * Return if we have all the buffers mapped. This avoids the need to do
+ * journal_start/journal_stop which can block and take a long time
+ */
+ if (page_has_buffers(page)) {
+ if (!walk_page_buffers(NULL, page_buffers(page), 0, len, NULL,
+ ext4_bh_unmapped)) {
+ /* Wait so that we don't change page under IO */
+ wait_on_page_writeback(page);
+ /* The page is intentionally left locked on this path */
+ ret = VM_FAULT_LOCKED;
+ goto out;
+ }
+ }
+ unlock_page(page);
+ /* OK, we need to fill the hole... */
+ if (ext4_should_dioread_nolock(inode))
+ get_block = ext4_get_block_write;
+ else
+ get_block = ext4_get_block;
+retry_alloc:
+ handle = ext4_journal_start(inode, ext4_writepage_trans_blocks(inode));
+ if (IS_ERR(handle)) {
+ ret = VM_FAULT_SIGBUS;
+ goto out;
+ }
+ ret = __block_page_mkwrite(vma, vmf, get_block);
+ /* In data-journalling mode the page's buffers need journal write access */
+ if (!ret && ext4_should_journal_data(inode)) {
+ if (walk_page_buffers(handle, page_buffers(page), 0,
+ PAGE_CACHE_SIZE, NULL, do_journal_get_write_access)) {
+ unlock_page(page);
+ ret = VM_FAULT_SIGBUS;
+ ext4_journal_stop(handle);
+ goto out;
+ }
+ ext4_set_inode_state(inode, EXT4_STATE_JDATA);
+ }
+ ext4_journal_stop(handle);
+ /* The handle is stopped before retrying; a new one is started at retry_alloc */
+ if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries))
+ goto retry_alloc;
+out_ret:
+ /* Convert the 0/-errno result into a VM_FAULT_* code */
+ ret = block_page_mkwrite_return(ret);
+out:
+ return ret;
+}
diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c
new file mode 100644
index 00000000..1365903a
--- /dev/null
+++ b/fs/ext4/ioctl.c
@@ -0,0 +1,509 @@
+/*
+ * linux/fs/ext4/ioctl.c
+ *
+ * Copyright (C) 1993, 1994, 1995
+ * Remy Card (card@masi.ibp.fr)
+ * Laboratoire MASI - Institut Blaise Pascal
+ * Universite Pierre et Marie Curie (Paris VI)
+ */
+
+#include <linux/fs.h>
+#include <linux/jbd2.h>
+#include <linux/capability.h>
+#include <linux/time.h>
+#include <linux/compat.h>
+#include <linux/mount.h>
+#include <linux/file.h>
+#include <asm/uaccess.h>
+#include "ext4_jbd2.h"
+#include "ext4.h"
+
+/*
+ * Largest value representable in 32 bits.  EXT4_IOC_RESIZE_FS rejects
+ * block counts above this on filesystems without the 64bit feature.
+ */
+#define MAX_32_NUM ((((unsigned long long) 1) << 32) - 1)
+
+/*
+ * ext4_ioctl() - handle the ext4 ioctl commands issued on @filp.
+ *
+ * @filp: open file the ioctl was issued on
+ * @cmd:  ioctl command number
+ * @arg:  command argument; the handled commands treat it as a pointer
+ *        to user memory
+ *
+ * Returns 0 on success or a negative errno on failure; -ENOTTY is
+ * returned for commands that are not handled here.
+ */
+long ext4_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
+{
+	struct inode *inode = filp->f_dentry->d_inode;
+	struct super_block *sb = inode->i_sb;
+	struct ext4_inode_info *ei = EXT4_I(inode);
+	unsigned int flags;
+
+	ext4_debug("cmd = %u, arg = %lu\n", cmd, arg);
+
+	switch (cmd) {
+	case EXT4_IOC_GETFLAGS:
+		ext4_get_inode_flags(ei);
+		flags = ei->i_flags & EXT4_FL_USER_VISIBLE;
+		return put_user(flags, (int __user *) arg);
+	case EXT4_IOC_SETFLAGS: {
+		handle_t *handle = NULL;
+		int err, migrate = 0;
+		struct ext4_iloc iloc;
+		unsigned int oldflags, mask, i;
+		unsigned int jflag;
+
+		if (!inode_owner_or_capable(inode))
+			return -EACCES;
+
+		if (get_user(flags, (int __user *) arg))
+			return -EFAULT;
+
+		err = mnt_want_write_file(filp);
+		if (err)
+			return err;
+
+		flags = ext4_mask_flags(inode->i_mode, flags);
+
+		err = -EPERM;
+		mutex_lock(&inode->i_mutex);
+		/* Is it quota file? Do not allow user to mess with it */
+		if (IS_NOQUOTA(inode))
+			goto flags_out;
+
+		oldflags = ei->i_flags;
+
+		/* The JOURNAL_DATA flag is modifiable only by root */
+		jflag = flags & EXT4_JOURNAL_DATA_FL;
+
+		/*
+		 * The IMMUTABLE and APPEND_ONLY flags can only be changed by
+		 * the relevant capability.
+		 *
+		 * This test looks nicer. Thanks to Pauline Middelink
+		 */
+		if ((flags ^ oldflags) & (EXT4_APPEND_FL | EXT4_IMMUTABLE_FL)) {
+			if (!capable(CAP_LINUX_IMMUTABLE))
+				goto flags_out;
+		}
+
+		/*
+		 * The JOURNAL_DATA flag can only be changed by
+		 * the relevant capability.
+		 */
+		if ((jflag ^ oldflags) & (EXT4_JOURNAL_DATA_FL)) {
+			if (!capable(CAP_SYS_RESOURCE))
+				goto flags_out;
+		}
+		if (oldflags & EXT4_EXTENTS_FL) {
+			/* We don't support clearing extent flags */
+			if (!(flags & EXT4_EXTENTS_FL)) {
+				err = -EOPNOTSUPP;
+				goto flags_out;
+			}
+		} else if (flags & EXT4_EXTENTS_FL) {
+			/* migrate the file */
+			migrate = 1;
+			flags &= ~EXT4_EXTENTS_FL;
+		}
+
+		if (flags & EXT4_EOFBLOCKS_FL) {
+			/* we don't support adding EOFBLOCKS flag */
+			if (!(oldflags & EXT4_EOFBLOCKS_FL)) {
+				err = -EOPNOTSUPP;
+				goto flags_out;
+			}
+		} else if (oldflags & EXT4_EOFBLOCKS_FL)
+			ext4_truncate(inode);
+
+		handle = ext4_journal_start(inode, 1);
+		if (IS_ERR(handle)) {
+			err = PTR_ERR(handle);
+			goto flags_out;
+		}
+		if (IS_SYNC(inode))
+			ext4_handle_sync(handle);
+		err = ext4_reserve_inode_write(handle, inode, &iloc);
+		if (err)
+			goto flags_err;
+
+		/* Copy each user-modifiable bit of @flags into the inode */
+		for (i = 0, mask = 1; i < 32; i++, mask <<= 1) {
+			if (!(mask & EXT4_FL_USER_MODIFIABLE))
+				continue;
+			if (mask & flags)
+				ext4_set_inode_flag(inode, i);
+			else
+				ext4_clear_inode_flag(inode, i);
+		}
+
+		ext4_set_inode_flags(inode);
+		inode->i_ctime = ext4_current_time(inode);
+
+		err = ext4_mark_iloc_dirty(handle, inode, &iloc);
+flags_err:
+		ext4_journal_stop(handle);
+		if (err)
+			goto flags_out;
+
+		if ((jflag ^ oldflags) & (EXT4_JOURNAL_DATA_FL))
+			err = ext4_change_inode_journal_flag(inode, jflag);
+		if (err)
+			goto flags_out;
+		if (migrate)
+			err = ext4_ext_migrate(inode);
+flags_out:
+		mutex_unlock(&inode->i_mutex);
+		mnt_drop_write_file(filp);
+		return err;
+	}
+	case EXT4_IOC_GETVERSION:
+	case EXT4_IOC_GETVERSION_OLD:
+		return put_user(inode->i_generation, (int __user *) arg);
+	case EXT4_IOC_SETVERSION:
+	case EXT4_IOC_SETVERSION_OLD: {
+		handle_t *handle;
+		struct ext4_iloc iloc;
+		__u32 generation;
+		int err;
+
+		if (!inode_owner_or_capable(inode))
+			return -EPERM;
+
+		err = mnt_want_write_file(filp);
+		if (err)
+			return err;
+		if (get_user(generation, (int __user *) arg)) {
+			err = -EFAULT;
+			goto setversion_out;
+		}
+
+		mutex_lock(&inode->i_mutex);
+		handle = ext4_journal_start(inode, 1);
+		if (IS_ERR(handle)) {
+			err = PTR_ERR(handle);
+			goto unlock_out;
+		}
+		err = ext4_reserve_inode_write(handle, inode, &iloc);
+		if (err == 0) {
+			inode->i_ctime = ext4_current_time(inode);
+			inode->i_generation = generation;
+			err = ext4_mark_iloc_dirty(handle, inode, &iloc);
+		}
+		ext4_journal_stop(handle);
+
+unlock_out:
+		mutex_unlock(&inode->i_mutex);
+setversion_out:
+		mnt_drop_write_file(filp);
+		return err;
+	}
+	case EXT4_IOC_GROUP_EXTEND: {
+		ext4_fsblk_t n_blocks_count;
+		int err, err2=0;
+
+		err = ext4_resize_begin(sb);
+		if (err)
+			return err;
+
+		if (get_user(n_blocks_count, (__u32 __user *)arg)) {
+			err = -EFAULT;
+			goto group_extend_out;
+		}
+
+		if (EXT4_HAS_RO_COMPAT_FEATURE(sb,
+			       EXT4_FEATURE_RO_COMPAT_BIGALLOC)) {
+			ext4_msg(sb, KERN_ERR,
+				 "Online resizing not supported with bigalloc");
+			err = -EOPNOTSUPP;
+			goto group_extend_out;
+		}
+
+		err = mnt_want_write_file(filp);
+		if (err)
+			goto group_extend_out;
+
+		err = ext4_group_extend(sb, EXT4_SB(sb)->s_es, n_blocks_count);
+		/* Flush the journal even if the extend failed */
+		if (EXT4_SB(sb)->s_journal) {
+			jbd2_journal_lock_updates(EXT4_SB(sb)->s_journal);
+			err2 = jbd2_journal_flush(EXT4_SB(sb)->s_journal);
+			jbd2_journal_unlock_updates(EXT4_SB(sb)->s_journal);
+		}
+		if (err == 0)
+			err = err2;
+		mnt_drop_write_file(filp);
+group_extend_out:
+		ext4_resize_end(sb);
+		return err;
+	}
+
+	case EXT4_IOC_MOVE_EXT: {
+		struct move_extent me;
+		struct file *donor_filp;
+		int err;
+
+		if (!(filp->f_mode & FMODE_READ) ||
+		    !(filp->f_mode & FMODE_WRITE))
+			return -EBADF;
+
+		if (copy_from_user(&me,
+			(struct move_extent __user *)arg, sizeof(me)))
+			return -EFAULT;
+		me.moved_len = 0;
+
+		donor_filp = fget(me.donor_fd);
+		if (!donor_filp)
+			return -EBADF;
+
+		if (!(donor_filp->f_mode & FMODE_WRITE)) {
+			err = -EBADF;
+			goto mext_out;
+		}
+
+		if (EXT4_HAS_RO_COMPAT_FEATURE(sb,
+			       EXT4_FEATURE_RO_COMPAT_BIGALLOC)) {
+			ext4_msg(sb, KERN_ERR,
+				 "Online defrag not supported with bigalloc");
+			/*
+			 * donor_filp is already held here, so leave through
+			 * mext_out to drop it instead of returning directly.
+			 */
+			err = -EOPNOTSUPP;
+			goto mext_out;
+		}
+
+		err = mnt_want_write_file(filp);
+		if (err)
+			goto mext_out;
+
+		err = ext4_move_extents(filp, donor_filp, me.orig_start,
+					me.donor_start, me.len, &me.moved_len);
+		/* Exactly one drop, pairing with mnt_want_write_file() above */
+		mnt_drop_write_file(filp);
+
+		if (copy_to_user((struct move_extent __user *)arg,
+				 &me, sizeof(me)))
+			err = -EFAULT;
+mext_out:
+		fput(donor_filp);
+		return err;
+	}
+
+	case EXT4_IOC_GROUP_ADD: {
+		struct ext4_new_group_data input;
+		int err, err2=0;
+
+		err = ext4_resize_begin(sb);
+		if (err)
+			return err;
+
+		if (copy_from_user(&input, (struct ext4_new_group_input __user *)arg,
+				sizeof(input))) {
+			err = -EFAULT;
+			goto group_add_out;
+		}
+
+		if (EXT4_HAS_RO_COMPAT_FEATURE(sb,
+			       EXT4_FEATURE_RO_COMPAT_BIGALLOC)) {
+			ext4_msg(sb, KERN_ERR,
+				 "Online resizing not supported with bigalloc");
+			err = -EOPNOTSUPP;
+			goto group_add_out;
+		}
+
+		err = mnt_want_write_file(filp);
+		if (err)
+			goto group_add_out;
+
+		err = ext4_group_add(sb, &input);
+		/* Flush the journal even if adding the group failed */
+		if (EXT4_SB(sb)->s_journal) {
+			jbd2_journal_lock_updates(EXT4_SB(sb)->s_journal);
+			err2 = jbd2_journal_flush(EXT4_SB(sb)->s_journal);
+			jbd2_journal_unlock_updates(EXT4_SB(sb)->s_journal);
+		}
+		if (err == 0)
+			err = err2;
+		mnt_drop_write_file(filp);
+group_add_out:
+		ext4_resize_end(sb);
+		return err;
+	}
+
+	case EXT4_IOC_MIGRATE:
+	{
+		int err;
+		if (!inode_owner_or_capable(inode))
+			return -EACCES;
+
+		err = mnt_want_write_file(filp);
+		if (err)
+			return err;
+		/*
+		 * inode_mutex prevent write and truncate on the file.
+		 * Read still goes through. We take i_data_sem in
+		 * ext4_ext_swap_inode_data before we switch the
+		 * inode format to prevent read.
+		 */
+		mutex_lock(&(inode->i_mutex));
+		err = ext4_ext_migrate(inode);
+		mutex_unlock(&(inode->i_mutex));
+		mnt_drop_write_file(filp);
+		return err;
+	}
+
+	case EXT4_IOC_ALLOC_DA_BLKS:
+	{
+		int err;
+		if (!inode_owner_or_capable(inode))
+			return -EACCES;
+
+		err = mnt_want_write_file(filp);
+		if (err)
+			return err;
+		err = ext4_alloc_da_blocks(inode);
+		mnt_drop_write_file(filp);
+		return err;
+	}
+
+	case EXT4_IOC_RESIZE_FS: {
+		ext4_fsblk_t n_blocks_count;
+		struct super_block *sb = inode->i_sb;
+		int err = 0, err2 = 0;
+
+		if (EXT4_HAS_RO_COMPAT_FEATURE(sb,
+			       EXT4_FEATURE_RO_COMPAT_BIGALLOC)) {
+			ext4_msg(sb, KERN_ERR,
+				 "Online resizing not (yet) supported with bigalloc");
+			return -EOPNOTSUPP;
+		}
+
+		if (EXT4_HAS_INCOMPAT_FEATURE(sb,
+			       EXT4_FEATURE_INCOMPAT_META_BG)) {
+			ext4_msg(sb, KERN_ERR,
+				 "Online resizing not (yet) supported with meta_bg");
+			return -EOPNOTSUPP;
+		}
+
+		if (copy_from_user(&n_blocks_count, (__u64 __user *)arg,
+				   sizeof(__u64))) {
+			return -EFAULT;
+		}
+
+		if (n_blocks_count > MAX_32_NUM &&
+		    !EXT4_HAS_INCOMPAT_FEATURE(sb,
+					       EXT4_FEATURE_INCOMPAT_64BIT)) {
+			ext4_msg(sb, KERN_ERR,
+				 "File system only supports 32-bit block numbers");
+			return -EOPNOTSUPP;
+		}
+
+		err = ext4_resize_begin(sb);
+		if (err)
+			return err;
+
+		err = mnt_want_write(filp->f_path.mnt);
+		if (err)
+			goto resizefs_out;
+
+		err = ext4_resize_fs(sb, n_blocks_count);
+		/* Flush the journal even if the resize failed */
+		if (EXT4_SB(sb)->s_journal) {
+			jbd2_journal_lock_updates(EXT4_SB(sb)->s_journal);
+			err2 = jbd2_journal_flush(EXT4_SB(sb)->s_journal);
+			jbd2_journal_unlock_updates(EXT4_SB(sb)->s_journal);
+		}
+		if (err == 0)
+			err = err2;
+		mnt_drop_write(filp->f_path.mnt);
+resizefs_out:
+		ext4_resize_end(sb);
+		return err;
+	}
+
+	case FITRIM:
+	{
+		struct request_queue *q = bdev_get_queue(sb->s_bdev);
+		struct fstrim_range range;
+		int ret = 0;
+
+		if (!capable(CAP_SYS_ADMIN))
+			return -EPERM;
+
+		if (!blk_queue_discard(q))
+			return -EOPNOTSUPP;
+
+		if (EXT4_HAS_RO_COMPAT_FEATURE(sb,
+			       EXT4_FEATURE_RO_COMPAT_BIGALLOC)) {
+			ext4_msg(sb, KERN_ERR,
+				 "FITRIM not supported with bigalloc");
+			return -EOPNOTSUPP;
+		}
+
+		if (copy_from_user(&range, (struct fstrim_range __user *)arg,
+		    sizeof(range)))
+			return -EFAULT;
+
+		/* Never trim below the queue's discard granularity */
+		range.minlen = max((unsigned int)range.minlen,
+				   q->limits.discard_granularity);
+		ret = ext4_trim_fs(sb, &range);
+		if (ret < 0)
+			return ret;
+
+		if (copy_to_user((struct fstrim_range __user *)arg, &range,
+		    sizeof(range)))
+			return -EFAULT;
+
+		return 0;
+	}
+
+	default:
+		return -ENOTTY;
+	}
+}
+
+#ifdef CONFIG_COMPAT
+/*
+ * EXT4_IOC32_GROUP_ADD: copy the compat layout field by field into a
+ * native struct ext4_new_group_input, then call ext4_ioctl() on that
+ * kernel copy with the address limit temporarily set to KERNEL_DS.
+ */
+static long ext4_compat_group_add(struct file *file, unsigned long arg)
+{
+	struct compat_ext4_new_group_input __user *uinput = compat_ptr(arg);
+	struct ext4_new_group_input input;
+	mm_segment_t saved_fs;
+	int err;
+
+	/* Every field is fetched; any failure yields -EFAULT */
+	err = get_user(input.group, &uinput->group);
+	err |= get_user(input.block_bitmap, &uinput->block_bitmap);
+	err |= get_user(input.inode_bitmap, &uinput->inode_bitmap);
+	err |= get_user(input.inode_table, &uinput->inode_table);
+	err |= get_user(input.blocks_count, &uinput->blocks_count);
+	err |= get_user(input.reserved_blocks,
+			&uinput->reserved_blocks);
+	if (err)
+		return -EFAULT;
+
+	saved_fs = get_fs();
+	set_fs(KERNEL_DS);
+	err = ext4_ioctl(file, EXT4_IOC_GROUP_ADD,
+			 (unsigned long) &input);
+	set_fs(saved_fs);
+	return err;
+}
+
+/*
+ * 32-bit compat entry point: translate the EXT4_IOC32_* command numbers
+ * to their native counterparts and forward to ext4_ioctl().  Commands
+ * not listed here get -ENOIOCTLCMD.
+ */
+long ext4_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
+{
+	switch (cmd) {
+	/* Misnamed commands: they really get/put an int from/to user space */
+	case EXT4_IOC32_GETFLAGS:
+		cmd = EXT4_IOC_GETFLAGS;
+		break;
+	case EXT4_IOC32_SETFLAGS:
+		cmd = EXT4_IOC_SETFLAGS;
+		break;
+	case EXT4_IOC32_GETVERSION:
+		cmd = EXT4_IOC_GETVERSION;
+		break;
+	case EXT4_IOC32_SETVERSION:
+		cmd = EXT4_IOC_SETVERSION;
+		break;
+	case EXT4_IOC32_GROUP_EXTEND:
+		cmd = EXT4_IOC_GROUP_EXTEND;
+		break;
+	case EXT4_IOC32_GETVERSION_OLD:
+		cmd = EXT4_IOC_GETVERSION_OLD;
+		break;
+	case EXT4_IOC32_SETVERSION_OLD:
+		cmd = EXT4_IOC_SETVERSION_OLD;
+		break;
+	case EXT4_IOC32_GETRSVSZ:
+		cmd = EXT4_IOC_GETRSVSZ;
+		break;
+	case EXT4_IOC32_SETRSVSZ:
+		cmd = EXT4_IOC_SETRSVSZ;
+		break;
+
+	/* Needs its argument structure converted, not just renumbered */
+	case EXT4_IOC32_GROUP_ADD:
+		return ext4_compat_group_add(file, arg);
+
+	/* Passed through with the command number unchanged */
+	case EXT4_IOC_MOVE_EXT:
+	case FITRIM:
+	case EXT4_IOC_RESIZE_FS:
+		break;
+
+	default:
+		return -ENOIOCTLCMD;
+	}
+
+	return ext4_ioctl(file, cmd, (unsigned long) compat_ptr(arg));
+}
+#endif
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
new file mode 100644
index 00000000..6b0a57ea
--- /dev/null
+++ b/fs/ext4/mballoc.c
@@ -0,0 +1,5047 @@
+/*
+ * Copyright (c) 2003-2006, Cluster File Systems, Inc, info@clusterfs.com
+ * Written by Alex Tomas <alex@clusterfs.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+
+
+/*
+ * mballoc.c contains the multiblocks allocation routines
+ */
+
+#include "ext4_jbd2.h"
+#include "mballoc.h"
+#include <linux/debugfs.h>
+#include <linux/slab.h>
+#include <trace/events/ext4.h>
+
+/*
+ * MUSTDO:
+ * - test ext4_ext_search_left() and ext4_ext_search_right()
+ * - search for metadata in few groups
+ *
+ * TODO v4:
+ * - normalization should take into account whether file is still open
+ * - discard preallocations if no free space left (policy?)
+ * - don't normalize tails
+ * - quota
+ * - reservation for superuser
+ *
+ * TODO v3:
+ * - bitmap read-ahead (proposed by Oleg Drokin aka green)
+ * - track min/max extents in each group for better group selection
+ * - mb_mark_used() may allocate chunk right after splitting buddy
+ * - tree of groups sorted by number of free blocks
+ * - error handling
+ */
+
+/*
+ * An allocation request asks for a number of blocks
+ * near the goal (block) value specified.
+ *
+ * During initialization phase of the allocator we decide to use the
+ * group preallocation or inode preallocation depending on the size of
+ * the file. The size of the file could be the resulting file size we
+ * would have after allocation, or the current file size, which ever
+ * is larger. If the size is less than sbi->s_mb_stream_request we
+ * select to use the group preallocation. The default value of
+ * s_mb_stream_request is 16 blocks. This can also be tuned via
+ * /sys/fs/ext4/<partition>/mb_stream_req. The value is represented in
+ * terms of number of blocks.
+ *
+ * The main motivation for having small file use group preallocation is to
+ * ensure that we have small files closer together on the disk.
+ *
+ * First stage the allocator looks at the inode prealloc list,
+ * ext4_inode_info->i_prealloc_list, which contains list of prealloc
+ * spaces for this particular inode. The inode prealloc space is
+ * represented as:
+ *
+ * pa_lstart -> the logical start block for this prealloc space
+ * pa_pstart -> the physical start block for this prealloc space
+ * pa_len -> length for this prealloc space (in clusters)
+ * pa_free -> free space available in this prealloc space (in clusters)
+ *
+ * The inode preallocation space is used looking at the _logical_ start
+ * block. If only the logical file block falls within the range of prealloc
+ * space we will consume the particular prealloc space. This makes sure that
+ * we have contiguous physical blocks representing the file blocks
+ *
+ * The important thing to be noted in case of inode prealloc space is that
+ * we don't modify the values associated to inode prealloc space except
+ * pa_free.
+ *
+ * If we are not able to find blocks in the inode prealloc space and if we
+ * have the group allocation flag set then we look at the locality group
+ * prealloc space. These are per CPU prealloc list represented as
+ *
+ * ext4_sb_info.s_locality_groups[smp_processor_id()]
+ *
+ * The reason for having a per cpu locality group is to reduce the contention
+ * between CPUs. It is possible to get scheduled at this point.
+ *
+ * The locality group prealloc space is used looking at whether we have
+ * enough free space (pa_free) within the prealloc space.
+ *
+ * If we can't allocate blocks via inode prealloc or/and locality group
+ * prealloc then we look at the buddy cache. The buddy cache is represented
+ * by ext4_sb_info.s_buddy_cache (struct inode) whose file offset gets
+ * mapped to the buddy and bitmap information regarding different
+ * groups. The buddy information is attached to buddy cache inode so that
+ * we can access them through the page cache. The information regarding
+ * each group is loaded via ext4_mb_load_buddy. The information involve
+ * block bitmap and buddy information. The information are stored in the
+ * inode as:
+ *
+ * { page }
+ * [ group 0 bitmap][ group 0 buddy] [group 1][ group 1]...
+ *
+ *
+ * one block each for bitmap and buddy information. So for each group we
+ * take up 2 blocks. A page can contain blocks_per_page (PAGE_CACHE_SIZE /
+ * blocksize) blocks. So it can have information regarding groups_per_page
+ * which is blocks_per_page/2
+ *
+ * The buddy cache inode is not stored on disk. The inode is thrown
+ * away when the filesystem is unmounted.
+ *
+ * We look for count number of blocks in the buddy cache. If we were able
+ * to locate that many free blocks we return with additional information
+ * regarding rest of the contiguous physical block available
+ *
+ * Before allocating blocks via buddy cache we normalize the request
+ * blocks. This ensures we ask for more blocks than we needed. The extra
+ * blocks that we get after allocation is added to the respective prealloc
+ * list. In case of inode preallocation we follow a list of heuristics
+ * based on file size. This can be found in ext4_mb_normalize_request. If
+ * we are doing a group prealloc we try to normalize the request to
+ * sbi->s_mb_group_prealloc. The default value of s_mb_group_prealloc is
+ * dependent on the cluster size; for non-bigalloc file systems, it is
+ * 512 blocks. This can be tuned via
+ * /sys/fs/ext4/<partition>/mb_group_prealloc. The value is represented in
+ * terms of number of blocks. If we have mounted the file system with the
+ * -o stripe=<value> option, the group prealloc request is normalized to
+ * the smallest multiple of the stripe value (sbi->s_stripe) which is
+ * greater than the default mb_group_prealloc.
+ *
+ * The regular allocator (using the buddy cache) supports a few tunables.
+ *
+ * /sys/fs/ext4/<partition>/mb_min_to_scan
+ * /sys/fs/ext4/<partition>/mb_max_to_scan
+ * /sys/fs/ext4/<partition>/mb_order2_req
+ *
+ * The regular allocator uses buddy scan only if the request len is power of
+ * 2 blocks and the order of allocation is >= sbi->s_mb_order2_reqs. The
+ * value of s_mb_order2_reqs can be tuned via
+ * /sys/fs/ext4/<partition>/mb_order2_req. If the request len is equal to
+ * stripe size (sbi->s_stripe), we try to search for contiguous block in
+ * stripe size. This should result in better allocation on RAID setups. If
+ * not, we search in the specific group using bitmap for best extents. The
+ * tunable min_to_scan and max_to_scan control the behaviour here.
+ * min_to_scan indicate how long the mballoc __must__ look for a best
+ * extent and max_to_scan indicates how long the mballoc __can__ look for a
+ * best extent in the found extents. Searching for the blocks starts with
+ * the group specified as the goal value in allocation context via
+ * ac_g_ex. Each group is first checked based on the criteria whether it
+ * can be used for allocation. ext4_mb_good_group explains how the groups are
+ * checked.
+ *
+ * Both the prealloc space are getting populated as above. So for the first
+ * request we will hit the buddy cache which will result in this prealloc
+ * space getting filled. The prealloc space is then later used for the
+ * subsequent request.
+ */
+
+/*
+ * mballoc operates on the following data:
+ * - on-disk bitmap
+ * - in-core buddy (actually includes buddy and bitmap)
+ * - preallocation descriptors (PAs)
+ *
+ * there are two types of preallocations:
+ * - inode
+ * assigned to specific inode and can be used for this inode only.
+ * it describes part of inode's space preallocated to specific
+ * physical blocks. any block from that preallocated can be used
+ * independent. the descriptor just tracks number of blocks left
+ * unused. so, before taking some block from descriptor, one must
+ * make sure corresponded logical block isn't allocated yet. this
+ * also means that freeing any block within descriptor's range
+ * must discard all preallocated blocks.
+ * - locality group
+ * assigned to specific locality group which does not translate to
+ * permanent set of inodes: inode can join and leave group. space
+ * from this type of preallocation can be used for any inode. thus
+ * it's consumed from the beginning to the end.
+ *
+ * relation between them can be expressed as:
+ * in-core buddy = on-disk bitmap + preallocation descriptors
+ *
+ * this means the blocks mballoc considers used are:
+ * - allocated blocks (persistent)
+ * - preallocated blocks (non-persistent)
+ *
+ * consistency in mballoc world means that at any time a block is either
+ * free or used in ALL structures. notice: "any time" should not be read
+ * literally -- time is discrete and delimited by locks.
+ *
+ * to keep it simple, we don't use block numbers, instead we count number of
+ * blocks: how many blocks marked used/free in on-disk bitmap, buddy and PA.
+ *
+ * all operations can be expressed as:
+ * - init buddy: buddy = on-disk + PAs
+ * - new PA: buddy += N; PA = N
+ * - use inode PA: on-disk += N; PA -= N
+ * - discard inode PA buddy -= on-disk - PA; PA = 0
+ * - use locality group PA on-disk += N; PA -= N
+ * - discard locality group PA buddy -= PA; PA = 0
+ * note: 'buddy -= on-disk - PA' is used to show that on-disk bitmap
+ * is used in real operation because we can't know actual used
+ * bits from PA, only from on-disk bitmap
+ *
+ * if we follow this strict logic, then all operations above should be atomic.
+ * given some of them can block, we'd have to use something like semaphores
+ * killing performance on high-end SMP hardware. let's try to relax it using
+ * the following knowledge:
+ * 1) if buddy is referenced, it's already initialized
+ * 2) while block is used in buddy and the buddy is referenced,
+ * nobody can re-allocate that block
+ * 3) we work on bitmaps and '+' actually means 'set bits'. if on-disk has
+ * bit set and PA claims same block, it's OK. IOW, one can set bit in
+ * on-disk bitmap if buddy has same bit set or/and PA covers corresponded
+ * block
+ *
+ * so, now we're building a concurrency table:
+ * - init buddy vs.
+ * - new PA
+ * blocks for PA are allocated in the buddy, buddy must be referenced
+ * until PA is linked to allocation group to avoid concurrent buddy init
+ * - use inode PA
+ * we need to make sure that either on-disk bitmap or PA has uptodate data
+ * given (3) we care that PA-=N operation doesn't interfere with init
+ * - discard inode PA
+ * the simplest way would be to have buddy initialized by the discard
+ * - use locality group PA
+ * again PA-=N must be serialized with init
+ * - discard locality group PA
+ * the simplest way would be to have buddy initialized by the discard
+ * - new PA vs.
+ * - use inode PA
+ * i_data_sem serializes them
+ * - discard inode PA
+ * discard process must wait until PA isn't used by another process
+ * - use locality group PA
+ * some mutex should serialize them
+ * - discard locality group PA
+ * discard process must wait until PA isn't used by another process
+ * - use inode PA
+ * - use inode PA
+ * i_data_sem or another mutex should serialize them
+ * - discard inode PA
+ * discard process must wait until PA isn't used by another process
+ * - use locality group PA
+ * nothing wrong here -- they're different PAs covering different blocks
+ * - discard locality group PA
+ * discard process must wait until PA isn't used by another process
+ *
+ * now we're ready to draw a few conclusions:
+ * - PA is referenced and while it is no discard is possible
+ * - PA is referenced until block isn't marked in on-disk bitmap
+ * - PA changes only after on-disk bitmap
+ * - discard must not compete with init. either init is done before
+ * any discard or they're serialized somehow
+ * - buddy init as sum of on-disk bitmap and PAs is done atomically
+ *
+ * a special case when we've used PA to emptiness. no need to modify buddy
+ * in this case, but we should care about concurrent init
+ *
+ */
+
+ /*
+ * Logic in few words:
+ *
+ * - allocation:
+ * load group
+ * find blocks
+ * mark bits in on-disk bitmap
+ * release group
+ *
+ * - use preallocation:
+ * find proper PA (per-inode or group)
+ * load group
+ * mark bits in on-disk bitmap
+ * release group
+ * release PA
+ *
+ * - free:
+ * load group
+ * mark bits in on-disk bitmap
+ * release group
+ *
+ * - discard preallocations in group:
+ * mark PAs deleted
+ * move them onto local list
+ * load on-disk bitmap
+ * load group
+ * remove PA from object (inode or locality group)
+ * mark free blocks in-core
+ *
+ * - discard inode's preallocations:
+ */
+
+/*
+ * Locking rules
+ *
+ * Locks:
+ * - bitlock on a group (group)
+ * - object (inode/locality) (object)
+ * - per-pa lock (pa)
+ *
+ * Paths:
+ * - new pa
+ * object
+ * group
+ *
+ * - find and use pa:
+ * pa
+ *
+ * - release consumed pa:
+ * pa
+ * group
+ * object
+ *
+ * - generate in-core bitmap:
+ * group
+ * pa
+ *
+ * - discard all for given object (inode, locality group):
+ * object
+ * pa
+ * group
+ *
+ * - discard all for given group:
+ * group
+ * pa
+ * group
+ * object
+ *
+ */
+/*
+ * Slab caches used by mballoc.  NOTE(review): going by the names these back
+ * preallocation spaces, allocation contexts and free-data entries
+ * respectively -- confirm against the kmem_cache_create() call sites.
+ */
+static struct kmem_cache *ext4_pspace_cachep;
+static struct kmem_cache *ext4_ac_cachep;
+static struct kmem_cache *ext4_free_data_cachep;
+
+/* We create slab caches for groupinfo data structures based on the
+ * superblock block size. There will be one per mounted filesystem for
+ * each unique s_blocksize_bits */
+#define NR_GRPINFO_CACHES 8
+static struct kmem_cache *ext4_groupinfo_caches[NR_GRPINFO_CACHES];
+
+/*
+ * Slab names, one per slot of ext4_groupinfo_caches[], covering block sizes
+ * from 1k up to 128k.  NOTE(review): the slot index is presumably derived
+ * from s_blocksize_bits with 1k at index 0 -- confirm where the caches are
+ * created.
+ */
+static const char *ext4_groupinfo_slab_names[NR_GRPINFO_CACHES] = {
+ "ext4_groupinfo_1k", "ext4_groupinfo_2k", "ext4_groupinfo_4k",
+ "ext4_groupinfo_8k", "ext4_groupinfo_16k", "ext4_groupinfo_32k",
+ "ext4_groupinfo_64k", "ext4_groupinfo_128k"
+};
+
+/*
+ * Forward declarations of static helpers that are used (e.g. by
+ * ext4_mb_init_cache()) before their definitions appear.
+ */
+static void ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap,
+ ext4_group_t group);
+static void ext4_mb_generate_from_freelist(struct super_block *sb, void *bitmap,
+ ext4_group_t group);
+static void ext4_free_data_callback(struct super_block *sb,
+ struct ext4_journal_cb_entry *jce, int rc);
+/*
+ * Round @addr down to a multiple of the size of an unsigned long and
+ * compensate by adding eight times the number of bytes dropped to *@bit.
+ * Returns the rounded-down address.
+ */
+static inline void *mb_correct_addr_and_bit(int *bit, void *addr)
+{
+#if BITS_PER_LONG != 64 && BITS_PER_LONG != 32
+#error "how many bits you are?!"
+#endif
+	/* 7 on 64-bit, 3 on 32-bit: the low address bits inside one long */
+	const unsigned long low_mask = BITS_PER_LONG / 8 - 1;
+	unsigned long raw = (unsigned long) addr;
+
+	*bit += (raw & low_mask) << 3;
+	return (void *) (raw & ~low_mask);
+}
+
+/*
+ * Test bit @bit of the bitmap at @addr.
+ *
+ * ext4_test_bit on architecture like powerpc needs unsigned long aligned
+ * address, so the address is aligned (and @bit adjusted to match) first.
+ */
+static inline int mb_test_bit(int bit, void *addr)
+{
+	void *aligned = mb_correct_addr_and_bit(&bit, addr);
+
+	return ext4_test_bit(bit, aligned);
+}
+
+/* Set bit @bit of the bitmap at @addr, aligning the address first. */
+static inline void mb_set_bit(int bit, void *addr)
+{
+	void *aligned = mb_correct_addr_and_bit(&bit, addr);
+
+	ext4_set_bit(bit, aligned);
+}
+
+/* Clear bit @bit of the bitmap at @addr, aligning the address first. */
+static inline void mb_clear_bit(int bit, void *addr)
+{
+	void *aligned = mb_correct_addr_and_bit(&bit, addr);
+
+	ext4_clear_bit(bit, aligned);
+}
+
+/*
+ * Find the next zero bit at or after @start in the bitmap at @addr, looking
+ * at no more than @max bits.  The address is aligned first; the bit offset
+ * introduced by that is added to the search bounds and subtracted from the
+ * result again.  The return value is clamped to @max.
+ */
+static inline int mb_find_next_zero_bit(void *addr, int max, int start)
+{
+	int shift = 0;
+	int found;
+
+	addr = mb_correct_addr_and_bit(&shift, addr);
+	found = ext4_find_next_zero_bit(addr, max + shift, start + shift) - shift;
+
+	return found > max ? max : found;
+}
+
+/*
+ * Find the next set bit at or after @start in the bitmap at @addr, looking
+ * at no more than @max bits.  The address is aligned first; the bit offset
+ * introduced by that is added to the search bounds and subtracted from the
+ * result again.  The return value is clamped to @max.
+ */
+static inline int mb_find_next_bit(void *addr, int max, int start)
+{
+	int shift = 0;
+	int found;
+
+	addr = mb_correct_addr_and_bit(&shift, addr);
+	found = ext4_find_next_bit(addr, max + shift, start + shift) - shift;
+
+	return found > max ? max : found;
+}
+
+/*
+ * Return the bitmap describing chunks of the given @order for the group
+ * loaded in @e4b and store the number of bits it holds in *@max.
+ *
+ * Order 0 is the block bitmap itself (bd_bitmap); higher orders live inside
+ * bd_buddy at the per-superblock offset s_mb_offsets[order].  An @order
+ * above bd_blkbits + 1 yields NULL with *@max set to 0.
+ */
+static void *mb_find_buddy(struct ext4_buddy *e4b, int order, int *max)
+{
+	struct ext4_sb_info *sbi;
+
+	BUG_ON(e4b->bd_bitmap == e4b->bd_buddy);
+	BUG_ON(max == NULL);
+
+	if (order > e4b->bd_blkbits + 1) {
+		*max = 0;
+		return NULL;
+	}
+
+	/* at order 0 we see each particular block */
+	if (order == 0) {
+		*max = 1 << (e4b->bd_blkbits + 3);
+		return e4b->bd_bitmap;
+	}
+
+	sbi = EXT4_SB(e4b->bd_sb);
+	*max = sbi->s_mb_maxs[order];
+
+	return (char *) e4b->bd_buddy + sbi->s_mb_offsets[order];
+}
+
+#ifdef DOUBLE_CHECK
+/*
+ * DOUBLE_CHECK helper: clear @count bits starting at @first in the group's
+ * shadow bitmap (bd_info->bb_bitmap).  A bit that is already clear is
+ * reported through ext4_grp_locked_error() as a double free; @inode, when
+ * non-NULL, supplies the inode number for that report.
+ *
+ * Does nothing if the shadow bitmap is absent.  The group lock must be held.
+ */
+static void mb_free_blocks_double(struct inode *inode, struct ext4_buddy *e4b,
+				  int first, int count)
+{
+	struct super_block *sb = e4b->bd_sb;
+	int bit;
+
+	if (unlikely(e4b->bd_info->bb_bitmap == NULL))
+		return;
+	assert_spin_locked(ext4_group_lock_ptr(sb, e4b->bd_group));
+
+	for (bit = first; bit < first + count; bit++) {
+		if (!mb_test_bit(bit, e4b->bd_info->bb_bitmap)) {
+			ext4_fsblk_t blocknr;
+
+			/* translate the in-group bit into a filesystem block */
+			blocknr = ext4_group_first_block_no(sb, e4b->bd_group);
+			blocknr += EXT4_C2B(EXT4_SB(sb), bit);
+			ext4_grp_locked_error(sb, e4b->bd_group,
+					      inode ? inode->i_ino : 0,
+					      blocknr,
+					      "freeing block already freed "
+					      "(bit %u)",
+					      bit);
+		}
+		mb_clear_bit(bit, e4b->bd_info->bb_bitmap);
+	}
+}
+
+/*
+ * DOUBLE_CHECK helper: set @count bits starting at @first in the group's
+ * shadow bitmap (bd_info->bb_bitmap), BUG()ing if any of them is already
+ * set.
+ *
+ * Does nothing if the shadow bitmap is absent.  The group lock must be held.
+ */
+static void mb_mark_used_double(struct ext4_buddy *e4b, int first, int count)
+{
+	int bit;
+
+	if (unlikely(e4b->bd_info->bb_bitmap == NULL))
+		return;
+	assert_spin_locked(ext4_group_lock_ptr(e4b->bd_sb, e4b->bd_group));
+
+	for (bit = first; bit < first + count; bit++) {
+		BUG_ON(mb_test_bit(bit, e4b->bd_info->bb_bitmap));
+		mb_set_bit(bit, e4b->bd_info->bb_bitmap);
+	}
+}
+
+/*
+ * DOUBLE_CHECK helper: compare the group's shadow bitmap
+ * (bd_info->bb_bitmap) with @bitmap over one filesystem block.  The first
+ * differing byte is logged and then BUG() is raised.
+ *
+ * Does nothing if the shadow bitmap is absent.
+ */
+static void mb_cmp_bitmaps(struct ext4_buddy *e4b, void *bitmap)
+{
+	/*
+	 * Without a shadow copy there is nothing to compare against, and
+	 * memcmp() must not be handed a NULL pointer.  mb_free_blocks_double()
+	 * and mb_mark_used_double() bail out the same way.
+	 */
+	if (unlikely(e4b->bd_info->bb_bitmap == NULL))
+		return;
+
+	if (memcmp(e4b->bd_info->bb_bitmap, bitmap, e4b->bd_sb->s_blocksize)) {
+		unsigned char *b1, *b2;
+		int i;
+		b1 = (unsigned char *) e4b->bd_info->bb_bitmap;
+		b2 = (unsigned char *) bitmap;
+		/* locate the first mismatching byte for the report */
+		for (i = 0; i < e4b->bd_sb->s_blocksize; i++) {
+			if (b1[i] != b2[i]) {
+				ext4_msg(e4b->bd_sb, KERN_ERR,
+					 "corruption in group %u "
+					 "at byte %u(%u): %x in copy != %x "
+					 "on disk/prealloc",
+					 e4b->bd_group, i, i * 8, b1[i], b2[i]);
+				BUG();
+			}
+		}
+	}
+}
+
+#else
+/* DOUBLE_CHECK is off: freeing blocks needs no shadow-bitmap bookkeeping. */
+static inline void mb_free_blocks_double(struct inode *inode,
+					 struct ext4_buddy *e4b,
+					 int first, int count)
+{
+}
+/* DOUBLE_CHECK is off: marking blocks used needs no shadow-bitmap update. */
+static inline void mb_mark_used_double(struct ext4_buddy *e4b,
+				       int first, int count)
+{
+}
+/* DOUBLE_CHECK is off: there is no shadow bitmap to compare against. */
+static inline void mb_cmp_bitmaps(struct ext4_buddy *e4b, void *bitmap)
+{
+}
+#endif
+
+#ifdef AGGRESSIVE_CHECK
+
+/*
+ * Assertion used only inside __mb_check_buddy(): on failure it prints the
+ * failed expression together with the function, file and line variables of
+ * the enclosing scope (i.e. the call site handed in by mb_check_buddy())
+ * and then BUG()s.
+ */
+#define MB_CHECK_ASSERT(assert) \
+do { \
+ if (!(assert)) { \
+ printk(KERN_EMERG \
+ "Assertion failure in %s() at %s:%d: \"%s\"\n", \
+ function, file, line, # assert); \
+ BUG(); \
+ } \
+} while (0)
+
+/*
+ * AGGRESSIVE_CHECK consistency check of the buddy data loaded in @e4b.
+ *
+ * @file, @function and @line identify the caller and are only used in the
+ * assertion message.  To limit the cost, the body is skipped on all but
+ * every 100th call.  Any violated invariant ends in BUG(); otherwise the
+ * function returns 0.
+ *
+ * Three things are verified:
+ *  - every pair of adjacent orders is mutually consistent and
+ *    bb_counters[order] matches the number of clear bits at that order;
+ *  - the order-0 bitmap agrees with bb_first_free, bb_fragments and with
+ *    every order from 0 to bd_blkbits for blocks that are in use;
+ *  - every preallocation on the group's bb_prealloc_list belongs to this
+ *    group and has all of its bits set in the order-0 bitmap.
+ */
+static int __mb_check_buddy(struct ext4_buddy *e4b, char *file,
+ const char *function, int line)
+{
+ struct super_block *sb = e4b->bd_sb;
+ int order = e4b->bd_blkbits + 1;
+ int max;
+ int max2;
+ int i;
+ int j;
+ int k;
+ int count;
+ struct ext4_group_info *grp;
+ int fragments = 0;
+ int fstart;
+ struct list_head *cur;
+ void *buddy;
+ void *buddy2;
+
+ /* rate limit: do the real work only once per 100 calls */
+ {
+ static int mb_check_counter;
+ if (mb_check_counter++ % 100 != 0)
+ return 0;
+ }
+
+ /* compare each order with the next lower one, highest order first */
+ while (order > 1) {
+ buddy = mb_find_buddy(e4b, order, &max);
+ MB_CHECK_ASSERT(buddy);
+ buddy2 = mb_find_buddy(e4b, order - 1, &max2);
+ MB_CHECK_ASSERT(buddy2);
+ MB_CHECK_ASSERT(buddy != buddy2);
+ MB_CHECK_ASSERT(max * 2 == max2);
+
+ count = 0;
+ for (i = 0; i < max; i++) {
+
+ if (mb_test_bit(i, buddy)) {
+ /* only single bit in buddy2 may be 1 */
+ if (!mb_test_bit(i << 1, buddy2)) {
+ MB_CHECK_ASSERT(
+ mb_test_bit((i<<1)+1, buddy2));
+ } else if (!mb_test_bit((i << 1) + 1, buddy2)) {
+ MB_CHECK_ASSERT(
+ mb_test_bit(i << 1, buddy2));
+ }
+ continue;
+ }
+
+ /* both bits in buddy2 must be 1 */
+ MB_CHECK_ASSERT(mb_test_bit(i << 1, buddy2));
+ MB_CHECK_ASSERT(mb_test_bit((i << 1) + 1, buddy2));
+
+ /* a clear bit here requires all 2^order covered bits to be clear in the block bitmap */
+ for (j = 0; j < (1 << order); j++) {
+ k = (i * (1 << order)) + j;
+ MB_CHECK_ASSERT(
+ !mb_test_bit(k, e4b->bd_bitmap));
+ }
+ count++;
+ }
+ MB_CHECK_ASSERT(e4b->bd_info->bb_counters[order] == count);
+ order--;
+ }
+
+ /* walk the order-0 bitmap; fstart tracks the start of the current run of clear bits */
+ fstart = -1;
+ buddy = mb_find_buddy(e4b, 0, &max);
+ for (i = 0; i < max; i++) {
+ if (!mb_test_bit(i, buddy)) {
+ MB_CHECK_ASSERT(i >= e4b->bd_info->bb_first_free);
+ if (fstart == -1) {
+ fragments++;
+ fstart = i;
+ }
+ continue;
+ }
+ fstart = -1;
+ /* check used bits only */
+ for (j = 0; j < e4b->bd_blkbits + 1; j++) {
+ buddy2 = mb_find_buddy(e4b, j, &max2);
+ k = i >> j;
+ MB_CHECK_ASSERT(k < max2);
+ MB_CHECK_ASSERT(mb_test_bit(k, buddy2));
+ }
+ }
+ MB_CHECK_ASSERT(!EXT4_MB_GRP_NEED_INIT(e4b->bd_info));
+ MB_CHECK_ASSERT(e4b->bd_info->bb_fragments == fragments);
+
+ /* every preallocation of this group must be marked used; buddy is still the order-0 bitmap here */
+ grp = ext4_get_group_info(sb, e4b->bd_group);
+ list_for_each(cur, &grp->bb_prealloc_list) {
+ ext4_group_t groupnr;
+ struct ext4_prealloc_space *pa;
+ pa = list_entry(cur, struct ext4_prealloc_space, pa_group_list);
+ ext4_get_group_no_and_offset(sb, pa->pa_pstart, &groupnr, &k);
+ MB_CHECK_ASSERT(groupnr == e4b->bd_group);
+ for (i = 0; i < pa->pa_len; i++)
+ MB_CHECK_ASSERT(mb_test_bit(k + i, buddy));
+ }
+ return 0;
+}
+#undef MB_CHECK_ASSERT
+/* Entry point used by the rest of the file: records the caller's location. */
+#define mb_check_buddy(e4b) __mb_check_buddy(e4b, \
+ __FILE__, __func__, __LINE__)
+#else
+#define mb_check_buddy(e4b)
+#endif
+
+/*
+ * Divide blocks started from @first with length @len into
+ * smaller chunks with power of 2 blocks.
+ * Clear the bits in bitmap which the blocks of the chunk(s) covered,
+ * then increase bb_counters[] for corresponded chunk size.
+ *
+ * @sb:    superblock, supplies the block size and the per-order offsets
+ * @buddy: start of the group's buddy data
+ * @first: first free cluster of the extent, relative to the group
+ * @len:   number of free clusters; must not exceed the clusters per group
+ * @grp:   group info whose bb_counters[] are updated
+ */
+static void ext4_mb_mark_free_simple(struct super_block *sb,
+				void *buddy, ext4_grpblk_t first, ext4_grpblk_t len,
+					struct ext4_group_info *grp)
+{
+	struct ext4_sb_info *sbi = EXT4_SB(sb);
+	ext4_grpblk_t min;
+	ext4_grpblk_t max;
+	ext4_grpblk_t chunk;
+	/*
+	 * Must be wider than 16 bits: for 64KiB blocks (s_blocksize_bits
+	 * == 16) the value below is 1 << 17.  Truncated to an unsigned short
+	 * it would become 0, and ffs(first | border) - 1 would then yield -1
+	 * for first == 0, i.e. a negative shift count and array index.
+	 */
+	unsigned int border;
+
+	BUG_ON(len > EXT4_CLUSTERS_PER_GROUP(sb));
+
+	/* caps the order of a single chunk at s_blocksize_bits + 1 */
+	border = 2 << sb->s_blocksize_bits;
+
+	while (len > 0) {
+		/* find how many blocks can be covered since this position */
+		max = ffs(first | border) - 1;
+
+		/* find how many blocks of power 2 we need to mark */
+		min = fls(len) - 1;
+
+		if (max < min)
+			min = max;
+		chunk = 1 << min;
+
+		/* every chunk is counted, but only multiblock chunks are marked */
+		grp->bb_counters[min]++;
+		if (min > 0)
+			mb_clear_bit(first >> min,
+				     buddy + sbi->s_mb_offsets[min]);
+
+		len -= chunk;
+		first += chunk;
+	}
+}
+
+/*
+ * Record in grp->bb_largest_free_order the highest order that still has a
+ * free chunk in this block group, scanning bb_counters[] downwards from
+ * order s_blocksize_bits + 1.  The field is left at -1 when no order has a
+ * non-zero counter.
+ */
+static void
+mb_set_largest_free_order(struct super_block *sb, struct ext4_group_info *grp)
+{
+	int order = sb->s_blocksize_bits + 1;
+
+	grp->bb_largest_free_order = -1; /* uninit */
+
+	while (order >= 0) {
+		if (grp->bb_counters[order] > 0) {
+			grp->bb_largest_free_order = order;
+			return;
+		}
+		order--;
+	}
+}
+
+/*
+ * Build the buddy data of @group in @buddy from @bitmap, which is the
+ * aggregation of the on-disk bitmap and the preallocations.
+ *
+ * Every run of clear bits in @bitmap counts as one fragment and is entered
+ * into the per-order counters (and, for runs longer than one cluster, into
+ * @buddy via ext4_mb_mark_free_simple()).  bb_first_free, bb_fragments and
+ * the largest free order are refreshed, the NEED_INIT state bit is cleared
+ * and the generation statistics in the superblock info are updated.
+ *
+ * If the number of free clusters found differs from grp->bb_free, an error
+ * is reported and bb_free is overwritten with the value from the bitmap.
+ */
+static noinline_for_stack
+void ext4_mb_generate_buddy(struct super_block *sb,
+				void *buddy, void *bitmap, ext4_group_t group)
+{
+	struct ext4_group_info *grp = ext4_get_group_info(sb, group);
+	const ext4_grpblk_t nr_clusters = EXT4_CLUSTERS_PER_GROUP(sb);
+	ext4_grpblk_t pos;
+	unsigned nr_free = 0;
+	unsigned nr_frags = 0;
+	unsigned long long started = get_cycles();
+	unsigned long long elapsed;
+
+	pos = mb_find_next_zero_bit(bitmap, nr_clusters, 0);
+	grp->bb_first_free = pos;
+
+	/* pos always sits on the first clear bit of the next free run */
+	while (pos < nr_clusters) {
+		ext4_grpblk_t run_start = pos;
+		ext4_grpblk_t run_len;
+
+		pos = mb_find_next_bit(bitmap, nr_clusters, run_start);
+		run_len = pos - run_start;
+
+		nr_frags++;
+		nr_free += run_len;
+		if (run_len > 1)
+			ext4_mb_mark_free_simple(sb, buddy, run_start,
+						 run_len, grp);
+		else
+			grp->bb_counters[0]++;
+
+		if (pos < nr_clusters)
+			pos = mb_find_next_zero_bit(bitmap, nr_clusters, pos);
+	}
+	grp->bb_fragments = nr_frags;
+
+	if (nr_free != grp->bb_free) {
+		ext4_grp_locked_error(sb, group, 0, 0,
+				      "%u clusters in bitmap, %u in gd",
+				      nr_free, grp->bb_free);
+		/*
+		 * If we intend to continue, we consider the group descriptor
+		 * corrupt and update bb_free using the bitmap value
+		 */
+		grp->bb_free = nr_free;
+	}
+	mb_set_largest_free_order(sb, grp);
+
+	clear_bit(EXT4_GROUP_INFO_NEED_INIT_BIT, &(grp->bb_state));
+
+	elapsed = get_cycles() - started;
+	spin_lock(&EXT4_SB(sb)->s_bal_lock);
+	EXT4_SB(sb)->s_mb_buddies_generated++;
+	EXT4_SB(sb)->s_mb_generation_time += elapsed;
+	spin_unlock(&EXT4_SB(sb)->s_bal_lock);
+}
+
+/* The buddy information is attached the buddy cache inode
+ * for convenience. The information regarding each group
+ * is loaded via ext4_mb_load_buddy. The information involve
+ * block bitmap and buddy information. The information are
+ * stored in the inode as
+ *
+ * { page }
+ * [ group 0 bitmap][ group 0 buddy] [group 1][ group 1]...
+ *
+ *
+ * one block each for bitmap and buddy information.
+ * So for each group we take up 2 blocks. A page can
+ * contain blocks_per_page (PAGE_CACHE_SIZE / blocksize) blocks.
+ * So it can have information regarding groups_per_page which
+ * is blocks_per_page/2
+ *
+ * Locking note: This routine takes the block group lock of all groups
+ * for this page; do not hold this lock when calling this routine!
+ */
+
+/*
+ * Fill one page of the buddy cache inode with bitmap and buddy blocks.
+ *
+ * @page:   buddy cache page to initialize; its index selects the groups
+ * @incore: in-core bitmap to generate the buddy from when the first block
+ *          handled on @page is a buddy block, NULL otherwise (a buddy
+ *          block with a NULL incore, or a bitmap block with a non-NULL
+ *          one, trips a BUG_ON below)
+ *
+ * Returns 0 on success, -ENOMEM if the buffer_head array or a block bitmap
+ * could not be obtained, or -EIO if waiting for a bitmap read failed.  The
+ * page is marked uptodate only on success.
+ */
+static int ext4_mb_init_cache(struct page *page, char *incore)
+{
+ ext4_group_t ngroups;
+ int blocksize;
+ int blocks_per_page;
+ int groups_per_page;
+ int err = 0;
+ int i;
+ ext4_group_t first_group, group;
+ int first_block;
+ struct super_block *sb;
+ struct buffer_head *bhs;
+ struct buffer_head **bh;
+ struct inode *inode;
+ char *data;
+ char *bitmap;
+ struct ext4_group_info *grinfo;
+
+ mb_debug(1, "init page %lu\n", page->index);
+
+ inode = page->mapping->host;
+ sb = inode->i_sb;
+ ngroups = ext4_get_groups_count(sb);
+ blocksize = 1 << inode->i_blkbits;
+ blocks_per_page = PAGE_CACHE_SIZE / blocksize;
+
+ /* two blocks per group; a page holding fewer than two blocks still serves one group */
+ groups_per_page = blocks_per_page >> 1;
+ if (groups_per_page == 0)
+ groups_per_page = 1;
+
+ /* allocate buffer_heads to read bitmaps */
+ if (groups_per_page > 1) {
+ i = sizeof(struct buffer_head *) * groups_per_page;
+ bh = kzalloc(i, GFP_NOFS);
+ if (bh == NULL) {
+ err = -ENOMEM;
+ goto out;
+ }
+ } else
+ /* single group: use the on-stack pointer instead of allocating */
+ bh = &bhs;
+
+ first_group = page->index * blocks_per_page / 2;
+
+ /* read all groups the page covers into the cache */
+ for (i = 0, group = first_group; i < groups_per_page; i++, group++) {
+ if (group >= ngroups)
+ break;
+
+ grinfo = ext4_get_group_info(sb, group);
+ /*
+ * If page is uptodate then we came here after online resize
+ * which added some new uninitialized group info structs, so
+ * we must skip all initialized uptodate buddies on the page,
+ * which may be currently in use by an allocating task.
+ */
+ if (PageUptodate(page) && !EXT4_MB_GRP_NEED_INIT(grinfo)) {
+ bh[i] = NULL;
+ continue;
+ }
+ if (!(bh[i] = ext4_read_block_bitmap_nowait(sb, group))) {
+ err = -ENOMEM;
+ goto out;
+ }
+ mb_debug(1, "read bitmap for group %u\n", group);
+ }
+
+ /* wait for I/O completion */
+ for (i = 0, group = first_group; i < groups_per_page; i++, group++) {
+ if (bh[i] && ext4_wait_block_bitmap(sb, group, bh[i])) {
+ err = -EIO;
+ goto out;
+ }
+ }
+
+ /* fill the page block by block: even blocks are bitmaps, odd blocks are buddies */
+ first_block = page->index * blocks_per_page;
+ for (i = 0; i < blocks_per_page; i++) {
+ /* NOTE: shadows the ext4_group_t 'group' declared at function scope */
+ int group;
+
+ group = (first_block + i) >> 1;
+ if (group >= ngroups)
+ break;
+
+ if (!bh[group - first_group])
+ /* skip initialized uptodate buddy */
+ continue;
+
+ /*
+ * data carry information regarding this
+ * particular group in the format specified
+ * above
+ *
+ */
+ data = page_address(page) + (i * blocksize);
+ bitmap = bh[group - first_group]->b_data;
+
+ /*
+ * We place the buddy block and bitmap block
+ * close together
+ */
+ if ((first_block + i) & 1) {
+ /* this is block of buddy */
+ BUG_ON(incore == NULL);
+ mb_debug(1, "put buddy for group %u in page %lu/%x\n",
+ group, page->index, i * blocksize);
+ trace_ext4_mb_buddy_bitmap_load(sb, group);
+ grinfo = ext4_get_group_info(sb, group);
+ grinfo->bb_fragments = 0;
+ memset(grinfo->bb_counters, 0,
+ sizeof(*grinfo->bb_counters) *
+ (sb->s_blocksize_bits+2));
+ /*
+ * incore got set to the group block bitmap below
+ * (by the bitmap branch of the previous iteration), or
+ * was handed in by the caller
+ */
+ ext4_lock_group(sb, group);
+ /* init the buddy */
+ memset(data, 0xff, blocksize);
+ ext4_mb_generate_buddy(sb, data, incore, group);
+ ext4_unlock_group(sb, group);
+ incore = NULL;
+ } else {
+ /* this is block of bitmap */
+ BUG_ON(incore != NULL);
+ mb_debug(1, "put bitmap for group %u in page %lu/%x\n",
+ group, page->index, i * blocksize);
+ trace_ext4_mb_bitmap_load(sb, group);
+
+ /* see comments in ext4_mb_put_pa() */
+ ext4_lock_group(sb, group);
+ memcpy(data, bitmap, blocksize);
+
+ /* mark all preallocated blks used in in-core bitmap */
+ ext4_mb_generate_from_pa(sb, data, group);
+ ext4_mb_generate_from_freelist(sb, data, group);
+ ext4_unlock_group(sb, group);
+
+ /* set incore so that the buddy information can be
+ * generated using this
+ */
+ incore = data;
+ }
+ }
+ SetPageUptodate(page);
+
+out:
+ /* drop every bitmap buffer reference; bh is NULL only if its allocation failed */
+ if (bh) {
+ for (i = 0; i < groups_per_page; i++)
+ brelse(bh[i]);
+ if (bh != &bhs)
+ kfree(bh);
+ }
+ return err;
+}
+
+/*
+ * Lock the buddy and bitmap pages. This makes sure a parallel init_group
+ * on the same buddy page cannot happen while we hold the buddy page lock.
+ * The locked pages are returned in the e4b struct. If buddy and bitmap
+ * are on the same page, e4b->bd_buddy_page is left NULL.
+ *
+ * Returns 0 on success or -ENOMEM if a page could not be obtained. On
+ * failure a page already stored in @e4b stays locked and referenced; the
+ * caller releases it with ext4_mb_put_buddy_page_lock().
+ */
+static int ext4_mb_get_buddy_page_lock(struct super_block *sb,
+		ext4_group_t group, struct ext4_buddy *e4b)
+{
+	struct inode *inode = EXT4_SB(sb)->s_buddy_cache;
+	int block, pnum, poff;
+	int blocks_per_page;
+	struct page *page;
+
+	e4b->bd_buddy_page = NULL;
+	e4b->bd_bitmap_page = NULL;
+
+	blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize;
+	/*
+	 * the buddy cache inode stores the block bitmap
+	 * and buddy information in consecutive blocks.
+	 * So for each group we need two blocks.
+	 */
+	block = group * 2;
+	pnum = block / blocks_per_page;
+	poff = block % blocks_per_page;
+	page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS);
+	if (!page)
+		/* the page could not be allocated: out of memory, not an I/O error */
+		return -ENOMEM;
+	BUG_ON(page->mapping != inode->i_mapping);
+	e4b->bd_bitmap_page = page;
+	e4b->bd_bitmap = page_address(page) + (poff * sb->s_blocksize);
+
+	if (blocks_per_page >= 2) {
+		/* buddy and bitmap are on the same page */
+		return 0;
+	}
+
+	/* one block per page: the buddy block lives on the following page */
+	block++;
+	pnum = block / blocks_per_page;
+	poff = block % blocks_per_page;
+	page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS);
+	if (!page)
+		return -ENOMEM;
+	BUG_ON(page->mapping != inode->i_mapping);
+	e4b->bd_buddy_page = page;
+	return 0;
+}
+
+/*
+ * Undo ext4_mb_get_buddy_page_lock(): unlock and drop the reference on the
+ * bitmap page and then on the buddy page, skipping whichever is NULL.
+ */
+static void ext4_mb_put_buddy_page_lock(struct ext4_buddy *e4b)
+{
+	struct page *held[2] = { e4b->bd_bitmap_page, e4b->bd_buddy_page };
+	int i;
+
+	for (i = 0; i < 2; i++) {
+		if (!held[i])
+			continue;
+		unlock_page(held[i]);
+		page_cache_release(held[i]);
+	}
+}
+
+/*
+ * Initialize the buddy cache pages of @group if that is still needed.
+ *
+ * Returns 0 on success (including the case where the group turned out to
+ * be initialized already), the error from taking the page locks or from
+ * ext4_mb_init_cache(), or -EIO if a page is not uptodate afterwards.
+ *
+ * Locking note: This routine calls ext4_mb_init_cache(), which takes the
+ * block group lock of all groups for this page; do not hold the BG lock when
+ * calling this routine!
+ */
+static noinline_for_stack
+int ext4_mb_init_group(struct super_block *sb, ext4_group_t group)
+{
+	struct ext4_group_info *grp_info;
+	struct ext4_buddy e4b;
+	int ret;
+
+	mb_debug(1, "init group %u\n", group);
+	grp_info = ext4_get_group_info(sb, group);
+
+	/*
+	 * Taking the page locks ensures that we don't reinit the buddy cache
+	 * page which maps to a group we are already allocating from. If we
+	 * are looking at the buddy cache we would have taken a reference
+	 * using ext4_mb_load_buddy and that would have pinned the buddy page
+	 * to the page cache.
+	 */
+	ret = ext4_mb_get_buddy_page_lock(sb, group, &e4b);
+
+	/* lock failure, or somebody initialized the group meanwhile */
+	if (ret || !EXT4_MB_GRP_NEED_INIT(grp_info))
+		goto out;
+
+	/* bitmap page first */
+	ret = ext4_mb_init_cache(e4b.bd_bitmap_page, NULL);
+	if (ret)
+		goto out;
+	if (!PageUptodate(e4b.bd_bitmap_page)) {
+		ret = -EIO;
+		goto out;
+	}
+	mark_page_accessed(e4b.bd_bitmap_page);
+
+	/*
+	 * If both the bitmap and buddy are in the same page we don't need
+	 * to force init the buddy; ret is 0 at this point.
+	 */
+	if (e4b.bd_buddy_page == NULL)
+		goto out;
+
+	/* init buddy cache */
+	ret = ext4_mb_init_cache(e4b.bd_buddy_page, e4b.bd_bitmap);
+	if (ret)
+		goto out;
+	if (PageUptodate(e4b.bd_buddy_page))
+		mark_page_accessed(e4b.bd_buddy_page);
+	else
+		ret = -EIO;
+out:
+	ext4_mb_put_buddy_page_lock(&e4b);
+	return ret;
+}
+
+/*
+ * Load the block bitmap and the buddy data of @group from the buddy cache
+ * inode and describe them in @e4b.
+ *
+ * The group is first initialised through ext4_mb_init_group() if it is still
+ * flagged NEED_INIT. The bitmap is block (group * 2) of the buddy cache
+ * inode and the buddy is the block after it. Each page is first looked up
+ * without locking; only when it is missing or not uptodate is it fetched
+ * again with find_or_create_page() and filled in by ext4_mb_init_cache().
+ *
+ * Returns 0 with bd_bitmap_page/bd_buddy_page and bd_bitmap/bd_buddy set in
+ * @e4b (callers in this file release the pages with ext4_mb_unload_buddy()).
+ * On failure returns the helper's error or -EIO, after releasing any page
+ * taken here and resetting bd_bitmap and bd_buddy to NULL.
+ *
+ * Locking note: This routine calls ext4_mb_init_cache(), which takes the
+ * block group lock of all groups for this page; do not hold the BG lock when
+ * calling this routine!
+ */
+static noinline_for_stack int
+ext4_mb_load_buddy(struct super_block *sb, ext4_group_t group,
+ struct ext4_buddy *e4b)
+{
+ int blocks_per_page;
+ int block;
+ int pnum;
+ int poff;
+ struct page *page;
+ int ret;
+ struct ext4_group_info *grp;
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
+ struct inode *inode = sbi->s_buddy_cache;
+
+ mb_debug(1, "load group %u\n", group);
+
+ blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize;
+ grp = ext4_get_group_info(sb, group);
+
+ e4b->bd_blkbits = sb->s_blocksize_bits;
+ e4b->bd_info = grp;
+ e4b->bd_sb = sb;
+ e4b->bd_group = group;
+ e4b->bd_buddy_page = NULL; /* the err path relies on these being NULL until set */
+ e4b->bd_bitmap_page = NULL;
+
+ if (unlikely(EXT4_MB_GRP_NEED_INIT(grp))) {
+ /*
+ * we need full data about the group
+ * to make a good selection
+ */
+ ret = ext4_mb_init_group(sb, group);
+ if (ret)
+ return ret;
+ }
+
+ /*
+ * the buddy cache inode stores the block bitmap
+ * and buddy information in consecutive blocks.
+ * So for each group we need two blocks.
+ */
+ block = group * 2;
+ pnum = block / blocks_per_page; /* page index within the buddy cache inode */
+ poff = block % blocks_per_page; /* block index within that page */
+
+ /* we could use find_or_create_page(), but it locks the page,
+ * which we'd like to avoid in the fast path ... */
+ page = find_get_page(inode->i_mapping, pnum);
+ if (page == NULL || !PageUptodate(page)) {
+ if (page)
+ /*
+ * drop the page reference and try
+ * to get the page with lock. If we
+ * are not uptodate that implies
+ * somebody just created the page but
+ * is yet to initialize the same. So
+ * wait for it to initialize.
+ */
+ page_cache_release(page);
+ page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS);
+ if (page) {
+ BUG_ON(page->mapping != inode->i_mapping);
+ if (!PageUptodate(page)) {
+ ret = ext4_mb_init_cache(page, NULL);
+ if (ret) {
+ unlock_page(page);
+ goto err;
+ }
+ mb_cmp_bitmaps(e4b, page_address(page) +
+ (poff * sb->s_blocksize));
+ }
+ unlock_page(page);
+ }
+ }
+ if (page == NULL || !PageUptodate(page)) {
+ ret = -EIO;
+ goto err;
+ }
+ e4b->bd_bitmap_page = page;
+ e4b->bd_bitmap = page_address(page) + (poff * sb->s_blocksize);
+ mark_page_accessed(page);
+
+ block++; /* the buddy is the block right after the bitmap */
+ pnum = block / blocks_per_page;
+ poff = block % blocks_per_page;
+
+ page = find_get_page(inode->i_mapping, pnum);
+ if (page == NULL || !PageUptodate(page)) {
+ if (page)
+ page_cache_release(page);
+ page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS);
+ if (page) {
+ BUG_ON(page->mapping != inode->i_mapping);
+ if (!PageUptodate(page)) {
+ ret = ext4_mb_init_cache(page, e4b->bd_bitmap);
+ if (ret) {
+ unlock_page(page);
+ goto err;
+ }
+ }
+ unlock_page(page);
+ }
+ }
+ if (page == NULL || !PageUptodate(page)) {
+ ret = -EIO;
+ goto err;
+ }
+ e4b->bd_buddy_page = page;
+ e4b->bd_buddy = page_address(page) + (poff * sb->s_blocksize);
+ mark_page_accessed(page);
+
+ BUG_ON(e4b->bd_bitmap_page == NULL);
+ BUG_ON(e4b->bd_buddy_page == NULL);
+
+ return 0;
+
+err:
+ if (page) /* the page being worked on when the failure happened */
+ page_cache_release(page);
+ if (e4b->bd_bitmap_page)
+ page_cache_release(e4b->bd_bitmap_page);
+ if (e4b->bd_buddy_page)
+ page_cache_release(e4b->bd_buddy_page);
+ e4b->bd_buddy = NULL;
+ e4b->bd_bitmap = NULL;
+ return ret;
+}
+
+/*
+ * Drop the page-cache references on the bitmap and buddy pages recorded in
+ * @e4b. A NULL page pointer is skipped.
+ */
+static void ext4_mb_unload_buddy(struct ext4_buddy *e4b)
+{
+	struct page *bitmap_pg = e4b->bd_bitmap_page;
+	struct page *buddy_pg = e4b->bd_buddy_page;
+
+	if (bitmap_pg != NULL)
+		page_cache_release(bitmap_pg);
+	if (buddy_pg != NULL)
+		page_cache_release(buddy_pg);
+}
+
+
+/*
+ * Walk the buddy levels upwards from order 1 and return the first order
+ * whose bit covering @block is clear. Returns 0 when no level up to order
+ * (bd_blkbits + 1) has a clear bit for it.
+ */
+static int mb_find_order_for_block(struct ext4_buddy *e4b, int block)
+{
+	void *level;
+	int ord;
+
+	BUG_ON(e4b->bd_bitmap == e4b->bd_buddy);
+	BUG_ON(block >= (1 << (e4b->bd_blkbits + 3)));
+
+	level = e4b->bd_buddy;
+	for (ord = 1; ord <= e4b->bd_blkbits + 1; ord++) {
+		/* index of the chunk containing the block at this order */
+		block >>= 1;
+		if (!mb_test_bit(block, level))
+			return ord;	/* block is part of a buddy of this order */
+		/* step over this level's bits to reach the next order */
+		level += 1 << (e4b->bd_blkbits - ord);
+	}
+	return 0;
+}
+
+/*
+ * Clear @len bits of bitmap @bm starting at bit @cur. Runs that start on a
+ * 32-bit boundary and span at least 32 bits are cleared a whole word at a
+ * time; everything else goes through mb_clear_bit().
+ */
+static void mb_clear_bits(void *bm, int cur, int len)
+{
+	const int end = cur + len;
+
+	while (cur < end) {
+		if ((cur & 31) != 0 || (end - cur) < 32) {
+			/* unaligned head or short tail: one bit at a time */
+			mb_clear_bit(cur, bm);
+			cur++;
+		} else {
+			__u32 *word = bm + (cur >> 3);
+
+			*word = 0;
+			cur += 32;
+		}
+	}
+}
+
+/*
+ * Set @len bits of bitmap @bm starting at bit @cur. Runs that start on a
+ * 32-bit boundary and span at least 32 bits are set a whole word at a time;
+ * everything else goes through mb_set_bit().
+ */
+void ext4_set_bits(void *bm, int cur, int len)
+{
+	const int end = cur + len;
+
+	while (cur < end) {
+		if ((cur & 31) != 0 || (end - cur) < 32) {
+			/* unaligned head or short tail: one bit at a time */
+			mb_set_bit(cur, bm);
+			cur++;
+		} else {
+			__u32 *word = bm + (cur >> 3);
+
+			*word = 0xffffffff;
+			cur += 32;
+		}
+	}
+}
+
+static void mb_free_blocks(struct inode *inode, struct ext4_buddy *e4b,
+ int first, int count)
+{ /*
+ * Give @count bits starting at bit @first back to the group described by
+ * @e4b: clear them in the block bitmap, merge free pairs upwards through
+ * the buddy orders, and update bb_free, bb_first_free, bb_fragments and
+ * bb_counters[]. The group lock must be held (asserted below). @inode is
+ * only used for the double-check helper and for error reporting, and
+ * may be NULL.
+ */
+ int block = 0;
+ int max = 0;
+ int order;
+ void *buddy;
+ void *buddy2;
+ struct super_block *sb = e4b->bd_sb;
+
+ BUG_ON(first + count > (sb->s_blocksize << 3));
+ assert_spin_locked(ext4_group_lock_ptr(sb, e4b->bd_group));
+ mb_check_buddy(e4b);
+ mb_free_blocks_double(inode, e4b, first, count);
+
+ e4b->bd_info->bb_free += count;
+ if (first < e4b->bd_info->bb_first_free)
+ e4b->bd_info->bb_first_free = first;
+
+ /* let's maintain fragments counter: 'block' and 'max' are used here as
+ * flags telling whether the bit just before / just after the range is
+ * clear. Free on both sides merges two fragments into one; free on
+ * neither side creates a new fragment. */
+ if (first != 0)
+ block = !mb_test_bit(first - 1, e4b->bd_bitmap);
+ if (first + count < EXT4_SB(sb)->s_mb_maxs[0])
+ max = !mb_test_bit(first + count, e4b->bd_bitmap);
+ if (block && max)
+ e4b->bd_info->bb_fragments--;
+ else if (!block && !max)
+ e4b->bd_info->bb_fragments++;
+
+ /* let's maintain buddy itself */
+ while (count-- > 0) {
+ block = first++;
+ order = 0;
+
+ if (!mb_test_bit(block, e4b->bd_bitmap)) { /* bit already clear */
+ ext4_fsblk_t blocknr;
+
+ blocknr = ext4_group_first_block_no(sb, e4b->bd_group);
+ blocknr += EXT4_C2B(EXT4_SB(sb), block);
+ ext4_grp_locked_error(sb, e4b->bd_group,
+ inode ? inode->i_ino : 0,
+ blocknr,
+ "freeing already freed block "
+ "(bit %u)", block);
+ }
+ mb_clear_bit(block, e4b->bd_bitmap);
+ e4b->bd_info->bb_counters[order]++;
+
+ /* start of the buddy */
+ buddy = mb_find_buddy(e4b, order, &max);
+
+ do {
+ block &= ~1UL; /* even member of the pair */
+ if (mb_test_bit(block, buddy) ||
+ mb_test_bit(block + 1, buddy))
+ break;
+
+ /* both the buddies are free, try to coalesce them */
+ buddy2 = mb_find_buddy(e4b, order + 1, &max);
+
+ if (!buddy2) /* no higher order to merge into */
+ break;
+
+ if (order > 0) {
+ /* for special purposes, we don't set
+ * free bits in bitmap */
+ mb_set_bit(block, buddy);
+ mb_set_bit(block + 1, buddy);
+ }
+ e4b->bd_info->bb_counters[order]--;
+ e4b->bd_info->bb_counters[order]--;
+
+ block = block >> 1; /* index of the merged chunk one order up */
+ order++;
+ e4b->bd_info->bb_counters[order]++;
+
+ mb_clear_bit(block, buddy2);
+ buddy = buddy2;
+ } while (1);
+ }
+ mb_set_largest_free_order(sb, e4b->bd_info);
+ mb_check_buddy(e4b);
+}
+
+static int mb_find_extent(struct ext4_buddy *e4b, int order, int block,
+ int needed, struct ext4_free_extent *ex)
+{ /*
+ * Describe in @ex the free extent that starts at bit @block of the buddy
+ * level @order. If that bit is set, @ex is zeroed and 0 is returned.
+ * Otherwise the extent runs from @block to the end of the buddy chunk
+ * containing it and, while it is shorter than @needed, is extended by
+ * the free chunks that directly follow. Returns ex->fe_len. The group
+ * lock must be held (asserted below).
+ */
+ int next = block;
+ int max;
+ void *buddy;
+
+ assert_spin_locked(ext4_group_lock_ptr(e4b->bd_sb, e4b->bd_group));
+ BUG_ON(ex == NULL);
+
+ buddy = mb_find_buddy(e4b, order, &max);
+ BUG_ON(buddy == NULL);
+ BUG_ON(block >= max);
+ if (mb_test_bit(block, buddy)) {
+ ex->fe_len = 0;
+ ex->fe_start = 0;
+ ex->fe_group = 0;
+ return 0;
+ }
+
+ /* FIXME drop order completely ? */
+ if (likely(order == 0)) {
+ /* find actual order */
+ order = mb_find_order_for_block(e4b, block);
+ block = block >> order;
+ }
+
+ ex->fe_len = 1 << order;
+ ex->fe_start = block << order;
+ ex->fe_group = e4b->bd_group;
+
+ /* calc difference from given start */
+ next = next - ex->fe_start;
+ ex->fe_len -= next;
+ ex->fe_start += next;
+
+ while (needed > ex->fe_len &&
+ (buddy = mb_find_buddy(e4b, order, &max))) {
+
+ if (block + 1 >= max) /* no following chunk at this order */
+ break;
+
+ next = (block + 1) * (1 << order); /* first bit after the current chunk */
+ if (mb_test_bit(next, e4b->bd_bitmap))
+ break;
+
+ order = mb_find_order_for_block(e4b, next);
+
+ block = next >> order;
+ ex->fe_len += 1 << order;
+ }
+
+ BUG_ON(ex->fe_start + ex->fe_len > (1 << (e4b->bd_blkbits + 3)));
+ return ex->fe_len;
+}
+
+static int mb_mark_used(struct ext4_buddy *e4b, struct ext4_free_extent *ex)
+{ /*
+ * Mark the extent @ex as in use: update the buddy levels (splitting
+ * larger chunks where the extent does not cover them entirely), set the
+ * bits in the block bitmap, and adjust bb_free, bb_first_free,
+ * bb_fragments and bb_counters[]. The group lock must be held (asserted
+ * below).
+ *
+ * Returns 0 when no chunk had to be split. Otherwise the value describes
+ * the first split: the length still to be marked at that point ORed
+ * with (order of the split chunk << 16).
+ */
+ int ord;
+ int mlen = 0;
+ int max = 0;
+ int cur;
+ int start = ex->fe_start;
+ int len = ex->fe_len;
+ unsigned ret = 0;
+ int len0 = len;
+ void *buddy;
+
+ BUG_ON(start + len > (e4b->bd_sb->s_blocksize << 3));
+ BUG_ON(e4b->bd_group != ex->fe_group);
+ assert_spin_locked(ext4_group_lock_ptr(e4b->bd_sb, e4b->bd_group));
+ mb_check_buddy(e4b);
+ mb_mark_used_double(e4b, start, len);
+
+ e4b->bd_info->bb_free -= len;
+ if (e4b->bd_info->bb_first_free == start)
+ e4b->bd_info->bb_first_free += len;
+
+ /* let's maintain fragments counter: 'mlen' and 'max' are used here as
+ * flags telling whether the bit just before / just after the extent is
+ * clear. Free on both sides splits one fragment in two; free on
+ * neither side removes a whole fragment. */
+ if (start != 0)
+ mlen = !mb_test_bit(start - 1, e4b->bd_bitmap);
+ if (start + len < EXT4_SB(e4b->bd_sb)->s_mb_maxs[0])
+ max = !mb_test_bit(start + len, e4b->bd_bitmap);
+ if (mlen && max)
+ e4b->bd_info->bb_fragments++;
+ else if (!mlen && !max)
+ e4b->bd_info->bb_fragments--;
+
+ /* let's maintain buddy itself */
+ while (len) {
+ ord = mb_find_order_for_block(e4b, start);
+
+ if (((start >> ord) << ord) == start && len >= (1 << ord)) {
+ /* the whole chunk may be allocated at once! */
+ mlen = 1 << ord;
+ buddy = mb_find_buddy(e4b, ord, &max);
+ BUG_ON((start >> ord) >= max);
+ mb_set_bit(start >> ord, buddy);
+ e4b->bd_info->bb_counters[ord]--;
+ start += mlen;
+ len -= mlen;
+ BUG_ON(len < 0);
+ continue;
+ }
+
+ /* store for history */
+ if (ret == 0)
+ ret = len | (ord << 16);
+
+ /* we have to split large buddy */
+ BUG_ON(ord <= 0);
+ buddy = mb_find_buddy(e4b, ord, &max);
+ mb_set_bit(start >> ord, buddy);
+ e4b->bd_info->bb_counters[ord]--;
+
+ ord--; /* the two halves live one order down and both become free */
+ cur = (start >> ord) & ~1U;
+ buddy = mb_find_buddy(e4b, ord, &max);
+ mb_clear_bit(cur, buddy);
+ mb_clear_bit(cur + 1, buddy);
+ e4b->bd_info->bb_counters[ord]++;
+ e4b->bd_info->bb_counters[ord]++;
+ }
+ mb_set_largest_free_order(e4b->bd_sb, e4b->bd_info);
+
+ ext4_set_bits(e4b->bd_bitmap, ex->fe_start, len0);
+ mb_check_buddy(e4b);
+
+ return ret;
+}
+
+/*
+ * Commit to the best extent recorded in ac->ac_b_ex: trim it to the goal
+ * length, mark it used in the buddy via mb_mark_used(), record the result in
+ * the allocation context (ac_f_ex, ac_status = AC_STATUS_FOUND, ac_tail,
+ * ac_buddy) and take an extra reference on the bitmap and buddy pages.
+ * For stream allocations the group/start is also remembered in the sb info.
+ *
+ * Must be called under group lock!
+ */
+static void ext4_mb_use_best_found(struct ext4_allocation_context *ac,
+ struct ext4_buddy *e4b)
+{
+ struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
+ int ret;
+
+ BUG_ON(ac->ac_b_ex.fe_group != e4b->bd_group);
+ BUG_ON(ac->ac_status == AC_STATUS_FOUND);
+
+ ac->ac_b_ex.fe_len = min(ac->ac_b_ex.fe_len, ac->ac_g_ex.fe_len);
+ ac->ac_b_ex.fe_logical = ac->ac_g_ex.fe_logical;
+ ret = mb_mark_used(e4b, &ac->ac_b_ex);
+
+ /* preallocation can change ac_b_ex, thus we store actually
+ * allocated blocks for history */
+ ac->ac_f_ex = ac->ac_b_ex;
+
+ ac->ac_status = AC_STATUS_FOUND;
+ ac->ac_tail = ret & 0xffff; /* low half of mb_mark_used()'s result */
+ ac->ac_buddy = ret >> 16; /* high half of mb_mark_used()'s result */
+
+ /*
+ * take the page reference. We want the page to be pinned
+ * so that we don't get a ext4_mb_init_cache_call for this
+ * group until we update the bitmap. That would mean we
+ * double allocate blocks. The reference is dropped
+ * in ext4_mb_release_context
+ */
+ ac->ac_bitmap_page = e4b->bd_bitmap_page;
+ get_page(ac->ac_bitmap_page);
+ ac->ac_buddy_page = e4b->bd_buddy_page;
+ get_page(ac->ac_buddy_page);
+ /* store last allocated for subsequent stream allocation */
+ if (ac->ac_flags & EXT4_MB_STREAM_ALLOC) {
+ spin_lock(&sbi->s_md_lock);
+ sbi->s_mb_last_group = ac->ac_f_ex.fe_group;
+ sbi->s_mb_last_start = ac->ac_f_ex.fe_start;
+ spin_unlock(&sbi->s_md_lock);
+ }
+}
+
+/*
+ * Regular allocator, for general-purpose allocation
+ */
+
+/*
+ * Decide whether scanning should stop. Sets AC_STATUS_BREAK once more than
+ * s_mb_max_to_scan extents were examined (unless EXT4_MB_HINT_FIRST is set).
+ * Otherwise, if the best extent already satisfies the goal length and either
+ * @finish_group is set or more than s_mb_min_to_scan extents were examined,
+ * the best extent is re-checked in this group and taken if still available.
+ */
+static void ext4_mb_check_limits(struct ext4_allocation_context *ac,
+					struct ext4_buddy *e4b,
+					int finish_group)
+{
+	struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
+	struct ext4_free_extent *best = &ac->ac_b_ex;
+	struct ext4_free_extent *goal = &ac->ac_g_ex;
+	struct ext4_free_extent found;
+	int got;
+
+	if (ac->ac_status == AC_STATUS_FOUND)
+		return;
+
+	/* We don't want to scan for a whole year */
+	if (ac->ac_found > sbi->s_mb_max_to_scan &&
+	    !(ac->ac_flags & EXT4_MB_HINT_FIRST)) {
+		ac->ac_status = AC_STATUS_BREAK;
+		return;
+	}
+
+	/* Haven't found a good chunk so far, let's continue */
+	if (best->fe_len < goal->fe_len)
+		return;
+
+	/* not yet time to settle for the best candidate */
+	if (!finish_group && ac->ac_found <= sbi->s_mb_min_to_scan)
+		return;
+
+	/* the candidate lives in another group; cannot re-check it here */
+	if (best->fe_group != e4b->bd_group)
+		return;
+
+	/*
+	 * recheck chunk's availability - we don't know when it was found
+	 * (within this lock-unlock period or not)
+	 */
+	got = mb_find_extent(e4b, 0, best->fe_start, goal->fe_len, &found);
+	if (got >= goal->fe_len)
+		ext4_mb_use_best_found(ac, e4b);
+}
+
+/*
+ * Judge a newly found free extent @ex against the allocation context.
+ *
+ * If the extent is good enough (EXT4_MB_HINT_FIRST is set, or its length
+ * equals the goal length) it is marked used right away, which also flags
+ * the context so that scanning stops. Otherwise it is compared with the
+ * best extent remembered so far and replaces it when it is better; the
+ * remembered extent is what gets used later if nothing good enough turns up.
+ *
+ * FIXME: real allocation policy is to be designed yet!
+ */
+static void ext4_mb_measure_extent(struct ext4_allocation_context *ac,
+					struct ext4_free_extent *ex,
+					struct ext4_buddy *e4b)
+{
+	struct ext4_free_extent *best = &ac->ac_b_ex;
+	struct ext4_free_extent *goal = &ac->ac_g_ex;
+
+	BUG_ON(ex->fe_len <= 0);
+	BUG_ON(ex->fe_len > EXT4_CLUSTERS_PER_GROUP(ac->ac_sb));
+	BUG_ON(ex->fe_start >= EXT4_CLUSTERS_PER_GROUP(ac->ac_sb));
+	BUG_ON(ac->ac_status != AC_STATUS_CONTINUE);
+
+	ac->ac_found++;
+
+	/*
+	 * Take it immediately in the special "take what you catch first"
+	 * case, or when the chunk is exactly the requested size.
+	 */
+	if (unlikely(ac->ac_flags & EXT4_MB_HINT_FIRST) ||
+	    ex->fe_len == goal->fe_len) {
+		*best = *ex;
+		ext4_mb_use_best_found(ac, e4b);
+		return;
+	}
+
+	/* The first extent found is simply remembered */
+	if (best->fe_len == 0) {
+		*best = *ex;
+		return;
+	}
+
+	if (best->fe_len < goal->fe_len) {
+		/*
+		 * the request isn't satisfied yet: any extent larger than
+		 * the previous best one is better
+		 */
+		if (ex->fe_len > best->fe_len)
+			*best = *ex;
+	} else if (ex->fe_len > goal->fe_len && ex->fe_len < best->fe_len) {
+		/*
+		 * the request is satisfied: prefer an extent that still
+		 * satisfies it but is smaller than the previous one
+		 */
+		*best = *ex;
+	}
+
+	ext4_mb_check_limits(ac, e4b, 0);
+}
+
+/*
+ * Try to allocate the best extent remembered in ac->ac_b_ex: load its
+ * group's buddy, re-check under the group lock how much of the extent is
+ * still free, and use whatever is left (if anything). Returns the error
+ * from ext4_mb_load_buddy(), otherwise 0; success of the allocation itself
+ * is visible through ac->ac_status.
+ */
+static noinline_for_stack
+int ext4_mb_try_best_found(struct ext4_allocation_context *ac,
+					struct ext4_buddy *e4b)
+{
+	struct super_block *sb = ac->ac_sb;
+	struct ext4_free_extent ex = ac->ac_b_ex;
+	ext4_group_t group = ex.fe_group;
+	int still_free;
+	int err;
+
+	BUG_ON(ex.fe_len <= 0);
+
+	err = ext4_mb_load_buddy(sb, group, e4b);
+	if (err)
+		return err;
+
+	ext4_lock_group(sb, group);
+
+	still_free = mb_find_extent(e4b, 0, ex.fe_start, ex.fe_len, &ex);
+	if (still_free > 0) {
+		ac->ac_b_ex = ex;
+		ext4_mb_use_best_found(ac, e4b);
+	}
+
+	ext4_unlock_group(sb, group);
+	ext4_mb_unload_buddy(e4b);
+
+	return 0;
+}
+
+/*
+ * Try to allocate exactly at the goal (ac->ac_g_ex). Does nothing unless
+ * EXT4_MB_HINT_TRY_GOAL is set. Returns the error from ext4_mb_load_buddy(),
+ * otherwise 0; whether the goal was taken is visible through ac->ac_status.
+ */
+static noinline_for_stack
+int ext4_mb_find_by_goal(struct ext4_allocation_context *ac,
+				struct ext4_buddy *e4b)
+{
+	struct super_block *sb = ac->ac_sb;
+	struct ext4_sb_info *sbi = EXT4_SB(sb);
+	ext4_group_t group = ac->ac_g_ex.fe_group;
+	struct ext4_free_extent ex;
+	int take = 0;
+	int max;
+	int err;
+
+	if (!(ac->ac_flags & EXT4_MB_HINT_TRY_GOAL))
+		return 0;
+
+	err = ext4_mb_load_buddy(sb, group, e4b);
+	if (err)
+		return err;
+
+	ext4_lock_group(sb, group);
+	max = mb_find_extent(e4b, 0, ac->ac_g_ex.fe_start,
+			     ac->ac_g_ex.fe_len, &ex);
+
+	if (max >= ac->ac_g_ex.fe_len && ac->ac_g_ex.fe_len == sbi->s_stripe) {
+		/* stripe-sized request: only take it if stripe-aligned */
+		ext4_fsblk_t start;
+
+		start = ext4_group_first_block_no(sb, e4b->bd_group) +
+			ex.fe_start;
+		/* use do_div to get remainder (would be 64-bit modulo) */
+		if (do_div(start, sbi->s_stripe) == 0)
+			take = 1;
+	} else if (max >= ac->ac_g_ex.fe_len ||
+		   (max > 0 && (ac->ac_flags & EXT4_MB_HINT_MERGE))) {
+		/*
+		 * Either the whole goal is free, or the caller asked to
+		 * merge even a small number of blocks into an existing
+		 * extent.
+		 */
+		BUG_ON(ex.fe_len <= 0);
+		BUG_ON(ex.fe_group != ac->ac_g_ex.fe_group);
+		BUG_ON(ex.fe_start != ac->ac_g_ex.fe_start);
+		take = 1;
+	}
+
+	if (take) {
+		ac->ac_found++;
+		ac->ac_b_ex = ex;
+		ext4_mb_use_best_found(ac, e4b);
+	}
+
+	ext4_unlock_group(sb, group);
+	ext4_mb_unload_buddy(e4b);
+
+	return 0;
+}
+
+/*
+ * Scan the buddy structures (not the bitmap!) from order ac->ac_2order up
+ * to the maximum order, and allocate from the first order that has a free
+ * chunk. Does nothing if no such order exists.
+ */
+static noinline_for_stack
+void ext4_mb_simple_scan_group(struct ext4_allocation_context *ac,
+					struct ext4_buddy *e4b)
+{
+	struct super_block *sb = ac->ac_sb;
+	struct ext4_group_info *grp = e4b->bd_info;
+	void *buddy;
+	int order;
+	int bit;
+	int max;
+
+	BUG_ON(ac->ac_2order <= 0);
+
+	/* lowest order, starting at the requested one, with a free chunk */
+	order = ac->ac_2order;
+	while (order <= sb->s_blocksize_bits + 1 &&
+	       grp->bb_counters[order] == 0)
+		order++;
+	if (order > sb->s_blocksize_bits + 1)
+		return;
+
+	buddy = mb_find_buddy(e4b, order, &max);
+	BUG_ON(buddy == NULL);
+
+	bit = mb_find_next_zero_bit(buddy, max, 0);
+	BUG_ON(bit >= max);
+
+	ac->ac_found++;
+
+	ac->ac_b_ex.fe_len = 1 << order;
+	ac->ac_b_ex.fe_start = bit << order;
+	ac->ac_b_ex.fe_group = e4b->bd_group;
+
+	ext4_mb_use_best_found(ac, e4b);
+
+	BUG_ON(ac->ac_b_ex.fe_len != ac->ac_g_ex.fe_len);
+
+	if (EXT4_SB(sb)->s_mb_stats)
+		atomic_inc(&EXT4_SB(sb)->s_bal_2orders);
+}
+
+/*
+ * The routine scans the group and measures all found extents.
+ * The number of free clusters recorded in the group info (bb_free) is
+ * used as the upper limit: scanning stops once that many free clusters
+ * have been seen, or as soon as ac_status leaves AC_STATUS_CONTINUE.
+ * A mismatch between bb_free and what the bitmap yields is reported
+ * through ext4_grp_locked_error() and ends the scan.
+ */
+static noinline_for_stack
+void ext4_mb_complex_scan_group(struct ext4_allocation_context *ac,
+ struct ext4_buddy *e4b)
+{
+ struct super_block *sb = ac->ac_sb;
+ void *bitmap = e4b->bd_bitmap;
+ struct ext4_free_extent ex;
+ int i;
+ int free;
+
+ free = e4b->bd_info->bb_free;
+ BUG_ON(free <= 0);
+
+ i = e4b->bd_info->bb_first_free; /* start scanning at bb_first_free */
+
+ while (free && ac->ac_status == AC_STATUS_CONTINUE) {
+ i = mb_find_next_zero_bit(bitmap,
+ EXT4_CLUSTERS_PER_GROUP(sb), i);
+ if (i >= EXT4_CLUSTERS_PER_GROUP(sb)) {
+ /*
+ * If we have a corrupt bitmap, we won't find any
+ * free blocks even though group info says we
+ * have free blocks
+ */
+ ext4_grp_locked_error(sb, e4b->bd_group, 0, 0,
+ "%d free clusters as per "
+ "group info. But bitmap says 0",
+ free);
+ break;
+ }
+
+ mb_find_extent(e4b, 0, i, ac->ac_g_ex.fe_len, &ex);
+ BUG_ON(ex.fe_len <= 0);
+ if (free < ex.fe_len) {
+ ext4_grp_locked_error(sb, e4b->bd_group, 0, 0,
+ "%d free clusters as per "
+ "group info. But got %d blocks",
+ free, ex.fe_len);
+ /*
+ * The number of free blocks differs. This mostly
+ * indicates that the bitmap is corrupt. So exit
+ * without claiming the space.
+ */
+ break;
+ }
+
+ ext4_mb_measure_extent(ac, &ex, e4b);
+
+ i += ex.fe_len; /* continue after the extent just measured */
+ free -= ex.fe_len;
+ }
+
+ ext4_mb_check_limits(ac, e4b, 1);
+}
+
+/*
+ * Special case for storage like raid5: for requests that are a multiple of
+ * the stripe size, look only at stripe-aligned positions in the group and
+ * take the first one that has at least a full stripe free.
+ */
+static noinline_for_stack
+void ext4_mb_scan_aligned(struct ext4_allocation_context *ac,
+				 struct ext4_buddy *e4b)
+{
+	struct super_block *sb = ac->ac_sb;
+	struct ext4_sb_info *sbi = EXT4_SB(sb);
+	void *bitmap = e4b->bd_bitmap;
+	struct ext4_free_extent ex;
+	ext4_fsblk_t group_start;
+	ext4_fsblk_t stripes;
+	ext4_grpblk_t i;
+	int max;
+
+	BUG_ON(sbi->s_stripe == 0);
+
+	/* find first stripe-aligned block in group */
+	group_start = ext4_group_first_block_no(sb, e4b->bd_group);
+
+	/* round the group's first block up to a whole number of stripes */
+	stripes = group_start + sbi->s_stripe - 1;
+	do_div(stripes, sbi->s_stripe);
+
+	for (i = (stripes * sbi->s_stripe) - group_start;
+	     i < EXT4_CLUSTERS_PER_GROUP(sb);
+	     i += sbi->s_stripe) {
+		if (mb_test_bit(i, bitmap))
+			continue;
+
+		max = mb_find_extent(e4b, 0, i, sbi->s_stripe, &ex);
+		if (max < sbi->s_stripe)
+			continue;
+
+		ac->ac_found++;
+		ac->ac_b_ex = ex;
+		ext4_mb_use_best_found(ac, e4b);
+		break;
+	}
+}
+
+/*
+ * Decide from the in-memory group info alone whether @group is worth
+ * scanning under criterion @cr (0..3). Returns 1 if it is, 0 otherwise.
+ * A group that has never been initialised is initialised first; if that
+ * fails the group is rejected.
+ *
+ * This is now called BEFORE we load the buddy bitmap.
+ */
+static int ext4_mb_good_group(struct ext4_allocation_context *ac,
+				ext4_group_t group, int cr)
+{
+	struct ext4_group_info *grp = ext4_get_group_info(ac->ac_sb, group);
+	int flex_size = ext4_flex_bg_size(EXT4_SB(ac->ac_sb));
+	unsigned free, fragments;
+
+	BUG_ON(cr < 0 || cr >= 4);
+
+	/* We only do this if the grp has never been initialized */
+	if (unlikely(EXT4_MB_GRP_NEED_INIT(grp))) {
+		if (ext4_mb_init_group(ac->ac_sb, group) != 0)
+			return 0;
+	}
+
+	free = grp->bb_free;
+	fragments = grp->bb_fragments;
+	if (free == 0 || fragments == 0)
+		return 0;
+
+	switch (cr) {
+	case 0:
+		BUG_ON(ac->ac_2order == 0);
+
+		/* no free chunk of the requested order */
+		if (grp->bb_largest_free_order < ac->ac_2order)
+			return 0;
+
+		/* Avoid using the first bg of a flexgroup for data files */
+		if ((ac->ac_flags & EXT4_MB_HINT_DATA) &&
+		    (flex_size >= EXT4_FLEX_SIZE_DIR_ALLOC_SCHEME) &&
+		    ((group % flex_size) == 0))
+			return 0;
+
+		return 1;
+	case 1:
+		/* average fragment must be at least as long as the goal */
+		return (free / fragments) >= ac->ac_g_ex.fe_len;
+	case 2:
+		/* total free space must cover the goal */
+		return free >= ac->ac_g_ex.fe_len;
+	case 3:
+		return 1;
+	default:
+		BUG();
+	}
+
+	return 0;
+}
+
+static noinline_for_stack int
+ext4_mb_regular_allocator(struct ext4_allocation_context *ac)
+{ /*
+ * Search the block groups for an extent satisfying ac->ac_g_ex.
+ *
+ * The goal itself is tried first through ext4_mb_find_by_goal(). After
+ * that the groups are walked, starting at the goal group and wrapping
+ * around, once per criterion 'cr' from the starting criterion up to 3;
+ * each candidate group is filtered by ext4_mb_good_group() and scanned
+ * with the scanner matching the criterion. If scanning ended without a
+ * FOUND status but a best extent was remembered, that extent is tried;
+ * if it is gone, the search is repeated at cr 3 with
+ * EXT4_MB_HINT_FIRST set.
+ *
+ * Returns 0, or the error from ext4_mb_find_by_goal() or
+ * ext4_mb_load_buddy(). The outcome of the search itself is reported
+ * through ac->ac_status and ac->ac_b_ex.
+ */
+ ext4_group_t ngroups, group, i;
+ int cr;
+ int err = 0;
+ struct ext4_sb_info *sbi;
+ struct super_block *sb;
+ struct ext4_buddy e4b;
+
+ sb = ac->ac_sb;
+ sbi = EXT4_SB(sb);
+ ngroups = ext4_get_groups_count(sb);
+ /* non-extent files are limited to low blocks/groups */
+ if (!(ext4_test_inode_flag(ac->ac_inode, EXT4_INODE_EXTENTS)))
+ ngroups = sbi->s_blockfile_groups;
+
+ BUG_ON(ac->ac_status == AC_STATUS_FOUND);
+
+ /* first, try the goal */
+ err = ext4_mb_find_by_goal(ac, &e4b);
+ if (err || ac->ac_status == AC_STATUS_FOUND)
+ goto out;
+
+ if (unlikely(ac->ac_flags & EXT4_MB_HINT_GOAL_ONLY))
+ goto out;
+
+ /*
+ * ac->ac_2order is set only if the fe_len is a power of 2;
+ * if ac_2order is set we also set criteria to 0 so that we
+ * try exact allocation using buddy.
+ */
+ i = fls(ac->ac_g_ex.fe_len);
+ ac->ac_2order = 0;
+ /*
+ * We search using buddy data only if the order of the request
+ * is greater than or equal to sbi->s_mb_order2_reqs.
+ * You can tune it via /sys/fs/ext4/<partition>/mb_order2_req
+ */
+ if (i >= sbi->s_mb_order2_reqs) {
+ /*
+ * This should tell if fe_len is exactly power of 2
+ */
+ if ((ac->ac_g_ex.fe_len & (~(1 << (i - 1)))) == 0)
+ ac->ac_2order = i - 1;
+ }
+
+ /* if stream allocation is enabled, use global goal */
+ if (ac->ac_flags & EXT4_MB_STREAM_ALLOC) {
+ /* TBD: may be hot point */
+ spin_lock(&sbi->s_md_lock);
+ ac->ac_g_ex.fe_group = sbi->s_mb_last_group;
+ ac->ac_g_ex.fe_start = sbi->s_mb_last_start;
+ spin_unlock(&sbi->s_md_lock);
+ }
+
+ /* Let's just scan groups to find more-less suitable blocks */
+ cr = ac->ac_2order ? 0 : 1;
+ /*
+ * cr == 0 try to get exact allocation,
+ * cr == 3 try to get anything
+ */
+repeat:
+ for (; cr < 4 && ac->ac_status == AC_STATUS_CONTINUE; cr++) {
+ ac->ac_criteria = cr;
+ /*
+ * searching for the right group start
+ * from the goal value specified
+ */
+ group = ac->ac_g_ex.fe_group;
+
+ for (i = 0; i < ngroups; group++, i++) {
+ if (group == ngroups) /* wrap around to the first group */
+ group = 0;
+
+ /* This now checks without needing the buddy page */
+ if (!ext4_mb_good_group(ac, group, cr))
+ continue;
+
+ err = ext4_mb_load_buddy(sb, group, &e4b);
+ if (err)
+ goto out;
+
+ ext4_lock_group(sb, group);
+
+ /*
+ * We need to check again after locking the
+ * block group
+ */
+ if (!ext4_mb_good_group(ac, group, cr)) {
+ ext4_unlock_group(sb, group);
+ ext4_mb_unload_buddy(&e4b);
+ continue;
+ }
+
+ ac->ac_groups_scanned++;
+ if (cr == 0)
+ ext4_mb_simple_scan_group(ac, &e4b);
+ else if (cr == 1 && sbi->s_stripe &&
+ !(ac->ac_g_ex.fe_len % sbi->s_stripe))
+ ext4_mb_scan_aligned(ac, &e4b);
+ else
+ ext4_mb_complex_scan_group(ac, &e4b);
+
+ ext4_unlock_group(sb, group);
+ ext4_mb_unload_buddy(&e4b);
+
+ if (ac->ac_status != AC_STATUS_CONTINUE)
+ break;
+ }
+ }
+
+ if (ac->ac_b_ex.fe_len > 0 && ac->ac_status != AC_STATUS_FOUND &&
+ !(ac->ac_flags & EXT4_MB_HINT_FIRST)) {
+ /*
+ * We've been searching too long. Let's try to allocate
+ * the best chunk we've found so far
+ */
+
+ ext4_mb_try_best_found(ac, &e4b);
+ if (ac->ac_status != AC_STATUS_FOUND) {
+ /*
+ * Someone more lucky has already allocated it.
+ * The only thing we can do is just take first
+ * found block(s)
+ printk(KERN_DEBUG "EXT4-fs: someone won our chunk\n");
+ */
+ ac->ac_b_ex.fe_group = 0;
+ ac->ac_b_ex.fe_start = 0;
+ ac->ac_b_ex.fe_len = 0;
+ ac->ac_status = AC_STATUS_CONTINUE;
+ ac->ac_flags |= EXT4_MB_HINT_FIRST;
+ cr = 3;
+ atomic_inc(&sbi->s_mb_lost_chunks);
+ goto repeat;
+ }
+ }
+out:
+ return err;
+}
+
+/*
+ * seq_file ->start handler. The iterator cookie is the group number plus
+ * one (so that group 0 is not mistaken for NULL); NULL is returned when
+ * *pos is outside [0, number of groups).
+ */
+static void *ext4_mb_seq_groups_start(struct seq_file *seq, loff_t *pos)
+{
+	struct super_block *sb = seq->private;
+	loff_t idx = *pos;
+	ext4_group_t cookie;
+
+	if (idx < 0 || idx >= ext4_get_groups_count(sb))
+		return NULL;
+
+	cookie = idx + 1;
+	return (void *) ((unsigned long) cookie);
+}
+
+/*
+ * seq_file ->next handler: advance *pos by one and return the cookie for
+ * the new position (group number plus one), or NULL once *pos is outside
+ * [0, number of groups).
+ */
+static void *ext4_mb_seq_groups_next(struct seq_file *seq, void *v, loff_t *pos)
+{
+	struct super_block *sb = seq->private;
+	ext4_group_t cookie;
+	loff_t idx;
+
+	idx = ++*pos;
+	if (idx < 0 || idx >= ext4_get_groups_count(sb))
+		return NULL;
+
+	cookie = idx + 1;
+	return (void *) ((unsigned long) cookie);
+}
+
+/*
+ * seq_file ->show handler: print one line of allocator statistics (free
+ * count, fragment count, first free bit and the per-order free chunk
+ * counters for orders 0..13) for the group encoded in @v as group number
+ * plus one. A header line is printed before group 0. Always returns 0; a
+ * failure to load the group's buddy is reported in the output instead.
+ */
+static int ext4_mb_seq_groups_show(struct seq_file *seq, void *v)
+{
+	struct super_block *sb = seq->private;
+	ext4_group_t group = (ext4_group_t) ((unsigned long) v);
+	int i;
+	int err;
+	size_t copy_len;
+	struct ext4_buddy e4b;
+	struct sg {
+		struct ext4_group_info info;
+		ext4_grpblk_t counters[16];
+	} sg;
+
+	group--;
+	if (group == 0)
+		seq_printf(seq, "#%-5s: %-5s %-5s %-5s "
+			   "[ %-5s %-5s %-5s %-5s %-5s %-5s %-5s "
+			   "%-5s %-5s %-5s %-5s %-5s %-5s %-5s ]\n",
+			   "group", "free", "frags", "first",
+			   "2^0", "2^1", "2^2", "2^3", "2^4", "2^5", "2^6",
+			   "2^7", "2^8", "2^9", "2^10", "2^11", "2^12", "2^13");
+
+	/* group info header plus one counter per buddy order */
+	copy_len = (sb->s_blocksize_bits + 2) * sizeof(sg.info.bb_counters[0]) +
+		   sizeof(struct ext4_group_info);
+	/*
+	 * The on-stack snapshot only has room for 16 counters, but a group
+	 * has s_blocksize_bits + 2 of them, which is more than 16 for large
+	 * block sizes. Never copy past the snapshot; only orders 0..13 are
+	 * printed below, so nothing that is displayed gets cut off.
+	 */
+	if (copy_len > sizeof(sg))
+		copy_len = sizeof(sg);
+
+	err = ext4_mb_load_buddy(sb, group, &e4b);
+	if (err) {
+		seq_printf(seq, "#%-5u: I/O error\n", group);
+		return 0;
+	}
+	/* take a consistent snapshot under the group lock */
+	ext4_lock_group(sb, group);
+	memcpy(&sg, ext4_get_group_info(sb, group), copy_len);
+	ext4_unlock_group(sb, group);
+	ext4_mb_unload_buddy(&e4b);
+
+	seq_printf(seq, "#%-5u: %-5u %-5u %-5u [", group, sg.info.bb_free,
+		   sg.info.bb_fragments, sg.info.bb_first_free);
+	for (i = 0; i <= 13; i++)
+		seq_printf(seq, " %-5u", i <= sb->s_blocksize_bits + 1 ?
+			   sg.info.bb_counters[i] : 0);
+	seq_printf(seq, " ]\n");
+
+	return 0;
+}
+
+static void ext4_mb_seq_groups_stop(struct seq_file *seq, void *v)
+{ /* nothing to release: start/next only hand out plain group numbers */
+}
+
+static const struct seq_operations ext4_mb_seq_groups_ops = { /* iterator installed by ext4_mb_seq_groups_open() */
+ .start = ext4_mb_seq_groups_start,
+ .next = ext4_mb_seq_groups_next,
+ .stop = ext4_mb_seq_groups_stop,
+ .show = ext4_mb_seq_groups_show, /* prints one line per block group */
+};
+
+/*
+ * ->open handler: set up the seq_file with ext4_mb_seq_groups_ops and stash
+ * the super block taken from the proc entry's data pointer in its private
+ * field. Returns the result of seq_open().
+ */
+static int ext4_mb_seq_groups_open(struct inode *inode, struct file *file)
+{
+	struct super_block *sb = PDE(inode)->data;
+	struct seq_file *m;
+	int rc;
+
+	rc = seq_open(file, &ext4_mb_seq_groups_ops);
+	if (rc != 0)
+		return rc;
+
+	m = file->private_data;
+	m->private = sb;
+	return rc;
+}
+
+static const struct file_operations ext4_mb_seq_groups_fops = { /* file ops for the per-group statistics listing */
+ .owner = THIS_MODULE,
+ .open = ext4_mb_seq_groups_open, /* installs ext4_mb_seq_groups_ops */
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = seq_release,
+};
+
+/*
+ * Return the group-info slab cache for the given block size, indexed by
+ * (blocksize_bits - EXT4_MIN_BLOCK_LOG_SIZE). The cache must already
+ * exist.
+ */
+static struct kmem_cache *get_groupinfo_cache(int blocksize_bits)
+{
+	struct kmem_cache *cachep;
+
+	cachep = ext4_groupinfo_caches[blocksize_bits - EXT4_MIN_BLOCK_LOG_SIZE];
+	BUG_ON(cachep == NULL);
+	return cachep;
+}
+
+/*
+ * Create and initialize ext4_group_info data for the given group.
+ *
+ * When @group is the first group of a descriptor block, the table of
+ * ext4_group_info pointers for that block is allocated first. The new
+ * group info is zeroed, flagged NEED_INIT and gets bb_free preset from the
+ * group descriptor @desc. Returns 0 on success or -ENOMEM when either
+ * allocation fails; a pointer table allocated by this call is released
+ * again if the group info allocation fails.
+ */
+int ext4_mb_add_groupinfo(struct super_block *sb, ext4_group_t group,
+ struct ext4_group_desc *desc)
+{
+ int i;
+ int metalen = 0;
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
+ struct ext4_group_info **meta_group_info;
+ struct kmem_cache *cachep = get_groupinfo_cache(sb->s_blocksize_bits);
+
+ /*
+ * First check if this group is the first of a reserved block.
+ * If it's true, we have to allocate a new table of pointers
+ * to ext4_group_info structures
+ */
+ if (group % EXT4_DESC_PER_BLOCK(sb) == 0) {
+ metalen = sizeof(*meta_group_info) <<
+ EXT4_DESC_PER_BLOCK_BITS(sb);
+ meta_group_info = kmalloc(metalen, GFP_KERNEL);
+ if (meta_group_info == NULL) {
+ ext4_msg(sb, KERN_ERR, "can't allocate mem "
+ "for a buddy group");
+ goto exit_meta_group_info;
+ }
+ sbi->s_group_info[group >> EXT4_DESC_PER_BLOCK_BITS(sb)] =
+ meta_group_info;
+ }
+
+ meta_group_info =
+ sbi->s_group_info[group >> EXT4_DESC_PER_BLOCK_BITS(sb)];
+ i = group & (EXT4_DESC_PER_BLOCK(sb) - 1); /* slot within the table */
+
+ meta_group_info[i] = kmem_cache_alloc(cachep, GFP_KERNEL);
+ if (meta_group_info[i] == NULL) {
+ ext4_msg(sb, KERN_ERR, "can't allocate buddy mem");
+ goto exit_group_info;
+ }
+ memset(meta_group_info[i], 0, kmem_cache_size(cachep));
+ set_bit(EXT4_GROUP_INFO_NEED_INIT_BIT,
+ &(meta_group_info[i]->bb_state));
+
+ /*
+ * initialize bb_free to be able to skip
+ * empty groups without initialization
+ */
+ if (desc->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) {
+ meta_group_info[i]->bb_free =
+ ext4_free_clusters_after_init(sb, group, desc);
+ } else {
+ meta_group_info[i]->bb_free =
+ ext4_free_group_clusters(sb, desc);
+ }
+
+ INIT_LIST_HEAD(&meta_group_info[i]->bb_prealloc_list);
+ init_rwsem(&meta_group_info[i]->alloc_sem);
+ meta_group_info[i]->bb_free_root = RB_ROOT;
+ meta_group_info[i]->bb_largest_free_order = -1; /* uninit */
+
+#ifdef DOUBLE_CHECK
+ { /* debug build: keep a private copy of the on-disk block bitmap */
+ struct buffer_head *bh;
+ meta_group_info[i]->bb_bitmap =
+ kmalloc(sb->s_blocksize, GFP_KERNEL);
+ BUG_ON(meta_group_info[i]->bb_bitmap == NULL);
+ bh = ext4_read_block_bitmap(sb, group);
+ BUG_ON(bh == NULL);
+ memcpy(meta_group_info[i]->bb_bitmap, bh->b_data,
+ sb->s_blocksize);
+ put_bh(bh);
+ }
+#endif
+
+ return 0;
+
+exit_group_info:
+ /* If a meta_group_info table has been allocated, release it now */
+ if (group % EXT4_DESC_PER_BLOCK(sb) == 0) {
+ kfree(sbi->s_group_info[group >> EXT4_DESC_PER_BLOCK_BITS(sb)]);
+ sbi->s_group_info[group >> EXT4_DESC_PER_BLOCK_BITS(sb)] = NULL;
+ }
+exit_meta_group_info:
+ return -ENOMEM;
+} /* ext4_mb_add_groupinfo */
+
+static int ext4_mb_init_backend(struct super_block *sb)
+{ /*
+ * Set up the allocator's per-filesystem bookkeeping: allocate the
+ * s_group_info pointer array, create the buddy cache inode and add an
+ * ext4_group_info for every block group. Returns 0 on success and
+ * -ENOMEM on any failure (including an unreadable group descriptor),
+ * after releasing what was set up here.
+ */
+ ext4_group_t ngroups = ext4_get_groups_count(sb);
+ ext4_group_t i;
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
+ struct ext4_super_block *es = sbi->s_es;
+ int num_meta_group_infos;
+ int num_meta_group_infos_max;
+ int array_size;
+ struct ext4_group_desc *desc;
+ struct kmem_cache *cachep;
+
+ /* This is the number of blocks used by GDT */
+ num_meta_group_infos = (ngroups + EXT4_DESC_PER_BLOCK(sb) -
+ 1) >> EXT4_DESC_PER_BLOCK_BITS(sb);
+
+ /*
+ * This is the total number of blocks used by GDT including
+ * the number of reserved blocks for GDT.
+ * The s_group_info array is allocated with this value
+ * to allow a clean online resize without a complex
+ * manipulation of pointer.
+ * The drawback is the unused memory when no resize
+ * occurs but it's very low in terms of pages
+ * (see comments below)
+ * Need to handle this properly when META_BG resizing is allowed
+ */
+ num_meta_group_infos_max = num_meta_group_infos +
+ le16_to_cpu(es->s_reserved_gdt_blocks);
+
+ /*
+ * array_size is the size of s_group_info array. We round it
+ * to the next power of two because this approximation is done
+ * internally by kmalloc so we can have some more memory
+ * for free here (e.g. may be used for META_BG resize).
+ */
+ array_size = 1;
+ while (array_size < sizeof(*sbi->s_group_info) *
+ num_meta_group_infos_max)
+ array_size = array_size << 1;
+ /* An 8TB filesystem with 64-bit pointers requires a 4096 byte
+ * kmalloc. A 128kb malloc should suffice for a 256TB filesystem.
+ * So a two level scheme suffices for now. */
+ sbi->s_group_info = ext4_kvzalloc(array_size, GFP_KERNEL);
+ if (sbi->s_group_info == NULL) {
+ ext4_msg(sb, KERN_ERR, "can't allocate buddy meta group");
+ return -ENOMEM;
+ }
+ sbi->s_buddy_cache = new_inode(sb);
+ if (sbi->s_buddy_cache == NULL) {
+ ext4_msg(sb, KERN_ERR, "can't get new inode");
+ goto err_freesgi;
+ }
+ /* To avoid potentially colliding with a valid on-disk inode number,
+ * use EXT4_BAD_INO for the buddy cache inode number. This inode is
+ * not in the inode hash, so it should never be found by iget(), but
+ * this will avoid confusion if it ever shows up during debugging. */
+ sbi->s_buddy_cache->i_ino = EXT4_BAD_INO;
+ EXT4_I(sbi->s_buddy_cache)->i_disksize = 0;
+ for (i = 0; i < ngroups; i++) {
+ desc = ext4_get_group_desc(sb, i, NULL);
+ if (desc == NULL) {
+ ext4_msg(sb, KERN_ERR, "can't read descriptor %u", i);
+ goto err_freebuddy;
+ }
+ if (ext4_mb_add_groupinfo(sb, i, desc) != 0)
+ goto err_freebuddy;
+ }
+
+ return 0;
+
+err_freebuddy:
+ cachep = get_groupinfo_cache(sb->s_blocksize_bits);
+ while (i-- > 0) /* free the group infos added before the failing group */
+ kmem_cache_free(cachep, ext4_get_group_info(sb, i));
+ i = num_meta_group_infos;
+ while (i-- > 0) /* then the per-descriptor-block pointer tables */
+ kfree(sbi->s_group_info[i]);
+ iput(sbi->s_buddy_cache);
+err_freesgi:
+ ext4_kvfree(sbi->s_group_info);
+ return -ENOMEM;
+}
+
+/*
+ * Destroy every group-info slab cache that has been created so far and
+ * reset its slot in ext4_groupinfo_caches[] to NULL.
+ */
+static void ext4_groupinfo_destroy_slabs(void)
+{
+	int idx;
+
+	for (idx = 0; idx < NR_GRPINFO_CACHES; idx++) {
+		struct kmem_cache *cache = ext4_groupinfo_caches[idx];
+
+		/* slot never populated: nothing to destroy, already NULL */
+		if (!cache)
+			continue;
+
+		kmem_cache_destroy(cache);
+		ext4_groupinfo_caches[idx] = NULL;
+	}
+}
+
+/*
+ * Make sure the group-info slab cache for block size @size exists.
+ *
+ * The cache slot is chosen from order_base_2(@size); creation is
+ * serialised by a function-local mutex so that concurrent callers create
+ * each cache at most once.
+ *
+ * Returns 0 if the cache already existed or was created, -EINVAL if the
+ * block size maps past the last slot, -ENOMEM if kmem_cache_create()
+ * failed.
+ */
+static int ext4_groupinfo_create_slab(size_t size)
+{
+	static DEFINE_MUTEX(ext4_grpinfo_slab_create_mutex);
+	int bits = order_base_2(size);
+	int idx = bits - EXT4_MIN_BLOCK_LOG_SIZE;
+	int slab_size;
+	int err = 0;
+
+	if (idx >= NR_GRPINFO_CACHES)
+		return -EINVAL;
+
+	/* block sizes below the minimum share slot 0 */
+	if (unlikely(idx < 0))
+		idx = 0;
+
+	mutex_lock(&ext4_grpinfo_slab_create_mutex);
+	if (!ext4_groupinfo_caches[idx]) {
+		struct kmem_cache *created;
+
+		/* the object ends after bb_counters[bits + 2] entries */
+		slab_size = offsetof(struct ext4_group_info,
+				     bb_counters[bits + 2]);
+		created = kmem_cache_create(ext4_groupinfo_slab_names[idx],
+					    slab_size, 0,
+					    SLAB_RECLAIM_ACCOUNT, NULL);
+		ext4_groupinfo_caches[idx] = created;
+		if (!created)
+			err = -ENOMEM;
+	}
+	mutex_unlock(&ext4_grpinfo_slab_create_mutex);
+
+	/* report the failure only after the mutex has been dropped */
+	if (err)
+		printk(KERN_EMERG
+		       "EXT4-fs: no memory for groupinfo slab cache\n");
+
+	return err;
+}
+
+/*
+ * ext4_mb_init - set up the multiblock allocator for one superblock
+ * @sb: superblock being set up
+ * @needs_recovery: not referenced by this function
+ *
+ * Allocates and fills the per-order buddy offset/size tables, makes sure
+ * the group-info slab cache for this block size exists, installs the
+ * tunable defaults, allocates the per-CPU locality groups and builds the
+ * group-info backend.  The "mb_groups" proc file is registered when
+ * sbi->s_proc is set.
+ *
+ * Returns 0 on success or a negative errno.  On failure the per-superblock
+ * allocations made here are released again; the group-info slab caches
+ * are left in place (see the comment on the error path).
+ */
+int ext4_mb_init(struct super_block *sb, int needs_recovery)
+{
+	struct ext4_sb_info *sbi = EXT4_SB(sb);
+	unsigned i, j;
+	unsigned offset;
+	unsigned max;
+	int ret;
+
+	i = (sb->s_blocksize_bits + 2) * sizeof(*sbi->s_mb_offsets);
+
+	sbi->s_mb_offsets = kmalloc(i, GFP_KERNEL);
+	if (sbi->s_mb_offsets == NULL) {
+		ret = -ENOMEM;
+		goto out;
+	}
+
+	i = (sb->s_blocksize_bits + 2) * sizeof(*sbi->s_mb_maxs);
+	sbi->s_mb_maxs = kmalloc(i, GFP_KERNEL);
+	if (sbi->s_mb_maxs == NULL) {
+		ret = -ENOMEM;
+		goto out;
+	}
+
+	ret = ext4_groupinfo_create_slab(sb->s_blocksize);
+	if (ret < 0)
+		goto out;
+
+	/* order 0 is regular bitmap */
+	sbi->s_mb_maxs[0] = sb->s_blocksize << 3;
+	sbi->s_mb_offsets[0] = 0;
+
+	/*
+	 * Orders 1 .. s_blocksize_bits + 1: each order holds half as many
+	 * bits as the previous one and starts right after it.
+	 */
+	i = 1;
+	offset = 0;
+	max = sb->s_blocksize << 2;
+	do {
+		sbi->s_mb_offsets[i] = offset;
+		sbi->s_mb_maxs[i] = max;
+		offset += 1 << (sb->s_blocksize_bits - i);
+		max = max >> 1;
+		i++;
+	} while (i <= sb->s_blocksize_bits + 1);
+
+	spin_lock_init(&sbi->s_md_lock);
+	spin_lock_init(&sbi->s_bal_lock);
+
+	sbi->s_mb_max_to_scan = MB_DEFAULT_MAX_TO_SCAN;
+	sbi->s_mb_min_to_scan = MB_DEFAULT_MIN_TO_SCAN;
+	sbi->s_mb_stats = MB_DEFAULT_STATS;
+	sbi->s_mb_stream_request = MB_DEFAULT_STREAM_THRESHOLD;
+	sbi->s_mb_order2_reqs = MB_DEFAULT_ORDER2_REQS;
+	/*
+	 * The default group preallocation is 512, which for 4k block
+	 * sizes translates to 2 megabytes.  However for bigalloc file
+	 * systems, this is probably too big (i.e, if the cluster size
+	 * is 1 megabyte, then group preallocation size becomes half a
+	 * gigabyte!).  As a default, we will keep a two megabyte
+	 * group prealloc size for cluster sizes up to 64k, and after
+	 * that, we will force a minimum group preallocation size of
+	 * 32 clusters.  This translates to 8 megs when the cluster
+	 * size is 256k, and 32 megs when the cluster size is 1 meg,
+	 * which seems reasonable as a default.
+	 */
+	sbi->s_mb_group_prealloc = max(MB_DEFAULT_GROUP_PREALLOC >>
+				       sbi->s_cluster_bits, 32);
+	/*
+	 * If there is a s_stripe > 1, then we set the s_mb_group_prealloc
+	 * to the lowest multiple of s_stripe which is bigger than
+	 * the s_mb_group_prealloc as determined above. We want
+	 * the preallocation size to be an exact multiple of the
+	 * RAID stripe size so that preallocations don't fragment
+	 * the stripes.
+	 */
+	if (sbi->s_stripe > 1) {
+		sbi->s_mb_group_prealloc = roundup(
+			sbi->s_mb_group_prealloc, sbi->s_stripe);
+	}
+
+	sbi->s_locality_groups = alloc_percpu(struct ext4_locality_group);
+	if (sbi->s_locality_groups == NULL) {
+		ret = -ENOMEM;
+		goto out;
+	}
+	for_each_possible_cpu(i) {
+		struct ext4_locality_group *lg;
+		lg = per_cpu_ptr(sbi->s_locality_groups, i);
+		mutex_init(&lg->lg_mutex);
+		for (j = 0; j < PREALLOC_TB_SIZE; j++)
+			INIT_LIST_HEAD(&lg->lg_prealloc_list[j]);
+		spin_lock_init(&lg->lg_prealloc_lock);
+	}
+
+	/* init file for buddy data */
+	ret = ext4_mb_init_backend(sb);
+	if (ret != 0)
+		goto out_free_locality_groups;
+
+	if (sbi->s_proc)
+		proc_create_data("mb_groups", S_IRUGO, sbi->s_proc,
+				 &ext4_mb_seq_groups_fops, sb);
+
+	return 0;
+
+out_free_locality_groups:
+	free_percpu(sbi->s_locality_groups);
+	sbi->s_locality_groups = NULL;
+out:
+	/*
+	 * Do NOT destroy the group-info slab caches here.
+	 * ext4_groupinfo_caches[] is file-scope state that
+	 * ext4_groupinfo_create_slab() reuses for every superblock with the
+	 * same block size, so tearing it down because this one setup failed
+	 * would pull the caches out from under filesystems that are already
+	 * mounted.  The caches are destroyed in ext4_exit_mballoc().
+	 */
+	kfree(sbi->s_mb_offsets);
+	sbi->s_mb_offsets = NULL;
+	kfree(sbi->s_mb_maxs);
+	sbi->s_mb_maxs = NULL;
+	return ret;
+}
+
+/*
+ * Unlink and free every preallocation still attached to @grp and, when
+ * any were found, emit a debug message with their number.
+ *
+ * Needs to be called with the ext4 group lock held.
+ */
+static void ext4_mb_cleanup_pa(struct ext4_group_info *grp)
+{
+	struct ext4_prealloc_space *pa, *next;
+	int count = 0;
+
+	/* _safe variant: each entry is freed while walking the list */
+	list_for_each_entry_safe(pa, next, &grp->bb_prealloc_list,
+				 pa_group_list) {
+		list_del(&pa->pa_group_list);
+		count++;
+		kmem_cache_free(ext4_pspace_cachep, pa);
+	}
+
+	if (count)
+		mb_debug(1, "mballoc: %u PAs left\n", count);
+}
+
+/*
+ * Tear down the per-superblock multiblock allocator state: remove the
+ * "mb_groups" proc entry, free every group-info object together with any
+ * preallocations still linked to it, free the s_group_info tables, the
+ * offset/max arrays, the buddy cache inode and the per-CPU locality
+ * groups.  When sbi->s_mb_stats is set the allocator counters are logged.
+ *
+ * Always returns 0.
+ */
+int ext4_mb_release(struct super_block *sb)
+{
+ ext4_group_t ngroups = ext4_get_groups_count(sb);
+ ext4_group_t i;
+ int num_meta_group_infos;
+ struct ext4_group_info *grinfo;
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
+ struct kmem_cache *cachep = get_groupinfo_cache(sb->s_blocksize_bits);
+
+ if (sbi->s_proc)
+ remove_proc_entry("mb_groups", sbi->s_proc);
+
+ if (sbi->s_group_info) {
+ for (i = 0; i < ngroups; i++) {
+ grinfo = ext4_get_group_info(sb, i);
+#ifdef DOUBLE_CHECK
+ kfree(grinfo->bb_bitmap);
+#endif
+ /* leftover preallocations are dropped under the group lock */
+ ext4_lock_group(sb, i);
+ ext4_mb_cleanup_pa(grinfo);
+ ext4_unlock_group(sb, i);
+ kmem_cache_free(cachep, grinfo);
+ }
+ /* number of second-level tables: groups rounded up per desc block */
+ num_meta_group_infos = (ngroups +
+ EXT4_DESC_PER_BLOCK(sb) - 1) >>
+ EXT4_DESC_PER_BLOCK_BITS(sb);
+ for (i = 0; i < num_meta_group_infos; i++)
+ kfree(sbi->s_group_info[i]);
+ ext4_kvfree(sbi->s_group_info);
+ }
+ kfree(sbi->s_mb_offsets);
+ kfree(sbi->s_mb_maxs);
+ if (sbi->s_buddy_cache)
+ iput(sbi->s_buddy_cache);
+ /* optional statistics dump, only when stats collection is enabled */
+ if (sbi->s_mb_stats) {
+ ext4_msg(sb, KERN_INFO,
+ "mballoc: %u blocks %u reqs (%u success)",
+ atomic_read(&sbi->s_bal_allocated),
+ atomic_read(&sbi->s_bal_reqs),
+ atomic_read(&sbi->s_bal_success));
+ ext4_msg(sb, KERN_INFO,
+ "mballoc: %u extents scanned, %u goal hits, "
+ "%u 2^N hits, %u breaks, %u lost",
+ atomic_read(&sbi->s_bal_ex_scanned),
+ atomic_read(&sbi->s_bal_goals),
+ atomic_read(&sbi->s_bal_2orders),
+ atomic_read(&sbi->s_bal_breaks),
+ atomic_read(&sbi->s_mb_lost_chunks));
+ ext4_msg(sb, KERN_INFO,
+ "mballoc: %lu generated and it took %Lu",
+ sbi->s_mb_buddies_generated,
+ sbi->s_mb_generation_time);
+ ext4_msg(sb, KERN_INFO,
+ "mballoc: %u preallocated, %u discarded",
+ atomic_read(&sbi->s_mb_preallocated),
+ atomic_read(&sbi->s_mb_discarded));
+ }
+
+ free_percpu(sbi->s_locality_groups);
+
+ return 0;
+}
+
+/*
+ * Issue a discard for @count clusters starting at cluster @cluster of
+ * block group @block_group.  Both values are converted from clusters to
+ * filesystem blocks before the request is traced and handed to
+ * sb_issue_discard(), whose return value is passed back.
+ */
+static inline int ext4_issue_discard(struct super_block *sb,
+		ext4_group_t block_group, ext4_grpblk_t cluster, int count)
+{
+	ext4_fsblk_t first_blk;
+
+	first_blk = EXT4_C2B(EXT4_SB(sb), cluster);
+	first_blk += ext4_group_first_block_no(sb, block_group);
+
+	/* from here on count is a number of blocks, not clusters */
+	count = EXT4_C2B(EXT4_SB(sb), count);
+
+	trace_ext4_discard_blocks(sb, (unsigned long long) first_blk, count);
+
+	return sb_issue_discard(sb, first_blk, count, GFP_NOFS, 0);
+}
+
+/*
+ * This function is called by the jbd2 layer once the commit has finished,
+ * so we know we can free the blocks that were released with that commit.
+ *
+ * @sb:  superblock the freed extent belongs to
+ * @jce: journal callback entry; it is cast to the struct ext4_free_data
+ *       describing the extent, which is freed before returning
+ * @rc:  not referenced by this function
+ */
+static void ext4_free_data_callback(struct super_block *sb,
+ struct ext4_journal_cb_entry *jce,
+ int rc)
+{
+ struct ext4_free_data *entry = (struct ext4_free_data *)jce;
+ struct ext4_buddy e4b;
+ struct ext4_group_info *db;
+ int err, count = 0, count2 = 0;
+
+ mb_debug(1, "gonna free %u blocks in group %u (0x%p):",
+ entry->efd_count, entry->efd_group, entry);
+
+ /* the discard result is intentionally not checked here */
+ if (test_opt(sb, DISCARD))
+ ext4_issue_discard(sb, entry->efd_group,
+ entry->efd_start_cluster, entry->efd_count);
+
+ err = ext4_mb_load_buddy(sb, entry->efd_group, &e4b);
+ /* we expect to find existing buddy because it's pinned */
+ BUG_ON(err != 0);
+
+
+ db = e4b.bd_info;
+ /* there are blocks to put in buddy to make them really free */
+ /* count/count2 only feed the debug message at the end */
+ count += entry->efd_count;
+ count2++;
+ ext4_lock_group(sb, entry->efd_group);
+ /* Take it out of per group rb tree */
+ rb_erase(&entry->efd_node, &(db->bb_free_root));
+ mb_free_blocks(NULL, &e4b, entry->efd_start_cluster, entry->efd_count);
+
+ /*
+ * Clear the trimmed flag for the group so that the next
+ * ext4_trim_fs can trim it.
+ * If the volume is mounted with -o discard, online discard
+ * is supported and the free blocks will be trimmed online.
+ */
+ if (!test_opt(sb, DISCARD))
+ EXT4_MB_GRP_CLEAR_TRIMMED(db);
+
+ if (!db->bb_free_root.rb_node) {
+ /* No more items in the per group rb tree
+ * balance refcounts from ext4_mb_free_metadata()
+ */
+ page_cache_release(e4b.bd_buddy_page);
+ page_cache_release(e4b.bd_bitmap_page);
+ }
+ ext4_unlock_group(sb, entry->efd_group);
+ kmem_cache_free(ext4_free_data_cachep, entry);
+ ext4_mb_unload_buddy(&e4b);
+
+ mb_debug(1, "freed %u blocks in %u structures\n", count, count2);
+}
+
+#ifdef CONFIG_EXT4_DEBUG
+/* debug switch exposed through debugfs as ext4/mballoc-debug */
+u8 mb_enable_debug __read_mostly;
+
+static struct dentry *debugfs_dir;
+static struct dentry *debugfs_debug;
+
+/*
+ * Create the "ext4" debugfs directory and, if that worked, the
+ * "mballoc-debug" u8 file backed by mb_enable_debug.
+ */
+static void __init ext4_create_debugfs_entry(void)
+{
+	debugfs_dir = debugfs_create_dir("ext4", NULL);
+	if (!debugfs_dir)
+		return;
+
+	debugfs_debug = debugfs_create_u8("mballoc-debug",
+					  S_IRUGO | S_IWUSR,
+					  debugfs_dir,
+					  &mb_enable_debug);
+}
+
+/* Remove the debugfs file first, then the directory that holds it. */
+static void ext4_remove_debugfs_entry(void)
+{
+	debugfs_remove(debugfs_debug);
+	debugfs_remove(debugfs_dir);
+}
+
+#else
+
+/* Without CONFIG_EXT4_DEBUG both debugfs hooks are empty. */
+static void __init ext4_create_debugfs_entry(void)
+{
+}
+
+static void ext4_remove_debugfs_entry(void)
+{
+}
+
+#endif
+
+/*
+ * Create the three slab caches used by the multiblock allocator
+ * (preallocation spaces, allocation contexts, free-data entries) and the
+ * debugfs entry.
+ *
+ * Returns 0 on success.  If any cache cannot be created, the caches
+ * created before it are destroyed again and -ENOMEM is returned.
+ */
+int __init ext4_init_mballoc(void)
+{
+	ext4_pspace_cachep = KMEM_CACHE(ext4_prealloc_space,
+					SLAB_RECLAIM_ACCOUNT);
+	if (!ext4_pspace_cachep)
+		return -ENOMEM;
+
+	ext4_ac_cachep = KMEM_CACHE(ext4_allocation_context,
+				    SLAB_RECLAIM_ACCOUNT);
+	if (!ext4_ac_cachep) {
+		kmem_cache_destroy(ext4_pspace_cachep);
+		return -ENOMEM;
+	}
+
+	ext4_free_data_cachep = KMEM_CACHE(ext4_free_data,
+					   SLAB_RECLAIM_ACCOUNT);
+	if (!ext4_free_data_cachep) {
+		kmem_cache_destroy(ext4_pspace_cachep);
+		kmem_cache_destroy(ext4_ac_cachep);
+		return -ENOMEM;
+	}
+
+	ext4_create_debugfs_entry();
+
+	return 0;
+}
+
+/*
+ * Counterpart of ext4_init_mballoc(): destroys the three allocator slab
+ * caches, all group-info slab caches and the debugfs entries.
+ */
+void ext4_exit_mballoc(void)
+{
+ /*
+ * Wait for completion of call_rcu()'s on ext4_pspace_cachep
+ * before destroying the slab cache.
+ */
+ rcu_barrier();
+ kmem_cache_destroy(ext4_pspace_cachep);
+ kmem_cache_destroy(ext4_ac_cachep);
+ kmem_cache_destroy(ext4_free_data_cachep);
+ ext4_groupinfo_destroy_slabs();
+ ext4_remove_debugfs_entry();
+}
+
+
+/*
+ * Check quota and mark chosen space (ac->ac_b_ex) non-free in bitmaps
+ * Returns 0 if success or error code
+ *
+ * @ac:            allocation context; must be in AC_STATUS_FOUND with a
+ *                 positive ac_b_ex.fe_len
+ * @handle:        journal handle used to get write access to, and to
+ *                 dirty, the block bitmap and the group descriptor buffer
+ * @reserv_clstrs: number of clusters subtracted from
+ *                 s_dirtyclusters_counter when the request was not
+ *                 flagged EXT4_MB_DELALLOC_RESERVED
+ *
+ * Error returns visible here: -EIO when the bitmap or the group
+ * descriptor cannot be obtained, -EAGAIN when the chosen extent overlaps
+ * filesystem metadata (the bits are still marked used in that case), or
+ * whatever the journal helpers return.
+ */
+static noinline_for_stack int
+ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac,
+ handle_t *handle, unsigned int reserv_clstrs)
+{
+ struct buffer_head *bitmap_bh = NULL;
+ struct ext4_group_desc *gdp;
+ struct buffer_head *gdp_bh;
+ struct ext4_sb_info *sbi;
+ struct super_block *sb;
+ ext4_fsblk_t block;
+ int err, len;
+
+ BUG_ON(ac->ac_status != AC_STATUS_FOUND);
+ BUG_ON(ac->ac_b_ex.fe_len <= 0);
+
+ sb = ac->ac_sb;
+ sbi = EXT4_SB(sb);
+
+ err = -EIO;
+ bitmap_bh = ext4_read_block_bitmap(sb, ac->ac_b_ex.fe_group);
+ if (!bitmap_bh)
+ goto out_err;
+
+ err = ext4_journal_get_write_access(handle, bitmap_bh);
+ if (err)
+ goto out_err;
+
+ err = -EIO;
+ gdp = ext4_get_group_desc(sb, ac->ac_b_ex.fe_group, &gdp_bh);
+ if (!gdp)
+ goto out_err;
+
+ ext4_debug("using block group %u(%d)\n", ac->ac_b_ex.fe_group,
+ ext4_free_group_clusters(sb, gdp));
+
+ err = ext4_journal_get_write_access(handle, gdp_bh);
+ if (err)
+ goto out_err;
+
+ block = ext4_grp_offs_to_block(sb, &ac->ac_b_ex);
+
+ /* len is a block count here; it is reused as a cluster count below */
+ len = EXT4_C2B(sbi, ac->ac_b_ex.fe_len);
+ if (!ext4_data_block_valid(sbi, block, len)) {
+ ext4_error(sb, "Allocating blocks %llu-%llu which overlap "
+ "fs metadata", block, block+len);
+ /* File system mounted not to panic on error
+ * Fix the bitmap and repeat the block allocation
+ * We leak some of the blocks here.
+ */
+ ext4_lock_group(sb, ac->ac_b_ex.fe_group);
+ ext4_set_bits(bitmap_bh->b_data, ac->ac_b_ex.fe_start,
+ ac->ac_b_ex.fe_len);
+ ext4_unlock_group(sb, ac->ac_b_ex.fe_group);
+ err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh);
+ if (!err)
+ err = -EAGAIN;
+ goto out_err;
+ }
+
+ ext4_lock_group(sb, ac->ac_b_ex.fe_group);
+#ifdef AGGRESSIVE_CHECK
+ {
+ int i;
+ for (i = 0; i < ac->ac_b_ex.fe_len; i++) {
+ BUG_ON(mb_test_bit(ac->ac_b_ex.fe_start + i,
+ bitmap_bh->b_data));
+ }
+ }
+#endif
+ ext4_set_bits(bitmap_bh->b_data, ac->ac_b_ex.fe_start,
+ ac->ac_b_ex.fe_len);
+ /* first allocation in an uninitialised group: set up its free count */
+ if (gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) {
+ gdp->bg_flags &= cpu_to_le16(~EXT4_BG_BLOCK_UNINIT);
+ ext4_free_group_clusters_set(sb, gdp,
+ ext4_free_clusters_after_init(sb,
+ ac->ac_b_ex.fe_group, gdp));
+ }
+ /* new free-cluster count of the group after this allocation */
+ len = ext4_free_group_clusters(sb, gdp) - ac->ac_b_ex.fe_len;
+ ext4_free_group_clusters_set(sb, gdp, len);
+ gdp->bg_checksum = ext4_group_desc_csum(sbi, ac->ac_b_ex.fe_group, gdp);
+
+ ext4_unlock_group(sb, ac->ac_b_ex.fe_group);
+ percpu_counter_sub(&sbi->s_freeclusters_counter, ac->ac_b_ex.fe_len);
+ /*
+ * Now reduce the dirty block count also. Should not go negative
+ */
+ if (!(ac->ac_flags & EXT4_MB_DELALLOC_RESERVED))
+ /* release all the reserved blocks if non delalloc */
+ percpu_counter_sub(&sbi->s_dirtyclusters_counter,
+ reserv_clstrs);
+
+ if (sbi->s_log_groups_per_flex) {
+ ext4_group_t flex_group = ext4_flex_group(sbi,
+ ac->ac_b_ex.fe_group);
+ atomic_sub(ac->ac_b_ex.fe_len,
+ &sbi->s_flex_groups[flex_group].free_clusters);
+ }
+
+ err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh);
+ if (err)
+ goto out_err;
+ err = ext4_handle_dirty_metadata(handle, NULL, gdp_bh);
+
+ /* reached on success and on every failure; brelse(NULL) is a no-op */
+out_err:
+ ext4_mark_super_dirty(sb);
+ brelse(bitmap_bh);
+ return err;
+}
+
+/*
+ * Normalize a locality-group request: the goal length is simply set to
+ * the superblock's s_mb_group_prealloc value.  The context must already
+ * have a locality group attached.
+ *
+ * s_mb_group_prealloc can be configured via
+ * /sys/fs/ext4/<partition>/mb_group_prealloc
+ *
+ * XXX: should we try to preallocate more than the group has now?
+ */
+static void ext4_mb_normalize_group_request(struct ext4_allocation_context *ac)
+{
+	struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
+
+	BUG_ON(ac->ac_lg == NULL);
+
+	ac->ac_g_ex.fe_len = sbi->s_mb_group_prealloc;
+	mb_debug(1, "#%u: goal %u blocks for locality group\n",
+		 current->pid, ac->ac_g_ex.fe_len);
+}
+
+/*
+ * Normalization means making request better in terms of
+ * size and alignment
+ *
+ * @ac: allocation context; ac_o_ex is the original request and ac_g_ex
+ *      receives the normalized goal (logical start and length)
+ * @ar: caller's request; pleft/lleft and pright/lright describe the
+ *      already allocated neighbours that the goal must not cover
+ *
+ * Nothing is done for non-data requests, for EXT4_MB_HINT_GOAL_ONLY and
+ * for EXT4_MB_HINT_NOPREALLOC.  Group allocations are delegated to
+ * ext4_mb_normalize_group_request().  Otherwise the expected file size is
+ * rounded up using a fixed table, the resulting window is trimmed so that
+ * it overlaps neither the neighbouring extents nor any existing inode
+ * preallocation, and the result is stored as the goal.
+ */
+static noinline_for_stack void
+ext4_mb_normalize_request(struct ext4_allocation_context *ac,
+				struct ext4_allocation_request *ar)
+{
+	struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
+	int bsbits, max;
+	ext4_lblk_t end;
+	loff_t size, start_off;
+	loff_t orig_size __maybe_unused;
+	ext4_lblk_t start;
+	struct ext4_inode_info *ei = EXT4_I(ac->ac_inode);
+	struct ext4_prealloc_space *pa;
+
+	/* do normalize only data requests, metadata requests
+	   do not need preallocation */
+	if (!(ac->ac_flags & EXT4_MB_HINT_DATA))
+		return;
+
+	/* sometime caller may want exact blocks */
+	if (unlikely(ac->ac_flags & EXT4_MB_HINT_GOAL_ONLY))
+		return;
+
+	/* caller may indicate that preallocation isn't
+	 * required (it's a tail, for example) */
+	if (ac->ac_flags & EXT4_MB_HINT_NOPREALLOC)
+		return;
+
+	if (ac->ac_flags & EXT4_MB_HINT_GROUP_ALLOC) {
+		ext4_mb_normalize_group_request(ac);
+		return ;
+	}
+
+	bsbits = ac->ac_sb->s_blocksize_bits;
+
+	/* first, let's learn actual file size
+	 * given current request is allocated */
+	size = ac->ac_o_ex.fe_logical + EXT4_C2B(sbi, ac->ac_o_ex.fe_len);
+	size = size << bsbits;
+	if (size < i_size_read(ac->ac_inode))
+		size = i_size_read(ac->ac_inode);
+	orig_size = size;
+
+	/* max size of free chunks */
+	max = 2 << bsbits;
+
+#define NRL_CHECK_SIZE(req, size, max, chunk_size)	\
+		(req <= (size) || max <= (chunk_size))
+
+	/* first, try to predict filesize */
+	/* XXX: should this table be tunable? */
+	start_off = 0;
+	if (size <= 16 * 1024) {
+		size = 16 * 1024;
+	} else if (size <= 32 * 1024) {
+		size = 32 * 1024;
+	} else if (size <= 64 * 1024) {
+		size = 64 * 1024;
+	} else if (size <= 128 * 1024) {
+		size = 128 * 1024;
+	} else if (size <= 256 * 1024) {
+		size = 256 * 1024;
+	} else if (size <= 512 * 1024) {
+		size = 512 * 1024;
+	} else if (size <= 1024 * 1024) {
+		size = 1024 * 1024;
+	} else if (NRL_CHECK_SIZE(size, 4 * 1024 * 1024, max, 2 * 1024)) {
+		start_off = ((loff_t)ac->ac_o_ex.fe_logical >>
+						(21 - bsbits)) << 21;
+		size = 2 * 1024 * 1024;
+	} else if (NRL_CHECK_SIZE(size, 8 * 1024 * 1024, max, 4 * 1024)) {
+		start_off = ((loff_t)ac->ac_o_ex.fe_logical >>
+							(22 - bsbits)) << 22;
+		size = 4 * 1024 * 1024;
+	} else if (NRL_CHECK_SIZE(ac->ac_o_ex.fe_len,
+					(8<<20)>>bsbits, max, 8 * 1024)) {
+		start_off = ((loff_t)ac->ac_o_ex.fe_logical >>
+							(23 - bsbits)) << 23;
+		size = 8 * 1024 * 1024;
+	} else {
+		start_off = (loff_t)ac->ac_o_ex.fe_logical << bsbits;
+		/*
+		 * fe_len counts clusters while size is in bytes here and in
+		 * blocks after the shift below, so convert clusters to
+		 * blocks first and widen to loff_t before shifting so the
+		 * byte count cannot overflow an int.
+		 */
+		size = (loff_t)EXT4_C2B(sbi, ac->ac_o_ex.fe_len) << bsbits;
+	}
+	/* from here on size and start are expressed in blocks */
+	size = size >> bsbits;
+	start = start_off >> bsbits;
+
+	/* don't cover already allocated blocks in selected range */
+	if (ar->pleft && start <= ar->lleft) {
+		size -= ar->lleft + 1 - start;
+		start = ar->lleft + 1;
+	}
+	if (ar->pright && start + size - 1 >= ar->lright)
+		size -= start + size - ar->lright;
+
+	end = start + size;
+
+	/* check we don't cross already preallocated blocks */
+	rcu_read_lock();
+	list_for_each_entry_rcu(pa, &ei->i_prealloc_list, pa_inode_list) {
+		ext4_lblk_t pa_end;
+
+		if (pa->pa_deleted)
+			continue;
+		spin_lock(&pa->pa_lock);
+		if (pa->pa_deleted) {
+			spin_unlock(&pa->pa_lock);
+			continue;
+		}
+
+		pa_end = pa->pa_lstart + EXT4_C2B(EXT4_SB(ac->ac_sb),
+						  pa->pa_len);
+
+		/* PA must not overlap original request */
+		BUG_ON(!(ac->ac_o_ex.fe_logical >= pa_end ||
+			ac->ac_o_ex.fe_logical < pa->pa_lstart));
+
+		/* skip PAs this normalized request doesn't overlap with */
+		if (pa->pa_lstart >= end || pa_end <= start) {
+			spin_unlock(&pa->pa_lock);
+			continue;
+		}
+		BUG_ON(pa->pa_lstart <= start && pa_end >= end);
+
+		/* adjust start or end to be adjacent to this pa */
+		if (pa_end <= ac->ac_o_ex.fe_logical) {
+			BUG_ON(pa_end < start);
+			start = pa_end;
+		} else if (pa->pa_lstart > ac->ac_o_ex.fe_logical) {
+			BUG_ON(pa->pa_lstart > end);
+			end = pa->pa_lstart;
+		}
+		spin_unlock(&pa->pa_lock);
+	}
+	rcu_read_unlock();
+	size = end - start;
+
+	/* XXX: extra loop to check we really don't overlap preallocations */
+	rcu_read_lock();
+	list_for_each_entry_rcu(pa, &ei->i_prealloc_list, pa_inode_list) {
+		ext4_lblk_t pa_end;
+
+		spin_lock(&pa->pa_lock);
+		if (pa->pa_deleted == 0) {
+			pa_end = pa->pa_lstart + EXT4_C2B(EXT4_SB(ac->ac_sb),
+							  pa->pa_len);
+			BUG_ON(!(start >= pa_end || end <= pa->pa_lstart));
+		}
+		spin_unlock(&pa->pa_lock);
+	}
+	rcu_read_unlock();
+
+	/* log the offending values before the BUG_ON below fires */
+	if (start + size <= ac->ac_o_ex.fe_logical &&
+			start > ac->ac_o_ex.fe_logical) {
+		ext4_msg(ac->ac_sb, KERN_ERR,
+			 "start %lu, size %lu, fe_logical %lu",
+			 (unsigned long) start, (unsigned long) size,
+			 (unsigned long) ac->ac_o_ex.fe_logical);
+	}
+	BUG_ON(start + size <= ac->ac_o_ex.fe_logical &&
+			start > ac->ac_o_ex.fe_logical);
+	BUG_ON(size <= 0 || size > EXT4_CLUSTERS_PER_GROUP(ac->ac_sb));
+
+	/* now prepare goal request */
+
+	/* XXX: is it better to align blocks WRT to logical
+	 * placement or satisfy big request as is */
+	ac->ac_g_ex.fe_logical = start;
+	ac->ac_g_ex.fe_len = EXT4_NUM_B2C(sbi, size);
+
+	/* define goal start in order to merge */
+	if (ar->pright && (ar->lright == (start + size))) {
+		/* merge to the right */
+		ext4_get_group_no_and_offset(ac->ac_sb, ar->pright - size,
+						&ac->ac_f_ex.fe_group,
+						&ac->ac_f_ex.fe_start);
+		ac->ac_flags |= EXT4_MB_HINT_TRY_GOAL;
+	}
+	if (ar->pleft && (ar->lleft + 1 == start)) {
+		/* merge to the left */
+		ext4_get_group_no_and_offset(ac->ac_sb, ar->pleft + 1,
+						&ac->ac_f_ex.fe_group,
+						&ac->ac_f_ex.fe_start);
+		ac->ac_flags |= EXT4_MB_HINT_TRY_GOAL;
+	}
+
+	mb_debug(1, "goal: %u(was %u) blocks at %u\n", (unsigned) size,
+		(unsigned) orig_size, (unsigned) start);
+}
+
+/*
+ * Account a finished allocation in the per-superblock statistics (only
+ * when s_mb_stats is enabled and the goal was longer than one) and emit
+ * the matching tracepoint: "alloc" for EXT4_MB_HISTORY_ALLOC, "prealloc"
+ * for everything else.
+ */
+static void ext4_mb_collect_stats(struct ext4_allocation_context *ac)
+{
+	struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
+	int account = sbi->s_mb_stats && ac->ac_g_ex.fe_len > 1;
+
+	if (account) {
+		atomic_inc(&sbi->s_bal_reqs);
+		atomic_add(ac->ac_b_ex.fe_len, &sbi->s_bal_allocated);
+
+		/* success: got at least as much as originally requested */
+		if (ac->ac_b_ex.fe_len >= ac->ac_o_ex.fe_len)
+			atomic_inc(&sbi->s_bal_success);
+
+		atomic_add(ac->ac_found, &sbi->s_bal_ex_scanned);
+
+		/* goal hit: best extent sits exactly at the goal position */
+		if (ac->ac_g_ex.fe_group == ac->ac_b_ex.fe_group &&
+		    ac->ac_g_ex.fe_start == ac->ac_b_ex.fe_start)
+			atomic_inc(&sbi->s_bal_goals);
+
+		if (ac->ac_found > sbi->s_mb_max_to_scan)
+			atomic_inc(&sbi->s_bal_breaks);
+	}
+
+	if (ac->ac_op != EXT4_MB_HISTORY_ALLOC)
+		trace_ext4_mballoc_prealloc(ac);
+	else
+		trace_ext4_mballoc_alloc(ac);
+}
+
+/*
+ * Failure path helper: give the blocks taken for this context back to
+ * the inode preallocation they came from by adding ac_b_ex.fe_len to
+ * pa_free.
+ *
+ * Group preallocations need no such fix-up because their pa_free is only
+ * changed in ext4_mb_release_context(), and on failure ac_b_ex.fe_len has
+ * already been zeroed, so group_pa->pa_free is left unchanged.
+ */
+static void ext4_discard_allocated_blocks(struct ext4_allocation_context *ac)
+{
+	struct ext4_prealloc_space *pa = ac->ac_pa;
+	int returned;
+
+	if (!pa || pa->pa_type != MB_INODE_PA)
+		return;
+
+	returned = ac->ac_b_ex.fe_len;
+	pa->pa_free += returned;
+}
+
+/*
+ * use blocks preallocated to inode
+ *
+ * Translates the logical start of the original request into a physical
+ * block inside @pa, clamps the end to the end of the preallocation,
+ * stores the resulting extent in ac->ac_b_ex, marks the context
+ * AC_STATUS_FOUND, remembers @pa in ac->ac_pa and subtracts the used
+ * length from pa->pa_free.
+ */
+static void ext4_mb_use_inode_pa(struct ext4_allocation_context *ac,
+ struct ext4_prealloc_space *pa)
+{
+ struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
+ ext4_fsblk_t start;
+ ext4_fsblk_t end;
+ int len;
+
+ /* found preallocated blocks, use them */
+ start = pa->pa_pstart + (ac->ac_o_ex.fe_logical - pa->pa_lstart);
+ /* never run past the end of the preallocation */
+ end = min(pa->pa_pstart + EXT4_C2B(sbi, pa->pa_len),
+ start + EXT4_C2B(sbi, ac->ac_o_ex.fe_len));
+ /* len is converted from blocks back to clusters */
+ len = EXT4_NUM_B2C(sbi, end - start);
+ ext4_get_group_no_and_offset(ac->ac_sb, start, &ac->ac_b_ex.fe_group,
+ &ac->ac_b_ex.fe_start);
+ ac->ac_b_ex.fe_len = len;
+ ac->ac_status = AC_STATUS_FOUND;
+ ac->ac_pa = pa;
+
+ BUG_ON(start < pa->pa_pstart);
+ BUG_ON(end > pa->pa_pstart + EXT4_C2B(sbi, pa->pa_len));
+ BUG_ON(pa->pa_free < len);
+ pa->pa_free -= len;
+
+ mb_debug(1, "use %llu/%u from inode pa %p\n", start, len, pa);
+}
+
+/*
+ * Use blocks preallocated to a locality group: the best extent starts at
+ * the physical start of @pa and is as long as the original request.  The
+ * context is marked AC_STATUS_FOUND and remembers @pa.
+ */
+static void ext4_mb_use_group_pa(struct ext4_allocation_context *ac,
+				struct ext4_prealloc_space *pa)
+{
+	unsigned int want = ac->ac_o_ex.fe_len;
+
+	ext4_get_group_no_and_offset(ac->ac_sb, pa->pa_pstart,
+				     &ac->ac_b_ex.fe_group,
+				     &ac->ac_b_ex.fe_start);
+	ac->ac_b_ex.fe_len = want;
+	ac->ac_status = AC_STATUS_FOUND;
+	ac->ac_pa = pa;
+
+	/*
+	 * pa_pstart and pa_len are deliberately not adjusted here, to avoid
+	 * a possible race when the group is being loaded concurrently.  The
+	 * pa is corrected later, after the blocks are marked in the on-disk
+	 * bitmap -- see ext4_mb_release_context().  Other CPUs are prevented
+	 * from allocating from this pa by lg_mutex.
+	 */
+	mb_debug(1, "use %u/%u from group pa %p\n", pa->pa_lstart-want, want, pa);
+}
+
+/*
+ * Return the prealloc space that has the minimal distance from the goal
+ * block.
+ *
+ * @goal_block: physical block the caller would like to be close to
+ * @pa:         candidate preallocation
+ * @cpa:        preallocation with the smallest distance found so far, or
+ *              NULL when @pa is the first candidate
+ *
+ * The returned preallocation holds a pa_count reference taken here; when
+ * @pa replaces @cpa the reference on @cpa is dropped.  On a tie @cpa is
+ * kept.
+ */
+static struct ext4_prealloc_space *
+ext4_mb_check_group_pa(ext4_fsblk_t goal_block,
+			struct ext4_prealloc_space *pa,
+			struct ext4_prealloc_space *cpa)
+{
+	ext4_fsblk_t cur_distance, new_distance;
+
+	if (cpa == NULL) {
+		atomic_inc(&pa->pa_count);
+		return pa;
+	}
+
+	/*
+	 * Block numbers are unsigned 64-bit values, so compute the distance
+	 * by subtracting the smaller from the larger instead of feeding a
+	 * possibly wrapped difference to abs(), which may narrow its
+	 * operand on some builds.
+	 */
+	cur_distance = goal_block > cpa->pa_pstart ?
+			goal_block - cpa->pa_pstart :
+			cpa->pa_pstart - goal_block;
+	new_distance = goal_block > pa->pa_pstart ?
+			goal_block - pa->pa_pstart :
+			pa->pa_pstart - goal_block;
+
+	if (cur_distance <= new_distance)
+		return cpa;
+
+	/* drop the previous reference */
+	atomic_dec(&cpa->pa_count);
+	atomic_inc(&pa->pa_count);
+	return pa;
+}
+
+/*
+ * search goal blocks in preallocated space
+ *
+ * First looks for an inode preallocation covering the logical start of
+ * the original request; if none is usable and group allocation is
+ * allowed, picks the locality-group preallocation closest to the goal
+ * block that still has enough free space.
+ *
+ * Returns 1 when a preallocation was selected (ac->ac_b_ex, ac->ac_pa and
+ * ac->ac_status are set by the use_*_pa helpers; ac_criteria is set to 10
+ * for an inode PA and 20 for a group PA), 0 otherwise.
+ */
+static noinline_for_stack int
+ext4_mb_use_preallocated(struct ext4_allocation_context *ac)
+{
+ struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
+ int order, i;
+ struct ext4_inode_info *ei = EXT4_I(ac->ac_inode);
+ struct ext4_locality_group *lg;
+ struct ext4_prealloc_space *pa, *cpa = NULL;
+ ext4_fsblk_t goal_block;
+
+ /* only data can be preallocated */
+ if (!(ac->ac_flags & EXT4_MB_HINT_DATA))
+ return 0;
+
+ /* first, try per-file preallocation */
+ rcu_read_lock();
+ list_for_each_entry_rcu(pa, &ei->i_prealloc_list, pa_inode_list) {
+
+ /* all fields in this condition don't change,
+ * so we can skip locking for them */
+ if (ac->ac_o_ex.fe_logical < pa->pa_lstart ||
+ ac->ac_o_ex.fe_logical >= (pa->pa_lstart +
+ EXT4_C2B(sbi, pa->pa_len)))
+ continue;
+
+ /* non-extent files can't have physical blocks past 2^32 */
+ if (!(ext4_test_inode_flag(ac->ac_inode, EXT4_INODE_EXTENTS)) &&
+ (pa->pa_pstart + EXT4_C2B(sbi, pa->pa_len) >
+ EXT4_MAX_BLOCK_FILE_PHYS))
+ continue;
+
+ /* found preallocated blocks, use them */
+ spin_lock(&pa->pa_lock);
+ if (pa->pa_deleted == 0 && pa->pa_free) {
+ /* reference is taken while pa_lock is held */
+ atomic_inc(&pa->pa_count);
+ ext4_mb_use_inode_pa(ac, pa);
+ spin_unlock(&pa->pa_lock);
+ ac->ac_criteria = 10;
+ rcu_read_unlock();
+ return 1;
+ }
+ spin_unlock(&pa->pa_lock);
+ }
+ rcu_read_unlock();
+
+ /* can we use group allocation? */
+ if (!(ac->ac_flags & EXT4_MB_HINT_GROUP_ALLOC))
+ return 0;
+
+ /* inode may have no locality group for some reason */
+ lg = ac->ac_lg;
+ if (lg == NULL)
+ return 0;
+ /* index of the first per-size list worth searching */
+ order = fls(ac->ac_o_ex.fe_len) - 1;
+ if (order > PREALLOC_TB_SIZE - 1)
+ /* The max size of hash table is PREALLOC_TB_SIZE */
+ order = PREALLOC_TB_SIZE - 1;
+
+ goal_block = ext4_grp_offs_to_block(ac->ac_sb, &ac->ac_g_ex);
+ /*
+ * search for the prealloc space that is having
+ * minimal distance from the goal block.
+ */
+ for (i = order; i < PREALLOC_TB_SIZE; i++) {
+ rcu_read_lock();
+ list_for_each_entry_rcu(pa, &lg->lg_prealloc_list[i],
+ pa_inode_list) {
+ spin_lock(&pa->pa_lock);
+ if (pa->pa_deleted == 0 &&
+ pa->pa_free >= ac->ac_o_ex.fe_len) {
+
+ /* cpa keeps a pa_count reference on the best so far */
+ cpa = ext4_mb_check_group_pa(goal_block,
+ pa, cpa);
+ }
+ spin_unlock(&pa->pa_lock);
+ }
+ rcu_read_unlock();
+ }
+ if (cpa) {
+ ext4_mb_use_group_pa(ac, cpa);
+ ac->ac_criteria = 20;
+ return 1;
+ }
+ return 0;
+}
+
+/*
+ * Walk every entry of the group's bb_free_root tree (blocks freed but not
+ * yet committed) and mark the corresponding clusters as used in the
+ * in-core @bitmap.  The buddy must be generated from this bitmap.
+ *
+ * Needs to be called with the ext4 group lock held.
+ */
+static void ext4_mb_generate_from_freelist(struct super_block *sb, void *bitmap,
+						ext4_group_t group)
+{
+	struct ext4_group_info *grp = ext4_get_group_info(sb, group);
+	struct rb_node *node;
+
+	for (node = rb_first(&(grp->bb_free_root)); node;
+	     node = rb_next(node)) {
+		struct ext4_free_data *fd;
+
+		fd = rb_entry(node, struct ext4_free_data, efd_node);
+		ext4_set_bits(bitmap, fd->efd_start_cluster, fd->efd_count);
+	}
+}
+
+/*
+ * Walk every preallocation linked to this group and mark its range as
+ * used in the in-core @bitmap.  The buddy must be generated from this
+ * bitmap.
+ *
+ * Needs to be called with the ext4 group lock held.
+ */
+static noinline_for_stack
+void ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap,
+					ext4_group_t group)
+{
+	struct ext4_group_info *grp = ext4_get_group_info(sb, group);
+	struct ext4_prealloc_space *pa;
+	int total = 0;
+
+	/*
+	 * Every form of preallocation discard loads the group first, so the
+	 * only competing code is preallocation use and no extra locking is
+	 * needed for the list walk.
+	 * Preallocations with pa_deleted set are NOT skipped: otherwise
+	 * used blocks could be left available for allocation in the buddy
+	 * while a concurrent ext4_mb_put_pa() is dropping the preallocation.
+	 */
+	list_for_each_entry(pa, &grp->bb_prealloc_list, pa_group_list) {
+		ext4_group_t pa_group;
+		ext4_grpblk_t first;
+		int nr;
+
+		/* snapshot start and length under the pa lock */
+		spin_lock(&pa->pa_lock);
+		ext4_get_group_no_and_offset(sb, pa->pa_pstart,
+					     &pa_group, &first);
+		nr = pa->pa_len;
+		spin_unlock(&pa->pa_lock);
+
+		if (unlikely(nr == 0))
+			continue;
+
+		BUG_ON(pa_group != group);
+		ext4_set_bits(bitmap, first, nr);
+		total += nr;
+	}
+	mb_debug(1, "prellocated %u for group %u\n", total, group);
+}
+
+/*
+ * RCU callback: free the preallocation descriptor that embeds @head.
+ */
+static void ext4_mb_pa_callback(struct rcu_head *head)
+{
+	struct ext4_prealloc_space *pa =
+		container_of(head, struct ext4_prealloc_space, u.pa_rcu);
+
+	kmem_cache_free(ext4_pspace_cachep, pa);
+}
+
+/*
+ * drops a reference to preallocated space descriptor
+ * if this was the last reference and the space is consumed
+ *
+ * When the last reference goes away and pa_free is zero, the descriptor
+ * is marked deleted, unlinked from its group list and from its
+ * inode/locality-group list, and freed through an RCU callback.
+ *
+ * @ac is not referenced by this function.
+ */
+static void ext4_mb_put_pa(struct ext4_allocation_context *ac,
+ struct super_block *sb, struct ext4_prealloc_space *pa)
+{
+ ext4_group_t grp;
+ ext4_fsblk_t grp_blk;
+
+ /*
+ * NOTE(review): pa_free is read here without pa_lock held --
+ * confirm this cannot race with a concurrent user of the pa.
+ */
+ if (!atomic_dec_and_test(&pa->pa_count) || pa->pa_free != 0)
+ return;
+
+ /* in this short window concurrent discard can set pa_deleted */
+ spin_lock(&pa->pa_lock);
+ if (pa->pa_deleted == 1) {
+ spin_unlock(&pa->pa_lock);
+ return;
+ }
+
+ pa->pa_deleted = 1;
+ spin_unlock(&pa->pa_lock);
+
+ grp_blk = pa->pa_pstart;
+ /*
+ * If doing group-based preallocation, pa_pstart may be in the
+ * next group when pa is used up
+ */
+ if (pa->pa_type == MB_GROUP_PA)
+ grp_blk--;
+
+ /* only the group number is needed, hence the NULL offset pointer */
+ ext4_get_group_no_and_offset(sb, grp_blk, &grp, NULL);
+
+ /*
+ * possible race:
+ *
+ * P1 (buddy init) P2 (regular allocation)
+ * find block B in PA
+ * copy on-disk bitmap to buddy
+ * mark B in on-disk bitmap
+ * drop PA from group
+ * mark all PAs in buddy
+ *
+ * thus, P1 initializes buddy with B available. to prevent this
+ * we make "copy" and "mark all PAs" atomic and serialize "drop PA"
+ * against that pair
+ */
+ ext4_lock_group(sb, grp);
+ list_del(&pa->pa_group_list);
+ ext4_unlock_group(sb, grp);
+
+ spin_lock(pa->pa_obj_lock);
+ list_del_rcu(&pa->pa_inode_list);
+ spin_unlock(pa->pa_obj_lock);
+
+ /* the descriptor is freed by ext4_mb_pa_callback() via call_rcu() */
+ call_rcu(&(pa)->u.pa_rcu, ext4_mb_pa_callback);
+}
+
+/*
+ * creates new preallocated space for given inode
+ */
+static noinline_for_stack int
+ext4_mb_new_inode_pa(struct ext4_allocation_context *ac)
+{
+ struct super_block *sb = ac->ac_sb;
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
+ struct ext4_prealloc_space *pa;
+ struct ext4_group_info *grp;
+ struct ext4_inode_info *ei;
+
+ /* preallocate only when found space is larger than requested */
+ BUG_ON(ac->ac_o_ex.fe_len >= ac->ac_b_ex.fe_len);
+ BUG_ON(ac->ac_status != AC_STATUS_FOUND);
+ BUG_ON(!S_ISREG(ac->ac_inode->i_mode));
+
+ pa = kmem_cache_alloc(ext4_pspace_cachep, GFP_NOFS);
+ if (pa == NULL)
+ return -ENOMEM;
+
+ if (ac->ac_b_ex.fe_len < ac->ac_g_ex.fe_len) {
+ int winl;
+ int wins;
+ int win;
+ int offs;
+
+ /* we can't allocate as much as normalizer wants.
+ * so, found space must get proper lstart
+ * to cover original request */
+ BUG_ON(ac->ac_g_ex.fe_logical > ac->ac_o_ex.fe_logical);
+ BUG_ON(ac->ac_g_ex.fe_len < ac->ac_o_ex.fe_len);
+
+ /* we're limited by original request in that
+ * logical block must be covered any way
+ * winl is window we can move our chunk within */
+ winl = ac->ac_o_ex.fe_logical - ac->ac_g_ex.fe_logical;
+
+ /* also, we should cover whole original request */
+ wins = EXT4_C2B(sbi, ac->ac_b_ex.fe_len - ac->ac_o_ex.fe_len);
+
+ /* the smallest one defines real window */
+ win = min(winl, wins);
+
+ offs = ac->ac_o_ex.fe_logical %
+ EXT4_C2B(sbi, ac->ac_b_ex.fe_len);
+ if (offs && offs < win)
+ win = offs;
+
+ ac->ac_b_ex.fe_logical = ac->ac_o_ex.fe_logical -
+ EXT4_B2C(sbi, win);
+ BUG_ON(ac->ac_o_ex.fe_logical < ac->ac_b_ex.fe_logical);
+ BUG_ON(ac->ac_o_ex.fe_len > ac->ac_b_ex.fe_len);
+ }
+
+ /* preallocation can change ac_b_ex, thus we store actually
+ * allocated blocks for history */
+ ac->ac_f_ex = ac->ac_b_ex;
+
+ pa->pa_lstart = ac->ac_b_ex.fe_logical;
+ pa->pa_pstart = ext4_grp_offs_to_block(sb, &ac->ac_b_ex);
+ pa->pa_len = ac->ac_b_ex.fe_len;
+ pa->pa_free = pa->pa_len;
+ atomic_set(&pa->pa_count, 1);
+ spin_lock_init(&pa->pa_lock);
+ INIT_LIST_HEAD(&pa->pa_inode_list);
+ INIT_LIST_HEAD(&pa->pa_group_list);
+ pa->pa_deleted = 0;
+ pa->pa_type = MB_INODE_PA;
+
+ mb_debug(1, "new inode pa %p: %llu/%u for %u\n", pa,
+ pa->pa_pstart, pa->pa_len, pa->pa_lstart);
+ trace_ext4_mb_new_inode_pa(ac, pa);
+
+ ext4_mb_use_inode_pa(ac, pa);
+ atomic_add(pa->pa_free, &sbi->s_mb_preallocated);
+
+ ei = EXT4_I(ac->ac_inode);
+ grp = ext4_get_group_info(sb, ac->ac_b_ex.fe_group);
+
+ pa->pa_obj_lock = &ei->i_prealloc_lock;
+ pa->pa_inode = ac->ac_inode;
+
+ ext4_lock_group(sb, ac->ac_b_ex.fe_group);
+ list_add(&pa->pa_group_list, &grp->bb_prealloc_list);
+ ext4_unlock_group(sb, ac->ac_b_ex.fe_group);
+
+ spin_lock(pa->pa_obj_lock);
+ list_add_rcu(&pa->pa_inode_list, &ei->i_prealloc_list);
+ spin_unlock(pa->pa_obj_lock);
+
+ return 0;
+}
+
+/*
+ * creates new preallocated space for the locality group (ac->ac_lg)
+ * the allocation is being done for
+ *
+ * The descriptor is linked into the group's bb_prealloc_list only; it
+ * is not added to any locality group list here.
+ *
+ * Returns 0 on success, -ENOMEM if the descriptor can't be allocated.
+ */
+static noinline_for_stack int
+ext4_mb_new_group_pa(struct ext4_allocation_context *ac)
+{
+ struct super_block *sb = ac->ac_sb;
+ struct ext4_locality_group *lg;
+ struct ext4_prealloc_space *pa;
+ struct ext4_group_info *grp;
+
+ /* preallocate only when found space is larger than requested */
+ BUG_ON(ac->ac_o_ex.fe_len >= ac->ac_b_ex.fe_len);
+ BUG_ON(ac->ac_status != AC_STATUS_FOUND);
+ BUG_ON(!S_ISREG(ac->ac_inode->i_mode));
+
+ BUG_ON(ext4_pspace_cachep == NULL);
+ pa = kmem_cache_alloc(ext4_pspace_cachep, GFP_NOFS);
+ if (pa == NULL)
+ return -ENOMEM;
+
+ /* preallocation can change ac_b_ex, thus we store actually
+ * allocated blocks for history */
+ ac->ac_f_ex = ac->ac_b_ex;
+
+ /* for a group pa the logical start is set to the physical start */
+ pa->pa_pstart = ext4_grp_offs_to_block(sb, &ac->ac_b_ex);
+ pa->pa_lstart = pa->pa_pstart;
+ pa->pa_len = ac->ac_b_ex.fe_len;
+ pa->pa_free = pa->pa_len;
+ atomic_set(&pa->pa_count, 1);
+ spin_lock_init(&pa->pa_lock);
+ INIT_LIST_HEAD(&pa->pa_inode_list);
+ INIT_LIST_HEAD(&pa->pa_group_list);
+ pa->pa_deleted = 0;
+ pa->pa_type = MB_GROUP_PA;
+
+ mb_debug(1, "new group pa %p: %llu/%u for %u\n", pa,
+ pa->pa_pstart, pa->pa_len, pa->pa_lstart);
+ trace_ext4_mb_new_group_pa(ac, pa);
+
+ ext4_mb_use_group_pa(ac, pa);
+ atomic_add(pa->pa_free, &EXT4_SB(sb)->s_mb_preallocated);
+
+ grp = ext4_get_group_info(sb, ac->ac_b_ex.fe_group);
+ lg = ac->ac_lg;
+ BUG_ON(lg == NULL);
+
+ /* group PAs are protected by the locality group's lg_prealloc_lock */
+ pa->pa_obj_lock = &lg->lg_prealloc_lock;
+ pa->pa_inode = NULL;
+
+ ext4_lock_group(sb, ac->ac_b_ex.fe_group);
+ list_add(&pa->pa_group_list, &grp->bb_prealloc_list);
+ ext4_unlock_group(sb, ac->ac_b_ex.fe_group);
+
+ /*
+ * We will later add the new pa to the right bucket
+ * after updating the pa_free in ext4_mb_release_context
+ */
+ return 0;
+}
+
+/*
+ * Create a preallocation descriptor for the extent that was just found:
+ * a locality group PA when EXT4_MB_HINT_GROUP_ALLOC is set in ac_flags,
+ * an inode PA otherwise.  The chosen helper's return value is passed
+ * straight back to the caller.
+ */
+static int ext4_mb_new_preallocation(struct ext4_allocation_context *ac)
+{
+	if (ac->ac_flags & EXT4_MB_HINT_GROUP_ALLOC)
+		return ext4_mb_new_group_pa(ac);
+
+	return ext4_mb_new_inode_pa(ac);
+}
+
+/*
+ * finds all unused blocks in on-disk bitmap, frees them in
+ * in-core bitmap and buddy.
+ * @pa must be unlinked from inode and group lists, so that
+ * nobody else can find/use it.
+ * the caller MUST hold group/inode locks.
+ * TODO: optimize the case when there are no in-core structures yet
+ *
+ * Always returns 0 (err is never changed after initialization). A
+ * mismatch between the number of bits found free and pa_free is
+ * reported but does not fail the call.
+ */
+static noinline_for_stack int
+ext4_mb_release_inode_pa(struct ext4_buddy *e4b, struct buffer_head *bitmap_bh,
+ struct ext4_prealloc_space *pa)
+{
+ struct super_block *sb = e4b->bd_sb;
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
+ unsigned int end;
+ unsigned int next;
+ ext4_group_t group;
+ ext4_grpblk_t bit;
+ unsigned long long grp_blk_start;
+ int err = 0;
+ int free = 0;
+
+ BUG_ON(pa->pa_deleted == 0);
+ ext4_get_group_no_and_offset(sb, pa->pa_pstart, &group, &bit);
+ /* first block of the group, used only for the tracepoint below */
+ grp_blk_start = pa->pa_pstart - EXT4_C2B(sbi, bit);
+ /* a non-empty pa must belong to the group the buddy was loaded for */
+ BUG_ON(group != e4b->bd_group && pa->pa_len != 0);
+ end = bit + pa->pa_len;
+
+ /* walk the pa's range; every run of zero bits is returned to the buddy */
+ while (bit < end) {
+ bit = mb_find_next_zero_bit(bitmap_bh->b_data, end, bit);
+ if (bit >= end)
+ break;
+ next = mb_find_next_bit(bitmap_bh->b_data, end, bit);
+ mb_debug(1, " free preallocated %u/%u in group %u\n",
+ (unsigned) ext4_group_first_block_no(sb, group) + bit,
+ (unsigned) next - bit, (unsigned) group);
+ free += next - bit;
+
+ trace_ext4_mballoc_discard(sb, NULL, group, bit, next - bit);
+ trace_ext4_mb_release_inode_pa(pa, (grp_blk_start +
+ EXT4_C2B(sbi, bit)),
+ next - bit);
+ mb_free_blocks(pa->pa_inode, e4b, bit, next - bit);
+ /* position next ended the zero run, so resume just past it */
+ bit = next + 1;
+ }
+ if (free != pa->pa_free) {
+ ext4_msg(e4b->bd_sb, KERN_CRIT,
+ "pa %p: logic %lu, phys. %lu, len %lu",
+ pa, (unsigned long) pa->pa_lstart,
+ (unsigned long) pa->pa_pstart,
+ (unsigned long) pa->pa_len);
+ ext4_grp_locked_error(sb, group, 0, 0, "free %u, pa_free %u",
+ free, pa->pa_free);
+ /*
+ * pa is already deleted so we use the value obtained
+ * from the bitmap and continue.
+ */
+ }
+ atomic_add(free, &sbi->s_mb_discarded);
+
+ return err;
+}
+
+/*
+ * returns the whole remaining range of a locality group pa
+ * (pa_len units starting at pa_pstart) to the buddy described by @e4b
+ * and adds pa_len to the s_mb_discarded counter.
+ * @pa must already be marked deleted. Always returns 0.
+ */
+static noinline_for_stack int
+ext4_mb_release_group_pa(struct ext4_buddy *e4b,
+ struct ext4_prealloc_space *pa)
+{
+ struct super_block *sb = e4b->bd_sb;
+ ext4_group_t group;
+ ext4_grpblk_t bit;
+
+ trace_ext4_mb_release_group_pa(sb, pa);
+ BUG_ON(pa->pa_deleted == 0);
+ ext4_get_group_no_and_offset(sb, pa->pa_pstart, &group, &bit);
+ /* a non-empty pa must belong to the group the buddy was loaded for */
+ BUG_ON(group != e4b->bd_group && pa->pa_len != 0);
+ mb_free_blocks(pa->pa_inode, e4b, bit, pa->pa_len);
+ atomic_add(pa->pa_len, &EXT4_SB(sb)->s_mb_discarded);
+ trace_ext4_mballoc_discard(sb, NULL, group, bit, pa->pa_len);
+
+ return 0;
+}
+
+/*
+ * releases all preallocations in given group
+ *
+ * first, we need to decide discard policy:
+ * - when do we discard
+ * 1) ENOSPC
+ * - how many do we discard
+ * 1) how many requested
+ *
+ * @needed == 0 is replaced by EXT4_CLUSTERS_PER_GROUP(sb) + 1, a
+ * threshold the busy-retry check below compares the freed total against.
+ *
+ * Returns the sum of pa_free over the discarded PAs. Returns 0 when the
+ * group has no PAs, or when the block bitmap or the buddy can't be
+ * loaded.
+ */
+static noinline_for_stack int
+ext4_mb_discard_group_preallocations(struct super_block *sb,
+ ext4_group_t group, int needed)
+{
+ struct ext4_group_info *grp = ext4_get_group_info(sb, group);
+ struct buffer_head *bitmap_bh = NULL;
+ struct ext4_prealloc_space *pa, *tmp;
+ struct list_head list;
+ struct ext4_buddy e4b;
+ int err;
+ int busy = 0;
+ int free = 0;
+
+ mb_debug(1, "discard preallocation for group %u\n", group);
+
+ /* unlocked peek: nothing to do when the group has no PAs */
+ if (list_empty(&grp->bb_prealloc_list))
+ return 0;
+
+ bitmap_bh = ext4_read_block_bitmap(sb, group);
+ if (bitmap_bh == NULL) {
+ ext4_error(sb, "Error reading block bitmap for %u", group);
+ return 0;
+ }
+
+ err = ext4_mb_load_buddy(sb, group, &e4b);
+ if (err) {
+ ext4_error(sb, "Error loading buddy information for %u", group);
+ put_bh(bitmap_bh);
+ return 0;
+ }
+
+ if (needed == 0)
+ needed = EXT4_CLUSTERS_PER_GROUP(sb) + 1;
+
+ INIT_LIST_HEAD(&list);
+repeat:
+ ext4_lock_group(sb, group);
+ /* pass 1: move every unreferenced, not yet deleted pa to the local list */
+ list_for_each_entry_safe(pa, tmp,
+ &grp->bb_prealloc_list, pa_group_list) {
+ spin_lock(&pa->pa_lock);
+ if (atomic_read(&pa->pa_count)) {
+ spin_unlock(&pa->pa_lock);
+ busy = 1;
+ continue;
+ }
+ if (pa->pa_deleted) {
+ spin_unlock(&pa->pa_lock);
+ continue;
+ }
+
+ /* seems this one can be freed ... */
+ pa->pa_deleted = 1;
+
+ /* we can trust pa_free ... */
+ free += pa->pa_free;
+
+ spin_unlock(&pa->pa_lock);
+
+ list_del(&pa->pa_group_list);
+ list_add(&pa->u.pa_tmp_list, &list);
+ }
+
+ /* if we still need more blocks and some PAs were used, try again */
+ if (free < needed && busy) {
+ busy = 0;
+ ext4_unlock_group(sb, group);
+ /*
+ * Yield the CPU here so that we don't get soft lockup
+ * in non preempt case.
+ */
+ yield();
+ goto repeat;
+ }
+
+ /* found anything to free? */
+ if (list_empty(&list)) {
+ BUG_ON(free != 0);
+ goto out;
+ }
+
+ /* now free all selected PAs */
+ list_for_each_entry_safe(pa, tmp, &list, u.pa_tmp_list) {
+
+ /* remove from object (inode or locality group) */
+ spin_lock(pa->pa_obj_lock);
+ list_del_rcu(&pa->pa_inode_list);
+ spin_unlock(pa->pa_obj_lock);
+
+ if (pa->pa_type == MB_GROUP_PA)
+ ext4_mb_release_group_pa(&e4b, pa);
+ else
+ ext4_mb_release_inode_pa(&e4b, bitmap_bh, pa);
+
+ list_del(&pa->u.pa_tmp_list);
+ call_rcu(&(pa)->u.pa_rcu, ext4_mb_pa_callback);
+ }
+
+out:
+ /* the group lock is still held on every path that reaches here */
+ ext4_unlock_group(sb, group);
+ ext4_mb_unload_buddy(&e4b);
+ put_bh(bitmap_bh);
+ return free;
+}
+
+/*
+ * releases all non-used preallocated blocks for given inode
+ *
+ * It's important to discard preallocations under i_data_sem
+ * We don't want another block to be served from the prealloc
+ * space when we are discarding the inode prealloc space.
+ *
+ * FIXME!! Make sure it is valid at all the call sites
+ *
+ * Does nothing for inodes that are not regular files. May sleep:
+ * it waits and retries whenever it meets a pa that is still referenced
+ * or that somebody else is deleting.
+ */
+void ext4_discard_preallocations(struct inode *inode)
+{
+ struct ext4_inode_info *ei = EXT4_I(inode);
+ struct super_block *sb = inode->i_sb;
+ struct buffer_head *bitmap_bh = NULL;
+ struct ext4_prealloc_space *pa, *tmp;
+ ext4_group_t group = 0;
+ struct list_head list;
+ struct ext4_buddy e4b;
+ int err;
+
+ if (!S_ISREG(inode->i_mode)) {
+ /*BUG_ON(!list_empty(&ei->i_prealloc_list));*/
+ return;
+ }
+
+ mb_debug(1, "discard preallocation for inode %lu\n", inode->i_ino);
+ trace_ext4_discard_preallocations(inode);
+
+ INIT_LIST_HEAD(&list);
+
+repeat:
+ /* first, collect all pa's in the inode */
+ spin_lock(&ei->i_prealloc_lock);
+ while (!list_empty(&ei->i_prealloc_list)) {
+ pa = list_entry(ei->i_prealloc_list.next,
+ struct ext4_prealloc_space, pa_inode_list);
+ BUG_ON(pa->pa_obj_lock != &ei->i_prealloc_lock);
+ spin_lock(&pa->pa_lock);
+ if (atomic_read(&pa->pa_count)) {
+ /* this shouldn't happen often - nobody should
+ * use preallocation while we're discarding it */
+ spin_unlock(&pa->pa_lock);
+ spin_unlock(&ei->i_prealloc_lock);
+ ext4_msg(sb, KERN_ERR,
+ "uh-oh! used pa while discarding");
+ WARN_ON(1);
+ schedule_timeout_uninterruptible(HZ);
+ goto repeat;
+
+ }
+ if (pa->pa_deleted == 0) {
+ /* claim the pa and move it to the local list */
+ pa->pa_deleted = 1;
+ spin_unlock(&pa->pa_lock);
+ list_del_rcu(&pa->pa_inode_list);
+ list_add(&pa->u.pa_tmp_list, &list);
+ continue;
+ }
+
+ /* someone is deleting pa right now */
+ spin_unlock(&pa->pa_lock);
+ spin_unlock(&ei->i_prealloc_lock);
+
+ /* we have to wait here because pa_deleted
+ * doesn't mean pa is already unlinked from
+ * the list. as we might be called from
+ * ->clear_inode() the inode will get freed
+ * and concurrent thread which is unlinking
+ * pa from inode's list may access already
+ * freed memory, bad-bad-bad */
+
+ /* XXX: if this happens too often, we can
+ * add a flag to force wait only in case
+ * of ->clear_inode(), but not in case of
+ * regular truncate */
+ schedule_timeout_uninterruptible(HZ);
+ goto repeat;
+ }
+ spin_unlock(&ei->i_prealloc_lock);
+
+ /* second, release every collected pa in the group it lives in */
+ list_for_each_entry_safe(pa, tmp, &list, u.pa_tmp_list) {
+ BUG_ON(pa->pa_type != MB_INODE_PA);
+ ext4_get_group_no_and_offset(sb, pa->pa_pstart, &group, NULL);
+
+ /*
+ * NOTE(review): on either error below the pa is skipped; it is
+ * neither released nor removed from the local list
+ */
+ err = ext4_mb_load_buddy(sb, group, &e4b);
+ if (err) {
+ ext4_error(sb, "Error loading buddy information for %u",
+ group);
+ continue;
+ }
+
+ bitmap_bh = ext4_read_block_bitmap(sb, group);
+ if (bitmap_bh == NULL) {
+ ext4_error(sb, "Error reading block bitmap for %u",
+ group);
+ ext4_mb_unload_buddy(&e4b);
+ continue;
+ }
+
+ ext4_lock_group(sb, group);
+ list_del(&pa->pa_group_list);
+ ext4_mb_release_inode_pa(&e4b, bitmap_bh, pa);
+ ext4_unlock_group(sb, group);
+
+ ext4_mb_unload_buddy(&e4b);
+ put_bh(bitmap_bh);
+
+ list_del(&pa->u.pa_tmp_list);
+ call_rcu(&(pa)->u.pa_rcu, ext4_mb_pa_callback);
+ }
+}
+
+#ifdef CONFIG_EXT4_DEBUG
+/*
+ * debugging aid: logs the allocation context (status, flags, the
+ * original/goal/best extents, scan counters) followed by, for every
+ * group, its preallocations and its free/fragment counts.
+ * Prints nothing when mb_enable_debug is off or the filesystem has
+ * EXT4_MF_FS_ABORTED set.
+ */
+static void ext4_mb_show_ac(struct ext4_allocation_context *ac)
+{
+ struct super_block *sb = ac->ac_sb;
+ ext4_group_t ngroups, i;
+
+ if (!mb_enable_debug ||
+ (EXT4_SB(sb)->s_mount_flags & EXT4_MF_FS_ABORTED))
+ return;
+
+ ext4_msg(ac->ac_sb, KERN_ERR, "Can't allocate:"
+ " Allocation context details:");
+ ext4_msg(ac->ac_sb, KERN_ERR, "status %d flags %d",
+ ac->ac_status, ac->ac_flags);
+ /* each extent is printed as group/start/len@logical */
+ ext4_msg(ac->ac_sb, KERN_ERR, "orig %lu/%lu/%lu@%lu, "
+ "goal %lu/%lu/%lu@%lu, "
+ "best %lu/%lu/%lu@%lu cr %d",
+ (unsigned long)ac->ac_o_ex.fe_group,
+ (unsigned long)ac->ac_o_ex.fe_start,
+ (unsigned long)ac->ac_o_ex.fe_len,
+ (unsigned long)ac->ac_o_ex.fe_logical,
+ (unsigned long)ac->ac_g_ex.fe_group,
+ (unsigned long)ac->ac_g_ex.fe_start,
+ (unsigned long)ac->ac_g_ex.fe_len,
+ (unsigned long)ac->ac_g_ex.fe_logical,
+ (unsigned long)ac->ac_b_ex.fe_group,
+ (unsigned long)ac->ac_b_ex.fe_start,
+ (unsigned long)ac->ac_b_ex.fe_len,
+ (unsigned long)ac->ac_b_ex.fe_logical,
+ (int)ac->ac_criteria);
+ ext4_msg(ac->ac_sb, KERN_ERR, "%lu scanned, %d found",
+ ac->ac_ex_scanned, ac->ac_found);
+ ext4_msg(ac->ac_sb, KERN_ERR, "groups: ");
+ ngroups = ext4_get_groups_count(sb);
+ for (i = 0; i < ngroups; i++) {
+ struct ext4_group_info *grp = ext4_get_group_info(sb, i);
+ struct ext4_prealloc_space *pa;
+ ext4_grpblk_t start;
+ struct list_head *cur;
+ /* the group's pa list is walked under the group lock */
+ ext4_lock_group(sb, i);
+ list_for_each(cur, &grp->bb_prealloc_list) {
+ pa = list_entry(cur, struct ext4_prealloc_space,
+ pa_group_list);
+ spin_lock(&pa->pa_lock);
+ ext4_get_group_no_and_offset(sb, pa->pa_pstart,
+ NULL, &start);
+ spin_unlock(&pa->pa_lock);
+ /* printed as PA:<group>:<start offset in group>:<pa_len> */
+ printk(KERN_ERR "PA:%u:%d:%u \n", i,
+ start, pa->pa_len);
+ }
+ ext4_unlock_group(sb, i);
+
+ /* groups with no free space get no summary line */
+ if (grp->bb_free == 0)
+ continue;
+ printk(KERN_ERR "%u: %d/%d \n",
+ i, grp->bb_free, grp->bb_fragments);
+ }
+ printk(KERN_ERR "\n");
+}
+#else
+/* no-op stub used when CONFIG_EXT4_DEBUG is not set */
+static inline void ext4_mb_show_ac(struct ext4_allocation_context *ac)
+{
+ return;
+}
+#endif
+
+/*
+ * We use locality group preallocation for small size file. The size of the
+ * file is determined by the current size or the resulting size after
+ * allocation which ever is larger
+ *
+ * One can tune this size via /sys/fs/ext4/<partition>/mb_stream_req
+ *
+ * The decision is recorded in ac->ac_flags. When group allocation is
+ * chosen (EXT4_MB_HINT_GROUP_ALLOC), ac->ac_lg is set and this function
+ * returns with ac->ac_lg->lg_mutex held.
+ */
+static void ext4_mb_group_or_file(struct ext4_allocation_context *ac)
+{
+ struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
+ int bsbits = ac->ac_sb->s_blocksize_bits;
+ loff_t size, isize;
+
+ /* the policy only applies to data allocations ... */
+ if (!(ac->ac_flags & EXT4_MB_HINT_DATA))
+ return;
+
+ /* ... that are not restricted to the goal block */
+ if (unlikely(ac->ac_flags & EXT4_MB_HINT_GOAL_ONLY))
+ return;
+
+ /* size: end of the request in blocks; isize: i_size rounded up to blocks */
+ size = ac->ac_o_ex.fe_logical + EXT4_C2B(sbi, ac->ac_o_ex.fe_len);
+ isize = (i_size_read(ac->ac_inode) + ac->ac_sb->s_blocksize - 1)
+ >> bsbits;
+
+ /*
+ * the request ends exactly at the end of the file, the fs is not
+ * busy and i_writecount is zero: don't preallocate at all
+ */
+ if ((size == isize) &&
+ !ext4_fs_is_busy(sbi) &&
+ (atomic_read(&ac->ac_inode->i_writecount) == 0)) {
+ ac->ac_flags |= EXT4_MB_HINT_NOPREALLOC;
+ return;
+ }
+
+ /* group preallocation is turned off: treat everything as a stream */
+ if (sbi->s_mb_group_prealloc <= 0) {
+ ac->ac_flags |= EXT4_MB_STREAM_ALLOC;
+ return;
+ }
+
+ /* don't use group allocation for large files */
+ size = max(size, isize);
+ if (size > sbi->s_mb_stream_request) {
+ ac->ac_flags |= EXT4_MB_STREAM_ALLOC;
+ return;
+ }
+
+ BUG_ON(ac->ac_lg != NULL);
+ /*
+ * locality group prealloc space are per cpu. The reason for having
+ * per cpu locality group is to reduce the contention between block
+ * request from multiple CPUs.
+ */
+ ac->ac_lg = __this_cpu_ptr(sbi->s_locality_groups);
+
+ /* we're going to use group allocation */
+ ac->ac_flags |= EXT4_MB_HINT_GROUP_ALLOC;
+
+ /* serialize all allocations in the group */
+ mutex_lock(&ac->ac_lg->lg_mutex);
+}
+
+/*
+ * fills @ac from the allocation request @ar: clamps the length, turns
+ * the goal block into a group/offset pair, sets up the original and
+ * goal extents and lets ext4_mb_group_or_file() pick the allocation
+ * policy. Always returns 0.
+ */
+static noinline_for_stack int
+ext4_mb_initialize_context(struct ext4_allocation_context *ac,
+ struct ext4_allocation_request *ar)
+{
+ struct super_block *sb = ar->inode->i_sb;
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
+ struct ext4_super_block *es = sbi->s_es;
+ ext4_group_t group;
+ unsigned int len;
+ ext4_fsblk_t goal;
+ ext4_grpblk_t block;
+
+ /* we can't allocate > group size */
+ len = ar->len;
+
+ /* just a dirty hack to filter too big requests */
+ if (len >= EXT4_CLUSTERS_PER_GROUP(sb) - 10)
+ len = EXT4_CLUSTERS_PER_GROUP(sb) - 10;
+
+ /* start searching from the goal */
+ goal = ar->goal;
+ /* an out-of-range goal falls back to the first data block */
+ if (goal < le32_to_cpu(es->s_first_data_block) ||
+ goal >= ext4_blocks_count(es))
+ goal = le32_to_cpu(es->s_first_data_block);
+ ext4_get_group_no_and_offset(sb, goal, &group, &block);
+
+ /* set up allocation goals */
+ memset(ac, 0, sizeof(struct ext4_allocation_context));
+ /* logical start rounded down to a multiple of s_cluster_ratio */
+ ac->ac_b_ex.fe_logical = ar->logical & ~(sbi->s_cluster_ratio - 1);
+ ac->ac_status = AC_STATUS_CONTINUE;
+ ac->ac_sb = sb;
+ ac->ac_inode = ar->inode;
+ ac->ac_o_ex.fe_logical = ac->ac_b_ex.fe_logical;
+ ac->ac_o_ex.fe_group = group;
+ ac->ac_o_ex.fe_start = block;
+ ac->ac_o_ex.fe_len = len;
+ ac->ac_g_ex = ac->ac_o_ex;
+ ac->ac_flags = ar->flags;
+
+ /* we have to define context: whether we'll work with a file or
+ * locality group. this is a policy, actually */
+ ext4_mb_group_or_file(ac);
+
+ mb_debug(1, "init ac: %u blocks @ %u, goal %u, flags %x, 2^%d, "
+ "left: %u/%u, right %u/%u to %swritable\n",
+ (unsigned) ar->len, (unsigned) ar->logical,
+ (unsigned) ar->goal, ac->ac_flags, ac->ac_2order,
+ (unsigned) ar->lleft, (unsigned) ar->pleft,
+ (unsigned) ar->lright, (unsigned) ar->pright,
+ atomic_read(&ar->inode->i_writecount) ? "" : "non-");
+ return 0;
+
+}
+
+/*
+ * trims one bucket of a locality group's preallocation table
+ * @sb: super block
+ * @lg: locality group whose bucket is trimmed
+ * @order: index into lg->lg_prealloc_list[]
+ * @total_entries: number of entries the caller counted in that bucket
+ *
+ * Unreferenced, not yet deleted PAs are unlinked from the bucket until
+ * @total_entries has dropped to 5 or the end of the list is reached,
+ * then each of them is released back to its group.
+ */
+static noinline_for_stack void
+ext4_mb_discard_lg_preallocations(struct super_block *sb,
+ struct ext4_locality_group *lg,
+ int order, int total_entries)
+{
+ ext4_group_t group = 0;
+ struct ext4_buddy e4b;
+ struct list_head discard_list;
+ struct ext4_prealloc_space *pa, *tmp;
+
+ mb_debug(1, "discard locality group preallocation\n");
+
+ INIT_LIST_HEAD(&discard_list);
+
+ /* pass 1: pick the victims under lg_prealloc_lock */
+ spin_lock(&lg->lg_prealloc_lock);
+ list_for_each_entry_rcu(pa, &lg->lg_prealloc_list[order],
+ pa_inode_list) {
+ spin_lock(&pa->pa_lock);
+ if (atomic_read(&pa->pa_count)) {
+ /*
+ * This is the pa that we just used
+ * for block allocation. So don't
+ * free that
+ */
+ spin_unlock(&pa->pa_lock);
+ continue;
+ }
+ if (pa->pa_deleted) {
+ spin_unlock(&pa->pa_lock);
+ continue;
+ }
+ /* only lg prealloc space */
+ BUG_ON(pa->pa_type != MB_GROUP_PA);
+
+ /* seems this one can be freed ... */
+ pa->pa_deleted = 1;
+ spin_unlock(&pa->pa_lock);
+
+ list_del_rcu(&pa->pa_inode_list);
+ list_add(&pa->u.pa_tmp_list, &discard_list);
+
+ total_entries--;
+ if (total_entries <= 5) {
+ /*
+ * we want to keep only 5 entries
+ * allowing it to grow to 8. This
+ * makes sure we don't call discard
+ * soon for this list.
+ */
+ break;
+ }
+ }
+ spin_unlock(&lg->lg_prealloc_lock);
+
+ /* pass 2: give each victim's space back to the group it lives in */
+ list_for_each_entry_safe(pa, tmp, &discard_list, u.pa_tmp_list) {
+
+ ext4_get_group_no_and_offset(sb, pa->pa_pstart, &group, NULL);
+ /*
+ * NOTE(review): on error the pa is skipped; it is neither
+ * released nor removed from discard_list
+ */
+ if (ext4_mb_load_buddy(sb, group, &e4b)) {
+ ext4_error(sb, "Error loading buddy information for %u",
+ group);
+ continue;
+ }
+ ext4_lock_group(sb, group);
+ list_del(&pa->pa_group_list);
+ ext4_mb_release_group_pa(&e4b, pa);
+ ext4_unlock_group(sb, group);
+
+ ext4_mb_unload_buddy(&e4b);
+ list_del(&pa->u.pa_tmp_list);
+ call_rcu(&(pa)->u.pa_rcu, ext4_mb_pa_callback);
+ }
+}
+
+/*
+ * Link ac->ac_pa into the locality group bucket that matches its
+ * remaining free space, keeping the bucket ordered by pa_free, and
+ * trim the bucket if it has grown past 8 entries.
+ *
+ * We have incremented pa_count, so the pa cannot be freed at this
+ * point.  We also hold lg_mutex, so no parallel allocation is possible
+ * from this lg, which means pa_free cannot be updated.
+ *
+ * A parallel ext4_mb_discard_group_preallocations is possible and can
+ * cause the lg_prealloc_list to be updated.
+ */
+
+static void ext4_mb_add_n_trim(struct ext4_allocation_context *ac)
+{
+	struct super_block *sb = ac->ac_sb;
+	struct ext4_locality_group *lg = ac->ac_lg;
+	struct ext4_prealloc_space *pa = ac->ac_pa;
+	struct ext4_prealloc_space *cur;
+	struct list_head *bucket;
+	int nr_entries = 1;	/* starts at 1 to account for @pa */
+	int linked = 0;
+	int order;
+
+	/* The hash table has PREALLOC_TB_SIZE buckets; clamp to the last */
+	order = fls(pa->pa_free) - 1;
+	if (order >= PREALLOC_TB_SIZE)
+		order = PREALLOC_TB_SIZE - 1;
+	bucket = &lg->lg_prealloc_list[order];
+
+	/* Add the prealloc space to lg */
+	rcu_read_lock();
+	list_for_each_entry_rcu(cur, bucket, pa_inode_list) {
+		spin_lock(&cur->pa_lock);
+		if (!cur->pa_deleted) {
+			if (!linked && pa->pa_free < cur->pa_free) {
+				/* Insert right in front of the first larger entry */
+				list_add_tail_rcu(&pa->pa_inode_list,
+						  &cur->pa_inode_list);
+				linked = 1;
+			}
+			/* keep walking: we want the total number of entries */
+			nr_entries++;
+		}
+		spin_unlock(&cur->pa_lock);
+	}
+	if (!linked)
+		list_add_tail_rcu(&pa->pa_inode_list, bucket);
+	rcu_read_unlock();
+
+	/* Now trim the list to be not more than 8 elements */
+	if (nr_entries > 8)
+		ext4_mb_discard_lg_preallocations(sb, lg, order, nr_entries);
+}
+
+/*
+ * Release every resource the allocation context picked up: settle and
+ * drop the preallocation that was used (if any), release the cached
+ * bitmap/buddy pages, unlock the locality group mutex when group
+ * allocation was in effect, and collect statistics.  Always returns 0.
+ */
+static int ext4_mb_release_context(struct ext4_allocation_context *ac)
+{
+	struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
+	struct ext4_prealloc_space *pa = ac->ac_pa;
+
+	if (pa) {
+		if (pa->pa_type == MB_GROUP_PA) {
+			/* see comment in ext4_mb_use_group_pa() */
+			spin_lock(&pa->pa_lock);
+			pa->pa_pstart += EXT4_C2B(sbi, ac->ac_b_ex.fe_len);
+			pa->pa_lstart += EXT4_C2B(sbi, ac->ac_b_ex.fe_len);
+			pa->pa_free -= ac->ac_b_ex.fe_len;
+			pa->pa_len -= ac->ac_b_ex.fe_len;
+			spin_unlock(&pa->pa_lock);
+
+			/*
+			 * The pa has to sit in the bucket that matches
+			 * its new pa_free: unlink it, then re-add it while
+			 * making sure the target list doesn't grow big.
+			 */
+			if (likely(pa->pa_free)) {
+				spin_lock(pa->pa_obj_lock);
+				list_del_rcu(&pa->pa_inode_list);
+				spin_unlock(pa->pa_obj_lock);
+				ext4_mb_add_n_trim(ac);
+			}
+		}
+		ext4_mb_put_pa(ac, ac->ac_sb, pa);
+	}
+
+	if (ac->ac_bitmap_page)
+		page_cache_release(ac->ac_bitmap_page);
+	if (ac->ac_buddy_page)
+		page_cache_release(ac->ac_buddy_page);
+
+	if (ac->ac_flags & EXT4_MB_HINT_GROUP_ALLOC)
+		mutex_unlock(&ac->ac_lg->lg_mutex);
+
+	ext4_mb_collect_stats(ac);
+	return 0;
+}
+
+/*
+ * Walk the block groups in order, discarding preallocations until
+ * @needed has been satisfied or every group has been visited.
+ * Returns the total reported as freed by the per-group discards.
+ */
+static int ext4_mb_discard_preallocations(struct super_block *sb, int needed)
+{
+	ext4_group_t ngroups = ext4_get_groups_count(sb);
+	ext4_group_t grp;
+	int total = 0;
+
+	trace_ext4_mb_discard_preallocations(sb, needed);
+
+	for (grp = 0; needed > 0 && grp < ngroups; grp++) {
+		int got = ext4_mb_discard_group_preallocations(sb, grp, needed);
+
+		total += got;
+		needed -= got;
+	}
+
+	return total;
+}
+
+/*
+ * Main entry point into mballoc to allocate blocks
+ * it tries to use preallocation first, then falls back
+ * to usual allocation
+ *
+ * @handle: journal handle, passed on to ext4_mb_mark_diskspace_used()
+ * @ar: allocation request; ar->len is updated to the length that was
+ * actually allocated and is 0 on failure, ar->flags may gain bits
+ * @errp: receives the error code when the allocation fails
+ *
+ * Returns the first allocated physical block, or 0 on failure.
+ */
+ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle,
+ struct ext4_allocation_request *ar, int *errp)
+{
+ int freed;
+ struct ext4_allocation_context *ac = NULL;
+ struct ext4_sb_info *sbi;
+ struct super_block *sb;
+ ext4_fsblk_t block = 0;
+ unsigned int inquota = 0;
+ unsigned int reserv_clstrs = 0;
+
+ sb = ar->inode->i_sb;
+ sbi = EXT4_SB(sb);
+
+ trace_ext4_request_blocks(ar);
+
+ /* Allow to use superuser reservation for quota file */
+ if (IS_NOQUOTA(ar->inode))
+ ar->flags |= EXT4_MB_USE_ROOT_BLOCKS;
+
+ /*
+ * For delayed allocation, we could skip the ENOSPC and
+ * EDQUOT check, as blocks and quotas have been already
+ * reserved when data being copied into pagecache.
+ */
+ if (ext4_test_inode_state(ar->inode, EXT4_STATE_DELALLOC_RESERVED))
+ ar->flags |= EXT4_MB_DELALLOC_RESERVED;
+ else {
+ /* Without delayed allocation we need to verify
+ * there is enough free blocks to do block allocation
+ * and verify allocation doesn't exceed the quota limits.
+ */
+ /* halve the request until the claim succeeds or nothing is left */
+ while (ar->len &&
+ ext4_claim_free_clusters(sbi, ar->len, ar->flags)) {
+
+ /* let others to free the space */
+ yield();
+ ar->len = ar->len >> 1;
+ }
+ if (!ar->len) {
+ *errp = -ENOSPC;
+ return 0;
+ }
+ /* remember how many clusters were claimed so they can be given back */
+ reserv_clstrs = ar->len;
+ if (ar->flags & EXT4_MB_USE_ROOT_BLOCKS) {
+ dquot_alloc_block_nofail(ar->inode,
+ EXT4_C2B(sbi, ar->len));
+ } else {
+ /* shrink the request one unit at a time until quota accepts it */
+ while (ar->len &&
+ dquot_alloc_block(ar->inode,
+ EXT4_C2B(sbi, ar->len))) {
+
+ ar->flags |= EXT4_MB_HINT_NOPREALLOC;
+ ar->len--;
+ }
+ }
+ /* length charged to quota; any unused part is refunded at "out" */
+ inquota = ar->len;
+ if (ar->len == 0) {
+ *errp = -EDQUOT;
+ goto out;
+ }
+ }
+
+ ac = kmem_cache_alloc(ext4_ac_cachep, GFP_NOFS);
+ if (!ac) {
+ ar->len = 0;
+ *errp = -ENOMEM;
+ goto out;
+ }
+
+ *errp = ext4_mb_initialize_context(ac, ar);
+ if (*errp) {
+ ar->len = 0;
+ goto out;
+ }
+
+ ac->ac_op = EXT4_MB_HISTORY_PREALLOC;
+ if (!ext4_mb_use_preallocated(ac)) {
+ ac->ac_op = EXT4_MB_HISTORY_ALLOC;
+ ext4_mb_normalize_request(ac, ar);
+repeat:
+ /* allocate space in core */
+ *errp = ext4_mb_regular_allocator(ac);
+ if (*errp)
+ goto errout;
+
+ /* as we've just preallocated more space than
+ * user requested originally, we store allocated
+ * space in a special descriptor */
+ /* NOTE(review): the return value of this call is not checked */
+ if (ac->ac_status == AC_STATUS_FOUND &&
+ ac->ac_o_ex.fe_len < ac->ac_b_ex.fe_len)
+ ext4_mb_new_preallocation(ac);
+ }
+ if (likely(ac->ac_status == AC_STATUS_FOUND)) {
+ *errp = ext4_mb_mark_diskspace_used(ac, handle, reserv_clstrs);
+ if (*errp == -EAGAIN) {
+ /*
+ * drop the reference that we took
+ * in ext4_mb_use_best_found
+ */
+ ext4_mb_release_context(ac);
+ ac->ac_b_ex.fe_group = 0;
+ ac->ac_b_ex.fe_start = 0;
+ ac->ac_b_ex.fe_len = 0;
+ ac->ac_status = AC_STATUS_CONTINUE;
+ goto repeat;
+ } else if (*errp)
+ /* also the target of "goto errout" from the allocator above */
+ errout:
+ ext4_discard_allocated_blocks(ac);
+ else {
+ block = ext4_grp_offs_to_block(sb, &ac->ac_b_ex);
+ ar->len = ac->ac_b_ex.fe_len;
+ }
+ } else {
+ /* nothing found: discard preallocations and retry if that freed any */
+ freed = ext4_mb_discard_preallocations(sb, ac->ac_o_ex.fe_len);
+ if (freed)
+ goto repeat;
+ *errp = -ENOSPC;
+ }
+
+ if (*errp) {
+ ac->ac_b_ex.fe_len = 0;
+ ar->len = 0;
+ ext4_mb_show_ac(ac);
+ }
+ ext4_mb_release_context(ac);
+out:
+ if (ac)
+ kmem_cache_free(ext4_ac_cachep, ac);
+ /* refund the quota charged for the part that was not allocated */
+ if (inquota && ar->len < inquota)
+ dquot_free_block(ar->inode, EXT4_C2B(sbi, inquota - ar->len));
+ if (!ar->len) {
+ if (!ext4_test_inode_state(ar->inode,
+ EXT4_STATE_DELALLOC_RESERVED))
+ /* release all the reserved blocks if non delalloc */
+ percpu_counter_sub(&sbi->s_dirtyclusters_counter,
+ reserv_clstrs);
+ }
+
+ trace_ext4_allocate_blocks(ar, (unsigned long long)block);
+
+ return block;
+}
+
+/*
+ * Two free data extents may be merged only when all of the following
+ * hold: they were freed by the same transaction, they belong to the
+ * same group, and @entry2 starts at the cluster right after the last
+ * cluster of @entry1.  Returns 1 if they can be merged, 0 otherwise.
+ */
+static int can_merge(struct ext4_free_data *entry1,
+		     struct ext4_free_data *entry2)
+{
+	if (entry1->efd_tid != entry2->efd_tid)
+		return 0;
+	if (entry1->efd_group != entry2->efd_group)
+		return 0;
+
+	return (entry1->efd_start_cluster + entry1->efd_count) ==
+		entry2->efd_start_cluster;
+}
+
+/*
+ * inserts @new_entry into the group's tree of to-be-freed extents
+ * (bd_info->bb_free_root, keyed by efd_start_cluster), merges it with
+ * mergeable neighbours and registers ext4_free_data_callback for it
+ * on @handle.
+ *
+ * Always returns 0, including when the start cluster is found to be
+ * inside an extent already in the tree; in that case an error is
+ * reported and @new_entry is not inserted.
+ */
+static noinline_for_stack int
+ext4_mb_free_metadata(handle_t *handle, struct ext4_buddy *e4b,
+ struct ext4_free_data *new_entry)
+{
+ ext4_group_t group = e4b->bd_group;
+ ext4_grpblk_t cluster;
+ struct ext4_free_data *entry;
+ struct ext4_group_info *db = e4b->bd_info;
+ struct super_block *sb = e4b->bd_sb;
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
+ struct rb_node **n = &db->bb_free_root.rb_node, *node;
+ struct rb_node *parent = NULL, *new_node;
+
+ BUG_ON(!ext4_handle_valid(handle));
+ BUG_ON(e4b->bd_bitmap_page == NULL);
+ BUG_ON(e4b->bd_buddy_page == NULL);
+
+ new_node = &new_entry->efd_node;
+ cluster = new_entry->efd_start_cluster;
+
+ if (!*n) {
+ /* first free block extent. We need to
+ * protect buddy cache from being freed,
+ * otherwise we'll refresh it from
+ * on-disk bitmap and lose not-yet-available
+ * blocks */
+ page_cache_get(e4b->bd_buddy_page);
+ page_cache_get(e4b->bd_bitmap_page);
+ }
+ /* descend to the leaf position for the new extent's start cluster */
+ while (*n) {
+ parent = *n;
+ entry = rb_entry(parent, struct ext4_free_data, efd_node);
+ if (cluster < entry->efd_start_cluster)
+ n = &(*n)->rb_left;
+ else if (cluster >= (entry->efd_start_cluster + entry->efd_count))
+ n = &(*n)->rb_right;
+ else {
+ /* the start cluster lies inside an extent already queued */
+ ext4_grp_locked_error(sb, group, 0,
+ ext4_group_first_block_no(sb, group) +
+ EXT4_C2B(sbi, cluster),
+ "Block already on to-be-freed list");
+ return 0;
+ }
+ }
+
+ rb_link_node(new_node, parent, n);
+ rb_insert_color(new_node, &db->bb_free_root);
+
+ /* Now try to see the extent can be merged to left and right */
+ node = rb_prev(new_node);
+ if (node) {
+ entry = rb_entry(node, struct ext4_free_data, efd_node);
+ if (can_merge(entry, new_entry)) {
+ /* absorb the left neighbour and drop its node and callback */
+ new_entry->efd_start_cluster = entry->efd_start_cluster;
+ new_entry->efd_count += entry->efd_count;
+ rb_erase(node, &(db->bb_free_root));
+ ext4_journal_callback_del(handle, &entry->efd_jce);
+ kmem_cache_free(ext4_free_data_cachep, entry);
+ }
+ }
+
+ node = rb_next(new_node);
+ if (node) {
+ entry = rb_entry(node, struct ext4_free_data, efd_node);
+ if (can_merge(new_entry, entry)) {
+ /* absorb the right neighbour and drop its node and callback */
+ new_entry->efd_count += entry->efd_count;
+ rb_erase(node, &(db->bb_free_root));
+ ext4_journal_callback_del(handle, &entry->efd_jce);
+ kmem_cache_free(ext4_free_data_cachep, entry);
+ }
+ }
+ /* Add the extent to transaction's private list */
+ ext4_journal_callback_add(handle, ext4_free_data_callback,
+ &new_entry->efd_jce);
+ return 0;
+}
+
+/**
+ * ext4_free_blocks() -- Free given blocks and update quota
+ * @handle: handle for this transaction
+ * @inode: inode
+ * @bh: optional buffer head of the block to free; when @block is 0
+ *      the block number is taken from @bh
+ * @block: start physical block to free
+ * @count: number of blocks to free
+ * @flags: flags used by ext4_free_blocks
+ */
+void ext4_free_blocks(handle_t *handle, struct inode *inode,
+ struct buffer_head *bh, ext4_fsblk_t block,
+ unsigned long count, int flags)
+{
+ struct buffer_head *bitmap_bh = NULL;
+ struct super_block *sb = inode->i_sb;
+ struct ext4_group_desc *gdp;
+ unsigned long freed = 0;
+ unsigned int overflow;
+ ext4_grpblk_t bit;
+ struct buffer_head *gd_bh;
+ ext4_group_t block_group;
+ struct ext4_sb_info *sbi;
+ struct ext4_buddy e4b;
+ unsigned int count_clusters;
+ int err = 0;
+ int ret;
+
+ if (bh) {
+ if (block)
+ BUG_ON(block != bh->b_blocknr);
+ else
+ block = bh->b_blocknr;
+ }
+
+ sbi = EXT4_SB(sb);
+ if (!(flags & EXT4_FREE_BLOCKS_VALIDATED) &&
+ !ext4_data_block_valid(sbi, block, count)) {
+ ext4_error(sb, "Freeing blocks not in datazone - "
+ "block = %llu, count = %lu", block, count);
+ goto error_return;
+ }
+
+ ext4_debug("freeing block %llu\n", block);
+ trace_ext4_free_blocks(inode, block, count, flags);
+
+ if (flags & EXT4_FREE_BLOCKS_FORGET) {
+ struct buffer_head *tbh = bh;
+ int i;
+
+ BUG_ON(bh && (count > 1));
+
+ for (i = 0; i < count; i++) {
+ if (!bh)
+ tbh = sb_find_get_block(inode->i_sb,
+ block + i);
+ if (unlikely(!tbh))
+ continue;
+ ext4_forget(handle, flags & EXT4_FREE_BLOCKS_METADATA,
+ inode, tbh, block + i);
+ }
+ }
+
+ /*
+ * We need to make sure we don't reuse the freed block until
+ * after the transaction is committed, which we can do by
+ * treating the block as metadata, below. We make an
+ * exception if the inode is to be written in writeback mode
+ * since writeback mode has weak data consistency guarantees.
+ */
+ if (!ext4_should_writeback_data(inode))
+ flags |= EXT4_FREE_BLOCKS_METADATA;
+
+ /*
+ * If the extent to be freed does not begin on a cluster
+ * boundary, we need to deal with partial clusters at the
+ * beginning and end of the extent. Normally we will free
+ * blocks at the beginning or the end unless we are explicitly
+ * requested to avoid doing so.
+ */
+ overflow = block & (sbi->s_cluster_ratio - 1);
+ if (overflow) {
+ if (flags & EXT4_FREE_BLOCKS_NOFREE_FIRST_CLUSTER) {
+ overflow = sbi->s_cluster_ratio - overflow;
+ block += overflow;
+ if (count > overflow)
+ count -= overflow;
+ else
+ return;
+ } else {
+ block -= overflow;
+ count += overflow;
+ }
+ }
+ overflow = count & (sbi->s_cluster_ratio - 1);
+ if (overflow) {
+ if (flags & EXT4_FREE_BLOCKS_NOFREE_LAST_CLUSTER) {
+ if (count > overflow)
+ count -= overflow;
+ else
+ return;
+ } else
+ count += sbi->s_cluster_ratio - overflow;
+ }
+
+do_more:
+ overflow = 0;
+ ext4_get_group_no_and_offset(sb, block, &block_group, &bit);
+
+ /*
+ * Check to see if we are freeing blocks across a group
+ * boundary.
+ */
+ if (EXT4_C2B(sbi, bit) + count > EXT4_BLOCKS_PER_GROUP(sb)) {
+ overflow = EXT4_C2B(sbi, bit) + count -
+ EXT4_BLOCKS_PER_GROUP(sb);
+ count -= overflow;
+ }
+ count_clusters = EXT4_B2C(sbi, count);
+ bitmap_bh = ext4_read_block_bitmap(sb, block_group);
+ if (!bitmap_bh) {
+ err = -EIO;
+ goto error_return;
+ }
+ gdp = ext4_get_group_desc(sb, block_group, &gd_bh);
+ if (!gdp) {
+ err = -EIO;
+ goto error_return;
+ }
+
+ if (in_range(ext4_block_bitmap(sb, gdp), block, count) ||
+ in_range(ext4_inode_bitmap(sb, gdp), block, count) ||
+ in_range(block, ext4_inode_table(sb, gdp),
+ EXT4_SB(sb)->s_itb_per_group) ||
+ in_range(block + count - 1, ext4_inode_table(sb, gdp),
+ EXT4_SB(sb)->s_itb_per_group)) {
+
+ ext4_error(sb, "Freeing blocks in system zone - "
+ "Block = %llu, count = %lu", block, count);
+ /* err = 0. ext4_std_error should be a no op */
+ goto error_return;
+ }
+
+ BUFFER_TRACE(bitmap_bh, "getting write access");
+ err = ext4_journal_get_write_access(handle, bitmap_bh);
+ if (err)
+ goto error_return;
+
+ /*
+ * We are about to modify some metadata. Call the journal APIs
+ * to unshare ->b_data if a currently-committing transaction is
+ * using it
+ */
+ BUFFER_TRACE(gd_bh, "get_write_access");
+ err = ext4_journal_get_write_access(handle, gd_bh);
+ if (err)
+ goto error_return;
+#ifdef AGGRESSIVE_CHECK
+ {
+ int i;
+ for (i = 0; i < count_clusters; i++)
+ BUG_ON(!mb_test_bit(bit + i, bitmap_bh->b_data));
+ }
+#endif
+ trace_ext4_mballoc_free(sb, inode, block_group, bit, count_clusters);
+
+ err = ext4_mb_load_buddy(sb, block_group, &e4b);
+ if (err)
+ goto error_return;
+
+ if ((flags & EXT4_FREE_BLOCKS_METADATA) && ext4_handle_valid(handle)) {
+ struct ext4_free_data *new_entry;
+ /*
+ * blocks being freed are metadata. these blocks shouldn't
+ * be used until this transaction is committed
+ */
+ new_entry = kmem_cache_alloc(ext4_free_data_cachep, GFP_NOFS);
+ if (!new_entry) {
+ ext4_mb_unload_buddy(&e4b);
+ err = -ENOMEM;
+ goto error_return;
+ }
+ new_entry->efd_start_cluster = bit;
+ new_entry->efd_group = block_group;
+ new_entry->efd_count = count_clusters;
+ new_entry->efd_tid = handle->h_transaction->t_tid;
+
+ ext4_lock_group(sb, block_group);
+ mb_clear_bits(bitmap_bh->b_data, bit, count_clusters);
+ ext4_mb_free_metadata(handle, &e4b, new_entry);
+ } else {
+ /* need to update group_info->bb_free and bitmap
+ * with group lock held. generate_buddy look at
+ * them with group lock_held
+ */
+ ext4_lock_group(sb, block_group);
+ mb_clear_bits(bitmap_bh->b_data, bit, count_clusters);
+ mb_free_blocks(inode, &e4b, bit, count_clusters);
+ }
+
+ ret = ext4_free_group_clusters(sb, gdp) + count_clusters;
+ ext4_free_group_clusters_set(sb, gdp, ret);
+ gdp->bg_checksum = ext4_group_desc_csum(sbi, block_group, gdp);
+ ext4_unlock_group(sb, block_group);
+ percpu_counter_add(&sbi->s_freeclusters_counter, count_clusters);
+
+ if (sbi->s_log_groups_per_flex) {
+ ext4_group_t flex_group = ext4_flex_group(sbi, block_group);
+ atomic_add(count_clusters,
+ &sbi->s_flex_groups[flex_group].free_clusters);
+ }
+
+ ext4_mb_unload_buddy(&e4b);
+
+ freed += count;
+
+ if (!(flags & EXT4_FREE_BLOCKS_NO_QUOT_UPDATE))
+ dquot_free_block(inode, EXT4_C2B(sbi, count_clusters));
+
+ /* We dirtied the bitmap block */
+ BUFFER_TRACE(bitmap_bh, "dirtied bitmap block");
+ err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh);
+
+ /* And the group descriptor block */
+ BUFFER_TRACE(gd_bh, "dirtied group descriptor block");
+ ret = ext4_handle_dirty_metadata(handle, NULL, gd_bh);
+ if (!err)
+ err = ret;
+
+ if (overflow && !err) {
+ block += count;
+ count = overflow;
+ put_bh(bitmap_bh);
+ goto do_more;
+ }
+ ext4_mark_super_dirty(sb);
+error_return:
+ brelse(bitmap_bh);
+ ext4_std_error(sb, err);
+ return;
+}
+
+/**
+ * ext4_group_add_blocks() -- Add given blocks to an existing group
+ * @handle: handle to this transaction
+ * @sb: super block
+ * @block: start physical block to add to the block group
+ * @count: number of blocks to add
+ *
+ * This marks the blocks as free in the bitmap and buddy.
+ *
+ * The whole range must lie inside one block group and must not overlap that
+ * group's block bitmap, inode bitmap or inode table.
+ *
+ * Returns 0 on success or when @count is 0, -EINVAL if the range crosses the
+ * group boundary or touches the group's metadata blocks, -EIO if the block
+ * bitmap or group descriptor cannot be obtained, or the error reported by
+ * the journal or buddy helpers.
+ */
+int ext4_group_add_blocks(handle_t *handle, struct super_block *sb,
+ ext4_fsblk_t block, unsigned long count)
+{
+ struct buffer_head *bitmap_bh = NULL;
+ struct buffer_head *gd_bh;
+ ext4_group_t block_group;
+ ext4_grpblk_t bit;
+ unsigned int i;
+ struct ext4_group_desc *desc;
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
+ struct ext4_buddy e4b;
+ int err = 0, ret, blk_free_count;
+ ext4_grpblk_t blocks_freed;
+
+ ext4_debug("Adding block(s) %llu-%llu\n", block, block + count - 1);
+
+ if (count == 0)
+ return 0;
+
+ ext4_get_group_no_and_offset(sb, block, &block_group, &bit);
+ /*
+ * Check to see if we are adding blocks across a group
+ * boundary. Such a request is rejected rather than split.
+ */
+ if (bit + count > EXT4_BLOCKS_PER_GROUP(sb)) {
+ ext4_warning(sb, "too much blocks added to group %u\n",
+ block_group);
+ err = -EINVAL;
+ goto error_return;
+ }
+
+ bitmap_bh = ext4_read_block_bitmap(sb, block_group);
+ if (!bitmap_bh) {
+ err = -EIO;
+ goto error_return;
+ }
+
+ desc = ext4_get_group_desc(sb, block_group, &gd_bh);
+ if (!desc) {
+ err = -EIO;
+ goto error_return;
+ }
+
+ if (in_range(ext4_block_bitmap(sb, desc), block, count) ||
+ in_range(ext4_inode_bitmap(sb, desc), block, count) ||
+ in_range(block, ext4_inode_table(sb, desc), sbi->s_itb_per_group) ||
+ in_range(block + count - 1, ext4_inode_table(sb, desc),
+ sbi->s_itb_per_group)) {
+ ext4_error(sb, "Adding blocks in system zones - "
+ "Block = %llu, count = %lu",
+ block, count);
+ err = -EINVAL;
+ goto error_return;
+ }
+
+ BUFFER_TRACE(bitmap_bh, "getting write access");
+ err = ext4_journal_get_write_access(handle, bitmap_bh);
+ if (err)
+ goto error_return;
+
+ /*
+ * We are about to modify some metadata. Call the journal APIs
+ * to unshare ->b_data if a currently-committing transaction is
+ * using it
+ */
+ BUFFER_TRACE(gd_bh, "get_write_access");
+ err = ext4_journal_get_write_access(handle, gd_bh);
+ if (err)
+ goto error_return;
+
+ for (i = 0, blocks_freed = 0; i < count; i++) {
+ BUFFER_TRACE(bitmap_bh, "clear bit");
+ if (!mb_test_bit(bit + i, bitmap_bh->b_data)) {
+ ext4_error(sb, "bit already cleared for block %llu",
+ (ext4_fsblk_t)(block + i));
+ BUFFER_TRACE(bitmap_bh, "bit already cleared");
+ } else {
+ blocks_freed++; /* bit was set: this one really becomes free */
+ }
+ }
+
+ err = ext4_mb_load_buddy(sb, block_group, &e4b);
+ if (err)
+ goto error_return;
+
+ /*
+ * need to update group_info->bb_free and bitmap
+ * with group lock held. generate_buddy look at
+ * them with group lock_held
+ *
+ * NOTE(review): the bitmap and buddy are updated for all @count bits,
+ * while the descriptor and the global counters are only advanced by
+ * blocks_freed; confirm this is intended when some bits were already
+ * clear. Also, blk_free_count adds blocks_freed unconverted to the
+ * descriptor's cluster count while the counters below use
+ * EXT4_B2C() -- verify for cluster ratios greater than one.
+ */
+ ext4_lock_group(sb, block_group);
+ mb_clear_bits(bitmap_bh->b_data, bit, count);
+ mb_free_blocks(NULL, &e4b, bit, count);
+ blk_free_count = blocks_freed + ext4_free_group_clusters(sb, desc);
+ ext4_free_group_clusters_set(sb, desc, blk_free_count);
+ desc->bg_checksum = ext4_group_desc_csum(sbi, block_group, desc);
+ ext4_unlock_group(sb, block_group);
+ percpu_counter_add(&sbi->s_freeclusters_counter,
+ EXT4_B2C(sbi, blocks_freed));
+
+ if (sbi->s_log_groups_per_flex) {
+ ext4_group_t flex_group = ext4_flex_group(sbi, block_group);
+ atomic_add(EXT4_B2C(sbi, blocks_freed),
+ &sbi->s_flex_groups[flex_group].free_clusters);
+ }
+
+ ext4_mb_unload_buddy(&e4b);
+
+ /* We dirtied the bitmap block */
+ BUFFER_TRACE(bitmap_bh, "dirtied bitmap block");
+ err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh);
+
+ /* And the group descriptor block; the first error seen is kept */
+ BUFFER_TRACE(gd_bh, "dirtied group descriptor block");
+ ret = ext4_handle_dirty_metadata(handle, NULL, gd_bh);
+ if (!err)
+ err = ret;
+
+error_return:
+ brelse(bitmap_bh);
+ ext4_std_error(sb, err);
+ return err;
+}
+
+/**
+ * ext4_trim_extent -- function to TRIM one single free extent in the group
+ * @sb: super block for the file system
+ * @start: starting block of the free extent in the alloc. group
+ * @count: number of blocks to TRIM
+ * @group: alloc. group we are working with
+ * @e4b: ext4 buddy for the group
+ *
+ * Trim "count" blocks starting at "start" in the "group". To assure that no
+ * one will allocate those blocks, mark it as used in buddy bitmap. This must
+ * be called with the group lock held. The lock is dropped while the discard
+ * is issued and re-taken before the extent is freed again in the buddy, so
+ * the caller still holds it on return.
+ */
+static void ext4_trim_extent(struct super_block *sb, int start, int count,
+ ext4_group_t group, struct ext4_buddy *e4b)
+{
+ struct ext4_free_extent ex;
+
+ trace_ext4_trim_extent(sb, group, start, count);
+
+ assert_spin_locked(ext4_group_lock_ptr(sb, group));
+
+ ex.fe_start = start;
+ ex.fe_group = group;
+ ex.fe_len = count;
+
+ /*
+ * Mark blocks used, so no one can reuse them while
+ * being trimmed.
+ */
+ mb_mark_used(e4b, &ex);
+ ext4_unlock_group(sb, group); /* do not hold the group lock across the discard */
+ ext4_issue_discard(sb, group, start, count);
+ ext4_lock_group(sb, group);
+ mb_free_blocks(NULL, e4b, start, ex.fe_len); /* hand the extent back to the buddy */
+}
+
+/**
+ * ext4_trim_all_free -- function to trim all free space in alloc. group
+ * @sb:			super block for file system
+ * @group:		group to be trimmed
+ * @start:		first group block to examine
+ * @max:		last group block to examine
+ * @minblocks:		minimum extent block count
+ *
+ * ext4_trim_all_free walks through group's block bitmap searching for free
+ * extents. When the free extent is found, mark it as used in group buddy
+ * bitmap. Then issue a TRIM command on this extent and free the extent in
+ * the group buddy bitmap. This is done until whole group is scanned.
+ *
+ * The walk is skipped when the group is already flagged as trimmed and
+ * @minblocks is not smaller than the minimum used by the last trim.
+ *
+ * Returns the number of bitmap bits trimmed, the error from
+ * ext4_mb_load_buddy() if the buddy cannot be loaded, or -ERESTARTSYS if a
+ * fatal signal is pending. The group is only flagged as trimmed when the
+ * walk was not interrupted, so an aborted pass is redone by the next request.
+ */
+static ext4_grpblk_t
+ext4_trim_all_free(struct super_block *sb, ext4_group_t group,
+		   ext4_grpblk_t start, ext4_grpblk_t max,
+		   ext4_grpblk_t minblocks)
+{
+	void *bitmap;
+	ext4_grpblk_t next, count = 0, free_count = 0;
+	struct ext4_buddy e4b;
+	int ret;
+
+	trace_ext4_trim_all_free(sb, group, start, max);
+
+	ret = ext4_mb_load_buddy(sb, group, &e4b);
+	if (ret) {
+		ext4_error(sb, "Error in loading buddy "
+				"information for %u", group);
+		return ret;
+	}
+	bitmap = e4b.bd_bitmap;
+
+	ext4_lock_group(sb, group);
+	if (EXT4_MB_GRP_WAS_TRIMMED(e4b.bd_info) &&
+	    minblocks >= atomic_read(&EXT4_SB(sb)->s_last_trim_minblks))
+		goto out;
+
+	/* Nothing is free below bb_first_free, so start no earlier than that. */
+	start = (e4b.bd_info->bb_first_free > start) ?
+		e4b.bd_info->bb_first_free : start;
+
+	while (start <= max) {
+		start = mb_find_next_zero_bit(bitmap, max + 1, start);
+		if (start > max)
+			break;
+		next = mb_find_next_bit(bitmap, max + 1, start);
+
+		if ((next - start) >= minblocks) {
+			ext4_trim_extent(sb, start,
+					 next - start, group, &e4b);
+			count += next - start;
+		}
+		free_count += next - start;
+		/* bit 'next' is in use (or past max), so resume after it */
+		start = next + 1;
+
+		if (fatal_signal_pending(current)) {
+			count = -ERESTARTSYS;
+			break;
+		}
+
+		if (need_resched()) {
+			ext4_unlock_group(sb, group);
+			cond_resched();
+			ext4_lock_group(sb, group);
+		}
+
+		/* The free space left unexamined cannot hold a large enough extent. */
+		if ((e4b.bd_info->bb_free - free_count) < minblocks)
+			break;
+	}
+
+	/*
+	 * A negative count means the walk was cut short by a fatal signal;
+	 * flagging the group as trimmed then would make later requests skip
+	 * free space that was never discarded.
+	 */
+	if (count >= 0)
+		EXT4_MB_GRP_SET_TRIMMED(e4b.bd_info);
+out:
+	ext4_unlock_group(sb, group);
+	ext4_mb_unload_buddy(&e4b);
+
+	ext4_debug("trimmed %d blocks in the group %d\n",
+		count, group);
+
+	return count;
+}
+
+/**
+ * ext4_trim_fs() -- trim ioctl handle function
+ * @sb:			superblock for filesystem
+ * @range:		fstrim_range structure
+ *
+ * start:	First Byte to trim
+ * len:		number of Bytes to trim from start
+ * minlen:	minimum extent length in Bytes
+ * ext4_trim_fs goes through all allocation groups containing Bytes from
+ * start to start+len. For each such a group ext4_trim_all_free function
+ * is invoked to trim all free space.
+ *
+ * On return range->len holds the number of bytes trimmed.
+ *
+ * Returns 0 on success, -EINVAL if minlen exceeds the clusters in a group,
+ * if start lies beyond the end of the filesystem or if len is shorter than
+ * one block, or the error reported while initializing or trimming a group.
+ */
+int ext4_trim_fs(struct super_block *sb, struct fstrim_range *range)
+{
+	struct ext4_group_info *grp;
+	ext4_group_t group, first_group, last_group;
+	ext4_grpblk_t cnt = 0, first_cluster, last_cluster;
+	uint64_t start, end, minlen, trimmed = 0;
+	ext4_fsblk_t first_data_blk =
+			le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block);
+	ext4_fsblk_t max_blks = ext4_blocks_count(EXT4_SB(sb)->s_es);
+	int ret = 0;
+
+	start = range->start >> sb->s_blocksize_bits;
+	end = start + (range->len >> sb->s_blocksize_bits) - 1;
+	minlen = range->minlen >> sb->s_blocksize_bits;
+
+	/*
+	 * A length below one block shifts down to zero, which would make
+	 * "end" wrap to start - 1 (the whole filesystem when start is 0),
+	 * so such a request is refused.
+	 */
+	if (unlikely(minlen > EXT4_CLUSTERS_PER_GROUP(sb)) ||
+	    unlikely(start >= max_blks) ||
+	    unlikely(range->len < sb->s_blocksize))
+		return -EINVAL;
+	if (end >= max_blks)
+		end = max_blks - 1;
+	if (end <= first_data_blk)
+		goto out;
+	if (start < first_data_blk)
+		start = first_data_blk;
+
+	/* Determine first and last group to examine based on start and end */
+	ext4_get_group_no_and_offset(sb, (ext4_fsblk_t) start,
+				     &first_group, &first_cluster);
+	ext4_get_group_no_and_offset(sb, (ext4_fsblk_t) end,
+				     &last_group, &last_cluster);
+
+	/* end now represents the last cluster to discard in this group */
+	end = EXT4_CLUSTERS_PER_GROUP(sb) - 1;
+
+	for (group = first_group; group <= last_group; group++) {
+		grp = ext4_get_group_info(sb, group);
+		/* We only do this if the grp has never been initialized */
+		if (unlikely(EXT4_MB_GRP_NEED_INIT(grp))) {
+			ret = ext4_mb_init_group(sb, group);
+			if (ret)
+				break;
+		}
+
+		/*
+		 * For all the groups except the last one, last cluster will
+		 * always be EXT4_CLUSTERS_PER_GROUP(sb)-1, so we only need to
+		 * change it for the last group, note that last_cluster is
+		 * already computed earlier by ext4_get_group_no_and_offset()
+		 */
+		if (group == last_group)
+			end = last_cluster;
+
+		if (grp->bb_free >= minlen) {
+			cnt = ext4_trim_all_free(sb, group, first_cluster,
+						end, minlen);
+			if (cnt < 0) {
+				ret = cnt;
+				break;
+			}
+			trimmed += cnt;
+		}
+
+		/*
+		 * For every group except the first one, we are sure
+		 * that the first cluster to discard will be cluster #0.
+		 */
+		first_cluster = 0;
+	}
+
+	/* Remember the minimum used so later calls can skip trimmed groups. */
+	if (!ret)
+		atomic_set(&EXT4_SB(sb)->s_last_trim_minblks, minlen);
+
+out:
+	range->len = trimmed * sb->s_blocksize;
+	return ret;
+}
diff --git a/fs/ext4/mballoc.h b/fs/ext4/mballoc.h
new file mode 100644
index 00000000..c070618c
--- /dev/null
+++ b/fs/ext4/mballoc.h
@@ -0,0 +1,222 @@
+/*
+ * fs/ext4/mballoc.h
+ *
+ * Written by: Alex Tomas <alex@clusterfs.com>
+ *
+ */
+#ifndef _EXT4_MBALLOC_H
+#define _EXT4_MBALLOC_H
+
+#include <linux/time.h>
+#include <linux/fs.h>
+#include <linux/namei.h>
+#include <linux/quotaops.h>
+#include <linux/buffer_head.h>
+#include <linux/module.h>
+#include <linux/swap.h>
+#include <linux/proc_fs.h>
+#include <linux/pagemap.h>
+#include <linux/seq_file.h>
+#include <linux/blkdev.h>
+#include <linux/mutex.h>
+#include "ext4_jbd2.h"
+#include "ext4.h"
+
+/*
+ * with AGGRESSIVE_CHECK allocator runs consistency checks over
+ * structures. these checks slow things down a lot
+ *
+ * Only AGGRESSIVE_CHECK__ (note the trailing underscores) is defined here,
+ * so code guarded by "#ifdef AGGRESSIVE_CHECK" is compiled out; drop the
+ * underscores to enable the checks.
+ */
+#define AGGRESSIVE_CHECK__
+
+/*
+ * with DOUBLE_CHECK defined mballoc creates persistent in-core
+ * bitmaps, maintains and uses them to check for double allocations
+ *
+ * The macro is likewise spelled DOUBLE_CHECK__ here; presumably that keeps
+ * the feature disabled by default -- confirm against the #ifdef sites.
+ */
+#define DOUBLE_CHECK__
+
+/*
+ * mb_debug(n, fmt, ...) -- allocator debug output.
+ *
+ * With CONFIG_EXT4_DEBUG the message is printed at KERN_DEBUG level,
+ * prefixed with file, line and function, whenever n <= mb_enable_debug.
+ * Without CONFIG_EXT4_DEBUG the macro expands to nothing, so its arguments
+ * are not evaluated.
+ */
+#ifdef CONFIG_EXT4_DEBUG
+extern u8 mb_enable_debug;
+
+#define mb_debug(n, fmt, a...) \
+ do { \
+ if ((n) <= mb_enable_debug) { \
+ printk(KERN_DEBUG "(%s, %d): %s: ", \
+ __FILE__, __LINE__, __func__); \
+ printk(fmt, ## a); \
+ } \
+ } while (0)
+#else
+#define mb_debug(n, fmt, a...)
+#endif
+
+#define EXT4_MB_HISTORY_ALLOC 1 /* allocation */
+#define EXT4_MB_HISTORY_PREALLOC 2 /* preallocated blocks used */
+
+/*
+ * How long mballoc can look for a best extent (in found extents)
+ */
+#define MB_DEFAULT_MAX_TO_SCAN 200
+
+/*
+ * How long mballoc must look for a best extent
+ */
+#define MB_DEFAULT_MIN_TO_SCAN 10
+
+/*
+ * How many groups mballoc will scan looking for the best chunk
+ */
+#define MB_DEFAULT_MAX_GROUPS_TO_SCAN 5
+
+/*
+ * with 'ext4_mb_stats' allocator will collect stats that will be
+ * shown at umount. The collecting costs though!
+ */
+#define MB_DEFAULT_STATS 0
+
+/*
+ * files smaller than MB_DEFAULT_STREAM_THRESHOLD are served
+ * by the stream allocator, whose purpose is to pack requests
+ * as close to each other as possible to produce smooth I/O traffic.
+ * We use locality group prealloc space for stream requests.
+ * We can tune the same via /proc/fs/ext4/<partition>/stream_req
+ */
+#define MB_DEFAULT_STREAM_THRESHOLD 16 /* 64K */
+
+/*
+ * requests of up to this order use the 2^N search using buddies
+ * (NOTE(review): wording inferred from the name -- confirm at the use site)
+ */
+#define MB_DEFAULT_ORDER2_REQS 2
+
+/*
+ * default group prealloc size 512 blocks
+ */
+#define MB_DEFAULT_GROUP_PREALLOC 512
+
+
+struct ext4_free_data { /* one freed extent that must not be reused until its transaction commits */
+ /* MUST be the first member */
+ struct ext4_journal_cb_entry efd_jce;
+
+ /* ext4_free_data private data starts from here */
+
+ /* this links the free block information from group_info */
+ struct rb_node efd_node;
+
+ /* group which free block extent belongs */
+ ext4_group_t efd_group;
+
+ /* free extent: first cluster within the group, and length in clusters */
+ ext4_grpblk_t efd_start_cluster;
+ ext4_grpblk_t efd_count;
+
+ /* transaction which freed this extent */
+ tid_t efd_tid;
+};
+
+struct ext4_prealloc_space { /* describes one preallocated chunk of blocks */
+ struct list_head pa_inode_list;
+ struct list_head pa_group_list;
+ union {
+ struct list_head pa_tmp_list;
+ struct rcu_head pa_rcu;
+ } u;
+ spinlock_t pa_lock;
+ atomic_t pa_count;
+ unsigned pa_deleted;
+ ext4_fsblk_t pa_pstart; /* phys. block */
+ ext4_lblk_t pa_lstart; /* log. block */
+ ext4_grpblk_t pa_len; /* len of preallocated chunk */
+ ext4_grpblk_t pa_free; /* how many blocks are free */
+ unsigned short pa_type; /* pa type. inode or group (MB_INODE_PA / MB_GROUP_PA) */
+ spinlock_t *pa_obj_lock;
+ struct inode *pa_inode; /* hack, for history only */
+};
+
+enum { /* values for ext4_prealloc_space.pa_type */
+ MB_INODE_PA = 0,
+ MB_GROUP_PA = 1
+};
+
+struct ext4_free_extent { /* an extent inside one block group */
+ ext4_lblk_t fe_logical; /* presumably the logical block of the request -- confirm */
+ ext4_grpblk_t fe_start; /* In cluster units, relative to the start of fe_group */
+ ext4_group_t fe_group; /* group the extent belongs to */
+ ext4_grpblk_t fe_len; /* In cluster units */
+};
+
+/*
+ * Locality group:
+ * we try to group all related changes together
+ * so that writeback can flush/allocate them together as well
+ * Size of lg_prealloc_list hash is determined by MB_DEFAULT_GROUP_PREALLOC
+ * (512). We store prealloc space into the hash based on the pa_free blocks
+ * order value, i.e. fls(pa_free) - 1. fls(512) - 1 is 9, so the ten
+ * buckets cover orders 0..9.
+ */
+#define PREALLOC_TB_SIZE 10
+struct ext4_locality_group {
+ /* for allocator */
+ /* to serialize allocates */
+ struct mutex lg_mutex;
+ /* list of preallocations */
+ struct list_head lg_prealloc_list[PREALLOC_TB_SIZE];
+ spinlock_t lg_prealloc_lock;
+};
+
+struct ext4_allocation_context { /* state carried through one allocation request */
+ struct inode *ac_inode;
+ struct super_block *ac_sb;
+
+ /* original request */
+ struct ext4_free_extent ac_o_ex;
+
+ /* goal request (normalized ac_o_ex) */
+ struct ext4_free_extent ac_g_ex;
+
+ /* the best found extent */
+ struct ext4_free_extent ac_b_ex;
+
+ /* copy of the best found extent taken before preallocation efforts */
+ struct ext4_free_extent ac_f_ex;
+
+ /* number of iterations done. we have to track to limit searching */
+ unsigned long ac_ex_scanned;
+ __u16 ac_groups_scanned;
+ __u16 ac_found;
+ __u16 ac_tail;
+ __u16 ac_buddy;
+ __u16 ac_flags; /* allocation hints */
+ __u8 ac_status; /* presumably one of the AC_STATUS_* values below -- confirm */
+ __u8 ac_criteria;
+ __u8 ac_2order; /* if request is to allocate 2^N blocks and
+ * N > 0, the field stores N, otherwise 0 */
+ __u8 ac_op; /* operation, for history only */
+ struct page *ac_bitmap_page;
+ struct page *ac_buddy_page;
+ struct ext4_prealloc_space *ac_pa;
+ struct ext4_locality_group *ac_lg;
+};
+
+#define AC_STATUS_CONTINUE 1
+#define AC_STATUS_FOUND 2
+#define AC_STATUS_BREAK 3
+
+struct ext4_buddy { /* filled in by ext4_mb_load_buddy(), released by ext4_mb_unload_buddy() */
+ struct page *bd_buddy_page;
+ void *bd_buddy;
+ struct page *bd_bitmap_page;
+ void *bd_bitmap; /* bitmap in which zero bits are free; walked by ext4_trim_all_free() */
+ struct ext4_group_info *bd_info; /* per-group info (bb_free, bb_first_free, ...) */
+ struct super_block *bd_sb;
+ __u16 bd_blkbits;
+ ext4_group_t bd_group;
+};
+
+/*
+ * Translate the group-relative start of @fex into an absolute block number:
+ * the first block of the extent's group plus the start cluster shifted by
+ * the superblock's cluster bits.
+ */
+static inline ext4_fsblk_t ext4_grp_offs_to_block(struct super_block *sb,
+					struct ext4_free_extent *fex)
+{
+	ext4_fsblk_t group_base = ext4_group_first_block_no(sb, fex->fe_group);
+
+	/* fe_start is in clusters; the shift converts it to blocks. */
+	return group_base + (fex->fe_start << EXT4_SB(sb)->s_cluster_bits);
+}
+#endif
diff --git a/fs/ext4/migrate.c b/fs/ext4/migrate.c
new file mode 100644
index 00000000..f39f80f8
--- /dev/null
+++ b/fs/ext4/migrate.c
@@ -0,0 +1,604 @@
+/*
+ * Copyright IBM Corporation, 2007
+ * Author Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2.1 of the GNU Lesser General Public License
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ */
+
+#include <linux/slab.h>
+#include "ext4_jbd2.h"
+
+/*
+ * The contiguous blocks details which can be
+ * represented by a single extent
+ *
+ * first_block/last_block are the logical bounds of the range being built,
+ * first_pblock/last_pblock the matching physical bounds, and curr_block is
+ * the logical block currently being examined. first_pblock == 0 means that
+ * no range is pending (see finish_range()).
+ */
+struct migrate_struct {
+ ext4_lblk_t first_block, last_block, curr_block;
+ ext4_fsblk_t first_pblock, last_pblock;
+};
+
+/*
+ * Insert the pending contiguous range described by @lb into @inode's extent
+ * tree as a single extent, then mark @lb as holding no range.
+ *
+ * Returns 0 when nothing is pending or on success, otherwise the error from
+ * the extent lookup, the journal restart or the extent insertion.
+ */
+static int finish_range(handle_t *handle, struct inode *inode,
+			struct migrate_struct *lb)
+{
+	struct ext4_ext_path *path = NULL;
+	struct ext4_extent newext;
+	int needed;
+	int retval = 0;
+
+	/* first_pblock == 0: no range has been collected. */
+	if (!lb->first_pblock)
+		return 0;
+
+	/* Describe the range as one extent for the temp inode. */
+	newext.ee_block = cpu_to_le32(lb->first_block);
+	newext.ee_len = cpu_to_le16(lb->last_block - lb->first_block + 1);
+	ext4_ext_store_pblock(&newext, lb->first_pblock);
+
+	path = ext4_ext_find_extent(inode, lb->first_block, NULL);
+	if (IS_ERR(path)) {
+		retval = PTR_ERR(path);
+		path = NULL;
+		goto out;
+	}
+
+	/*
+	 * Work out the credits needed to insert this extent. Because this
+	 * runs in a loop, credits could pile up; restarting the journal
+	 * below keeps the accumulated amount in check.
+	 */
+	needed = ext4_ext_calc_credits_for_single_extent(inode,
+			lb->last_block - lb->first_block + 1, path);
+
+	if (needed) {
+		if (ext4_handle_has_enough_credits(handle,
+						   EXT4_RESERVE_TRANS_BLOCKS)) {
+			/* Plenty accumulated already: start afresh. */
+			retval = ext4_journal_restart(handle, needed);
+		} else {
+			retval = ext4_journal_extend(handle, needed);
+			/* Could not extend the handle, so restart it. */
+			if (retval)
+				retval = ext4_journal_restart(handle, needed);
+		}
+		if (retval)
+			goto out;
+	}
+
+	retval = ext4_ext_insert_extent(handle, inode, path, &newext, 0);
+out:
+	if (path) {
+		ext4_ext_drop_refs(path);
+		kfree(path);
+	}
+	lb->first_pblock = 0;
+	return retval;
+}
+
+/*
+ * Account for the data block @pblock mapped at lb->curr_block.
+ *
+ * If it directly follows the pending range both logically and physically
+ * the range simply grows by one block. Otherwise the pending range is
+ * flushed with finish_range() and a new one-block range is started.
+ * lb->curr_block is advanced in both cases.
+ *
+ * Returns 0, or the error from finish_range() when a range was flushed.
+ */
+static int update_extent_range(handle_t *handle, struct inode *inode,
+			       ext4_fsblk_t pblock, struct migrate_struct *lb)
+{
+	int contiguous = lb->first_pblock &&
+			 (lb->last_pblock + 1 == pblock) &&
+			 (lb->last_block + 1 == lb->curr_block);
+	int retval = 0;
+
+	if (!contiguous) {
+		/* Flush what we have and open a new range at this block. */
+		retval = finish_range(handle, inode, lb);
+		lb->first_pblock = pblock;
+		lb->first_block = lb->curr_block;
+	}
+
+	/* Either way this block is now the tail of the range. */
+	lb->last_pblock = pblock;
+	lb->last_block = lb->curr_block;
+	lb->curr_block++;
+	return retval;
+}
+
+/*
+ * Walk the indirect block at @pblock and feed every mapped data block to
+ * update_extent_range(); an empty slot only advances lb->curr_block by one.
+ *
+ * Returns 0 on success, -EIO if the indirect block cannot be read, or the
+ * first error reported by update_extent_range().
+ */
+static int update_ind_extent_range(handle_t *handle, struct inode *inode,
+				   ext4_fsblk_t pblock,
+				   struct migrate_struct *lb)
+{
+	/* one 4-byte block address per slot */
+	unsigned long max_entries = inode->i_sb->s_blocksize >> 2;
+	struct buffer_head *bh = sb_bread(inode->i_sb, pblock);
+	__le32 *slots;
+	int err = 0;
+	int i;
+
+	if (!bh)
+		return -EIO;
+
+	slots = (__le32 *)bh->b_data;
+	for (i = 0; i < max_entries; i++) {
+		if (!slots[i]) {
+			lb->curr_block++;
+			continue;
+		}
+		err = update_extent_range(handle, inode,
+					  le32_to_cpu(slots[i]), lb);
+		if (err)
+			break;
+	}
+	put_bh(bh);
+	return err;
+}
+
+/*
+ * Walk the double-indirect block at @pblock, descending into every indirect
+ * block it references. An empty slot stands for a whole unmapped indirect
+ * block, so lb->curr_block skips max_entries logical blocks.
+ *
+ * Returns 0 on success, -EIO if the block cannot be read, or the first error
+ * reported by update_ind_extent_range().
+ */
+static int update_dind_extent_range(handle_t *handle, struct inode *inode,
+				    ext4_fsblk_t pblock,
+				    struct migrate_struct *lb)
+{
+	/* one 4-byte block address per slot */
+	unsigned long max_entries = inode->i_sb->s_blocksize >> 2;
+	struct buffer_head *bh = sb_bread(inode->i_sb, pblock);
+	__le32 *slots;
+	int err = 0;
+	int i;
+
+	if (!bh)
+		return -EIO;
+
+	slots = (__le32 *)bh->b_data;
+	for (i = 0; i < max_entries; i++) {
+		if (!slots[i]) {
+			/* Only the file block number moves on. */
+			lb->curr_block += max_entries;
+			continue;
+		}
+		err = update_ind_extent_range(handle, inode,
+					      le32_to_cpu(slots[i]), lb);
+		if (err)
+			break;
+	}
+	put_bh(bh);
+	return err;
+}
+
+/*
+ * Walk the triple-indirect block at @pblock, descending into every
+ * double-indirect block it references. An empty slot stands for a whole
+ * unmapped double-indirect subtree, so lb->curr_block skips
+ * max_entries * max_entries logical blocks.
+ *
+ * Returns 0 on success, -EIO if the block cannot be read, or the first error
+ * reported by update_dind_extent_range().
+ */
+static int update_tind_extent_range(handle_t *handle, struct inode *inode,
+				    ext4_fsblk_t pblock,
+				    struct migrate_struct *lb)
+{
+	/* one 4-byte block address per slot */
+	unsigned long max_entries = inode->i_sb->s_blocksize >> 2;
+	struct buffer_head *bh = sb_bread(inode->i_sb, pblock);
+	__le32 *slots;
+	int err = 0;
+	int i;
+
+	if (!bh)
+		return -EIO;
+
+	slots = (__le32 *)bh->b_data;
+	for (i = 0; i < max_entries; i++) {
+		if (!slots[i]) {
+			/* Only the file block number moves on. */
+			lb->curr_block += max_entries * max_entries;
+			continue;
+		}
+		err = update_dind_extent_range(handle, inode,
+					       le32_to_cpu(slots[i]), lb);
+		if (err)
+			break;
+	}
+	put_bh(bh);
+	return err;
+}
+
+/*
+ * Make sure @handle has enough credits to free one block.
+ *
+ * Freeing a block touches the superblock, the group descriptor and the
+ * block bitmap (3 credits) and may update quota (user and group). If the
+ * handle is short, try to extend it and fall back to restarting it.
+ *
+ * Returns 0 when credits are available, else the error from the restart.
+ */
+static int extend_credit_for_blkdel(handle_t *handle, struct inode *inode)
+{
+	int needed;
+
+	if (ext4_handle_has_enough_credits(handle, EXT4_RESERVE_TRANS_BLOCKS+1))
+		return 0;
+
+	needed = 3 + EXT4_MAXQUOTAS_TRANS_BLOCKS(inode->i_sb);
+
+	if (ext4_journal_extend(handle, needed) == 0)
+		return 0;
+
+	return ext4_journal_restart(handle, needed);
+}
+
+/*
+ * Read the block addressed by @i_data, free every block it points to (one
+ * block each, as forgotten metadata) and finally free the block itself.
+ *
+ * Returns 0 on success or -EIO if the block cannot be read. The result of
+ * extend_credit_for_blkdel() is not checked.
+ */
+static int free_dind_blocks(handle_t *handle,
+			    struct inode *inode, __le32 i_data)
+{
+	unsigned long max_entries = inode->i_sb->s_blocksize >> 2;
+	struct buffer_head *bh;
+	__le32 *entries;
+	int i;
+
+	bh = sb_bread(inode->i_sb, le32_to_cpu(i_data));
+	if (!bh)
+		return -EIO;
+
+	entries = (__le32 *)bh->b_data;
+	for (i = 0; i < max_entries; i++) {
+		if (!entries[i])
+			continue;
+		extend_credit_for_blkdel(handle, inode);
+		ext4_free_blocks(handle, inode, NULL,
+				 le32_to_cpu(entries[i]), 1,
+				 EXT4_FREE_BLOCKS_METADATA |
+				 EXT4_FREE_BLOCKS_FORGET);
+	}
+	put_bh(bh);
+
+	/* The children are gone; now release the block that listed them. */
+	extend_credit_for_blkdel(handle, inode);
+	ext4_free_blocks(handle, inode, NULL, le32_to_cpu(i_data), 1,
+			 EXT4_FREE_BLOCKS_METADATA |
+			 EXT4_FREE_BLOCKS_FORGET);
+	return 0;
+}
+
+/*
+ * Read the block addressed by @i_data, hand every non-zero entry to
+ * free_dind_blocks() and, if all of them succeed, free the block itself.
+ *
+ * Returns 0 on success, -EIO if the block cannot be read, or the first error
+ * from free_dind_blocks() (the block itself is then left allocated).
+ */
+static int free_tind_blocks(handle_t *handle,
+			    struct inode *inode, __le32 i_data)
+{
+	unsigned long max_entries = inode->i_sb->s_blocksize >> 2;
+	struct buffer_head *bh;
+	__le32 *entries;
+	int retval = 0;
+	int i;
+
+	bh = sb_bread(inode->i_sb, le32_to_cpu(i_data));
+	if (!bh)
+		return -EIO;
+
+	entries = (__le32 *)bh->b_data;
+	for (i = 0; i < max_entries; i++) {
+		if (!entries[i])
+			continue;
+		retval = free_dind_blocks(handle, inode, entries[i]);
+		if (retval)
+			break;
+	}
+	put_bh(bh);
+	if (retval)
+		return retval;
+
+	extend_credit_for_blkdel(handle, inode);
+	ext4_free_blocks(handle, inode, NULL, le32_to_cpu(i_data), 1,
+			 EXT4_FREE_BLOCKS_METADATA |
+			 EXT4_FREE_BLOCKS_FORGET);
+	return 0;
+}
+
+/*
+ * Free the indirect mapping blocks saved in @i_data, where
+ * i_data[0] is ei->i_data[EXT4_IND_BLOCK],
+ * i_data[1] is ei->i_data[EXT4_DIND_BLOCK] and
+ * i_data[2] is ei->i_data[EXT4_TIND_BLOCK].
+ *
+ * Returns 0 on success or the first error from the dind/tind helpers.
+ */
+static int free_ind_block(handle_t *handle, struct inode *inode, __le32 *i_data)
+{
+	int retval = 0;
+
+	/* The single indirect block has no metadata children to walk. */
+	if (i_data[0]) {
+		extend_credit_for_blkdel(handle, inode);
+		ext4_free_blocks(handle, inode, NULL,
+				 le32_to_cpu(i_data[0]), 1,
+				 EXT4_FREE_BLOCKS_METADATA |
+				 EXT4_FREE_BLOCKS_FORGET);
+	}
+
+	if (i_data[1])
+		retval = free_dind_blocks(handle, inode, i_data[1]);
+
+	/* Stop at the first failure; otherwise go on to the triple level. */
+	if (!retval && i_data[2])
+		retval = free_tind_blocks(handle, inode, i_data[2]);
+
+	return retval;
+}
+
+static int ext4_ext_swap_inode_data(handle_t *handle, struct inode *inode,
+ struct inode *tmp_inode)
+{
+ int retval;
+ __le32 i_data[3];
+ struct ext4_inode_info *ei = EXT4_I(inode);
+ struct ext4_inode_info *tmp_ei = EXT4_I(tmp_inode);
+
+ /*
+ * Install the extent tree built in @tmp_inode into @inode and free
+ * @inode's old indirect mapping blocks. Returns 0 on success, -EAGAIN
+ * if the migrate state flag was cleared in the meantime, or the error
+ * from the journal restart or from freeing the indirect blocks.
+ *
+ * One credit accounted for writing the
+ * i_data field of the original inode
+ */
+ retval = ext4_journal_extend(handle, 1);
+ if (retval) {
+ retval = ext4_journal_restart(handle, 1);
+ if (retval)
+ goto err_out;
+ }
+
+ /* remember the old indirect block pointers; i_data is overwritten below */
+ i_data[0] = ei->i_data[EXT4_IND_BLOCK];
+ i_data[1] = ei->i_data[EXT4_DIND_BLOCK];
+ i_data[2] = ei->i_data[EXT4_TIND_BLOCK];
+
+ down_write(&EXT4_I(inode)->i_data_sem);
+ /*
+ * if EXT4_STATE_EXT_MIGRATE is cleared a block allocation
+ * happened after we started the migrate. We need to
+ * fail the migrate
+ */
+ if (!ext4_test_inode_state(inode, EXT4_STATE_EXT_MIGRATE)) {
+ retval = -EAGAIN;
+ up_write(&EXT4_I(inode)->i_data_sem);
+ goto err_out;
+ } else
+ ext4_clear_inode_state(inode, EXT4_STATE_EXT_MIGRATE);
+ /*
+ * We have the extent map built with the tmp inode.
+ * Now copy the i_data across
+ */
+ ext4_set_inode_flag(inode, EXT4_INODE_EXTENTS);
+ memcpy(ei->i_data, tmp_ei->i_data, sizeof(ei->i_data));
+
+ /*
+ * Update i_blocks with the new blocks that got
+ * allocated while adding extents for extent index
+ * blocks.
+ *
+ * While converting to extents we need not
+ * update the original inode i_blocks for extent blocks
+ * via quota APIs. The quota update happened via tmp_inode already.
+ */
+ spin_lock(&inode->i_lock);
+ inode->i_blocks += tmp_inode->i_blocks;
+ spin_unlock(&inode->i_lock);
+ up_write(&EXT4_I(inode)->i_data_sem);
+
+ /*
+ * We mark the inode dirty after, because we decrement the
+ * i_blocks when freeing the indirect meta-data blocks
+ */
+ retval = free_ind_block(handle, inode, i_data);
+ ext4_mark_inode_dirty(handle, inode); /* result is not checked */
+
+err_out:
+ return retval;
+}
+
+/*
+ * Free the extent tree block referenced by index entry @ix. When that block
+ * is itself an index node (eh_depth != 0) its children are freed first,
+ * recursively. The block is freed even if a child could not be.
+ *
+ * Returns 0 on success, -EIO if the block cannot be read, or the first error
+ * hit while freeing a child.
+ */
+static int free_ext_idx(handle_t *handle, struct inode *inode,
+			struct ext4_extent_idx *ix)
+{
+	ext4_fsblk_t block = ext4_idx_pblock(ix);
+	struct ext4_extent_header *eh;
+	struct buffer_head *bh;
+	int retval = 0;
+	int i;
+
+	bh = sb_bread(inode->i_sb, block);
+	if (!bh)
+		return -EIO;
+
+	eh = (struct ext4_extent_header *)bh->b_data;
+	if (eh->eh_depth != 0) {
+		struct ext4_extent_idx *child = EXT_FIRST_INDEX(eh);
+
+		for (i = 0; i < le16_to_cpu(eh->eh_entries); i++, child++) {
+			retval = free_ext_idx(handle, inode, child);
+			if (retval)
+				break;
+		}
+	}
+	put_bh(bh);
+
+	extend_credit_for_blkdel(handle, inode);
+	ext4_free_blocks(handle, inode, NULL, block, 1,
+			 EXT4_FREE_BLOCKS_METADATA | EXT4_FREE_BLOCKS_FORGET);
+	return retval;
+}
+
+/*
+ * Free only the extent metadata (index/leaf) blocks hanging off the extent
+ * header kept in @inode's i_data. A tree of depth 0 has none.
+ *
+ * Returns 0 on success or the first error from free_ext_idx().
+ */
+static int free_ext_block(handle_t *handle, struct inode *inode)
+{
+	struct ext4_extent_header *eh =
+		(struct ext4_extent_header *)EXT4_I(inode)->i_data;
+	struct ext4_extent_idx *ix;
+	int retval = 0;
+	int i;
+
+	/* Depth 0: no extra blocks were allocated for extent metadata. */
+	if (eh->eh_depth == 0)
+		return 0;
+
+	ix = EXT_FIRST_INDEX(eh);
+	for (i = 0; i < le16_to_cpu(eh->eh_entries); i++, ix++) {
+		retval = free_ext_idx(handle, inode, ix);
+		if (retval)
+			break;
+	}
+	return retval;
+}
+
+int ext4_ext_migrate(struct inode *inode)
+{
+ handle_t *handle;
+ int retval = 0, i;
+ __le32 *i_data;
+ struct ext4_inode_info *ei;
+ struct inode *tmp_inode = NULL;
+ struct migrate_struct lb;
+ unsigned long max_entries;
+ __u32 goal;
+ uid_t owner[2];
+
+ /*
+ * Convert an indirect-block mapped inode to extents: the extent tree
+ * is built in a temporary inode from @inode's block pointers and is
+ * then swapped into @inode. Returns 0 on success (or for a fast
+ * symlink, which is left alone) and a negative error otherwise.
+ *
+ * If the filesystem does not support extents, or the inode
+ * already is extent-based, error out.
+ */
+ if (!EXT4_HAS_INCOMPAT_FEATURE(inode->i_sb,
+ EXT4_FEATURE_INCOMPAT_EXTENTS) ||
+ (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)))
+ return -EINVAL;
+
+ if (S_ISLNK(inode->i_mode) && inode->i_blocks == 0)
+ /*
+ * don't migrate fast symlink
+ */
+ return retval;
+
+ handle = ext4_journal_start(inode,
+ EXT4_DATA_TRANS_BLOCKS(inode->i_sb) +
+ EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3 +
+ EXT4_MAXQUOTAS_INIT_BLOCKS(inode->i_sb)
+ + 1);
+ if (IS_ERR(handle)) {
+ retval = PTR_ERR(handle);
+ return retval;
+ }
+ goal = (((inode->i_ino - 1) / EXT4_INODES_PER_GROUP(inode->i_sb)) *
+ EXT4_INODES_PER_GROUP(inode->i_sb)) + 1; /* first inode number of @inode's group */
+ owner[0] = inode->i_uid;
+ owner[1] = inode->i_gid;
+ tmp_inode = ext4_new_inode(handle, inode->i_sb->s_root->d_inode,
+ S_IFREG, NULL, goal, owner);
+ if (IS_ERR(tmp_inode)) {
+ retval = PTR_ERR(tmp_inode);
+ ext4_journal_stop(handle);
+ return retval;
+ }
+ i_size_write(tmp_inode, i_size_read(inode));
+ /*
+ * Set the i_nlink to zero so it will be deleted later
+ * when we drop inode reference.
+ */
+ clear_nlink(tmp_inode);
+
+ ext4_ext_tree_init(handle, tmp_inode);
+ ext4_orphan_add(handle, tmp_inode);
+ ext4_journal_stop(handle);
+
+ /*
+ * start with one credit accounted for
+ * superblock modification.
+ *
+ * For the tmp_inode we already have committed the
+ * transaction that created the inode. Later as and
+ * when we add extents we extend the journal
+ */
+ /*
+ * Even though we take i_mutex we can still cause block
+ * allocation via mmap write to holes. If we have allocated
+ * new blocks we fail migrate. New block allocation will
+ * clear EXT4_STATE_EXT_MIGRATE flag. The flag is updated
+ * with i_data_sem held to prevent racing with block
+ * allocation.
+ */
+ down_read((&EXT4_I(inode)->i_data_sem));
+ ext4_set_inode_state(inode, EXT4_STATE_EXT_MIGRATE);
+ up_read((&EXT4_I(inode)->i_data_sem));
+
+ handle = ext4_journal_start(inode, 1);
+ if (IS_ERR(handle)) {
+ /*
+ * It is impossible to update on-disk structures without
+ * a handle, so just rollback in-core changes and leave other
+ * work to orphan_list_cleanup()
+ */
+ ext4_orphan_del(NULL, tmp_inode);
+ retval = PTR_ERR(handle);
+ goto out;
+ }
+
+ ei = EXT4_I(inode);
+ i_data = ei->i_data;
+ memset(&lb, 0, sizeof(lb));
+
+ /* 32 bit block address 4 bytes */
+ max_entries = inode->i_sb->s_blocksize >> 2;
+ for (i = 0; i < EXT4_NDIR_BLOCKS; i++) {
+ if (i_data[i]) {
+ retval = update_extent_range(handle, tmp_inode,
+ le32_to_cpu(i_data[i]), &lb);
+ if (retval)
+ goto err_out;
+ } else
+ lb.curr_block++; /* hole: only the logical position advances */
+ }
+ if (i_data[EXT4_IND_BLOCK]) {
+ retval = update_ind_extent_range(handle, tmp_inode,
+ le32_to_cpu(i_data[EXT4_IND_BLOCK]), &lb);
+ if (retval)
+ goto err_out;
+ } else
+ lb.curr_block += max_entries;
+ if (i_data[EXT4_DIND_BLOCK]) {
+ retval = update_dind_extent_range(handle, tmp_inode,
+ le32_to_cpu(i_data[EXT4_DIND_BLOCK]), &lb);
+ if (retval)
+ goto err_out;
+ } else
+ lb.curr_block += max_entries * max_entries;
+ if (i_data[EXT4_TIND_BLOCK]) {
+ retval = update_tind_extent_range(handle, tmp_inode,
+ le32_to_cpu(i_data[EXT4_TIND_BLOCK]), &lb);
+ if (retval)
+ goto err_out;
+ }
+ /*
+ * Build the last extent
+ */
+ retval = finish_range(handle, tmp_inode, &lb);
+err_out:
+ if (retval)
+ /*
+ * Failure case delete the extent information with the
+ * tmp_inode
+ */
+ free_ext_block(handle, tmp_inode);
+ else {
+ retval = ext4_ext_swap_inode_data(handle, inode, tmp_inode);
+ if (retval)
+ /*
+ * if we fail to swap inode data free the extent
+ * details of the tmp inode
+ */
+ free_ext_block(handle, tmp_inode);
+ }
+
+ /* We mark the tmp_inode dirty via ext4_ext_tree_init. */
+ if (ext4_journal_extend(handle, 1) != 0)
+ ext4_journal_restart(handle, 1); /* result is not checked */
+
+ /*
+ * Mark the tmp_inode as of size zero
+ */
+ i_size_write(tmp_inode, 0);
+
+ /*
+ * set the i_blocks count to zero
+ * so that the ext4_delete_inode does the
+ * right job
+ *
+ * We don't need to take the i_lock because
+ * the inode is not visible to user space.
+ */
+ tmp_inode->i_blocks = 0;
+
+ /* Reset the extent details */
+ ext4_ext_tree_init(handle, tmp_inode);
+ ext4_journal_stop(handle);
+out:
+ unlock_new_inode(tmp_inode);
+ iput(tmp_inode);
+
+ return retval;
+}
diff --git a/fs/ext4/mmp.c b/fs/ext4/mmp.c
new file mode 100644
index 00000000..ed6548d8
--- /dev/null
+++ b/fs/ext4/mmp.c
@@ -0,0 +1,353 @@
+#include <linux/fs.h>
+#include <linux/random.h>
+#include <linux/buffer_head.h>
+#include <linux/utsname.h>
+#include <linux/kthread.h>
+
+#include "ext4.h"
+
+/*
+ * Synchronously push the MMP buffer to disk.
+ *
+ * The buffer is dirtied, locked and submitted with WRITE_SYNC to try to get
+ * the block on-disk faster, and the caller blocks until the I/O finishes.
+ * An extra reference is taken before submission.
+ *
+ * Returns 0 if the buffer is uptodate once the wait completes, 1 otherwise.
+ */
+static int write_mmp_block(struct buffer_head *bh)
+{
+	int write_failed;
+
+	mark_buffer_dirty(bh);
+	lock_buffer(bh);
+	bh->b_end_io = end_buffer_write_sync;
+	get_bh(bh);
+	submit_bh(WRITE_SYNC, bh);
+	wait_on_buffer(bh);
+
+	write_failed = unlikely(!buffer_uptodate(bh));
+	return write_failed ? 1 : 0;
+}
+
+/*
+ * Read the MMP block. It _must_ be read from disk and hence we clear the
+ * uptodate flag on the buffer.
+ *
+ * A buffer handed in through *bh is reused; when *bh is NULL a buffer is
+ * obtained with sb_getblk(). The read is submitted with READ_SYNC and
+ * waited for.
+ *
+ * Returns 0 on success, -EIO if no buffer could be obtained or the buffer
+ * is not uptodate after the read (*bh is then NULL), or -EINVAL if the
+ * block does not carry EXT4_MMP_MAGIC (*bh is left pointing at the buffer).
+ */
+static int read_mmp_block(struct super_block *sb, struct buffer_head **bh,
+			  ext4_fsblk_t mmp_block)
+{
+	struct buffer_head *blk = *bh;
+	struct mmp_struct *mmp;
+
+	if (blk)
+		clear_buffer_uptodate(blk);
+	else
+		blk = sb_getblk(sb, mmp_block);
+
+	/* This would be sb_bread(sb, mmp_block), except we need to be sure
+	 * that the MD RAID device cache has been bypassed, and that the read
+	 * is not blocked in the elevator. */
+	if (blk) {
+		get_bh(blk);
+		lock_buffer(blk);
+		blk->b_end_io = end_buffer_read_sync;
+		submit_bh(READ_SYNC, blk);
+		wait_on_buffer(blk);
+		if (!buffer_uptodate(blk)) {
+			brelse(blk);
+			blk = NULL;
+		}
+	}
+
+	/* Publish the outcome to the caller before reporting it. */
+	*bh = blk;
+	if (blk == NULL) {
+		ext4_warning(sb, "Error while reading MMP block %llu",
+			     mmp_block);
+		return -EIO;
+	}
+
+	mmp = (struct mmp_struct *)(blk->b_data);
+	if (le32_to_cpu(mmp->mmp_magic) == EXT4_MMP_MAGIC)
+		return 0;
+
+	return -EINVAL;
+}
+
+/*
+ * Dump as much information as possible to help the admin.
+ *
+ * @sb:       superblock the warnings are reported against
+ * @mmp:      MMP block contents to describe
+ * @function: name of the reporting function, forwarded to __ext4_warning()
+ * @line:     line number of the report, forwarded to __ext4_warning()
+ * @msg:      message text, printed verbatim
+ *
+ * @msg is emitted through a "%s" conversion so it is never interpreted as a
+ * format string. mmp_nodename and mmp_bdevname are fixed-size fields that
+ * may come straight from the on-disk block and are not guaranteed to be
+ * NUL terminated, so their output is bounded by the field sizes.
+ */
+void __dump_mmp_msg(struct super_block *sb, struct mmp_struct *mmp,
+		    const char *function, unsigned int line, const char *msg)
+{
+	__ext4_warning(sb, function, line, "%s", msg);
+	__ext4_warning(sb, function, line,
+		       "MMP failure info: last update time: %llu, last update "
+		       "node: %.*s, last update device: %.*s\n",
+		       (long long unsigned int) le64_to_cpu(mmp->mmp_time),
+		       (int) sizeof(mmp->mmp_nodename), mmp->mmp_nodename,
+		       (int) sizeof(mmp->mmp_bdevname), mmp->mmp_bdevname);
+}
+
+/*
+ * kmmpd will update the MMP sequence every s_mmp_update_interval seconds
+ *
+ * @data: struct mmpd_data carrying the superblock and the MMP buffer head;
+ *        it is freed, and the buffer head released, before the thread
+ *        returns.
+ *
+ * Each pass bumps mmp_seq (wrapping to 1 past EXT4_MMP_SEQ_MAX), stamps the
+ * time and writes the block. If more than mmp_check_interval seconds have
+ * elapsed since that write, the block is read back and compared with what
+ * was written; a mismatch means somebody else has been updating it. The
+ * thread leaves early when the MMP feature bit is gone, the filesystem is
+ * read-only, or the re-read fails or mismatches. Every early exit clears
+ * s_mmp_tsk and drops the buffer used for the re-read. When the loop ends
+ * because kthread_should_stop() is true, EXT4_MMP_SEQ_CLEAN is written
+ * before returning.
+ *
+ * Returns the status of the last MMP block read or write performed.
+ */
+static int kmmpd(void *data)
+{
+	struct super_block *sb = ((struct mmpd_data *) data)->sb;
+	struct buffer_head *bh = ((struct mmpd_data *) data)->bh;
+	struct ext4_super_block *es = EXT4_SB(sb)->s_es;
+	struct mmp_struct *mmp;
+	ext4_fsblk_t mmp_block;
+	u32 seq = 0;
+	unsigned long failed_writes = 0;
+	int mmp_update_interval = le16_to_cpu(es->s_mmp_update_interval);
+	unsigned mmp_check_interval;
+	unsigned long last_update_time;
+	unsigned long diff;
+	int retval;
+
+	mmp_block = le64_to_cpu(es->s_mmp_block);
+	mmp = (struct mmp_struct *)(bh->b_data);
+	mmp->mmp_time = cpu_to_le64(get_seconds());
+	/*
+	 * Start with the higher mmp_check_interval and reduce it if
+	 * the MMP block is being updated on time.
+	 */
+	mmp_check_interval = max(EXT4_MMP_CHECK_MULT * mmp_update_interval,
+				 EXT4_MMP_MIN_CHECK_INTERVAL);
+	mmp->mmp_check_interval = cpu_to_le16(mmp_check_interval);
+	bdevname(bh->b_bdev, mmp->mmp_bdevname);
+
+	memcpy(mmp->mmp_nodename, init_utsname()->nodename,
+	       sizeof(mmp->mmp_nodename));
+
+	while (!kthread_should_stop()) {
+		if (++seq > EXT4_MMP_SEQ_MAX)
+			seq = 1;
+
+		mmp->mmp_seq = cpu_to_le32(seq);
+		mmp->mmp_time = cpu_to_le64(get_seconds());
+		last_update_time = jiffies;
+
+		retval = write_mmp_block(bh);
+		/*
+		 * Don't spew too many error messages. Print one every
+		 * (s_mmp_update_interval * 60) seconds.
+		 */
+		if (retval) {
+			if ((failed_writes % 60) == 0)
+				ext4_error(sb, "Error writing to MMP block");
+			failed_writes++;
+		}
+
+		if (!(le32_to_cpu(es->s_feature_incompat) &
+		    EXT4_FEATURE_INCOMPAT_MMP)) {
+			ext4_warning(sb, "kmmpd being stopped since MMP feature"
+				     " has been disabled.");
+			EXT4_SB(sb)->s_mmp_tsk = NULL;
+			goto failed;
+		}
+
+		if (sb->s_flags & MS_RDONLY) {
+			ext4_warning(sb, "kmmpd being stopped since filesystem "
+				     "has been remounted as readonly.");
+			EXT4_SB(sb)->s_mmp_tsk = NULL;
+			goto failed;
+		}
+
+		diff = jiffies - last_update_time;
+		if (diff < mmp_update_interval * HZ)
+			schedule_timeout_interruptible(mmp_update_interval *
+						       HZ - diff);
+
+		/*
+		 * We need to make sure that more than mmp_check_interval
+		 * seconds have not passed since writing. If that has happened
+		 * we need to check if the MMP block is as we left it.
+		 */
+		diff = jiffies - last_update_time;
+		if (diff > mmp_check_interval * HZ) {
+			struct buffer_head *bh_check = NULL;
+			struct mmp_struct *mmp_check;
+
+			retval = read_mmp_block(sb, &bh_check, mmp_block);
+			if (retval) {
+				ext4_error(sb, "error reading MMP data: %d",
+					   retval);
+
+				/*
+				 * read_mmp_block() leaves the buffer in
+				 * bh_check when only the magic check failed;
+				 * brelse() tolerates the NULL it leaves on
+				 * an I/O error.
+				 */
+				brelse(bh_check);
+				EXT4_SB(sb)->s_mmp_tsk = NULL;
+				goto failed;
+			}
+
+			mmp_check = (struct mmp_struct *)(bh_check->b_data);
+			if (mmp->mmp_seq != mmp_check->mmp_seq ||
+			    memcmp(mmp->mmp_nodename, mmp_check->mmp_nodename,
+				   sizeof(mmp->mmp_nodename))) {
+				dump_mmp_msg(sb, mmp_check,
+					     "Error while updating MMP info. "
+					     "The filesystem seems to have been"
+					     " multiply mounted.");
+				ext4_error(sb, "abort");
+				/*
+				 * Drop the re-read buffer and, like every
+				 * other early exit, forget the task pointer
+				 * since this thread is about to go away.
+				 */
+				put_bh(bh_check);
+				EXT4_SB(sb)->s_mmp_tsk = NULL;
+				goto failed;
+			}
+			put_bh(bh_check);
+		}
+
+		/*
+		 * Adjust the mmp_check_interval depending on how much time
+		 * it took for the MMP block to be written.
+		 */
+		mmp_check_interval = max(min(EXT4_MMP_CHECK_MULT * diff / HZ,
+					     EXT4_MMP_MAX_CHECK_INTERVAL),
+					 EXT4_MMP_MIN_CHECK_INTERVAL);
+		mmp->mmp_check_interval = cpu_to_le16(mmp_check_interval);
+	}
+
+	/*
+	 * Unmount seems to be clean.
+	 */
+	mmp->mmp_seq = cpu_to_le32(EXT4_MMP_SEQ_CLEAN);
+	mmp->mmp_time = cpu_to_le64(get_seconds());
+
+	retval = write_mmp_block(bh);
+
+failed:
+	kfree(data);
+	brelse(bh);
+	return retval;
+}
+
+/*
+ * Draw random 32-bit values until one is no greater than EXT4_MMP_SEQ_MAX
+ * and return it as the new sequence number.
+ */
+static unsigned int mmp_new_seq(void)
+{
+	u32 candidate;
+
+	for (;;) {
+		get_random_bytes(&candidate, sizeof(candidate));
+		if (candidate <= EXT4_MMP_SEQ_MAX)
+			return candidate;
+	}
+}
+
+/*
+ * Protect the filesystem from being mounted more than once.
+ *
+ * @sb: superblock being mounted
+ * @mmp_block: block number of the MMP block; rejected unless it lies between
+ * s_first_data_block and the filesystem's block count
+ *
+ * Reads the MMP block. Unless its sequence is EXT4_MMP_SEQ_CLEAN, the mount
+ * fails outright if the sequence is EXT4_MMP_SEQ_FSCK; otherwise we sleep
+ * and re-read the block to see whether the sequence changed in the meantime.
+ * A fresh random sequence is then written, and after another sleep the block
+ * is re-read to check that the sequence is still ours. Finally the kmmpd
+ * thread is started with a kmalloc'ed mmpd_data holding @sb and the buffer.
+ *
+ * Returns 0 once kmmpd has been started, 1 on any failure (the MMP buffer
+ * is released in that case).
+ */
+int ext4_multi_mount_protect(struct super_block *sb,
+ ext4_fsblk_t mmp_block)
+{
+ struct ext4_super_block *es = EXT4_SB(sb)->s_es;
+ struct buffer_head *bh = NULL;
+ struct mmp_struct *mmp = NULL;
+ struct mmpd_data *mmpd_data;
+ u32 seq;
+ unsigned int mmp_check_interval = le16_to_cpu(es->s_mmp_update_interval);
+ unsigned int wait_time = 0;
+ int retval;
+
+ if (mmp_block < le32_to_cpu(es->s_first_data_block) ||
+ mmp_block >= ext4_blocks_count(es)) {
+ ext4_warning(sb, "Invalid MMP block in superblock");
+ goto failed; /* bh is still NULL here */
+ }
+
+ retval = read_mmp_block(sb, &bh, mmp_block);
+ if (retval)
+ goto failed;
+
+ mmp = (struct mmp_struct *)(bh->b_data);
+
+ if (mmp_check_interval < EXT4_MMP_MIN_CHECK_INTERVAL)
+ mmp_check_interval = EXT4_MMP_MIN_CHECK_INTERVAL;
+
+ /*
+ * If check_interval in MMP block is larger, use that instead of
+ * update_interval from the superblock.
+ */
+ if (le16_to_cpu(mmp->mmp_check_interval) > mmp_check_interval)
+ mmp_check_interval = le16_to_cpu(mmp->mmp_check_interval);
+
+ seq = le32_to_cpu(mmp->mmp_seq);
+ if (seq == EXT4_MMP_SEQ_CLEAN)
+ goto skip; /* wait_time stays 0 on this path */
+
+ if (seq == EXT4_MMP_SEQ_FSCK) {
+ dump_mmp_msg(sb, mmp, "fsck is running on the filesystem");
+ goto failed;
+ }
+
+ wait_time = min(mmp_check_interval * 2 + 1,
+ mmp_check_interval + 60);
+
+ /* Print MMP interval if more than 20 secs. */
+ if (wait_time > EXT4_MMP_MIN_CHECK_INTERVAL * 4)
+ ext4_warning(sb, "MMP interval %u higher than expected, please"
+ " wait.\n", wait_time * 2);
+
+ if (schedule_timeout_interruptible(HZ * wait_time) != 0) {
+ ext4_warning(sb, "MMP startup interrupted, failing mount\n");
+ goto failed;
+ }
+
+ retval = read_mmp_block(sb, &bh, mmp_block);
+ if (retval)
+ goto failed;
+ mmp = (struct mmp_struct *)(bh->b_data);
+ if (seq != le32_to_cpu(mmp->mmp_seq)) { /* sequence moved while we slept */
+ dump_mmp_msg(sb, mmp,
+ "Device is already active on another node.");
+ goto failed;
+ }
+
+skip:
+ /*
+ * write a new random sequence number.
+ */
+ seq = mmp_new_seq();
+ mmp->mmp_seq = cpu_to_le32(seq);
+
+ retval = write_mmp_block(bh);
+ if (retval)
+ goto failed;
+
+ /*
+ * wait for MMP interval and check mmp_seq.
+ */
+ if (schedule_timeout_interruptible(HZ * wait_time) != 0) {
+ ext4_warning(sb, "MMP startup interrupted, failing mount\n");
+ goto failed;
+ }
+
+ retval = read_mmp_block(sb, &bh, mmp_block);
+ if (retval)
+ goto failed;
+ mmp = (struct mmp_struct *)(bh->b_data);
+ if (seq != le32_to_cpu(mmp->mmp_seq)) { /* our sequence was overwritten */
+ dump_mmp_msg(sb, mmp,
+ "Device is already active on another node.");
+ goto failed;
+ }
+
+ mmpd_data = kmalloc(sizeof(struct mmpd_data), GFP_KERNEL);
+ if (!mmpd_data) {
+ ext4_warning(sb, "not enough memory for mmpd_data");
+ goto failed;
+ }
+ mmpd_data->sb = sb;
+ mmpd_data->bh = bh;
+
+ /*
+ * Start a kernel thread to update the MMP block periodically.
+ */
+ EXT4_SB(sb)->s_mmp_tsk = kthread_run(kmmpd, mmpd_data, "kmmpd-%s",
+ bdevname(bh->b_bdev,
+ mmp->mmp_bdevname));
+ if (IS_ERR(EXT4_SB(sb)->s_mmp_tsk)) {
+ EXT4_SB(sb)->s_mmp_tsk = NULL;
+ kfree(mmpd_data); /* the thread never ran, so free it here */
+ ext4_warning(sb, "Unable to create kmmpd thread for %s.",
+ sb->s_id);
+ goto failed;
+ }
+
+ return 0; /* bh is not released here; it was handed to kmmpd */
+
+failed:
+ brelse(bh);
+ return 1;
+}
+
+
diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c
new file mode 100644
index 00000000..c5826c62
--- /dev/null
+++ b/fs/ext4/move_extent.c
@@ -0,0 +1,1423 @@
+/*
+ * Copyright (c) 2008,2009 NEC Software Tohoku, Ltd.
+ * Written by Takashi Sato <t-sato@yk.jp.nec.com>
+ * Akira Fujita <a-fujita@rs.jp.nec.com>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2.1 of the GNU Lesser General Public License
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ */
+
+#include <linux/fs.h>
+#include <linux/quotaops.h>
+#include <linux/slab.h>
+#include "ext4_jbd2.h"
+#include "ext4.h"
+
+/**
+ * get_ext_path - Look up the extent path covering a logical block.
+ *
+ * @inode:	inode whose extent tree is searched
+ * @lblock:	logical block number to look up
+ * @path:	in: path to hand to ext4_ext_find_extent(); out: resulting
+ *		path, or NULL if the lookup returned an error pointer
+ *
+ * Thin wrapper around ext4_ext_find_extent(). Returns 0 on success, the
+ * error encoded in the returned pointer on lookup failure, or -ENODATA when
+ * the leaf level of the path has no extent.
+ */
+static inline int
+get_ext_path(struct inode *inode, ext4_lblk_t lblock,
+	     struct ext4_ext_path **path)
+{
+	struct ext4_ext_path *found;
+
+	found = ext4_ext_find_extent(inode, lblock, *path);
+	if (IS_ERR(found)) {
+		*path = NULL;
+		return PTR_ERR(found);
+	}
+
+	*path = found;
+	if (found[ext_depth(inode)].p_ext == NULL)
+		return -ENODATA;
+
+	return 0;
+}
+
+/**
+ * copy_extent_status - Make @dest's initialization status match @src's
+ *
+ * @src:	extent whose status is read
+ * @dest:	extent whose status is updated
+ *
+ * If @src is uninitialized, @dest is marked uninitialized; otherwise
+ * @dest's length is rewritten as its actual length.
+ */
+static void
+copy_extent_status(struct ext4_extent *src, struct ext4_extent *dest)
+{
+	if (!ext4_ext_is_uninitialized(src)) {
+		dest->ee_len = cpu_to_le16(ext4_ext_get_actual_len(dest));
+		return;
+	}
+
+	ext4_ext_mark_uninitialized(dest);
+}
+
+/**
+ * mext_next_extent - Search for the next extent and set it to "extent"
+ *
+ * @inode: inode which is searched
+ * @path: this will obtain data for the next extent
+ * @extent: pointer to the next extent we have just gotten
+ *
+ * Search the next extent in the array of ext4_ext_path structure (@path)
+ * and set it to ext4_extent structure (@extent). In addition, the member of
+ * @path (->p_ext) also points the next extent. Return 0 on success, 1 if
+ * ext4_ext_path structure refers to the last extent, or a negative error
+ * value on failure.
+ *
+ * If the current leaf still has a following extent, only the leaf entry of
+ * @path is advanced. Otherwise the function walks up @path to the nearest
+ * level whose index is not the last one, advances that index, and re-reads
+ * the blocks below it with sb_bread() down to the leaf, releasing the
+ * buffers they replace. It returns -EIO if one of those reads fails and
+ * -ENODATA (with *@extent set to NULL) if the leaf reached has no entries.
+ */
+static int
+mext_next_extent(struct inode *inode, struct ext4_ext_path *path,
+ struct ext4_extent **extent)
+{
+ struct ext4_extent_header *eh;
+ int ppos, leaf_ppos = path->p_depth;
+
+ ppos = leaf_ppos;
+ if (EXT_LAST_EXTENT(path[ppos].p_hdr) > path[ppos].p_ext) {
+ /* leaf block */
+ *extent = ++path[ppos].p_ext;
+ path[ppos].p_block = ext4_ext_pblock(path[ppos].p_ext);
+ return 0;
+ }
+
+ while (--ppos >= 0) { /* climb towards the root */
+ if (EXT_LAST_INDEX(path[ppos].p_hdr) >
+ path[ppos].p_idx) {
+ int cur_ppos = ppos;
+
+ /* index block */
+ path[ppos].p_idx++;
+ path[ppos].p_block = ext4_idx_pblock(path[ppos].p_idx);
+ if (path[ppos+1].p_bh)
+ brelse(path[ppos+1].p_bh);
+ path[ppos+1].p_bh =
+ sb_bread(inode->i_sb, path[ppos].p_block);
+ if (!path[ppos+1].p_bh)
+ return -EIO;
+ path[ppos+1].p_hdr =
+ ext_block_hdr(path[ppos+1].p_bh);
+
+ /* Halfway index block */
+ while (++cur_ppos < leaf_ppos) { /* take the first index at each level */
+ path[cur_ppos].p_idx =
+ EXT_FIRST_INDEX(path[cur_ppos].p_hdr);
+ path[cur_ppos].p_block =
+ ext4_idx_pblock(path[cur_ppos].p_idx);
+ if (path[cur_ppos+1].p_bh)
+ brelse(path[cur_ppos+1].p_bh);
+ path[cur_ppos+1].p_bh = sb_bread(inode->i_sb,
+ path[cur_ppos].p_block);
+ if (!path[cur_ppos+1].p_bh)
+ return -EIO;
+ path[cur_ppos+1].p_hdr =
+ ext_block_hdr(path[cur_ppos+1].p_bh);
+ }
+
+ path[leaf_ppos].p_ext = *extent = NULL;
+
+ eh = path[leaf_ppos].p_hdr;
+ if (le16_to_cpu(eh->eh_entries) == 0)
+ /* empty leaf is found */
+ return -ENODATA;
+
+ /* leaf block */
+ path[leaf_ppos].p_ext = *extent =
+ EXT_FIRST_EXTENT(path[leaf_ppos].p_hdr);
+ path[leaf_ppos].p_block =
+ ext4_ext_pblock(path[leaf_ppos].p_ext);
+ return 0;
+ }
+ }
+ /* We found the last extent */
+ return 1;
+}
+
+/**
+ * mext_check_null_inode - Reject a NULL inode in a pair
+ *
+ * @inode1:	first inode of the pair
+ * @inode2:	second inode of the pair
+ * @function:	caller's function name, passed on to __ext4_error()
+ * @line:	caller's line number, passed on to __ext4_error()
+ *
+ * @inode1 is examined first. Whichever inode is found to be NULL, the error
+ * is reported against the other inode's superblock and -EIO is returned.
+ * Returns 0 when neither check fires.
+ */
+static int
+mext_check_null_inode(struct inode *inode1, struct inode *inode2,
+		      const char *function, unsigned int line)
+{
+	if (inode1 == NULL) {
+		__ext4_error(inode2->i_sb, function, line,
+			"Both inodes should not be NULL: "
+			"inode1 NULL inode2 %lu", inode2->i_ino);
+		return -EIO;
+	}
+
+	if (inode2 == NULL) {
+		__ext4_error(inode1->i_sb, function, line,
+			"Both inodes should not be NULL: "
+			"inode1 %lu inode2 NULL", inode1->i_ino);
+		return -EIO;
+	}
+
+	return 0;
+}
+
+/**
+ * double_down_write_data_sem - Write-lock i_data_sem of two inodes
+ *
+ * @orig_inode:		original inode structure
+ * @donor_inode:	donor inode structure
+ *
+ * The inode with the smaller i_ino is locked first; when the donor's i_ino
+ * is not smaller, the original inode goes first. The second lock is taken
+ * with SINGLE_DEPTH_NESTING.
+ */
+static void
+double_down_write_data_sem(struct inode *orig_inode, struct inode *donor_inode)
+{
+	struct inode *outer, *inner;
+
+	/*
+	 * Use the inode number to provide the stable locking order instead
+	 * of its address, because the C language doesn't guarantee you can
+	 * compare pointers that don't come from the same array.
+	 */
+	if (donor_inode->i_ino < orig_inode->i_ino) {
+		outer = donor_inode;
+		inner = orig_inode;
+	} else {
+		outer = orig_inode;
+		inner = donor_inode;
+	}
+
+	down_write(&EXT4_I(outer)->i_data_sem);
+	down_write_nested(&EXT4_I(inner)->i_data_sem, SINGLE_DEPTH_NESTING);
+}
+
+/**
+ * double_up_write_data_sem - Drop the i_data_sem write lock of two inodes
+ *
+ * @orig_inode:		original inode; its lock is released first
+ * @donor_inode:	donor inode; its lock is released second
+ */
+static void
+double_up_write_data_sem(struct inode *orig_inode, struct inode *donor_inode)
+{
+	struct ext4_inode_info *orig_ei = EXT4_I(orig_inode);
+	struct ext4_inode_info *donor_ei = EXT4_I(donor_inode);
+
+	up_write(&orig_ei->i_data_sem);
+	up_write(&donor_ei->i_data_sem);
+}
+
+/**
+ * mext_insert_across_blocks - Insert extents across leaf block
+ *
+ * @handle:		journal handle
+ * @orig_inode:		original inode
+ * @o_start:		first original extent to be changed
+ * @o_end:		last original extent to be changed
+ * @start_ext:		first new extent to be inserted
+ * @new_ext:		middle of new extent to be inserted
+ * @end_ext:		last new extent to be inserted
+ *
+ * Used when the leaf has no room for the extra entries. @o_start / @o_end
+ * are rewritten in place to become the start and/or end pieces, and the
+ * remaining pieces (@new_ext, and @end_ext when @o_start == @o_end in the
+ * three-piece case) are added through ext4_ext_insert_extent().
+ *
+ * Return 0 on success, or a negative error value on failure. This includes
+ * -EIO for a combination of pieces that is not handled, and any error
+ * reported by get_ext_path() or ext4_ext_insert_extent().
+ */
+static int
+mext_insert_across_blocks(handle_t *handle, struct inode *orig_inode,
+		struct ext4_extent *o_start, struct ext4_extent *o_end,
+		struct ext4_extent *start_ext, struct ext4_extent *new_ext,
+		struct ext4_extent *end_ext)
+{
+	struct ext4_ext_path *orig_path = NULL;
+	ext4_lblk_t eblock = 0;
+	int new_flag = 0;
+	int end_flag = 0;
+	int err = 0;
+
+	if (start_ext->ee_len && new_ext->ee_len && end_ext->ee_len) {
+		if (o_start == o_end) {
+
+			/*       start_ext   new_ext    end_ext
+			 * donor |---------|-----------|--------|
+			 * orig  |------------------------------|
+			 */
+			end_flag = 1;
+		} else {
+
+			/*       start_ext   new_ext   end_ext
+			 * donor |---------|----------|---------|
+			 * orig  |---------------|--------------|
+			 */
+			o_end->ee_block = end_ext->ee_block;
+			o_end->ee_len = end_ext->ee_len;
+			ext4_ext_store_pblock(o_end, ext4_ext_pblock(end_ext));
+		}
+
+		o_start->ee_len = start_ext->ee_len;
+		eblock = le32_to_cpu(start_ext->ee_block);
+		new_flag = 1;
+
+	} else if (start_ext->ee_len && new_ext->ee_len &&
+		   !end_ext->ee_len && o_start == o_end) {
+
+		/*	 start_ext	new_ext
+		 * donor |--------------|---------------|
+		 * orig  |------------------------------|
+		 */
+		o_start->ee_len = start_ext->ee_len;
+		eblock = le32_to_cpu(start_ext->ee_block);
+		new_flag = 1;
+
+	} else if (!start_ext->ee_len && new_ext->ee_len &&
+		   end_ext->ee_len && o_start == o_end) {
+
+		/*	  new_ext	end_ext
+		 * donor |--------------|---------------|
+		 * orig  |------------------------------|
+		 */
+		o_end->ee_block = end_ext->ee_block;
+		o_end->ee_len = end_ext->ee_len;
+		ext4_ext_store_pblock(o_end, ext4_ext_pblock(end_ext));
+
+		/*
+		 * Set 0 to the extent block if new_ext was
+		 * the first block.
+		 */
+		if (new_ext->ee_block)
+			eblock = le32_to_cpu(new_ext->ee_block);
+
+		new_flag = 1;
+	} else {
+		ext4_debug("ext4 move extent: Unexpected insert case\n");
+		return -EIO;
+	}
+
+	if (new_flag) {
+		err = get_ext_path(orig_inode, eblock, &orig_path);
+		if (err)
+			goto out;
+
+		/* Keep the error: dropping it would report success. */
+		err = ext4_ext_insert_extent(handle, orig_inode,
+					     orig_path, new_ext, 0);
+		if (err)
+			goto out;
+	}
+
+	if (end_flag) {
+		err = get_ext_path(orig_inode,
+				le32_to_cpu(end_ext->ee_block) - 1, &orig_path);
+		if (err)
+			goto out;
+
+		err = ext4_ext_insert_extent(handle, orig_inode,
+					     orig_path, end_ext, 0);
+		if (err)
+			goto out;
+	}
+out:
+	if (orig_path) {
+		ext4_ext_drop_refs(orig_path);
+		kfree(orig_path);
+	}
+
+	return err;
+}
+
+/**
+ * mext_insert_inside_block - Rewrite leaf entries in place with new extents
+ *
+ * @o_start:		first original extent to be moved
+ * @o_end:		last original extent to be moved
+ * @start_ext:		first new extent to be inserted
+ * @new_ext:		middle of new extent to be inserted
+ * @end_ext:		last new extent to be inserted
+ * @eh:			extent header of target leaf block
+ * @range_to_move:	number of slots the entries after @o_end are shifted
+ *			by; also added to the header's entry count
+ *
+ * The entries following @o_end are shifted first (when @range_to_move is
+ * non-zero and @o_end is not the last extent). The non-empty pieces are
+ * then written consecutively starting at @o_start: only the length is
+ * taken from @start_ext, while @new_ext and @end_ext are copied whole.
+ */
+static void
+mext_insert_inside_block(struct ext4_extent *o_start,
+			 struct ext4_extent *o_end,
+			 struct ext4_extent *start_ext,
+			 struct ext4_extent *new_ext,
+			 struct ext4_extent *end_ext,
+			 struct ext4_extent_header *eh,
+			 int range_to_move)
+{
+	struct ext4_extent *slot = o_start;
+	struct ext4_extent *tail = o_end + 1;
+	unsigned long tail_bytes;
+
+	/* Make room (or close the gap) behind the range being rewritten */
+	if (range_to_move && o_end < EXT_LAST_EXTENT(eh)) {
+		tail_bytes = (unsigned long)(EXT_LAST_EXTENT(eh) + 1) -
+			     (unsigned long)tail;
+		memmove(tail + range_to_move, tail, tail_bytes);
+	}
+
+	/* Leading piece: only the length of the existing entry changes */
+	if (start_ext->ee_len) {
+		slot->ee_len = start_ext->ee_len;
+		slot++;
+	}
+
+	/* Middle piece */
+	if (new_ext->ee_len) {
+		*slot = *new_ext;
+		ext4_ext_store_pblock(slot, ext4_ext_pblock(new_ext));
+		slot++;
+	}
+
+	/* Trailing piece */
+	if (end_ext->ee_len)
+		*slot = *end_ext;
+
+	/* Account for the slots gained or lost in the leaf header */
+	le16_add_cpu(&eh->eh_entries, range_to_move);
+}
+
+/**
+ * mext_insert_extents - Insert new extent
+ *
+ * @handle: journal handle
+ * @orig_inode: original inode
+ * @orig_path: path indicates first extent to be changed
+ * @o_start: first original extent to be changed
+ * @o_end: last original extent to be changed
+ * @start_ext: first new extent to be inserted
+ * @new_ext: middle of new extent to be inserted
+ * @end_ext: last new extent to be inserted
+ *
+ * Call the function to insert extents. If we cannot add more extents into
+ * the leaf block, we call mext_insert_across_blocks() to create a
+ * new leaf block. Otherwise call mext_insert_inside_block(). Return 0
+ * on success, or a negative error value on failure.
+ *
+ * When the path depth is non-zero the leaf's buffer gets journal write
+ * access before the change and is marked dirty afterwards; at depth zero
+ * the inode itself is marked dirty instead.
+ */
+static int
+mext_insert_extents(handle_t *handle, struct inode *orig_inode,
+ struct ext4_ext_path *orig_path,
+ struct ext4_extent *o_start,
+ struct ext4_extent *o_end,
+ struct ext4_extent *start_ext,
+ struct ext4_extent *new_ext,
+ struct ext4_extent *end_ext)
+{
+ struct ext4_extent_header *eh;
+ unsigned long need_slots, slots_range;
+ int range_to_move, depth, ret;
+
+ /*
+ * The extents need to be inserted
+ * start_extent + new_extent + end_extent.
+ */
+ need_slots = (start_ext->ee_len ? 1 : 0) + (end_ext->ee_len ? 1 : 0) +
+ (new_ext->ee_len ? 1 : 0);
+
+ /* The number of slots between start and end */
+ slots_range = ((unsigned long)(o_end + 1) - (unsigned long)o_start + 1)
+ / sizeof(struct ext4_extent);
+
+ /* Range to move the end of extent */
+ range_to_move = need_slots - slots_range; /* net change in leaf entries */
+ depth = orig_path->p_depth;
+ orig_path += depth; /* from here on orig_path is the leaf-level entry */
+ eh = orig_path->p_hdr;
+
+ if (depth) {
+ /* Register to journal */
+ ret = ext4_journal_get_write_access(handle, orig_path->p_bh);
+ if (ret)
+ return ret;
+ }
+
+ /* Expansion */
+ if (range_to_move > 0 &&
+ (range_to_move > le16_to_cpu(eh->eh_max)
+ - le16_to_cpu(eh->eh_entries))) { /* more slots than the leaf has free */
+
+ ret = mext_insert_across_blocks(handle, orig_inode, o_start,
+ o_end, start_ext, new_ext, end_ext);
+ if (ret < 0)
+ return ret;
+ } else
+ mext_insert_inside_block(o_start, o_end, start_ext, new_ext,
+ end_ext, eh, range_to_move);
+
+ if (depth) {
+ ret = ext4_handle_dirty_metadata(handle, orig_inode,
+ orig_path->p_bh);
+ if (ret)
+ return ret;
+ } else {
+ ret = ext4_mark_inode_dirty(handle, orig_inode);
+ if (ret < 0)
+ return ret;
+ }
+
+ return 0;
+}
+
+/**
+ * mext_leaf_block - Move one leaf extent block into the inode.
+ *
+ * @handle: journal handle
+ * @orig_inode: original inode
+ * @orig_path: path indicates first extent to be changed
+ * @dext: donor extent
+ * @from: start offset on the target file
+ *
+ * In order to insert extents into the leaf block, we must divide the extent
+ * in the leaf block into three extents. The one is located to be inserted
+ * extents, and the others are located around it.
+ *
+ * Therefore, this function creates structures to save extents of the leaf
+ * block, and inserts extents by calling mext_insert_extents() with
+ * created extents. Return 0 on success, or a negative error value on failure.
+ *
+ * The new extent takes its logical start from *@from (which is only read)
+ * and its physical block and length from @dext. If the new extent would end
+ * beyond the original extent, the problem is logged with EXT4_ERROR_INODE()
+ * and -EIO is returned without changing anything.
+ */
+static int
+mext_leaf_block(handle_t *handle, struct inode *orig_inode,
+ struct ext4_ext_path *orig_path, struct ext4_extent *dext,
+ ext4_lblk_t *from)
+{
+ struct ext4_extent *oext, *o_start, *o_end, *prev_ext;
+ struct ext4_extent new_ext, start_ext, end_ext;
+ ext4_lblk_t new_ext_end;
+ int oext_alen, new_ext_alen, end_ext_alen;
+ int depth = ext_depth(orig_inode);
+ int ret;
+
+ start_ext.ee_block = end_ext.ee_block = 0;
+ o_start = o_end = oext = orig_path[depth].p_ext;
+ oext_alen = ext4_ext_get_actual_len(oext);
+ start_ext.ee_len = end_ext.ee_len = 0; /* zero length means "piece not present" */
+
+ new_ext.ee_block = cpu_to_le32(*from);
+ ext4_ext_store_pblock(&new_ext, ext4_ext_pblock(dext));
+ new_ext.ee_len = dext->ee_len;
+ new_ext_alen = ext4_ext_get_actual_len(&new_ext);
+ new_ext_end = le32_to_cpu(new_ext.ee_block) + new_ext_alen - 1; /* last block, inclusive */
+
+ /*
+ * Case: original extent is first
+ * oext |--------|
+ * new_ext |--|
+ * start_ext |--|
+ */
+ if (le32_to_cpu(oext->ee_block) < le32_to_cpu(new_ext.ee_block) &&
+ le32_to_cpu(new_ext.ee_block) <
+ le32_to_cpu(oext->ee_block) + oext_alen) {
+ start_ext.ee_len = cpu_to_le16(le32_to_cpu(new_ext.ee_block) -
+ le32_to_cpu(oext->ee_block));
+ start_ext.ee_block = oext->ee_block;
+ copy_extent_status(oext, &start_ext);
+ } else if (oext > EXT_FIRST_EXTENT(orig_path[depth].p_hdr)) {
+ prev_ext = oext - 1;
+ /*
+ * We can merge new_ext into previous extent,
+ * if these are contiguous and same extent type.
+ */
+ if (ext4_can_extents_be_merged(orig_inode, prev_ext,
+ &new_ext)) {
+ o_start = prev_ext;
+ start_ext.ee_len = cpu_to_le16(
+ ext4_ext_get_actual_len(prev_ext) +
+ new_ext_alen);
+ start_ext.ee_block = oext->ee_block;
+ copy_extent_status(prev_ext, &start_ext);
+ new_ext.ee_len = 0; /* folded into start_ext, so no separate middle piece */
+ }
+ }
+
+ /*
+ * Case: new_ext_end must be less than oext
+ * oext |-----------|
+ * new_ext |-------|
+ */
+ if (le32_to_cpu(oext->ee_block) + oext_alen - 1 < new_ext_end) {
+ EXT4_ERROR_INODE(orig_inode,
+ "new_ext_end(%u) should be less than or equal to "
+ "oext->ee_block(%u) + oext_alen(%d) - 1",
+ new_ext_end, le32_to_cpu(oext->ee_block),
+ oext_alen);
+ ret = -EIO;
+ goto out;
+ }
+
+ /*
+ * Case: new_ext is smaller than original extent
+ * oext |---------------|
+ * new_ext |-----------|
+ * end_ext |---|
+ */
+ if (le32_to_cpu(oext->ee_block) <= new_ext_end &&
+ new_ext_end < le32_to_cpu(oext->ee_block) + oext_alen - 1) {
+ end_ext.ee_len =
+ cpu_to_le16(le32_to_cpu(oext->ee_block) +
+ oext_alen - 1 - new_ext_end);
+ copy_extent_status(oext, &end_ext);
+ end_ext_alen = ext4_ext_get_actual_len(&end_ext);
+ ext4_ext_store_pblock(&end_ext,
+ (ext4_ext_pblock(o_end) + oext_alen - end_ext_alen));
+ end_ext.ee_block =
+ cpu_to_le32(le32_to_cpu(o_end->ee_block) +
+ oext_alen - end_ext_alen);
+ }
+
+ ret = mext_insert_extents(handle, orig_inode, orig_path, o_start,
+ o_end, &start_ext, &new_ext, &end_ext);
+out:
+ return ret;
+}
+
+/**
+ * mext_calc_swap_extents - Calculate extents for extent swapping.
+ *
+ * @tmp_dext: the extent that will belong to the original inode
+ * @tmp_oext: the extent that will belong to the donor inode
+ * @orig_off: block offset of original inode
+ * @donor_off: block offset of donor inode
+ * @max_count: the maximum length of extents
+ *
+ * Return 0 on success, or a negative error value on failure.
+ *
+ * @orig_off and @donor_off must be equal (BUG_ON otherwise). -ENODATA is
+ * returned if either extent does not cover @orig_off. On success both
+ * extents have been trimmed in place: @tmp_dext is advanced (logical start,
+ * physical block and length) to begin at @donor_off and capped at
+ * @max_count and at what remains of @tmp_oext; @tmp_oext has its physical
+ * block advanced to @orig_off and its length set to @tmp_dext's actual
+ * length, while its ee_block is left untouched. Finally each extent
+ * receives the initialization status the other one had on entry.
+ */
+static int
+mext_calc_swap_extents(struct ext4_extent *tmp_dext,
+ struct ext4_extent *tmp_oext,
+ ext4_lblk_t orig_off, ext4_lblk_t donor_off,
+ ext4_lblk_t max_count)
+{
+ ext4_lblk_t diff, orig_diff;
+ struct ext4_extent dext_old, oext_old;
+
+ BUG_ON(orig_off != donor_off);
+
+ /* original and donor extents have to cover the same block offset */
+ if (orig_off < le32_to_cpu(tmp_oext->ee_block) ||
+ le32_to_cpu(tmp_oext->ee_block) +
+ ext4_ext_get_actual_len(tmp_oext) - 1 < orig_off)
+ return -ENODATA;
+
+ if (orig_off < le32_to_cpu(tmp_dext->ee_block) ||
+ le32_to_cpu(tmp_dext->ee_block) +
+ ext4_ext_get_actual_len(tmp_dext) - 1 < orig_off)
+ return -ENODATA;
+
+ dext_old = *tmp_dext; /* snapshots, used only for the status swap at the end */
+ oext_old = *tmp_oext;
+
+ /* When tmp_dext is too large, pick up the target range. */
+ diff = donor_off - le32_to_cpu(tmp_dext->ee_block);
+
+ ext4_ext_store_pblock(tmp_dext, ext4_ext_pblock(tmp_dext) + diff);
+ tmp_dext->ee_block =
+ cpu_to_le32(le32_to_cpu(tmp_dext->ee_block) + diff);
+ tmp_dext->ee_len = cpu_to_le16(le16_to_cpu(tmp_dext->ee_len) - diff);
+
+ if (max_count < ext4_ext_get_actual_len(tmp_dext))
+ tmp_dext->ee_len = cpu_to_le16(max_count);
+
+ orig_diff = orig_off - le32_to_cpu(tmp_oext->ee_block);
+ ext4_ext_store_pblock(tmp_oext, ext4_ext_pblock(tmp_oext) + orig_diff);
+
+ /* Adjust extent length if donor extent is larger than orig */
+ if (ext4_ext_get_actual_len(tmp_dext) >
+ ext4_ext_get_actual_len(tmp_oext) - orig_diff)
+ tmp_dext->ee_len = cpu_to_le16(le16_to_cpu(tmp_oext->ee_len) -
+ orig_diff);
+
+ tmp_oext->ee_len = cpu_to_le16(ext4_ext_get_actual_len(tmp_dext));
+
+ copy_extent_status(&oext_old, tmp_dext); /* dext takes the original's status */
+ copy_extent_status(&dext_old, tmp_oext); /* oext takes the donor's status */
+
+ return 0;
+}
+
+/**
+ * mext_replace_branches - Replace original extents with new extents
+ *
+ * @handle: journal handle
+ * @orig_inode: original inode
+ * @donor_inode: donor inode
+ * @from: block offset of orig_inode
+ * @count: block count to be replaced
+ * @err: pointer to save return value
+ *
+ * Replace original inode extents and donor inode extents page by page.
+ * We implement this replacement in the following three steps:
+ * 1. Save the block information of original and donor inodes into
+ * dummy extents.
+ * 2. Change the block information of original inode to point at the
+ * donor inode blocks.
+ * 3. Change the block information of donor inode to point at the saved
+ * original inode blocks in the dummy extents.
+ *
+ * Return replaced block count.
+ *
+ * Both inodes' i_data_sem are held for writing for the whole operation, and
+ * both extent caches are invalidated before the locks are dropped. When a
+ * step fails, *@err receives the error and the number of blocks replaced up
+ * to that point is still returned.
+ */
+static int
+mext_replace_branches(handle_t *handle, struct inode *orig_inode,
+ struct inode *donor_inode, ext4_lblk_t from,
+ ext4_lblk_t count, int *err)
+{
+ struct ext4_ext_path *orig_path = NULL;
+ struct ext4_ext_path *donor_path = NULL;
+ struct ext4_extent *oext, *dext;
+ struct ext4_extent tmp_dext, tmp_oext;
+ ext4_lblk_t orig_off = from, donor_off = from;
+ int depth;
+ int replaced_count = 0;
+ int dext_alen;
+
+ /* Protect extent trees against block allocations via delalloc */
+ double_down_write_data_sem(orig_inode, donor_inode);
+
+ /* Get the original extent for the block "orig_off" */
+ *err = get_ext_path(orig_inode, orig_off, &orig_path);
+ if (*err)
+ goto out;
+
+ /* Get the donor extent for the head */
+ *err = get_ext_path(donor_inode, donor_off, &donor_path);
+ if (*err)
+ goto out;
+ depth = ext_depth(orig_inode);
+ oext = orig_path[depth].p_ext;
+ tmp_oext = *oext; /* work on copies; the tree entries are rewritten below */
+
+ depth = ext_depth(donor_inode);
+ dext = donor_path[depth].p_ext;
+ tmp_dext = *dext;
+
+ *err = mext_calc_swap_extents(&tmp_dext, &tmp_oext, orig_off,
+ donor_off, count);
+ if (*err)
+ goto out;
+
+ /* Loop for the donor extents */
+ while (1) {
+ /* The extent for donor must be found. */
+ if (!dext) {
+ EXT4_ERROR_INODE(donor_inode,
+ "The extent for donor must be found");
+ *err = -EIO;
+ goto out;
+ } else if (donor_off != le32_to_cpu(tmp_dext.ee_block)) {
+ EXT4_ERROR_INODE(donor_inode,
+ "Donor offset(%u) and the first block of donor "
+ "extent(%u) should be equal",
+ donor_off,
+ le32_to_cpu(tmp_dext.ee_block));
+ *err = -EIO;
+ goto out;
+ }
+
+ /* Set donor extent to orig extent */
+ *err = mext_leaf_block(handle, orig_inode,
+ orig_path, &tmp_dext, &orig_off);
+ if (*err)
+ goto out;
+
+ /* Set orig extent to donor extent */
+ *err = mext_leaf_block(handle, donor_inode,
+ donor_path, &tmp_oext, &donor_off);
+ if (*err)
+ goto out;
+
+ dext_alen = ext4_ext_get_actual_len(&tmp_dext);
+ replaced_count += dext_alen;
+ donor_off += dext_alen;
+ orig_off += dext_alen;
+
+ /* Already moved the expected blocks */
+ if (replaced_count >= count)
+ break;
+
+ if (orig_path)
+ ext4_ext_drop_refs(orig_path); /* path array itself is reused by the lookup below */
+ *err = get_ext_path(orig_inode, orig_off, &orig_path);
+ if (*err)
+ goto out;
+ depth = ext_depth(orig_inode);
+ oext = orig_path[depth].p_ext;
+ tmp_oext = *oext;
+
+ if (donor_path)
+ ext4_ext_drop_refs(donor_path);
+ *err = get_ext_path(donor_inode, donor_off, &donor_path);
+ if (*err)
+ goto out;
+ depth = ext_depth(donor_inode);
+ dext = donor_path[depth].p_ext;
+ tmp_dext = *dext;
+
+ *err = mext_calc_swap_extents(&tmp_dext, &tmp_oext, orig_off,
+ donor_off, count - replaced_count);
+ if (*err)
+ goto out;
+ }
+
+out:
+ if (orig_path) {
+ ext4_ext_drop_refs(orig_path);
+ kfree(orig_path);
+ }
+ if (donor_path) {
+ ext4_ext_drop_refs(donor_path);
+ kfree(donor_path);
+ }
+
+ ext4_ext_invalidate_cache(orig_inode);
+ ext4_ext_invalidate_cache(donor_inode);
+
+ double_up_write_data_sem(orig_inode, donor_inode);
+
+ return replaced_count;
+}
+
+/**
+ * move_extent_per_page - Move extent data per page
+ *
+ * @o_filp:			file structure of original file
+ * @donor_inode:		donor inode
+ * @orig_page_offset:		page index on original file
+ * @data_offset_in_page:	block index where data swapping starts
+ * @block_len_in_page:		the number of blocks to be swapped
+ * @uninit:			orig extent is uninitialized or not
+ * @err:			pointer to save return value
+ *
+ * Save the data in original inode blocks and replace original inode extents
+ * with donor inode extents by calling mext_replace_branches().
+ * Finally, write out the saved data in new original inode blocks. Return
+ * replaced block count.
+ *
+ * When @uninit is set no page is touched at all: only the extents are
+ * exchanged.
+ *
+ * If the page holding the original data cannot be brought up to date, no
+ * extent is exchanged, *@err is set to -EIO and 0 is returned.  Without this
+ * check the stale page contents would be written into the new blocks.
+ *
+ * A failure reported by mext_replace_branches() is returned through *@err
+ * even when some blocks were replaced; in that case the replaced blocks are
+ * still written out and counted in the return value.
+ */
+static int
+move_extent_per_page(struct file *o_filp, struct inode *donor_inode,
+		  pgoff_t orig_page_offset, int data_offset_in_page,
+		  int block_len_in_page, int uninit, int *err)
+{
+	struct inode *orig_inode = o_filp->f_dentry->d_inode;
+	struct address_space *mapping = orig_inode->i_mapping;
+	struct buffer_head *bh;
+	struct page *page = NULL;
+	const struct address_space_operations *a_ops = mapping->a_ops;
+	handle_t *handle;
+	ext4_lblk_t orig_blk_offset;
+	long long offs;
+	unsigned long blocksize = orig_inode->i_sb->s_blocksize;
+	unsigned int w_flags = 0;
+	unsigned int tmp_data_size, data_size, replaced_size;
+	void *fsdata;
+	int i, jblocks;
+	int err2 = 0;
+	int replaced_count = 0;
+	int blocks_per_page = PAGE_CACHE_SIZE >> orig_inode->i_blkbits;
+
+	/*
+	 * It needs twice the amount of ordinary journal buffers because
+	 * inode and donor_inode may change each different metadata blocks.
+	 */
+	jblocks = ext4_writepage_trans_blocks(orig_inode) * 2;
+	handle = ext4_journal_start(orig_inode, jblocks);
+	if (IS_ERR(handle)) {
+		*err = PTR_ERR(handle);
+		return 0;
+	}
+
+	if (segment_eq(get_fs(), KERNEL_DS))
+		w_flags |= AOP_FLAG_UNINTERRUPTIBLE;
+
+	orig_blk_offset = orig_page_offset * blocks_per_page +
+		data_offset_in_page;
+
+	/*
+	 * If orig extent is uninitialized one,
+	 * it's not necessary force the page into memory
+	 * and then force it to be written out again.
+	 * Just swap data blocks between orig and donor.
+	 */
+	if (uninit) {
+		replaced_count = mext_replace_branches(handle, orig_inode,
+						donor_inode, orig_blk_offset,
+						block_len_in_page, err);
+		goto out2;
+	}
+
+	/* Byte offset of the first block to swap */
+	offs = (long long)orig_blk_offset << orig_inode->i_blkbits;
+
+	/* Calculate data_size */
+	if ((orig_blk_offset + block_len_in_page - 1) ==
+	    ((orig_inode->i_size - 1) >> orig_inode->i_blkbits)) {
+		/* Replace the last block */
+		tmp_data_size = orig_inode->i_size & (blocksize - 1);
+		/*
+		 * If data_size equal zero, it shows data_size is multiples of
+		 * blocksize. So we set appropriate value.
+		 */
+		if (tmp_data_size == 0)
+			tmp_data_size = blocksize;
+
+		data_size = tmp_data_size +
+			((block_len_in_page - 1) << orig_inode->i_blkbits);
+	} else
+		data_size = block_len_in_page << orig_inode->i_blkbits;
+
+	replaced_size = data_size;
+
+	*err = a_ops->write_begin(o_filp, mapping, offs, data_size, w_flags,
+				  &page, &fsdata);
+	if (unlikely(*err < 0))
+		goto out;
+
+	if (!PageUptodate(page)) {
+		mapping->a_ops->readpage(o_filp, page);
+		lock_page(page);
+		/*
+		 * The page is re-locked once the read has finished.  If it is
+		 * still not uptodate the read failed, and the page must not be
+		 * written to the donor blocks: bail out before any extent is
+		 * exchanged.
+		 */
+		if (!PageUptodate(page)) {
+			*err = -EIO;
+			goto out;
+		}
+	}
+
+	/*
+	 * try_to_release_page() doesn't call releasepage in writeback mode.
+	 * We should care about the order of writing to the same file
+	 * by multiple move extent processes.
+	 * It needs to call wait_on_page_writeback() to wait for the
+	 * writeback of the page.
+	 */
+	wait_on_page_writeback(page);
+
+	/* Release old bh and drop refs */
+	try_to_release_page(page, 0);
+
+	replaced_count = mext_replace_branches(handle, orig_inode, donor_inode,
+					orig_blk_offset, block_len_in_page,
+					&err2);
+	if (err2) {
+		if (replaced_count) {
+			/* Partial success: only write out what was swapped */
+			block_len_in_page = replaced_count;
+			replaced_size =
+				block_len_in_page << orig_inode->i_blkbits;
+		} else
+			goto out;
+	}
+
+	if (!page_has_buffers(page))
+		create_empty_buffers(page, 1 << orig_inode->i_blkbits, 0);
+
+	/* Skip the buffers in front of the first swapped block */
+	bh = page_buffers(page);
+	for (i = 0; i < data_offset_in_page; i++)
+		bh = bh->b_this_page;
+
+	/* Map every swapped buffer to its new (former donor) block */
+	for (i = 0; i < block_len_in_page; i++) {
+		*err = ext4_get_block(orig_inode,
+				(sector_t)(orig_blk_offset + i), bh, 0);
+		if (*err < 0)
+			goto out;
+
+		if (bh->b_this_page != NULL)
+			bh = bh->b_this_page;
+	}
+
+	*err = a_ops->write_end(o_filp, mapping, offs, data_size, replaced_size,
+			       page, fsdata);
+	page = NULL;
+
+out:
+	if (unlikely(page)) {
+		/* ->write_begin() succeeded but ->write_end() was never run */
+		if (PageLocked(page))
+			unlock_page(page);
+		page_cache_release(page);
+		/*
+		 * NOTE(review): this extra stop presumably drops the handle
+		 * reference taken inside ->write_begin(), which ->write_end()
+		 * would otherwise have released - confirm before touching.
+		 */
+		ext4_journal_stop(handle);
+	}
+out2:
+	ext4_journal_stop(handle);
+
+	if (err2)
+		*err = err2;
+
+	return replaced_count;
+}
+
+/**
+ * mext_check_arguments - Check whether move extent can be done
+ *
+ * @orig_inode: original inode
+ * @donor_inode: donor inode
+ * @orig_start: logical start offset in block for orig
+ * @donor_start: logical start offset in block for donor
+ * @len: pointer to the number of blocks to be moved; shortened in place
+ * when the requested range runs past the end of the smaller file
+ *
+ * Check the arguments of ext4_move_extents() whether the files can be
+ * exchanged with each other.
+ * Return 0 on success, or a negative error value on failure.
+ *
+ * The checks run in the order below; the first one that fails decides the
+ * returned error:
+ * -EINVAL donor has S_ISUID or S_ISGID set
+ * -EPERM donor is immutable or append-only
+ * -EINVAL either file is a swapfile
+ * -EINVAL the files are on different super blocks
+ * -EOPNOTSUPP either file is not extent based
+ * -EINVAL either file has i_size == 0
+ * -EINVAL @orig_start differs from @donor_start
+ * -EINVAL a start, *@len or the end of the range reaches EXT_MAX_BLOCKS
+ * -EINVAL @orig_start lies at or beyond the block count of the smaller file
+ * -EINVAL *@len is 0 (after the possible shortening)
+ */
+static int
+mext_check_arguments(struct inode *orig_inode,
+ struct inode *donor_inode, __u64 orig_start,
+ __u64 donor_start, __u64 *len)
+{
+ ext4_lblk_t orig_blocks, donor_blocks;
+ unsigned int blkbits = orig_inode->i_blkbits;
+ unsigned int blocksize = 1 << blkbits;
+
+ if (donor_inode->i_mode & (S_ISUID|S_ISGID)) {
+ ext4_debug("ext4 move extent: suid or sgid is set"
+ " to donor file [ino:orig %lu, donor %lu]\n",
+ orig_inode->i_ino, donor_inode->i_ino);
+ return -EINVAL;
+ }
+
+ if (IS_IMMUTABLE(donor_inode) || IS_APPEND(donor_inode))
+ return -EPERM;
+
+ /* Ext4 move extent does not support swapfile */
+ if (IS_SWAPFILE(orig_inode) || IS_SWAPFILE(donor_inode)) {
+ ext4_debug("ext4 move extent: The argument files should "
+ "not be swapfile [ino:orig %lu, donor %lu]\n",
+ orig_inode->i_ino, donor_inode->i_ino);
+ return -EINVAL;
+ }
+
+ /* Files should be in the same ext4 FS */
+ if (orig_inode->i_sb != donor_inode->i_sb) {
+ ext4_debug("ext4 move extent: The argument files "
+ "should be in same FS [ino:orig %lu, donor %lu]\n",
+ orig_inode->i_ino, donor_inode->i_ino);
+ return -EINVAL;
+ }
+
+ /* Ext4 move extent supports only extent based file */
+ if (!(ext4_test_inode_flag(orig_inode, EXT4_INODE_EXTENTS))) {
+ ext4_debug("ext4 move extent: orig file is not extents "
+ "based file [ino:orig %lu]\n", orig_inode->i_ino);
+ return -EOPNOTSUPP;
+ } else if (!(ext4_test_inode_flag(donor_inode, EXT4_INODE_EXTENTS))) {
+ ext4_debug("ext4 move extent: donor file is not extents "
+ "based file [ino:donor %lu]\n", donor_inode->i_ino);
+ return -EOPNOTSUPP;
+ }
+
+ if ((!orig_inode->i_size) || (!donor_inode->i_size)) {
+ ext4_debug("ext4 move extent: File size is 0 byte\n");
+ return -EINVAL;
+ }
+
+ /* Start offset should be same */
+ if (orig_start != donor_start) {
+ ext4_debug("ext4 move extent: orig and donor's start "
+ "offset are not same [ino:orig %lu, donor %lu]\n",
+ orig_inode->i_ino, donor_inode->i_ino);
+ return -EINVAL;
+ }
+
+ if ((orig_start >= EXT_MAX_BLOCKS) ||
+ (donor_start >= EXT_MAX_BLOCKS) ||
+ (*len > EXT_MAX_BLOCKS) ||
+ (orig_start + *len >= EXT_MAX_BLOCKS)) {
+ ext4_debug("ext4 move extent: Can't handle over [%u] blocks "
+ "[ino:orig %lu, donor %lu]\n", EXT_MAX_BLOCKS,
+ orig_inode->i_ino, donor_inode->i_ino);
+ return -EINVAL;
+ }
+
+ if (orig_inode->i_size > donor_inode->i_size) { /* donor is the smaller file: it bounds the range */
+ donor_blocks = (donor_inode->i_size + blocksize - 1) >> blkbits; /* i_size rounded up to whole blocks */
+ /* TODO: eliminate this artificial restriction */
+ if (orig_start >= donor_blocks) {
+ ext4_debug("ext4 move extent: orig start offset "
+ "[%llu] should be less than donor file blocks "
+ "[%u] [ino:orig %lu, donor %lu]\n",
+ orig_start, donor_blocks,
+ orig_inode->i_ino, donor_inode->i_ino);
+ return -EINVAL;
+ }
+
+ /* TODO: eliminate this artificial restriction */
+ if (orig_start + *len > donor_blocks) {
+ ext4_debug("ext4 move extent: End offset [%llu] should "
+ "be less than donor file blocks [%u]."
+ "So adjust length from %llu to %llu "
+ "[ino:orig %lu, donor %lu]\n",
+ orig_start + *len, donor_blocks,
+ *len, donor_blocks - orig_start,
+ orig_inode->i_ino, donor_inode->i_ino);
+ *len = donor_blocks - orig_start;
+ }
+ } else { /* orig is not larger than donor: orig bounds the range */
+ orig_blocks = (orig_inode->i_size + blocksize - 1) >> blkbits; /* i_size rounded up to whole blocks */
+ if (orig_start >= orig_blocks) {
+ ext4_debug("ext4 move extent: start offset [%llu] "
+ "should be less than original file blocks "
+ "[%u] [ino:orig %lu, donor %lu]\n",
+ orig_start, orig_blocks,
+ orig_inode->i_ino, donor_inode->i_ino);
+ return -EINVAL;
+ }
+
+ if (orig_start + *len > orig_blocks) {
+ ext4_debug("ext4 move extent: Adjust length "
+ "from %llu to %llu. Because it should be "
+ "less than original file blocks "
+ "[ino:orig %lu, donor %lu]\n",
+ *len, orig_blocks - orig_start,
+ orig_inode->i_ino, donor_inode->i_ino);
+ *len = orig_blocks - orig_start;
+ }
+ }
+
+ if (!*len) {
+ ext4_debug("ext4 move extent: len should not be 0 "
+ "[ino:orig %lu, donor %lu]\n", orig_inode->i_ino,
+ donor_inode->i_ino);
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+/**
+ * mext_inode_double_lock - Take i_mutex on both @inode1 and @inode2
+ *
+ * @inode1:	the inode structure
+ * @inode2:	the inode structure
+ *
+ * The inode with the smaller i_ino is locked first (I_MUTEX_PARENT class),
+ * the other one second (I_MUTEX_CHILD class).  If both arguments are the
+ * same inode its mutex is taken exactly once.
+ *
+ * When mext_check_null_inode() reports an error (a NULL inode) nothing is
+ * locked and that negative value is returned; otherwise its result is
+ * returned with the lock(s) held.
+ */
+static int
+mext_inode_double_lock(struct inode *inode1, struct inode *inode2)
+{
+	struct inode *first, *second;
+	int ret;
+
+	BUG_ON(inode1 == NULL && inode2 == NULL);
+
+	ret = mext_check_null_inode(inode1, inode2, __func__, __LINE__);
+	if (ret < 0)
+		return ret;
+
+	if (inode1 == inode2) {
+		mutex_lock(&inode1->i_mutex);
+		return ret;
+	}
+
+	/* Fixed ordering by inode number avoids ABBA deadlocks */
+	if (inode1->i_ino < inode2->i_ino) {
+		first = inode1;
+		second = inode2;
+	} else {
+		first = inode2;
+		second = inode1;
+	}
+	mutex_lock_nested(&first->i_mutex, I_MUTEX_PARENT);
+	mutex_lock_nested(&second->i_mutex, I_MUTEX_CHILD);
+
+	return ret;
+}
+
+/**
+ * mext_inode_double_unlock - Release i_mutex on both @inode1 and @inode2
+ *
+ * @inode1:	the inode that is released first
+ * @inode2:	the inode that is released second
+ *
+ * @inode2 is skipped when it is the same inode as @inode1, so a mutex is
+ * never released twice.
+ *
+ * When mext_check_null_inode() reports an error (a NULL inode) nothing is
+ * unlocked and that negative value is returned; otherwise its result is
+ * returned after the unlock.
+ */
+
+static int
+mext_inode_double_unlock(struct inode *inode1, struct inode *inode2)
+{
+	int ret;
+
+	BUG_ON(inode1 == NULL && inode2 == NULL);
+
+	ret = mext_check_null_inode(inode1, inode2, __func__, __LINE__);
+	if (ret < 0)
+		return ret;
+
+	if (inode1)
+		mutex_unlock(&inode1->i_mutex);
+
+	if (inode2 && inode2 != inode1)
+		mutex_unlock(&inode2->i_mutex);
+
+	return ret;
+}
+
+/**
+ * ext4_move_extents - Exchange the specified range of a file
+ *
+ * @o_filp: file structure of the original file
+ * @d_filp: file structure of the donor file
+ * @orig_start: start offset in block for orig
+ * @donor_start: start offset in block for donor
+ * @len: the number of blocks to be moved
+ * @moved_len: moved block length; the count of exchanged blocks is
+ * added to the value already stored here (presumably zeroed by the
+ * caller - confirm at the call site)
+ *
+ * This function returns 0 and moved block length is set in moved_len
+ * if succeed, otherwise returns error value.
+ * Blocks exchanged before an error occurred stay exchanged and are still
+ * counted in moved_len.
+ *
+ * Note: ext4_move_extents() proceeds the following order.
+ * 1:ext4_move_extents() calculates the last block number of moving extent
+ * function by the start block number (orig_start) and the number of blocks
+ * to be moved (len) specified as arguments.
+ * If the {orig, donor}_start points a hole, the extent's start offset
+ * pointed by ext_cur (current extent), holecheck_path, orig_path are set
+ * after hole behind.
+ * 2:Continue step 3 to step 5, until the holecheck_path points to last_extent
+ * or the ext_cur exceeds the block_end which is last logical block number.
+ * 3:To get the length of the contiguous area, call mext_next_extent()
+ * specified with the ext_cur (initial value is holecheck_path) repeatedly,
+ * until a non-contiguous extent is found, the start logical block number
+ * exceeds the block_end or the extent points to the last extent.
+ * 4:Exchange the original inode data with donor inode data
+ * from orig_page_offset to seq_end_page.
+ * The start indexes of data are specified as arguments.
+ * That of the original inode is orig_page_offset,
+ * and the donor inode is also orig_page_offset
+ * (To easily handle blocksize != pagesize case, the offset for the
+ * donor inode is block unit).
+ * 5:Update holecheck_path and orig_path to points a next proceeding extent,
+ * then returns to step 2.
+ * 6:Release holecheck_path, orig_path and set the len to moved_len
+ * which shows the number of moved blocks.
+ * The moved_len is useful for the command to calculate the file offset
+ * for starting next move extent ioctl.
+ * 7:Return 0 on success, or a negative error value on failure.
+ */
+int
+ext4_move_extents(struct file *o_filp, struct file *d_filp,
+ __u64 orig_start, __u64 donor_start, __u64 len,
+ __u64 *moved_len)
+{
+ struct inode *orig_inode = o_filp->f_dentry->d_inode;
+ struct inode *donor_inode = d_filp->f_dentry->d_inode;
+ struct ext4_ext_path *orig_path = NULL, *holecheck_path = NULL;
+ struct ext4_extent *ext_prev, *ext_cur, *ext_dummy;
+ ext4_lblk_t block_start = orig_start;
+ ext4_lblk_t block_end, seq_start, add_blocks, file_end, seq_blocks = 0;
+ ext4_lblk_t rest_blocks;
+ pgoff_t orig_page_offset = 0, seq_end_page;
+ int ret1, ret2, depth, last_extent = 0;
+ int blocks_per_page = PAGE_CACHE_SIZE >> orig_inode->i_blkbits;
+ int data_offset_in_page;
+ int block_len_in_page;
+ int uninit;
+
+ /* orig and donor should be different file */
+ if (orig_inode->i_ino == donor_inode->i_ino) {
+ ext4_debug("ext4 move extent: The argument files should not "
+ "be same file [ino:orig %lu, donor %lu]\n",
+ orig_inode->i_ino, donor_inode->i_ino);
+ return -EINVAL;
+ }
+
+ /* Regular file check */
+ if (!S_ISREG(orig_inode->i_mode) || !S_ISREG(donor_inode->i_mode)) {
+ ext4_debug("ext4 move extent: The argument files should be "
+ "regular file [ino:orig %lu, donor %lu]\n",
+ orig_inode->i_ino, donor_inode->i_ino);
+ return -EINVAL;
+ }
+
+ /* Protect orig and donor inodes against a truncate */
+ ret1 = mext_inode_double_lock(orig_inode, donor_inode);
+ if (ret1 < 0)
+ return ret1;
+
+ /* Protect extent tree against block allocations via delalloc */
+ double_down_write_data_sem(orig_inode, donor_inode);
+ /* Check the filesystem environment whether move_extent can be done */
+ ret1 = mext_check_arguments(orig_inode, donor_inode, orig_start,
+ donor_start, &len);
+ if (ret1)
+ goto out;
+
+ file_end = (i_size_read(orig_inode) - 1) >> orig_inode->i_blkbits; /* last block of the original file */
+ block_end = block_start + len - 1;
+ if (file_end < block_end)
+ len -= block_end - file_end;
+
+ ret1 = get_ext_path(orig_inode, block_start, &orig_path);
+ if (ret1)
+ goto out;
+
+ /* Get path structure to check the hole */
+ ret1 = get_ext_path(orig_inode, block_start, &holecheck_path);
+ if (ret1)
+ goto out;
+
+ depth = ext_depth(orig_inode);
+ ext_cur = holecheck_path[depth].p_ext;
+
+ /*
+ * Get proper starting location of block replacement if block_start was
+ * within the hole.
+ */
+ if (le32_to_cpu(ext_cur->ee_block) +
+ ext4_ext_get_actual_len(ext_cur) - 1 < block_start) {
+ /*
+ * The hole exists between extents or the tail of
+ * original file.
+ */
+ last_extent = mext_next_extent(orig_inode,
+ holecheck_path, &ext_cur);
+ if (last_extent < 0) {
+ ret1 = last_extent;
+ goto out;
+ }
+ last_extent = mext_next_extent(orig_inode, orig_path,
+ &ext_dummy);
+ if (last_extent < 0) {
+ ret1 = last_extent;
+ goto out;
+ }
+ seq_start = le32_to_cpu(ext_cur->ee_block);
+ } else if (le32_to_cpu(ext_cur->ee_block) > block_start)
+ /* The hole exists at the beginning of original file. */
+ seq_start = le32_to_cpu(ext_cur->ee_block);
+ else
+ seq_start = block_start;
+
+ /* No blocks within the specified range. */
+ if (le32_to_cpu(ext_cur->ee_block) > block_end) {
+ ext4_debug("ext4 move extent: The specified range of file "
+ "may be the hole\n");
+ ret1 = -EINVAL;
+ goto out;
+ }
+
+ /* Adjust start blocks */
+ add_blocks = min(le32_to_cpu(ext_cur->ee_block) +
+ ext4_ext_get_actual_len(ext_cur), block_end + 1) -
+ max(le32_to_cpu(ext_cur->ee_block), block_start);
+
+ while (!last_extent && le32_to_cpu(ext_cur->ee_block) <= block_end) {
+ seq_blocks += add_blocks;
+
+ /* Adjust tail blocks */
+ if (seq_start + seq_blocks - 1 > block_end)
+ seq_blocks = block_end - seq_start + 1;
+
+ ext_prev = ext_cur;
+ last_extent = mext_next_extent(orig_inode, holecheck_path,
+ &ext_cur);
+ if (last_extent < 0) {
+ ret1 = last_extent;
+ break;
+ }
+ add_blocks = ext4_ext_get_actual_len(ext_cur);
+
+ /*
+ * Extend the length of contiguous block (seq_blocks)
+ * if extents are contiguous.
+ */
+ if (ext4_can_extents_be_merged(orig_inode,
+ ext_prev, ext_cur) &&
+ block_end >= le32_to_cpu(ext_cur->ee_block) &&
+ !last_extent)
+ continue;
+
+ /* Is original extent is uninitialized */
+ uninit = ext4_ext_is_uninitialized(ext_prev);
+
+ data_offset_in_page = seq_start % blocks_per_page;
+
+ /*
+ * Calculate data blocks count that should be swapped
+ * at the first page.
+ */
+ if (data_offset_in_page + seq_blocks > blocks_per_page) {
+ /* Swapped blocks are across pages */
+ block_len_in_page =
+ blocks_per_page - data_offset_in_page;
+ } else {
+ /* Swapped blocks are in a page */
+ block_len_in_page = seq_blocks;
+ }
+
+ orig_page_offset = seq_start >>
+ (PAGE_CACHE_SHIFT - orig_inode->i_blkbits);
+ seq_end_page = (seq_start + seq_blocks - 1) >>
+ (PAGE_CACHE_SHIFT - orig_inode->i_blkbits);
+ seq_start = le32_to_cpu(ext_cur->ee_block); /* start of the next run, used to re-read the paths below */
+ rest_blocks = seq_blocks;
+
+ /*
+ * Up semaphore to avoid following problems:
+ * a. transaction deadlock among ext4_journal_start,
+ * ->write_begin via pagefault, and jbd2_journal_commit
+ * b. racing with ->readpage, ->write_begin, and ext4_get_block
+ * in move_extent_per_page
+ */
+ double_up_write_data_sem(orig_inode, donor_inode);
+
+ while (orig_page_offset <= seq_end_page) {
+
+ /* Swap original branches with new branches */
+ block_len_in_page = move_extent_per_page(
+ o_filp, donor_inode,
+ orig_page_offset,
+ data_offset_in_page,
+ block_len_in_page, uninit,
+ &ret1);
+
+ /* Count how many blocks we have exchanged */
+ *moved_len += block_len_in_page;
+ if (ret1 < 0)
+ break;
+ if (*moved_len > len) {
+ EXT4_ERROR_INODE(orig_inode,
+ "We replaced blocks too much! "
+ "sum of replaced: %llu requested: %llu",
+ *moved_len, len);
+ ret1 = -EIO;
+ break;
+ }
+
+ orig_page_offset++;
+ data_offset_in_page = 0; /* every page after the first starts at its first block */
+ rest_blocks -= block_len_in_page;
+ if (rest_blocks > blocks_per_page)
+ block_len_in_page = blocks_per_page;
+ else
+ block_len_in_page = rest_blocks;
+ }
+
+ double_down_write_data_sem(orig_inode, donor_inode); /* retaken before breaking so "out" can always release it */
+ if (ret1 < 0)
+ break;
+
+ /* Decrease buffer counter */
+ if (holecheck_path)
+ ext4_ext_drop_refs(holecheck_path);
+ ret1 = get_ext_path(orig_inode, seq_start, &holecheck_path);
+ if (ret1)
+ break;
+ depth = holecheck_path->p_depth;
+
+ /* Decrease buffer counter */
+ if (orig_path)
+ ext4_ext_drop_refs(orig_path);
+ ret1 = get_ext_path(orig_inode, seq_start, &orig_path);
+ if (ret1)
+ break;
+
+ ext_cur = holecheck_path[depth].p_ext;
+ add_blocks = ext4_ext_get_actual_len(ext_cur);
+ seq_blocks = 0;
+
+ }
+out:
+ if (*moved_len) {
+ ext4_discard_preallocations(orig_inode);
+ ext4_discard_preallocations(donor_inode);
+ }
+
+ if (orig_path) {
+ ext4_ext_drop_refs(orig_path);
+ kfree(orig_path);
+ }
+ if (holecheck_path) {
+ ext4_ext_drop_refs(holecheck_path);
+ kfree(holecheck_path);
+ }
+ double_up_write_data_sem(orig_inode, donor_inode);
+ ret2 = mext_inode_double_unlock(orig_inode, donor_inode);
+
+ if (ret1) /* an error from the move itself takes precedence over an unlock error */
+ return ret1;
+ else if (ret2)
+ return ret2;
+
+ return 0;
+}
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
new file mode 100644
index 00000000..0a94cbbe
--- /dev/null
+++ b/fs/ext4/namei.c
@@ -0,0 +1,2607 @@
+/*
+ * linux/fs/ext4/namei.c
+ *
+ * Copyright (C) 1992, 1993, 1994, 1995
+ * Remy Card (card@masi.ibp.fr)
+ * Laboratoire MASI - Institut Blaise Pascal
+ * Universite Pierre et Marie Curie (Paris VI)
+ *
+ * from
+ *
+ * linux/fs/minix/namei.c
+ *
+ * Copyright (C) 1991, 1992 Linus Torvalds
+ *
+ * Big-endian to little-endian byte-swapping/bitmaps by
+ * David S. Miller (davem@caip.rutgers.edu), 1995
+ * Directory entry file type support and forward compatibility hooks
+ * for B-tree directories by Theodore Ts'o (tytso@mit.edu), 1998
+ * Hash Tree Directory indexing (c)
+ * Daniel Phillips, 2001
+ * Hash Tree Directory indexing porting
+ * Christopher Li, 2002
+ * Hash Tree Directory indexing cleanup
+ * Theodore Ts'o, 2002
+ */
+
+#include <linux/fs.h>
+#include <linux/pagemap.h>
+#include <linux/jbd2.h>
+#include <linux/time.h>
+#include <linux/fcntl.h>
+#include <linux/stat.h>
+#include <linux/string.h>
+#include <linux/quotaops.h>
+#include <linux/buffer_head.h>
+#include <linux/bio.h>
+#include "ext4.h"
+#include "ext4_jbd2.h"
+
+#include "xattr.h"
+#include "acl.h"
+
+#include <trace/events/ext4.h>
+/*
+ * define how far ahead to read directories while searching them.
+ *
+ * NAMEI_RA_SIZE is the product of the chunk count and the blocks per
+ * chunk; NAMEI_RA_INDEX(c,b) turns chunk c / block b into a flat index
+ * in the range 0 .. NAMEI_RA_SIZE - 1.
+ */
+#define NAMEI_RA_CHUNKS 2
+#define NAMEI_RA_BLOCKS 4
+#define NAMEI_RA_SIZE (NAMEI_RA_CHUNKS * NAMEI_RA_BLOCKS)
+#define NAMEI_RA_INDEX(c,b) (((c) * NAMEI_RA_BLOCKS) + (b))
+
+/*
+ * Add one block at the current end of @inode and hand back its buffer.
+ *
+ * *@block receives the logical number of the new block (i_size expressed in
+ * blocks).  Once ext4_bread() has delivered the buffer, i_size and i_disksize
+ * grow by one block and journal write access is requested for the buffer.
+ *
+ * Returns the buffer on success.  Returns NULL when ext4_bread() yields no
+ * buffer, or when write access is refused; in the latter case the buffer is
+ * released, *@err holds the error and the size increase is left in place.
+ */
+static struct buffer_head *ext4_append(handle_t *handle,
+					struct inode *inode,
+					ext4_lblk_t *block, int *err)
+{
+	struct super_block *sb = inode->i_sb;
+	struct buffer_head *bh;
+
+	*block = inode->i_size >> sb->s_blocksize_bits;
+
+	bh = ext4_bread(handle, inode, *block, 1, err);
+	if (!bh)
+		return NULL;
+
+	inode->i_size += sb->s_blocksize;
+	EXT4_I(inode)->i_disksize = inode->i_size;
+
+	*err = ext4_journal_get_write_access(handle, bh);
+	if (*err) {
+		brelse(bh);
+		return NULL;
+	}
+	return bh;
+}
+
+#ifndef assert /* map assert() onto J_ASSERT() unless something already defined it */
+#define assert(test) J_ASSERT(test)
+#endif
+
+#ifdef DX_DEBUG /* dxtrace(x) expands to x only when DX_DEBUG is defined */
+#define dxtrace(command) command
+#else
+#define dxtrace(command)
+#endif
+
+struct fake_dirent /* fixed-size head of a directory entry, without name bytes; embedded in dx_root and dx_node */
+{
+ __le32 inode; /* little-endian on disk */
+ __le16 rec_len; /* little-endian on disk */
+ u8 name_len;
+ u8 file_type;
+};
+
+struct dx_countlimit /* header overlaid on the first dx_entry of an index block, see dx_get_count()/dx_get_limit() */
+{
+ __le16 limit; /* read by dx_get_limit(), written by dx_set_limit() */
+ __le16 count; /* read by dx_get_count(), written by dx_set_count() */
+};
+
+struct dx_entry /* one index slot: a hash value paired with a directory block number */
+{
+ __le32 hash; /* accessed through dx_get_hash()/dx_set_hash() */
+ __le32 block; /* accessed through dx_get_block()/dx_set_block() */
+};
+
+/*
+ * dx_root_info is laid out so that if it should somehow get overlaid by a
+ * dirent the two low bits of the hash version will be zero. Therefore, the
+ * hash version mod 4 should never be 0. Sincerely, the paranoia department.
+ *
+ * struct dx_root is the layout dx_probe() applies to block 0 of an indexed
+ * directory: two fake dirents with 4-byte name fields (named for "." and
+ * ".."), the info header, then the index entries.  dx_probe() does not use
+ * the entries[] member directly; it locates the entries at
+ * (char *)&info + info.info_length.
+ */
+
+struct dx_root
+{
+ struct fake_dirent dot;
+ char dot_name[4];
+ struct fake_dirent dotdot;
+ char dotdot_name[4];
+ struct dx_root_info
+ {
+ __le32 reserved_zero;
+ u8 hash_version; /* dx_probe() accepts DX_HASH_TEA, DX_HASH_HALF_MD4 and DX_HASH_LEGACY */
+ u8 info_length; /* 8 */
+ u8 indirect_levels; /* dx_probe() rejects values above 1 */
+ u8 unused_flags; /* dx_probe() rejects the block when bit 0 is set */
+ }
+ info;
+ struct dx_entry entries[0];
+};
+
+struct dx_node /* layout of a non-root index block: one fake dirent followed by the index entries */
+{
+ struct fake_dirent fake;
+ struct dx_entry entries[0]; /* entries[0] doubles as the dx_countlimit header */
+};
+
+
+struct dx_frame /* one level of an index lookup, filled in by dx_probe() */
+{
+ struct buffer_head *bh; /* buffer holding this index block; NULL while unset */
+ struct dx_entry *entries; /* first entry of the block (also its count/limit header) */
+ struct dx_entry *at; /* entry currently selected within the block */
+};
+
+struct dx_map_entry /* element type of the map[] array passed to dx_make_map() and dx_sort_map() */
+{
+ u32 hash;
+ u16 offs; /* NOTE(review): presumably the entry's byte offset inside its block - confirm in dx_make_map() */
+ u16 size; /* NOTE(review): presumably the entry's record length - confirm in dx_make_map() */
+};
+
+static inline ext4_lblk_t dx_get_block(struct dx_entry *entry);
+static void dx_set_block(struct dx_entry *entry, ext4_lblk_t value);
+static inline unsigned dx_get_hash(struct dx_entry *entry);
+static void dx_set_hash(struct dx_entry *entry, unsigned value);
+static unsigned dx_get_count(struct dx_entry *entries);
+static unsigned dx_get_limit(struct dx_entry *entries);
+static void dx_set_count(struct dx_entry *entries, unsigned value);
+static void dx_set_limit(struct dx_entry *entries, unsigned value);
+static unsigned dx_root_limit(struct inode *dir, unsigned infosize);
+static unsigned dx_node_limit(struct inode *dir);
+static struct dx_frame *dx_probe(const struct qstr *d_name,
+ struct inode *dir,
+ struct dx_hash_info *hinfo,
+ struct dx_frame *frame,
+ int *err);
+static void dx_release(struct dx_frame *frames);
+static int dx_make_map(struct ext4_dir_entry_2 *de, unsigned blocksize,
+ struct dx_hash_info *hinfo, struct dx_map_entry map[]);
+static void dx_sort_map(struct dx_map_entry *map, unsigned count);
+static struct ext4_dir_entry_2 *dx_move_dirents(char *from, char *to,
+ struct dx_map_entry *offsets, int count, unsigned blocksize);
+static struct ext4_dir_entry_2* dx_pack_dirents(char *base, unsigned blocksize);
+static void dx_insert_block(struct dx_frame *frame,
+ u32 hash, ext4_lblk_t block);
+static int ext4_htree_next_block(struct inode *dir, __u32 hash,
+ struct dx_frame *frame,
+ struct dx_frame *frames,
+ __u32 *start_hash);
+static struct buffer_head * ext4_dx_find_entry(struct inode *dir,
+ const struct qstr *d_name,
+ struct ext4_dir_entry_2 **res_dir,
+ int *err);
+static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry,
+ struct inode *inode);
+
+/*
+ * Step from directory entry @p to the entry that follows it, using the
+ * record length stored in @p.
+ *
+ * p is at least 6 bytes before the end of page
+ */
+static inline struct ext4_dir_entry_2 *
+ext4_next_entry(struct ext4_dir_entry_2 *p, unsigned long blocksize)
+{
+	char *cursor = (char *)p;
+
+	cursor += ext4_rec_len_from_disk(p->rec_len, blocksize);
+	return (struct ext4_dir_entry_2 *)cursor;
+}
+
+/*
+ * Future: use high four bits of block for coalesce-on-delete flags
+ * Mask them off for now.
+ *
+ * NOTE(review): the mask below keeps only the low 24 bits, i.e. it clears
+ * the high eight bits rather than the four mentioned above.
+ */
+
+static inline ext4_lblk_t dx_get_block(struct dx_entry *entry)
+{
+	u32 raw = le32_to_cpu(entry->block);
+
+	return raw & 0x00ffffff;
+}
+
+/* Store block number @value in @entry in little-endian disk order. */
+static inline void dx_set_block(struct dx_entry *entry, ext4_lblk_t value)
+{
+	__le32 disk_block = cpu_to_le32(value);
+
+	entry->block = disk_block;
+}
+
+/* Return the hash stored in @entry, converted to CPU byte order. */
+static inline unsigned dx_get_hash(struct dx_entry *entry)
+{
+	__le32 disk_hash = entry->hash;
+
+	return le32_to_cpu(disk_hash);
+}
+
+/* Store hash @value in @entry in little-endian disk order. */
+static inline void dx_set_hash(struct dx_entry *entry, unsigned value)
+{
+	__le32 disk_hash = cpu_to_le32(value);
+
+	entry->hash = disk_hash;
+}
+
+/* Read the entry count from the dx_countlimit header overlaid on @entries. */
+static inline unsigned dx_get_count(struct dx_entry *entries)
+{
+	struct dx_countlimit *hdr = (struct dx_countlimit *) entries;
+
+	return le16_to_cpu(hdr->count);
+}
+
+/* Read the entry limit from the dx_countlimit header overlaid on @entries. */
+static inline unsigned dx_get_limit(struct dx_entry *entries)
+{
+	struct dx_countlimit *hdr = (struct dx_countlimit *) entries;
+
+	return le16_to_cpu(hdr->limit);
+}
+
+/* Write the entry count into the dx_countlimit header overlaid on @entries. */
+static inline void dx_set_count(struct dx_entry *entries, unsigned value)
+{
+	struct dx_countlimit *hdr = (struct dx_countlimit *) entries;
+
+	hdr->count = cpu_to_le16(value);
+}
+
+/* Write the entry limit into the dx_countlimit header overlaid on @entries. */
+static inline void dx_set_limit(struct dx_entry *entries, unsigned value)
+{
+	struct dx_countlimit *hdr = (struct dx_countlimit *) entries;
+
+	hdr->limit = cpu_to_le16(value);
+}
+
+/*
+ * Number of dx_entry slots that fit into the root index block of @dir:
+ * the block size less the record lengths for a 1-byte and a 2-byte name
+ * and less @infosize, divided by the size of one entry.
+ */
+static inline unsigned dx_root_limit(struct inode *dir, unsigned infosize)
+{
+	unsigned long blocksize = dir->i_sb->s_blocksize;
+	unsigned entry_space = blocksize - EXT4_DIR_REC_LEN(1) -
+			       EXT4_DIR_REC_LEN(2) - infosize;
+
+	return entry_space / sizeof(struct dx_entry);
+}
+
+/*
+ * Number of dx_entry slots that fit into a non-root index block of @dir:
+ * the block size less the record length for an empty name, divided by the
+ * size of one entry.
+ */
+static inline unsigned dx_node_limit(struct inode *dir)
+{
+	unsigned long blocksize = dir->i_sb->s_blocksize;
+	unsigned entry_space = blocksize - EXT4_DIR_REC_LEN(0);
+
+	return entry_space / sizeof(struct dx_entry);
+}
+
+/*
+ * Debug
+ */
+#ifdef DX_DEBUG
+/*
+ * Debug helper: print every "hash->block" pair of one index block,
+ * prefixed by @label.  The first entry is always shown with hash 0.
+ */
+static void dx_show_index(char * label, struct dx_entry *entries)
+{
+	int total = dx_get_count (entries);
+	int idx;
+
+	printk(KERN_DEBUG "%s index ", label);
+	for (idx = 0; idx < total; idx++) {
+		struct dx_entry *cur = entries + idx;
+		unsigned hash = idx ? dx_get_hash(cur) : 0;
+		unsigned long blk = (unsigned long)dx_get_block(cur);
+
+		printk("%x->%lu ", hash, blk);
+	}
+	printk("\n");
+}
+
+struct stats /* totals returned by dx_show_leaf() and dx_show_entries() (DX_DEBUG only) */
+{
+ unsigned names; /* number of in-use directory entries seen */
+ unsigned space; /* sum of EXT4_DIR_REC_LEN(name_len) over those entries */
+ unsigned bcount; /* number of leaf blocks visited */
+};
+
+static struct stats dx_show_leaf(struct dx_hash_info *hinfo, struct ext4_dir_entry_2 *de,
+ int size, int show_names) /* debug: walk the @size bytes starting at @de and count the in-use entries */
+{
+ unsigned names = 0, space = 0;
+ char *base = (char *) de;
+ struct dx_hash_info h = *hinfo; /* local copy, so hashing names does not modify the caller's hinfo */
+
+ printk("names: ");
+ while ((char *) de < base + size)
+ {
+ if (de->inode) /* entries with inode == 0 are not counted */
+ {
+ if (show_names)
+ {
+ int len = de->name_len;
+ char *name = de->name;
+ while (len--) printk("%c", *name++);
+ ext4fs_dirhash(de->name, de->name_len, &h);
+ printk(":%x.%u ", h.hash,
+ (unsigned) ((char *) de - base)); /* prints hash.offset-in-block */
+ }
+ space += EXT4_DIR_REC_LEN(de->name_len);
+ names++;
+ }
+ de = ext4_next_entry(de, size);
+ }
+ printk("(%i)\n", names);
+ return (struct stats) { names, space, 1 }; /* a leaf counts as exactly one block */
+}
+
+struct stats dx_show_entries(struct dx_hash_info *hinfo, struct inode *dir,
+ struct dx_entry *entries, int levels) /* debug: dump an index block; recurses while @levels is non-zero */
+{
+ unsigned blocksize = dir->i_sb->s_blocksize;
+ unsigned count = dx_get_count(entries), names = 0, space = 0, i;
+ unsigned bcount = 0;
+ struct buffer_head *bh;
+ int err;
+ printk("%i indexed blocks...\n", count);
+ for (i = 0; i < count; i++, entries++)
+ {
+ ext4_lblk_t block = dx_get_block(entries);
+ ext4_lblk_t hash = i ? dx_get_hash(entries): 0; /* the first entry is treated as hash 0 */
+ u32 range = i < count - 1? (dx_get_hash(entries + 1) - hash): ~hash; /* width of the hash interval covered by this entry */
+ struct stats stats;
+ printk("%s%3u:%03u hash %8x/%8x ",levels?"":" ", i, block, hash, range);
+ if (!(bh = ext4_bread (NULL,dir, block, 0,&err))) continue; /* unreadable blocks are skipped silently */
+ stats = levels?
+ dx_show_entries(hinfo, dir, ((struct dx_node *) bh->b_data)->entries, levels - 1):
+ dx_show_leaf(hinfo, (struct ext4_dir_entry_2 *) bh->b_data, blocksize, 0);
+ names += stats.names;
+ space += stats.space;
+ bcount += stats.bcount;
+ brelse(bh);
+ }
+ if (bcount) /* avoid dividing by zero when no block could be read */
+ printk(KERN_DEBUG "%snames %u, fullness %u (%u%%)\n",
+ levels ? "" : " ", names, space/bcount,
+ (space/bcount)*100/blocksize);
+ return (struct stats) { names, space, bcount};
+}
+#endif /* DX_DEBUG */
+
+/*
+ * Probe for a directory leaf block to search.
+ *
+ * dx_probe can return ERR_BAD_DX_DIR, which means there was a format
+ * error in the directory index, and the caller should fall back to
+ * searching the directory normally. The callers of dx_probe **MUST**
+ * check for this error code, and make sure it never gets reflected
+ * back to userspace.
+ *
+ * d_name: name to look up; may be NULL, in which case the hash already
+ * stored in hinfo->hash is used as the search key.
+ * dir: the indexed directory; its block 0 is read as struct dx_root.
+ * hinfo: receives hash_version and seed, plus the hash of d_name when
+ * d_name is given.
+ * frame_in: array of frames to fill; one frame is used for the root and
+ * one more when the root reports an indirect level.
+ * err: receives the error code on failure.
+ *
+ * In every index block the last entry whose hash is not greater than the
+ * search hash is selected (the first entry is never compared).
+ *
+ * On success the frame of the lowest index level is returned and every
+ * filled frame holds a buffer reference. On failure NULL is returned, *err
+ * is set and all buffers read so far have been released.
+ */
+static struct dx_frame *
+dx_probe(const struct qstr *d_name, struct inode *dir,
+ struct dx_hash_info *hinfo, struct dx_frame *frame_in, int *err)
+{
+ unsigned count, indirect;
+ struct dx_entry *at, *entries, *p, *q, *m;
+ struct dx_root *root;
+ struct buffer_head *bh;
+ struct dx_frame *frame = frame_in;
+ u32 hash;
+
+ frame->bh = NULL;
+ if (!(bh = ext4_bread (NULL,dir, 0, 0, err)))
+ goto fail;
+ root = (struct dx_root *) bh->b_data;
+ if (root->info.hash_version != DX_HASH_TEA &&
+ root->info.hash_version != DX_HASH_HALF_MD4 &&
+ root->info.hash_version != DX_HASH_LEGACY) {
+ ext4_warning(dir->i_sb, "Unrecognised inode hash code %d",
+ root->info.hash_version);
+ brelse(bh);
+ *err = ERR_BAD_DX_DIR;
+ goto fail;
+ }
+ hinfo->hash_version = root->info.hash_version;
+ if (hinfo->hash_version <= DX_HASH_TEA)
+ hinfo->hash_version += EXT4_SB(dir->i_sb)->s_hash_unsigned;
+ hinfo->seed = EXT4_SB(dir->i_sb)->s_hash_seed;
+ if (d_name)
+ ext4fs_dirhash(d_name->name, d_name->len, hinfo);
+ hash = hinfo->hash;
+
+ if (root->info.unused_flags & 1) {
+ ext4_warning(dir->i_sb, "Unimplemented inode hash flags: %#06x",
+ root->info.unused_flags);
+ brelse(bh);
+ *err = ERR_BAD_DX_DIR;
+ goto fail;
+ }
+
+ if ((indirect = root->info.indirect_levels) > 1) {
+ ext4_warning(dir->i_sb, "Unimplemented inode hash depth: %#06x",
+ root->info.indirect_levels);
+ brelse(bh);
+ *err = ERR_BAD_DX_DIR;
+ goto fail;
+ }
+
+ entries = (struct dx_entry *) (((char *)&root->info) +
+ root->info.info_length); /* the entries start info_length bytes after the start of info */
+
+ if (dx_get_limit(entries) != dx_root_limit(dir,
+ root->info.info_length)) {
+ ext4_warning(dir->i_sb, "dx entry: limit != root limit");
+ brelse(bh);
+ *err = ERR_BAD_DX_DIR;
+ goto fail;
+ }
+
+ dxtrace(printk("Look up %x", hash));
+ while (1)
+ {
+ count = dx_get_count(entries);
+ if (!count || count > dx_get_limit(entries)) {
+ ext4_warning(dir->i_sb,
+ "dx entry: no count or count > limit");
+ brelse(bh); /* bh is not stored in the current frame yet, so it is dropped here */
+ *err = ERR_BAD_DX_DIR;
+ goto fail2;
+ }
+
+ p = entries + 1; /* entry 0 is never compared */
+ q = entries + count - 1;
+ while (p <= q) /* binary search; ends with p one past the last entry whose hash <= search hash */
+ {
+ m = p + (q - p)/2;
+ dxtrace(printk("."));
+ if (dx_get_hash(m) > hash)
+ q = m - 1;
+ else
+ p = m + 1;
+ }
+
+ if (0) // linear search cross check
+ {
+ unsigned n = count - 1;
+ at = entries;
+ while (n--)
+ {
+ dxtrace(printk(","));
+ if (dx_get_hash(++at) > hash)
+ {
+ at--;
+ break;
+ }
+ }
+ assert (at == p - 1);
+ }
+
+ at = p - 1;
+ dxtrace(printk(" %x->%u\n", at == entries? 0: dx_get_hash(at), dx_get_block(at)));
+ frame->bh = bh; /* from here on the frame owns the buffer reference */
+ frame->entries = entries;
+ frame->at = at;
+ if (!indirect--) return frame;
+ if (!(bh = ext4_bread (NULL,dir, dx_get_block(at), 0, err)))
+ goto fail2;
+ at = entries = ((struct dx_node *) bh->b_data)->entries;
+ if (dx_get_limit(entries) != dx_node_limit (dir)) {
+ ext4_warning(dir->i_sb,
+ "dx entry: limit != node limit");
+ brelse(bh);
+ *err = ERR_BAD_DX_DIR;
+ goto fail2;
+ }
+ frame++;
+ frame->bh = NULL; /* lets the fail2 loop run over this frame harmlessly */
+ }
+fail2:
+ while (frame >= frame_in) { /* release every frame used so far, deepest first */
+ brelse(frame->bh);
+ frame--;
+ }
+fail:
+ if (*err == ERR_BAD_DX_DIR)
+ ext4_warning(dir->i_sb,
+ "Corrupt dir inode %lu, running e2fsck is "
+ "recommended.", dir->i_ino);
+ return NULL;
+}
+
+/*
+ * Drop the buffer references held by a frame array.  Nothing happens when
+ * frames[0] holds no buffer.  frames[1] is released only if the root block
+ * in frames[0] reports a non-zero number of indirect levels.
+ */
+static void dx_release (struct dx_frame *frames)
+{
+	struct dx_root *root;
+
+	if (frames[0].bh == NULL)
+		return;
+
+	root = (struct dx_root *) frames[0].bh->b_data;
+	if (root->info.indirect_levels)
+		brelse(frames[1].bh);
+	brelse(frames[0].bh);
+}
+
+/*
+ * This function increments the frame pointer to search the next leaf
+ * block, and reads in the necessary intervening nodes if the search
+ * should be necessary. Whether or not the search is necessary is
+ * controlled by the hash parameter. If the hash value is even, then
+ * the search is only continued if the next block starts with that
+ * hash value (compared with the low bit of the block's hash cleared).
+ * This is used if we are searching for a specific file.
+ *
+ * If the hash value is HASH_NB_ALWAYS, then always go to the next block.
+ *
+ * This function returns 1 if the caller should continue to search,
+ * or 0 if it should not. If there is an error reading one of the
+ * index blocks, it will return a negative error code.
+ *
+ * If start_hash is non-null, it will be filled in with the starting
+ * hash of the next page.
+ *
+ * frame is the frame to advance and frames is the first (root) frame of
+ * the same array. The "at" pointers of the frames that were advanced stay
+ * advanced even when 0 or an error is returned.
+ */
+static int ext4_htree_next_block(struct inode *dir, __u32 hash,
+ struct dx_frame *frame,
+ struct dx_frame *frames,
+ __u32 *start_hash)
+{
+ struct dx_frame *p;
+ struct buffer_head *bh;
+ int err, num_frames = 0;
+ __u32 bhash;
+
+ p = frame;
+ /*
+ * Find the next leaf page by incrementing the frame pointer.
+ * If we run out of entries in the interior node, loop around and
+ * increment pointer in the parent node. When we break out of
+ * this loop, num_frames indicates the number of interior
+ * nodes need to be read.
+ */
+ while (1) {
+ if (++(p->at) < p->entries + dx_get_count(p->entries))
+ break;
+ if (p == frames) /* the root has no further entries either */
+ return 0;
+ num_frames++;
+ p--;
+ }
+
+ /*
+ * If the low bit of the hash is set, then continue whatever the
+ * hash of the next page is. This is used for readdir
+ * handling. Otherwise, check to see if the hash matches the
+ * desired continuation hash. If it doesn't, return since
+ * there's no point to read in the successive index pages.
+ */
+ bhash = dx_get_hash(p->at);
+ if (start_hash)
+ *start_hash = bhash;
+ if ((hash & 1) == 0) {
+ if ((bhash & ~1) != hash)
+ return 0;
+ }
+ /*
+ * If the hash is HASH_NB_ALWAYS, we always go to the next
+ * block so no check is necessary
+ */
+ while (num_frames--) { /* walk back down, replacing the buffer of each lower frame */
+ if (!(bh = ext4_bread(NULL, dir, dx_get_block(p->at),
+ 0, &err)))
+ return err; /* Failure */
+ p++;
+ brelse(p->bh);
+ p->bh = bh;
+ p->at = p->entries = ((struct dx_node *) bh->b_data)->entries; /* restart at the first entry of the new block */
+ }
+ return 1;
+}
+
+
+/*
+ * Read one directory block and hand its entries to
+ * ext4_htree_store_dirent().  Entries whose (hash, minor_hash) pair
+ * sorts before (start_hash, start_minor_hash), and entries with a zero
+ * inode number, are skipped.
+ *
+ * Returns the number of entries stored.  If the block cannot be read,
+ * or storing an entry fails, that error code is returned instead.  If
+ * an entry fails ext4_check_dir_entry(), dir_file->f_pos is moved to
+ * the next block boundary and the count gathered so far is returned.
+ */
+static int htree_dirblock_to_tree(struct file *dir_file,
+				  struct inode *dir, ext4_lblk_t block,
+				  struct dx_hash_info *hinfo,
+				  __u32 start_hash, __u32 start_minor_hash)
+{
+	struct ext4_dir_entry_2 *de, *limit;
+	struct buffer_head *bh;
+	int stored = 0;
+	int err;
+
+	dxtrace(printk(KERN_INFO "In htree dirblock_to_tree: block %lu\n",
+							(unsigned long)block));
+	bh = ext4_bread (NULL, dir, block, 0, &err);
+	if (!bh)
+		return err;
+
+	de = (struct ext4_dir_entry_2 *) bh->b_data;
+	/* The scan stops EXT4_DIR_REC_LEN(0) bytes short of the block end. */
+	limit = (struct ext4_dir_entry_2 *) ((char *) de +
+					   dir->i_sb->s_blocksize -
+					   EXT4_DIR_REC_LEN(0));
+
+	for (; de < limit; de = ext4_next_entry(de, dir->i_sb->s_blocksize)) {
+		int before_start;
+
+		if (ext4_check_dir_entry(dir, NULL, de, bh,
+				(block<<EXT4_BLOCK_SIZE_BITS(dir->i_sb))
+					 + ((char *)de - bh->b_data))) {
+			/* Bad entry: push f_pos to the start of the next block. */
+			dir_file->f_pos = (dir_file->f_pos |
+					(dir->i_sb->s_blocksize - 1)) + 1;
+			break;
+		}
+
+		ext4fs_dirhash(de->name, de->name_len, hinfo);
+		before_start = (hinfo->hash < start_hash) ||
+			       ((hinfo->hash == start_hash) &&
+				(hinfo->minor_hash < start_minor_hash));
+		if (before_start || de->inode == 0)
+			continue;
+
+		err = ext4_htree_store_dirent(dir_file, hinfo->hash,
+					      hinfo->minor_hash, de);
+		if (err != 0) {
+			brelse(bh);
+			return err;
+		}
+		stored++;
+	}
+
+	brelse(bh);
+	return stored;
+}
+
+
+/*
+ * This function fills a red-black tree with information from a
+ * directory. We start scanning the directory in hash order, starting
+ * at start_hash and start_minor_hash.
+ *
+ * This function returns the number of entries inserted into the tree,
+ * or a negative error code.
+ *
+ * *next_hash is written on the non-indexed path (always ~0) and after
+ * every ext4_htree_next_block() call on the indexed path; it is not
+ * written when dx_probe() fails or when storing '.' or '..' fails.
+ *
+ * A directory without EXT4_INODE_INDEX is handled by scanning block 0
+ * only, with a hash_info built from the superblock defaults.
+ */
+int ext4_htree_fill_tree(struct file *dir_file, __u32 start_hash,
+ __u32 start_minor_hash, __u32 *next_hash)
+{
+ struct dx_hash_info hinfo;
+ struct ext4_dir_entry_2 *de;
+ struct dx_frame frames[2], *frame;
+ struct inode *dir;
+ ext4_lblk_t block;
+ int count = 0;
+ int ret, err;
+ __u32 hashval;
+
+ dxtrace(printk(KERN_DEBUG "In htree_fill_tree, start hash: %x:%x\n",
+ start_hash, start_minor_hash));
+ dir = dir_file->f_path.dentry->d_inode;
+ if (!(ext4_test_inode_flag(dir, EXT4_INODE_INDEX))) {
+ hinfo.hash_version = EXT4_SB(dir->i_sb)->s_def_hash_version;
+ if (hinfo.hash_version <= DX_HASH_TEA)
+ hinfo.hash_version +=
+ EXT4_SB(dir->i_sb)->s_hash_unsigned;
+ hinfo.seed = EXT4_SB(dir->i_sb)->s_hash_seed;
+ count = htree_dirblock_to_tree(dir_file, dir, 0, &hinfo,
+ start_hash, start_minor_hash);
+ *next_hash = ~0;
+ return count; /* may be a negative error from the block scan */
+ }
+ hinfo.hash = start_hash;
+ hinfo.minor_hash = 0;
+ frame = dx_probe(NULL, dir, &hinfo, frames, &err);
+ if (!frame)
+ return err;
+
+ /* Add '.' and '..' from the htree header */
+ if (!start_hash && !start_minor_hash) {
+ de = (struct ext4_dir_entry_2 *) frames[0].bh->b_data;
+ if ((err = ext4_htree_store_dirent(dir_file, 0, 0, de)) != 0)
+ goto errout;
+ count++;
+ }
+ if (start_hash < 2 || (start_hash ==2 && start_minor_hash==0)) {
+ de = (struct ext4_dir_entry_2 *) frames[0].bh->b_data;
+ de = ext4_next_entry(de, dir->i_sb->s_blocksize); /* second entry of block 0 */
+ if ((err = ext4_htree_store_dirent(dir_file, 2, 0, de)) != 0)
+ goto errout;
+ count++;
+ }
+
+ while (1) {
+ block = dx_get_block(frame->at);
+ ret = htree_dirblock_to_tree(dir_file, dir, block, &hinfo,
+ start_hash, start_minor_hash);
+ if (ret < 0) {
+ err = ret;
+ goto errout;
+ }
+ count += ret;
+ hashval = ~0; /* value reported if next_block does not set it */
+ ret = ext4_htree_next_block(dir, HASH_NB_ALWAYS,
+ frame, frames, &hashval);
+ *next_hash = hashval;
+ if (ret < 0) {
+ err = ret;
+ goto errout;
+ }
+ /*
+ * Stop if: (a) there are no more entries, or
+ * (b) we have inserted at least one entry and the
+ * next hash value is not a continuation
+ */
+ if ((ret == 0) ||
+ (count && ((hashval & 1) == 0)))
+ break;
+ }
+ dx_release(frames);
+ dxtrace(printk(KERN_DEBUG "Fill tree: returned %d entries, "
+ "next hash: %x\n", count, *next_hash));
+ return count;
+errout:
+ dx_release(frames);
+ return (err);
+}
+
+
+/*
+ * Directory block splitting, compacting
+ */
+
+/*
+ * Build a (hash, offset, size) map of the live entries in one directory
+ * block.  Map entries are written downwards from map_tail, one per
+ * directory entry that has both a name and an inode number; offsets are
+ * stored in units of four bytes.  The caller's hash info is copied, not
+ * modified.  Returns the number of map entries written.
+ */
+static int dx_make_map(struct ext4_dir_entry_2 *de, unsigned blocksize,
+		       struct dx_hash_info *hinfo,
+		       struct dx_map_entry *map_tail)
+{
+	struct dx_hash_info scratch = *hinfo;
+	char *start = (char *) de;
+	int mapped = 0;
+
+	/* XXX: do we need to check rec_len == 0 case? -Chris */
+	for (; (char *) de < start + blocksize;
+	     de = ext4_next_entry(de, blocksize)) {
+		if (!de->name_len || !de->inode)
+			continue;
+
+		ext4fs_dirhash(de->name, de->name_len, &scratch);
+		--map_tail;
+		map_tail->hash = scratch.hash;
+		map_tail->offs = ((char *) de - start) >> 2;
+		map_tail->size = le16_to_cpu(de->rec_len);
+		mapped++;
+		cond_resched();
+	}
+	return mapped;
+}
+
+/*
+ * Sort map by hash value, in place, into ascending order.
+ *
+ * 'map' points at the first of 'count' entries.  The first phase
+ * compares elements a shrinking gap apart (the gap is scaled by 10/13
+ * each round); the second phase repeats adjacent-swap passes until a
+ * pass makes no swap.
+ */
+static void dx_sort_map (struct dx_map_entry *map, unsigned count)
+{
+ struct dx_map_entry *p, *q, *top = map + count - 1;
+ int more;
+ /* Combsort until bubble sort doesn't suck */
+ while (count > 2) {
+ count = count*10/13; /* from here on 'count' is the gap, not the length */
+ if (count - 9 < 2) /* 9, 10 -> 11 */ /* unsigned: smaller gaps wrap and fail the test */
+ count = 11;
+ for (p = top, q = p - count; q >= map; p--, q--)
+ if (p->hash < q->hash)
+ swap(*p, *q);
+ }
+ /* Garden variety bubble sort */
+ do {
+ more = 0;
+ q = top;
+ while (q-- > map) {
+ if (q[1].hash >= q[0].hash)
+ continue;
+ swap(*(q+1), *q);
+ more = 1; /* something moved: run another pass */
+ }
+ } while(more);
+}
+
+/*
+ * Insert a (hash, block) index entry directly after frame->at.  The
+ * entries that follow are shifted up by one slot and the stored count
+ * is incremented.  The node must have a free slot and frame->at must
+ * lie within the used entries; both are asserted.
+ */
+static void dx_insert_block(struct dx_frame *frame, u32 hash, ext4_lblk_t block)
+{
+	struct dx_entry *entries = frame->entries;
+	struct dx_entry *slot = frame->at + 1;
+	struct dx_entry *end;
+	int count = dx_get_count(entries);
+
+	assert(count < dx_get_limit(entries));
+	assert(frame->at < entries + count);
+
+	/* Open a one-entry gap at 'slot'. */
+	end = entries + count;
+	memmove(slot + 1, slot, (char *) end - (char *) slot);
+
+	dx_set_hash(slot, hash);
+	dx_set_block(slot, block);
+	dx_set_count(entries, count + 1);
+}
+
+/*
+ * Clear the per-inode EXT4_INODE_INDEX flag when the filesystem does
+ * not have the DIR_INDEX compat feature; otherwise leave it alone.
+ */
+static void ext4_update_dx_flag(struct inode *inode)
+{
+	if (EXT4_HAS_COMPAT_FEATURE(inode->i_sb,
+				    EXT4_FEATURE_COMPAT_DIR_INDEX))
+		return;
+
+	ext4_clear_inode_flag(inode, EXT4_INODE_INDEX);
+}
+
+/*
+ * Compare a name against a directory entry.
+ *
+ * Returns 1 when the entry is in use (non-zero inode) and its name is
+ * exactly 'len' bytes equal to 'name'; returns 0 otherwise.  Note that
+ * this is the opposite sense to strncmp().
+ *
+ * The caller guarantees `len <= EXT4_NAME_LEN' and `de != NULL'.
+ */
+static inline int ext4_match (int len, const char * const name,
+			      struct ext4_dir_entry_2 * de)
+{
+	return len == de->name_len &&
+	       de->inode &&
+	       !memcmp(name, de->name, len);
+}
+
+/*
+ * Scan one directory block for d_name.
+ *
+ * 'offset' is the position of the block's first byte and is advanced
+ * entry by entry so that a full check can report the right location.
+ *
+ * Returns 1 and sets *res_dir when a matching entry passes
+ * ext4_check_dir_entry(), 0 when the block holds no match, and -1 when
+ * the matching entry fails that check or a record length is not
+ * positive.
+ */
+static inline int search_dirblock(struct buffer_head *bh,
+				  struct inode *dir,
+				  const struct qstr *d_name,
+				  unsigned int offset,
+				  struct ext4_dir_entry_2 ** res_dir)
+{
+	const char *name = d_name->name;
+	int namelen = d_name->len;
+	char *cursor = bh->b_data;
+	char *block_end = bh->b_data + dir->i_sb->s_blocksize;
+
+	while (cursor < block_end) {
+		struct ext4_dir_entry_2 *de =
+			(struct ext4_dir_entry_2 *) cursor;
+		int step;
+
+		/*
+		 * This runs for every entry of every block searched, so
+		 * only a cheap bounds test and name compare are done here;
+		 * the full validity check is kept for an actual match.
+		 */
+		if (cursor + namelen <= block_end &&
+		    ext4_match (namelen, name, de)) {
+			if (ext4_check_dir_entry(dir, NULL, de, bh, offset))
+				return -1;
+			*res_dir = de;
+			return 1;
+		}
+
+		/* A non-positive record length would never terminate. */
+		step = ext4_rec_len_from_disk(de->rec_len,
+					      dir->i_sb->s_blocksize);
+		if (step <= 0)
+			return -1;
+		offset += step;
+		cursor += step;
+	}
+	return 0;
+}
+
+
+/*
+ * ext4_find_entry()
+ *
+ * finds an entry in the specified directory with the wanted name. It
+ * returns the cache buffer in which the entry was found, and the entry
+ * itself (as a parameter - res_dir). It does NOT read the inode of the
+ * entry - you'll have to do that yourself if you want to.
+ *
+ * The returned buffer_head has ->b_count elevated. The caller is expected
+ * to brelse() it when appropriate.
+ *
+ * NULL is returned (with *res_dir set to NULL) when the name is longer
+ * than EXT4_NAME_LEN, when no entry matches, or when the search is
+ * abandoned after search_dirblock() reports a bad block.
+ *
+ * Search order:
+ *   - "." and ".." are looked for in block 0 only;
+ *   - for an indexed directory ext4_dx_find_entry() is tried first and
+ *     its result is returned unless it failed with ERR_BAD_DX_DIR;
+ *   - otherwise every block is scanned linearly, starting at the block
+ *     remembered in i_dir_start_lookup and wrapping round, with up to
+ *     NAMEI_RA_SIZE blocks submitted for reading ahead of the scan.
+ */
+static struct buffer_head * ext4_find_entry (struct inode *dir,
+ const struct qstr *d_name,
+ struct ext4_dir_entry_2 ** res_dir)
+{
+ struct super_block *sb;
+ struct buffer_head *bh_use[NAMEI_RA_SIZE];
+ struct buffer_head *bh, *ret = NULL;
+ ext4_lblk_t start, block, b;
+ const u8 *name = d_name->name;
+ int ra_max = 0; /* Number of bh's in the readahead
+ buffer, bh_use[] */
+ int ra_ptr = 0; /* Current index into readahead
+ buffer */
+ int num = 0;
+ ext4_lblk_t nblocks;
+ int i, err;
+ int namelen;
+
+ *res_dir = NULL;
+ sb = dir->i_sb;
+ namelen = d_name->len;
+ if (namelen > EXT4_NAME_LEN)
+ return NULL;
+ if ((namelen <= 2) && (name[0] == '.') &&
+ (name[1] == '.' || name[1] == '\0')) {
+ /*
+ * "." or ".." will only be in the first block
+ * NFS may look up ".."; "." should be handled by the VFS
+ */
+ block = start = 0;
+ nblocks = 1;
+ goto restart;
+ }
+ if (is_dx(dir)) {
+ bh = ext4_dx_find_entry(dir, d_name, res_dir, &err);
+ /*
+ * On success, or if the error was file not found,
+ * return. Otherwise, fall back to doing a search the
+ * old fashioned way.
+ */
+ if (bh || (err != ERR_BAD_DX_DIR))
+ return bh;
+ dxtrace(printk(KERN_DEBUG "ext4_find_entry: dx failed, "
+ "falling back\n"));
+ }
+ nblocks = dir->i_size >> EXT4_BLOCK_SIZE_BITS(sb);
+ start = EXT4_I(dir)->i_dir_start_lookup;
+ if (start >= nblocks)
+ start = 0;
+ block = start;
+restart:
+ do {
+ /*
+ * We deal with the read-ahead logic here.
+ */
+ if (ra_ptr >= ra_max) {
+ /* Refill the readahead buffer */
+ ra_ptr = 0;
+ b = block;
+ for (ra_max = 0; ra_max < NAMEI_RA_SIZE; ra_max++) {
+ /*
+ * Terminate if we reach the end of the
+ * directory and must wrap, or if our
+ * search has finished at this block.
+ */
+ if (b >= nblocks || (num && block == start)) {
+ bh_use[ra_max] = NULL;
+ break;
+ }
+ num++;
+ bh = ext4_getblk(NULL, dir, b++, 0, &err);
+ bh_use[ra_max] = bh; /* may be NULL; such a slot is skipped below */
+ if (bh)
+ ll_rw_block(READ | REQ_META | REQ_PRIO,
+ 1, &bh);
+ }
+ }
+ if ((bh = bh_use[ra_ptr++]) == NULL)
+ goto next;
+ wait_on_buffer(bh);
+ if (!buffer_uptodate(bh)) {
+ /* read error, skip block & hope for the best */
+ EXT4_ERROR_INODE(dir, "reading directory lblock %lu",
+ (unsigned long) block);
+ brelse(bh);
+ goto next;
+ }
+ i = search_dirblock(bh, dir, d_name,
+ block << EXT4_BLOCK_SIZE_BITS(sb), res_dir);
+ if (i == 1) {
+ EXT4_I(dir)->i_dir_start_lookup = block; /* next lookup starts here */
+ ret = bh; /* reference is handed to the caller */
+ goto cleanup_and_exit;
+ } else {
+ brelse(bh);
+ if (i < 0) /* bad block: give up, ret stays NULL */
+ goto cleanup_and_exit;
+ }
+ next:
+ if (++block >= nblocks)
+ block = 0;
+ } while (block != start);
+
+ /*
+ * If the directory has grown while we were searching, then
+ * search the last part of the directory before giving up.
+ */
+ block = nblocks;
+ nblocks = dir->i_size >> EXT4_BLOCK_SIZE_BITS(sb);
+ if (block < nblocks) {
+ start = 0;
+ goto restart;
+ }
+
+cleanup_and_exit:
+ /* Clean up the read-ahead blocks */
+ for (; ra_ptr < ra_max; ra_ptr++)
+ brelse(bh_use[ra_ptr]);
+ return ret;
+}
+
+/*
+ * Look a name up through the directory's hash index.
+ *
+ * Starting from the leaf selected by dx_probe(), each candidate leaf is
+ * read and searched; ext4_htree_next_block() decides whether a further
+ * leaf has to be examined.
+ *
+ * On success the buffer holding the entry is returned and *res_dir
+ * points at the entry.  Otherwise NULL is returned and *err is left as
+ * set by dx_probe() or ext4_bread(), or is set to ERR_BAD_DX_DIR for a
+ * bad leaf, to the error from ext4_htree_next_block(), or to -ENOENT
+ * when the name is simply not present.
+ */
+static struct buffer_head * ext4_dx_find_entry(struct inode *dir, const struct qstr *d_name,
+		       struct ext4_dir_entry_2 **res_dir, int *err)
+{
+	struct super_block *sb = dir->i_sb;
+	struct dx_frame frames[2], *frame;
+	struct dx_hash_info hinfo;
+	int rc;
+
+	frame = dx_probe(d_name, dir, &hinfo, frames, err);
+	if (!frame)
+		return NULL;
+
+	for (;;) {
+		ext4_lblk_t block = dx_get_block(frame->at);
+		struct buffer_head *bh = ext4_bread(NULL, dir, block, 0, err);
+
+		if (!bh)
+			break;
+
+		rc = search_dirblock(bh, dir, d_name,
+				     block << EXT4_BLOCK_SIZE_BITS(sb),
+				     res_dir);
+		if (rc == 1) {
+			/* Found: the leaf buffer goes back to the caller. */
+			dx_release(frames);
+			return bh;
+		}
+		brelse(bh);
+		if (rc == -1) {
+			*err = ERR_BAD_DX_DIR;
+			break;
+		}
+
+		/* Does the hash chain continue into the next leaf? */
+		rc = ext4_htree_next_block(dir, hinfo.hash, frame,
+					   frames, NULL);
+		if (rc < 0) {
+			ext4_warning(sb,
+			     "error reading index page in directory #%lu",
+			     dir->i_ino);
+			*err = rc;
+			break;
+		}
+		if (rc != 1) {
+			*err = -ENOENT;
+			break;
+		}
+	}
+
+	dxtrace(printk(KERN_DEBUG "%s not found\n", d_name->name));
+	dx_release (frames);
+	return NULL;
+}
+
+/*
+ * Look dentry's name up in dir and pass the resulting inode (NULL when
+ * no entry exists) to d_splice_alias().
+ *
+ * Names longer than EXT4_NAME_LEN yield -ENAMETOOLONG.  An entry whose
+ * inode number is invalid, equals the directory's own inode number, or
+ * makes ext4_iget() return -ESTALE is reported through EXT4_ERROR_INODE
+ * and yields -EIO.
+ */
+static struct dentry *ext4_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd)
+{
+	struct ext4_dir_entry_2 *de;
+	struct buffer_head *bh;
+	struct inode *inode;
+	__u32 ino;
+
+	if (dentry->d_name.len > EXT4_NAME_LEN)
+		return ERR_PTR(-ENAMETOOLONG);
+
+	bh = ext4_find_entry(dir, &dentry->d_name, &de);
+	if (!bh)
+		return d_splice_alias(NULL, dentry);
+
+	/* Only the inode number is needed from the directory block. */
+	ino = le32_to_cpu(de->inode);
+	brelse(bh);
+
+	if (!ext4_valid_inum(dir->i_sb, ino)) {
+		EXT4_ERROR_INODE(dir, "bad inode number: %u", ino);
+		return ERR_PTR(-EIO);
+	}
+	if (unlikely(ino == dir->i_ino)) {
+		EXT4_ERROR_INODE(dir, "'%.*s' linked to parent dir",
+				 dentry->d_name.len,
+				 dentry->d_name.name);
+		return ERR_PTR(-EIO);
+	}
+
+	inode = ext4_iget(dir->i_sb, ino);
+	if (inode == ERR_PTR(-ESTALE)) {
+		EXT4_ERROR_INODE(dir,
+				 "deleted inode referenced: %u",
+				 ino);
+		return ERR_PTR(-EIO);
+	}
+	return d_splice_alias(inode, dentry);
+}
+
+
+/*
+ * Find the parent of a directory by looking up its ".." entry.
+ *
+ * Returns the dentry obtained from d_obtain_alias() for the parent
+ * inode, ERR_PTR(-ENOENT) when ".." cannot be found, or ERR_PTR(-EIO)
+ * when the entry holds an invalid inode number.
+ */
+struct dentry *ext4_get_parent(struct dentry *child)
+{
+	static const struct qstr dotdot = {
+		.name = "..",
+		.len = 2,
+	};
+	struct inode *child_inode = child->d_inode;
+	struct ext4_dir_entry_2 * de;
+	struct buffer_head *bh;
+	__u32 parent_ino;
+
+	bh = ext4_find_entry(child_inode, &dotdot, &de);
+	if (!bh)
+		return ERR_PTR(-ENOENT);
+
+	parent_ino = le32_to_cpu(de->inode);
+	brelse(bh);
+
+	if (!ext4_valid_inum(child_inode->i_sb, parent_ino)) {
+		EXT4_ERROR_INODE(child_inode,
+				 "bad parent inode number: %u", parent_ino);
+		return ERR_PTR(-EIO);
+	}
+
+	return d_obtain_alias(ext4_iget(child_inode->i_sb, parent_ino));
+}
+
+#define S_SHIFT 12 /* shift that turns (mode & S_IFMT) into a table index */
+static unsigned char ext4_type_by_mode[S_IFMT >> S_SHIFT] = { /* file-type bits -> EXT4_FT_* code;
+ * slots not listed below are
+ * zero-initialised */
+ [S_IFREG >> S_SHIFT] = EXT4_FT_REG_FILE,
+ [S_IFDIR >> S_SHIFT] = EXT4_FT_DIR,
+ [S_IFCHR >> S_SHIFT] = EXT4_FT_CHRDEV,
+ [S_IFBLK >> S_SHIFT] = EXT4_FT_BLKDEV,
+ [S_IFIFO >> S_SHIFT] = EXT4_FT_FIFO,
+ [S_IFSOCK >> S_SHIFT] = EXT4_FT_SOCK,
+ [S_IFLNK >> S_SHIFT] = EXT4_FT_SYMLINK,
+};
+
+/*
+ * Record the file type derived from 'mode' in a directory entry.  The
+ * entry is left untouched when the filesystem lacks the FILETYPE
+ * incompat feature.
+ */
+static inline void ext4_set_de_type(struct super_block *sb,
+				struct ext4_dir_entry_2 *de,
+				umode_t mode) {
+	if (!EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_FILETYPE))
+		return;
+
+	de->file_type = ext4_type_by_mode[(mode & S_IFMT)>>S_SHIFT];
+}
+
+/*
+ * Copy 'count' directory entries, located through consecutive map
+ * entries starting at 'map', from block 'from' into block 'to'.  Each
+ * copy is packed to the minimal record length for its name and the
+ * source entry is marked unused by zeroing its inode number.
+ *
+ * Returns a pointer to the last entry written into 'to'.
+ */
+static struct ext4_dir_entry_2 *
+dx_move_dirents(char *from, char *to, struct dx_map_entry *map, int count,
+		unsigned blocksize)
+{
+	unsigned rec_len = 0;
+
+	for (; count--; map++) {
+		struct ext4_dir_entry_2 *src;
+		struct ext4_dir_entry_2 *dst = (struct ext4_dir_entry_2 *) to;
+
+		/* Map offsets are stored in units of four bytes. */
+		src = (struct ext4_dir_entry_2 *) (from + (map->offs<<2));
+		rec_len = EXT4_DIR_REC_LEN(src->name_len);
+
+		memcpy (to, src, rec_len);
+		dst->rec_len = ext4_rec_len_to_disk(rec_len, blocksize);
+		src->inode = 0;
+
+		to += rec_len;
+	}
+	return (struct ext4_dir_entry_2 *) (to - rec_len);
+}
+
+/*
+ * Slide every live entry (non-zero inode and name length) of a block
+ * towards its start, giving each the minimal record length for its
+ * name.  Returns a pointer to the last entry kept, or to the start of
+ * the block if no entry is live.
+ */
+static struct ext4_dir_entry_2* dx_pack_dirents(char *base, unsigned blocksize)
+{
+	struct ext4_dir_entry_2 *de = (struct ext4_dir_entry_2 *) base;
+	struct ext4_dir_entry_2 *dst = de;
+	struct ext4_dir_entry_2 *last = de;
+	struct ext4_dir_entry_2 *following;
+	char *end = base + blocksize;
+	unsigned len;
+
+	for (; (char*)de < end; de = following) {
+		/* Fetch the successor first: the move below may overwrite de. */
+		following = ext4_next_entry(de, blocksize);
+		if (!de->inode || !de->name_len)
+			continue;
+
+		len = EXT4_DIR_REC_LEN(de->name_len);
+		if (de > dst)
+			memmove(dst, de, len);
+		dst->rec_len = ext4_rec_len_to_disk(len, blocksize);
+		last = dst;
+		dst = (struct ext4_dir_entry_2 *) (((char *) dst) + len);
+	}
+	return last;
+}
+
+/*
+ * Split a full leaf block to make room for a new dir entry.
+ * Allocate a new block, and move entries so that they are approx. equally full.
+ * Returns pointer to de in block into which the new entry will be inserted.
+ *
+ * On entry *bh is the full leaf and frame is the index frame pointing at
+ * it.  On success *bh is the buffer (old leaf or new block) that should
+ * receive the new entry and the other buffer has been released.  On
+ * failure NULL is returned, *bh has been released and set to NULL, and
+ * *error holds the error code.
+ */
+static struct ext4_dir_entry_2 *do_split(handle_t *handle, struct inode *dir,
+			struct buffer_head **bh,struct dx_frame *frame,
+			struct dx_hash_info *hinfo, int *error)
+{
+	unsigned blocksize = dir->i_sb->s_blocksize;
+	unsigned count, continued;
+	struct buffer_head *bh2;
+	ext4_lblk_t newblock;
+	u32 hash2;
+	struct dx_map_entry *map;
+	char *data1 = (*bh)->b_data, *data2;
+	unsigned split, move, size;
+	struct ext4_dir_entry_2 *de = NULL, *de2;
+	int err = 0, i;
+
+	bh2 = ext4_append (handle, dir, &newblock, &err);
+	if (!(bh2)) {
+		brelse(*bh);
+		*bh = NULL;
+		goto errout;
+	}
+
+	BUFFER_TRACE(*bh, "get_write_access");
+	err = ext4_journal_get_write_access(handle, *bh);
+	if (err)
+		goto journal_error;
+
+	BUFFER_TRACE(frame->bh, "get_write_access");
+	err = ext4_journal_get_write_access(handle, frame->bh);
+	if (err)
+		goto journal_error;
+
+	data2 = bh2->b_data;
+
+	/* create map in the end of data2 block */
+	map = (struct dx_map_entry *) (data2 + blocksize);
+	count = dx_make_map((struct ext4_dir_entry_2 *) data1,
+			     blocksize, hinfo, map);
+	map -= count;
+	dx_sort_map(map, count);
+	/* Split the existing block in the middle, size-wise */
+	size = 0;
+	move = 0;
+	for (i = count-1; i >= 0; i--) {
+		/* is more than half of this entry in 2nd half of the block? */
+		if (size + map[i].size/2 > blocksize/2)
+			break;
+		size += map[i].size;
+		move++;
+	}
+	/*
+	 * map index at which we will split
+	 *
+	 * If the scan above ran off the front of the map, the entries never
+	 * added up to more than half a block and every one of them was
+	 * counted in 'move'.  Using count - move would then give a split
+	 * index of 0 and make the map[split - 1] access below read before
+	 * the start of the map, so split by entry count instead.
+	 */
+	if (i >= 0)
+		split = count - move;
+	else
+		split = count/2;
+	hash2 = map[split].hash;
+	/* With a single mapped entry split is still 0: nothing precedes it. */
+	continued = split > 0 && hash2 == map[split - 1].hash;
+	dxtrace(printk(KERN_INFO "Split block %lu at %x, %i/%i\n",
+			(unsigned long)dx_get_block(frame->at),
+					hash2, split, count-split));
+
+	/* Fancy dance to stay within two buffers */
+	de2 = dx_move_dirents(data1, data2, map + split, count - split, blocksize);
+	de = dx_pack_dirents(data1, blocksize);
+	/* Stretch the last entry of each block to the end of its block. */
+	de->rec_len = ext4_rec_len_to_disk(data1 + blocksize - (char *) de,
+					   blocksize);
+	de2->rec_len = ext4_rec_len_to_disk(data2 + blocksize - (char *) de2,
+					    blocksize);
+	dxtrace(dx_show_leaf (hinfo, (struct ext4_dir_entry_2 *) data1, blocksize, 1));
+	dxtrace(dx_show_leaf (hinfo, (struct ext4_dir_entry_2 *) data2, blocksize, 1));
+
+	/* Which block gets the new entry? */
+	if (hinfo->hash >= hash2)
+	{
+		swap(*bh, bh2);
+		de = de2;
+	}
+	/* The low bit of the index hash marks a continued hash chain. */
+	dx_insert_block(frame, hash2 + continued, newblock);
+	err = ext4_handle_dirty_metadata(handle, dir, bh2);
+	if (err)
+		goto journal_error;
+	err = ext4_handle_dirty_metadata(handle, dir, frame->bh);
+	if (err)
+		goto journal_error;
+	brelse(bh2);
+	dxtrace(dx_show_index("frame", frame->entries));
+	return de;
+
+journal_error:
+	brelse(*bh);
+	brelse(bh2);
+	*bh = NULL;
+	ext4_std_error(dir->i_sb, err);
+errout:
+	*error = err;
+	return NULL;
+}
+
+/*
+ * Add a new entry into a directory (leaf) block.  If de is non-NULL,
+ * it points to a directory entry which is guaranteed to be large
+ * enough for new directory entry.  If de is NULL, then
+ * add_dirent_to_buf will attempt search the directory block for
+ * space.
+ *
+ * Returns 0 on success, -ENOSPC if no space is available in the block,
+ * -EIO if an entry fails ext4_check_dir_entry(), -EEXIST if an entry
+ * with the same name already exists, or the error reported by the
+ * journal when write access cannot be obtained or the modified block
+ * cannot be marked dirty.
+ *
+ * The buffer reference is not consumed; the caller still owns bh.
+ */
+static int add_dirent_to_buf(handle_t *handle, struct dentry *dentry,
+			     struct inode *inode, struct ext4_dir_entry_2 *de,
+			     struct buffer_head *bh)
+{
+	struct inode	*dir = dentry->d_parent->d_inode;
+	const char	*name = dentry->d_name.name;
+	int		namelen = dentry->d_name.len;
+	unsigned int	offset = 0;
+	unsigned int	blocksize = dir->i_sb->s_blocksize;
+	unsigned short	reclen;
+	int		nlen, rlen, err;
+	char		*top;
+
+	reclen = EXT4_DIR_REC_LEN(namelen);
+	if (!de) {
+		de = (struct ext4_dir_entry_2 *)bh->b_data;
+		top = bh->b_data + blocksize - reclen;
+		while ((char *) de <= top) {
+			if (ext4_check_dir_entry(dir, NULL, de, bh, offset))
+				return -EIO;
+			if (ext4_match(namelen, name, de))
+				return -EEXIST;
+			nlen = EXT4_DIR_REC_LEN(de->name_len);
+			rlen = ext4_rec_len_from_disk(de->rec_len, blocksize);
+			/* A live entry offers only the slack after its own name. */
+			if ((de->inode? rlen - nlen: rlen) >= reclen)
+				break;
+			de = (struct ext4_dir_entry_2 *)((char *)de + rlen);
+			offset += rlen;
+		}
+		if ((char *) de > top)
+			return -ENOSPC;
+	}
+	BUFFER_TRACE(bh, "get_write_access");
+	err = ext4_journal_get_write_access(handle, bh);
+	if (err) {
+		ext4_std_error(dir->i_sb, err);
+		return err;
+	}
+
+	/* By now the buffer is marked for journaling */
+	nlen = EXT4_DIR_REC_LEN(de->name_len);
+	rlen = ext4_rec_len_from_disk(de->rec_len, blocksize);
+	if (de->inode) {
+		/* Carve the new entry out of the tail of the live one. */
+		struct ext4_dir_entry_2 *de1 = (struct ext4_dir_entry_2 *)((char *)de + nlen);
+		de1->rec_len = ext4_rec_len_to_disk(rlen - nlen, blocksize);
+		de->rec_len = ext4_rec_len_to_disk(nlen, blocksize);
+		de = de1;
+	}
+	de->file_type = EXT4_FT_UNKNOWN;
+	if (inode) {
+		de->inode = cpu_to_le32(inode->i_ino);
+		ext4_set_de_type(dir->i_sb, de, inode->i_mode);
+	} else
+		de->inode = 0;
+	de->name_len = namelen;
+	memcpy(de->name, name, namelen);
+	/*
+	 * XXX shouldn't update any times until successful
+	 * completion of syscall, but too many callers depend
+	 * on this.
+	 *
+	 * XXX similarly, too many callers depend on
+	 * ext4_new_inode() setting the times, but error
+	 * recovery deletes the inode, so the worst that can
+	 * happen is that the times are slightly out of date
+	 * and/or different from the directory change time.
+	 */
+	dir->i_mtime = dir->i_ctime = ext4_current_time(dir);
+	ext4_update_dx_flag(dir);
+	dir->i_version++;
+	ext4_mark_inode_dirty(handle, dir);
+	BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata");
+	err = ext4_handle_dirty_metadata(handle, dir, bh);
+	if (err)
+		ext4_std_error(dir->i_sb, err);
+	/* Report a failure to dirty the block instead of claiming success. */
+	return err;
+}
+
+/*
+ * This converts a one block unindexed directory to a 3 block indexed
+ * directory, and adds the dentry to the indexed directory.
+ *
+ * bh is the buffer of directory block 0.  Its reference is consumed on
+ * every path: it is released directly on the early error returns and
+ * through dx_release() once it has become frames[0].bh.
+ *
+ * Returns 0 on success or a negative error code (from the journal, from
+ * ext4_append(), from do_split() or from add_dirent_to_buf()); -EIO is
+ * returned when the '..' record length points outside the block.
+ */
+static int make_indexed_dir(handle_t *handle, struct dentry *dentry,
+ struct inode *inode, struct buffer_head *bh)
+{
+ struct inode *dir = dentry->d_parent->d_inode;
+ const char *name = dentry->d_name.name;
+ int namelen = dentry->d_name.len;
+ struct buffer_head *bh2;
+ struct dx_root *root;
+ struct dx_frame frames[2], *frame;
+ struct dx_entry *entries;
+ struct ext4_dir_entry_2 *de, *de2;
+ char *data1, *top;
+ unsigned len;
+ int retval;
+ unsigned blocksize;
+ struct dx_hash_info hinfo;
+ ext4_lblk_t block;
+ struct fake_dirent *fde;
+
+ blocksize = dir->i_sb->s_blocksize;
+ dxtrace(printk(KERN_DEBUG "Creating index: inode %lu\n", dir->i_ino));
+ retval = ext4_journal_get_write_access(handle, bh);
+ if (retval) {
+ ext4_std_error(dir->i_sb, retval);
+ brelse(bh);
+ return retval;
+ }
+ root = (struct dx_root *) bh->b_data;
+
+ /* The 0th block becomes the root, move the dirents out */
+ fde = &root->dotdot;
+ de = (struct ext4_dir_entry_2 *)((char *)fde +
+ ext4_rec_len_from_disk(fde->rec_len, blocksize)); /* first entry after '..' */
+ if ((char *) de >= (((char *) root) + blocksize)) {
+ EXT4_ERROR_INODE(dir, "invalid rec_len for '..'");
+ brelse(bh);
+ return -EIO;
+ }
+ len = ((char *) root) + blocksize - (char *) de; /* bytes from that entry to block end */
+
+ /* Allocate new block for the 0th block's dirents */
+ bh2 = ext4_append(handle, dir, &block, &retval);
+ if (!(bh2)) {
+ brelse(bh);
+ return retval;
+ }
+ ext4_set_inode_flag(dir, EXT4_INODE_INDEX);
+ data1 = bh2->b_data;
+
+ memcpy (data1, de, len);
+ de = (struct ext4_dir_entry_2 *) data1;
+ top = data1 + len;
+ while ((char *)(de2 = ext4_next_entry(de, blocksize)) < top) /* walk to the last copied entry */
+ de = de2;
+ de->rec_len = ext4_rec_len_to_disk(data1 + blocksize - (char *) de,
+ blocksize); /* let it span the rest of the new block */
+ /* Initialize the root; the dot dirents already exist */
+ de = (struct ext4_dir_entry_2 *) (&root->dotdot);
+ de->rec_len = ext4_rec_len_to_disk(blocksize - EXT4_DIR_REC_LEN(2),
+ blocksize);
+ memset (&root->info, 0, sizeof(root->info));
+ root->info.info_length = sizeof(root->info);
+ root->info.hash_version = EXT4_SB(dir->i_sb)->s_def_hash_version;
+ entries = root->entries;
+ dx_set_block(entries, 1);
+ dx_set_count(entries, 1);
+ dx_set_limit(entries, dx_root_limit(dir, sizeof(root->info)));
+
+ /* Initialize as for dx_probe */
+ hinfo.hash_version = root->info.hash_version;
+ if (hinfo.hash_version <= DX_HASH_TEA)
+ hinfo.hash_version += EXT4_SB(dir->i_sb)->s_hash_unsigned;
+ hinfo.seed = EXT4_SB(dir->i_sb)->s_hash_seed;
+ ext4fs_dirhash(name, namelen, &hinfo);
+ frame = frames;
+ frame->entries = entries;
+ frame->at = entries;
+ frame->bh = bh; /* root buffer now belongs to the frame array */
+ bh = bh2;
+
+ ext4_handle_dirty_metadata(handle, dir, frame->bh); /* NOTE(review): return value ignored */
+ ext4_handle_dirty_metadata(handle, dir, bh); /* NOTE(review): return value ignored */
+
+ de = do_split(handle,dir, &bh, frame, &hinfo, &retval);
+ if (!de) {
+ /*
+ * Even if the block split failed, we have to properly write
+ * out all the changes we did so far. Otherwise we can end up
+ * with corrupted filesystem.
+ */
+ ext4_mark_inode_dirty(handle, dir);
+ dx_release(frames);
+ return retval;
+ }
+ dx_release(frames);
+
+ retval = add_dirent_to_buf(handle, dentry, inode, de, bh);
+ brelse(bh);
+ return retval;
+}
+
+/*
+ * ext4_add_entry()
+ *
+ * adds a file entry to the specified directory, using the same
+ * semantics as ext4_find_entry(). It returns 0 on success or a
+ * negative error code; an empty name gives -EINVAL.
+ *
+ * For an indexed directory ext4_dx_add_entry() is tried first. If that
+ * fails with ERR_BAD_DX_DIR the index flag is cleared on the inode and
+ * the entry is added by a linear scan instead. The linear scan tries
+ * every existing block in turn; a full single-block directory is
+ * converted with make_indexed_dir() when the filesystem has the
+ * DIR_INDEX feature (and no index fallback happened), and otherwise a
+ * new block is appended to hold the entry.
+ *
+ * NOTE(review): an earlier version of this comment said the inode part
+ * of 'de' is left at 0; add_dirent_to_buf() is passed 'inode' here, so
+ * confirm whether that warning still applies.
+ */
+static int ext4_add_entry(handle_t *handle, struct dentry *dentry,
+ struct inode *inode)
+{
+ struct inode *dir = dentry->d_parent->d_inode;
+ struct buffer_head *bh;
+ struct ext4_dir_entry_2 *de;
+ struct super_block *sb;
+ int retval;
+ int dx_fallback=0;
+ unsigned blocksize;
+ ext4_lblk_t block, blocks;
+
+ sb = dir->i_sb;
+ blocksize = sb->s_blocksize;
+ if (!dentry->d_name.len)
+ return -EINVAL;
+ if (is_dx(dir)) {
+ retval = ext4_dx_add_entry(handle, dentry, inode);
+ if (!retval || (retval != ERR_BAD_DX_DIR))
+ return retval;
+ ext4_clear_inode_flag(dir, EXT4_INODE_INDEX); /* index unusable: stop using it */
+ dx_fallback++;
+ ext4_mark_inode_dirty(handle, dir);
+ }
+ blocks = dir->i_size >> sb->s_blocksize_bits;
+ for (block = 0; block < blocks; block++) {
+ bh = ext4_bread(handle, dir, block, 0, &retval);
+ if(!bh)
+ return retval;
+ retval = add_dirent_to_buf(handle, dentry, inode, NULL, bh);
+ if (retval != -ENOSPC) { /* success or a hard error: done either way */
+ brelse(bh);
+ return retval;
+ }
+
+ if (blocks == 1 && !dx_fallback &&
+ EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_DIR_INDEX))
+ return make_indexed_dir(handle, dentry, inode, bh); /* consumes bh */
+ brelse(bh);
+ }
+ bh = ext4_append(handle, dir, &block, &retval);
+ if (!bh)
+ return retval;
+ de = (struct ext4_dir_entry_2 *) bh->b_data;
+ de->inode = 0;
+ de->rec_len = ext4_rec_len_to_disk(blocksize, blocksize); /* one empty entry spanning the block */
+ retval = add_dirent_to_buf(handle, dentry, inode, de, bh);
+ brelse(bh);
+ if (retval == 0)
+ ext4_set_inode_state(inode, EXT4_STATE_NEWENTRY);
+ return retval;
+}
+
+/*
+ * Returns 0 for success, or a negative error value
+ *
+ * Adds dentry to an indexed directory. The leaf chosen by dx_probe()
+ * is tried first. If it is full the leaf is split with do_split(); if
+ * the index node pointing at the leaf is itself full, that node is
+ * split first (when a second level already exists) or a second index
+ * level is created (when it does not). When both the second-level
+ * node and the root are full the function gives up with -ENOSPC.
+ *
+ * NOTE(review): on the journal_error paths taken after ext4_append()
+ * succeeded in the 'levels' branch, bh2 does not appear to be released
+ * - confirm.
+ */
+static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry,
+ struct inode *inode)
+{
+ struct dx_frame frames[2], *frame;
+ struct dx_entry *entries, *at;
+ struct dx_hash_info hinfo;
+ struct buffer_head *bh;
+ struct inode *dir = dentry->d_parent->d_inode;
+ struct super_block *sb = dir->i_sb;
+ struct ext4_dir_entry_2 *de;
+ int err;
+
+ frame = dx_probe(&dentry->d_name, dir, &hinfo, frames, &err);
+ if (!frame)
+ return err;
+ entries = frame->entries;
+ at = frame->at;
+
+ if (!(bh = ext4_bread(handle,dir, dx_get_block(frame->at), 0, &err)))
+ goto cleanup;
+
+ BUFFER_TRACE(bh, "get_write_access");
+ err = ext4_journal_get_write_access(handle, bh);
+ if (err)
+ goto journal_error;
+
+ err = add_dirent_to_buf(handle, dentry, inode, NULL, bh);
+ if (err != -ENOSPC) /* added, or failed for a reason a split cannot fix */
+ goto cleanup;
+
+ /* Block full, should compress but for now just split */
+ dxtrace(printk(KERN_DEBUG "using %u of %u node entries\n",
+ dx_get_count(entries), dx_get_limit(entries)));
+ /* Need to split index? */
+ if (dx_get_count(entries) == dx_get_limit(entries)) {
+ ext4_lblk_t newblock;
+ unsigned icount = dx_get_count(entries);
+ int levels = frame - frames; /* 0: frame is the root, 1: second-level node */
+ struct dx_entry *entries2;
+ struct dx_node *node2;
+ struct buffer_head *bh2;
+
+ if (levels && (dx_get_count(frames->entries) ==
+ dx_get_limit(frames->entries))) {
+ ext4_warning(sb, "Directory index full!");
+ err = -ENOSPC;
+ goto cleanup;
+ }
+ bh2 = ext4_append (handle, dir, &newblock, &err);
+ if (!(bh2))
+ goto cleanup;
+ node2 = (struct dx_node *)(bh2->b_data);
+ entries2 = node2->entries;
+ memset(&node2->fake, 0, sizeof(struct fake_dirent));
+ node2->fake.rec_len = ext4_rec_len_to_disk(sb->s_blocksize,
+ sb->s_blocksize); /* fake dirent spans the whole block */
+ BUFFER_TRACE(frame->bh, "get_write_access");
+ err = ext4_journal_get_write_access(handle, frame->bh);
+ if (err)
+ goto journal_error;
+ if (levels) {
+ unsigned icount1 = icount/2, icount2 = icount - icount1;
+ unsigned hash2 = dx_get_hash(entries + icount1);
+ dxtrace(printk(KERN_DEBUG "Split index %i/%i\n",
+ icount1, icount2));
+
+ BUFFER_TRACE(frame->bh, "get_write_access"); /* index root */ /* NOTE(review): traces frame->bh but access is taken on frames[0].bh */
+ err = ext4_journal_get_write_access(handle,
+ frames[0].bh);
+ if (err)
+ goto journal_error;
+
+ memcpy((char *) entries2, (char *) (entries + icount1),
+ icount2 * sizeof(struct dx_entry)); /* upper half moves to the new node */
+ dx_set_count(entries, icount1);
+ dx_set_count(entries2, icount2);
+ dx_set_limit(entries2, dx_node_limit(dir));
+
+ /* Which index block gets the new entry? */
+ if (at - entries >= icount1) {
+ frame->at = at = at - entries - icount1 + entries2;
+ frame->entries = entries = entries2;
+ swap(frame->bh, bh2);
+ }
+ dx_insert_block(frames + 0, hash2, newblock); /* link the new node into the root */
+ dxtrace(dx_show_index("node", frames[1].entries));
+ dxtrace(dx_show_index("node",
+ ((struct dx_node *) bh2->b_data)->entries));
+ err = ext4_handle_dirty_metadata(handle, dir, bh2);
+ if (err)
+ goto journal_error;
+ brelse (bh2);
+ } else {
+ dxtrace(printk(KERN_DEBUG
+ "Creating second level index...\n"));
+ memcpy((char *) entries2, (char *) entries,
+ icount * sizeof(struct dx_entry)); /* all root entries move down a level */
+ dx_set_limit(entries2, dx_node_limit(dir));
+
+ /* Set up root */
+ dx_set_count(entries, 1);
+ dx_set_block(entries + 0, newblock);
+ ((struct dx_root *) frames[0].bh->b_data)->info.indirect_levels = 1;
+
+ /* Add new access path frame */
+ frame = frames + 1;
+ frame->at = at = at - entries + entries2;
+ frame->entries = entries = entries2;
+ frame->bh = bh2; /* released later through dx_release() */
+ err = ext4_journal_get_write_access(handle,
+ frame->bh);
+ if (err)
+ goto journal_error;
+ }
+ err = ext4_handle_dirty_metadata(handle, dir, frames[0].bh);
+ if (err) {
+ ext4_std_error(inode->i_sb, err);
+ goto cleanup;
+ }
+ }
+ de = do_split(handle, dir, &bh, frame, &hinfo, &err);
+ if (!de) /* do_split() has already released bh and set it to NULL */
+ goto cleanup;
+ err = add_dirent_to_buf(handle, dentry, inode, de, bh);
+ goto cleanup;
+
+journal_error:
+ ext4_std_error(dir->i_sb, err);
+cleanup:
+ if (bh)
+ brelse(bh);
+ dx_release(frames);
+ return err;
+}
+
+/*
+ * ext4_delete_entry - remove the directory entry @de_del from block @bh.
+ *
+ * The block is walked from its first record, validating each record on the
+ * way.  When @de_del is reached it is either folded into the record that
+ * precedes it (the predecessor's rec_len grows to cover both), or, if it is
+ * the first record of the block, marked unused by zeroing its inode number.
+ *
+ * Returns 0 on success, -EIO if a record fails ext4_check_dir_entry(),
+ * -ENOENT if @de_del was not found in @bh, or the error returned by the
+ * journal calls.
+ */
+static int ext4_delete_entry(handle_t *handle,
+			     struct inode *dir,
+			     struct ext4_dir_entry_2 *de_del,
+			     struct buffer_head *bh)
+{
+	unsigned int blocksize = dir->i_sb->s_blocksize;
+	struct ext4_dir_entry_2 *cur = (struct ext4_dir_entry_2 *) bh->b_data;
+	struct ext4_dir_entry_2 *prev = NULL;
+	int off = 0;
+	int err;
+
+	while (off < bh->b_size) {
+		if (ext4_check_dir_entry(dir, NULL, cur, bh, off))
+			return -EIO;
+
+		if (cur != de_del) {
+			/* Not the victim: remember it and step forward. */
+			off += ext4_rec_len_from_disk(cur->rec_len, blocksize);
+			prev = cur;
+			cur = ext4_next_entry(cur, blocksize);
+			continue;
+		}
+
+		BUFFER_TRACE(bh, "get_write_access");
+		err = ext4_journal_get_write_access(handle, bh);
+		if (unlikely(err)) {
+			ext4_std_error(dir->i_sb, err);
+			return err;
+		}
+
+		if (!prev) {
+			/* First record of the block: just mark it unused. */
+			cur->inode = 0;
+		} else {
+			unsigned int merged;
+
+			merged = ext4_rec_len_from_disk(prev->rec_len,
+							blocksize) +
+				 ext4_rec_len_from_disk(cur->rec_len,
+							blocksize);
+			prev->rec_len = ext4_rec_len_to_disk(merged, blocksize);
+		}
+		dir->i_version++;
+
+		BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata");
+		err = ext4_handle_dirty_metadata(handle, dir, bh);
+		if (unlikely(err)) {
+			ext4_std_error(dir->i_sb, err);
+			return err;
+		}
+		return 0;
+	}
+	return -ENOENT;
+}
+
+/*
+ * Bump the link count of @inode.
+ *
+ * For an indexed (is_dx) inode the count is pinned at 1 and the DIR_NLINK
+ * feature is set once the incremented count 1) reaches EXT4_LINK_MAX or
+ * 2) equals 2, since the latter indicates that the count was previously 1.
+ */
+static void ext4_inc_count(handle_t *handle, struct inode *inode)
+{
+	inc_nlink(inode);
+
+	if (!is_dx(inode) || inode->i_nlink <= 1)
+		return;
+
+	/* limit is 16-bit i_links_count */
+	if (inode->i_nlink < EXT4_LINK_MAX && inode->i_nlink != 2)
+		return;
+
+	set_nlink(inode, 1);
+	EXT4_SET_RO_COMPAT_FEATURE(inode->i_sb,
+				   EXT4_FEATURE_RO_COMPAT_DIR_NLINK);
+}
+
+/*
+ * Drop one link from @inode, except for a directory whose count is
+ * already 2 or less: a directory with nlink == 1 has more than
+ * EXT4_LINK_MAX subdirectories and must stay at 1.
+ */
+static void ext4_dec_count(handle_t *handle, struct inode *inode)
+{
+	if (S_ISDIR(inode->i_mode) && inode->i_nlink <= 2)
+		return;
+
+	drop_nlink(inode);
+}
+
+
+/*
+ * Link the new inode @inode into the directory under the name in @dentry.
+ *
+ * On success the inode is marked dirty, bound to the dentry and unlocked.
+ * On failure its link count is dropped, it is unlocked and the reference
+ * is released with iput(); the error from ext4_add_entry() is returned.
+ */
+static int ext4_add_nondir(handle_t *handle,
+		struct dentry *dentry, struct inode *inode)
+{
+	int err = ext4_add_entry(handle, dentry, inode);
+
+	if (err) {
+		drop_nlink(inode);
+		unlock_new_inode(inode);
+		iput(inode);
+		return err;
+	}
+
+	ext4_mark_inode_dirty(handle, inode);
+	d_instantiate(dentry, inode);
+	unlock_new_inode(inode);
+	return 0;
+}
+
+/*
+ * ext4_create - create a regular file named by @dentry in @dir.
+ *
+ * When we get here the directory cache entry for the new file already
+ * exists but is still negative (no inode attached).  If the create
+ * succeeds, ext4_add_nondir() fills in the inode with d_instantiate().
+ *
+ * A failure with -ENOSPC is retried for as long as
+ * ext4_should_retry_alloc() allows.
+ */
+static int ext4_create(struct inode *dir, struct dentry *dentry, umode_t mode,
+		       struct nameidata *nd)
+{
+	handle_t *handle;
+	struct inode *inode;
+	int credits;
+	int err, retries = 0;
+
+	dquot_initialize(dir);
+
+retry:
+	credits = EXT4_DATA_TRANS_BLOCKS(dir->i_sb) +
+		  EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3 +
+		  EXT4_MAXQUOTAS_INIT_BLOCKS(dir->i_sb);
+	handle = ext4_journal_start(dir, credits);
+	if (IS_ERR(handle))
+		return PTR_ERR(handle);
+
+	if (IS_DIRSYNC(dir))
+		ext4_handle_sync(handle);
+
+	inode = ext4_new_inode(handle, dir, mode, &dentry->d_name, 0, NULL);
+	if (IS_ERR(inode)) {
+		err = PTR_ERR(inode);
+	} else {
+		inode->i_op = &ext4_file_inode_operations;
+		inode->i_fop = &ext4_file_operations;
+		ext4_set_aops(inode);
+		err = ext4_add_nondir(handle, dentry, inode);
+	}
+	ext4_journal_stop(handle);
+
+	if (err != -ENOSPC || !ext4_should_retry_alloc(dir->i_sb, &retries))
+		return err;
+	goto retry;
+}
+
+/*
+ * ext4_mknod - create a special inode named by @dentry in @dir.
+ *
+ * @rdev is rejected with -EINVAL unless new_valid_dev() accepts it.  The
+ * new inode is set up with init_special_inode() and linked into @dir via
+ * ext4_add_nondir().  A failure with -ENOSPC is retried for as long as
+ * ext4_should_retry_alloc() allows.
+ */
+static int ext4_mknod(struct inode *dir, struct dentry *dentry,
+		      umode_t mode, dev_t rdev)
+{
+	handle_t *handle;
+	struct inode *inode;
+	int credits;
+	int err, retries = 0;
+
+	if (!new_valid_dev(rdev))
+		return -EINVAL;
+
+	dquot_initialize(dir);
+
+retry:
+	credits = EXT4_DATA_TRANS_BLOCKS(dir->i_sb) +
+		  EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3 +
+		  EXT4_MAXQUOTAS_INIT_BLOCKS(dir->i_sb);
+	handle = ext4_journal_start(dir, credits);
+	if (IS_ERR(handle))
+		return PTR_ERR(handle);
+
+	if (IS_DIRSYNC(dir))
+		ext4_handle_sync(handle);
+
+	inode = ext4_new_inode(handle, dir, mode, &dentry->d_name, 0, NULL);
+	if (IS_ERR(inode)) {
+		err = PTR_ERR(inode);
+	} else {
+		init_special_inode(inode, inode->i_mode, rdev);
+#ifdef CONFIG_EXT4_FS_XATTR
+		inode->i_op = &ext4_special_inode_operations;
+#endif
+		err = ext4_add_nondir(handle, dentry, inode);
+	}
+	ext4_journal_stop(handle);
+
+	if (err != -ENOSPC || !ext4_should_retry_alloc(dir->i_sb, &retries))
+		return err;
+	goto retry;
+}
+
+/*
+ * ext4_mkdir - create the directory named by @dentry inside @dir.
+ *
+ * A new inode is allocated, its first block is filled with the "." and
+ * ".." entries, and the inode is then linked into @dir.  On success the
+ * parent's link count is bumped and the dentry is instantiated.
+ *
+ * Returns 0 on success, -EMLINK if @dir cannot take another subdirectory,
+ * or the error from the failing step.  A failure with -ENOSPC is retried
+ * for as long as ext4_should_retry_alloc() allows.
+ */
+static int ext4_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
+{
+	handle_t *handle;
+	struct inode *inode;
+	struct buffer_head *dir_block = NULL;
+	struct ext4_dir_entry_2 *de;
+	unsigned int blocksize = dir->i_sb->s_blocksize;
+	int err, retries = 0;
+
+	if (EXT4_DIR_LINK_MAX(dir))
+		return -EMLINK;
+
+	dquot_initialize(dir);
+
+retry:
+	handle = ext4_journal_start(dir, EXT4_DATA_TRANS_BLOCKS(dir->i_sb) +
+					EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3 +
+					EXT4_MAXQUOTAS_INIT_BLOCKS(dir->i_sb));
+	if (IS_ERR(handle))
+		return PTR_ERR(handle);
+
+	if (IS_DIRSYNC(dir))
+		ext4_handle_sync(handle);
+
+	inode = ext4_new_inode(handle, dir, S_IFDIR | mode,
+			       &dentry->d_name, 0, NULL);
+	err = PTR_ERR(inode);
+	if (IS_ERR(inode))
+		goto out_stop;
+
+	inode->i_op = &ext4_dir_inode_operations;
+	inode->i_fop = &ext4_dir_operations;
+	inode->i_size = EXT4_I(inode)->i_disksize = inode->i_sb->s_blocksize;
+	dir_block = ext4_bread(handle, inode, 0, 1, &err);
+	if (!dir_block)
+		goto out_clear_inode;
+	BUFFER_TRACE(dir_block, "get_write_access");
+	err = ext4_journal_get_write_access(handle, dir_block);
+	if (err)
+		goto out_clear_inode;
+
+	/* "." points at the new directory itself ... */
+	de = (struct ext4_dir_entry_2 *) dir_block->b_data;
+	de->inode = cpu_to_le32(inode->i_ino);
+	de->name_len = 1;
+	de->rec_len = ext4_rec_len_to_disk(EXT4_DIR_REC_LEN(de->name_len),
+					   blocksize);
+	strcpy(de->name, ".");
+	ext4_set_de_type(dir->i_sb, de, S_IFDIR);
+
+	/* ... and ".." at the parent, covering the rest of the block. */
+	de = ext4_next_entry(de, blocksize);
+	de->inode = cpu_to_le32(dir->i_ino);
+	de->rec_len = ext4_rec_len_to_disk(blocksize - EXT4_DIR_REC_LEN(1),
+					   blocksize);
+	de->name_len = 2;
+	strcpy(de->name, "..");
+	ext4_set_de_type(dir->i_sb, de, S_IFDIR);
+	set_nlink(inode, 2);
+	BUFFER_TRACE(dir_block, "call ext4_handle_dirty_metadata");
+	err = ext4_handle_dirty_metadata(handle, inode, dir_block);
+	if (err)
+		goto out_clear_inode;
+	err = ext4_mark_inode_dirty(handle, inode);
+	if (!err)
+		err = ext4_add_entry(handle, dentry, inode);
+	if (err)
+		goto out_clear_inode;
+
+	ext4_inc_count(handle, dir);
+	ext4_update_dx_flag(dir);
+	err = ext4_mark_inode_dirty(handle, dir);
+	if (err)
+		goto out_clear_inode;
+	d_instantiate(dentry, inode);
+	unlock_new_inode(inode);
+out_stop:
+	brelse(dir_block);
+	/*
+	 * Forget the buffer we just released: if we go round the retry
+	 * loop and fail before ext4_bread() assigns dir_block again, the
+	 * brelse() above must not see the stale pointer a second time.
+	 */
+	dir_block = NULL;
+	ext4_journal_stop(handle);
+	if (err == -ENOSPC && ext4_should_retry_alloc(dir->i_sb, &retries))
+		goto retry;
+	return err;
+
+out_clear_inode:
+	/* Undo the half-built inode, then share the common exit path. */
+	clear_nlink(inode);
+	unlock_new_inode(inode);
+	ext4_mark_inode_dirty(handle, inode);
+	iput(inode);
+	goto out_stop;
+}
+
+/*
+ * routine to check that the specified directory is empty (for rmdir)
+ *
+ * Returns 1 if the directory may be treated as empty, and 0 as soon as an
+ * entry with a non-zero inode number is found after "." and "..".
+ *
+ * Note that 1 is also returned when the directory cannot be inspected
+ * properly: i_size too small to hold "." and "..", block 0 not readable,
+ * or the first two entries not looking like "." and ".." (each of these
+ * cases is logged).  Later blocks that cannot be read, and the remainder
+ * of a block after an entry that fails ext4_check_dir_entry(), are skipped
+ * rather than being treated as non-empty.
+ */
+static int empty_dir(struct inode *inode)
+{
+ unsigned int offset;
+ struct buffer_head *bh;
+ struct ext4_dir_entry_2 *de, *de1;
+ struct super_block *sb;
+ int err = 0;
+
+ sb = inode->i_sb;
+ if (inode->i_size < EXT4_DIR_REC_LEN(1) + EXT4_DIR_REC_LEN(2) ||
+ !(bh = ext4_bread(NULL, inode, 0, 0, &err))) {
+ if (err)
+ EXT4_ERROR_INODE(inode,
+ "error %d reading directory lblock 0", err);
+ else
+ ext4_warning(inode->i_sb,
+ "bad directory (dir #%lu) - no data block",
+ inode->i_ino);
+ return 1;
+ }
+ de = (struct ext4_dir_entry_2 *) bh->b_data;
+ de1 = ext4_next_entry(de, sb->s_blocksize);
+ if (le32_to_cpu(de->inode) != inode->i_ino ||
+ !le32_to_cpu(de1->inode) ||
+ strcmp(".", de->name) ||
+ strcmp("..", de1->name)) {
+ ext4_warning(inode->i_sb,
+ "bad directory (dir #%lu) - no `.' or `..'",
+ inode->i_ino);
+ brelse(bh);
+ return 1;
+ }
+ offset = ext4_rec_len_from_disk(de->rec_len, sb->s_blocksize) +
+ ext4_rec_len_from_disk(de1->rec_len, sb->s_blocksize);
+ de = ext4_next_entry(de1, sb->s_blocksize);
+ while (offset < inode->i_size) {
+ if (!bh ||
+ (void *) de >= (void *) (bh->b_data+sb->s_blocksize)) {
+ unsigned int lblock;
+ err = 0;
+ brelse(bh);
+ lblock = offset >> EXT4_BLOCK_SIZE_BITS(sb);
+ bh = ext4_bread(NULL, inode, lblock, 0, &err);
+ if (!bh) {
+ if (err)
+ EXT4_ERROR_INODE(inode,
+ "error %d reading directory "
+ "lblock %u", err, lblock);
+ offset += sb->s_blocksize;
+ continue;
+ }
+ de = (struct ext4_dir_entry_2 *) bh->b_data;
+ }
+ if (ext4_check_dir_entry(inode, NULL, de, bh, offset)) {
+ de = (struct ext4_dir_entry_2 *)(bh->b_data +
+ sb->s_blocksize);
+ offset = (offset | (sb->s_blocksize - 1)) + 1;
+ continue;
+ }
+ if (le32_to_cpu(de->inode)) {
+ brelse(bh);
+ return 0;
+ }
+ offset += ext4_rec_len_from_disk(de->rec_len, sb->s_blocksize);
+ de = ext4_next_entry(de, sb->s_blocksize);
+ }
+ brelse(bh);
+ return 1;
+}
+
+/* ext4_orphan_add() links an unlinked or truncated inode into a list of
+ * such inodes, starting at the superblock, in case we crash before the
+ * file is closed/deleted, or in case the inode truncate spans multiple
+ * transactions and the last transaction is not recovered after a crash.
+ *
+ * At filesystem recovery time, we walk this list deleting unlinked
+ * inodes and truncating linked inodes in ext4_orphan_cleanup().
+ *
+ * Returns 0 on success, when @handle is not valid, or when the inode is
+ * already on the in-memory orphan list; otherwise the first error hit
+ * while updating the superblock or the inode.
+ */
+int ext4_orphan_add(handle_t *handle, struct inode *inode)
+{
+	struct super_block *sb = inode->i_sb;
+	struct ext4_iloc iloc;
+	int err = 0, rc;
+
+	if (!ext4_handle_valid(handle))
+		return 0;
+
+	mutex_lock(&EXT4_SB(sb)->s_orphan_lock);
+	if (!list_empty(&EXT4_I(inode)->i_orphan))
+		goto out_unlock;
+
+	/*
+	 * Orphan handling is only valid for files with data blocks
+	 * being truncated, or files being unlinked. Note that we either
+	 * hold i_mutex, or the inode can not be referenced from outside,
+	 * so i_nlink should not be bumped due to race
+	 */
+	J_ASSERT((S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) ||
+		  S_ISLNK(inode->i_mode)) || inode->i_nlink == 0);
+
+	BUFFER_TRACE(EXT4_SB(sb)->s_sbh, "get_write_access");
+	err = ext4_journal_get_write_access(handle, EXT4_SB(sb)->s_sbh);
+	if (err)
+		goto out_unlock;
+
+	err = ext4_reserve_inode_write(handle, inode, &iloc);
+	if (err)
+		goto out_unlock;
+	/*
+	 * Due to previous errors inode may be already a part of on-disk
+	 * orphan list. If so skip on-disk list modification.
+	 */
+	if (NEXT_ORPHAN(inode) && NEXT_ORPHAN(inode) <=
+		(le32_to_cpu(EXT4_SB(sb)->s_es->s_inodes_count))) {
+		/*
+		 * We are not going to call ext4_mark_iloc_dirty() on this
+		 * path, so release the buffer that
+		 * ext4_reserve_inode_write() handed us in iloc, the same
+		 * way ext4_orphan_del() does on its early-exit paths.
+		 */
+		brelse(iloc.bh);
+		goto mem_insert;
+	}
+
+	/* Insert this inode at the head of the on-disk orphan list... */
+	NEXT_ORPHAN(inode) = le32_to_cpu(EXT4_SB(sb)->s_es->s_last_orphan);
+	EXT4_SB(sb)->s_es->s_last_orphan = cpu_to_le32(inode->i_ino);
+	err = ext4_handle_dirty_metadata(handle, NULL, EXT4_SB(sb)->s_sbh);
+	rc = ext4_mark_iloc_dirty(handle, inode, &iloc);
+	if (!err)
+		err = rc;
+
+	/* Only add to the head of the in-memory list if all the
+	 * previous operations succeeded.  If the orphan_add is going to
+	 * fail (possibly taking the journal offline), we can't risk
+	 * leaving the inode on the orphan list: stray orphan-list
+	 * entries can cause panics at unmount time.
+	 *
+	 * This is safe: on error we're going to ignore the orphan list
+	 * anyway on the next recovery. */
+mem_insert:
+	if (!err)
+		list_add(&EXT4_I(inode)->i_orphan, &EXT4_SB(sb)->s_orphan);
+
+	jbd_debug(4, "superblock will point to %lu\n", inode->i_ino);
+	jbd_debug(4, "orphan inode %lu will point to %d\n",
+			inode->i_ino, NEXT_ORPHAN(inode));
+out_unlock:
+	mutex_unlock(&EXT4_SB(sb)->s_orphan_lock);
+	ext4_std_error(inode->i_sb, err);
+	return err;
+}
+
+/*
+ * ext4_orphan_del() removes an unlinked or truncated inode from the list
+ * of such inodes stored on disk, because it is finally being cleaned up.
+ *
+ * The inode is first taken off the in-memory list (under s_orphan_lock).
+ * The on-disk list is then patched so that whatever preceded the inode --
+ * the superblock's s_last_orphan, or the previous orphan inode -- points
+ * at the inode's successor.  If the filesystem has a journal but @handle
+ * is NULL, only the in-memory removal is done.
+ *
+ * Returns 0 on success (including when the inode was not on the list, or
+ * when @handle is set but not valid), otherwise the error from the
+ * journal/inode update calls.
+ */
+int ext4_orphan_del(handle_t *handle, struct inode *inode)
+{
+ struct list_head *prev;
+ struct ext4_inode_info *ei = EXT4_I(inode);
+ struct ext4_sb_info *sbi;
+ __u32 ino_next;
+ struct ext4_iloc iloc;
+ int err = 0;
+
+ /* ext4_handle_valid() assumes a valid handle_t pointer */
+ if (handle && !ext4_handle_valid(handle))
+ return 0;
+
+ mutex_lock(&EXT4_SB(inode->i_sb)->s_orphan_lock);
+ if (list_empty(&ei->i_orphan))
+ goto out;
+
+ ino_next = NEXT_ORPHAN(inode);
+ prev = ei->i_orphan.prev;
+ sbi = EXT4_SB(inode->i_sb);
+
+ jbd_debug(4, "remove inode %lu from orphan list\n", inode->i_ino);
+
+ list_del_init(&ei->i_orphan);
+
+ /* If we're on an error path, we may not have a valid
+ * transaction handle with which to update the orphan list on
+ * disk, but we still need to remove the inode from the linked
+ * list in memory. */
+ if (sbi->s_journal && !handle)
+ goto out;
+
+ err = ext4_reserve_inode_write(handle, inode, &iloc);
+ if (err)
+ goto out_err;
+
+ if (prev == &sbi->s_orphan) {
+ jbd_debug(4, "superblock will point to %u\n", ino_next);
+ BUFFER_TRACE(sbi->s_sbh, "get_write_access");
+ err = ext4_journal_get_write_access(handle, sbi->s_sbh);
+ if (err)
+ goto out_brelse;
+ sbi->s_es->s_last_orphan = cpu_to_le32(ino_next);
+ err = ext4_handle_dirty_metadata(handle, NULL, sbi->s_sbh);
+ } else {
+ struct ext4_iloc iloc2;
+ struct inode *i_prev =
+ &list_entry(prev, struct ext4_inode_info, i_orphan)->vfs_inode;
+
+ jbd_debug(4, "orphan inode %lu will point to %u\n",
+ i_prev->i_ino, ino_next);
+ err = ext4_reserve_inode_write(handle, i_prev, &iloc2);
+ if (err)
+ goto out_brelse;
+ NEXT_ORPHAN(i_prev) = ino_next;
+ err = ext4_mark_iloc_dirty(handle, i_prev, &iloc2);
+ }
+ if (err)
+ goto out_brelse;
+ NEXT_ORPHAN(inode) = 0;
+ err = ext4_mark_iloc_dirty(handle, inode, &iloc);
+
+out_err:
+ ext4_std_error(inode->i_sb, err);
+out:
+ mutex_unlock(&EXT4_SB(inode->i_sb)->s_orphan_lock);
+ return err;
+
+out_brelse:
+ brelse(iloc.bh);
+ goto out_err;
+}
+
+/*
+ * ext4_rmdir - remove the directory named by @dentry from @dir.
+ *
+ * Fails with -ENOENT if the name is not found in @dir, -EIO if the entry
+ * found does not refer to the dentry's inode, and -ENOTEMPTY if
+ * empty_dir() reports entries in the victim.  Otherwise the entry is
+ * deleted, the victim's link count is cleared, it is put on the orphan
+ * list, and both inodes are marked dirty.
+ */
+static int ext4_rmdir(struct inode *dir, struct dentry *dentry)
+{
+	struct ext4_dir_entry_2 *de;
+	struct buffer_head *bh;
+	struct inode *inode;
+	handle_t *handle;
+	int rc;
+
+	/* Initialize quotas before so that eventual writes go in
+	 * separate transaction */
+	dquot_initialize(dir);
+	dquot_initialize(dentry->d_inode);
+
+	handle = ext4_journal_start(dir, EXT4_DELETE_TRANS_BLOCKS(dir->i_sb));
+	if (IS_ERR(handle))
+		return PTR_ERR(handle);
+
+	bh = ext4_find_entry(dir, &dentry->d_name, &de);
+	if (!bh) {
+		rc = -ENOENT;
+		goto end_rmdir;
+	}
+
+	if (IS_DIRSYNC(dir))
+		ext4_handle_sync(handle);
+
+	inode = dentry->d_inode;
+
+	if (le32_to_cpu(de->inode) != inode->i_ino) {
+		rc = -EIO;
+		goto end_rmdir;
+	}
+
+	if (!empty_dir(inode)) {
+		rc = -ENOTEMPTY;
+		goto end_rmdir;
+	}
+
+	rc = ext4_delete_entry(handle, dir, de, bh);
+	if (rc)
+		goto end_rmdir;
+
+	if (!EXT4_DIR_LINK_EMPTY(inode))
+		ext4_warning(inode->i_sb,
+			     "empty directory has too many links (%d)",
+			     inode->i_nlink);
+	inode->i_version++;
+	clear_nlink(inode);
+	/* There's no need to set i_disksize: the fact that i_nlink is
+	 * zero will ensure that the right thing happens during any
+	 * recovery. */
+	inode->i_size = 0;
+	ext4_orphan_add(handle, inode);
+	inode->i_ctime = dir->i_ctime = dir->i_mtime = ext4_current_time(inode);
+	ext4_mark_inode_dirty(handle, inode);
+	ext4_dec_count(handle, dir);
+	ext4_update_dx_flag(dir);
+	ext4_mark_inode_dirty(handle, dir);
+
+end_rmdir:
+	ext4_journal_stop(handle);
+	brelse(bh);
+	return rc;
+}
+
+/*
+ * ext4_unlink - remove the name @dentry from @dir.
+ *
+ * Fails with -ENOENT if the name is not found in @dir and -EIO if the
+ * entry found does not refer to the dentry's inode.  Otherwise the entry
+ * is deleted and the inode loses one link; once its link count reaches
+ * zero it is put on the orphan list.
+ */
+static int ext4_unlink(struct inode *dir, struct dentry *dentry)
+{
+	struct ext4_dir_entry_2 *de;
+	struct buffer_head *bh;
+	struct inode *inode;
+	handle_t *handle;
+	int rc;
+
+	trace_ext4_unlink_enter(dir, dentry);
+	/* Initialize quotas before so that eventual writes go
+	 * in separate transaction */
+	dquot_initialize(dir);
+	dquot_initialize(dentry->d_inode);
+
+	handle = ext4_journal_start(dir, EXT4_DELETE_TRANS_BLOCKS(dir->i_sb));
+	if (IS_ERR(handle))
+		return PTR_ERR(handle);
+
+	if (IS_DIRSYNC(dir))
+		ext4_handle_sync(handle);
+
+	bh = ext4_find_entry(dir, &dentry->d_name, &de);
+	if (!bh) {
+		rc = -ENOENT;
+		goto end_unlink;
+	}
+
+	inode = dentry->d_inode;
+
+	if (le32_to_cpu(de->inode) != inode->i_ino) {
+		rc = -EIO;
+		goto end_unlink;
+	}
+
+	if (inode->i_nlink == 0) {
+		/* Complain, then pretend there is one link left to drop. */
+		ext4_warning(inode->i_sb,
+			     "Deleting nonexistent file (%lu), %d",
+			     inode->i_ino, inode->i_nlink);
+		set_nlink(inode, 1);
+	}
+
+	rc = ext4_delete_entry(handle, dir, de, bh);
+	if (rc)
+		goto end_unlink;
+
+	dir->i_ctime = dir->i_mtime = ext4_current_time(dir);
+	ext4_update_dx_flag(dir);
+	ext4_mark_inode_dirty(handle, dir);
+
+	drop_nlink(inode);
+	if (inode->i_nlink == 0)
+		ext4_orphan_add(handle, inode);
+	inode->i_ctime = ext4_current_time(inode);
+	ext4_mark_inode_dirty(handle, inode);
+	rc = 0;
+
+end_unlink:
+	ext4_journal_stop(handle);
+	brelse(bh);
+	trace_ext4_unlink_exit(dentry, rc);
+	return rc;
+}
+
+/*
+ * ext4_symlink - create the symbolic link @dentry in @dir pointing at
+ * @symname.
+ *
+ * A target that (with its terminating NUL) fits into EXT4_N_BLOCKS * 4
+ * bytes is stored directly in the inode's i_data ("fast" symlink).  A
+ * longer one is written through __page_symlink(), which needs two
+ * transactions with the inode parked on the orphan list in between.
+ *
+ * Returns -ENAMETOOLONG if the target does not fit into one block.  A
+ * failure with -ENOSPC reaching out_stop is retried for as long as
+ * ext4_should_retry_alloc() allows.
+ */
+static int ext4_symlink(struct inode *dir,
+			struct dentry *dentry, const char *symname)
+{
+	handle_t *handle;
+	struct inode *inode;
+	int len, err, retries = 0;
+	int credits;
+	int slow;
+
+	len = strlen(symname) + 1;
+	if (len > dir->i_sb->s_blocksize)
+		return -ENAMETOOLONG;
+
+	dquot_initialize(dir);
+
+	slow = len > EXT4_N_BLOCKS * 4;
+	if (!slow) {
+		/*
+		 * Fast symlink. We have to add entry to directory
+		 * (EXT4_DATA_TRANS_BLOCKS + EXT4_INDEX_EXTRA_TRANS_BLOCKS),
+		 * allocate new inode (bitmap, group descriptor, inode block,
+		 * quota blocks, sb is already counted in previous macros).
+		 */
+		credits = EXT4_DATA_TRANS_BLOCKS(dir->i_sb) +
+			  EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3 +
+			  EXT4_MAXQUOTAS_INIT_BLOCKS(dir->i_sb);
+	} else {
+		/*
+		 * For non-fast symlinks, we just allocate inode and put it on
+		 * orphan list in the first transaction => we need bitmap,
+		 * group descriptor, sb, inode block, quota blocks, and
+		 * possibly selinux xattr blocks.
+		 */
+		credits = 4 + EXT4_MAXQUOTAS_INIT_BLOCKS(dir->i_sb) +
+			  EXT4_XATTR_TRANS_BLOCKS;
+	}
+retry:
+	handle = ext4_journal_start(dir, credits);
+	if (IS_ERR(handle))
+		return PTR_ERR(handle);
+
+	if (IS_DIRSYNC(dir))
+		ext4_handle_sync(handle);
+
+	inode = ext4_new_inode(handle, dir, S_IFLNK|S_IRWXUGO,
+			       &dentry->d_name, 0, NULL);
+	if (IS_ERR(inode)) {
+		err = PTR_ERR(inode);
+		goto out_stop;
+	}
+
+	if (!slow) {
+		/* clear the extent format for fast symlink */
+		ext4_clear_inode_flag(inode, EXT4_INODE_EXTENTS);
+		inode->i_op = &ext4_fast_symlink_inode_operations;
+		memcpy((char *)&EXT4_I(inode)->i_data, symname, len);
+		inode->i_size = len - 1;
+	} else {
+		inode->i_op = &ext4_symlink_inode_operations;
+		ext4_set_aops(inode);
+		/*
+		 * We cannot call page_symlink() with transaction started
+		 * because it calls into ext4_write_begin() which can wait
+		 * for transaction commit if we are running out of space
+		 * and thus we deadlock. So we have to stop transaction now
+		 * and restart it when symlink contents is written.
+		 *
+		 * To keep fs consistent in case of crash, we have to put inode
+		 * to orphan list in the mean time.
+		 */
+		drop_nlink(inode);
+		err = ext4_orphan_add(handle, inode);
+		ext4_journal_stop(handle);
+		if (err)
+			goto err_drop_inode;
+		err = __page_symlink(inode, symname, len, 1);
+		if (err)
+			goto err_drop_inode;
+		/*
+		 * Now inode is being linked into dir (EXT4_DATA_TRANS_BLOCKS
+		 * + EXT4_INDEX_EXTRA_TRANS_BLOCKS), inode is also modified
+		 */
+		handle = ext4_journal_start(dir,
+				EXT4_DATA_TRANS_BLOCKS(dir->i_sb) +
+				EXT4_INDEX_EXTRA_TRANS_BLOCKS + 1);
+		if (IS_ERR(handle)) {
+			err = PTR_ERR(handle);
+			goto err_drop_inode;
+		}
+		set_nlink(inode, 1);
+		err = ext4_orphan_del(handle, inode);
+		if (err) {
+			ext4_journal_stop(handle);
+			clear_nlink(inode);
+			goto err_drop_inode;
+		}
+	}
+	EXT4_I(inode)->i_disksize = inode->i_size;
+	err = ext4_add_nondir(handle, dentry, inode);
+out_stop:
+	ext4_journal_stop(handle);
+	if (err == -ENOSPC && ext4_should_retry_alloc(dir->i_sb, &retries))
+		goto retry;
+	return err;
+err_drop_inode:
+	unlock_new_inode(inode);
+	iput(inode);
+	return err;
+}
+
+/*
+ * ext4_link - add the name @dentry in @dir as another hard link to the
+ * inode behind @old_dentry.
+ *
+ * Returns -EMLINK if the inode already has EXT4_LINK_MAX links.  The link
+ * count and an inode reference are taken before the directory entry is
+ * added and given back if that fails.  A failure with -ENOSPC is retried
+ * for as long as ext4_should_retry_alloc() allows.
+ */
+static int ext4_link(struct dentry *old_dentry,
+		     struct inode *dir, struct dentry *dentry)
+{
+	struct inode *inode = old_dentry->d_inode;
+	handle_t *handle;
+	int credits;
+	int err, retries = 0;
+
+	if (inode->i_nlink >= EXT4_LINK_MAX)
+		return -EMLINK;
+
+	dquot_initialize(dir);
+
+retry:
+	credits = EXT4_DATA_TRANS_BLOCKS(dir->i_sb) +
+		  EXT4_INDEX_EXTRA_TRANS_BLOCKS;
+	handle = ext4_journal_start(dir, credits);
+	if (IS_ERR(handle))
+		return PTR_ERR(handle);
+
+	if (IS_DIRSYNC(dir))
+		ext4_handle_sync(handle);
+
+	inode->i_ctime = ext4_current_time(inode);
+	ext4_inc_count(handle, inode);
+	ihold(inode);
+
+	err = ext4_add_entry(handle, dentry, inode);
+	if (err) {
+		/* Give back the link and the reference taken above. */
+		drop_nlink(inode);
+		iput(inode);
+	} else {
+		ext4_mark_inode_dirty(handle, inode);
+		d_instantiate(dentry, inode);
+	}
+	ext4_journal_stop(handle);
+
+	if (err != -ENOSPC || !ext4_should_retry_alloc(dir->i_sb, &retries))
+		return err;
+	goto retry;
+}
+
+/*
+ * Inode number field of the second entry in a directory block, i.e. of
+ * the ".." entry when @buffer is the first block of a directory.
+ */
+#define PARENT_INO(buffer, size) \
+	(ext4_next_entry((struct ext4_dir_entry_2 *)(buffer), size)->inode)
+
+/*
+ * Anybody can rename anything with this: the permission checks are left to the
+ * higher-level routines.
+ *
+ * Moves @old_dentry in @old_dir to the name @new_dentry in @new_dir,
+ * replacing an existing target if there is one.  When a directory is
+ * moved, its ".." entry is rewritten to point at @new_dir and the link
+ * counts of the parents are adjusted.
+ *
+ * Returns 0 on success or a negative errno; nothing has been renamed when
+ * one of the early checks fails.
+ */
+static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry,
+		       struct inode *new_dir, struct dentry *new_dentry)
+{
+	handle_t *handle;
+	struct inode *old_inode, *new_inode;
+	struct buffer_head *old_bh, *new_bh, *dir_bh;
+	struct ext4_dir_entry_2 *old_de, *new_de;
+	int retval, force_da_alloc = 0;
+
+	dquot_initialize(old_dir);
+	dquot_initialize(new_dir);
+
+	old_bh = new_bh = dir_bh = NULL;
+
+	/* Initialize quotas before so that eventual writes go
+	 * in separate transaction */
+	if (new_dentry->d_inode)
+		dquot_initialize(new_dentry->d_inode);
+	handle = ext4_journal_start(old_dir, 2 *
+					EXT4_DATA_TRANS_BLOCKS(old_dir->i_sb) +
+					EXT4_INDEX_EXTRA_TRANS_BLOCKS + 2);
+	if (IS_ERR(handle))
+		return PTR_ERR(handle);
+
+	if (IS_DIRSYNC(old_dir) || IS_DIRSYNC(new_dir))
+		ext4_handle_sync(handle);
+
+	old_bh = ext4_find_entry(old_dir, &old_dentry->d_name, &old_de);
+	/*
+	 *  Check for inode number is _not_ due to possible IO errors.
+	 *  We might rmdir the source, keep it as pwd of some process
+	 *  and merrily kill the link to whatever was created under the
+	 *  same name. Goodbye sticky bit ;-<
+	 */
+	old_inode = old_dentry->d_inode;
+	retval = -ENOENT;
+	if (!old_bh || le32_to_cpu(old_de->inode) != old_inode->i_ino)
+		goto end_rename;
+
+	new_inode = new_dentry->d_inode;
+	new_bh = ext4_find_entry(new_dir, &new_dentry->d_name, &new_de);
+	if (new_bh) {
+		if (!new_inode) {
+			brelse(new_bh);
+			new_bh = NULL;
+		}
+	}
+	if (S_ISDIR(old_inode->i_mode)) {
+		if (new_inode) {
+			retval = -ENOTEMPTY;
+			if (!empty_dir(new_inode))
+				goto end_rename;
+		}
+		retval = -EIO;
+		dir_bh = ext4_bread(handle, old_inode, 0, 0, &retval);
+		if (!dir_bh) {
+			/*
+			 * ext4_bread() owns retval as its error slot and may
+			 * hand back NULL with the slot left at 0 (empty_dir()
+			 * copes with the same case).  Never report that as a
+			 * successful rename.
+			 */
+			if (!retval)
+				retval = -EIO;
+			goto end_rename;
+		}
+		if (le32_to_cpu(PARENT_INO(dir_bh->b_data,
+				old_dir->i_sb->s_blocksize)) != old_dir->i_ino) {
+			/*
+			 * ".." does not point back at old_dir.  Set the error
+			 * explicitly: retval was passed to ext4_bread() above
+			 * and can no longer be relied on to hold -EIO.
+			 */
+			retval = -EIO;
+			goto end_rename;
+		}
+		retval = -EMLINK;
+		if (!new_inode && new_dir != old_dir &&
+		    EXT4_DIR_LINK_MAX(new_dir))
+			goto end_rename;
+		BUFFER_TRACE(dir_bh, "get_write_access");
+		retval = ext4_journal_get_write_access(handle, dir_bh);
+		if (retval)
+			goto end_rename;
+	}
+	if (!new_bh) {
+		retval = ext4_add_entry(handle, new_dentry, old_inode);
+		if (retval)
+			goto end_rename;
+	} else {
+		BUFFER_TRACE(new_bh, "get write access");
+		retval = ext4_journal_get_write_access(handle, new_bh);
+		if (retval)
+			goto end_rename;
+		new_de->inode = cpu_to_le32(old_inode->i_ino);
+		if (EXT4_HAS_INCOMPAT_FEATURE(new_dir->i_sb,
+					      EXT4_FEATURE_INCOMPAT_FILETYPE))
+			new_de->file_type = old_de->file_type;
+		new_dir->i_version++;
+		new_dir->i_ctime = new_dir->i_mtime =
+					ext4_current_time(new_dir);
+		ext4_mark_inode_dirty(handle, new_dir);
+		BUFFER_TRACE(new_bh, "call ext4_handle_dirty_metadata");
+		retval = ext4_handle_dirty_metadata(handle, new_dir, new_bh);
+		if (unlikely(retval)) {
+			ext4_std_error(new_dir->i_sb, retval);
+			goto end_rename;
+		}
+		brelse(new_bh);
+		new_bh = NULL;
+	}
+
+	/*
+	 * Like most other Unix systems, set the ctime for inodes on a
+	 * rename.
+	 */
+	old_inode->i_ctime = ext4_current_time(old_inode);
+	ext4_mark_inode_dirty(handle, old_inode);
+
+	/*
+	 * ok, that's it
+	 */
+	if (le32_to_cpu(old_de->inode) != old_inode->i_ino ||
+	    old_de->name_len != old_dentry->d_name.len ||
+	    strncmp(old_de->name, old_dentry->d_name.name, old_de->name_len) ||
+	    (retval = ext4_delete_entry(handle, old_dir,
+					old_de, old_bh)) == -ENOENT) {
+		/* old_de could have moved from under us during htree split, so
+		 * make sure that we are deleting the right entry.  We might
+		 * also be pointing to a stale entry in the unused part of
+		 * old_bh so just checking inum and the name isn't enough. */
+		struct buffer_head *old_bh2;
+		struct ext4_dir_entry_2 *old_de2;
+
+		old_bh2 = ext4_find_entry(old_dir, &old_dentry->d_name, &old_de2);
+		if (old_bh2) {
+			retval = ext4_delete_entry(handle, old_dir,
+						   old_de2, old_bh2);
+			brelse(old_bh2);
+		}
+	}
+	if (retval) {
+		ext4_warning(old_dir->i_sb,
+				"Deleting old file (%lu), %d, error=%d",
+				old_dir->i_ino, old_dir->i_nlink, retval);
+	}
+
+	if (new_inode) {
+		ext4_dec_count(handle, new_inode);
+		new_inode->i_ctime = ext4_current_time(new_inode);
+	}
+	old_dir->i_ctime = old_dir->i_mtime = ext4_current_time(old_dir);
+	ext4_update_dx_flag(old_dir);
+	if (dir_bh) {
+		/* A directory moved: repoint its ".." at the new parent. */
+		PARENT_INO(dir_bh->b_data, new_dir->i_sb->s_blocksize) =
+						cpu_to_le32(new_dir->i_ino);
+		BUFFER_TRACE(dir_bh, "call ext4_handle_dirty_metadata");
+		retval = ext4_handle_dirty_metadata(handle, old_inode, dir_bh);
+		if (retval) {
+			ext4_std_error(old_dir->i_sb, retval);
+			goto end_rename;
+		}
+		ext4_dec_count(handle, old_dir);
+		if (new_inode) {
+			/* checked empty_dir above, can't have another parent,
+			 * ext4_dec_count() won't work for many-linked dirs */
+			clear_nlink(new_inode);
+		} else {
+			ext4_inc_count(handle, new_dir);
+			ext4_update_dx_flag(new_dir);
+			ext4_mark_inode_dirty(handle, new_dir);
+		}
+	}
+	ext4_mark_inode_dirty(handle, old_dir);
+	if (new_inode) {
+		ext4_mark_inode_dirty(handle, new_inode);
+		if (!new_inode->i_nlink)
+			ext4_orphan_add(handle, new_inode);
+		if (!test_opt(new_dir->i_sb, NO_AUTO_DA_ALLOC))
+			force_da_alloc = 1;
+	}
+	retval = 0;
+
+end_rename:
+	brelse(dir_bh);
+	brelse(old_bh);
+	brelse(new_bh);
+	ext4_journal_stop(handle);
+	if (retval == 0 && force_da_alloc)
+		ext4_alloc_da_blocks(old_inode);
+	return retval;
+}
+
+/*
+ * directories can handle most operations...
+ *
+ * ext4_dir_inode_operations is the table ext4_mkdir() installs on newly
+ * created directories.  The smaller ext4_special_inode_operations table
+ * defined right after it is what ext4_mknod() installs on the inodes it
+ * creates when CONFIG_EXT4_FS_XATTR is enabled; it offers only setattr,
+ * the xattr calls and get_acl.
+ */
+const struct inode_operations ext4_dir_inode_operations = {
+ .create = ext4_create,
+ .lookup = ext4_lookup,
+ .link = ext4_link,
+ .unlink = ext4_unlink,
+ .symlink = ext4_symlink,
+ .mkdir = ext4_mkdir,
+ .rmdir = ext4_rmdir,
+ .mknod = ext4_mknod,
+ .rename = ext4_rename,
+ .setattr = ext4_setattr,
+#ifdef CONFIG_EXT4_FS_XATTR
+ .setxattr = generic_setxattr,
+ .getxattr = generic_getxattr,
+ .listxattr = ext4_listxattr,
+ .removexattr = generic_removexattr,
+#endif
+ .get_acl = ext4_get_acl,
+ .fiemap = ext4_fiemap,
+};
+
+const struct inode_operations ext4_special_inode_operations = {
+ .setattr = ext4_setattr,
+#ifdef CONFIG_EXT4_FS_XATTR
+ .setxattr = generic_setxattr,
+ .getxattr = generic_getxattr,
+ .listxattr = ext4_listxattr,
+ .removexattr = generic_removexattr,
+#endif
+ .get_acl = ext4_get_acl,
+};
diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c
new file mode 100644
index 00000000..dcdeef16
--- /dev/null
+++ b/fs/ext4/page-io.c
@@ -0,0 +1,433 @@
+/*
+ * linux/fs/ext4/page-io.c
+ *
+ * This contains the new page_io functions for ext4
+ *
+ * Written by Theodore Ts'o, 2010.
+ */
+
+#include <linux/fs.h>
+#include <linux/time.h>
+#include <linux/jbd2.h>
+#include <linux/highuid.h>
+#include <linux/pagemap.h>
+#include <linux/quotaops.h>
+#include <linux/string.h>
+#include <linux/buffer_head.h>
+#include <linux/writeback.h>
+#include <linux/pagevec.h>
+#include <linux/mpage.h>
+#include <linux/namei.h>
+#include <linux/uio.h>
+#include <linux/bio.h>
+#include <linux/workqueue.h>
+#include <linux/kernel.h>
+#include <linux/slab.h>
+
+#include "ext4_jbd2.h"
+#include "xattr.h"
+#include "acl.h"
+#include "ext4_extents.h"
+
+/*
+ * Slab caches for struct ext4_io_page and struct ext4_io_end; created in
+ * ext4_init_pageio() and destroyed in ext4_exit_pageio().
+ */
+static struct kmem_cache *io_page_cachep, *io_end_cachep;
+
+/*
+ * Create the two slab caches used by the page-io code.
+ *
+ * Returns 0 on success, or -ENOMEM if either cache cannot be created; on
+ * failure no cache is left allocated.
+ */
+int __init ext4_init_pageio(void)
+{
+	io_page_cachep = KMEM_CACHE(ext4_io_page, SLAB_RECLAIM_ACCOUNT);
+	if (!io_page_cachep)
+		return -ENOMEM;
+
+	io_end_cachep = KMEM_CACHE(ext4_io_end, SLAB_RECLAIM_ACCOUNT);
+	if (io_end_cachep)
+		return 0;
+
+	/* The second cache failed: undo the first before reporting the error. */
+	kmem_cache_destroy(io_page_cachep);
+	return -ENOMEM;
+}
+
+/* Destroy the slab caches created by ext4_init_pageio(). */
+void ext4_exit_pageio(void)
+{
+ kmem_cache_destroy(io_end_cachep);
+ kmem_cache_destroy(io_page_cachep);
+}
+
+/*
+ * Sleep until the inode's i_ioend_count drops to zero, i.e. until every
+ * io_end allocated against @inode has been freed again.
+ */
+void ext4_ioend_wait(struct inode *inode)
+{
+	struct ext4_inode_info *ei = EXT4_I(inode);
+	wait_queue_head_t *waitq = ext4_ioend_wq(inode);
+
+	wait_event(*waitq, atomic_read(&ei->i_ioend_count) == 0);
+}
+
+/*
+ * Drop one reference on @io_page.  The holder of the last reference ends
+ * writeback on the underlying page, releases the page reference and frees
+ * the tracking structure.
+ */
+static void put_io_page(struct ext4_io_page *io_page)
+{
+	struct page *page;
+
+	if (!atomic_dec_and_test(&io_page->p_count))
+		return;
+
+	page = io_page->p_page;
+	end_page_writeback(page);
+	put_page(page);
+	kmem_cache_free(io_page_cachep, io_page);
+}
+
+/*
+ * Release an io_end: drop the optional page reference and every io_page
+ * still attached, decrement the inode's i_ioend_count (waking waiters
+ * when it reaches zero) and return the structure to its slab cache.
+ *
+ * @io must not be NULL.
+ */
+void ext4_free_io_end(ext4_io_end_t *io)
+{
+	struct inode *inode;
+	int idx;
+
+	BUG_ON(!io);
+	inode = io->inode;
+
+	if (io->page)
+		put_page(io->page);
+
+	for (idx = 0; idx < io->num_io_pages; idx++)
+		put_io_page(io->pages[idx]);
+	io->num_io_pages = 0;
+
+	/* Last outstanding io_end for this inode: wake ext4_ioend_wait() */
+	if (atomic_dec_and_test(&EXT4_I(inode)->i_ioend_count))
+		wake_up_all(ext4_ioend_wq(inode));
+
+	kmem_cache_free(io_end_cachep, io);
+}
+
+/*
+ * check a range of space and convert unwritten extents to written.
+ *
+ * Called with inode->i_mutex; we depend on this when we manipulate
+ * io->flag, since we could otherwise race with ext4_flush_completed_IO()
+ *
+ * Converts the range [io->offset, io->offset + io->size) and then runs
+ * the completion steps below. A conversion failure is logged but does not
+ * skip those steps. Returns the result of the conversion call.
+ */
+int ext4_end_io_nolock(ext4_io_end_t *io)
+{
+ struct inode *inode = io->inode;
+ loff_t offset = io->offset;
+ ssize_t size = io->size;
+ int ret = 0;
+
+ ext4_debug("ext4_end_io_nolock: io 0x%p from inode %lu,list->next 0x%p,"
+ "list->prev 0x%p\n",
+ io, inode->i_ino, io->list.next, io->list.prev);
+
+ ret = ext4_convert_unwritten_extents(inode, offset, size);
+ if (ret < 0) {
+ ext4_msg(inode->i_sb, KERN_EMERG,
+ "failed to convert unwritten extents to written "
+ "extents -- potential data loss! "
+ "(inode %lu, offset %llu, size %zd, error %d)",
+ inode->i_ino, offset, size, ret);
+ }
+
+ /* Complete the AIO request, if one is attached to this io_end */
+ if (io->iocb)
+ aio_complete(io->iocb, io->result, 0);
+
+ /* Direct I/O io_ends also signal DIO completion on the inode */
+ if (io->flag & EXT4_IO_END_DIRECT)
+ inode_dio_done(inode);
+ /* Wake up anyone waiting on unwritten extent conversion */
+ if (atomic_dec_and_test(&EXT4_I(inode)->i_aiodio_unwritten))
+ wake_up_all(ext4_ioend_wq(io->inode));
+ return ret;
+}
+
+/*
+ * work on completed aio dio IO, to convert unwritten extents to extents
+ *
+ * Workqueue handler installed by ext4_init_io_end(). It either processes
+ * the io_end under inode->i_mutex and frees it, frees it directly when it
+ * is no longer on a list, or requeues itself when it cannot make progress.
+ */
+static void ext4_end_io_work(struct work_struct *work)
+{
+ ext4_io_end_t *io = container_of(work, ext4_io_end_t, work);
+ struct inode *inode = io->inode;
+ struct ext4_inode_info *ei = EXT4_I(inode);
+ unsigned long flags;
+
+ spin_lock_irqsave(&ei->i_completed_io_lock, flags);
+ /* Flagged as in-fsync: do not process it here, try again later */
+ if (io->flag & EXT4_IO_END_IN_FSYNC)
+ goto requeue;
+ /* Not on any list any more: nothing to convert, just free it */
+ if (list_empty(&io->list)) {
+ spin_unlock_irqrestore(&ei->i_completed_io_lock, flags);
+ goto free;
+ }
+
+ /* trylock only: this handler must not sleep on i_mutex */
+ if (!mutex_trylock(&inode->i_mutex)) {
+ bool was_queued;
+requeue:
+ /* Reached with i_completed_io_lock held, from either path above */
+ was_queued = !!(io->flag & EXT4_IO_END_QUEUED);
+ io->flag |= EXT4_IO_END_QUEUED;
+ spin_unlock_irqrestore(&ei->i_completed_io_lock, flags);
+ /*
+ * Requeue the work instead of waiting so that the work
+ * items queued after this can be processed.
+ */
+ queue_work(EXT4_SB(inode->i_sb)->dio_unwritten_wq, &io->work);
+ /*
+ * To prevent the ext4-dio-unwritten thread from keeping
+ * requeueing end_io requests and occupying cpu for too long,
+ * yield the cpu if it sees an end_io request that has already
+ * been requeued.
+ */
+ if (was_queued)
+ yield();
+ return;
+ }
+ /* i_mutex held: unlink from the completed list, then convert */
+ list_del_init(&io->list);
+ spin_unlock_irqrestore(&ei->i_completed_io_lock, flags);
+ (void) ext4_end_io_nolock(io);
+ mutex_unlock(&inode->i_mutex);
+free:
+ ext4_free_io_end(io);
+}
+
+/*
+ * Allocate a zeroed io_end for @inode using allocation flags @flags.
+ *
+ * On success the inode's i_ioend_count is incremented and the embedded
+ * work item and list head are initialised.  Returns NULL if the slab
+ * allocation fails, in which case the counter is left untouched.
+ */
+ext4_io_end_t *ext4_init_io_end(struct inode *inode, gfp_t flags)
+{
+	ext4_io_end_t *io_end = kmem_cache_zalloc(io_end_cachep, flags);
+
+	if (!io_end)
+		return NULL;
+
+	atomic_inc(&EXT4_I(inode)->i_ioend_count);
+	io_end->inode = inode;
+	INIT_WORK(&io_end->work, ext4_end_io_work);
+	INIT_LIST_HEAD(&io_end->list);
+
+	return io_end;
+}
+
+/*
+ * Print a buffer I/O error in the same format as fs/buffer.c.  This
+ * keeps dmesg scrapers that look for that specific message working.
+ * We really need a unified error reporting structure to userspace ala
+ * Digital Unix's uerf system, but it's probably not going to happen in
+ * my lifetime, due to LKML politics...
+ */
+static void buffer_io_error(struct buffer_head *bh)
+{
+	char devname[BDEVNAME_SIZE];
+	unsigned long long blocknr = (unsigned long long)bh->b_blocknr;
+
+	printk(KERN_ERR "Buffer I/O error on device %s, logical block %llu\n",
+	       bdevname(bh->b_bdev, devname), blocknr);
+}
+
+/*
+ * bio completion callback (installed as bi_end_io by io_submit_init()).
+ *
+ * Drops the bio, releases every io_page attached to the io_end (reporting
+ * per-buffer errors first if the bio failed), and then either frees the
+ * io_end or, when it covers unwritten extents, queues it for conversion.
+ *
+ * @bio: the completed bio; bi_private carries the ext4_io_end_t
+ * @error: completion status; forced to 0 when BIO_UPTODATE is set
+ */
+static void ext4_end_bio(struct bio *bio, int error)
+{
+ ext4_io_end_t *io_end = bio->bi_private;
+ struct workqueue_struct *wq;
+ struct inode *inode;
+ unsigned long flags;
+ int i;
+ /* Saved now because the bio is released before the warning below */
+ sector_t bi_sector = bio->bi_sector;
+
+ BUG_ON(!io_end);
+ bio->bi_private = NULL;
+ bio->bi_end_io = NULL;
+ if (test_bit(BIO_UPTODATE, &bio->bi_flags))
+ error = 0;
+ bio_put(bio);
+
+ for (i = 0; i < io_end->num_io_pages; i++) {
+ struct page *page = io_end->pages[i]->p_page;
+ struct buffer_head *bh, *head;
+ loff_t offset;
+ loff_t io_end_offset;
+
+ if (error) {
+ SetPageError(page);
+ set_bit(AS_EIO, &page->mapping->flags);
+ head = page_buffers(page);
+ BUG_ON(!head);
+
+ /* First byte past the range covered by this io_end */
+ io_end_offset = io_end->offset + io_end->size;
+
+ offset = (sector_t) page->index << PAGE_CACHE_SHIFT;
+ bh = head;
+ /* Report only buffers lying entirely inside the io_end range */
+ do {
+ if ((offset >= io_end->offset) &&
+ (offset+bh->b_size <= io_end_offset))
+ buffer_io_error(bh);
+
+ offset += bh->b_size;
+ bh = bh->b_this_page;
+ } while (bh != head);
+ }
+
+ put_io_page(io_end->pages[i]);
+ }
+ io_end->num_io_pages = 0;
+ inode = io_end->inode;
+
+ if (error) {
+ io_end->flag |= EXT4_IO_END_ERROR;
+ ext4_warning(inode->i_sb, "I/O error writing to inode %lu "
+ "(offset %llu size %ld starting block %llu)",
+ inode->i_ino,
+ (unsigned long long) io_end->offset,
+ (long) io_end->size,
+ (unsigned long long)
+ bi_sector >> (inode->i_blkbits - 9));
+ }
+
+ /* No unwritten extents to convert: the io_end is finished */
+ if (!(io_end->flag & EXT4_IO_END_UNWRITTEN)) {
+ ext4_free_io_end(io_end);
+ return;
+ }
+
+ /* Add the io_end to per-inode completed io list*/
+ spin_lock_irqsave(&EXT4_I(inode)->i_completed_io_lock, flags);
+ list_add_tail(&io_end->list, &EXT4_I(inode)->i_completed_io_list);
+ spin_unlock_irqrestore(&EXT4_I(inode)->i_completed_io_lock, flags);
+
+ wq = EXT4_SB(inode->i_sb)->dio_unwritten_wq;
+ /* queue the work to convert unwritten extents to written */
+ queue_work(wq, &io_end->work);
+}
+
+/*
+ * Submit the bio currently being built in @io (if there is one) and reset
+ * the submission state so the next buffer starts a fresh bio.
+ *
+ * An extra bio reference is held across submit_bio() so the flags can
+ * still be inspected after submission.
+ */
+void ext4_io_submit(struct ext4_io_submit *io)
+{
+	struct bio *bio = io->io_bio;
+
+	if (bio != NULL) {
+		bio_get(bio);
+		submit_bio(io->io_op, bio);
+		BUG_ON(bio_flagged(bio, BIO_EOPNOTSUPP));
+		bio_put(bio);
+	}
+
+	io->io_bio = NULL;
+	io->io_op = 0;
+	io->io_end = NULL;
+}
+
+/*
+ * Start a new bio, and the io_end that tracks it, for @bh and make them
+ * the current submission state in @io.
+ *
+ * @io:    submission state; io_bio, io_end, io_op and io_next_block are set
+ * @inode: inode the new io_end is accounted against
+ * @wbc:   writeback control; WB_SYNC_ALL selects WRITE_SYNC, otherwise WRITE
+ * @bh:    first buffer of the new bio; supplies the block device, the
+ *         starting sector and the starting file offset
+ *
+ * Returns 0 on success or -ENOMEM if the io_end cannot be allocated.
+ */
+static int io_submit_init(struct ext4_io_submit *io,
+			  struct inode *inode,
+			  struct writeback_control *wbc,
+			  struct buffer_head *bh)
+{
+	ext4_io_end_t *io_end;
+	struct page *page = bh->b_page;
+	int nvecs = bio_get_nr_vecs(bh->b_bdev);
+	struct bio *bio;
+
+	io_end = ext4_init_io_end(inode, GFP_NOFS);
+	if (!io_end)
+		return -ENOMEM;
+	bio = bio_alloc(GFP_NOIO, min(nvecs, BIO_MAX_PAGES));
+	/* b_blocknr is in filesystem blocks; bi_sector is in 512-byte units */
+	bio->bi_sector = bh->b_blocknr * (bh->b_size >> 9);
+	bio->bi_bdev = bh->b_bdev;
+	bio->bi_private = io->io_end = io_end;
+	bio->bi_end_io = ext4_end_bio;
+
+	/*
+	 * Widen page->index to loff_t before shifting.  The index is an
+	 * unsigned long, so on 32-bit builds the shift would otherwise be
+	 * done in 32 bits and wrap for file offsets of 4GiB and above.
+	 */
+	io_end->offset = ((loff_t) page->index << PAGE_CACHE_SHIFT) +
+		bh_offset(bh);
+
+	io->io_bio = bio;
+	io->io_op = (wbc->sync_mode == WB_SYNC_ALL ?  WRITE_SYNC : WRITE);
+	io->io_next_block = bh->b_blocknr;
+	return 0;
+}
+
+/*
+ * Add one buffer to the bio being built in @io, starting a new bio (and
+ * submitting the current one) whenever the buffer cannot be appended.
+ *
+ * @io: submission state
+ * @io_page: tracking structure for the page @bh belongs to
+ * @inode: inode being written
+ * @wbc: writeback control, passed on to io_submit_init()
+ * @bh: buffer to add
+ *
+ * Returns 0 on success (including when the buffer is skipped), or the
+ * error from io_submit_init().
+ */
+static int io_submit_add_bh(struct ext4_io_submit *io,
+ struct ext4_io_page *io_page,
+ struct inode *inode,
+ struct writeback_control *wbc,
+ struct buffer_head *bh)
+{
+ ext4_io_end_t *io_end;
+ int ret;
+
+ if (buffer_new(bh)) {
+ clear_buffer_new(bh);
+ unmap_underlying_metadata(bh->b_bdev, bh->b_blocknr);
+ }
+
+ /*
+ * Unmapped or delayed buffers are not written here; an unmapped one
+ * also loses its dirty bit. Any bio in progress is submitted because
+ * this buffer breaks the contiguous run.
+ */
+ if (!buffer_mapped(bh) || buffer_delay(bh)) {
+ if (!buffer_mapped(bh))
+ clear_buffer_dirty(bh);
+ if (io->io_bio)
+ ext4_io_submit(io);
+ return 0;
+ }
+
+ /* Not physically contiguous with the current bio: flush it first */
+ if (io->io_bio && bh->b_blocknr != io->io_next_block) {
+submit_and_retry:
+ /* Also the retry target used below when the bio or io_end is full */
+ ext4_io_submit(io);
+ }
+ if (io->io_bio == NULL) {
+ ret = io_submit_init(io, inode, wbc, bh);
+ if (ret)
+ return ret;
+ }
+ io_end = io->io_end;
+ /* io_end page array is full and this buffer is on a new page */
+ if ((io_end->num_io_pages >= MAX_IO_PAGES) &&
+ (io_end->pages[io_end->num_io_pages-1] != io_page))
+ goto submit_and_retry;
+ if (buffer_uninit(bh))
+ ext4_set_io_unwritten_flag(inode, io_end);
+ io->io_end->size += bh->b_size;
+ io->io_next_block++;
+ ret = bio_add_page(io->io_bio, bh->b_page, bh->b_size, bh_offset(bh));
+ /* The bio took less than the whole buffer: submit it and retry */
+ if (ret != bh->b_size)
+ goto submit_and_retry;
+ /* Record the page, and take a reference, once per io_end */
+ if ((io_end->num_io_pages == 0) ||
+ (io_end->pages[io_end->num_io_pages-1] != io_page)) {
+ io_end->pages[io_end->num_io_pages++] = io_page;
+ atomic_inc(&io_page->p_count);
+ }
+ return 0;
+}
+
+/*
+ * Queue the buffers of a locked page for writeback through @io.
+ *
+ * @io: submission state shared across pages
+ * @page: page to write; must be locked and not under writeback
+ * @len: number of bytes of the page to write; buffers starting at or
+ * beyond @len are zeroed instead of being submitted
+ * @wbc: writeback control
+ *
+ * The page is unlocked before returning on every path. Returns 0 on
+ * success or a negative error, in which case the page has been
+ * redirtied.
+ */
+int ext4_bio_write_page(struct ext4_io_submit *io,
+ struct page *page,
+ int len,
+ struct writeback_control *wbc)
+{
+ struct inode *inode = page->mapping->host;
+ unsigned block_start, block_end, blocksize;
+ struct ext4_io_page *io_page;
+ struct buffer_head *bh, *head;
+ int ret = 0;
+
+ blocksize = 1 << inode->i_blkbits;
+
+ BUG_ON(!PageLocked(page));
+ BUG_ON(PageWriteback(page));
+
+ io_page = kmem_cache_alloc(io_page_cachep, GFP_NOFS);
+ if (!io_page) {
+ set_page_dirty(page);
+ unlock_page(page);
+ return -ENOMEM;
+ }
+ io_page->p_page = page;
+ /* Initial reference is ours; dropped by put_io_page() at the end */
+ atomic_set(&io_page->p_count, 1);
+ get_page(page);
+ set_page_writeback(page);
+ ClearPageError(page);
+
+ /*
+ * Walk the circular buffer list exactly once: "!block_start" lets the
+ * first iteration through even though bh == head at that point.
+ */
+ for (bh = head = page_buffers(page), block_start = 0;
+ bh != head || !block_start;
+ block_start = block_end, bh = bh->b_this_page) {
+
+ block_end = block_start + blocksize;
+ if (block_start >= len) {
+ /*
+ * Comments copied from block_write_full_page_endio:
+ *
+ * The page straddles i_size. It must be zeroed out on
+ * each and every writepage invocation because it may
+ * be mmapped. "A file is mapped in multiples of the
+ * page size. For a file that is not a multiple of
+ * the page size, the remaining memory is zeroed when
+ * mapped, and writes to that region are not written
+ * out to the file."
+ */
+ zero_user_segment(page, block_start, block_end);
+ clear_buffer_dirty(bh);
+ set_buffer_uptodate(bh);
+ continue;
+ }
+ clear_buffer_dirty(bh);
+ ret = io_submit_add_bh(io, io_page, inode, wbc, bh);
+ if (ret) {
+ /*
+ * We only get here on ENOMEM. Not much else
+ * we can do but mark the page as dirty, and
+ * better luck next time.
+ */
+ set_page_dirty(page);
+ break;
+ }
+ }
+ unlock_page(page);
+ /*
+ * If the page was truncated before we could do the writeback,
+ * or we had a memory allocation error while trying to write
+ * the first buffer head, we won't have submitted any pages for
+ * I/O. In that case we need to make sure we've cleared the
+ * PageWriteback bit from the page to prevent the system from
+ * wedging later on.
+ */
+ put_io_page(io_page);
+ return ret;
+}
diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c
new file mode 100644
index 00000000..53589ff8
--- /dev/null
+++ b/fs/ext4/resize.c
@@ -0,0 +1,1689 @@
+/*
+ * linux/fs/ext4/resize.c
+ *
+ * Support for resizing an ext4 filesystem while it is mounted.
+ *
+ * Copyright (C) 2001, 2002 Andreas Dilger <adilger@clusterfs.com>
+ *
+ * This could probably be made into a module, because it is not often in use.
+ */
+
+
+#define EXT4FS_DEBUG
+
+#include <linux/errno.h>
+#include <linux/slab.h>
+
+#include "ext4_jbd2.h"
+
+/*
+ * Gate the start of an online resize.
+ *
+ * Returns 0 when the caller may proceed (the EXT4_RESIZING bit is then
+ * held and must be released with ext4_resize_end()), -EPERM when the
+ * caller lacks CAP_SYS_RESOURCE or the filesystem has recorded errors,
+ * or -EBUSY when another resize already holds the bit.
+ */
+int ext4_resize_begin(struct super_block *sb)
+{
+	struct ext4_sb_info *sbi = EXT4_SB(sb);
+
+	if (!capable(CAP_SYS_RESOURCE))
+		return -EPERM;
+
+	/*
+	 * We are not allowed to do online-resizing on a filesystem mounted
+	 * with error, because it can destroy the filesystem easily.
+	 */
+	if (sbi->s_mount_state & EXT4_ERROR_FS) {
+		ext4_warning(sb, "There are errors in the filesystem, "
+			     "so online resizing is not allowed\n");
+		return -EPERM;
+	}
+
+	if (test_and_set_bit_lock(EXT4_RESIZING, &sbi->s_resize_flags))
+		return -EBUSY;
+
+	return 0;
+}
+
+/*
+ * Release the EXT4_RESIZING bit taken by ext4_resize_begin(), followed by
+ * an explicit memory barrier.
+ */
+void ext4_resize_end(struct super_block *sb)
+{
+ clear_bit_unlock(EXT4_RESIZING, &EXT4_SB(sb)->s_resize_flags);
+ smp_mb__after_clear_bit();
+}
+
+/*
+ * Range tests on the half-open interval [first, last).
+ * Note that @b is evaluated twice, so pass only side-effect-free
+ * expressions.
+ */
+#define outside(b, first, last) ((b) < (first) || (b) >= (last))
+#define inside(b, first, last) ((b) >= (first) && (b) < (last))
+
+/*
+ * Sanity-check the description of a new block group before it is added.
+ *
+ * @sb: super block of the filesystem being resized
+ * @input: description of the new group; input->free_blocks_count is
+ * computed and stored here as a side effect
+ *
+ * The checks run as one else-if chain: the first failing check logs a
+ * warning and the function returns -EINVAL. Returns 0 only when every
+ * check passes.
+ */
+static int verify_group_input(struct super_block *sb,
+ struct ext4_new_group_data *input)
+{
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
+ struct ext4_super_block *es = sbi->s_es;
+ /* The new group occupies [start, end) right after the current last block */
+ ext4_fsblk_t start = ext4_blocks_count(es);
+ ext4_fsblk_t end = start + input->blocks_count;
+ ext4_group_t group = input->group;
+ /* One past the last inode table block */
+ ext4_fsblk_t itend = input->inode_table + sbi->s_itb_per_group;
+ /* Superblock copy + GDT blocks + reserved GDT blocks, if the group has them */
+ unsigned overhead = ext4_bg_has_super(sb, group) ?
+ (1 + ext4_bg_num_gdb(sb, group) +
+ le16_to_cpu(es->s_reserved_gdt_blocks)) : 0;
+ ext4_fsblk_t metaend = start + overhead;
+ struct buffer_head *bh = NULL;
+ ext4_grpblk_t free_blocks_count, offset;
+ int err = -EINVAL;
+
+ /* The "2" accounts for the block bitmap and the inode bitmap */
+ input->free_blocks_count = free_blocks_count =
+ input->blocks_count - 2 - overhead - sbi->s_itb_per_group;
+
+ if (test_opt(sb, DEBUG))
+ printk(KERN_DEBUG "EXT4-fs: adding %s group %u: %u blocks "
+ "(%d free, %u reserved)\n",
+ ext4_bg_has_super(sb, input->group) ? "normal" :
+ "no-super", input->group, input->blocks_count,
+ free_blocks_count, input->reserved_blocks);
+
+ /* offset != 0 means the current last group is not completely filled */
+ ext4_get_group_no_and_offset(sb, start, NULL, &offset);
+ if (group != sbi->s_groups_count)
+ ext4_warning(sb, "Cannot add at group %u (only %u groups)",
+ input->group, sbi->s_groups_count);
+ else if (offset != 0)
+ ext4_warning(sb, "Last group not full");
+ else if (input->reserved_blocks > input->blocks_count / 5)
+ ext4_warning(sb, "Reserved blocks too high (%u)",
+ input->reserved_blocks);
+ else if (free_blocks_count < 0)
+ ext4_warning(sb, "Bad blocks count %u",
+ input->blocks_count);
+ else if (!(bh = sb_bread(sb, end - 1)))
+ ext4_warning(sb, "Cannot read last block (%llu)",
+ end - 1);
+ else if (outside(input->block_bitmap, start, end))
+ ext4_warning(sb, "Block bitmap not in group (block %llu)",
+ (unsigned long long)input->block_bitmap);
+ else if (outside(input->inode_bitmap, start, end))
+ ext4_warning(sb, "Inode bitmap not in group (block %llu)",
+ (unsigned long long)input->inode_bitmap);
+ else if (outside(input->inode_table, start, end) ||
+ outside(itend - 1, start, end))
+ ext4_warning(sb, "Inode table not in group (blocks %llu-%llu)",
+ (unsigned long long)input->inode_table, itend - 1);
+ else if (input->inode_bitmap == input->block_bitmap)
+ ext4_warning(sb, "Block bitmap same as inode bitmap (%llu)",
+ (unsigned long long)input->block_bitmap);
+ else if (inside(input->block_bitmap, input->inode_table, itend))
+ ext4_warning(sb, "Block bitmap (%llu) in inode table "
+ "(%llu-%llu)",
+ (unsigned long long)input->block_bitmap,
+ (unsigned long long)input->inode_table, itend - 1);
+ else if (inside(input->inode_bitmap, input->inode_table, itend))
+ ext4_warning(sb, "Inode bitmap (%llu) in inode table "
+ "(%llu-%llu)",
+ (unsigned long long)input->inode_bitmap,
+ (unsigned long long)input->inode_table, itend - 1);
+ else if (inside(input->block_bitmap, start, metaend))
+ ext4_warning(sb, "Block bitmap (%llu) in GDT table (%llu-%llu)",
+ (unsigned long long)input->block_bitmap,
+ start, metaend - 1);
+ else if (inside(input->inode_bitmap, start, metaend))
+ ext4_warning(sb, "Inode bitmap (%llu) in GDT table (%llu-%llu)",
+ (unsigned long long)input->inode_bitmap,
+ start, metaend - 1);
+ else if (inside(input->inode_table, start, metaend) ||
+ inside(itend - 1, start, metaend))
+ ext4_warning(sb, "Inode table (%llu-%llu) overlaps GDT table "
+ "(%llu-%llu)",
+ (unsigned long long)input->inode_table,
+ itend - 1, start, metaend - 1);
+ else
+ err = 0;
+ /* bh is still NULL if the chain stopped before the sb_bread() check */
+ brelse(bh);
+
+ return err;
+}
+
+/*
+ * ext4_new_flex_group_data is used by the 64bit-resize interface to add
+ * one flex group at a time.
+ *
+ * @groups and @bg_flags are parallel arrays with @count entries each.
+ */
+struct ext4_new_flex_group_data {
+ /* new_group_data for the groups in the flex group */
+ struct ext4_new_group_data *groups;
+ /* block group flags of the groups in @groups */
+ __u16 *bg_flags;
+ /* number of groups in @groups */
+ ext4_group_t count;
+};
+
+/*
+ * alloc_flex_gd() allocates an ext4_new_flex_group_data together with its
+ * @groups and @bg_flags arrays, each sized for @flexbg_size groups.
+ *
+ * @flexbg_size: number of groups the structure must be able to describe
+ *
+ * Returns NULL on failure (allocation failure, or @flexbg_size so large
+ * that the array size computation could overflow), otherwise the address
+ * of the allocated structure.  Release it with free_flex_gd().
+ */
+static struct ext4_new_flex_group_data *alloc_flex_gd(unsigned long flexbg_size)
+{
+	struct ext4_new_flex_group_data *flex_gd;
+
+	flex_gd = kmalloc(sizeof(*flex_gd), GFP_NOFS);
+	if (flex_gd == NULL)
+		goto out3;
+
+	/*
+	 * Overflow guard for the array allocations below.  The bound must be
+	 * derived from the element that is actually multiplied, i.e. one
+	 * entry of @groups (the larger of the two arrays' element types),
+	 * not from the container structure.
+	 */
+	if (flexbg_size >= UINT_MAX / sizeof(*flex_gd->groups))
+		goto out2;
+	flex_gd->count = flexbg_size;
+
+	flex_gd->groups = kmalloc(sizeof(*flex_gd->groups) * flexbg_size,
+				  GFP_NOFS);
+	if (flex_gd->groups == NULL)
+		goto out2;
+
+	flex_gd->bg_flags = kmalloc(flexbg_size * sizeof(*flex_gd->bg_flags),
+				    GFP_NOFS);
+	if (flex_gd->bg_flags == NULL)
+		goto out1;
+
+	return flex_gd;
+
+out1:
+	kfree(flex_gd->groups);
+out2:
+	kfree(flex_gd);
+out3:
+	return NULL;
+}
+
+/*
+ * Free a structure obtained from alloc_flex_gd(), including both arrays.
+ * @flex_gd must not be NULL: it is dereferenced unconditionally.
+ */
+static void free_flex_gd(struct ext4_new_flex_group_data *flex_gd)
+{
+ kfree(flex_gd->bg_flags);
+ kfree(flex_gd->groups);
+ kfree(flex_gd);
+}
+
+/*
+ * ext4_alloc_group_tables() allocates block bitmaps, inode bitmaps
+ * and inode tables for a flex group.
+ *
+ * This function is used by 64bit-resize. Note that this function allocates
+ * group tables from the 1st group of groups contained by @flexgd, which may
+ * be a partial of a flex group.
+ *
+ * @sb: super block of fs to which the groups belongs
+ * @flex_gd: groups to lay out; block_bitmap, inode_bitmap, inode_table and
+ * free_blocks_count of each entry are updated, and
+ * EXT4_BG_BLOCK_UNINIT is cleared in bg_flags for groups that
+ * end up hosting a table (only when @flexbg_size > 1)
+ * @flexbg_size: number of groups per flex group
+ */
+static void ext4_alloc_group_tables(struct super_block *sb,
+ struct ext4_new_flex_group_data *flex_gd,
+ int flexbg_size)
+{
+ struct ext4_new_group_data *group_data = flex_gd->groups;
+ struct ext4_super_block *es = EXT4_SB(sb)->s_es;
+ ext4_fsblk_t start_blk;
+ ext4_fsblk_t last_blk;
+ ext4_group_t src_group;
+ /*
+ * Next entry still needing a block bitmap / inode bitmap / inode
+ * table. These persist across jumps back to next_group, so each
+ * loop below resumes where it stopped.
+ */
+ ext4_group_t bb_index = 0;
+ ext4_group_t ib_index = 0;
+ ext4_group_t it_index = 0;
+ ext4_group_t group;
+ ext4_group_t last_group;
+ unsigned overhead;
+
+ BUG_ON(flex_gd->count == 0 || group_data == NULL);
+
+ src_group = group_data[0].group;
+ last_group = src_group + flex_gd->count - 1;
+
+ /*
+ * First and last group must share the same flexbg_size-aligned window.
+ * NOTE(review): the mask arithmetic assumes flexbg_size is a power of
+ * two; confirm against the callers.
+ */
+ BUG_ON((flexbg_size > 1) && ((src_group & ~(flexbg_size - 1)) !=
+ (last_group & ~(flexbg_size - 1))));
+next_group:
+ /*
+ * (Re)start a contiguous run of blocks at src_group. Re-entered
+ * whenever one of the loops below runs out of room in the current run.
+ */
+ group = group_data[0].group;
+ start_blk = ext4_group_first_block_no(sb, src_group);
+ last_blk = start_blk + group_data[src_group - group].blocks_count;
+
+ overhead = ext4_bg_has_super(sb, src_group) ?
+ (1 + ext4_bg_num_gdb(sb, src_group) +
+ le16_to_cpu(es->s_reserved_gdt_blocks)) : 0;
+
+ /* Skip the superblock/GDT area at the start of the run, if any */
+ start_blk += overhead;
+
+ BUG_ON(src_group >= group_data[0].group + flex_gd->count);
+ /* We collect contiguous blocks as much as possible. */
+ src_group++;
+ /* Extend the run over following groups until one has a superblock */
+ for (; src_group <= last_group; src_group++)
+ if (!ext4_bg_has_super(sb, src_group))
+ last_blk += group_data[src_group - group].blocks_count;
+ else
+ break;
+
+ /* Allocate block bitmaps */
+ for (; bb_index < flex_gd->count; bb_index++) {
+ if (start_blk >= last_blk)
+ goto next_group;
+ group_data[bb_index].block_bitmap = start_blk++;
+ /* Charge the block to the group that physically contains it */
+ ext4_get_group_no_and_offset(sb, start_blk - 1, &group, NULL);
+ group -= group_data[0].group;
+ group_data[group].free_blocks_count--;
+ if (flexbg_size > 1)
+ flex_gd->bg_flags[group] &= ~EXT4_BG_BLOCK_UNINIT;
+ }
+
+ /* Allocate inode bitmaps */
+ for (; ib_index < flex_gd->count; ib_index++) {
+ if (start_blk >= last_blk)
+ goto next_group;
+ group_data[ib_index].inode_bitmap = start_blk++;
+ ext4_get_group_no_and_offset(sb, start_blk - 1, &group, NULL);
+ group -= group_data[0].group;
+ group_data[group].free_blocks_count--;
+ if (flexbg_size > 1)
+ flex_gd->bg_flags[group] &= ~EXT4_BG_BLOCK_UNINIT;
+ }
+
+ /* Allocate inode tables */
+ for (; it_index < flex_gd->count; it_index++) {
+ /* A whole inode table must fit in what is left of the run */
+ if (start_blk + EXT4_SB(sb)->s_itb_per_group > last_blk)
+ goto next_group;
+ group_data[it_index].inode_table = start_blk;
+ ext4_get_group_no_and_offset(sb, start_blk, &group, NULL);
+ group -= group_data[0].group;
+ group_data[group].free_blocks_count -=
+ EXT4_SB(sb)->s_itb_per_group;
+ if (flexbg_size > 1)
+ flex_gd->bg_flags[group] &= ~EXT4_BG_BLOCK_UNINIT;
+
+ start_blk += EXT4_SB(sb)->s_itb_per_group;
+ }
+
+ if (test_opt(sb, DEBUG)) {
+ int i;
+ group = group_data[0].group;
+
+ printk(KERN_DEBUG "EXT4-fs: adding a flex group with "
+ "%d groups, flexbg size is %d:\n", flex_gd->count,
+ flexbg_size);
+
+ for (i = 0; i < flex_gd->count; i++) {
+ printk(KERN_DEBUG "adding %s group %u: %u "
+ "blocks (%d free)\n",
+ ext4_bg_has_super(sb, group + i) ? "normal" :
+ "no-super", group + i,
+ group_data[i].blocks_count,
+ group_data[i].free_blocks_count);
+ }
+ }
+}
+
+/*
+ * Get the buffer for block @blk, obtain journal write access to it under
+ * @handle, zero its contents and mark it up to date.
+ *
+ * Returns the buffer (with a reference the caller must brelse()), or an
+ * ERR_PTR: -EIO if the buffer cannot be obtained, otherwise the error
+ * from ext4_journal_get_write_access().
+ */
+static struct buffer_head *bclean(handle_t *handle, struct super_block *sb,
+				  ext4_fsblk_t blk)
+{
+	struct buffer_head *bh = sb_getblk(sb, blk);
+	int err;
+
+	if (!bh)
+		return ERR_PTR(-EIO);
+
+	err = ext4_journal_get_write_access(handle, bh);
+	if (err) {
+		brelse(bh);
+		return ERR_PTR(err);
+	}
+
+	memset(bh->b_data, 0, sb->s_blocksize);
+	set_buffer_uptodate(bh);
+	return bh;
+}
+
+/*
+ * Make sure @handle has at least @thresh credits.
+ *
+ * If it does not, try to extend the handle by EXT4_MAX_TRANS_DATA; if the
+ * extension is refused (positive return), restart the transaction with
+ * EXT4_MAX_TRANS_DATA credits instead.
+ *
+ * Returns 0 on success or a negative/non-zero error from the journal
+ * extend or restart call.
+ */
+static int extend_or_restart_transaction(handle_t *handle, int thresh)
+{
+	int rc;
+
+	if (ext4_handle_has_enough_credits(handle, thresh))
+		return 0;
+
+	rc = ext4_journal_extend(handle, EXT4_MAX_TRANS_DATA);
+	/* 0: extended successfully; negative: hard error */
+	if (rc <= 0)
+		return rc;
+
+	/* Positive: could not extend in place, fall back to a restart */
+	return ext4_journal_restart(handle, EXT4_MAX_TRANS_DATA);
+}
+
+/*
+ * set_flexbg_block_bitmap() marks @count blocks starting from @block as
+ * used in the block bitmaps of the new groups.
+ *
+ * Helper function for setup_new_flex_group_blocks().  The range may span
+ * several groups; it is processed one group's bitmap at a time.
+ *
+ * @sb: super block
+ * @handle: journal handle
+ * @flex_gd: flex group data
+ * @block: first block to mark
+ * @count: number of blocks to mark
+ *
+ * Returns 0 on success or a negative error.  No buffer reference is held
+ * on return, on either the success or the error paths.
+ */
+static int set_flexbg_block_bitmap(struct super_block *sb, handle_t *handle,
+			struct ext4_new_flex_group_data *flex_gd,
+			ext4_fsblk_t block, ext4_group_t count)
+{
+	ext4_group_t count2;
+
+	ext4_debug("mark blocks [%llu/%u] used\n", block, count);
+	for (count2 = count; count > 0; count -= count2, block += count2) {
+		ext4_fsblk_t start;
+		struct buffer_head *bh;
+		ext4_group_t group;
+		int err;
+
+		ext4_get_group_no_and_offset(sb, block, &group, NULL);
+		start = ext4_group_first_block_no(sb, group);
+		/* Turn the absolute group number into an index into @flex_gd */
+		group -= flex_gd->groups[0].group;
+
+		/* Clamp this step to what is left of the group's bitmap */
+		count2 = sb->s_blocksize * 8 - (block - start);
+		if (count2 > count)
+			count2 = count;
+
+		/* An uninitialised bitmap has nothing on disk to update */
+		if (flex_gd->bg_flags[group] & EXT4_BG_BLOCK_UNINIT) {
+			BUG_ON(flex_gd->count > 1);
+			continue;
+		}
+
+		err = extend_or_restart_transaction(handle, 1);
+		if (err)
+			return err;
+
+		bh = sb_getblk(sb, flex_gd->groups[group].block_bitmap);
+		if (!bh)
+			return -EIO;
+
+		err = ext4_journal_get_write_access(handle, bh);
+		if (err) {
+			/* do not leak the reference taken by sb_getblk() */
+			brelse(bh);
+			return err;
+		}
+		ext4_debug("mark block bitmap %#04llx (+%llu/%u)\n", block,
+			   block - start, count2);
+		ext4_set_bits(bh->b_data, block - start, count2);
+
+		err = ext4_handle_dirty_metadata(handle, NULL, bh);
+		/* released on both the success and the failure path */
+		brelse(bh);
+		if (unlikely(err))
+			return err;
+	}
+
+	return 0;
+}
+
+/*
+ * Set up the block and inode bitmaps, and the inode table for the new groups.
+ * This doesn't need to be part of the main transaction, since we are only
+ * changing blocks outside the actual filesystem.  We still do journaling to
+ * ensure the recovery is correct in case of a failure just after resize.
+ * If any part of this fails, we simply abort the resize.
+ *
+ * setup_new_flex_group_blocks handles a flex group as follows:
+ *  1. copy super block and GDT, and initialize group tables if necessary.
+ *     In this step, we only set bits in blocks bitmaps for blocks taken by
+ *     super block and GDT.
+ *  2. allocate group tables in block bitmaps, that is, set bits in block
+ *     bitmap for blocks taken by group tables.
+ *
+ * @sb: super block
+ * @flex_gd: the new groups; the first one must be the group right after
+ *           the current last group (sbi->s_groups_count)
+ *
+ * Returns 0 on success or a negative error.  @bh is kept either NULL or
+ * holding exactly one valid reference whenever control can reach the
+ * "out" label, so the brelse() there is always safe.
+ */
+static int setup_new_flex_group_blocks(struct super_block *sb,
+				struct ext4_new_flex_group_data *flex_gd)
+{
+	/* Block counts of the three tables, in struct field order */
+	int group_table_count[] = {1, 1, EXT4_SB(sb)->s_itb_per_group};
+	ext4_fsblk_t start;
+	ext4_fsblk_t block;
+	struct ext4_sb_info *sbi = EXT4_SB(sb);
+	struct ext4_super_block *es = sbi->s_es;
+	struct ext4_new_group_data *group_data = flex_gd->groups;
+	__u16 *bg_flags = flex_gd->bg_flags;
+	handle_t *handle;
+	ext4_group_t group, count;
+	struct buffer_head *bh = NULL;
+	int reserved_gdb, i, j, err = 0, err2;
+
+	BUG_ON(!flex_gd->count || !group_data ||
+	       group_data[0].group != sbi->s_groups_count);
+
+	reserved_gdb = le16_to_cpu(es->s_reserved_gdt_blocks);
+
+	/* This transaction may be extended/restarted along the way */
+	handle = ext4_journal_start_sb(sb, EXT4_MAX_TRANS_DATA);
+	if (IS_ERR(handle))
+		return PTR_ERR(handle);
+
+	group = group_data[0].group;
+	for (i = 0; i < flex_gd->count; i++, group++) {
+		unsigned long gdblocks;
+
+		gdblocks = ext4_bg_num_gdb(sb, group);
+		start = ext4_group_first_block_no(sb, group);
+
+		/* Copy all of the GDT blocks into the backup in this group */
+		for (j = 0, block = start + 1; j < gdblocks; j++, block++) {
+			struct buffer_head *gdb;
+
+			ext4_debug("update backup group %#04llx\n", block);
+			err = extend_or_restart_transaction(handle, 1);
+			if (err)
+				goto out;
+
+			gdb = sb_getblk(sb, block);
+			if (!gdb) {
+				err = -EIO;
+				goto out;
+			}
+
+			err = ext4_journal_get_write_access(handle, gdb);
+			if (err) {
+				brelse(gdb);
+				goto out;
+			}
+			memcpy(gdb->b_data, sbi->s_group_desc[j]->b_data,
+			       gdb->b_size);
+			set_buffer_uptodate(gdb);
+
+			err = ext4_handle_dirty_metadata(handle, NULL, gdb);
+			if (unlikely(err)) {
+				brelse(gdb);
+				goto out;
+			}
+			brelse(gdb);
+		}
+
+		/* Zero out all of the reserved backup group descriptor
+		 * table blocks
+		 */
+		if (ext4_bg_has_super(sb, group)) {
+			err = sb_issue_zeroout(sb, gdblocks + start + 1,
+					reserved_gdb, GFP_NOFS);
+			if (err)
+				goto out;
+		}
+
+		/* Initialize group tables of the group @group */
+		if (!(bg_flags[i] & EXT4_BG_INODE_ZEROED))
+			goto handle_bb;
+
+		/* Zero out all of the inode table blocks */
+		block = group_data[i].inode_table;
+		ext4_debug("clear inode table blocks %#04llx -> %#04lx\n",
+			   block, sbi->s_itb_per_group);
+		err = sb_issue_zeroout(sb, block, sbi->s_itb_per_group,
+				       GFP_NOFS);
+		if (err)
+			goto out;
+
+handle_bb:
+		if (bg_flags[i] & EXT4_BG_BLOCK_UNINIT)
+			goto handle_ib;
+
+		/* Initialize block bitmap of the @group */
+		block = group_data[i].block_bitmap;
+		err = extend_or_restart_transaction(handle, 1);
+		if (err)
+			goto out;
+
+		bh = bclean(handle, sb, block);
+		if (IS_ERR(bh)) {
+			err = PTR_ERR(bh);
+			/* never hand an ERR_PTR to brelse() at "out" */
+			bh = NULL;
+			goto out;
+		}
+		if (ext4_bg_has_super(sb, group)) {
+			ext4_debug("mark backup superblock %#04llx (+0)\n",
+				   start);
+			ext4_set_bits(bh->b_data, 0, gdblocks + reserved_gdb +
+						     1);
+		}
+		ext4_mark_bitmap_end(group_data[i].blocks_count,
+				     sb->s_blocksize * 8, bh->b_data);
+		err = ext4_handle_dirty_metadata(handle, NULL, bh);
+		if (err)
+			goto out;
+		brelse(bh);
+		/* reference dropped: a later "goto out" must not drop it again */
+		bh = NULL;
+
+handle_ib:
+		if (bg_flags[i] & EXT4_BG_INODE_UNINIT)
+			continue;
+
+		/* Initialize inode bitmap of the @group */
+		block = group_data[i].inode_bitmap;
+		err = extend_or_restart_transaction(handle, 1);
+		if (err)
+			goto out;
+		/* Mark unused entries in inode bitmap used */
+		bh = bclean(handle, sb, block);
+		if (IS_ERR(bh)) {
+			err = PTR_ERR(bh);
+			/* never hand an ERR_PTR to brelse() at "out" */
+			bh = NULL;
+			goto out;
+		}
+
+		ext4_mark_bitmap_end(EXT4_INODES_PER_GROUP(sb),
+				     sb->s_blocksize * 8, bh->b_data);
+		err = ext4_handle_dirty_metadata(handle, NULL, bh);
+		if (err)
+			goto out;
+		brelse(bh);
+		/* reference dropped: a later "goto out" must not drop it again */
+		bh = NULL;
+	}
+
+	/*
+	 * Mark group tables in block bitmap.
+	 *
+	 * j selects the table kind (block bitmap, inode bitmap, inode
+	 * table) by indexing from the block_bitmap field of each entry.
+	 * Physically consecutive tables of the same kind are merged into one
+	 * [start, start + count) run before being marked.
+	 */
+	for (j = 0; j < GROUP_TABLE_COUNT; j++) {
+		count = group_table_count[j];
+		start = (&group_data[0].block_bitmap)[j];
+		block = start;
+		for (i = 1; i < flex_gd->count; i++) {
+			block += group_table_count[j];
+			if (block == (&group_data[i].block_bitmap)[j]) {
+				count += group_table_count[j];
+				continue;
+			}
+			err = set_flexbg_block_bitmap(sb, handle,
+						flex_gd, start, count);
+			if (err)
+				goto out;
+			count = group_table_count[j];
+			/*
+			 * Restart the run at this group's table of kind j,
+			 * not unconditionally at its block bitmap.
+			 */
+			start = (&group_data[i].block_bitmap)[j];
+			block = start;
+		}
+
+		if (count) {
+			err = set_flexbg_block_bitmap(sb, handle,
+						flex_gd, start, count);
+			if (err)
+				goto out;
+		}
+	}
+
+out:
+	brelse(bh);
+	err2 = ext4_journal_stop(handle);
+	if (err2 && !err)
+		err = err2;
+
+	return err;
+}
+
+/*
+ * Iterate through the groups which hold BACKUP superblock/GDT copies in an
+ * ext4 filesystem.  The counters should be initialized to 1, 5, and 7 before
+ * calling this for the first time.  In a sparse filesystem it will be the
+ * sequence of powers of 3, 5, and 7: 1, 3, 5, 7, 9, 25, 27, 49, 81, ...
+ * For a non-sparse filesystem it will be every group: 1, 2, 3, 4, ...
+ *
+ * Each call returns the next group number in the sequence and advances
+ * the counter that produced it.
+ */
+static unsigned ext4_list_backups(struct super_block *sb, unsigned *three,
+				  unsigned *five, unsigned *seven)
+{
+	unsigned *smallest = three;
+	unsigned factor = 3;
+	unsigned grp;
+
+	/* Without sparse_super every group has a backup: plain counting */
+	if (!EXT4_HAS_RO_COMPAT_FEATURE(sb,
+					EXT4_FEATURE_RO_COMPAT_SPARSE_SUPER)) {
+		grp = *three;
+		*three = grp + 1;
+		return grp;
+	}
+
+	/* Pick whichever power sequence currently holds the lowest value */
+	if (*five < *smallest) {
+		smallest = five;
+		factor = 5;
+	}
+	if (*seven < *smallest) {
+		smallest = seven;
+		factor = 7;
+	}
+
+	grp = *smallest;
+	*smallest = grp * factor;
+
+	return grp;
+}
+
+/*
+ * Check that all of the backup GDT blocks are held in the primary GDT block.
+ * It is assumed that they are stored in group order. Returns the number of
+ * groups in current filesystem that have BACKUPS, or -ve error code.
+ *
+ * @sb: super block
+ * @end: only backup groups numbered below @end are checked
+ * @primary: buffer whose data is read as an array of little-endian
+ * 32-bit block numbers, one per backup group
+ *
+ * Returns -EINVAL when an entry does not match the expected block number
+ * and -EFBIG when more than EXT4_ADDR_PER_BLOCK(sb) entries are counted.
+ */
+static int verify_reserved_gdb(struct super_block *sb,
+ ext4_group_t end,
+ struct buffer_head *primary)
+{
+ const ext4_fsblk_t blk = primary->b_blocknr;
+ /* Iterator state for ext4_list_backups(); must start at 1, 5, 7 */
+ unsigned three = 1;
+ unsigned five = 5;
+ unsigned seven = 7;
+ unsigned grp;
+ __le32 *p = (__le32 *)primary->b_data;
+ int gdbackups = 0;
+
+ while ((grp = ext4_list_backups(sb, &three, &five, &seven)) < end) {
+ /* Expected entry: this block's position inside backup group grp */
+ if (le32_to_cpu(*p++) !=
+ grp * EXT4_BLOCKS_PER_GROUP(sb) + blk){
+ ext4_warning(sb, "reserved GDT %llu"
+ " missing grp %d (%llu)",
+ blk, grp,
+ grp *
+ (ext4_fsblk_t)EXT4_BLOCKS_PER_GROUP(sb) +
+ blk);
+ return -EINVAL;
+ }
+ if (++gdbackups > EXT4_ADDR_PER_BLOCK(sb))
+ return -EFBIG;
+ }
+
+ return gdbackups;
+}
+
+/*
+ * Called when we need to bring a reserved group descriptor table block into
+ * use from the resize inode.  The primary copy of the new GDT block currently
+ * is an indirect block (under the double indirect block in the resize inode).
+ * The new backup GDT blocks will be stored as leaf blocks in this indirect
+ * block, in group order.  Even though we know all the block numbers we need,
+ * we check to ensure that the resize inode has actually reserved these blocks.
+ *
+ * Don't need to update the block bitmaps because the blocks are still in use.
+ *
+ * We get all of the error cases out of the way, so that we are sure to not
+ * fail once we start modifying the data on disk, because JBD has no rollback.
+ *
+ * @handle: journal handle the modifications are made under
+ * @inode:  the resize inode
+ * @group:  the group whose descriptor needs the new GDT block
+ *
+ * Returns 0 on success or a negative errno.  On success the buffer of the
+ * new GDT block stays referenced from sbi->s_group_desc[]; on the error
+ * paths the buffer references taken here are dropped again.
+ */
+static int add_new_gdb(handle_t *handle, struct inode *inode,
+		       ext4_group_t group)
+{
+	struct super_block *sb = inode->i_sb;
+	struct ext4_super_block *es = EXT4_SB(sb)->s_es;
+	unsigned long gdb_num = group / EXT4_DESC_PER_BLOCK(sb);
+	ext4_fsblk_t gdblock = EXT4_SB(sb)->s_sbh->b_blocknr + 1 + gdb_num;
+	struct buffer_head **o_group_desc, **n_group_desc;
+	struct buffer_head *dind;
+	struct buffer_head *gdb_bh;
+	int gdbackups;
+	struct ext4_iloc iloc;
+	__le32 *data;
+	int err;
+
+	if (test_opt(sb, DEBUG))
+		printk(KERN_DEBUG
+		       "EXT4-fs: ext4_add_new_gdb: adding group block %lu\n",
+		       gdb_num);
+
+	/*
+	 * If we are not using the primary superblock/GDT copy don't resize,
+	 * because the user tools have no way of handling this.  Probably a
+	 * bad time to do it anyways.
+	 */
+	if (EXT4_SB(sb)->s_sbh->b_blocknr !=
+	    le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block)) {
+		ext4_warning(sb, "won't resize using backup superblock at %llu",
+			(unsigned long long)EXT4_SB(sb)->s_sbh->b_blocknr);
+		return -EPERM;
+	}
+
+	gdb_bh = sb_bread(sb, gdblock);
+	if (!gdb_bh)
+		return -EIO;
+
+	/* The reserved block must list the backups of all groups < @group. */
+	gdbackups = verify_reserved_gdb(sb, group, gdb_bh);
+	if (gdbackups < 0) {
+		err = gdbackups;
+		goto exit_bh;
+	}
+
+	data = EXT4_I(inode)->i_data + EXT4_DIND_BLOCK;
+	dind = sb_bread(sb, le32_to_cpu(*data));
+	if (!dind) {
+		err = -EIO;
+		goto exit_bh;
+	}
+
+	/* The double-indirect block must point at the block we expect. */
+	data = (__le32 *)dind->b_data;
+	if (le32_to_cpu(data[gdb_num % EXT4_ADDR_PER_BLOCK(sb)]) != gdblock) {
+		ext4_warning(sb, "new group %u GDT block %llu not reserved",
+			     group, gdblock);
+		err = -EINVAL;
+		goto exit_dind;
+	}
+
+	err = ext4_journal_get_write_access(handle, EXT4_SB(sb)->s_sbh);
+	if (unlikely(err))
+		goto exit_dind;
+
+	err = ext4_journal_get_write_access(handle, gdb_bh);
+	if (unlikely(err))
+		goto exit_sbh;
+
+	err = ext4_journal_get_write_access(handle, dind);
+	if (unlikely(err)) {
+		/*
+		 * Bail out instead of only logging: otherwise err is
+		 * overwritten below and dind is modified without write
+		 * access.
+		 */
+		ext4_std_error(sb, err);
+		goto exit_sbh;
+	}
+
+	/* ext4_reserve_inode_write() gets a reference on the iloc */
+	err = ext4_reserve_inode_write(handle, inode, &iloc);
+	if (unlikely(err))
+		goto exit_dindj;
+
+	n_group_desc = ext4_kvmalloc((gdb_num + 1) *
+				     sizeof(struct buffer_head *),
+				     GFP_NOFS);
+	if (!n_group_desc) {
+		err = -ENOMEM;
+		ext4_warning(sb, "not enough memory for %lu groups",
+			     gdb_num + 1);
+		goto exit_inode;
+	}
+
+	/*
+	 * Finally, we have all of the possible failures behind us...
+	 *
+	 * Remove new GDT block from inode double-indirect block and clear out
+	 * the new GDT block for use (which also "frees" the backup GDT blocks
+	 * from the reserved inode).  We don't need to change the bitmaps for
+	 * these blocks, because they are marked as in-use from being in the
+	 * reserved inode, and will become GDT blocks (primary and backup).
+	 */
+	data[gdb_num % EXT4_ADDR_PER_BLOCK(sb)] = 0;
+	err = ext4_handle_dirty_metadata(handle, NULL, dind);
+	if (unlikely(err)) {
+		ext4_std_error(sb, err);
+		goto exit_inode;
+	}
+	inode->i_blocks -= (gdbackups + 1) * sb->s_blocksize >> 9;
+	ext4_mark_iloc_dirty(handle, inode, &iloc);
+	/*
+	 * The iloc buffer reference is consumed by ext4_mark_iloc_dirty()
+	 * (the success path below never releases it), so clear the pointer
+	 * to keep exit_inode from dropping the reference a second time.
+	 */
+	iloc.bh = NULL;
+	memset(gdb_bh->b_data, 0, sb->s_blocksize);
+	err = ext4_handle_dirty_metadata(handle, NULL, gdb_bh);
+	if (unlikely(err)) {
+		ext4_std_error(sb, err);
+		goto exit_inode;
+	}
+	brelse(dind);
+
+	/* Switch to a group descriptor array with room for the new block. */
+	o_group_desc = EXT4_SB(sb)->s_group_desc;
+	memcpy(n_group_desc, o_group_desc,
+	       EXT4_SB(sb)->s_gdb_count * sizeof(struct buffer_head *));
+	n_group_desc[gdb_num] = gdb_bh;
+	EXT4_SB(sb)->s_group_desc = n_group_desc;
+	EXT4_SB(sb)->s_gdb_count++;
+	ext4_kvfree(o_group_desc);
+
+	/* One reserved GDT block fewer is left for future growth. */
+	le16_add_cpu(&es->s_reserved_gdt_blocks, -1);
+	err = ext4_handle_dirty_metadata(handle, NULL, EXT4_SB(sb)->s_sbh);
+	if (err)
+		ext4_std_error(sb, err);
+
+	return err;
+
+exit_inode:
+	ext4_kvfree(n_group_desc);
+	/* ext4_handle_release_buffer(handle, iloc.bh); */
+	brelse(iloc.bh);
+exit_dindj:
+	/* ext4_handle_release_buffer(handle, dind); */
+exit_sbh:
+	/* ext4_handle_release_buffer(handle, EXT4_SB(sb)->s_sbh); */
+exit_dind:
+	brelse(dind);
+exit_bh:
+	brelse(gdb_bh);
+
+	ext4_debug("leaving with error %d\n", err);
+	return err;
+}
+
+/*
+ * Called when we are adding a new group which has a backup copy of each of
+ * the GDT blocks (i.e. sparse group) and there are reserved GDT blocks.
+ * We need to add these reserved backup GDT blocks to the resize inode, so
+ * that they are kept for future resizing and not allocated to files.
+ *
+ * Each reserved backup GDT block will go into a different indirect block.
+ * The indirect blocks are actually the primary reserved GDT blocks,
+ * so we know in advance what their block numbers are.  We only get the
+ * double-indirect block to verify it is pointing to the primary reserved
+ * GDT blocks so we don't overwrite a data block by accident.  The reserved
+ * backup GDT blocks are stored in their reserved primary GDT block.
+ *
+ * @handle: journal handle
+ * @inode:  the resize inode
+ * @group:  the group being added
+ *
+ * Returns 0 on success or a negative errno.
+ */
+static int reserve_backup_gdb(handle_t *handle, struct inode *inode,
+			      ext4_group_t group)
+{
+	struct super_block *sb = inode->i_sb;
+	struct ext4_sb_info *sbi = EXT4_SB(sb);
+	int reserved_gdb = le16_to_cpu(sbi->s_es->s_reserved_gdt_blocks);
+	struct buffer_head **primary;
+	struct buffer_head *dind;
+	struct ext4_iloc iloc;
+	ext4_fsblk_t blk;
+	__le32 *dind_start, *dind_end, *slot;
+	int gdbackups = 0;
+	int res, i;
+	int err;
+
+	primary = kmalloc(reserved_gdb * sizeof(*primary), GFP_NOFS);
+	if (!primary)
+		return -ENOMEM;
+
+	dind = sb_bread(sb,
+			le32_to_cpu(EXT4_I(inode)->i_data[EXT4_DIND_BLOCK]));
+	if (!dind) {
+		err = -EIO;
+		goto exit_free;
+	}
+
+	/* First reserved primary GDT block and its slot in the dind block. */
+	blk = sbi->s_sbh->b_blocknr + 1 + sbi->s_gdb_count;
+	dind_start = (__le32 *)dind->b_data;
+	dind_end = dind_start + EXT4_ADDR_PER_BLOCK(sb);
+	slot = dind_start + (sbi->s_gdb_count % EXT4_ADDR_PER_BLOCK(sb));
+
+	/* Get each reserved primary GDT block and verify it holds backups */
+	for (res = 0; res < reserved_gdb; res++, blk++) {
+		if (le32_to_cpu(*slot) != blk) {
+			ext4_warning(sb, "reserved block %llu"
+				     " not at offset %ld",
+				     blk,
+				     (long)(slot - dind_start));
+			err = -EINVAL;
+			goto exit_bh;
+		}
+
+		primary[res] = sb_bread(sb, blk);
+		if (!primary[res]) {
+			err = -EIO;
+			goto exit_bh;
+		}
+
+		gdbackups = verify_reserved_gdb(sb, group, primary[res]);
+		if (gdbackups < 0) {
+			/* exit_bh only releases the entries before this one */
+			brelse(primary[res]);
+			err = gdbackups;
+			goto exit_bh;
+		}
+
+		/* Wrap around at the end of the double-indirect block. */
+		slot++;
+		if (slot >= dind_end)
+			slot = dind_start;
+	}
+
+	for (i = 0; i < reserved_gdb; i++) {
+		err = ext4_journal_get_write_access(handle, primary[i]);
+		if (err)
+			goto exit_bh;
+	}
+
+	err = ext4_reserve_inode_write(handle, inode, &iloc);
+	if (err)
+		goto exit_bh;
+
+	/*
+	 * Finally we can add each of the reserved backup GDT blocks from
+	 * the new group to its reserved primary GDT block.
+	 */
+	blk = group * EXT4_BLOCKS_PER_GROUP(sb);
+	for (i = 0; i < reserved_gdb; i++) {
+		__le32 *backups = (__le32 *)primary[i]->b_data;
+		int err2;
+
+		backups[gdbackups] = cpu_to_le32(blk + primary[i]->b_blocknr);
+		err2 = ext4_handle_dirty_metadata(handle, NULL, primary[i]);
+		/* Remember the first failure but keep going. */
+		if (!err)
+			err = err2;
+	}
+	inode->i_blocks += reserved_gdb * sb->s_blocksize >> 9;
+	ext4_mark_iloc_dirty(handle, inode, &iloc);
+
+exit_bh:
+	/* Release primary[0 .. res-1]. */
+	for (res--; res >= 0; res--)
+		brelse(primary[res]);
+	brelse(dind);
+
+exit_free:
+	kfree(primary);
+
+	return err;
+}
+
+/*
+ * Update the backup copies of the ext4 metadata.  These don't need to be part
+ * of the main resize transaction, because e2fsck will re-write them if there
+ * is a problem (basically only OOM will cause a problem).  However, we
+ * _should_ update the backups if possible, in case the primary gets trashed
+ * for some reason and we need to run e2fsck from a backup superblock.  The
+ * important part is that the new block and inode counts are in the backup
+ * superblocks, and the location of the new group metadata in the GDT backups.
+ *
+ * We do not need take the s_resize_lock for this, because these
+ * blocks are not otherwise touched by the filesystem code when it is
+ * mounted.  We don't need to worry about last changing from
+ * sbi->s_groups_count, because the worst that can happen is that we
+ * do not copy the full number of backups at this time.  The resize
+ * which changed s_groups_count will backup again.
+ *
+ * @sb:      super block
+ * @blk_off: block offset of the metadata block within each backup group
+ * @data:    contents to write into every backup copy
+ * @size:    number of bytes of @data; the rest of each block is zeroed
+ *
+ * Errors are not returned: on failure the filesystem is marked as needing
+ * fsck instead.
+ */
+static void update_backups(struct super_block *sb,
+			   int blk_off, char *data, int size)
+{
+	struct ext4_sb_info *sbi = EXT4_SB(sb);
+	const ext4_group_t last = sbi->s_groups_count;
+	const int bpg = EXT4_BLOCKS_PER_GROUP(sb);
+	unsigned three = 1;
+	unsigned five = 5;
+	unsigned seven = 7;
+	ext4_group_t group;
+	int rest = sb->s_blocksize - size;
+	handle_t *handle;
+	int err = 0, err2;
+
+	handle = ext4_journal_start_sb(sb, EXT4_MAX_TRANS_DATA);
+	if (IS_ERR(handle)) {
+		group = 1;
+		err = PTR_ERR(handle);
+		goto exit_err;
+	}
+
+	while ((group = ext4_list_backups(sb, &three, &five, &seven)) < last) {
+		struct buffer_head *bh;
+		ext4_fsblk_t backup_block;
+
+		/* Out of journal space, and can't get more - abort - so sad */
+		if (ext4_handle_valid(handle) &&
+		    handle->h_buffer_credits == 0 &&
+		    ext4_journal_extend(handle, EXT4_MAX_TRANS_DATA) &&
+		    (err = ext4_journal_restart(handle, EXT4_MAX_TRANS_DATA)))
+			break;
+
+		/*
+		 * Widen before multiplying: group * bpg does not fit in 32
+		 * bits on large filesystems.
+		 */
+		backup_block = (ext4_fsblk_t)group * bpg + blk_off;
+		bh = sb_getblk(sb, backup_block);
+		if (!bh) {
+			err = -EIO;
+			break;
+		}
+		ext4_debug("update metadata backup %#04lx\n",
+			  (unsigned long)bh->b_blocknr);
+		err = ext4_journal_get_write_access(handle, bh);
+		if (err) {
+			/* Drop the reference taken by sb_getblk(). */
+			brelse(bh);
+			break;
+		}
+		lock_buffer(bh);
+		memcpy(bh->b_data, data, size);
+		if (rest)
+			memset(bh->b_data + size, 0, rest);
+		set_buffer_uptodate(bh);
+		unlock_buffer(bh);
+		err = ext4_handle_dirty_metadata(handle, NULL, bh);
+		if (unlikely(err))
+			ext4_std_error(sb, err);
+		brelse(bh);
+	}
+	if ((err2 = ext4_journal_stop(handle)) && !err)
+		err = err2;
+
+	/*
+	 * Ugh! Need to have e2fsck write the backup copies.  It is too
+	 * late to revert the resize, we shouldn't fail just because of
+	 * the backup copies (they are only needed in case of corruption).
+	 *
+	 * However, if we got here we have a journal problem too, so we
+	 * can't really start a transaction to mark the superblock.
+	 * Chicken out and just set the flag on the hope it will be written
+	 * to disk, and if not - we will simply wait until next fsck.
+	 */
+exit_err:
+	if (err) {
+		ext4_warning(sb, "can't update backup for group %u (err %d), "
+			     "forcing fsck on next reboot", group, err);
+		sbi->s_mount_state &= ~EXT4_VALID_FS;
+		sbi->s_es->s_state &= cpu_to_le16(~EXT4_VALID_FS);
+		mark_buffer_dirty(sbi->s_sbh);
+	}
+}
+
+/*
+ * ext4_add_new_descs() prepares the group descriptor blocks for @count new
+ * groups starting at @group.
+ *
+ * @handle: journal handle
+ * @sb: super block
+ * @group: the group no. of the first group desc to be added
+ * @resize_inode: the resize inode
+ * @count: number of group descriptors to be added
+ *
+ * Returns 0 on success, or the first error encountered; processing stops at
+ * the first failing group.
+ */
+static int ext4_add_new_descs(handle_t *handle, struct super_block *sb,
+			      ext4_group_t group, struct inode *resize_inode,
+			      ext4_group_t count)
+{
+	struct ext4_sb_info *sbi = EXT4_SB(sb);
+	struct ext4_super_block *es = sbi->s_es;
+	int i, err = 0;
+
+	for (i = 0; i < count; i++, group++) {
+		int reserved_gdb = 0;
+		int gdb_off, gdb_num;
+
+		if (ext4_bg_has_super(sb, group))
+			reserved_gdb = le16_to_cpu(es->s_reserved_gdt_blocks);
+
+		gdb_off = group % EXT4_DESC_PER_BLOCK(sb);
+		gdb_num = group / EXT4_DESC_PER_BLOCK(sb);
+
+		/*
+		 * We will only either add reserved group blocks to a backup
+		 * group or remove reserved blocks for the first group in a
+		 * new group block.  Doing both would mean more complex code,
+		 * and sane people don't use non-sparse filesystems anymore.
+		 */
+		if (!gdb_off) {
+			/* First descriptor of a GDT block: bring it into use. */
+			err = add_new_gdb(handle, resize_inode, group);
+		} else {
+			struct buffer_head *gdb_bh;
+
+			gdb_bh = sbi->s_group_desc[gdb_num];
+			err = ext4_journal_get_write_access(handle, gdb_bh);
+
+			if (!err && reserved_gdb &&
+			    ext4_bg_num_gdb(sb, group))
+				err = reserve_backup_gdb(handle, resize_inode,
+							 group);
+		}
+
+		if (err)
+			break;
+	}
+
+	return err;
+}
+
+/*
+ * ext4_setup_new_descs() will set up the group descriptors of a flex bg
+ *
+ * @handle: journal handle
+ * @sb: super block
+ * @flex_gd: the groups being added, with their table locations and bg flags
+ *
+ * Returns 0 on success, or the first error reported by
+ * ext4_handle_dirty_metadata() or ext4_mb_add_groupinfo().
+ */
+static int ext4_setup_new_descs(handle_t *handle, struct super_block *sb,
+				struct ext4_new_flex_group_data *flex_gd)
+{
+	struct ext4_new_group_data *groups = flex_gd->groups;
+	__u16 *flags = flex_gd->bg_flags;
+	struct ext4_sb_info *sbi = EXT4_SB(sb);
+	int i, err = 0;
+
+	for (i = 0; i < flex_gd->count; i++) {
+		struct ext4_new_group_data *gd = &groups[i];
+		ext4_group_t group = gd->group;
+		int gdb_off = group % EXT4_DESC_PER_BLOCK(sb);
+		int gdb_num = group / EXT4_DESC_PER_BLOCK(sb);
+		struct buffer_head *gdb_bh;
+		struct ext4_group_desc *gdp;
+
+		/*
+		 * get_write_access() has been called on gdb_bh by
+		 * ext4_add_new_descs().
+		 */
+		gdb_bh = sbi->s_group_desc[gdb_num];
+
+		/* Locate the descriptor slot of the new group in its block */
+		gdp = (struct ext4_group_desc *)((char *)gdb_bh->b_data +
+						 gdb_off * EXT4_DESC_SIZE(sb));
+
+		/* Start from a zeroed descriptor and fill in the new group */
+		memset(gdp, 0, EXT4_DESC_SIZE(sb));
+		ext4_block_bitmap_set(sb, gdp, gd->block_bitmap);
+		ext4_inode_bitmap_set(sb, gdp, gd->inode_bitmap);
+		ext4_inode_table_set(sb, gdp, gd->inode_table);
+		ext4_free_group_clusters_set(sb, gdp,
+				EXT4_B2C(sbi, gd->free_blocks_count));
+		ext4_free_inodes_set(sb, gdp, EXT4_INODES_PER_GROUP(sb));
+		gdp->bg_flags = cpu_to_le16(flags[i]);
+		/* The checksum is computed last, over the filled-in fields */
+		gdp->bg_checksum = ext4_group_desc_csum(sbi, group, gdp);
+
+		err = ext4_handle_dirty_metadata(handle, NULL, gdb_bh);
+		if (unlikely(err)) {
+			ext4_std_error(sb, err);
+			break;
+		}
+
+		/*
+		 * We can allocate memory for mb_alloc based on the new group
+		 * descriptor
+		 */
+		err = ext4_mb_add_groupinfo(sb, group, gdp);
+		if (err)
+			break;
+	}
+
+	return err;
+}
+
+/*
+ * ext4_update_super() updates the super block so that the newly added
+ * groups can be seen by the filesystem.
+ *
+ * @sb: super block
+ * @flex_gd: new added groups
+ *
+ * Besides the on-disk superblock counters this also updates
+ * sbi->s_groups_count, the per-cpu free cluster/inode counters and, when
+ * flex_bg is in use, the counters of the flex group of the first new group.
+ */
+static void ext4_update_super(struct super_block *sb,
+			      struct ext4_new_flex_group_data *flex_gd)
+{
+	ext4_fsblk_t blocks_count = 0;
+	ext4_fsblk_t free_blocks = 0;
+	ext4_fsblk_t reserved_blocks = 0;
+	struct ext4_new_group_data *group_data = flex_gd->groups;
+	struct ext4_sb_info *sbi = EXT4_SB(sb);
+	struct ext4_super_block *es = sbi->s_es;
+	int i;
+
+	BUG_ON(flex_gd->count == 0 || group_data == NULL);
+	/*
+	 * Make the new blocks and inodes valid next.  We do this before
+	 * increasing the group count so that once the group is enabled,
+	 * all of its blocks and inodes are already valid.
+	 *
+	 * We always allocate group-by-group, then block-by-block or
+	 * inode-by-inode within a group, so enabling these
+	 * blocks/inodes before the group is live won't actually let us
+	 * allocate the new space yet.
+	 */
+	for (i = 0; i < flex_gd->count; i++) {
+		blocks_count += group_data[i].blocks_count;
+		free_blocks += group_data[i].free_blocks_count;
+	}
+
+	/*
+	 * Reserve the same percentage of the new blocks as is reserved of
+	 * the existing ones.  The divisor is a 64-bit block count, so it
+	 * must not go through do_div(), which only takes a 32-bit divisor.
+	 */
+	reserved_blocks = ext4_r_blocks_count(es) * 100;
+	reserved_blocks = div64_u64(reserved_blocks, ext4_blocks_count(es));
+	reserved_blocks *= blocks_count;
+	do_div(reserved_blocks, 100);
+
+	ext4_blocks_count_set(es, ext4_blocks_count(es) + blocks_count);
+	ext4_free_blocks_count_set(es, ext4_free_blocks_count(es) + free_blocks);
+	le32_add_cpu(&es->s_inodes_count, EXT4_INODES_PER_GROUP(sb) *
+		     flex_gd->count);
+	le32_add_cpu(&es->s_free_inodes_count, EXT4_INODES_PER_GROUP(sb) *
+		     flex_gd->count);
+
+	/*
+	 * We need to protect s_groups_count against other CPUs seeing
+	 * inconsistent state in the superblock.
+	 *
+	 * The precise rules we use are:
+	 *
+	 * * Writers must perform a smp_wmb() after updating all
+	 *   dependent data and before modifying the groups count
+	 *
+	 * * Readers must perform an smp_rmb() after reading the groups
+	 *   count and before reading any dependent data.
+	 *
+	 * NB. These rules can be relaxed when checking the group count
+	 * while freeing data, as we can only allocate from a block
+	 * group after serialising against the group count, and we can
+	 * only then free after serialising in turn against that
+	 * allocation.
+	 */
+	smp_wmb();
+
+	/* Update the global fs size fields */
+	sbi->s_groups_count += flex_gd->count;
+
+	/* Update the reserved block counts only once the new group is
+	 * active. */
+	ext4_r_blocks_count_set(es, ext4_r_blocks_count(es) +
+				reserved_blocks);
+
+	/* Update the free space counts */
+	percpu_counter_add(&sbi->s_freeclusters_counter,
+			   EXT4_B2C(sbi, free_blocks));
+	percpu_counter_add(&sbi->s_freeinodes_counter,
+			   EXT4_INODES_PER_GROUP(sb) * flex_gd->count);
+
+	if (EXT4_HAS_INCOMPAT_FEATURE(sb,
+				      EXT4_FEATURE_INCOMPAT_FLEX_BG) &&
+	    sbi->s_log_groups_per_flex) {
+		ext4_group_t flex_group;
+		flex_group = ext4_flex_group(sbi, group_data[0].group);
+		atomic_add(EXT4_B2C(sbi, free_blocks),
+			   &sbi->s_flex_groups[flex_group].free_clusters);
+		atomic_add(EXT4_INODES_PER_GROUP(sb) * flex_gd->count,
+			   &sbi->s_flex_groups[flex_group].free_inodes);
+	}
+
+	if (test_opt(sb, DEBUG))
+		printk(KERN_DEBUG "EXT4-fs: added group %u:"
+		       "%llu blocks(%llu free %llu reserved)\n", flex_gd->count,
+		       blocks_count, free_blocks, reserved_blocks);
+}
+
+/* Add a flex group to an fs. Ensure we handle all possible error conditions
+ * _before_ we start modifying the filesystem, because we cannot abort the
+ * transaction and not have it write the data to disk.
+ *
+ * @sb: super block
+ * @resize_inode: the resize inode, handed on to ext4_add_new_descs()
+ * @flex_gd: the groups to add; its first group must be the current
+ *	s_groups_count
+ *
+ * Returns 0 on success or a negative errno.  On success the backup copies
+ * of the superblock and of every GDT block touched are refreshed as well.
+ */
+static int ext4_flex_group_add(struct super_block *sb,
+			       struct inode *resize_inode,
+			       struct ext4_new_flex_group_data *flex_gd)
+{
+	struct ext4_sb_info *sbi = EXT4_SB(sb);
+	struct ext4_super_block *es = sbi->s_es;
+	ext4_fsblk_t o_blocks_count;
+	ext4_grpblk_t last;
+	ext4_group_t group;
+	handle_t *handle;
+	unsigned reserved_gdb;
+	int err = 0, err2 = 0, credit;
+
+	BUG_ON(!flex_gd->count || !flex_gd->groups || !flex_gd->bg_flags);
+
+	reserved_gdb = le16_to_cpu(es->s_reserved_gdt_blocks);
+	o_blocks_count = ext4_blocks_count(es);
+	ext4_get_group_no_and_offset(sb, o_blocks_count, &group, &last);
+	/* The filesystem must currently end on a group boundary. */
+	BUG_ON(last);
+
+	err = setup_new_flex_group_blocks(sb, flex_gd);
+	if (err)
+		goto exit;
+	/*
+	 * We will always be modifying at least the superblock and  GDT
+	 * block.  If we are adding a group past the last current GDT block,
+	 * we will also modify the inode and the dindirect block.  If we
+	 * are adding a group with superblock/GDT backups  we will also
+	 * modify each of the reserved GDT dindirect blocks.
+	 */
+	credit = flex_gd->count * 4 + reserved_gdb;
+	handle = ext4_journal_start_sb(sb, credit);
+	if (IS_ERR(handle)) {
+		err = PTR_ERR(handle);
+		goto exit;
+	}
+
+	err = ext4_journal_get_write_access(handle, sbi->s_sbh);
+	if (err)
+		goto exit_journal;
+
+	group = flex_gd->groups[0].group;
+	BUG_ON(group != EXT4_SB(sb)->s_groups_count);
+	err = ext4_add_new_descs(handle, sb, group,
+				resize_inode, flex_gd->count);
+	if (err)
+		goto exit_journal;
+
+	err = ext4_setup_new_descs(handle, sb, flex_gd);
+	if (err)
+		goto exit_journal;
+
+	ext4_update_super(sb, flex_gd);
+
+	err = ext4_handle_dirty_super(handle, sb);
+
+exit_journal:
+	err2 = ext4_journal_stop(handle);
+	if (!err)
+		err = err2;
+
+	if (!err) {
+		/*
+		 * Index into s_group_desc[] is the GDT block number, i.e.
+		 * group / descriptors-per-block (not blocks-per-group).
+		 * Back up each GDT block covering the new groups once.
+		 */
+		int gdb_num = group / EXT4_DESC_PER_BLOCK(sb);
+		int gdb_num_end = (group + flex_gd->count - 1) /
+				  EXT4_DESC_PER_BLOCK(sb);
+
+		update_backups(sb, sbi->s_sbh->b_blocknr, (char *)es,
+			       sizeof(struct ext4_super_block));
+		for (; gdb_num <= gdb_num_end; gdb_num++) {
+			struct buffer_head *gdb_bh;
+
+			gdb_bh = sbi->s_group_desc[gdb_num];
+			update_backups(sb, gdb_bh->b_blocknr, gdb_bh->b_data,
+				       gdb_bh->b_size);
+		}
+	}
+exit:
+	return err;
+}
+
+/*
+ * ext4_setup_next_flex_gd() describes the next batch of groups to add while
+ * growing the filesystem to @n_blocks_count blocks.
+ *
+ * The batch starts at the group beginning at the current end of the
+ * filesystem and runs up to group | (flexbg_size - 1), capped at the group
+ * that holds block @n_blocks_count - 1.  flex_gd->count, flex_gd->groups[]
+ * and flex_gd->bg_flags[] are filled in for it.
+ *
+ * Returns 0 if the filesystem already has @n_blocks_count blocks, 1 if a
+ * batch was set up.
+ */
+static int ext4_setup_next_flex_gd(struct super_block *sb,
+				   struct ext4_new_flex_group_data *flex_gd,
+				   ext4_fsblk_t n_blocks_count,
+				   unsigned long flexbg_size)
+{
+	struct ext4_super_block *es = EXT4_SB(sb)->s_es;
+	struct ext4_new_group_data *group_data = flex_gd->groups;
+	ext4_grpblk_t blocks_per_group = EXT4_BLOCKS_PER_GROUP(sb);
+	ext4_fsblk_t o_blocks_count = ext4_blocks_count(es);
+	ext4_group_t first_group, n_group, last_group;
+	ext4_grpblk_t last;
+	unsigned long i;
+
+	if (o_blocks_count == n_blocks_count)
+		return 0;
+
+	ext4_get_group_no_and_offset(sb, o_blocks_count, &first_group, &last);
+	/* The current end of the fs must sit on a group boundary. */
+	BUG_ON(last);
+	/* From here on "last" is the offset of the final block in n_group. */
+	ext4_get_group_no_and_offset(sb, n_blocks_count - 1, &n_group, &last);
+
+	last_group = first_group | (flexbg_size - 1);
+	if (last_group > n_group)
+		last_group = n_group;
+
+	flex_gd->count = last_group - first_group + 1;
+
+	for (i = 0; i < flex_gd->count; i++) {
+		int overhead = 0;
+
+		/* Groups with a superblock also carry GDT + reserved GDT. */
+		if (ext4_bg_has_super(sb, first_group + i))
+			overhead = 1 + ext4_bg_num_gdb(sb, first_group + i) +
+				   le16_to_cpu(es->s_reserved_gdt_blocks);
+
+		group_data[i].group = first_group + i;
+		group_data[i].blocks_count = blocks_per_group;
+		group_data[i].free_blocks_count = blocks_per_group - overhead;
+
+		if (EXT4_HAS_RO_COMPAT_FEATURE(sb,
+					       EXT4_FEATURE_RO_COMPAT_GDT_CSUM))
+			flex_gd->bg_flags[i] = EXT4_BG_BLOCK_UNINIT |
+					       EXT4_BG_INODE_UNINIT;
+		else
+			flex_gd->bg_flags[i] = EXT4_BG_INODE_ZEROED;
+	}
+
+	/* After the loop i == flex_gd->count, so i - 1 is the last entry. */
+	if (last_group == n_group) {
+		/* We need to initialize block bitmap of last group. */
+		if (EXT4_HAS_RO_COMPAT_FEATURE(sb,
+					       EXT4_FEATURE_RO_COMPAT_GDT_CSUM))
+			flex_gd->bg_flags[i - 1] &= ~EXT4_BG_BLOCK_UNINIT;
+
+		/* The final group of the fs may be only partially used. */
+		if (last != blocks_per_group - 1) {
+			group_data[i - 1].blocks_count = last + 1;
+			group_data[i - 1].free_blocks_count -=
+				blocks_per_group - last - 1;
+		}
+	}
+
+	return 1;
+}
+
+/* Add group descriptor data to an existing or new group descriptor block.
+ * Ensure we handle all possible error conditions _before_ we start modifying
+ * the filesystem, because we cannot abort the transaction and not have it
+ * write the data to disk.
+ *
+ * If we are on a GDT block boundary, we need to get the reserved GDT block.
+ * Otherwise, we may need to add backup GDT blocks for a sparse group.
+ *
+ * We only need to hold the superblock lock while we are actually adding
+ * in the new group's counts to the superblock.  Prior to that we have
+ * not really "added" the group at all.  We re-check that we are still
+ * adding in the last group in case things have changed since verifying.
+ *
+ * @sb: super block
+ * @input: description of the single group to add
+ *
+ * Returns 0 on success or a negative errno.
+ */
+int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input)
+{
+	struct ext4_new_flex_group_data flex_gd;
+	struct ext4_sb_info *sbi = EXT4_SB(sb);
+	struct ext4_super_block *es = sbi->s_es;
+	struct inode *inode = NULL;
+	__u16 bg_flags = 0;
+	int reserved_gdb = 0;
+	int gdb_off;
+	int err;
+
+	if (ext4_bg_has_super(sb, input->group))
+		reserved_gdb = le16_to_cpu(es->s_reserved_gdt_blocks);
+
+	/* Position of the new descriptor inside its GDT block. */
+	gdb_off = input->group % EXT4_DESC_PER_BLOCK(sb);
+
+	if (gdb_off == 0 && !EXT4_HAS_RO_COMPAT_FEATURE(sb,
+					EXT4_FEATURE_RO_COMPAT_SPARSE_SUPER)) {
+		ext4_warning(sb, "Can't resize non-sparse filesystem further");
+		return -EPERM;
+	}
+
+	/* Unsigned wrap-around checks on the new totals. */
+	if (ext4_blocks_count(es) + input->blocks_count <
+	    ext4_blocks_count(es)) {
+		ext4_warning(sb, "blocks_count overflow");
+		return -EINVAL;
+	}
+
+	if (le32_to_cpu(es->s_inodes_count) + EXT4_INODES_PER_GROUP(sb) <
+	    le32_to_cpu(es->s_inodes_count)) {
+		ext4_warning(sb, "inodes_count overflow");
+		return -EINVAL;
+	}
+
+	/* The resize inode is only needed when GDT blocks get touched. */
+	if (reserved_gdb || gdb_off == 0) {
+		if (!(EXT4_HAS_COMPAT_FEATURE(sb,
+					EXT4_FEATURE_COMPAT_RESIZE_INODE) &&
+		      le16_to_cpu(es->s_reserved_gdt_blocks))) {
+			ext4_warning(sb,
+				     "No reserved GDT blocks, can't resize");
+			return -EPERM;
+		}
+
+		inode = ext4_iget(sb, EXT4_RESIZE_INO);
+		if (IS_ERR(inode)) {
+			ext4_warning(sb, "Error opening resize inode");
+			return PTR_ERR(inode);
+		}
+	}
+
+	err = verify_group_input(sb, input);
+	if (!err) {
+		/* A single group is added as a flex group of one. */
+		flex_gd.count = 1;
+		flex_gd.groups = input;
+		flex_gd.bg_flags = &bg_flags;
+		err = ext4_flex_group_add(sb, inode, &flex_gd);
+	}
+
+	iput(inode);
+	return err;
+} /* ext4_group_add */
+
+/*
+ * extend a group without checking assuming that checking has been done.
+ *
+ * @sb: super block
+ * @o_blocks_count: current number of blocks in the filesystem
+ * @add: number of blocks to append to the last group
+ *
+ * Returns 0 on success or a negative errno.  The superblock backups are
+ * only refreshed when the whole transaction succeeded.
+ */
+static int ext4_group_extend_no_check(struct super_block *sb,
+				      ext4_fsblk_t o_blocks_count, ext4_grpblk_t add)
+{
+	struct ext4_super_block *es = EXT4_SB(sb)->s_es;
+	handle_t *handle;
+	int err = 0, err2;
+
+	/* We will update the superblock, one block bitmap, and
+	 * one group descriptor via ext4_group_add_blocks().
+	 */
+	handle = ext4_journal_start_sb(sb, 3);
+	if (IS_ERR(handle)) {
+		err = PTR_ERR(handle);
+		ext4_warning(sb, "error %d on journal start", err);
+		return err;
+	}
+
+	err = ext4_journal_get_write_access(handle, EXT4_SB(sb)->s_sbh);
+	if (err) {
+		ext4_warning(sb, "error %d on journal write access", err);
+		goto errout;
+	}
+
+	ext4_blocks_count_set(es, o_blocks_count + add);
+	ext4_free_blocks_count_set(es, ext4_free_blocks_count(es) + add);
+	ext4_debug("freeing blocks %llu through %llu\n", o_blocks_count,
+		   o_blocks_count + add);
+	/* We add the blocks to the bitmap and set the group need init bit */
+	err = ext4_group_add_blocks(handle, sb, o_blocks_count, add);
+	if (err)
+		goto errout;
+	/*
+	 * Do not ignore a failure to dirty the superblock; report it the
+	 * same way ext4_flex_group_add() does.
+	 */
+	err = ext4_handle_dirty_super(handle, sb);
+	if (err)
+		goto errout;
+	ext4_debug("freed blocks %llu through %llu\n", o_blocks_count,
+		   o_blocks_count + add);
+errout:
+	err2 = ext4_journal_stop(handle);
+	if (err2 && !err)
+		err = err2;
+
+	if (!err) {
+		if (test_opt(sb, DEBUG))
+			printk(KERN_DEBUG "EXT4-fs: extended group to %llu "
+			       "blocks\n", ext4_blocks_count(es));
+		update_backups(sb, EXT4_SB(sb)->s_sbh->b_blocknr, (char *)es,
+			       sizeof(struct ext4_super_block));
+	}
+	return err;
+}
+
+/*
+ * Extend the filesystem to the new number of blocks specified.  This entry
+ * point is only used to extend the current filesystem to the end of the last
+ * existing group.  It can be accessed via ioctl, or by "remount,resize=<size>"
+ * for emergencies (because it has no dependencies on reserved blocks).
+ *
+ * If we _really_ wanted, we could use default values to call ext4_group_add()
+ * allow the "remount" trick to work for arbitrary resizing, assuming enough
+ * GDT blocks are reserved to grow to the desired size.
+ *
+ * @sb: super block
+ * @es: on-disk superblock the current block count is read from
+ * @n_blocks_count: requested new number of blocks
+ *
+ * Returns 0 on success (or when there is nothing to do) and a negative
+ * errno otherwise.
+ */
+int ext4_group_extend(struct super_block *sb, struct ext4_super_block *es,
+		      ext4_fsblk_t n_blocks_count)
+{
+	ext4_fsblk_t o_blocks_count = ext4_blocks_count(es);
+	struct buffer_head *last_bh;
+	ext4_group_t group;
+	ext4_grpblk_t last;
+	ext4_grpblk_t add;
+
+	if (test_opt(sb, DEBUG))
+		ext4_msg(sb, KERN_DEBUG,
+			 "extending last group from %llu to %llu blocks",
+			 o_blocks_count, n_blocks_count);
+
+	/* Nothing requested, or already at the requested size. */
+	if (n_blocks_count == 0)
+		return 0;
+	if (n_blocks_count == o_blocks_count)
+		return 0;
+
+	if (n_blocks_count > (sector_t)(~0ULL) >> (sb->s_blocksize_bits - 9)) {
+		ext4_msg(sb, KERN_ERR,
+			 "filesystem too large to resize to %llu blocks safely",
+			 n_blocks_count);
+		if (sizeof(sector_t) < 8)
+			ext4_warning(sb, "CONFIG_LBDAF not enabled");
+		return -EINVAL;
+	}
+
+	if (n_blocks_count < o_blocks_count) {
+		ext4_warning(sb, "can't shrink FS - resize aborted");
+		return -EINVAL;
+	}
+
+	/* Handle the remaining blocks in the last group only. */
+	ext4_get_group_no_and_offset(sb, o_blocks_count, &group, &last);
+	if (!last) {
+		/* The fs already ends on a group boundary. */
+		ext4_warning(sb, "need to use ext2online to resize further");
+		return -EPERM;
+	}
+
+	/* Blocks still missing to complete the last group. */
+	add = EXT4_BLOCKS_PER_GROUP(sb) - last;
+
+	if (o_blocks_count + add < o_blocks_count) {
+		ext4_warning(sb, "blocks_count overflow");
+		return -EINVAL;
+	}
+
+	/* Never grow past what was asked for ... */
+	if (o_blocks_count + add > n_blocks_count)
+		add = n_blocks_count - o_blocks_count;
+
+	/* ... and say so when the request reaches beyond this group. */
+	if (o_blocks_count + add < n_blocks_count)
+		ext4_warning(sb, "will only finish group (%llu blocks, %u new)",
+			     o_blocks_count + add, add);
+
+	/* See if the device is actually as big as what was requested */
+	last_bh = sb_bread(sb, o_blocks_count + add - 1);
+	if (!last_bh) {
+		ext4_warning(sb, "can't read last block, resize aborted");
+		return -ENOSPC;
+	}
+	brelse(last_bh);
+
+	return ext4_group_extend_no_check(sb, o_blocks_count, add);
+} /* ext4_group_extend */
+
+/*
+ * ext4_resize_fs() resizes a fs to new size specified by @n_blocks_count
+ *
+ * @sb: super block of the fs to be resized
+ * @n_blocks_count: the number of blocks the resized fs should have
+ *
+ * The last existing group is completed first, then whole (flex) groups are
+ * added until the requested size is reached.  Shrinking is not supported.
+ *
+ * Returns 0 on success (or when there is nothing to do) and a negative
+ * errno otherwise.
+ */
+int ext4_resize_fs(struct super_block *sb, ext4_fsblk_t n_blocks_count)
+{
+	struct ext4_new_flex_group_data *flex_gd = NULL;
+	struct ext4_sb_info *sbi = EXT4_SB(sb);
+	struct ext4_super_block *es = sbi->s_es;
+	struct buffer_head *bh;
+	struct inode *resize_inode;
+	ext4_fsblk_t o_blocks_count;
+	ext4_group_t o_group;
+	ext4_group_t n_group;
+	ext4_grpblk_t offset, add;
+	unsigned long n_desc_blocks;
+	unsigned long o_desc_blocks;
+	unsigned long desc_blocks;
+	int err = 0, flexbg_size = 1;
+
+	o_blocks_count = ext4_blocks_count(es);
+
+	if (test_opt(sb, DEBUG))
+		ext4_msg(sb, KERN_DEBUG, "resizing filesystem from %llu "
+		       "to %llu blocks", o_blocks_count, n_blocks_count);
+
+	if (n_blocks_count < o_blocks_count) {
+		/* On-line shrinking not supported */
+		ext4_warning(sb, "can't shrink FS - resize aborted");
+		return -EINVAL;
+	}
+
+	if (n_blocks_count == o_blocks_count)
+		/* Nothing need to do */
+		return 0;
+
+	/*
+	 * Both calls store into "offset"; what is used below is the value
+	 * from the second call, the offset of the current last block.
+	 */
+	ext4_get_group_no_and_offset(sb, n_blocks_count - 1, &n_group, &offset);
+	ext4_get_group_no_and_offset(sb, o_blocks_count - 1, &o_group, &offset);
+
+	/* Number of additional GDT blocks the new size requires. */
+	n_desc_blocks = (n_group + EXT4_DESC_PER_BLOCK(sb)) /
+			EXT4_DESC_PER_BLOCK(sb);
+	o_desc_blocks = (sbi->s_groups_count + EXT4_DESC_PER_BLOCK(sb) - 1) /
+			EXT4_DESC_PER_BLOCK(sb);
+	desc_blocks = n_desc_blocks - o_desc_blocks;
+
+	if (desc_blocks &&
+	    (!EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_RESIZE_INODE) ||
+	     le16_to_cpu(es->s_reserved_gdt_blocks) < desc_blocks)) {
+		ext4_warning(sb, "No reserved GDT blocks, can't resize");
+		return -EPERM;
+	}
+
+	resize_inode = ext4_iget(sb, EXT4_RESIZE_INO);
+	if (IS_ERR(resize_inode)) {
+		ext4_warning(sb, "Error opening resize inode");
+		return PTR_ERR(resize_inode);
+	}
+
+	/* See if the device is actually as big as what was requested */
+	bh = sb_bread(sb, n_blocks_count - 1);
+	if (!bh) {
+		ext4_warning(sb, "can't read last block, resize aborted");
+		/* Do not leak the reference taken by ext4_iget() above. */
+		iput(resize_inode);
+		return -ENOSPC;
+	}
+	brelse(bh);
+
+	/* extend the last group */
+	if (n_group == o_group)
+		add = n_blocks_count - o_blocks_count;
+	else
+		add = EXT4_BLOCKS_PER_GROUP(sb) - (offset + 1);
+	if (add > 0) {
+		err = ext4_group_extend_no_check(sb, o_blocks_count, add);
+		if (err)
+			goto out;
+	}
+
+	if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_FLEX_BG) &&
+	    es->s_log_groups_per_flex)
+		flexbg_size = 1 << es->s_log_groups_per_flex;
+
+	/* Re-read the size: completing the last group may have changed it. */
+	o_blocks_count = ext4_blocks_count(es);
+	if (o_blocks_count == n_blocks_count)
+		goto out;
+
+	flex_gd = alloc_flex_gd(flexbg_size);
+	if (flex_gd == NULL) {
+		err = -ENOMEM;
+		goto out;
+	}
+
+	/* Add flex groups. Note that a regular group is a
+	 * flex group with 1 group.
+	 */
+	while (ext4_setup_next_flex_gd(sb, flex_gd, n_blocks_count,
+					      flexbg_size)) {
+		ext4_alloc_group_tables(sb, flex_gd, flexbg_size);
+		err = ext4_flex_group_add(sb, resize_inode, flex_gd);
+		if (unlikely(err))
+			break;
+	}
+
+out:
+	if (flex_gd)
+		free_flex_gd(flex_gd);
+
+	iput(resize_inode);
+	if (test_opt(sb, DEBUG))
+		ext4_msg(sb, KERN_DEBUG, "resized filesystem from %llu "
+		       "upto %llu blocks", o_blocks_count, n_blocks_count);
+	return err;
+}
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
new file mode 100644
index 00000000..a68703a5
--- /dev/null
+++ b/fs/ext4/super.c
@@ -0,0 +1,4980 @@
+/*
+ * linux/fs/ext4/super.c
+ *
+ * Copyright (C) 1992, 1993, 1994, 1995
+ * Remy Card (card@masi.ibp.fr)
+ * Laboratoire MASI - Institut Blaise Pascal
+ * Universite Pierre et Marie Curie (Paris VI)
+ *
+ * from
+ *
+ * linux/fs/minix/inode.c
+ *
+ * Copyright (C) 1991, 1992 Linus Torvalds
+ *
+ * Big-endian to little-endian byte-swapping/bitmaps by
+ * David S. Miller (davem@caip.rutgers.edu), 1995
+ */
+
+#include <linux/module.h>
+#include <linux/string.h>
+#include <linux/fs.h>
+#include <linux/time.h>
+#include <linux/vmalloc.h>
+#include <linux/jbd2.h>
+#include <linux/slab.h>
+#include <linux/init.h>
+#include <linux/blkdev.h>
+#include <linux/parser.h>
+#include <linux/buffer_head.h>
+#include <linux/exportfs.h>
+#include <linux/vfs.h>
+#include <linux/random.h>
+#include <linux/mount.h>
+#include <linux/namei.h>
+#include <linux/quotaops.h>
+#include <linux/seq_file.h>
+#include <linux/proc_fs.h>
+#include <linux/ctype.h>
+#include <linux/log2.h>
+#include <linux/crc16.h>
+#include <linux/cleancache.h>
+#include <asm/uaccess.h>
+
+#include <linux/kthread.h>
+#include <linux/freezer.h>
+
+#include "ext4.h"
+#include "ext4_extents.h"
+#include "ext4_jbd2.h"
+#include "xattr.h"
+#include "acl.h"
+#include "mballoc.h"
+
+#define CREATE_TRACE_POINTS
+#include <trace/events/ext4.h>
+
+/* Parent proc directory; the per-filesystem entry named by sb->s_id is removed from it in ext4_put_super(). */
+static struct proc_dir_entry *ext4_proc_root;
+/* NOTE(review): the users of the following four objects are outside this part of the file. */
+static struct kset *ext4_kset;
+static struct ext4_lazy_init *ext4_li_info;
+static struct mutex ext4_li_mtx;
+static struct ext4_features *ext4_feat;
+
+/* Forward declarations for functions that are referenced before their definitions. */
+static int ext4_load_journal(struct super_block *, struct ext4_super_block *,
+ unsigned long journal_devnum);
+static int ext4_show_options(struct seq_file *seq, struct dentry *root);
+static int ext4_commit_super(struct super_block *sb, int sync);
+static void ext4_mark_recovery_complete(struct super_block *sb,
+ struct ext4_super_block *es);
+static void ext4_clear_journal_err(struct super_block *sb,
+ struct ext4_super_block *es);
+static int ext4_sync_fs(struct super_block *sb, int wait);
+static const char *ext4_decode_error(struct super_block *sb, int errno,
+ char nbuf[16]);
+static int ext4_remount(struct super_block *sb, int *flags, char *data);
+static int ext4_statfs(struct dentry *dentry, struct kstatfs *buf);
+static int ext4_unfreeze(struct super_block *sb);
+static void ext4_write_super(struct super_block *sb);
+static int ext4_freeze(struct super_block *sb);
+static struct dentry *ext4_mount(struct file_system_type *fs_type, int flags,
+ const char *dev_name, void *data);
+static inline int ext2_feature_set_ok(struct super_block *sb);
+static inline int ext3_feature_set_ok(struct super_block *sb);
+static int ext4_feature_set_ok(struct super_block *sb, int readonly);
+static void ext4_destroy_lazyinit_thread(void);
+static void ext4_unregister_li_request(struct super_block *sb);
+static void ext4_clear_request_list(void);
+
+/*
+ * When no ext2 driver is configured and CONFIG_EXT4_USE_FOR_EXT23 is set,
+ * define an "ext2" filesystem type whose mounts are handled by ext4_mount().
+ */
+#if !defined(CONFIG_EXT2_FS) && !defined(CONFIG_EXT2_FS_MODULE) && defined(CONFIG_EXT4_USE_FOR_EXT23)
+static struct file_system_type ext2_fs_type = {
+ .owner = THIS_MODULE,
+ .name = "ext2",
+ .mount = ext4_mount,
+ .kill_sb = kill_block_super,
+ .fs_flags = FS_REQUIRES_DEV,
+};
+/* True when the block device behind @sb is held by ext2_fs_type. */
+#define IS_EXT2_SB(sb) ((sb)->s_bdev->bd_holder == &ext2_fs_type)
+#else
+/* Without the ext2 personality no superblock can be an ext2 one. */
+#define IS_EXT2_SB(sb) (0)
+#endif
+
+
+/*
+ * When no ext3 driver is configured and CONFIG_EXT4_USE_FOR_EXT23 is set,
+ * define an "ext3" filesystem type whose mounts are handled by ext4_mount().
+ */
+#if !defined(CONFIG_EXT3_FS) && !defined(CONFIG_EXT3_FS_MODULE) && defined(CONFIG_EXT4_USE_FOR_EXT23)
+static struct file_system_type ext3_fs_type = {
+ .owner = THIS_MODULE,
+ .name = "ext3",
+ .mount = ext4_mount,
+ .kill_sb = kill_block_super,
+ .fs_flags = FS_REQUIRES_DEV,
+};
+/* True when the block device behind @sb is held by ext3_fs_type. */
+#define IS_EXT3_SB(sb) ((sb)->s_bdev->bd_holder == &ext3_fs_type)
+#else
+/* Without the ext3 personality no superblock can be an ext3 one. */
+#define IS_EXT3_SB(sb) (0)
+#endif
+
+/*
+ * Allocate @size bytes, trying kmalloc() first and falling back to
+ * __vmalloc() when that fails.  Returns NULL only if both attempts fail.
+ * The result may come from either allocator, so release it with
+ * ext4_kvfree().
+ *
+ * __GFP_NOWARN is added to the kmalloc() attempt only: a failure there is
+ * handled by the fallback and should not produce an allocation-failure
+ * warning.  The fallback keeps the caller's flags so that a real failure
+ * is still reported.
+ */
+void *ext4_kvmalloc(size_t size, gfp_t flags)
+{
+	void *ret;
+
+	ret = kmalloc(size, flags | __GFP_NOWARN);
+	if (!ret)
+		ret = __vmalloc(size, flags, PAGE_KERNEL);
+	return ret;
+}
+
+/*
+ * Zero-filled variant of ext4_kvmalloc(): try kzalloc() first and fall
+ * back to __vmalloc() with __GFP_ZERO.  Returns NULL only if both attempts
+ * fail.  Release the result with ext4_kvfree().
+ *
+ * __GFP_NOWARN is added to the kzalloc() attempt only, because its failure
+ * is handled by the fallback and should not produce an allocation-failure
+ * warning.
+ */
+void *ext4_kvzalloc(size_t size, gfp_t flags)
+{
+	void *ret;
+
+	ret = kzalloc(size, flags | __GFP_NOWARN);
+	if (!ret)
+		ret = __vmalloc(size, flags | __GFP_ZERO, PAGE_KERNEL);
+	return ret;
+}
+
+/*
+ * Free memory with the allocator that matches its address: vfree() for
+ * vmalloc addresses, kfree() for everything else.
+ */
+void ext4_kvfree(void *ptr)
+{
+	if (!is_vmalloc_addr(ptr)) {
+		kfree(ptr);
+		return;
+	}
+	vfree(ptr);
+}
+
+/*
+ * Return the block bitmap location stored in group descriptor @bg.  The
+ * high 32 bits are only read when the descriptor size is at least
+ * EXT4_MIN_DESC_SIZE_64BIT.
+ */
+ext4_fsblk_t ext4_block_bitmap(struct super_block *sb,
+			       struct ext4_group_desc *bg)
+{
+	ext4_fsblk_t blk = le32_to_cpu(bg->bg_block_bitmap_lo);
+
+	if (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT)
+		blk |= (ext4_fsblk_t)le32_to_cpu(bg->bg_block_bitmap_hi) << 32;
+	return blk;
+}
+
+/*
+ * Return the inode bitmap location stored in group descriptor @bg.  The
+ * high 32 bits are only read when the descriptor size is at least
+ * EXT4_MIN_DESC_SIZE_64BIT.
+ */
+ext4_fsblk_t ext4_inode_bitmap(struct super_block *sb,
+			       struct ext4_group_desc *bg)
+{
+	ext4_fsblk_t blk = le32_to_cpu(bg->bg_inode_bitmap_lo);
+
+	if (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT)
+		blk |= (ext4_fsblk_t)le32_to_cpu(bg->bg_inode_bitmap_hi) << 32;
+	return blk;
+}
+
+/*
+ * Return the inode table location stored in group descriptor @bg.  The
+ * high 32 bits are only read when the descriptor size is at least
+ * EXT4_MIN_DESC_SIZE_64BIT.
+ */
+ext4_fsblk_t ext4_inode_table(struct super_block *sb,
+			      struct ext4_group_desc *bg)
+{
+	ext4_fsblk_t blk = le32_to_cpu(bg->bg_inode_table_lo);
+
+	if (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT)
+		blk |= (ext4_fsblk_t)le32_to_cpu(bg->bg_inode_table_hi) << 32;
+	return blk;
+}
+
+/*
+ * Return the free count held in bg_free_blocks_count_{lo,hi} of @bg.  The
+ * high 16 bits are only read when the descriptor size is at least
+ * EXT4_MIN_DESC_SIZE_64BIT.
+ */
+__u32 ext4_free_group_clusters(struct super_block *sb,
+			       struct ext4_group_desc *bg)
+{
+	__u32 count = le16_to_cpu(bg->bg_free_blocks_count_lo);
+
+	if (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT)
+		count |= (__u32)le16_to_cpu(bg->bg_free_blocks_count_hi) << 16;
+	return count;
+}
+
+/*
+ * Return the free inode count stored in group descriptor @bg.  The high
+ * 16 bits are only read when the descriptor size is at least
+ * EXT4_MIN_DESC_SIZE_64BIT.
+ */
+__u32 ext4_free_inodes_count(struct super_block *sb,
+			     struct ext4_group_desc *bg)
+{
+	__u32 count = le16_to_cpu(bg->bg_free_inodes_count_lo);
+
+	if (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT)
+		count |= (__u32)le16_to_cpu(bg->bg_free_inodes_count_hi) << 16;
+	return count;
+}
+
+/*
+ * Return the used-directories count stored in group descriptor @bg.  The
+ * high 16 bits are only read when the descriptor size is at least
+ * EXT4_MIN_DESC_SIZE_64BIT.
+ */
+__u32 ext4_used_dirs_count(struct super_block *sb,
+			   struct ext4_group_desc *bg)
+{
+	__u32 count = le16_to_cpu(bg->bg_used_dirs_count_lo);
+
+	if (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT)
+		count |= (__u32)le16_to_cpu(bg->bg_used_dirs_count_hi) << 16;
+	return count;
+}
+
+/*
+ * Return the bg_itable_unused value stored in group descriptor @bg.  The
+ * high 16 bits are only read when the descriptor size is at least
+ * EXT4_MIN_DESC_SIZE_64BIT.
+ */
+__u32 ext4_itable_unused_count(struct super_block *sb,
+			       struct ext4_group_desc *bg)
+{
+	__u32 count = le16_to_cpu(bg->bg_itable_unused_lo);
+
+	if (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT)
+		count |= (__u32)le16_to_cpu(bg->bg_itable_unused_hi) << 16;
+	return count;
+}
+
+/*
+ * Store @blk as the block bitmap location of group descriptor @bg.  The
+ * high half is written only when the descriptor size is at least
+ * EXT4_MIN_DESC_SIZE_64BIT.
+ */
+void ext4_block_bitmap_set(struct super_block *sb,
+			   struct ext4_group_desc *bg, ext4_fsblk_t blk)
+{
+	bg->bg_block_bitmap_lo = cpu_to_le32((u32)blk);
+
+	if (EXT4_DESC_SIZE(sb) < EXT4_MIN_DESC_SIZE_64BIT)
+		return;
+	bg->bg_block_bitmap_hi = cpu_to_le32(blk >> 32);
+}
+
+/*
+ * Store @blk as the inode bitmap location of group descriptor @bg.  The
+ * high half is written only when the descriptor size is at least
+ * EXT4_MIN_DESC_SIZE_64BIT.
+ */
+void ext4_inode_bitmap_set(struct super_block *sb,
+			   struct ext4_group_desc *bg, ext4_fsblk_t blk)
+{
+	bg->bg_inode_bitmap_lo = cpu_to_le32((u32)blk);
+
+	if (EXT4_DESC_SIZE(sb) < EXT4_MIN_DESC_SIZE_64BIT)
+		return;
+	bg->bg_inode_bitmap_hi = cpu_to_le32(blk >> 32);
+}
+
+/*
+ * Store @blk as the inode table location of group descriptor @bg.  The
+ * high half is written only when the descriptor size is at least
+ * EXT4_MIN_DESC_SIZE_64BIT.
+ */
+void ext4_inode_table_set(struct super_block *sb,
+			  struct ext4_group_desc *bg, ext4_fsblk_t blk)
+{
+	bg->bg_inode_table_lo = cpu_to_le32((u32)blk);
+
+	if (EXT4_DESC_SIZE(sb) < EXT4_MIN_DESC_SIZE_64BIT)
+		return;
+	bg->bg_inode_table_hi = cpu_to_le32(blk >> 32);
+}
+
+/*
+ * Store @count in bg_free_blocks_count_{lo,hi} of @bg.  The high 16 bits
+ * are written only when the descriptor size is at least
+ * EXT4_MIN_DESC_SIZE_64BIT.
+ */
+void ext4_free_group_clusters_set(struct super_block *sb,
+				  struct ext4_group_desc *bg, __u32 count)
+{
+	bg->bg_free_blocks_count_lo = cpu_to_le16((__u16)count);
+
+	if (EXT4_DESC_SIZE(sb) < EXT4_MIN_DESC_SIZE_64BIT)
+		return;
+	bg->bg_free_blocks_count_hi = cpu_to_le16(count >> 16);
+}
+
+/*
+ * Store @count as the free inode count of group descriptor @bg.  The high
+ * 16 bits are written only when the descriptor size is at least
+ * EXT4_MIN_DESC_SIZE_64BIT.
+ */
+void ext4_free_inodes_set(struct super_block *sb,
+			  struct ext4_group_desc *bg, __u32 count)
+{
+	bg->bg_free_inodes_count_lo = cpu_to_le16((__u16)count);
+
+	if (EXT4_DESC_SIZE(sb) < EXT4_MIN_DESC_SIZE_64BIT)
+		return;
+	bg->bg_free_inodes_count_hi = cpu_to_le16(count >> 16);
+}
+
+/*
+ * Store @count as the used-directories count of group descriptor @bg.
+ * The high 16 bits are written only when the descriptor size is at least
+ * EXT4_MIN_DESC_SIZE_64BIT.
+ */
+void ext4_used_dirs_set(struct super_block *sb,
+			struct ext4_group_desc *bg, __u32 count)
+{
+	bg->bg_used_dirs_count_lo = cpu_to_le16((__u16)count);
+
+	if (EXT4_DESC_SIZE(sb) < EXT4_MIN_DESC_SIZE_64BIT)
+		return;
+	bg->bg_used_dirs_count_hi = cpu_to_le16(count >> 16);
+}
+
+/*
+ * Store @count in bg_itable_unused_{lo,hi} of @bg.  The high 16 bits are
+ * written only when the descriptor size is at least
+ * EXT4_MIN_DESC_SIZE_64BIT.
+ */
+void ext4_itable_unused_set(struct super_block *sb,
+			    struct ext4_group_desc *bg, __u32 count)
+{
+	bg->bg_itable_unused_lo = cpu_to_le16((__u16)count);
+
+	if (EXT4_DESC_SIZE(sb) < EXT4_MIN_DESC_SIZE_64BIT)
+		return;
+	bg->bg_itable_unused_hi = cpu_to_le16(count >> 16);
+}
+
+
+/*
+ * Without a journal, current->journal_info holds a plain counter instead
+ * of a handle pointer.  Increment it and return the new value cast to a
+ * handle.  BUGs if the counter has reached EXT4_NOJOURNAL_MAX_REF_COUNT.
+ */
+static handle_t *ext4_get_nojournal(void)
+{
+	unsigned long refs = (unsigned long)current->journal_info;
+	handle_t *fake;
+
+	BUG_ON(refs >= EXT4_NOJOURNAL_MAX_REF_COUNT);
+
+	fake = (handle_t *)(refs + 1);
+	current->journal_info = fake;
+	return fake;
+}
+
+
+/*
+ * Counterpart of ext4_get_nojournal(): treat @handle as a counter,
+ * decrement it and store the result in current->journal_info.  BUGs if
+ * the counter is already zero.
+ */
+static void ext4_put_nojournal(handle_t *handle)
+{
+	unsigned long refs = (unsigned long)handle;
+
+	BUG_ON(refs == 0);
+
+	current->journal_info = (handle_t *)(refs - 1);
+}
+
+/*
+ * Wrappers for jbd2_journal_start/end.
+ *
+ * The only special thing we need to do here is to make sure that all
+ * journal_end calls result in the superblock being marked dirty, so
+ * that sync() will call the filesystem's write_super callback if
+ * appropriate.
+ *
+ * To avoid j_barrier hold in userspace when a user calls freeze(),
+ * ext4 prevents a new handle from being started by s_frozen, which
+ * is in an upper layer.
+ *
+ * Returns ERR_PTR(-EROFS) for a read-only superblock or an aborted
+ * journal, a counter-style handle from ext4_get_nojournal() when there
+ * is no journal, and the result of jbd2_journal_start() otherwise.
+ */
+handle_t *ext4_journal_start_sb(struct super_block *sb, int nblocks)
+{
+	journal_t *journal;
+
+	trace_ext4_journal_start(sb, nblocks, _RET_IP_);
+	if (sb->s_flags & MS_RDONLY)
+		return ERR_PTR(-EROFS);
+
+	journal = EXT4_SB(sb)->s_journal;
+
+	/*
+	 * If a handle has been started, it should be allowed to
+	 * finish, otherwise deadlock could happen between freeze
+	 * and others(e.g. truncate) due to the restart of the
+	 * journal handle if the filesystem is frozen and active
+	 * handles are not stopped.
+	 */
+	if (ext4_journal_current_handle() == NULL)
+		vfs_check_frozen(sb, SB_FREEZE_TRANS);
+
+	if (journal == NULL)
+		return ext4_get_nojournal();
+
+	if (!is_journal_aborted(journal))
+		return jbd2_journal_start(journal, nblocks);
+
+	/*
+	 * Special case here: if the journal has aborted behind our
+	 * backs (eg. EIO in the commit thread), then we still need to
+	 * take the FS itself readonly cleanly.
+	 */
+	ext4_abort(sb, "Detected aborted journal");
+	return ERR_PTR(-EROFS);
+}
+
+/*
+ * Stop @handle.  A counter-style (no journal) handle is simply dropped
+ * and 0 is returned.  Otherwise the handle's own h_err takes precedence
+ * over the return code of jbd2_journal_stop(); a non-zero result is
+ * reported through __ext4_std_error() with the caller's @where/@line and
+ * then returned.
+ */
+int __ext4_journal_stop(const char *where, unsigned int line, handle_t *handle)
+{
+	struct super_block *sb;
+	int handle_err;
+	int stop_rc;
+	int err;
+
+	if (!ext4_handle_valid(handle)) {
+		ext4_put_nojournal(handle);
+		return 0;
+	}
+
+	/* Captured before the handle is handed to jbd2_journal_stop(). */
+	sb = handle->h_transaction->t_journal->j_private;
+	handle_err = handle->h_err;
+	stop_rc = jbd2_journal_stop(handle);
+
+	err = handle_err ? handle_err : stop_rc;
+	if (err)
+		__ext4_std_error(sb, where, line, err);
+	return err;
+}
+
+/*
+ * Record @err on @handle (unless an error is already recorded) and abort
+ * the handle, logging which function (@err_fn) failed and where it was
+ * called from (@caller:@line).  Nothing is logged or aborted if the
+ * handle is already aborted.  @handle must be a real journal handle.
+ */
+void ext4_journal_abort_handle(const char *caller, unsigned int line,
+			       const char *err_fn, struct buffer_head *bh,
+			       handle_t *handle, int err)
+{
+	char nbuf[16];
+
+	BUG_ON(!ext4_handle_valid(handle));
+
+	if (bh)
+		BUFFER_TRACE(bh, "abort");
+
+	if (handle->h_err == 0)
+		handle->h_err = err;
+
+	if (!is_handle_aborted(handle)) {
+		printk(KERN_ERR "EXT4-fs: %s:%d: aborting transaction: %s in %s\n",
+		       caller, line, ext4_decode_error(NULL, err, nbuf),
+		       err_fn);
+
+		jbd2_journal_abort_handle(handle);
+	}
+}
+
+/*
+ * Record an error in the in-memory superblock copy without writing it
+ * out: set EXT4_ERROR_FS in both the mount state and es->s_state, update
+ * the "last error" fields, fill in the "first error" fields if they are
+ * still unset, and bump s_error_count.
+ */
+static void __save_error_info(struct super_block *sb, const char *func,
+			      unsigned int line)
+{
+	struct ext4_sb_info *sbi = EXT4_SB(sb);
+	struct ext4_super_block *es = sbi->s_es;
+
+	sbi->s_mount_state |= EXT4_ERROR_FS;
+	es->s_state |= cpu_to_le16(EXT4_ERROR_FS);
+
+	es->s_last_error_time = cpu_to_le32(get_seconds());
+	strncpy(es->s_last_error_func, func, sizeof(es->s_last_error_func));
+	es->s_last_error_line = cpu_to_le32(line);
+
+	if (es->s_first_error_time == 0) {
+		/* First error seen: copy the record that was just written. */
+		es->s_first_error_time = es->s_last_error_time;
+		strncpy(es->s_first_error_func, func,
+			sizeof(es->s_first_error_func));
+		es->s_first_error_line = cpu_to_le32(line);
+		es->s_first_error_ino = es->s_last_error_ino;
+		es->s_first_error_block = es->s_last_error_block;
+	}
+
+	/*
+	 * Start the daily error reporting function if it hasn't been
+	 * started already
+	 */
+	if (es->s_error_count == 0)
+		mod_timer(&sbi->s_err_report, jiffies + 24*60*60*HZ);
+	es->s_error_count = cpu_to_le32(le32_to_cpu(es->s_error_count) + 1);
+}
+
+/*
+ * Record the error in the superblock via __save_error_info() and then
+ * write the superblock out with ext4_commit_super(sb, 1).
+ */
+static void save_error_info(struct super_block *sb, const char *func,
+ unsigned int line)
+{
+ __save_error_info(sb, func, line);
+ ext4_commit_super(sb, 1);
+}
+
+/*
+ * The del_gendisk() function uninitializes the disk-specific data
+ * structures, including the bdi structure, without telling anyone
+ * else.  Once this happens, any attempt to call mark_buffer_dirty()
+ * (for example, by ext4_commit_super), will cause a kernel OOPS.
+ * This is a kludge to prevent these oops until we can put in a proper
+ * hook in del_gendisk() to inform the VFS and file system layers.
+ *
+ * Returns non-zero when the backing_dev_info reached through the block
+ * device inode's mapping has no device attached.
+ */
+static int block_device_ejected(struct super_block *sb)
+{
+	return sb->s_bdev->bd_inode->i_mapping->backing_dev_info->dev == NULL;
+}
+
+/*
+ * Run every ext4_journal_cb_entry queued on @txn's t_private_list.
+ *
+ * Each entry is unlinked while s_md_lock is held; the lock is dropped
+ * around the call to the entry's jce_func and retaken afterwards. The
+ * callbacks receive the result of is_journal_aborted() as their error
+ * argument.
+ */
+static void ext4_journal_commit_callback(journal_t *journal, transaction_t *txn)
+{
+ struct super_block *sb = journal->j_private;
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
+ int error = is_journal_aborted(journal);
+ struct ext4_journal_cb_entry *jce, *tmp;
+
+ spin_lock(&sbi->s_md_lock);
+ list_for_each_entry_safe(jce, tmp, &txn->t_private_list, jce_list) {
+ /* Unlink under the lock, then call back with the lock released. */
+ list_del_init(&jce->jce_list);
+ spin_unlock(&sbi->s_md_lock);
+ jce->jce_func(sb, jce, error);
+ spin_lock(&sbi->s_md_lock);
+ }
+ spin_unlock(&sbi->s_md_lock);
+}
+
+/* Deal with the reporting of failure conditions on a filesystem such as
+ * inconsistencies detected or read IO failures.
+ *
+ * On ext2, we can store the error state of the filesystem in the
+ * superblock. That is not possible on ext4, because we may have other
+ * write ordering constraints on the superblock which prevent us from
+ * writing it out straight away; and given that the journal is about to
+ * be aborted, we can't rely on the current, or future, transactions to
+ * write out the superblock safely.
+ *
+ * We'll just use the jbd2_journal_abort() error code to record an error in
+ * the journal instead. On recovery, the journal will complain about
+ * that error until we've noted it down and cleared it.
+ *
+ * The response depends on the errors= mount option: anything other than
+ * ERRORS_CONT marks the filesystem aborted and aborts the journal,
+ * ERRORS_RO additionally sets MS_RDONLY, and ERRORS_PANIC panics.
+ */
+
+static void ext4_handle_error(struct super_block *sb)
+{
+ /* Nothing to do for a filesystem that is already read-only. */
+ if (sb->s_flags & MS_RDONLY)
+ return;
+
+ if (!test_opt(sb, ERRORS_CONT)) {
+ journal_t *journal = EXT4_SB(sb)->s_journal;
+
+ EXT4_SB(sb)->s_mount_flags |= EXT4_MF_FS_ABORTED;
+ if (journal)
+ jbd2_journal_abort(journal, -EIO);
+ }
+ if (test_opt(sb, ERRORS_RO)) {
+ ext4_msg(sb, KERN_CRIT, "Remounting filesystem read-only");
+ sb->s_flags |= MS_RDONLY;
+ }
+ if (test_opt(sb, ERRORS_PANIC))
+ panic("EXT4-fs (device %s): panic forced after error\n",
+ sb->s_id);
+}
+
+/*
+ * Report a filesystem error: print the formatted message at KERN_CRIT
+ * together with the device, the reporting @function:@line and the current
+ * task name, save the error in the superblock, and then apply the
+ * configured error policy through ext4_handle_error().
+ */
+void __ext4_error(struct super_block *sb, const char *function,
+ unsigned int line, const char *fmt, ...)
+{
+ struct va_format vaf;
+ va_list args;
+
+ va_start(args, fmt);
+ vaf.fmt = fmt;
+ vaf.va = &args;
+ /* %pV expands the caller's format and arguments inside this one printk. */
+ printk(KERN_CRIT "EXT4-fs error (device %s): %s:%d: comm %s: %pV\n",
+ sb->s_id, function, line, current->comm, &vaf);
+ va_end(args);
+ save_error_info(sb, function, line);
+
+ ext4_handle_error(sb);
+}
+
+/*
+ * Report an error tied to @inode (and to @block when it is non-zero).
+ *
+ * The inode number and block are stored in the superblock's last-error
+ * fields before the error is saved, the message is printed at KERN_CRIT
+ * (the block is mentioned only when non-zero), and the configured error
+ * policy is applied through ext4_handle_error().
+ */
+void ext4_error_inode(struct inode *inode, const char *function,
+ unsigned int line, ext4_fsblk_t block,
+ const char *fmt, ...)
+{
+ va_list args;
+ struct va_format vaf;
+ struct ext4_super_block *es = EXT4_SB(inode->i_sb)->s_es;
+
+ /* Set these first so that save_error_info() records them with the error. */
+ es->s_last_error_ino = cpu_to_le32(inode->i_ino);
+ es->s_last_error_block = cpu_to_le64(block);
+ save_error_info(inode->i_sb, function, line);
+ va_start(args, fmt);
+ vaf.fmt = fmt;
+ vaf.va = &args;
+ if (block)
+ printk(KERN_CRIT "EXT4-fs error (device %s): %s:%d: "
+ "inode #%lu: block %llu: comm %s: %pV\n",
+ inode->i_sb->s_id, function, line, inode->i_ino,
+ block, current->comm, &vaf);
+ else
+ printk(KERN_CRIT "EXT4-fs error (device %s): %s:%d: "
+ "inode #%lu: comm %s: %pV\n",
+ inode->i_sb->s_id, function, line, inode->i_ino,
+ current->comm, &vaf);
+ va_end(args);
+
+ ext4_handle_error(inode->i_sb);
+}
+
+/*
+ * Report an error tied to an open @file (and to @block when non-zero).
+ *
+ * Works like ext4_error_inode() but also prints the file's path.  The
+ * path is resolved into an 80-byte stack buffer; if d_path() fails the
+ * message uses "(unknown)" instead.
+ *
+ * Both the inode number and the block are stored in the superblock's
+ * last-error fields before the error is saved, so the saved record does
+ * not keep a block number left over from an earlier, unrelated error.
+ */
+void ext4_error_file(struct file *file, const char *function,
+		     unsigned int line, ext4_fsblk_t block,
+		     const char *fmt, ...)
+{
+	va_list args;
+	struct va_format vaf;
+	struct ext4_super_block *es;
+	struct inode *inode = file->f_dentry->d_inode;
+	char pathname[80], *path;
+
+	es = EXT4_SB(inode->i_sb)->s_es;
+	es->s_last_error_ino = cpu_to_le32(inode->i_ino);
+	es->s_last_error_block = cpu_to_le64(block);
+	save_error_info(inode->i_sb, function, line);
+	path = d_path(&(file->f_path), pathname, sizeof(pathname));
+	if (IS_ERR(path))
+		path = "(unknown)";
+	va_start(args, fmt);
+	vaf.fmt = fmt;
+	vaf.va = &args;
+	if (block)
+		printk(KERN_CRIT
+		       "EXT4-fs error (device %s): %s:%d: inode #%lu: "
+		       "block %llu: comm %s: path %s: %pV\n",
+		       inode->i_sb->s_id, function, line, inode->i_ino,
+		       block, current->comm, path, &vaf);
+	else
+		printk(KERN_CRIT
+		       "EXT4-fs error (device %s): %s:%d: inode #%lu: "
+		       "comm %s: path %s: %pV\n",
+		       inode->i_sb->s_id, function, line, inode->i_ino,
+		       current->comm, path, &vaf);
+	va_end(args);
+
+	ext4_handle_error(inode->i_sb);
+}
+
+/*
+ * Translate a negative error code into a short description.
+ *
+ * -EIO and -ENOMEM map to fixed strings.  -EROFS is reported as an
+ * aborted journal when @sb is NULL or its journal has JBD2_ABORT set, and
+ * as a read-only filesystem otherwise.  Any other code is formatted into
+ * @nbuf (16 bytes) as "error N"; when @nbuf is NULL the result is NULL.
+ */
+static const char *ext4_decode_error(struct super_block *sb, int errno,
+				     char nbuf[16])
+{
+	if (errno == -EIO)
+		return "IO failure";
+
+	if (errno == -ENOMEM)
+		return "Out of memory";
+
+	if (errno == -EROFS) {
+		if (!sb || (EXT4_SB(sb)->s_journal &&
+			    EXT4_SB(sb)->s_journal->j_flags & JBD2_ABORT))
+			return "Journal has aborted";
+		return "Readonly filesystem";
+	}
+
+	/*
+	 * Unknown code: textualise it if the caller supplied a buffer,
+	 * otherwise fall through and return NULL.
+	 */
+	if (nbuf && snprintf(nbuf, 16, "error %d", -errno) >= 0)
+		return nbuf;
+
+	return NULL;
+}
+
+/* __ext4_std_error decodes expected errors from journaling functions
+ * automatically and invokes the appropriate error response.
+ *
+ * The error code is turned into text with ext4_decode_error(), printed at
+ * KERN_CRIT with the reporting @function:@line, saved in the superblock,
+ * and then handled by ext4_handle_error(). */
+
+void __ext4_std_error(struct super_block *sb, const char *function,
+ unsigned int line, int errno)
+{
+ char nbuf[16];
+ const char *errstr;
+
+ /* Special case: if the error is EROFS, and we're not already
+ * inside a transaction, then there's really no point in logging
+ * an error. */
+ if (errno == -EROFS && journal_current_handle() == NULL &&
+ (sb->s_flags & MS_RDONLY))
+ return;
+
+ errstr = ext4_decode_error(sb, errno, nbuf);
+ printk(KERN_CRIT "EXT4-fs error (device %s) in %s:%d: %s\n",
+ sb->s_id, function, line, errstr);
+ save_error_info(sb, function, line);
+
+ ext4_handle_error(sb);
+}
+
+/*
+ * ext4_abort is a much stronger failure handler than ext4_error.  The
+ * abort function may be used to deal with unrecoverable failures such
+ * as journal IO errors or ENOMEM at a critical moment in log management.
+ *
+ * We unconditionally force the filesystem into an ABORT|READONLY state,
+ * unless the error response on the fs has been set to panic in which
+ * case we take the easy way out and panic immediately.
+ *
+ * The message is emitted with a single printk() using %pV, like the other
+ * error reporters in this file, so that the prefix, the caller's text and
+ * the newline cannot be separated by output from other contexts.
+ */
+
+void __ext4_abort(struct super_block *sb, const char *function,
+		  unsigned int line, const char *fmt, ...)
+{
+	struct va_format vaf;
+	va_list args;
+
+	save_error_info(sb, function, line);
+	va_start(args, fmt);
+	vaf.fmt = fmt;
+	vaf.va = &args;
+	printk(KERN_CRIT "EXT4-fs error (device %s): %s:%d: %pV\n",
+	       sb->s_id, function, line, &vaf);
+	va_end(args);
+
+	if ((sb->s_flags & MS_RDONLY) == 0) {
+		ext4_msg(sb, KERN_CRIT, "Remounting filesystem read-only");
+		sb->s_flags |= MS_RDONLY;
+		EXT4_SB(sb)->s_mount_flags |= EXT4_MF_FS_ABORTED;
+		if (EXT4_SB(sb)->s_journal)
+			jbd2_journal_abort(EXT4_SB(sb)->s_journal, -EIO);
+		/*
+		 * NOTE(review): this is the second save_error_info() call on
+		 * this path, so s_error_count is bumped twice for one abort.
+		 * Kept as-is; confirm whether that is intended.
+		 */
+		save_error_info(sb, function, line);
+	}
+	if (test_opt(sb, ERRORS_PANIC))
+		panic("EXT4-fs panic from previous error\n");
+}
+
+/*
+ * Print a message about @sb. @prefix is placed at the very start of the
+ * printk format (callers in this file pass a KERN_* level), followed by
+ * "EXT4-fs (<device>): " and the formatted text.
+ */
+void ext4_msg(struct super_block *sb, const char *prefix, const char *fmt, ...)
+{
+ struct va_format vaf;
+ va_list args;
+
+ va_start(args, fmt);
+ vaf.fmt = fmt;
+ vaf.va = &args;
+ printk("%sEXT4-fs (%s): %pV\n", prefix, sb->s_id, &vaf);
+ va_end(args);
+}
+
+/*
+ * Print a KERN_WARNING message for @sb that names the reporting
+ * @function:@line. Unlike the error reporters, nothing is saved in the
+ * superblock and no error policy is applied.
+ */
+void __ext4_warning(struct super_block *sb, const char *function,
+ unsigned int line, const char *fmt, ...)
+{
+ struct va_format vaf;
+ va_list args;
+
+ va_start(args, fmt);
+ vaf.fmt = fmt;
+ vaf.va = &args;
+ printk(KERN_WARNING "EXT4-fs warning (device %s): %s:%d: %pV\n",
+ sb->s_id, function, line, &vaf);
+ va_end(args);
+}
+
+/*
+ * Report an error about block group @grp; the __releases/__acquires
+ * annotations below mark that the group bitlock may be dropped and
+ * retaken here.
+ *
+ * The error is recorded with __save_error_info(), i.e. without the
+ * synchronous superblock write done by save_error_info(). With
+ * ERRORS_CONT the superblock is committed with ext4_commit_super(sb, 0)
+ * and the function returns with the group lock untouched; otherwise the
+ * group is unlocked around ext4_handle_error() and locked again.
+ */
+void __ext4_grp_locked_error(const char *function, unsigned int line,
+ struct super_block *sb, ext4_group_t grp,
+ unsigned long ino, ext4_fsblk_t block,
+ const char *fmt, ...)
+__releases(bitlock)
+__acquires(bitlock)
+{
+ struct va_format vaf;
+ va_list args;
+ struct ext4_super_block *es = EXT4_SB(sb)->s_es;
+
+ es->s_last_error_ino = cpu_to_le32(ino);
+ es->s_last_error_block = cpu_to_le64(block);
+ __save_error_info(sb, function, line);
+
+ va_start(args, fmt);
+
+ vaf.fmt = fmt;
+ vaf.va = &args;
+ /* The message is assembled from several printk calls; KERN_CONT continues the line. */
+ printk(KERN_CRIT "EXT4-fs error (device %s): %s:%d: group %u, ",
+ sb->s_id, function, line, grp);
+ if (ino)
+ printk(KERN_CONT "inode %lu: ", ino);
+ if (block)
+ printk(KERN_CONT "block %llu:", (unsigned long long) block);
+ printk(KERN_CONT "%pV\n", &vaf);
+ va_end(args);
+
+ if (test_opt(sb, ERRORS_CONT)) {
+ ext4_commit_super(sb, 0);
+ return;
+ }
+
+ ext4_unlock_group(sb, grp);
+ ext4_handle_error(sb);
+ /*
+ * We only get here in the ERRORS_RO case; relocking the group
+ * may be dangerous, but nothing bad will happen since the
+ * filesystem will have already been marked read/only and the
+ * journal has been aborted. This function returns void, so
+ * callers who might want to distinguish between the ERRORS_CONT
+ * and ERRORS_RO cases, and perhaps return more aggressively from
+ * the ext4 function in question with a more appropriate error
+ * code, get no hint from here.
+ */
+ ext4_lock_group(sb, grp);
+ return;
+}
+
+/*
+ * Upgrade the in-memory superblock from EXT4_GOOD_OLD_REV to
+ * EXT4_DYNAMIC_REV. Does nothing if the revision level is already above
+ * EXT4_GOOD_OLD_REV; otherwise warns and fills in the first-inode and
+ * inode-size fields with their GOOD_OLD values before raising the
+ * revision level.
+ */
+void ext4_update_dynamic_rev(struct super_block *sb)
+{
+ struct ext4_super_block *es = EXT4_SB(sb)->s_es;
+
+ if (le32_to_cpu(es->s_rev_level) > EXT4_GOOD_OLD_REV)
+ return;
+
+ ext4_warning(sb,
+ "updating to rev %d because of new feature flag, "
+ "running e2fsck is recommended",
+ EXT4_DYNAMIC_REV);
+
+ es->s_first_ino = cpu_to_le32(EXT4_GOOD_OLD_FIRST_INO);
+ es->s_inode_size = cpu_to_le16(EXT4_GOOD_OLD_INODE_SIZE);
+ es->s_rev_level = cpu_to_le32(EXT4_DYNAMIC_REV);
+ /* leave es->s_feature_*compat flags alone */
+ /* es->s_uuid will be set by e2fsck if empty */
+
+ /*
+ * The rest of the superblock fields should be zero, and if not it
+ * means they are likely already in use, so leave them alone. We
+ * can leave it up to e2fsck to clean up any inconsistencies there.
+ */
+}
+
+/*
+ * Open the external journal device @dev for exclusive read/write access
+ * with @sb as the holder.  Returns the block device, or NULL after
+ * logging the failure.
+ */
+static struct block_device *ext4_blkdev_get(dev_t dev, struct super_block *sb)
+{
+	char b[BDEVNAME_SIZE];
+	struct block_device *bdev;
+
+	bdev = blkdev_get_by_dev(dev, FMODE_READ|FMODE_WRITE|FMODE_EXCL, sb);
+	if (!IS_ERR(bdev))
+		return bdev;
+
+	ext4_msg(sb, KERN_ERR, "failed to open journal device %s: %ld",
+		 __bdevname(dev, b), PTR_ERR(bdev));
+	return NULL;
+}
+
+/*
+ * Release the journal device
+ *
+ * Passes blkdev_put() the same mode flags that ext4_blkdev_get() used to
+ * open the device and returns its result.
+ */
+static int ext4_blkdev_put(struct block_device *bdev)
+{
+ return blkdev_put(bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL);
+}
+
+/*
+ * Release the journal block device recorded in @sbi, if any, and clear
+ * the pointer.  Returns the result of ext4_blkdev_put(), or -ENODEV when
+ * no journal device was recorded.
+ */
+static int ext4_blkdev_remove(struct ext4_sb_info *sbi)
+{
+	struct block_device *bdev = sbi->journal_bdev;
+	int ret;
+
+	if (!bdev)
+		return -ENODEV;
+
+	ret = ext4_blkdev_put(bdev);
+	sbi->journal_bdev = NULL;
+	return ret;
+}
+
+/* Map an i_orphan list node back to the VFS inode embedded in the same ext4_inode_info. */
+static inline struct inode *orphan_list_entry(struct list_head *l)
+{
+ return &list_entry(l, struct ext4_inode_info, i_orphan)->vfs_inode;
+}
+
+/*
+ * Debugging aid: log the on-disk orphan list head (s_last_orphan) and
+ * then every inode on the in-memory sbi->s_orphan list.
+ */
+static void dump_orphan_list(struct super_block *sb, struct ext4_sb_info *sbi)
+{
+ struct list_head *l;
+
+ ext4_msg(sb, KERN_ERR, "sb orphan head is %d",
+ le32_to_cpu(sbi->s_es->s_last_orphan));
+
+ printk(KERN_ERR "sb_info orphan list:\n");
+ list_for_each(l, &sbi->s_orphan) {
+ struct inode *inode = orphan_list_entry(l);
+ printk(KERN_ERR " "
+ "inode %s:%lu at %p: mode %o, nlink %d, next %d\n",
+ inode->i_sb->s_id, inode->i_ino, inode,
+ inode->i_mode, inode->i_nlink,
+ NEXT_ORPHAN(inode));
+ }
+}
+
+/*
+ * Tear down an ext4 superblock (the .put_super operation).
+ *
+ * Outside the superblock lock: unregister the lazy-init request, disable
+ * quota usage/limits and drain and destroy the unwritten-extent
+ * workqueue. Under lock_super(): destroy the journal, release the
+ * allocator and other subsystems, write the superblock if needed, and
+ * free the per-superblock resources. The kobject is dropped last, after
+ * the lock has been released, and sbi is freed once the kobject release
+ * has completed.
+ */
+static void ext4_put_super(struct super_block *sb)
+{
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
+ struct ext4_super_block *es = sbi->s_es;
+ int i, err;
+
+ ext4_unregister_li_request(sb);
+ dquot_disable(sb, -1, DQUOT_USAGE_ENABLED | DQUOT_LIMITS_ENABLED);
+
+ flush_workqueue(sbi->dio_unwritten_wq);
+ destroy_workqueue(sbi->dio_unwritten_wq);
+
+ lock_super(sb);
+ if (sbi->s_journal) {
+ err = jbd2_journal_destroy(sbi->s_journal);
+ /* Cleared before ext4_abort() so that the abort path sees no journal. */
+ sbi->s_journal = NULL;
+ if (err < 0)
+ ext4_abort(sb, "Couldn't clean up the journal");
+ }
+
+ del_timer(&sbi->s_err_report);
+ ext4_release_system_zone(sb);
+ ext4_mb_release(sb);
+ ext4_ext_release(sb);
+ ext4_xattr_put_super(sb);
+
+ /* On a writable filesystem, drop the RECOVER flag and store the mount state. */
+ if (!(sb->s_flags & MS_RDONLY)) {
+ EXT4_CLEAR_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER);
+ es->s_state = cpu_to_le16(sbi->s_mount_state);
+ }
+ if (sb->s_dirt || !(sb->s_flags & MS_RDONLY))
+ ext4_commit_super(sb, 1);
+
+ if (sbi->s_proc) {
+ remove_proc_entry("options", sbi->s_proc);
+ remove_proc_entry(sb->s_id, ext4_proc_root);
+ }
+ kobject_del(&sbi->s_kobj);
+
+ /* Release the group descriptor buffers, then the arrays and counters. */
+ for (i = 0; i < sbi->s_gdb_count; i++)
+ brelse(sbi->s_group_desc[i]);
+ ext4_kvfree(sbi->s_group_desc);
+ ext4_kvfree(sbi->s_flex_groups);
+ percpu_counter_destroy(&sbi->s_freeclusters_counter);
+ percpu_counter_destroy(&sbi->s_freeinodes_counter);
+ percpu_counter_destroy(&sbi->s_dirs_counter);
+ percpu_counter_destroy(&sbi->s_dirtyclusters_counter);
+ brelse(sbi->s_sbh);
+#ifdef CONFIG_QUOTA
+ for (i = 0; i < MAXQUOTAS; i++)
+ kfree(sbi->s_qf_names[i]);
+#endif
+
+ /* Debugging code just in case the in-memory inode orphan list
+ * isn't empty. The on-disk one can be non-empty if we've
+ * detected an error and taken the fs readonly, but the
+ * in-memory list had better be clean by this point. */
+ if (!list_empty(&sbi->s_orphan))
+ dump_orphan_list(sb, sbi);
+ J_ASSERT(list_empty(&sbi->s_orphan));
+
+ invalidate_bdev(sb->s_bdev);
+ if (sbi->journal_bdev && sbi->journal_bdev != sb->s_bdev) {
+ /*
+ * Invalidate the journal device's buffers. We don't want them
+ * floating about in memory - the physical journal device may
+ * be hotswapped, and it breaks the `ro-after' testing code.
+ */
+ sync_blockdev(sbi->journal_bdev);
+ invalidate_bdev(sbi->journal_bdev);
+ ext4_blkdev_remove(sbi);
+ }
+ if (sbi->s_mmp_tsk)
+ kthread_stop(sbi->s_mmp_tsk);
+ sb->s_fs_info = NULL;
+ /*
+ * Now that we are completely done shutting down the
+ * superblock, we need to actually destroy the kobject.
+ */
+ unlock_super(sb);
+ kobject_put(&sbi->s_kobj);
+ /* sbi embeds the kobject, so wait for s_kobj_unregister before freeing it. */
+ wait_for_completion(&sbi->s_kobj_unregister);
+ kfree(sbi->s_blockgroup_lock);
+ kfree(sbi);
+}
+
+static struct kmem_cache *ext4_inode_cachep;
+
+/*
+ * Called inside transaction, so use GFP_NOFS
+ *
+ * Allocate an ext4_inode_info from ext4_inode_cachep and reset its
+ * per-use fields. The members initialised by the cache constructor
+ * init_once() (i_orphan, the rw-semaphores and the VFS inode) are not
+ * touched here. Returns the embedded VFS inode, or NULL if the
+ * allocation fails.
+ */
+static struct inode *ext4_alloc_inode(struct super_block *sb)
+{
+ struct ext4_inode_info *ei;
+
+ ei = kmem_cache_alloc(ext4_inode_cachep, GFP_NOFS);
+ if (!ei)
+ return NULL;
+
+ ei->vfs_inode.i_version = 1;
+ ei->vfs_inode.i_data.writeback_index = 0;
+ memset(&ei->i_cached_extent, 0, sizeof(struct ext4_ext_cache));
+ INIT_LIST_HEAD(&ei->i_prealloc_list);
+ spin_lock_init(&ei->i_prealloc_lock);
+ /* Reservation accounting starts from zero. */
+ ei->i_reserved_data_blocks = 0;
+ ei->i_reserved_meta_blocks = 0;
+ ei->i_allocated_meta_blocks = 0;
+ ei->i_da_metadata_calc_len = 0;
+ spin_lock_init(&(ei->i_block_reservation_lock));
+#ifdef CONFIG_QUOTA
+ ei->i_reserved_quota = 0;
+#endif
+ ei->jinode = NULL;
+ INIT_LIST_HEAD(&ei->i_completed_io_list);
+ spin_lock_init(&ei->i_completed_io_lock);
+ ei->cur_aio_dio = NULL;
+ ei->i_sync_tid = 0;
+ ei->i_datasync_tid = 0;
+ atomic_set(&ei->i_ioend_count, 0);
+ atomic_set(&ei->i_aiodio_unwritten, 0);
+
+ return &ei->vfs_inode;
+}
+
+/* .drop_inode operation: return generic_drop_inode()'s decision after passing it to the ext4 tracepoint. */
+static int ext4_drop_inode(struct inode *inode)
+{
+ int drop = generic_drop_inode(inode);
+
+ trace_ext4_drop_inode(inode, drop);
+ return drop;
+}
+
+/* RCU callback queued by ext4_destroy_inode(): return the ext4_inode_info to its slab cache. */
+static void ext4_i_callback(struct rcu_head *head)
+{
+ struct inode *inode = container_of(head, struct inode, i_rcu);
+ kmem_cache_free(ext4_inode_cachep, EXT4_I(inode));
+}
+
+/*
+ * .destroy_inode operation.  If the inode is still linked on an orphan
+ * list, log that together with a hex dump of the ext4_inode_info and a
+ * stack trace.  In every case the inode is then freed through
+ * ext4_i_callback() via call_rcu().
+ */
+static void ext4_destroy_inode(struct inode *inode)
+{
+	struct ext4_inode_info *ei = EXT4_I(inode);
+
+	if (!list_empty(&ei->i_orphan)) {
+		ext4_msg(inode->i_sb, KERN_ERR,
+			 "Inode %lu (%p): orphan list check failed!",
+			 inode->i_ino, ei);
+		print_hex_dump(KERN_INFO, "", DUMP_PREFIX_ADDRESS, 16, 4,
+			       ei, sizeof(struct ext4_inode_info),
+			       true);
+		dump_stack();
+	}
+
+	call_rcu(&inode->i_rcu, ext4_i_callback);
+}
+
+/*
+ * Constructor passed to kmem_cache_create() for the ext4 inode cache:
+ * initialise the orphan list head, the rw-semaphores and the embedded
+ * VFS inode of a new ext4_inode_info object.
+ */
+static void init_once(void *foo)
+{
+	struct ext4_inode_info *info = foo;
+
+	INIT_LIST_HEAD(&info->i_orphan);
+#ifdef CONFIG_EXT4_FS_XATTR
+	init_rwsem(&info->xattr_sem);
+#endif
+	init_rwsem(&info->i_data_sem);
+	inode_init_once(&info->vfs_inode);
+}
+
+/*
+ * Create the slab cache for ext4_inode_info objects, with init_once() as
+ * its constructor.  Returns 0 on success and -ENOMEM if the cache could
+ * not be created.
+ */
+static int init_inodecache(void)
+{
+	ext4_inode_cachep = kmem_cache_create("ext4_inode_cache",
+					      sizeof(struct ext4_inode_info),
+					      0,
+					      SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD,
+					      init_once);
+
+	return ext4_inode_cachep ? 0 : -ENOMEM;
+}
+
+/* Destroy the slab cache created by init_inodecache(). */
+static void destroy_inodecache(void)
+{
+ kmem_cache_destroy(ext4_inode_cachep);
+}
+
+/*
+ * Release what ext4 attached to @inode: invalidate its buffers, end
+ * writeback, drop quota references and discard preallocations.  If a
+ * jbd2 inode is attached, it is released from the journal, freed, and
+ * the pointer is cleared.
+ */
+void ext4_clear_inode(struct inode *inode)
+{
+	struct ext4_inode_info *ei = EXT4_I(inode);
+
+	invalidate_inode_buffers(inode);
+	end_writeback(inode);
+	dquot_drop(inode);
+	ext4_discard_preallocations(inode);
+
+	if (!ei->jinode)
+		return;
+
+	jbd2_journal_release_jbd_inode(EXT4_JOURNAL(inode), ei->jinode);
+	jbd2_free_inode(ei->jinode);
+	ei->jinode = NULL;
+}
+
+/*
+ * Look up an inode for NFS file-handle decoding.
+ *
+ * Returns ERR_PTR(-ESTALE) when @ino is below the first regular inode
+ * (the root inode excepted), is above the superblock's inode count, or
+ * when a non-zero @generation does not match the inode's generation.
+ * Errors from ext4_iget() are passed through.
+ */
+static struct inode *ext4_nfs_get_inode(struct super_block *sb,
+					u64 ino, u32 generation)
+{
+	struct inode *inode;
+
+	if ((ino < EXT4_FIRST_INO(sb) && ino != EXT4_ROOT_INO) ||
+	    ino > le32_to_cpu(EXT4_SB(sb)->s_es->s_inodes_count))
+		return ERR_PTR(-ESTALE);
+
+	/* iget isn't really right if the inode is currently unallocated!!
+	 *
+	 * ext4_read_inode will return a bad_inode if the inode had been
+	 * deleted, so we should be safe.
+	 *
+	 * Currently we don't know the generation for parent directory, so
+	 * a generation of 0 means "accept any"
+	 */
+	inode = ext4_iget(sb, ino);
+	if (IS_ERR(inode))
+		return ERR_CAST(inode);
+
+	if (!generation || inode->i_generation == generation)
+		return inode;
+
+	iput(inode);
+	return ERR_PTR(-ESTALE);
+}
+
+/* File-handle-to-dentry lookup: defer to generic_fh_to_dentry(), with ext4_nfs_get_inode() fetching the inode. */
+static struct dentry *ext4_fh_to_dentry(struct super_block *sb, struct fid *fid,
+ int fh_len, int fh_type)
+{
+ return generic_fh_to_dentry(sb, fid, fh_len, fh_type,
+ ext4_nfs_get_inode);
+}
+
+/* File-handle-to-parent lookup: defer to generic_fh_to_parent(), with ext4_nfs_get_inode() fetching the inode. */
+static struct dentry *ext4_fh_to_parent(struct super_block *sb, struct fid *fid,
+ int fh_len, int fh_type)
+{
+ return generic_fh_to_parent(sb, fid, fh_len, fh_type,
+ ext4_nfs_get_inode);
+}
+
+/*
+ * Try to release metadata pages (indirect blocks, directories) which are
+ * mapped via the block device.  Since these pages could have journal heads
+ * which would prevent try_to_free_buffers() from freeing them, we must use
+ * jbd2 layer's try_to_free_buffers() function to release them.
+ *
+ * Returns 0 for a page without buffers.  With a journal, the jbd2 helper
+ * is called with __GFP_WAIT masked out of @wait; without one, plain
+ * try_to_free_buffers() is used.
+ */
+static int bdev_try_to_free_page(struct super_block *sb, struct page *page,
+				 gfp_t wait)
+{
+	journal_t *journal = EXT4_SB(sb)->s_journal;
+
+	WARN_ON(PageChecked(page));
+
+	if (!page_has_buffers(page))
+		return 0;
+
+	if (!journal)
+		return try_to_free_buffers(page);
+
+	return jbd2_journal_try_to_free_buffers(journal, page,
+						wait & ~__GFP_WAIT);
+}
+
+#ifdef CONFIG_QUOTA
+/* Human-readable name for quota type @t: "user" for USRQUOTA, "group" otherwise. */
+#define QTYPE2NAME(t) ((t) == USRQUOTA ? "user" : "group")
+/*
+ * Build the identifier <on>USRJQUOTA or <on>GRPJQUOTA depending on quota
+ * type @t.  @on must be a bare identifier prefix: it is pasted directly,
+ * because pasting a parenthesised "(on)" onto a name would try to join ')'
+ * with an identifier, which is not a valid preprocessing token.
+ */
+#define QTYPE2MOPT(on, t) ((t) == USRQUOTA ? (on##USRJQUOTA) : (on##GRPJQUOTA))
+
+static int ext4_write_dquot(struct dquot *dquot);
+static int ext4_acquire_dquot(struct dquot *dquot);
+static int ext4_release_dquot(struct dquot *dquot);
+static int ext4_mark_dquot_dirty(struct dquot *dquot);
+static int ext4_write_info(struct super_block *sb, int type);
+static int ext4_quota_on(struct super_block *sb, int type, int format_id,
+ struct path *path);
+static int ext4_quota_off(struct super_block *sb, int type);
+static int ext4_quota_on_mount(struct super_block *sb, int type);
+static ssize_t ext4_quota_read(struct super_block *sb, int type, char *data,
+ size_t len, loff_t off);
+static ssize_t ext4_quota_write(struct super_block *sb, int type,
+ const char *data, size_t len, loff_t off);
+
+/*
+ * Per-dquot callbacks.  Allocation and destruction use the quota core's
+ * dquot_alloc()/dquot_destroy() directly; the write/acquire/release/dirty
+ * and info hooks are the ext4_* functions declared above.
+ */
+static const struct dquot_operations ext4_quota_operations = {
+ .get_reserved_space = ext4_get_reserved_space,
+ .write_dquot = ext4_write_dquot,
+ .acquire_dquot = ext4_acquire_dquot,
+ .release_dquot = ext4_release_dquot,
+ .mark_dirty = ext4_mark_dquot_dirty,
+ .write_info = ext4_write_info,
+ .alloc_dquot = dquot_alloc,
+ .destroy_dquot = dquot_destroy,
+};
+
+/*
+ * quotactl entry points.  Only quota_on/quota_off are ext4-specific; sync
+ * and the info/dqblk getters and setters are the generic dquot_* helpers.
+ */
+static const struct quotactl_ops ext4_qctl_operations = {
+ .quota_on = ext4_quota_on,
+ .quota_off = ext4_quota_off,
+ .quota_sync = dquot_quota_sync,
+ .get_info = dquot_get_dqinfo,
+ .set_info = dquot_set_dqinfo,
+ .get_dqblk = dquot_get_dqblk,
+ .set_dqblk = dquot_set_dqblk
+};
+#endif
+
+/*
+ * Superblock operations table that provides sync_fs and freeze/unfreeze
+ * hooks and no write_super hook (compare ext4_nojournal_sops below).
+ * NOTE(review): presumably the table installed when a journal is in use --
+ * confirm against the code that assigns sb->s_op.
+ */
+static const struct super_operations ext4_sops = {
+ .alloc_inode = ext4_alloc_inode,
+ .destroy_inode = ext4_destroy_inode,
+ .write_inode = ext4_write_inode,
+ .dirty_inode = ext4_dirty_inode,
+ .drop_inode = ext4_drop_inode,
+ .evict_inode = ext4_evict_inode,
+ .put_super = ext4_put_super,
+ .sync_fs = ext4_sync_fs,
+ .freeze_fs = ext4_freeze,
+ .unfreeze_fs = ext4_unfreeze,
+ .statfs = ext4_statfs,
+ .remount_fs = ext4_remount,
+ .show_options = ext4_show_options,
+#ifdef CONFIG_QUOTA
+ /* quota file I/O hooks exist only when quota support is built in */
+ .quota_read = ext4_quota_read,
+ .quota_write = ext4_quota_write,
+#endif
+ .bdev_try_to_free_page = bdev_try_to_free_page,
+};
+
+/*
+ * Superblock operations table that differs from ext4_sops in two ways: it
+ * provides a write_super hook, and it has no sync_fs, freeze_fs or
+ * unfreeze_fs hooks.
+ * NOTE(review): by its name this is the table for journal-less mounts --
+ * confirm against the code that assigns sb->s_op.
+ */
+static const struct super_operations ext4_nojournal_sops = {
+ .alloc_inode = ext4_alloc_inode,
+ .destroy_inode = ext4_destroy_inode,
+ .write_inode = ext4_write_inode,
+ .dirty_inode = ext4_dirty_inode,
+ .drop_inode = ext4_drop_inode,
+ .evict_inode = ext4_evict_inode,
+ .write_super = ext4_write_super,
+ .put_super = ext4_put_super,
+ .statfs = ext4_statfs,
+ .remount_fs = ext4_remount,
+ .show_options = ext4_show_options,
+#ifdef CONFIG_QUOTA
+ /* quota file I/O hooks exist only when quota support is built in */
+ .quota_read = ext4_quota_read,
+ .quota_write = ext4_quota_write,
+#endif
+ .bdev_try_to_free_page = bdev_try_to_free_page,
+};
+
+/*
+ * Export operations: file-handle decoding via the two ext4_fh_to_*()
+ * wrappers above, plus ext4_get_parent() for parent lookup.
+ */
+static const struct export_operations ext4_export_ops = {
+ .fh_to_dentry = ext4_fh_to_dentry,
+ .fh_to_parent = ext4_fh_to_parent,
+ .get_parent = ext4_get_parent,
+};
+
+/*
+ * Token identifiers for mount options.  Each value is paired with one or
+ * more textual patterns in tokens[] below and is what match_token() hands
+ * back to parse_options().  Opt_err terminates both tokens[] and
+ * ext4_mount_opts[]; Opt_removed marks options that are accepted but
+ * ignored with a warning.
+ */
+enum {
+ Opt_bsd_df, Opt_minix_df, Opt_grpid, Opt_nogrpid,
+ Opt_resgid, Opt_resuid, Opt_sb, Opt_err_cont, Opt_err_panic, Opt_err_ro,
+ Opt_nouid32, Opt_debug, Opt_removed,
+ Opt_user_xattr, Opt_nouser_xattr, Opt_acl, Opt_noacl,
+ Opt_auto_da_alloc, Opt_noauto_da_alloc, Opt_noload,
+ Opt_commit, Opt_min_batch_time, Opt_max_batch_time,
+ Opt_journal_dev, Opt_journal_checksum, Opt_journal_async_commit,
+ Opt_abort, Opt_data_journal, Opt_data_ordered, Opt_data_writeback,
+ Opt_data_err_abort, Opt_data_err_ignore,
+ Opt_usrjquota, Opt_grpjquota, Opt_offusrjquota, Opt_offgrpjquota,
+ Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0, Opt_jqfmt_vfsv1, Opt_quota,
+ Opt_noquota, Opt_barrier, Opt_nobarrier, Opt_err,
+ Opt_usrquota, Opt_grpquota, Opt_i_version,
+ Opt_stripe, Opt_delalloc, Opt_nodelalloc, Opt_mblk_io_submit,
+ Opt_nomblk_io_submit, Opt_block_validity, Opt_noblock_validity,
+ Opt_inode_readahead_blks, Opt_journal_ioprio,
+ Opt_dioread_nolock, Opt_dioread_lock,
+ Opt_discard, Opt_nodiscard, Opt_init_itable, Opt_noinit_itable,
+};
+
+/*
+ * Mount option patterns, consumed by match_token() in parse_options() and
+ * searched by token2str() when options are displayed.
+ *
+ * Several tokens appear more than once: as aliases (grpid/bsdgroups,
+ * norecovery/noload), or once with and once without an argument (barrier,
+ * auto_da_alloc, init_itable).  token2str() returns the first entry for a
+ * token whose pattern contains no '=', so the relative order of such
+ * entries determines the name that is shown.
+ * NOTE(review): the bare "usrjquota="/"grpjquota=" entries are listed
+ * before their "%s" forms, presumably so that an empty value is matched as
+ * the "off" token first -- confirm against match_token()'s matching rules.
+ */
+static const match_table_t tokens = {
+ {Opt_bsd_df, "bsddf"},
+ {Opt_minix_df, "minixdf"},
+ {Opt_grpid, "grpid"},
+ {Opt_grpid, "bsdgroups"},
+ {Opt_nogrpid, "nogrpid"},
+ {Opt_nogrpid, "sysvgroups"},
+ {Opt_resgid, "resgid=%u"},
+ {Opt_resuid, "resuid=%u"},
+ {Opt_sb, "sb=%u"},
+ {Opt_err_cont, "errors=continue"},
+ {Opt_err_panic, "errors=panic"},
+ {Opt_err_ro, "errors=remount-ro"},
+ {Opt_nouid32, "nouid32"},
+ {Opt_debug, "debug"},
+ {Opt_removed, "oldalloc"},
+ {Opt_removed, "orlov"},
+ {Opt_user_xattr, "user_xattr"},
+ {Opt_nouser_xattr, "nouser_xattr"},
+ {Opt_acl, "acl"},
+ {Opt_noacl, "noacl"},
+ {Opt_noload, "norecovery"},
+ {Opt_noload, "noload"},
+ {Opt_removed, "nobh"},
+ {Opt_removed, "bh"},
+ {Opt_commit, "commit=%u"},
+ {Opt_min_batch_time, "min_batch_time=%u"},
+ {Opt_max_batch_time, "max_batch_time=%u"},
+ {Opt_journal_dev, "journal_dev=%u"},
+ {Opt_journal_checksum, "journal_checksum"},
+ {Opt_journal_async_commit, "journal_async_commit"},
+ {Opt_abort, "abort"},
+ {Opt_data_journal, "data=journal"},
+ {Opt_data_ordered, "data=ordered"},
+ {Opt_data_writeback, "data=writeback"},
+ {Opt_data_err_abort, "data_err=abort"},
+ {Opt_data_err_ignore, "data_err=ignore"},
+ {Opt_offusrjquota, "usrjquota="},
+ {Opt_usrjquota, "usrjquota=%s"},
+ {Opt_offgrpjquota, "grpjquota="},
+ {Opt_grpjquota, "grpjquota=%s"},
+ {Opt_jqfmt_vfsold, "jqfmt=vfsold"},
+ {Opt_jqfmt_vfsv0, "jqfmt=vfsv0"},
+ {Opt_jqfmt_vfsv1, "jqfmt=vfsv1"},
+ {Opt_grpquota, "grpquota"},
+ {Opt_noquota, "noquota"},
+ {Opt_quota, "quota"},
+ {Opt_usrquota, "usrquota"},
+ {Opt_barrier, "barrier=%u"},
+ {Opt_barrier, "barrier"},
+ {Opt_nobarrier, "nobarrier"},
+ {Opt_i_version, "i_version"},
+ {Opt_stripe, "stripe=%u"},
+ {Opt_delalloc, "delalloc"},
+ {Opt_nodelalloc, "nodelalloc"},
+ {Opt_mblk_io_submit, "mblk_io_submit"},
+ {Opt_nomblk_io_submit, "nomblk_io_submit"},
+ {Opt_block_validity, "block_validity"},
+ {Opt_noblock_validity, "noblock_validity"},
+ {Opt_inode_readahead_blks, "inode_readahead_blks=%u"},
+ {Opt_journal_ioprio, "journal_ioprio=%u"},
+ {Opt_auto_da_alloc, "auto_da_alloc=%u"},
+ {Opt_auto_da_alloc, "auto_da_alloc"},
+ {Opt_noauto_da_alloc, "noauto_da_alloc"},
+ {Opt_dioread_nolock, "dioread_nolock"},
+ {Opt_dioread_lock, "dioread_lock"},
+ {Opt_discard, "discard"},
+ {Opt_nodiscard, "nodiscard"},
+ {Opt_init_itable, "init_itable=%u"},
+ {Opt_init_itable, "init_itable"},
+ {Opt_noinit_itable, "noinit_itable"},
+ {Opt_removed, "check=none"}, /* mount option from ext2/3 */
+ {Opt_removed, "nocheck"}, /* mount option from ext2/3 */
+ {Opt_removed, "reservation"}, /* mount option from ext2/3 */
+ {Opt_removed, "noreservation"}, /* mount option from ext2/3 */
+ {Opt_removed, "journal=%u"}, /* mount option from ext2/3 */
+ {Opt_err, NULL},
+};
+
+/*
+ * Extract a leading "sb=<block>" from the mount option string.
+ *
+ * @data: in/out pointer to the option string.  When a valid "sb=" prefix is
+ *        consumed, *data is advanced past it (and past one following ',').
+ *
+ * Returns the requested superblock location, or 1 (the default location)
+ * when there is no option string, no leading "sb=", or the number is
+ * followed by anything other than ',' or end of string.  In those cases
+ * *data is left untouched.
+ */
+static ext4_fsblk_t get_sb_block(void **data)
+{
+	char *opts = (char *) *data;
+	char *rest;
+	ext4_fsblk_t blk;
+
+	if (!opts)
+		return 1;	/* Default location */
+	if (strncmp(opts, "sb=", 3) != 0)
+		return 1;	/* Default location */
+
+	/* TODO: use simple_strtoll with >32bit ext4 */
+	blk = simple_strtoul(opts + 3, &rest, 0);
+
+	switch (*rest) {
+	case '\0':
+		break;
+	case ',':
+		rest++;
+		break;
+	default:
+		printk(KERN_ERR "EXT4-fs: Invalid sb specification: %s\n",
+		       (char *) *data);
+		return 1;
+	}
+
+	*data = (void *) rest;
+	return blk;
+}
+
+/* Journal I/O priority value: best-effort class, level 3. */
+#define DEFAULT_JOURNAL_IOPRIO (IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, 3))
+/*
+ * Format string for deprecation warnings; takes the option text and the
+ * release in which the option is scheduled to go away (both "%s").
+ */
+static char deprecated_msg[] = "Mount option \"%s\" will be removed by %s\n"
+ "Contact linux-ext4@vger.kernel.org if you think we should keep it.\n";
+
+#ifdef CONFIG_QUOTA
+/*
+ * Record the journaled quota file name given with usrjquota=/grpjquota=.
+ *
+ * @sb:    superblock being configured
+ * @qtype: quota type (USRQUOTA or GRPQUOTA), index into s_qf_names[]
+ * @args:  matched substring holding the file name
+ *
+ * A name may not be introduced while quota is loaded, may not differ from
+ * one that is already recorded, and may not contain '/'.  Repeating the
+ * name that is already recorded is accepted; in that case the stored copy
+ * is kept and the freshly duplicated string is freed, so the earlier
+ * allocation is neither leaked nor replaced.
+ *
+ * Returns 1 on success and -1 on failure (after logging the reason).
+ */
+static int set_qf_name(struct super_block *sb, int qtype, substring_t *args)
+{
+	struct ext4_sb_info *sbi = EXT4_SB(sb);
+	char *qname;
+
+	if (sb_any_quota_loaded(sb) &&
+	    !sbi->s_qf_names[qtype]) {
+		ext4_msg(sb, KERN_ERR,
+			 "Cannot change journaled "
+			 "quota options when quota turned on");
+		return -1;
+	}
+	qname = match_strdup(args);
+	if (!qname) {
+		ext4_msg(sb, KERN_ERR,
+			 "Not enough memory for storing quotafile name");
+		return -1;
+	}
+	if (sbi->s_qf_names[qtype]) {
+		if (strcmp(sbi->s_qf_names[qtype], qname)) {
+			ext4_msg(sb, KERN_ERR,
+				 "%s quota file already specified",
+				 QTYPE2NAME(qtype));
+			goto err_free;
+		}
+		/* Same name again: keep the stored copy, drop the new one. */
+		kfree(qname);
+		set_opt(sb, QUOTA);
+		return 1;
+	}
+	/* Validate before storing so a bad name is never published. */
+	if (strchr(qname, '/')) {
+		ext4_msg(sb, KERN_ERR,
+			 "quotafile must be on filesystem root");
+		goto err_free;
+	}
+	sbi->s_qf_names[qtype] = qname;
+	set_opt(sb, QUOTA);
+	return 1;
+
+err_free:
+	kfree(qname);
+	return -1;
+}
+
+/*
+ * Forget the journaled quota file name for @qtype (the "usrjquota=" /
+ * "grpjquota=" forms with an empty value).
+ *
+ * Refused with -1 when quota is loaded and a name is currently recorded;
+ * otherwise the slot is set to NULL and 1 is returned.  The string itself
+ * is not freed here.
+ */
+static int clear_qf_name(struct super_block *sb, int qtype)
+{
+	char **slot = &EXT4_SB(sb)->s_qf_names[qtype];
+
+	if (sb_any_quota_loaded(sb) && *slot) {
+		ext4_msg(sb, KERN_ERR, "Cannot change journaled quota options"
+			 " when quota turned on");
+		return -1;
+	}
+
+	/*
+	 * The space will be released later when all options are confirmed
+	 * to be correct
+	 */
+	*slot = NULL;
+	return 1;
+}
+#endif
+
+/*
+ * Flags for the "flags" member of ext4_mount_opts[] entries, as interpreted
+ * by handle_mount_opt() and _ext4_show_options().
+ */
+/* option turns the mount_opt bits on */
+#define MOPT_SET 0x0001
+/* option turns the mount_opt bits off */
+#define MOPT_CLEAR 0x0002
+/* option is recognised but only produces a "not supported" message */
+#define MOPT_NOSUPPORT 0x0004
+/* giving the option also sets EXPLICIT_DELALLOC in the second option word */
+#define MOPT_EXPLICIT 0x0008
+/* ERRORS_MASK is cleared before the option is applied */
+#define MOPT_CLEAR_ERR 0x0010
+/* a supplied numeric argument must not be negative */
+#define MOPT_GTE0 0x0020
+#ifdef CONFIG_QUOTA
+/* quota options carry no extra flag when quota support is built in */
+#define MOPT_Q 0
+/* mount_opt holds a quota format id that is stored in s_jquota_fmt */
+#define MOPT_QFMT 0x0040
+#else
+/* without CONFIG_QUOTA all quota options degrade to "not supported" */
+#define MOPT_Q MOPT_NOSUPPORT
+#define MOPT_QFMT MOPT_NOSUPPORT
+#endif
+/* option selects one of the data= journalling modes */
+#define MOPT_DATAJ 0x0080
+
+/*
+ * Table-driven description of mount options.
+ *
+ * token:     Opt_* identifier the entry applies to
+ * mount_opt: EXT4_MOUNT_* bits to set/clear, or (for MOPT_QFMT entries) a
+ *            quota format id; 0 for options that only carry a number
+ * flags:     MOPT_* flags telling handle_mount_opt() how to apply it
+ *
+ * handle_mount_opt() scans the table linearly and uses the first entry
+ * whose token matches; _ext4_show_options() walks it to print the flag
+ * style options.  The list ends with an Opt_err entry.
+ */
+static const struct mount_opts {
+ int token;
+ int mount_opt;
+ int flags;
+} ext4_mount_opts[] = {
+ {Opt_minix_df, EXT4_MOUNT_MINIX_DF, MOPT_SET},
+ {Opt_bsd_df, EXT4_MOUNT_MINIX_DF, MOPT_CLEAR},
+ {Opt_grpid, EXT4_MOUNT_GRPID, MOPT_SET},
+ {Opt_nogrpid, EXT4_MOUNT_GRPID, MOPT_CLEAR},
+ {Opt_mblk_io_submit, EXT4_MOUNT_MBLK_IO_SUBMIT, MOPT_SET},
+ {Opt_nomblk_io_submit, EXT4_MOUNT_MBLK_IO_SUBMIT, MOPT_CLEAR},
+ {Opt_block_validity, EXT4_MOUNT_BLOCK_VALIDITY, MOPT_SET},
+ {Opt_noblock_validity, EXT4_MOUNT_BLOCK_VALIDITY, MOPT_CLEAR},
+ {Opt_dioread_nolock, EXT4_MOUNT_DIOREAD_NOLOCK, MOPT_SET},
+ {Opt_dioread_lock, EXT4_MOUNT_DIOREAD_NOLOCK, MOPT_CLEAR},
+ {Opt_discard, EXT4_MOUNT_DISCARD, MOPT_SET},
+ {Opt_nodiscard, EXT4_MOUNT_DISCARD, MOPT_CLEAR},
+ {Opt_delalloc, EXT4_MOUNT_DELALLOC, MOPT_SET | MOPT_EXPLICIT},
+ {Opt_nodelalloc, EXT4_MOUNT_DELALLOC, MOPT_CLEAR | MOPT_EXPLICIT},
+ {Opt_journal_checksum, EXT4_MOUNT_JOURNAL_CHECKSUM, MOPT_SET},
+ /* async commit also turns journal checksumming on */
+ {Opt_journal_async_commit, (EXT4_MOUNT_JOURNAL_ASYNC_COMMIT |
+ EXT4_MOUNT_JOURNAL_CHECKSUM), MOPT_SET},
+ {Opt_noload, EXT4_MOUNT_NOLOAD, MOPT_SET},
+ {Opt_err_panic, EXT4_MOUNT_ERRORS_PANIC, MOPT_SET | MOPT_CLEAR_ERR},
+ {Opt_err_ro, EXT4_MOUNT_ERRORS_RO, MOPT_SET | MOPT_CLEAR_ERR},
+ {Opt_err_cont, EXT4_MOUNT_ERRORS_CONT, MOPT_SET | MOPT_CLEAR_ERR},
+ {Opt_data_err_abort, EXT4_MOUNT_DATA_ERR_ABORT, MOPT_SET},
+ {Opt_data_err_ignore, EXT4_MOUNT_DATA_ERR_ABORT, MOPT_CLEAR},
+ {Opt_barrier, EXT4_MOUNT_BARRIER, MOPT_SET},
+ {Opt_nobarrier, EXT4_MOUNT_BARRIER, MOPT_CLEAR},
+ /* the stored bit is the negative form, hence the swapped SET/CLEAR */
+ {Opt_noauto_da_alloc, EXT4_MOUNT_NO_AUTO_DA_ALLOC, MOPT_SET},
+ {Opt_auto_da_alloc, EXT4_MOUNT_NO_AUTO_DA_ALLOC, MOPT_CLEAR},
+ {Opt_noinit_itable, EXT4_MOUNT_INIT_INODE_TABLE, MOPT_CLEAR},
+ /* numeric options: value is stored by dedicated code, not as a bit */
+ {Opt_commit, 0, MOPT_GTE0},
+ {Opt_max_batch_time, 0, MOPT_GTE0},
+ {Opt_min_batch_time, 0, MOPT_GTE0},
+ {Opt_inode_readahead_blks, 0, MOPT_GTE0},
+ {Opt_init_itable, 0, MOPT_GTE0},
+ {Opt_stripe, 0, MOPT_GTE0},
+ {Opt_data_journal, EXT4_MOUNT_JOURNAL_DATA, MOPT_DATAJ},
+ {Opt_data_ordered, EXT4_MOUNT_ORDERED_DATA, MOPT_DATAJ},
+ {Opt_data_writeback, EXT4_MOUNT_WRITEBACK_DATA, MOPT_DATAJ},
+#ifdef CONFIG_EXT4_FS_XATTR
+ {Opt_user_xattr, EXT4_MOUNT_XATTR_USER, MOPT_SET},
+ {Opt_nouser_xattr, EXT4_MOUNT_XATTR_USER, MOPT_CLEAR},
+#else
+ {Opt_user_xattr, 0, MOPT_NOSUPPORT},
+ {Opt_nouser_xattr, 0, MOPT_NOSUPPORT},
+#endif
+#ifdef CONFIG_EXT4_FS_POSIX_ACL
+ {Opt_acl, EXT4_MOUNT_POSIX_ACL, MOPT_SET},
+ {Opt_noacl, EXT4_MOUNT_POSIX_ACL, MOPT_CLEAR},
+#else
+ {Opt_acl, 0, MOPT_NOSUPPORT},
+ {Opt_noacl, 0, MOPT_NOSUPPORT},
+#endif
+ {Opt_nouid32, EXT4_MOUNT_NO_UID32, MOPT_SET},
+ {Opt_debug, EXT4_MOUNT_DEBUG, MOPT_SET},
+ {Opt_quota, EXT4_MOUNT_QUOTA | EXT4_MOUNT_USRQUOTA, MOPT_SET | MOPT_Q},
+ {Opt_usrquota, EXT4_MOUNT_QUOTA | EXT4_MOUNT_USRQUOTA,
+ MOPT_SET | MOPT_Q},
+ {Opt_grpquota, EXT4_MOUNT_QUOTA | EXT4_MOUNT_GRPQUOTA,
+ MOPT_SET | MOPT_Q},
+ {Opt_noquota, (EXT4_MOUNT_QUOTA | EXT4_MOUNT_USRQUOTA |
+ EXT4_MOUNT_GRPQUOTA), MOPT_CLEAR | MOPT_Q},
+ {Opt_usrjquota, 0, MOPT_Q},
+ {Opt_grpjquota, 0, MOPT_Q},
+ {Opt_offusrjquota, 0, MOPT_Q},
+ {Opt_offgrpjquota, 0, MOPT_Q},
+ {Opt_jqfmt_vfsold, QFMT_VFS_OLD, MOPT_QFMT},
+ {Opt_jqfmt_vfsv0, QFMT_VFS_V0, MOPT_QFMT},
+ {Opt_jqfmt_vfsv1, QFMT_VFS_V1, MOPT_QFMT},
+ {Opt_err, 0, 0}
+};
+
+/*
+ * Apply one already tokenised mount option to @sb.
+ *
+ * @sb:             superblock being mounted or remounted
+ * @opt:            raw option text, used only in messages
+ * @token:          Opt_* value produced by match_token()
+ * @args:           substrings captured by match_token(); args->from is
+ *                  non-NULL only if the option carried an argument
+ * @journal_devnum: out, written by journal_dev=
+ * @journal_ioprio: out, written by journal_ioprio=
+ * @is_remount:     non-zero when processing a remount
+ *
+ * Options with special handling are dealt with directly; everything else is
+ * looked up in ext4_mount_opts[] and applied according to its MOPT_* flags.
+ *
+ * Returns 1 if the option was accepted (or deliberately ignored) and -1 if
+ * it was rejected.
+ */
+static int handle_mount_opt(struct super_block *sb, char *opt, int token,
+			    substring_t *args, unsigned long *journal_devnum,
+			    unsigned int *journal_ioprio, int is_remount)
+{
+	struct ext4_sb_info *sbi = EXT4_SB(sb);
+	const struct mount_opts *m;
+	int arg = 0;
+
+#ifdef CONFIG_QUOTA
+	/* Quota file names are strings, so handle them before match_int(). */
+	if (token == Opt_usrjquota)
+		return set_qf_name(sb, USRQUOTA, &args[0]);
+	else if (token == Opt_grpjquota)
+		return set_qf_name(sb, GRPQUOTA, &args[0]);
+	else if (token == Opt_offusrjquota)
+		return clear_qf_name(sb, USRQUOTA);
+	else if (token == Opt_offgrpjquota)
+		return clear_qf_name(sb, GRPQUOTA);
+#endif
+	/* Any argument that reaches this point must parse as an integer. */
+	if (args->from && match_int(args, &arg))
+		return -1;
+	switch (token) {
+	case Opt_noacl:
+	case Opt_nouser_xattr:
+		/* warn, then fall out of the switch to the table lookup */
+		ext4_msg(sb, KERN_WARNING, deprecated_msg, opt, "3.5");
+		break;
+	case Opt_sb:
+		return 1;	/* handled by get_sb_block() */
+	case Opt_removed:
+		ext4_msg(sb, KERN_WARNING,
+			 "Ignoring removed %s option", opt);
+		return 1;
+	case Opt_resuid:
+		sbi->s_resuid = arg;
+		return 1;
+	case Opt_resgid:
+		sbi->s_resgid = arg;
+		return 1;
+	case Opt_abort:
+		sbi->s_mount_flags |= EXT4_MF_FS_ABORTED;
+		return 1;
+	case Opt_i_version:
+		sb->s_flags |= MS_I_VERSION;
+		return 1;
+	case Opt_journal_dev:
+		if (is_remount) {
+			ext4_msg(sb, KERN_ERR,
+				 "Cannot specify journal on remount");
+			return -1;
+		}
+		*journal_devnum = arg;
+		return 1;
+	case Opt_journal_ioprio:
+		if (arg < 0 || arg > 7)
+			return -1;
+		*journal_ioprio = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, arg);
+		return 1;
+	}
+
+	for (m = ext4_mount_opts; m->token != Opt_err; m++) {
+		if (token != m->token)
+			continue;
+		if (args->from && (m->flags & MOPT_GTE0) && (arg < 0))
+			return -1;
+		if (m->flags & MOPT_EXPLICIT)
+			set_opt2(sb, EXPLICIT_DELALLOC);
+		if (m->flags & MOPT_CLEAR_ERR)
+			clear_opt(sb, ERRORS_MASK);
+		if (token == Opt_noquota && sb_any_quota_loaded(sb)) {
+			ext4_msg(sb, KERN_ERR, "Cannot change quota "
+				 "options when quota turned on");
+			return -1;
+		}
+
+		if (m->flags & MOPT_NOSUPPORT) {
+			/* logged, but the mount is still allowed to proceed */
+			ext4_msg(sb, KERN_ERR, "%s option not supported", opt);
+		} else if (token == Opt_commit) {
+			if (arg == 0) {
+				arg = JBD2_DEFAULT_MAX_COMMIT_AGE;
+			} else if (arg > INT_MAX / HZ) {
+				/* HZ * arg below would overflow an int */
+				ext4_msg(sb, KERN_ERR,
+					 "Invalid commit interval %d, "
+					 "must be smaller than %d",
+					 arg, INT_MAX / HZ);
+				return -1;
+			}
+			sbi->s_commit_interval = HZ * arg;
+		} else if (token == Opt_max_batch_time) {
+			if (arg == 0)
+				arg = EXT4_DEF_MAX_BATCH_TIME;
+			sbi->s_max_batch_time = arg;
+		} else if (token == Opt_min_batch_time) {
+			sbi->s_min_batch_time = arg;
+		} else if (token == Opt_inode_readahead_blks) {
+			if (arg > (1 << 30))
+				return -1;
+			if (arg && !is_power_of_2(arg)) {
+				ext4_msg(sb, KERN_ERR,
+					 "EXT4-fs: inode_readahead_blks"
+					 " must be a power of 2");
+				return -1;
+			}
+			sbi->s_inode_readahead_blks = arg;
+		} else if (token == Opt_init_itable) {
+			set_opt(sb, INIT_INODE_TABLE);
+			/* bare "init_itable" selects the default multiplier */
+			if (!args->from)
+				arg = EXT4_DEF_LI_WAIT_MULT;
+			sbi->s_li_wait_mult = arg;
+		} else if (token == Opt_stripe) {
+			sbi->s_stripe = arg;
+		} else if (m->flags & MOPT_DATAJ) {
+			if (is_remount) {
+				/* the data mode cannot be switched on remount */
+				if (!sbi->s_journal)
+					ext4_msg(sb, KERN_WARNING, "Remounting file system with no journal so ignoring journalled data option");
+				else if (test_opt(sb, DATA_FLAGS) !=
+					 m->mount_opt) {
+					ext4_msg(sb, KERN_ERR,
+						 "Cannot change data mode on remount");
+					return -1;
+				}
+			} else {
+				clear_opt(sb, DATA_FLAGS);
+				sbi->s_mount_opt |= m->mount_opt;
+			}
+#ifdef CONFIG_QUOTA
+		} else if (m->flags & MOPT_QFMT) {
+			if (sb_any_quota_loaded(sb) &&
+			    sbi->s_jquota_fmt != m->mount_opt) {
+				ext4_msg(sb, KERN_ERR, "Cannot "
+					 "change journaled quota options "
+					 "when quota turned on");
+				return -1;
+			}
+			sbi->s_jquota_fmt = m->mount_opt;
+#endif
+		} else {
+			/*
+			 * Plain flag option.  With no argument it means "on";
+			 * a MOPT_CLEAR entry inverts the sense.
+			 */
+			if (!args->from)
+				arg = 1;
+			if (m->flags & MOPT_CLEAR)
+				arg = !arg;
+			else if (unlikely(!(m->flags & MOPT_SET))) {
+				ext4_msg(sb, KERN_WARNING,
+					 "buggy handling of option %s", opt);
+				WARN_ON(1);
+				return -1;
+			}
+			if (arg != 0)
+				sbi->s_mount_opt |= m->mount_opt;
+			else
+				sbi->s_mount_opt &= ~m->mount_opt;
+		}
+		return 1;
+	}
+	ext4_msg(sb, KERN_ERR, "Unrecognized mount option \"%s\" "
+		 "or missing value", opt);
+	return -1;
+}
+
+/*
+ * Split a comma separated mount option string and apply every option via
+ * handle_mount_opt(), then (with CONFIG_QUOTA) check that the resulting
+ * quota settings are consistent.
+ *
+ * Returns 1 on success (including a NULL @options) and 0 on any failure.
+ * Note that @options is modified in place by strsep().
+ */
+static int parse_options(char *options, struct super_block *sb,
+			 unsigned long *journal_devnum,
+			 unsigned int *journal_ioprio,
+			 int is_remount)
+{
+#ifdef CONFIG_QUOTA
+	struct ext4_sb_info *sbi = EXT4_SB(sb);
+#endif
+	substring_t args[MAX_OPT_ARGS];
+	char *opt;
+
+	if (!options)
+		return 1;
+
+	for (;;) {
+		int token;
+		int rc;
+
+		opt = strsep(&options, ",");
+		if (opt == NULL)
+			break;
+		if (*opt == '\0')
+			continue;
+		/*
+		 * Initialize args struct so we know whether arg was
+		 * found; some options take optional arguments.
+		 */
+		args[0].from = NULL;
+		args[0].to = NULL;
+		token = match_token(opt, tokens, args);
+		rc = handle_mount_opt(sb, opt, token, args, journal_devnum,
+				      journal_ioprio, is_remount);
+		if (rc < 0)
+			return 0;
+	}
+#ifdef CONFIG_QUOTA
+	if (!sbi->s_qf_names[USRQUOTA] && !sbi->s_qf_names[GRPQUOTA]) {
+		/* No journaled quota files: a jqfmt= on its own is an error */
+		if (sbi->s_jquota_fmt) {
+			ext4_msg(sb, KERN_ERR, "journaled quota format "
+				 "specified with no journaling "
+				 "enabled");
+			return 0;
+		}
+		return 1;
+	}
+
+	/* A journaled quota file overrides the plain flag of the same type */
+	if (test_opt(sb, USRQUOTA) && sbi->s_qf_names[USRQUOTA])
+		clear_opt(sb, USRQUOTA);
+
+	if (test_opt(sb, GRPQUOTA) && sbi->s_qf_names[GRPQUOTA])
+		clear_opt(sb, GRPQUOTA);
+
+	/* Whatever flag is still set belongs to the other, non-journaled type */
+	if (test_opt(sb, GRPQUOTA) || test_opt(sb, USRQUOTA)) {
+		ext4_msg(sb, KERN_ERR, "old and new quota "
+			 "format mixing");
+		return 0;
+	}
+
+	if (!sbi->s_jquota_fmt) {
+		ext4_msg(sb, KERN_ERR, "journaled quota format "
+			 "not specified");
+		return 0;
+	}
+#endif
+	return 1;
+}
+
+/*
+ * Append the quota related mount options to @seq: the journaled quota
+ * format, the journaled quota file names, and the usrquota/grpquota flags.
+ * Compiles to an empty function when CONFIG_QUOTA is not defined.
+ */
+static inline void ext4_show_quota_options(struct seq_file *seq,
+					   struct super_block *sb)
+{
+#if defined(CONFIG_QUOTA)
+	struct ext4_sb_info *sbi = EXT4_SB(sb);
+
+	if (sbi->s_jquota_fmt) {
+		char *name;
+
+		if (sbi->s_jquota_fmt == QFMT_VFS_OLD)
+			name = "vfsold";
+		else if (sbi->s_jquota_fmt == QFMT_VFS_V0)
+			name = "vfsv0";
+		else if (sbi->s_jquota_fmt == QFMT_VFS_V1)
+			name = "vfsv1";
+		else
+			name = "";	/* unknown format id: print no name */
+		seq_printf(seq, ",jqfmt=%s", name);
+	}
+
+	if (sbi->s_qf_names[USRQUOTA])
+		seq_printf(seq, ",usrjquota=%s", sbi->s_qf_names[USRQUOTA]);
+	if (sbi->s_qf_names[GRPQUOTA])
+		seq_printf(seq, ",grpjquota=%s", sbi->s_qf_names[GRPQUOTA]);
+
+	if (test_opt(sb, USRQUOTA))
+		seq_puts(seq, ",usrquota");
+	if (test_opt(sb, GRPQUOTA))
+		seq_puts(seq, ",grpquota");
+#endif
+}
+
+/*
+ * Map a mount option token back to its option name.
+ *
+ * Returns the pattern of the first tokens[] entry that has the given token
+ * and contains no '=' (i.e. the flag form of the option).  If there is no
+ * such entry the pattern of the terminating Opt_err entry, which is NULL,
+ * is returned.
+ *
+ * The cursor is an ordinary automatic variable: keeping it in static
+ * storage would make concurrent callers share, and race on, one pointer.
+ */
+static const char *token2str(int token)
+{
+	const struct match_token *t;
+
+	for (t = tokens; t->token != Opt_err; t++)
+		if (t->token == token && !strchr(t->pattern, '='))
+			break;
+	return t->pattern;
+}
+
+/*
+ * Show an option if
+ * - it's set to a non-default value OR
+ * - if the per-sb default is different from the global default
+ *
+ * @nodefs: when non-zero the per-sb defaults are ignored (they are treated
+ * as 0 / -1), every numeric tunable is printed unconditionally, and options
+ * are separated by '\n' instead of ','.
+ *
+ * Always returns 0.
+ */
+static int _ext4_show_options(struct seq_file *seq, struct super_block *sb,
+ int nodefs)
+{
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
+ struct ext4_super_block *es = sbi->s_es;
+ int def_errors, def_mount_opt = nodefs ? 0 : sbi->s_def_mount_opt;
+ const struct mount_opts *m;
+ char sep = nodefs ? '\n' : ',';
+
+/* Emit the separator followed by a fixed string / a formatted value. */
+#define SEQ_OPTS_PUTS(str) seq_printf(seq, "%c" str, sep)
+#define SEQ_OPTS_PRINT(str, arg) seq_printf(seq, "%c" str, sep, arg)
+
+ if (sbi->s_sb_block != 1)
+ SEQ_OPTS_PRINT("sb=%llu", sbi->s_sb_block);
+
+ /*
+ * Flag-style options.  Entries that neither set nor clear bits, and the
+ * errors= entries (MOPT_CLEAR_ERR), are skipped here; errors= is printed
+ * separately below.
+ */
+ for (m = ext4_mount_opts; m->token != Opt_err; m++) {
+ int want_set = m->flags & MOPT_SET;
+ if (((m->flags & (MOPT_SET|MOPT_CLEAR)) == 0) ||
+ (m->flags & MOPT_CLEAR_ERR))
+ continue;
+ if (!(m->mount_opt & (sbi->s_mount_opt ^ def_mount_opt)))
+ continue; /* skip if same as the default */
+ if ((want_set &&
+ (sbi->s_mount_opt & m->mount_opt) != m->mount_opt) ||
+ (!want_set && (sbi->s_mount_opt & m->mount_opt)))
+ continue; /* select Opt_noFoo vs Opt_Foo */
+ SEQ_OPTS_PRINT("%s", token2str(m->token));
+ }
+
+ if (nodefs || sbi->s_resuid != EXT4_DEF_RESUID ||
+ le16_to_cpu(es->s_def_resuid) != EXT4_DEF_RESUID)
+ SEQ_OPTS_PRINT("resuid=%u", sbi->s_resuid);
+ if (nodefs || sbi->s_resgid != EXT4_DEF_RESGID ||
+ le16_to_cpu(es->s_def_resgid) != EXT4_DEF_RESGID)
+ SEQ_OPTS_PRINT("resgid=%u", sbi->s_resgid);
+ /* with nodefs, -1 matches no EXT4_ERRORS_* value, so errors= always shows */
+ def_errors = nodefs ? -1 : le16_to_cpu(es->s_errors);
+ if (test_opt(sb, ERRORS_RO) && def_errors != EXT4_ERRORS_RO)
+ SEQ_OPTS_PUTS("errors=remount-ro");
+ if (test_opt(sb, ERRORS_CONT) && def_errors != EXT4_ERRORS_CONTINUE)
+ SEQ_OPTS_PUTS("errors=continue");
+ if (test_opt(sb, ERRORS_PANIC) && def_errors != EXT4_ERRORS_PANIC)
+ SEQ_OPTS_PUTS("errors=panic");
+ /* s_commit_interval is kept in jiffies; show it in seconds */
+ if (nodefs || sbi->s_commit_interval != JBD2_DEFAULT_MAX_COMMIT_AGE*HZ)
+ SEQ_OPTS_PRINT("commit=%lu", sbi->s_commit_interval / HZ);
+ if (nodefs || sbi->s_min_batch_time != EXT4_DEF_MIN_BATCH_TIME)
+ SEQ_OPTS_PRINT("min_batch_time=%u", sbi->s_min_batch_time);
+ if (nodefs || sbi->s_max_batch_time != EXT4_DEF_MAX_BATCH_TIME)
+ SEQ_OPTS_PRINT("max_batch_time=%u", sbi->s_max_batch_time);
+ if (sb->s_flags & MS_I_VERSION)
+ SEQ_OPTS_PUTS("i_version");
+ if (nodefs || sbi->s_stripe)
+ SEQ_OPTS_PRINT("stripe=%lu", sbi->s_stripe);
+ /* data= mode is shown only if it differs from the default mode */
+ if (EXT4_MOUNT_DATA_FLAGS & (sbi->s_mount_opt ^ def_mount_opt)) {
+ if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA)
+ SEQ_OPTS_PUTS("data=journal");
+ else if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_ORDERED_DATA)
+ SEQ_OPTS_PUTS("data=ordered");
+ else if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_WRITEBACK_DATA)
+ SEQ_OPTS_PUTS("data=writeback");
+ }
+ if (nodefs ||
+ sbi->s_inode_readahead_blks != EXT4_DEF_INODE_READAHEAD_BLKS)
+ SEQ_OPTS_PRINT("inode_readahead_blks=%u",
+ sbi->s_inode_readahead_blks);
+
+ if (nodefs || (test_opt(sb, INIT_INODE_TABLE) &&
+ (sbi->s_li_wait_mult != EXT4_DEF_LI_WAIT_MULT)))
+ SEQ_OPTS_PRINT("init_itable=%u", sbi->s_li_wait_mult);
+
+ ext4_show_quota_options(seq, sb);
+ return 0;
+}
+
+/*
+ * super_operations ->show_options hook: print the options of the
+ * filesystem that @root belongs to, omitting those that match the defaults.
+ */
+static int ext4_show_options(struct seq_file *seq, struct dentry *root)
+{
+	struct super_block *sb = root->d_sb;
+
+	return _ext4_show_options(seq, sb, 0);
+}
+
+/*
+ * seq_file show callback for the options file: prints "ro" or "rw", then
+ * every option one per line (nodefs mode), then a closing newline.
+ * Returns the result of _ext4_show_options().
+ */
+static int options_seq_show(struct seq_file *seq, void *offset)
+{
+	struct super_block *sb = seq->private;
+	int ret;
+
+	if (sb->s_flags & MS_RDONLY)
+		seq_puts(seq, "ro");
+	else
+		seq_puts(seq, "rw");
+
+	ret = _ext4_show_options(seq, sb, 1);
+	seq_puts(seq, "\n");
+
+	return ret;
+}
+
+/*
+ * ->open for the options file: set up a single-record seq_file whose
+ * private data is the pointer stored in the proc entry of @inode.
+ */
+static int options_open_fs(struct inode *inode, struct file *file)
+{
+	void *priv = PDE(inode)->data;
+
+	return single_open(file, options_seq_show, priv);
+}
+
+/*
+ * File operations for the options file: a read-only single_open() style
+ * seq_file backed by options_seq_show().
+ */
+static const struct file_operations ext4_seq_options_fops = {
+ .owner = THIS_MODULE,
+ .open = options_open_fs,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = single_release,
+};
+
+/*
+ * Per-mount superblock bookkeeping.
+ *
+ * For a writable mount this warns about conditions for which e2fsck is
+ * recommended, bumps the mount count and mount time, flags the filesystem
+ * as needing recovery when a journal is in use, and commits the superblock.
+ * When @read_only is set all of that is skipped.
+ *
+ * Returns MS_RDONLY if the on-disk revision level is newer than supported,
+ * otherwise 0.
+ */
+static int ext4_setup_super(struct super_block *sb, struct ext4_super_block *es,
+ int read_only)
+{
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
+ int res = 0;
+
+ if (le32_to_cpu(es->s_rev_level) > EXT4_MAX_SUPP_REV) {
+ ext4_msg(sb, KERN_ERR, "revision level too high, "
+ "forcing read-only mode");
+ res = MS_RDONLY;
+ }
+ if (read_only)
+ goto done;
+ /* At most one of the following warnings is printed. */
+ if (!(sbi->s_mount_state & EXT4_VALID_FS))
+ ext4_msg(sb, KERN_WARNING, "warning: mounting unchecked fs, "
+ "running e2fsck is recommended");
+ else if ((sbi->s_mount_state & EXT4_ERROR_FS))
+ ext4_msg(sb, KERN_WARNING,
+ "warning: mounting fs with errors, "
+ "running e2fsck is recommended");
+ else if ((__s16) le16_to_cpu(es->s_max_mnt_count) > 0 &&
+ le16_to_cpu(es->s_mnt_count) >=
+ (unsigned short) (__s16) le16_to_cpu(es->s_max_mnt_count))
+ ext4_msg(sb, KERN_WARNING,
+ "warning: maximal mount count reached, "
+ "running e2fsck is recommended");
+ else if (le32_to_cpu(es->s_checkinterval) &&
+ (le32_to_cpu(es->s_lastcheck) +
+ le32_to_cpu(es->s_checkinterval) <= get_seconds()))
+ ext4_msg(sb, KERN_WARNING,
+ "warning: checktime reached, "
+ "running e2fsck is recommended");
+ /* Without a journal the on-disk state is marked not-valid while mounted. */
+ if (!sbi->s_journal)
+ es->s_state &= cpu_to_le16(~EXT4_VALID_FS);
+ /* A max mount count of 0 is replaced by the default. */
+ if (!(__s16) le16_to_cpu(es->s_max_mnt_count))
+ es->s_max_mnt_count = cpu_to_le16(EXT4_DFL_MAX_MNT_COUNT);
+ le16_add_cpu(&es->s_mnt_count, 1);
+ es->s_mtime = cpu_to_le32(get_seconds());
+ ext4_update_dynamic_rev(sb);
+ if (sbi->s_journal)
+ EXT4_SET_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER);
+
+ ext4_commit_super(sb, 1);
+done:
+ /* Reached for read-only mounts too: debug summary and cleancache init. */
+ if (test_opt(sb, DEBUG))
+ printk(KERN_INFO "[EXT4 FS bs=%lu, gc=%u, "
+ "bpg=%lu, ipg=%lu, mo=%04x, mo2=%04x]\n",
+ sb->s_blocksize,
+ sbi->s_groups_count,
+ EXT4_BLOCKS_PER_GROUP(sb),
+ EXT4_INODES_PER_GROUP(sb),
+ sbi->s_mount_opt, sbi->s_mount_opt2);
+
+ cleancache_init_fs(sb);
+ return res;
+}
+
+/*
+ * Allocate and populate the per-flex-group summary counters.
+ *
+ * s_log_groups_per_flex is taken from the on-disk superblock; values outside
+ * 1..31 disable flex groups (the field is reset to 0) without failing.
+ * Otherwise an array covering both the existing groups and the groups that
+ * could still be added through the reserved GDT blocks is allocated, and
+ * the free inode, free cluster and used directory counts of every existing
+ * group are accumulated into its flex group.
+ *
+ * Returns 1 on success (including the "flex groups disabled" case) and 0 if
+ * the array could not be allocated.
+ */
+static int ext4_fill_flex_info(struct super_block *sb)
+{
+	struct ext4_sb_info *sbi = EXT4_SB(sb);
+	struct ext4_group_desc *gdp = NULL;
+	ext4_group_t flex_group_count;
+	ext4_group_t flex_group;
+	ext4_group_t i;
+	unsigned int groups_per_flex = 0;
+	size_t size;
+
+	sbi->s_log_groups_per_flex = sbi->s_es->s_log_groups_per_flex;
+	if (sbi->s_log_groups_per_flex < 1 || sbi->s_log_groups_per_flex > 31) {
+		sbi->s_log_groups_per_flex = 0;
+		return 1;
+	}
+	/* unsigned shift: 1 << 31 on a signed int would be undefined */
+	groups_per_flex = 1U << sbi->s_log_groups_per_flex;
+
+	/* We allocate both existing and potentially added groups */
+	flex_group_count = ((sbi->s_groups_count + groups_per_flex - 1) +
+			((le16_to_cpu(sbi->s_es->s_reserved_gdt_blocks) + 1) <<
+			      EXT4_DESC_PER_BLOCK_BITS(sb))) / groups_per_flex;
+	size = flex_group_count * sizeof(struct flex_groups);
+	sbi->s_flex_groups = ext4_kvzalloc(size, GFP_KERNEL);
+	if (sbi->s_flex_groups == NULL) {
+		ext4_msg(sb, KERN_ERR, "not enough memory for %u flex groups",
+			 flex_group_count);
+		return 0;
+	}
+
+	for (i = 0; i < sbi->s_groups_count; i++) {
+		gdp = ext4_get_group_desc(sb, i, NULL);
+
+		flex_group = ext4_flex_group(sbi, i);
+		atomic_add(ext4_free_inodes_count(sb, gdp),
+			   &sbi->s_flex_groups[flex_group].free_inodes);
+		atomic_add(ext4_free_group_clusters(sb, gdp),
+			   &sbi->s_flex_groups[flex_group].free_clusters);
+		atomic_add(ext4_used_dirs_count(sb, gdp),
+			   &sbi->s_flex_groups[flex_group].used_dirs);
+	}
+
+	return 1;
+}
+
+/*
+ * Compute the crc16 checksum of a block group descriptor.
+ *
+ * The checksum covers, in order: the filesystem UUID, the little-endian
+ * group number, the descriptor up to (not including) bg_checksum and, on
+ * 64BIT filesystems whose descriptor extends past bg_checksum, the rest of
+ * the descriptor after that field.
+ *
+ * Returns the checksum in little-endian form, or 0 when the GDT_CSUM
+ * feature is not enabled.
+ */
+__le16 ext4_group_desc_csum(struct ext4_sb_info *sbi, __u32 block_group,
+			    struct ext4_group_desc *gdp)
+{
+	struct ext4_super_block *es = sbi->s_es;
+	__le32 le_group = cpu_to_le32(block_group);
+	int offset = offsetof(struct ext4_group_desc, bg_checksum);
+	__u16 crc = 0;
+
+	if (!(es->s_feature_ro_compat &
+	      cpu_to_le32(EXT4_FEATURE_RO_COMPAT_GDT_CSUM)))
+		return cpu_to_le16(crc);
+
+	crc = crc16(~0, es->s_uuid, sizeof(es->s_uuid));
+	crc = crc16(crc, (__u8 *)&le_group, sizeof(le_group));
+	crc = crc16(crc, (__u8 *)gdp, offset);
+
+	/* step over the checksum field itself */
+	offset += sizeof(gdp->bg_checksum);
+
+	/* larger (64BIT) descriptors: checksum the part after bg_checksum */
+	if (!(es->s_feature_incompat &
+	      cpu_to_le32(EXT4_FEATURE_INCOMPAT_64BIT)))
+		return cpu_to_le16(crc);
+	if (offset < le16_to_cpu(es->s_desc_size))
+		crc = crc16(crc, (__u8 *)gdp + offset,
+			    le16_to_cpu(es->s_desc_size) - offset);
+
+	return cpu_to_le16(crc);
+}
+
+/*
+ * Check the stored checksum of a block group descriptor.
+ *
+ * Returns 1 if the GDT_CSUM feature is off or the stored bg_checksum equals
+ * the computed one, and 0 on a mismatch.
+ */
+int ext4_group_desc_csum_verify(struct ext4_sb_info *sbi, __u32 block_group,
+				struct ext4_group_desc *gdp)
+{
+	if (!(sbi->s_es->s_feature_ro_compat &
+	      cpu_to_le32(EXT4_FEATURE_RO_COMPAT_GDT_CSUM)))
+		return 1;
+
+	if (gdp->bg_checksum == ext4_group_desc_csum(sbi, block_group, gdp))
+		return 1;
+
+	return 0;
+}
+
+/*
+ * Called at mount-time, super-block is locked
+ *
+ * Sanity-check every block group descriptor: the block bitmap, inode bitmap
+ * and inode table must lie inside the permitted block range and the
+ * descriptor checksum must verify.
+ *
+ * @first_not_zeroed: if non-NULL, receives the number of the first group
+ * whose descriptor lacks EXT4_BG_INODE_ZEROED, or s_groups_count if every
+ * group has it.
+ *
+ * On success the free block and free inode totals in the superblock are
+ * recomputed.  Returns 1 on success and 0 on the first invalid descriptor.
+ */
+static int ext4_check_descriptors(struct super_block *sb,
+ ext4_group_t *first_not_zeroed)
+{
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
+ ext4_fsblk_t first_block = le32_to_cpu(sbi->s_es->s_first_data_block);
+ ext4_fsblk_t last_block;
+ ext4_fsblk_t block_bitmap;
+ ext4_fsblk_t inode_bitmap;
+ ext4_fsblk_t inode_table;
+ int flexbg_flag = 0;
+ ext4_group_t i, grp = sbi->s_groups_count;
+
+ if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_FLEX_BG))
+ flexbg_flag = 1;
+
+ ext4_debug("Checking group descriptors");
+
+ for (i = 0; i < sbi->s_groups_count; i++) {
+ struct ext4_group_desc *gdp = ext4_get_group_desc(sb, i, NULL);
+
+ /*
+ * With FLEX_BG (and for the last group) the upper bound is the end of
+ * the filesystem; otherwise it is the last block of this group.
+ */
+ if (i == sbi->s_groups_count - 1 || flexbg_flag)
+ last_block = ext4_blocks_count(sbi->s_es) - 1;
+ else
+ last_block = first_block +
+ (EXT4_BLOCKS_PER_GROUP(sb) - 1);
+
+ /* remember only the first group without the INODE_ZEROED flag */
+ if ((grp == sbi->s_groups_count) &&
+ !(gdp->bg_flags & cpu_to_le16(EXT4_BG_INODE_ZEROED)))
+ grp = i;
+
+ block_bitmap = ext4_block_bitmap(sb, gdp);
+ if (block_bitmap < first_block || block_bitmap > last_block) {
+ ext4_msg(sb, KERN_ERR, "ext4_check_descriptors: "
+ "Block bitmap for group %u not in group "
+ "(block %llu)!", i, block_bitmap);
+ return 0;
+ }
+ inode_bitmap = ext4_inode_bitmap(sb, gdp);
+ if (inode_bitmap < first_block || inode_bitmap > last_block) {
+ ext4_msg(sb, KERN_ERR, "ext4_check_descriptors: "
+ "Inode bitmap for group %u not in group "
+ "(block %llu)!", i, inode_bitmap);
+ return 0;
+ }
+ /* the whole inode table (s_itb_per_group blocks) must fit */
+ inode_table = ext4_inode_table(sb, gdp);
+ if (inode_table < first_block ||
+ inode_table + sbi->s_itb_per_group - 1 > last_block) {
+ ext4_msg(sb, KERN_ERR, "ext4_check_descriptors: "
+ "Inode table for group %u not in group "
+ "(block %llu)!", i, inode_table);
+ return 0;
+ }
+ ext4_lock_group(sb, i);
+ if (!ext4_group_desc_csum_verify(sbi, i, gdp)) {
+ ext4_msg(sb, KERN_ERR, "ext4_check_descriptors: "
+ "Checksum for group %u failed (%u!=%u)",
+ i, le16_to_cpu(ext4_group_desc_csum(sbi, i,
+ gdp)), le16_to_cpu(gdp->bg_checksum));
+ /* a bad checksum is fatal only for a read-write mount */
+ if (!(sb->s_flags & MS_RDONLY)) {
+ ext4_unlock_group(sb, i);
+ return 0;
+ }
+ }
+ ext4_unlock_group(sb, i);
+ /* without FLEX_BG the lower bound moves on to the next group */
+ if (!flexbg_flag)
+ first_block += EXT4_BLOCKS_PER_GROUP(sb);
+ }
+ if (NULL != first_not_zeroed)
+ *first_not_zeroed = grp;
+
+ /* refresh the superblock's free block / free inode totals */
+ ext4_free_blocks_count_set(sbi->s_es,
+ EXT4_C2B(sbi, ext4_count_free_clusters(sb)));
+ sbi->s_es->s_free_inodes_count =cpu_to_le32(ext4_count_free_inodes(sb));
+ return 1;
+}
+
+/* ext4_orphan_cleanup() walks a singly-linked list of inodes (starting at
+ * the superblock) which were deleted from all directories, but held open by
+ * a process at the time of a crash. We walk the list and try to delete these
+ * inodes at recovery time (only with a read-write filesystem).
+ *
+ * In order to keep the orphan inode chain consistent during traversal (in
+ * case of crash during recovery), we link each inode into the superblock
+ * orphan list_head and handle it the same way as an inode deletion during
+ * normal operation (which journals the operations for us).
+ *
+ * We only do an iget() and an iput() on each inode, which is very safe if we
+ * accidentally point at an in-use or already deleted inode. The worst that
+ * can happen in this case is that we get a "bit already cleared" message from
+ * ext4_free_inode(). The only reason we would point at a wrong inode is if
+ * e2fsck was run on this filesystem, and it must have already done the orphan
+ * inode cleanup for us, so we can safely abort without any further action.
+ *
+ * The cleanup is skipped entirely when the orphan list is empty, the block
+ * device is read-only, the feature set would not permit a read-write mount,
+ * or the filesystem is flagged as having errors (in which case the on-disk
+ * orphan list head is also cleared).
+ */
+static void ext4_orphan_cleanup(struct super_block *sb,
+ struct ext4_super_block *es)
+{
+ /* saved so the original flags can be restored at the end */
+ unsigned int s_flags = sb->s_flags;
+ int nr_orphans = 0, nr_truncates = 0;
+#ifdef CONFIG_QUOTA
+ int i;
+#endif
+ if (!es->s_last_orphan) {
+ jbd_debug(4, "no orphan inodes to clean up\n");
+ return;
+ }
+
+ if (bdev_read_only(sb->s_bdev)) {
+ ext4_msg(sb, KERN_ERR, "write access "
+ "unavailable, skipping orphan cleanup");
+ return;
+ }
+
+ /* Check if feature set would not allow a r/w mount */
+ if (!ext4_feature_set_ok(sb, 0)) {
+ ext4_msg(sb, KERN_INFO, "Skipping orphan cleanup due to "
+ "unknown ROCOMPAT features");
+ return;
+ }
+
+ if (EXT4_SB(sb)->s_mount_state & EXT4_ERROR_FS) {
+ /* s_last_orphan is known to be non-zero here (checked above) */
+ if (es->s_last_orphan)
+ jbd_debug(1, "Errors on filesystem, "
+ "clearing orphan list.\n");
+ es->s_last_orphan = 0;
+ jbd_debug(1, "Skipping orphan recovery on fs with errors.\n");
+ return;
+ }
+
+ /* temporarily drop MS_RDONLY for the duration of the cleanup */
+ if (s_flags & MS_RDONLY) {
+ ext4_msg(sb, KERN_INFO, "orphan cleanup on readonly fs");
+ sb->s_flags &= ~MS_RDONLY;
+ }
+#ifdef CONFIG_QUOTA
+ /* Needed for iput() to work correctly and not trash data */
+ sb->s_flags |= MS_ACTIVE;
+ /* Turn on quotas so that they are updated correctly */
+ for (i = 0; i < MAXQUOTAS; i++) {
+ if (EXT4_SB(sb)->s_qf_names[i]) {
+ int ret = ext4_quota_on_mount(sb, i);
+ if (ret < 0)
+ ext4_msg(sb, KERN_ERR,
+ "Cannot turn on journaled "
+ "quota: error %d", ret);
+ }
+ }
+#endif
+
+ /*
+ * The loop condition re-reads es->s_last_orphan on every pass and this
+ * function only ever clears it, so progress relies on the truncate/iput
+ * path unlinking the processed inode from the on-disk orphan list.
+ */
+ while (es->s_last_orphan) {
+ struct inode *inode;
+
+ inode = ext4_orphan_get(sb, le32_to_cpu(es->s_last_orphan));
+ if (IS_ERR(inode)) {
+ /* unusable list head: drop the rest of the list */
+ es->s_last_orphan = 0;
+ break;
+ }
+
+ list_add(&EXT4_I(inode)->i_orphan, &EXT4_SB(sb)->s_orphan);
+ dquot_initialize(inode);
+ /* still linked: finish an interrupted truncate; else it gets deleted */
+ if (inode->i_nlink) {
+ ext4_msg(sb, KERN_DEBUG,
+ "%s: truncating inode %lu to %lld bytes",
+ __func__, inode->i_ino, inode->i_size);
+ jbd_debug(2, "truncating inode %lu to %lld bytes\n",
+ inode->i_ino, inode->i_size);
+ ext4_truncate(inode);
+ nr_truncates++;
+ } else {
+ ext4_msg(sb, KERN_DEBUG,
+ "%s: deleting unreferenced inode %lu",
+ __func__, inode->i_ino);
+ jbd_debug(2, "deleting unreferenced inode %lu\n",
+ inode->i_ino);
+ nr_orphans++;
+ }
+ iput(inode); /* The delete magic happens here! */
+ }
+
+/* expands to two arguments: the count and "" or "s" for the message */
+#define PLURAL(x) (x), ((x) == 1) ? "" : "s"
+
+ if (nr_orphans)
+ ext4_msg(sb, KERN_INFO, "%d orphan inode%s deleted",
+ PLURAL(nr_orphans));
+ if (nr_truncates)
+ ext4_msg(sb, KERN_INFO, "%d truncate%s cleaned up",
+ PLURAL(nr_truncates));
+#ifdef CONFIG_QUOTA
+ /* Turn quotas off */
+ for (i = 0; i < MAXQUOTAS; i++) {
+ if (sb_dqopt(sb)->files[i])
+ dquot_quota_off(sb, i);
+ }
+#endif
+ /* this also undoes the MS_ACTIVE set above under CONFIG_QUOTA */
+ sb->s_flags = s_flags; /* Restore MS_RDONLY status */
+}
+
+/*
+ * Largest size, in bytes, that an extent-mapped file may reach.
+ *
+ * The last logical block number has to fit the 32-bit extent start
+ * container (ee_block), and an extent spanning the whole file must have
+ * a length that fits the on-disk format as well.  A length is always one
+ * unit larger than the highest index it covers (block 0 counts too), so
+ * the limit is (2^32 - 1) filesystem blocks rather than 2^32.
+ *
+ * When huge files are not available, or blkcnt_t is narrower than 64
+ * bits, the result is further capped by what (2^32 - 1) 512-byte sectors
+ * of i_blocks can describe; otherwise the cap is MAX_LFS_FILESIZE.
+ *
+ * Note, metadata overhead in the vfs i_blocks is *not* considered.
+ */
+static loff_t ext4_max_size(int blkbits, int has_huge_files)
+{
+	loff_t vfs_cap = MAX_LFS_FILESIZE;
+	loff_t extent_cap;
+
+	if (!has_huge_files || sizeof(blkcnt_t) < sizeof(u64)) {
+		/*
+		 * i_blocks is limited to (2^32 - 1) sectors of 512 bytes:
+		 * express that in whole filesystem blocks, then in bytes.
+		 */
+		vfs_cap = ((1LL << 32) - 1) >> (blkbits - 9);
+		vfs_cap <<= blkbits;
+	}
+
+	/* (2^32 - 1) filesystem blocks, the most ee_block/ee_len can cover */
+	extent_cap = ((1LL << 32) - 1) << blkbits;
+
+	/* Sanity check against vm- & vfs- imposed limits */
+	return extent_cap > vfs_cap ? vfs_cap : extent_cap;
+}
+
+/*
+ * Maximal bitmap (indirect-block mapped) file size, in bytes.
+ *
+ * @bits:           log2 of the filesystem block size
+ * @has_huge_files: non-zero if the huge_file feature is enabled
+ *
+ * Two limits apply, and the smaller one wins:
+ *  - the number of data blocks reachable through the direct, indirect,
+ *    double-indirect and triple-indirect block pointers, and
+ *  - the i_blocks budget, which has to cover the data blocks *and* the
+ *    indirect blocks that map them.
+ * The result is finally capped at MAX_LFS_FILESIZE.
+ *
+ * Everything is computed in units of filesystem blocks and converted to
+ * bytes once, at the end.  Shifting the i_blocks budget to bytes before
+ * comparing overflows loff_t with 64K blocks ((2^48 - 1) << 16), and
+ * subtracting the metadata of a completely populated tree from the
+ * budget yields a negative value with 32K/64K blocks when huge files are
+ * unavailable.  Instead, when the budget is the binding limit, only the
+ * indirect blocks needed to map that many blocks are charged.
+ */
+static loff_t ext4_max_bitmap_size(int bits, int has_huge_files)
+{
+	loff_t res = EXT4_NDIR_BLOCKS;
+	loff_t meta_blocks;
+	loff_t upper_limit;
+	const int ppb_bits = bits - 2;		/* log2(pointers per block) */
+	const loff_t ppb = 1LL << ppb_bits;	/* 4-byte pointers per block */
+
+	/* This is calculated to be the largest file size for a dense, block
+	 * mapped file such that the file's total number of 512-byte sectors,
+	 * including data and all indirect blocks, does not exceed (2^48 - 1).
+	 *
+	 * __u32 i_blocks_lo and _u16 i_blocks_high represent the total
+	 * number of 512-byte sectors of the file.
+	 */
+
+	if (!has_huge_files || sizeof(blkcnt_t) < sizeof(u64)) {
+		/*
+		 * !has_huge_files or CONFIG_LBDAF not enabled implies that
+		 * the inode i_block field represents total file blocks in
+		 * 2^32 512-byte sectors == size of vfs inode i_blocks * 8
+		 */
+		upper_limit = (1LL << 32) - 1;
+
+		/* total blocks in file system block size */
+		upper_limit >>= (bits - 9);
+	} else {
+		/*
+		 * We use 48 bit ext4_inode i_blocks
+		 * With EXT4_HUGE_FILE_FL set the i_blocks
+		 * represent total number of blocks in
+		 * file system block size
+		 */
+		upper_limit = (1LL << 48) - 1;
+	}
+
+	/* data blocks addressable by the complete block tree */
+	res += ppb;
+	res += ppb * ppb;
+	res += ppb * ppb * ppb;
+
+	/* indirect + double indirect + triple indirect blocks of that tree */
+	meta_blocks = 1;
+	meta_blocks += 1 + ppb;
+	meta_blocks += 1 + ppb + ppb * ppb;
+
+	if (res + meta_blocks > upper_limit) {
+		/*
+		 * The i_blocks budget binds.  Charge the indirect blocks
+		 * needed to map upper_limit blocks; this over-estimates
+		 * slightly (fewer data blocks remain), which errs on the
+		 * safe side.  "rest" cannot be negative for the block sizes
+		 * ext4 accepts: the budget is at least 2^25 - 1 blocks.
+		 */
+		loff_t rest = upper_limit - EXT4_NDIR_BLOCKS - ppb;
+
+		res = upper_limit;
+		/* the single indirect block */
+		meta_blocks = 1;
+		if (rest < ppb * ppb) {
+			/* partially filled double indirect tree */
+			meta_blocks += 1 + ((rest + ppb - 1) >> ppb_bits);
+		} else {
+			/* full double, partially filled triple indirect tree */
+			meta_blocks += 1 + ppb;
+			rest -= ppb * ppb;
+			meta_blocks += 1 + ((rest + ppb - 1) >> ppb_bits) +
+				((rest + ppb * ppb - 1) >> (2 * ppb_bits));
+		}
+		res -= meta_blocks;
+	}
+
+	/* Convert to bytes without ever exceeding MAX_LFS_FILESIZE */
+	if (res > (loff_t)(MAX_LFS_FILESIZE >> bits))
+		return MAX_LFS_FILESIZE;
+
+	return res << bits;
+}
+
+/*
+ * Return the block number holding group descriptor block @nr.
+ *
+ * Without the META_BG feature, or for descriptor blocks below
+ * s_first_meta_bg, the descriptors directly follow the superblock.
+ * Otherwise the descriptor block sits at the start of the first group it
+ * describes, one block further in if that group carries a superblock
+ * copy.
+ */
+static ext4_fsblk_t descriptor_loc(struct super_block *sb,
+				   ext4_fsblk_t logical_sb_block, int nr)
+{
+	struct ext4_sb_info *sbi = EXT4_SB(sb);
+	ext4_group_t meta_bg_start = le32_to_cpu(sbi->s_es->s_first_meta_bg);
+	ext4_group_t grp;
+	int skip_super;
+
+	if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_META_BG) &&
+	    !(nr < meta_bg_start)) {
+		grp = sbi->s_desc_per_block * nr;
+		skip_super = ext4_bg_has_super(sb, grp) ? 1 : 0;
+
+		return (skip_super + ext4_group_first_block_no(sb, grp));
+	}
+
+	return logical_sb_block + nr + 1;
+}
+
+/**
+ * ext4_get_stripe_size: Get the stripe size.
+ * @sbi: In memory super block info
+ *
+ * Candidates are tried in this order, and the first one that is set
+ * (non-zero) and not larger than the blocks per group is used:
+ * the mount option value, the super block stripe width, the super block
+ * stride.  If none qualifies, return 0.
+ * Allocator needs it be less than blocks per group.
+ *
+ * A candidate of 0 means "not configured" and is skipped; otherwise an
+ * unset stripe width would satisfy the "<= blocks per group" test and
+ * hide a valid stride.
+ *
+ * Returns the stripe size in blocks, or 0 to turn off stripe handling.
+ */
+static unsigned long ext4_get_stripe_size(struct ext4_sb_info *sbi)
+{
+	unsigned long stride = le16_to_cpu(sbi->s_es->s_raid_stride);
+	unsigned long stripe_width =
+			le32_to_cpu(sbi->s_es->s_raid_stripe_width);
+	unsigned long ret;
+
+	if (sbi->s_stripe && sbi->s_stripe <= sbi->s_blocks_per_group)
+		ret = sbi->s_stripe;
+	else if (stripe_width && stripe_width <= sbi->s_blocks_per_group)
+		ret = stripe_width;
+	else if (stride && stride <= sbi->s_blocks_per_group)
+		ret = stride;
+	else
+		ret = 0;
+
+	/*
+	 * If the stripe width is 1, this makes no sense and
+	 * we set it to 0 to turn off stripe handling code.
+	 */
+	if (ret <= 1)
+		ret = 0;
+
+	return ret;
+}
+
+/* sysfs support */
+
+/*
+ * One ext4 sysfs attribute.  ext4_attr_show()/ext4_attr_store() recover
+ * this structure from the embedded struct attribute and dispatch to the
+ * callbacks below.
+ */
+struct ext4_attr {
+ struct attribute attr; /* embedded; container_of() target */
+ ssize_t (*show)(struct ext4_attr *, struct ext4_sb_info *, char *); /* may be NULL */
+ ssize_t (*store)(struct ext4_attr *, struct ext4_sb_info *,
+ const char *, size_t); /* may be NULL */
+ int offset; /* byte offset of the backing field inside struct ext4_sb_info */
+};
+
+/*
+ * Parse an unsigned long from @buf, tolerating leading and trailing
+ * white space.  The base is auto-detected by simple_strtoul().
+ *
+ * @buf:   NUL-terminated input
+ * @max:   largest value accepted
+ * @value: receives the parsed number (written even when -EINVAL is
+ *         returned)
+ *
+ * Returns 0 on success, -EINVAL if anything other than white space
+ * follows the number or if the number exceeds @max.
+ */
+static int parse_strtoul(const char *buf,
+			 unsigned long max, unsigned long *value)
+{
+	char *tail;
+	unsigned long parsed;
+
+	parsed = simple_strtoul(skip_spaces(buf), &tail, 0);
+	*value = parsed;
+
+	tail = skip_spaces(tail);
+	if (*tail != '\0')
+		return -EINVAL;
+
+	return parsed > max ? -EINVAL : 0;
+}
+
+/*
+ * sysfs show: sum of the per-cpu dirty-clusters counter, converted from
+ * clusters to blocks with EXT4_C2B(), printed as a decimal number.
+ *
+ * The value is cast to unsigned long long so that the argument type
+ * matches the "%llu" conversion.
+ */
+static ssize_t delayed_allocation_blocks_show(struct ext4_attr *a,
+					      struct ext4_sb_info *sbi,
+					      char *buf)
+{
+	return snprintf(buf, PAGE_SIZE, "%llu\n",
+		(unsigned long long) EXT4_C2B(sbi,
+			percpu_counter_sum(&sbi->s_dirtyclusters_counter)));
+}
+
+/*
+ * sysfs show: sectors written to the backing partition since
+ * s_sectors_written_start was sampled, halved to give kilobytes.
+ * Prints 0 when the block device has no partition statistics.
+ */
+static ssize_t session_write_kbytes_show(struct ext4_attr *a,
+					 struct ext4_sb_info *sbi, char *buf)
+{
+	struct super_block *sb = sbi->s_buddy_cache->i_sb;
+	unsigned long written;
+
+	if (!sb->s_bdev->bd_part)
+		return snprintf(buf, PAGE_SIZE, "0\n");
+
+	written = part_stat_read(sb->s_bdev->bd_part, sectors[1]) -
+		  sbi->s_sectors_written_start;
+
+	return snprintf(buf, PAGE_SIZE, "%lu\n", written >> 1);
+}
+
+/*
+ * sysfs show: s_kbytes_written plus the kilobytes written since
+ * s_sectors_written_start was sampled.
+ * Prints 0 when the block device has no partition statistics.
+ */
+static ssize_t lifetime_write_kbytes_show(struct ext4_attr *a,
+					  struct ext4_sb_info *sbi, char *buf)
+{
+	struct super_block *sb = sbi->s_buddy_cache->i_sb;
+	unsigned long session_kb;
+
+	if (!sb->s_bdev->bd_part)
+		return snprintf(buf, PAGE_SIZE, "0\n");
+
+	session_kb = (part_stat_read(sb->s_bdev->bd_part, sectors[1]) -
+		      EXT4_SB(sb)->s_sectors_written_start) >> 1;
+
+	return snprintf(buf, PAGE_SIZE, "%llu\n",
+			(unsigned long long)(sbi->s_kbytes_written +
+					     session_kb));
+}
+
+/*
+ * sysfs store for inode_readahead_blks.  Accepts 0 or a power of two no
+ * larger than 0x40000000.
+ *
+ * Returns @count on success, -EINVAL on malformed or unacceptable input.
+ */
+static ssize_t inode_readahead_blks_store(struct ext4_attr *a,
+					  struct ext4_sb_info *sbi,
+					  const char *buf, size_t count)
+{
+	unsigned long blks;
+
+	if (parse_strtoul(buf, 0x40000000, &blks) ||
+	    (blks && !is_power_of_2(blks)))
+		return -EINVAL;
+
+	sbi->s_inode_readahead_blks = blks;
+	return count;
+}
+
+/*
+ * Generic sysfs show for an unsigned int member of struct ext4_sb_info,
+ * located a->offset bytes into @sbi.
+ */
+static ssize_t sbi_ui_show(struct ext4_attr *a,
+			   struct ext4_sb_info *sbi, char *buf)
+{
+	char *field = ((char *) sbi) + a->offset;
+	unsigned int current_val = *(unsigned int *) field;
+
+	return snprintf(buf, PAGE_SIZE, "%u\n", current_val);
+}
+
+/*
+ * Generic sysfs store for an unsigned int member of struct ext4_sb_info,
+ * located a->offset bytes into @sbi.  Values above 0xffffffff are
+ * rejected.
+ *
+ * Returns @count on success, -EINVAL on malformed or out-of-range input.
+ */
+static ssize_t sbi_ui_store(struct ext4_attr *a,
+			    struct ext4_sb_info *sbi,
+			    const char *buf, size_t count)
+{
+	char *field = ((char *) sbi) + a->offset;
+	unsigned long parsed;
+
+	if (parse_strtoul(buf, 0xffffffff, &parsed) != 0)
+		return -EINVAL;
+
+	*(unsigned int *) field = parsed;
+	return count;
+}
+
+/*
+ * Define "ext4_attr_<_name>" with explicit show/store callbacks and
+ * record the offset of member _elname of struct ext4_sb_info, so generic
+ * callbacks such as sbi_ui_show()/sbi_ui_store() can find the field.
+ */
+#define EXT4_ATTR_OFFSET(_name,_mode,_show,_store,_elname) \
+static struct ext4_attr ext4_attr_##_name = { \
+ .attr = {.name = __stringify(_name), .mode = _mode }, \
+ .show = _show, \
+ .store = _store, \
+ .offset = offsetof(struct ext4_sb_info, _elname), \
+}
+/* Define "ext4_attr_<name>" through __ATTR(); .offset is not set here. */
+#define EXT4_ATTR(name, mode, show, store) \
+static struct ext4_attr ext4_attr_##name = __ATTR(name, mode, show, store)
+
+/* 0444 attribute without callbacks */
+#define EXT4_INFO_ATTR(name) EXT4_ATTR(name, 0444, NULL, NULL)
+/* 0444 attribute shown by <name>_show() */
+#define EXT4_RO_ATTR(name) EXT4_ATTR(name, 0444, name##_show, NULL)
+/* 0644 attribute using <name>_show() and <name>_store() */
+#define EXT4_RW_ATTR(name) EXT4_ATTR(name, 0644, name##_show, name##_store)
+/* 0644 attribute backed by the unsigned int member elname of ext4_sb_info */
+#define EXT4_RW_ATTR_SBI_UI(name, elname) \
+ EXT4_ATTR_OFFSET(name, 0644, sbi_ui_show, sbi_ui_store, elname)
+/* Address of the struct attribute embedded in ext4_attr_<name> */
+#define ATTR_LIST(name) &ext4_attr_##name.attr
+
+/* Read-only attributes, each shown by its own <name>_show() function */
+EXT4_RO_ATTR(delayed_allocation_blocks);
+EXT4_RO_ATTR(session_write_kbytes);
+EXT4_RO_ATTR(lifetime_write_kbytes);
+/* Generic show, dedicated store that validates the value */
+EXT4_ATTR_OFFSET(inode_readahead_blks, 0644, sbi_ui_show,
+ inode_readahead_blks_store, s_inode_readahead_blks);
+/* Read-write attributes mapped straight onto ext4_sb_info members */
+EXT4_RW_ATTR_SBI_UI(inode_goal, s_inode_goal);
+EXT4_RW_ATTR_SBI_UI(mb_stats, s_mb_stats);
+EXT4_RW_ATTR_SBI_UI(mb_max_to_scan, s_mb_max_to_scan);
+EXT4_RW_ATTR_SBI_UI(mb_min_to_scan, s_mb_min_to_scan);
+EXT4_RW_ATTR_SBI_UI(mb_order2_req, s_mb_order2_reqs);
+EXT4_RW_ATTR_SBI_UI(mb_stream_req, s_mb_stream_request);
+EXT4_RW_ATTR_SBI_UI(mb_group_prealloc, s_mb_group_prealloc);
+EXT4_RW_ATTR_SBI_UI(max_writeback_mb_bump, s_max_writeback_mb_bump);
+
+/* NULL-terminated default attribute list of ext4_ktype */
+static struct attribute *ext4_attrs[] = {
+ ATTR_LIST(delayed_allocation_blocks),
+ ATTR_LIST(session_write_kbytes),
+ ATTR_LIST(lifetime_write_kbytes),
+ ATTR_LIST(inode_readahead_blks),
+ ATTR_LIST(inode_goal),
+ ATTR_LIST(mb_stats),
+ ATTR_LIST(mb_max_to_scan),
+ ATTR_LIST(mb_min_to_scan),
+ ATTR_LIST(mb_order2_req),
+ ATTR_LIST(mb_stream_req),
+ ATTR_LIST(mb_group_prealloc),
+ ATTR_LIST(max_writeback_mb_bump),
+ NULL,
+};
+
+/*
+ * Features this copy of ext4 supports.  These attributes have no
+ * show/store callbacks, so reading one through ext4_attr_show() yields
+ * no data.
+ */
+EXT4_INFO_ATTR(lazy_itable_init);
+EXT4_INFO_ATTR(batched_discard);
+
+/* NULL-terminated default attribute list of ext4_feat_ktype */
+static struct attribute *ext4_feat_attrs[] = {
+ ATTR_LIST(lazy_itable_init),
+ ATTR_LIST(batched_discard),
+ NULL,
+};
+
+/*
+ * sysfs_ops show hook: map the kobject/attribute pair back to the owning
+ * ext4_sb_info and ext4_attr, then call the attribute's show callback.
+ * Returns 0 when the attribute has no show callback.
+ */
+static ssize_t ext4_attr_show(struct kobject *kobj,
+			      struct attribute *attr, char *buf)
+{
+	struct ext4_attr *ea = container_of(attr, struct ext4_attr, attr);
+	struct ext4_sb_info *sbi;
+
+	if (!ea->show)
+		return 0;
+
+	sbi = container_of(kobj, struct ext4_sb_info, s_kobj);
+	return ea->show(ea, sbi, buf);
+}
+
+/*
+ * sysfs_ops store hook: map the kobject/attribute pair back to the
+ * owning ext4_sb_info and ext4_attr, then call the attribute's store
+ * callback.  Returns 0 when the attribute has no store callback.
+ */
+static ssize_t ext4_attr_store(struct kobject *kobj,
+			       struct attribute *attr,
+			       const char *buf, size_t len)
+{
+	struct ext4_attr *ea = container_of(attr, struct ext4_attr, attr);
+	struct ext4_sb_info *sbi;
+
+	if (!ea->store)
+		return 0;
+
+	sbi = container_of(kobj, struct ext4_sb_info, s_kobj);
+	return ea->store(ea, sbi, buf, len);
+}
+
+/*
+ * kobject release hook of ext4_ktype: signal s_kobj_unregister of the
+ * ext4_sb_info that embeds @kobj.
+ */
+static void ext4_sb_release(struct kobject *kobj)
+{
+	struct ext4_sb_info *owner;
+
+	owner = container_of(kobj, struct ext4_sb_info, s_kobj);
+	complete(&owner->s_kobj_unregister);
+}
+
+/* show/store dispatchers shared by ext4_ktype and ext4_feat_ktype */
+static const struct sysfs_ops ext4_attr_ops = {
+ .show = ext4_attr_show,
+ .store = ext4_attr_store,
+};
+
+/* kobject type whose release hook completes s_kobj_unregister */
+static struct kobj_type ext4_ktype = {
+ .default_attrs = ext4_attrs,
+ .sysfs_ops = &ext4_attr_ops,
+ .release = ext4_sb_release,
+};
+
+/*
+ * kobject release hook of ext4_feat_ktype: signal f_kobj_unregister of
+ * the file-scope ext4_feat object.  @kobj itself is not used.
+ */
+static void ext4_feat_release(struct kobject *kobj)
+{
+ complete(&ext4_feat->f_kobj_unregister);
+}
+
+/* kobject type exposing the ext4_feat_attrs feature attributes */
+static struct kobj_type ext4_feat_ktype = {
+ .default_attrs = ext4_feat_attrs,
+ .sysfs_ops = &ext4_attr_ops,
+ .release = ext4_feat_release,
+};
+
+/*
+ * Check whether this filesystem can be mounted based on
+ * the features present and the RDONLY/RDWR mount requested.
+ *
+ * @sb:       superblock whose on-disk feature words are examined
+ * @readonly: non-zero if a read-only mount is requested
+ *
+ * Any INCOMPAT bit outside EXT4_FEATURE_INCOMPAT_SUPP refuses the mount
+ * outright.  For a read-write mount the following are refused as well:
+ * RO_COMPAT bits outside EXT4_FEATURE_RO_COMPAT_SUPP, huge_file when
+ * blkcnt_t is narrower than 64 bits, and bigalloc without extents.
+ * Every refusal is logged through ext4_msg(); the messages carry no
+ * trailing newline, in line with the other ext4_msg() calls in this file.
+ *
+ * Returns 1 if this filesystem can be mounted as requested,
+ * 0 if it cannot be.
+ */
+static int ext4_feature_set_ok(struct super_block *sb, int readonly)
+{
+	if (EXT4_HAS_INCOMPAT_FEATURE(sb, ~EXT4_FEATURE_INCOMPAT_SUPP)) {
+		ext4_msg(sb, KERN_ERR,
+			"Couldn't mount because of "
+			"unsupported optional features (%x)",
+			(le32_to_cpu(EXT4_SB(sb)->s_es->s_feature_incompat) &
+			~EXT4_FEATURE_INCOMPAT_SUPP));
+		return 0;
+	}
+
+	if (readonly)
+		return 1;
+
+	/* Check that feature set is OK for a read-write mount */
+	if (EXT4_HAS_RO_COMPAT_FEATURE(sb, ~EXT4_FEATURE_RO_COMPAT_SUPP)) {
+		ext4_msg(sb, KERN_ERR, "couldn't mount RDWR because of "
+			 "unsupported optional features (%x)",
+			 (le32_to_cpu(EXT4_SB(sb)->s_es->s_feature_ro_compat) &
+				~EXT4_FEATURE_RO_COMPAT_SUPP));
+		return 0;
+	}
+	/*
+	 * Large file size enabled file system can only be mounted
+	 * read-write on 32-bit systems if kernel is built with CONFIG_LBDAF
+	 */
+	if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_HUGE_FILE)) {
+		if (sizeof(blkcnt_t) < sizeof(u64)) {
+			ext4_msg(sb, KERN_ERR, "Filesystem with huge files "
+				 "cannot be mounted RDWR without "
+				 "CONFIG_LBDAF");
+			return 0;
+		}
+	}
+	if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_BIGALLOC) &&
+	    !EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_EXTENTS)) {
+		ext4_msg(sb, KERN_ERR,
+			 "Can't support bigalloc feature without "
+			 "extents feature");
+		return 0;
+	}
+	return 1;
+}
+
+/*
+ * This function is called once a day if we have errors logged
+ * on the file system
+ *
+ * @arg: the struct super_block pointer, passed as an unsigned long
+ *
+ * Logs the error count and the first and last recorded error (time,
+ * function, line, and inode/block when set) from the on-disk superblock,
+ * then re-arms s_err_report to fire again in 24 hours.
+ *
+ * Each record is printed as one log line built from several printk()
+ * calls; the follow-up fragments are tagged KERN_CONT so they are
+ * appended to the line instead of starting a new message.
+ */
+static void print_daily_error_info(unsigned long arg)
+{
+	struct super_block *sb = (struct super_block *) arg;
+	struct ext4_sb_info *sbi;
+	struct ext4_super_block *es;
+
+	sbi = EXT4_SB(sb);
+	es = sbi->s_es;
+
+	if (es->s_error_count)
+		ext4_msg(sb, KERN_NOTICE, "error count: %u",
+			 le32_to_cpu(es->s_error_count));
+	if (es->s_first_error_time) {
+		printk(KERN_NOTICE "EXT4-fs (%s): initial error at %u: %.*s:%d",
+		       sb->s_id, le32_to_cpu(es->s_first_error_time),
+		       (int) sizeof(es->s_first_error_func),
+		       es->s_first_error_func,
+		       le32_to_cpu(es->s_first_error_line));
+		if (es->s_first_error_ino)
+			printk(KERN_CONT ": inode %u",
+			       le32_to_cpu(es->s_first_error_ino));
+		if (es->s_first_error_block)
+			printk(KERN_CONT ": block %llu", (unsigned long long)
+			       le64_to_cpu(es->s_first_error_block));
+		printk(KERN_CONT "\n");
+	}
+	if (es->s_last_error_time) {
+		printk(KERN_NOTICE "EXT4-fs (%s): last error at %u: %.*s:%d",
+		       sb->s_id, le32_to_cpu(es->s_last_error_time),
+		       (int) sizeof(es->s_last_error_func),
+		       es->s_last_error_func,
+		       le32_to_cpu(es->s_last_error_line));
+		if (es->s_last_error_ino)
+			printk(KERN_CONT ": inode %u",
+			       le32_to_cpu(es->s_last_error_ino));
+		if (es->s_last_error_block)
+			printk(KERN_CONT ": block %llu", (unsigned long long)
+			       le64_to_cpu(es->s_last_error_block));
+		printk(KERN_CONT "\n");
+	}
+	mod_timer(&sbi->s_err_report, jiffies + 24*60*60*HZ);  /* Once a day */
+}
+
+/*
+ * Find the next group, starting at elr->lr_next_group, whose descriptor
+ * lacks EXT4_BG_INODE_ZEROED and run ext4_init_inode_table() on it.
+ *
+ * On the first run (lr_timeout == 0) the time spent in
+ * ext4_init_inode_table(), multiplied by s_li_wait_mult, becomes the
+ * request's timeout.  The request is then rescheduled and lr_next_group
+ * advanced past the group just handled.
+ *
+ * Returns 1 if a group descriptor cannot be read or the scan reaches the
+ * last group without finding work; otherwise whatever
+ * ext4_init_inode_table() returned.
+ */
+static int ext4_run_li_request(struct ext4_li_request *elr)
+{
+	struct super_block *sb = elr->lr_super;
+	ext4_group_t ngroups = EXT4_SB(sb)->s_groups_count;
+	ext4_group_t group = elr->lr_next_group;
+	struct ext4_group_desc *gdp;
+	unsigned long started;
+	int ret;
+
+	while (group < ngroups) {
+		gdp = ext4_get_group_desc(sb, group, NULL);
+		if (!gdp)
+			return 1;
+
+		if (!(gdp->bg_flags & cpu_to_le16(EXT4_BG_INODE_ZEROED)))
+			break;
+		group++;
+	}
+
+	if (group == ngroups)
+		return 1;
+
+	started = jiffies;
+	ret = ext4_init_inode_table(sb, group,
+				    elr->lr_timeout ? 0 : 1);
+	if (elr->lr_timeout == 0)
+		elr->lr_timeout = (jiffies - started) *
+				  elr->lr_sbi->s_li_wait_mult;
+
+	elr->lr_next_sched = jiffies + elr->lr_timeout;
+	elr->lr_next_group = group + 1;
+
+	return ret;
+}
+
+/*
+ * Unlink @elr from the request list, clear the owning superblock's
+ * s_li_request pointer and free the request.  A NULL @elr is ignored.
+ * Should be called with li_list_mtx held.
+ */
+static void ext4_remove_li_request(struct ext4_li_request *elr)
+{
+	if (elr) {
+		struct ext4_sb_info *owner = elr->lr_sbi;
+
+		list_del(&elr->lr_request);
+		owner->s_li_request = NULL;
+		kfree(elr);
+	}
+}
+
+/*
+ * Drop the lazy-init request of @sb, if any.  Takes ext4_li_mtx and,
+ * when ext4_li_info exists, its li_list_mtx around the removal.
+ */
+static void ext4_unregister_li_request(struct super_block *sb)
+{
+	mutex_lock(&ext4_li_mtx);
+	if (ext4_li_info) {
+		mutex_lock(&ext4_li_info->li_list_mtx);
+		ext4_remove_li_request(EXT4_SB(sb)->s_li_request);
+		mutex_unlock(&ext4_li_info->li_list_mtx);
+	}
+	mutex_unlock(&ext4_li_mtx);
+}
+
+/* Task returned by kthread_run() in ext4_run_lazyinit_thread() */
+static struct task_struct *ext4_lazyinit_task;
+
+/*
+ * This is the function where ext4lazyinit thread lives. It walks
+ * through the request list searching for next scheduled filesystem.
+ * When such a fs is found, run the lazy initialization request
+ * (ext4_run_li_request) and keep track of the time spent in this
+ * function. Based on that time we compute next schedule time of
+ * the request. When walking through the list is complete, compute
+ * next waking time and put itself into sleep.
+ *
+ * The thread leaves its loop when the request list is found empty or
+ * when kthread_should_stop() is true after a sleep.  Before returning it
+ * re-checks the list with both mutexes held and resumes the loop if a
+ * request has been added in the meantime; otherwise it frees
+ * ext4_li_info and sets it to NULL.  Always returns 0.
+ */
+static int ext4_lazyinit_thread(void *arg)
+{
+ struct ext4_lazy_init *eli = (struct ext4_lazy_init *)arg;
+ struct list_head *pos, *n;
+ struct ext4_li_request *elr;
+ unsigned long next_wakeup, cur;
+
+ BUG_ON(NULL == eli);
+
+cont_thread:
+ while (true) {
+ next_wakeup = MAX_JIFFY_OFFSET; /* "no request pending" marker */
+
+ mutex_lock(&eli->li_list_mtx);
+ if (list_empty(&eli->li_request_list)) {
+ mutex_unlock(&eli->li_list_mtx);
+ goto exit_thread;
+ }
+
+ /* _safe variant: the current entry may be removed and freed below */
+ list_for_each_safe(pos, n, &eli->li_request_list) {
+ elr = list_entry(pos, struct ext4_li_request,
+ lr_request);
+
+ if (time_after_eq(jiffies, elr->lr_next_sched)) {
+ if (ext4_run_li_request(elr) != 0) {
+ /* error, remove the lazy_init job */
+ ext4_remove_li_request(elr);
+ continue;
+ }
+ }
+
+ /* track the earliest next_sched among remaining requests */
+ if (time_before(elr->lr_next_sched, next_wakeup))
+ next_wakeup = elr->lr_next_sched;
+ }
+ mutex_unlock(&eli->li_list_mtx);
+
+ try_to_freeze();
+
+ cur = jiffies;
+ /* something is already due, or nothing is left: rescan at once */
+ if ((time_after_eq(cur, next_wakeup)) ||
+ (MAX_JIFFY_OFFSET == next_wakeup)) {
+ cond_resched();
+ continue;
+ }
+
+ schedule_timeout_interruptible(next_wakeup - cur);
+
+ if (kthread_should_stop()) {
+ ext4_clear_request_list();
+ goto exit_thread;
+ }
+ }
+
+exit_thread:
+ /*
+ * It looks like the request list is empty, but we need
+ * to check it under the li_list_mtx lock, to prevent any
+ * additions into it, and of course we should lock ext4_li_mtx
+ * to atomically free the list and ext4_li_info, because at
+ * this point another ext4 filesystem could be registering
+ * new one.
+ */
+ mutex_lock(&ext4_li_mtx);
+ mutex_lock(&eli->li_list_mtx);
+ if (!list_empty(&eli->li_request_list)) {
+ mutex_unlock(&eli->li_list_mtx);
+ mutex_unlock(&ext4_li_mtx);
+ goto cont_thread;
+ }
+ mutex_unlock(&eli->li_list_mtx);
+ kfree(ext4_li_info);
+ ext4_li_info = NULL;
+ mutex_unlock(&ext4_li_mtx);
+
+ return 0;
+}
+
+/*
+ * Remove and free every request on ext4_li_info's list, holding
+ * li_list_mtx for the duration.
+ */
+static void ext4_clear_request_list(void)
+{
+	struct ext4_li_request *elr, *next;
+
+	mutex_lock(&ext4_li_info->li_list_mtx);
+	list_for_each_entry_safe(elr, next, &ext4_li_info->li_request_list,
+				 lr_request)
+		ext4_remove_li_request(elr);
+	mutex_unlock(&ext4_li_info->li_list_mtx);
+}
+
+/*
+ * Start the "ext4lazyinit" kernel thread on ext4_li_info and mark it
+ * EXT4_LAZYINIT_RUNNING.
+ *
+ * If the thread cannot be created, all queued requests are dropped,
+ * ext4_li_info is freed and reset to NULL, and the error is logged.
+ *
+ * Returns 0 on success or the error from kthread_run().
+ */
+static int ext4_run_lazyinit_thread(void)
+{
+	int err;
+
+	ext4_lazyinit_task = kthread_run(ext4_lazyinit_thread,
+					 ext4_li_info, "ext4lazyinit");
+	if (!IS_ERR(ext4_lazyinit_task)) {
+		ext4_li_info->li_state |= EXT4_LAZYINIT_RUNNING;
+		return 0;
+	}
+
+	err = PTR_ERR(ext4_lazyinit_task);
+	ext4_clear_request_list();
+	kfree(ext4_li_info);
+	ext4_li_info = NULL;
+	printk(KERN_CRIT "EXT4-fs: error %d creating inode table "
+	       "initialization thread\n",
+	       err);
+	return err;
+}
+
+/*
+ * Check whether it makes sense to run the itable init thread.
+ *
+ * Returns the number of the first group whose descriptor does not carry
+ * EXT4_BG_INODE_ZEROED, or the total number of groups when every
+ * readable descriptor has the flag.  Groups whose descriptor cannot be
+ * read are skipped.
+ */
+static ext4_group_t ext4_has_uninit_itable(struct super_block *sb)
+{
+	ext4_group_t ngroups = EXT4_SB(sb)->s_groups_count;
+	ext4_group_t group;
+	struct ext4_group_desc *gdp;
+
+	for (group = 0; group < ngroups; group++) {
+		gdp = ext4_get_group_desc(sb, group, NULL);
+		if (gdp &&
+		    !(gdp->bg_flags & cpu_to_le16(EXT4_BG_INODE_ZEROED)))
+			return group;
+	}
+
+	return ngroups;
+}
+
+/*
+ * Allocate and initialise the global lazy-init descriptor: empty request
+ * list, initialised list mutex, state EXT4_LAZYINIT_QUIT.  The result is
+ * published in ext4_li_info.
+ *
+ * Returns 0 on success, -ENOMEM if the allocation fails.
+ */
+static int ext4_li_info_new(void)
+{
+	struct ext4_lazy_init *info = kzalloc(sizeof(*info), GFP_KERNEL);
+
+	if (!info)
+		return -ENOMEM;
+
+	mutex_init(&info->li_list_mtx);
+	INIT_LIST_HEAD(&info->li_request_list);
+	info->li_state |= EXT4_LAZYINIT_QUIT;
+
+	ext4_li_info = info;
+	return 0;
+}
+
+/*
+ * Allocate a lazy-init request for @sb that starts at group @start.
+ *
+ * The first schedule time is jiffies plus a random delay below
+ * EXT4_DEF_LI_MAX_START_DELAY * HZ, to spread the inode table
+ * initialization requests better.
+ *
+ * Returns the new request, or NULL if the allocation fails.
+ */
+static struct ext4_li_request *ext4_li_request_new(struct super_block *sb,
+						   ext4_group_t start)
+{
+	struct ext4_li_request *req;
+	unsigned long rnd;
+
+	req = kzalloc(sizeof(*req), GFP_KERNEL);
+	if (!req)
+		return NULL;
+
+	get_random_bytes(&rnd, sizeof(rnd));
+
+	req->lr_super = sb;
+	req->lr_sbi = EXT4_SB(sb);
+	req->lr_next_group = start;
+	req->lr_next_sched = jiffies +
+			     rnd % (EXT4_DEF_LI_MAX_START_DELAY * HZ);
+
+	return req;
+}
+
+/*
+ * Queue a lazy inode table initialization request for @sb, starting at
+ * group @first_not_zeroed, creating ext4_li_info and starting the
+ * lazyinit thread when they do not exist yet.
+ *
+ * Nothing is queued, and 0 is returned, when @sb already has a request
+ * (its timeout is reset instead), when @first_not_zeroed equals the
+ * number of groups, when @sb is read-only, or when the INIT_INODE_TABLE
+ * mount option is off.
+ *
+ * Returns 0 on success, -ENOMEM if an allocation fails, or the error
+ * from ext4_run_lazyinit_thread().
+ */
+static int ext4_register_li_request(struct super_block *sb,
+ ext4_group_t first_not_zeroed)
+{
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
+ struct ext4_li_request *elr;
+ ext4_group_t ngroups = EXT4_SB(sb)->s_groups_count;
+ int ret = 0;
+
+ /*
+ * NOTE(review): this check runs before ext4_li_mtx is taken; confirm
+ * that s_li_request cannot be freed concurrently at this point.
+ */
+ if (sbi->s_li_request != NULL) {
+ /*
+ * Reset timeout so it can be computed again, because
+ * s_li_wait_mult might have changed.
+ */
+ sbi->s_li_request->lr_timeout = 0;
+ return 0;
+ }
+
+ if (first_not_zeroed == ngroups ||
+ (sb->s_flags & MS_RDONLY) ||
+ !test_opt(sb, INIT_INODE_TABLE))
+ return 0;
+
+ elr = ext4_li_request_new(sb, first_not_zeroed);
+ if (!elr)
+ return -ENOMEM;
+
+ mutex_lock(&ext4_li_mtx);
+
+ if (NULL == ext4_li_info) {
+ ret = ext4_li_info_new();
+ if (ret)
+ goto out; /* elr is still ours and is freed at "out" */
+ }
+
+ mutex_lock(&ext4_li_info->li_list_mtx);
+ list_add(&elr->lr_request, &ext4_li_info->li_request_list);
+ mutex_unlock(&ext4_li_info->li_list_mtx);
+
+ sbi->s_li_request = elr;
+ /*
+ * set elr to NULL here since it has been inserted to
+ * the request_list and the removal and free of it is
+ * handled by ext4_clear_request_list from now on.
+ */
+ elr = NULL;
+
+ if (!(ext4_li_info->li_state & EXT4_LAZYINIT_RUNNING)) {
+ ret = ext4_run_lazyinit_thread();
+ if (ret)
+ goto out;
+ }
+out:
+ mutex_unlock(&ext4_li_mtx);
+ if (ret)
+ kfree(elr); /* NULL once the request is on the list, so a no-op then */
+ return ret;
+}
+
+/*
+ * Stop the lazyinit thread if it is still around.
+ *
+ * We do not need to lock anything since this is called on
+ * module unload.  If the thread exited earlier (ext4_li_info or
+ * ext4_lazyinit_task is NULL) there's nothing to be done.
+ */
+static void ext4_destroy_lazyinit_thread(void)
+{
+	if (ext4_li_info && ext4_lazyinit_task)
+		kthread_stop(ext4_lazyinit_task);
+}
+
+static int ext4_fill_super(struct super_block *sb, void *data, int silent)
+{
+ char *orig_data = kstrdup(data, GFP_KERNEL);
+ struct buffer_head *bh;
+ struct ext4_super_block *es = NULL;
+ struct ext4_sb_info *sbi;
+ ext4_fsblk_t block;
+ ext4_fsblk_t sb_block = get_sb_block(&data);
+ ext4_fsblk_t logical_sb_block;
+ unsigned long offset = 0;
+ unsigned long journal_devnum = 0;
+ unsigned long def_mount_opts;
+ struct inode *root;
+ char *cp;
+ const char *descr;
+ int ret = -ENOMEM;
+ int blocksize, clustersize;
+ unsigned int db_count;
+ unsigned int i;
+ int needs_recovery, has_huge_files, has_bigalloc;
+ __u64 blocks_count;
+ int err;
+ unsigned int journal_ioprio = DEFAULT_JOURNAL_IOPRIO;
+ ext4_group_t first_not_zeroed;
+
+ sbi = kzalloc(sizeof(*sbi), GFP_KERNEL);
+ if (!sbi)
+ goto out_free_orig;
+
+ sbi->s_blockgroup_lock =
+ kzalloc(sizeof(struct blockgroup_lock), GFP_KERNEL);
+ if (!sbi->s_blockgroup_lock) {
+ kfree(sbi);
+ goto out_free_orig;
+ }
+ sb->s_fs_info = sbi;
+ sbi->s_mount_opt = 0;
+ sbi->s_resuid = EXT4_DEF_RESUID;
+ sbi->s_resgid = EXT4_DEF_RESGID;
+ sbi->s_inode_readahead_blks = EXT4_DEF_INODE_READAHEAD_BLKS;
+ sbi->s_sb_block = sb_block;
+ if (sb->s_bdev->bd_part)
+ sbi->s_sectors_written_start =
+ part_stat_read(sb->s_bdev->bd_part, sectors[1]);
+
+ /* Cleanup superblock name */
+ for (cp = sb->s_id; (cp = strchr(cp, '/'));)
+ *cp = '!';
+
+ ret = -EINVAL;
+ blocksize = sb_min_blocksize(sb, EXT4_MIN_BLOCK_SIZE);
+ if (!blocksize) {
+ ext4_msg(sb, KERN_ERR, "unable to set blocksize");
+ goto out_fail;
+ }
+
+ /*
+ * The ext4 superblock will not be buffer aligned for other than 1kB
+ * block sizes. We need to calculate the offset from buffer start.
+ */
+ if (blocksize != EXT4_MIN_BLOCK_SIZE) {
+ logical_sb_block = sb_block * EXT4_MIN_BLOCK_SIZE;
+ offset = do_div(logical_sb_block, blocksize);
+ } else {
+ logical_sb_block = sb_block;
+ }
+
+ if (!(bh = sb_bread(sb, logical_sb_block))) {
+ ext4_msg(sb, KERN_ERR, "unable to read superblock");
+ goto out_fail;
+ }
+ /*
+ * Note: s_es must be initialized as soon as possible because
+ * some ext4 macro-instructions depend on its value
+ */
+ es = (struct ext4_super_block *) (((char *)bh->b_data) + offset);
+ sbi->s_es = es;
+ sb->s_magic = le16_to_cpu(es->s_magic);
+ if (sb->s_magic != EXT4_SUPER_MAGIC)
+ goto cantfind_ext4;
+ sbi->s_kbytes_written = le64_to_cpu(es->s_kbytes_written);
+
+ /* Set defaults before we parse the mount options */
+ def_mount_opts = le32_to_cpu(es->s_default_mount_opts);
+ set_opt(sb, INIT_INODE_TABLE);
+ if (def_mount_opts & EXT4_DEFM_DEBUG)
+ set_opt(sb, DEBUG);
+ if (def_mount_opts & EXT4_DEFM_BSDGROUPS)
+ set_opt(sb, GRPID);
+ if (def_mount_opts & EXT4_DEFM_UID16)
+ set_opt(sb, NO_UID32);
+ /* xattr user namespace & acls are now defaulted on */
+#ifdef CONFIG_EXT4_FS_XATTR
+ set_opt(sb, XATTR_USER);
+#endif
+#ifdef CONFIG_EXT4_FS_POSIX_ACL
+ set_opt(sb, POSIX_ACL);
+#endif
+ set_opt(sb, MBLK_IO_SUBMIT);
+ if ((def_mount_opts & EXT4_DEFM_JMODE) == EXT4_DEFM_JMODE_DATA)
+ set_opt(sb, JOURNAL_DATA);
+ else if ((def_mount_opts & EXT4_DEFM_JMODE) == EXT4_DEFM_JMODE_ORDERED)
+ set_opt(sb, ORDERED_DATA);
+ else if ((def_mount_opts & EXT4_DEFM_JMODE) == EXT4_DEFM_JMODE_WBACK)
+ set_opt(sb, WRITEBACK_DATA);
+
+ if (le16_to_cpu(sbi->s_es->s_errors) == EXT4_ERRORS_PANIC)
+ set_opt(sb, ERRORS_PANIC);
+ else if (le16_to_cpu(sbi->s_es->s_errors) == EXT4_ERRORS_CONTINUE)
+ set_opt(sb, ERRORS_CONT);
+ else
+ set_opt(sb, ERRORS_RO);
+ if (def_mount_opts & EXT4_DEFM_BLOCK_VALIDITY)
+ set_opt(sb, BLOCK_VALIDITY);
+ if (def_mount_opts & EXT4_DEFM_DISCARD)
+ set_opt(sb, DISCARD);
+
+ sbi->s_resuid = le16_to_cpu(es->s_def_resuid);
+ sbi->s_resgid = le16_to_cpu(es->s_def_resgid);
+ sbi->s_commit_interval = JBD2_DEFAULT_MAX_COMMIT_AGE * HZ;
+ sbi->s_min_batch_time = EXT4_DEF_MIN_BATCH_TIME;
+ sbi->s_max_batch_time = EXT4_DEF_MAX_BATCH_TIME;
+
+ if ((def_mount_opts & EXT4_DEFM_NOBARRIER) == 0)
+ set_opt(sb, BARRIER);
+
+ /*
+ * enable delayed allocation by default
+ * Use -o nodelalloc to turn it off
+ */
+ if (!IS_EXT3_SB(sb) &&
+ ((def_mount_opts & EXT4_DEFM_NODELALLOC) == 0))
+ set_opt(sb, DELALLOC);
+
+ /*
+ * set default s_li_wait_mult for lazyinit, for the case there is
+ * no mount option specified.
+ */
+ sbi->s_li_wait_mult = EXT4_DEF_LI_WAIT_MULT;
+
+ if (!parse_options((char *) sbi->s_es->s_mount_opts, sb,
+ &journal_devnum, &journal_ioprio, 0)) {
+ ext4_msg(sb, KERN_WARNING,
+ "failed to parse options in superblock: %s",
+ sbi->s_es->s_mount_opts);
+ }
+ sbi->s_def_mount_opt = sbi->s_mount_opt;
+ if (!parse_options((char *) data, sb, &journal_devnum,
+ &journal_ioprio, 0))
+ goto failed_mount;
+
+ if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA) {
+ printk_once(KERN_WARNING "EXT4-fs: Warning: mounting "
+ "with data=journal disables delayed "
+ "allocation and O_DIRECT support!\n");
+ if (test_opt2(sb, EXPLICIT_DELALLOC)) {
+ ext4_msg(sb, KERN_ERR, "can't mount with "
+ "both data=journal and delalloc");
+ goto failed_mount;
+ }
+ if (test_opt(sb, DIOREAD_NOLOCK)) {
+ ext4_msg(sb, KERN_ERR, "can't mount with "
+ "both data=journal and delalloc");
+ goto failed_mount;
+ }
+ if (test_opt(sb, DELALLOC))
+ clear_opt(sb, DELALLOC);
+ }
+
+ blocksize = BLOCK_SIZE << le32_to_cpu(es->s_log_block_size);
+ if (test_opt(sb, DIOREAD_NOLOCK)) {
+ if (blocksize < PAGE_SIZE) {
+ ext4_msg(sb, KERN_ERR, "can't mount with "
+ "dioread_nolock if block size != PAGE_SIZE");
+ goto failed_mount;
+ }
+ }
+
+ sb->s_flags = (sb->s_flags & ~MS_POSIXACL) |
+ (test_opt(sb, POSIX_ACL) ? MS_POSIXACL : 0);
+
+ if (le32_to_cpu(es->s_rev_level) == EXT4_GOOD_OLD_REV &&
+ (EXT4_HAS_COMPAT_FEATURE(sb, ~0U) ||
+ EXT4_HAS_RO_COMPAT_FEATURE(sb, ~0U) ||
+ EXT4_HAS_INCOMPAT_FEATURE(sb, ~0U)))
+ ext4_msg(sb, KERN_WARNING,
+ "feature flags set on rev 0 fs, "
+ "running e2fsck is recommended");
+
+ if (IS_EXT2_SB(sb)) {
+ if (ext2_feature_set_ok(sb))
+ ext4_msg(sb, KERN_INFO, "mounting ext2 file system "
+ "using the ext4 subsystem");
+ else {
+ ext4_msg(sb, KERN_ERR, "couldn't mount as ext2 due "
+ "to feature incompatibilities");
+ goto failed_mount;
+ }
+ }
+
+ if (IS_EXT3_SB(sb)) {
+ if (ext3_feature_set_ok(sb))
+ ext4_msg(sb, KERN_INFO, "mounting ext3 file system "
+ "using the ext4 subsystem");
+ else {
+ ext4_msg(sb, KERN_ERR, "couldn't mount as ext3 due "
+ "to feature incompatibilities");
+ goto failed_mount;
+ }
+ }
+
+ /*
+ * Check feature flags regardless of the revision level, since we
+ * previously didn't change the revision level when setting the flags,
+ * so there is a chance incompat flags are set on a rev 0 filesystem.
+ */
+ if (!ext4_feature_set_ok(sb, (sb->s_flags & MS_RDONLY)))
+ goto failed_mount;
+
+ if (blocksize < EXT4_MIN_BLOCK_SIZE ||
+ blocksize > EXT4_MAX_BLOCK_SIZE) {
+ ext4_msg(sb, KERN_ERR,
+ "Unsupported filesystem blocksize %d", blocksize);
+ goto failed_mount;
+ }
+
+ if (sb->s_blocksize != blocksize) {
+ /* Validate the filesystem blocksize */
+ if (!sb_set_blocksize(sb, blocksize)) {
+ ext4_msg(sb, KERN_ERR, "bad block size %d",
+ blocksize);
+ goto failed_mount;
+ }
+
+ brelse(bh);
+ logical_sb_block = sb_block * EXT4_MIN_BLOCK_SIZE;
+ offset = do_div(logical_sb_block, blocksize);
+ bh = sb_bread(sb, logical_sb_block);
+ if (!bh) {
+ ext4_msg(sb, KERN_ERR,
+ "Can't read superblock on 2nd try");
+ goto failed_mount;
+ }
+ es = (struct ext4_super_block *)(((char *)bh->b_data) + offset);
+ sbi->s_es = es;
+ if (es->s_magic != cpu_to_le16(EXT4_SUPER_MAGIC)) {
+ ext4_msg(sb, KERN_ERR,
+ "Magic mismatch, very weird!");
+ goto failed_mount;
+ }
+ }
+
+ has_huge_files = EXT4_HAS_RO_COMPAT_FEATURE(sb,
+ EXT4_FEATURE_RO_COMPAT_HUGE_FILE);
+ sbi->s_bitmap_maxbytes = ext4_max_bitmap_size(sb->s_blocksize_bits,
+ has_huge_files);
+ sb->s_maxbytes = ext4_max_size(sb->s_blocksize_bits, has_huge_files);
+
+ if (le32_to_cpu(es->s_rev_level) == EXT4_GOOD_OLD_REV) {
+ sbi->s_inode_size = EXT4_GOOD_OLD_INODE_SIZE;
+ sbi->s_first_ino = EXT4_GOOD_OLD_FIRST_INO;
+ } else {
+ sbi->s_inode_size = le16_to_cpu(es->s_inode_size);
+ sbi->s_first_ino = le32_to_cpu(es->s_first_ino);
+ if ((sbi->s_inode_size < EXT4_GOOD_OLD_INODE_SIZE) ||
+ (!is_power_of_2(sbi->s_inode_size)) ||
+ (sbi->s_inode_size > blocksize)) {
+ ext4_msg(sb, KERN_ERR,
+ "unsupported inode size: %d",
+ sbi->s_inode_size);
+ goto failed_mount;
+ }
+ if (sbi->s_inode_size > EXT4_GOOD_OLD_INODE_SIZE)
+ sb->s_time_gran = 1 << (EXT4_EPOCH_BITS - 2);
+ }
+
+ sbi->s_desc_size = le16_to_cpu(es->s_desc_size);
+ if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_64BIT)) {
+ if (sbi->s_desc_size < EXT4_MIN_DESC_SIZE_64BIT ||
+ sbi->s_desc_size > EXT4_MAX_DESC_SIZE ||
+ !is_power_of_2(sbi->s_desc_size)) {
+ ext4_msg(sb, KERN_ERR,
+ "unsupported descriptor size %lu",
+ sbi->s_desc_size);
+ goto failed_mount;
+ }
+ } else
+ sbi->s_desc_size = EXT4_MIN_DESC_SIZE;
+
+ sbi->s_blocks_per_group = le32_to_cpu(es->s_blocks_per_group);
+ sbi->s_inodes_per_group = le32_to_cpu(es->s_inodes_per_group);
+ if (EXT4_INODE_SIZE(sb) == 0 || EXT4_INODES_PER_GROUP(sb) == 0)
+ goto cantfind_ext4;
+
+ sbi->s_inodes_per_block = blocksize / EXT4_INODE_SIZE(sb);
+ if (sbi->s_inodes_per_block == 0)
+ goto cantfind_ext4;
+ sbi->s_itb_per_group = sbi->s_inodes_per_group /
+ sbi->s_inodes_per_block;
+ sbi->s_desc_per_block = blocksize / EXT4_DESC_SIZE(sb);
+ sbi->s_sbh = bh;
+ sbi->s_mount_state = le16_to_cpu(es->s_state);
+ sbi->s_addr_per_block_bits = ilog2(EXT4_ADDR_PER_BLOCK(sb));
+ sbi->s_desc_per_block_bits = ilog2(EXT4_DESC_PER_BLOCK(sb));
+
+ for (i = 0; i < 4; i++)
+ sbi->s_hash_seed[i] = le32_to_cpu(es->s_hash_seed[i]);
+ sbi->s_def_hash_version = es->s_def_hash_version;
+ i = le32_to_cpu(es->s_flags);
+ if (i & EXT2_FLAGS_UNSIGNED_HASH)
+ sbi->s_hash_unsigned = 3;
+ else if ((i & EXT2_FLAGS_SIGNED_HASH) == 0) {
+#ifdef __CHAR_UNSIGNED__
+ es->s_flags |= cpu_to_le32(EXT2_FLAGS_UNSIGNED_HASH);
+ sbi->s_hash_unsigned = 3;
+#else
+ es->s_flags |= cpu_to_le32(EXT2_FLAGS_SIGNED_HASH);
+#endif
+ }
+
+ /* Handle clustersize */
+ clustersize = BLOCK_SIZE << le32_to_cpu(es->s_log_cluster_size);
+ has_bigalloc = EXT4_HAS_RO_COMPAT_FEATURE(sb,
+ EXT4_FEATURE_RO_COMPAT_BIGALLOC);
+ if (has_bigalloc) {
+ if (clustersize < blocksize) {
+ ext4_msg(sb, KERN_ERR,
+ "cluster size (%d) smaller than "
+ "block size (%d)", clustersize, blocksize);
+ goto failed_mount;
+ }
+ sbi->s_cluster_bits = le32_to_cpu(es->s_log_cluster_size) -
+ le32_to_cpu(es->s_log_block_size);
+ sbi->s_clusters_per_group =
+ le32_to_cpu(es->s_clusters_per_group);
+ if (sbi->s_clusters_per_group > blocksize * 8) {
+ ext4_msg(sb, KERN_ERR,
+ "#clusters per group too big: %lu",
+ sbi->s_clusters_per_group);
+ goto failed_mount;
+ }
+ if (sbi->s_blocks_per_group !=
+ (sbi->s_clusters_per_group * (clustersize / blocksize))) {
+ ext4_msg(sb, KERN_ERR, "blocks per group (%lu) and "
+ "clusters per group (%lu) inconsistent",
+ sbi->s_blocks_per_group,
+ sbi->s_clusters_per_group);
+ goto failed_mount;
+ }
+ } else {
+ if (clustersize != blocksize) {
+ ext4_warning(sb, "fragment/cluster size (%d) != "
+ "block size (%d)", clustersize,
+ blocksize);
+ clustersize = blocksize;
+ }
+ if (sbi->s_blocks_per_group > blocksize * 8) {
+ ext4_msg(sb, KERN_ERR,
+ "#blocks per group too big: %lu",
+ sbi->s_blocks_per_group);
+ goto failed_mount;
+ }
+ sbi->s_clusters_per_group = sbi->s_blocks_per_group;
+ sbi->s_cluster_bits = 0;
+ }
+ sbi->s_cluster_ratio = clustersize / blocksize;
+
+ if (sbi->s_inodes_per_group > blocksize * 8) {
+ ext4_msg(sb, KERN_ERR,
+ "#inodes per group too big: %lu",
+ sbi->s_inodes_per_group);
+ goto failed_mount;
+ }
+
+ /*
+ * Test whether we have more sectors than will fit in sector_t,
+ * and whether the max offset is addressable by the page cache.
+ */
+ err = generic_check_addressable(sb->s_blocksize_bits,
+ ext4_blocks_count(es));
+ if (err) {
+ ext4_msg(sb, KERN_ERR, "filesystem"
+ " too large to mount safely on this system");
+ if (sizeof(sector_t) < 8)
+ ext4_msg(sb, KERN_WARNING, "CONFIG_LBDAF not enabled");
+ ret = err;
+ goto failed_mount;
+ }
+
+ if (EXT4_BLOCKS_PER_GROUP(sb) == 0)
+ goto cantfind_ext4;
+
+ /* check blocks count against device size */
+ blocks_count = sb->s_bdev->bd_inode->i_size >> sb->s_blocksize_bits;
+ if (blocks_count && ext4_blocks_count(es) > blocks_count) {
+ ext4_msg(sb, KERN_WARNING, "bad geometry: block count %llu "
+ "exceeds size of device (%llu blocks)",
+ ext4_blocks_count(es), blocks_count);
+ goto failed_mount;
+ }
+
+ /*
+ * It makes no sense for the first data block to be beyond the end
+ * of the filesystem.
+ */
+ if (le32_to_cpu(es->s_first_data_block) >= ext4_blocks_count(es)) {
+ ext4_msg(sb, KERN_WARNING, "bad geometry: first data "
+ "block %u is beyond end of filesystem (%llu)",
+ le32_to_cpu(es->s_first_data_block),
+ ext4_blocks_count(es));
+ goto failed_mount;
+ }
+ blocks_count = (ext4_blocks_count(es) -
+ le32_to_cpu(es->s_first_data_block) +
+ EXT4_BLOCKS_PER_GROUP(sb) - 1);
+ do_div(blocks_count, EXT4_BLOCKS_PER_GROUP(sb));
+ if (blocks_count > ((uint64_t)1<<32) - EXT4_DESC_PER_BLOCK(sb)) {
+ ext4_msg(sb, KERN_WARNING, "groups count too large: %u "
+ "(block count %llu, first data block %u, "
+ "blocks per group %lu)", sbi->s_groups_count,
+ ext4_blocks_count(es),
+ le32_to_cpu(es->s_first_data_block),
+ EXT4_BLOCKS_PER_GROUP(sb));
+ goto failed_mount;
+ }
+ sbi->s_groups_count = blocks_count;
+ sbi->s_blockfile_groups = min_t(ext4_group_t, sbi->s_groups_count,
+ (EXT4_MAX_BLOCK_FILE_PHYS / EXT4_BLOCKS_PER_GROUP(sb)));
+ db_count = (sbi->s_groups_count + EXT4_DESC_PER_BLOCK(sb) - 1) /
+ EXT4_DESC_PER_BLOCK(sb);
+ sbi->s_group_desc = ext4_kvmalloc(db_count *
+ sizeof(struct buffer_head *),
+ GFP_KERNEL);
+ if (sbi->s_group_desc == NULL) {
+ ext4_msg(sb, KERN_ERR, "not enough memory");
+ goto failed_mount;
+ }
+
+ if (ext4_proc_root)
+ sbi->s_proc = proc_mkdir(sb->s_id, ext4_proc_root);
+
+ if (sbi->s_proc)
+ proc_create_data("options", S_IRUGO, sbi->s_proc,
+ &ext4_seq_options_fops, sb);
+
+ bgl_lock_init(sbi->s_blockgroup_lock);
+
+ for (i = 0; i < db_count; i++) {
+ block = descriptor_loc(sb, logical_sb_block, i);
+ sbi->s_group_desc[i] = sb_bread(sb, block);
+ if (!sbi->s_group_desc[i]) {
+ ext4_msg(sb, KERN_ERR,
+ "can't read group descriptor %d", i);
+ db_count = i;
+ goto failed_mount2;
+ }
+ }
+ if (!ext4_check_descriptors(sb, &first_not_zeroed)) {
+ ext4_msg(sb, KERN_ERR, "group descriptors corrupted!");
+ goto failed_mount2;
+ }
+ if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_FLEX_BG))
+ if (!ext4_fill_flex_info(sb)) {
+ ext4_msg(sb, KERN_ERR,
+ "unable to initialize "
+ "flex_bg meta info!");
+ goto failed_mount2;
+ }
+
+ sbi->s_gdb_count = db_count;
+ get_random_bytes(&sbi->s_next_generation, sizeof(u32));
+ spin_lock_init(&sbi->s_next_gen_lock);
+
+ init_timer(&sbi->s_err_report);
+ sbi->s_err_report.function = print_daily_error_info;
+ sbi->s_err_report.data = (unsigned long) sb;
+
+ err = percpu_counter_init(&sbi->s_freeclusters_counter,
+ ext4_count_free_clusters(sb));
+ if (!err) {
+ err = percpu_counter_init(&sbi->s_freeinodes_counter,
+ ext4_count_free_inodes(sb));
+ }
+ if (!err) {
+ err = percpu_counter_init(&sbi->s_dirs_counter,
+ ext4_count_dirs(sb));
+ }
+ if (!err) {
+ err = percpu_counter_init(&sbi->s_dirtyclusters_counter, 0);
+ }
+ if (err) {
+ ext4_msg(sb, KERN_ERR, "insufficient memory");
+ goto failed_mount3;
+ }
+
+ sbi->s_stripe = ext4_get_stripe_size(sbi);
+ sbi->s_max_writeback_mb_bump = 128;
+
+ /*
+ * set up enough so that it can read an inode
+ */
+ if (!test_opt(sb, NOLOAD) &&
+ EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_HAS_JOURNAL))
+ sb->s_op = &ext4_sops;
+ else
+ sb->s_op = &ext4_nojournal_sops;
+ sb->s_export_op = &ext4_export_ops;
+ sb->s_xattr = ext4_xattr_handlers;
+#ifdef CONFIG_QUOTA
+ sb->s_qcop = &ext4_qctl_operations;
+ sb->dq_op = &ext4_quota_operations;
+#endif
+ memcpy(sb->s_uuid, es->s_uuid, sizeof(es->s_uuid));
+
+ INIT_LIST_HEAD(&sbi->s_orphan); /* unlinked but open files */
+ mutex_init(&sbi->s_orphan_lock);
+ sbi->s_resize_flags = 0;
+
+ sb->s_root = NULL;
+
+ needs_recovery = (es->s_last_orphan != 0 ||
+ EXT4_HAS_INCOMPAT_FEATURE(sb,
+ EXT4_FEATURE_INCOMPAT_RECOVER));
+
+ if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_MMP) &&
+ !(sb->s_flags & MS_RDONLY))
+ if (ext4_multi_mount_protect(sb, le64_to_cpu(es->s_mmp_block)))
+ goto failed_mount3;
+
+ /*
+ * The first inode we look at is the journal inode. Don't try
+ * root first: it may be modified in the journal!
+ */
+ if (!test_opt(sb, NOLOAD) &&
+ EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_HAS_JOURNAL)) {
+ if (ext4_load_journal(sb, es, journal_devnum))
+ goto failed_mount3;
+ } else if (test_opt(sb, NOLOAD) && !(sb->s_flags & MS_RDONLY) &&
+ EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER)) {
+ ext4_msg(sb, KERN_ERR, "required journal recovery "
+ "suppressed and not mounted read-only");
+ goto failed_mount_wq;
+ } else {
+ clear_opt(sb, DATA_FLAGS);
+ sbi->s_journal = NULL;
+ needs_recovery = 0;
+ goto no_journal;
+ }
+
+ if (ext4_blocks_count(es) > 0xffffffffULL &&
+ !jbd2_journal_set_features(EXT4_SB(sb)->s_journal, 0, 0,
+ JBD2_FEATURE_INCOMPAT_64BIT)) {
+ ext4_msg(sb, KERN_ERR, "Failed to set 64-bit journal feature");
+ goto failed_mount_wq;
+ }
+
+ if (test_opt(sb, JOURNAL_ASYNC_COMMIT)) {
+ jbd2_journal_set_features(sbi->s_journal,
+ JBD2_FEATURE_COMPAT_CHECKSUM, 0,
+ JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT);
+ } else if (test_opt(sb, JOURNAL_CHECKSUM)) {
+ jbd2_journal_set_features(sbi->s_journal,
+ JBD2_FEATURE_COMPAT_CHECKSUM, 0, 0);
+ jbd2_journal_clear_features(sbi->s_journal, 0, 0,
+ JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT);
+ } else {
+ jbd2_journal_clear_features(sbi->s_journal,
+ JBD2_FEATURE_COMPAT_CHECKSUM, 0,
+ JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT);
+ }
+
+ /* We have now updated the journal if required, so we can
+ * validate the data journaling mode. */
+ switch (test_opt(sb, DATA_FLAGS)) {
+ case 0:
+ /* No mode set, assume a default based on the journal
+ * capabilities: ORDERED_DATA if the journal can
+ * cope, else JOURNAL_DATA
+ */
+ if (jbd2_journal_check_available_features
+ (sbi->s_journal, 0, 0, JBD2_FEATURE_INCOMPAT_REVOKE))
+ set_opt(sb, ORDERED_DATA);
+ else
+ set_opt(sb, JOURNAL_DATA);
+ break;
+
+ case EXT4_MOUNT_ORDERED_DATA:
+ case EXT4_MOUNT_WRITEBACK_DATA:
+ if (!jbd2_journal_check_available_features
+ (sbi->s_journal, 0, 0, JBD2_FEATURE_INCOMPAT_REVOKE)) {
+ ext4_msg(sb, KERN_ERR, "Journal does not support "
+ "requested data journaling mode");
+ goto failed_mount_wq;
+ }
+ default:
+ break;
+ }
+ set_task_ioprio(sbi->s_journal->j_task, journal_ioprio);
+
+ sbi->s_journal->j_commit_callback = ext4_journal_commit_callback;
+
+ /*
+ * The journal may have updated the bg summary counts, so we
+ * need to update the global counters.
+ */
+ percpu_counter_set(&sbi->s_freeclusters_counter,
+ ext4_count_free_clusters(sb));
+ percpu_counter_set(&sbi->s_freeinodes_counter,
+ ext4_count_free_inodes(sb));
+ percpu_counter_set(&sbi->s_dirs_counter,
+ ext4_count_dirs(sb));
+ percpu_counter_set(&sbi->s_dirtyclusters_counter, 0);
+
+no_journal:
+ /*
+ * The maximum number of concurrent works can be high and
+ * concurrency isn't really necessary. Limit it to 1.
+ */
+ EXT4_SB(sb)->dio_unwritten_wq =
+ alloc_workqueue("ext4-dio-unwritten", WQ_MEM_RECLAIM | WQ_UNBOUND, 1);
+ if (!EXT4_SB(sb)->dio_unwritten_wq) {
+ printk(KERN_ERR "EXT4-fs: failed to create DIO workqueue\n");
+ goto failed_mount_wq;
+ }
+
+ /*
+ * The jbd2_journal_load will have done any necessary log recovery,
+ * so we can safely mount the rest of the filesystem now.
+ */
+
+ root = ext4_iget(sb, EXT4_ROOT_INO);
+ if (IS_ERR(root)) {
+ ext4_msg(sb, KERN_ERR, "get root inode failed");
+ ret = PTR_ERR(root);
+ root = NULL;
+ goto failed_mount4;
+ }
+ if (!S_ISDIR(root->i_mode) || !root->i_blocks || !root->i_size) {
+ ext4_msg(sb, KERN_ERR, "corrupt root inode, run e2fsck");
+ iput(root);
+ goto failed_mount4;
+ }
+ sb->s_root = d_make_root(root);
+ if (!sb->s_root) {
+ ext4_msg(sb, KERN_ERR, "get root dentry failed");
+ ret = -ENOMEM;
+ goto failed_mount4;
+ }
+
+ if (ext4_setup_super(sb, es, sb->s_flags & MS_RDONLY))
+ sb->s_flags |= MS_RDONLY;
+
+ /* determine the minimum size of new large inodes, if present */
+ if (sbi->s_inode_size > EXT4_GOOD_OLD_INODE_SIZE) {
+ sbi->s_want_extra_isize = sizeof(struct ext4_inode) -
+ EXT4_GOOD_OLD_INODE_SIZE;
+ if (EXT4_HAS_RO_COMPAT_FEATURE(sb,
+ EXT4_FEATURE_RO_COMPAT_EXTRA_ISIZE)) {
+ if (sbi->s_want_extra_isize <
+ le16_to_cpu(es->s_want_extra_isize))
+ sbi->s_want_extra_isize =
+ le16_to_cpu(es->s_want_extra_isize);
+ if (sbi->s_want_extra_isize <
+ le16_to_cpu(es->s_min_extra_isize))
+ sbi->s_want_extra_isize =
+ le16_to_cpu(es->s_min_extra_isize);
+ }
+ }
+ /* Check if enough inode space is available */
+ if (EXT4_GOOD_OLD_INODE_SIZE + sbi->s_want_extra_isize >
+ sbi->s_inode_size) {
+ sbi->s_want_extra_isize = sizeof(struct ext4_inode) -
+ EXT4_GOOD_OLD_INODE_SIZE;
+ ext4_msg(sb, KERN_INFO, "required extra inode space not"
+ "available");
+ }
+
+ err = ext4_setup_system_zone(sb);
+ if (err) {
+ ext4_msg(sb, KERN_ERR, "failed to initialize system "
+ "zone (%d)", err);
+ goto failed_mount4a;
+ }
+
+ ext4_ext_init(sb);
+ err = ext4_mb_init(sb, needs_recovery);
+ if (err) {
+ ext4_msg(sb, KERN_ERR, "failed to initialize mballoc (%d)",
+ err);
+ goto failed_mount5;
+ }
+
+ err = ext4_register_li_request(sb, first_not_zeroed);
+ if (err)
+ goto failed_mount6;
+
+ sbi->s_kobj.kset = ext4_kset;
+ init_completion(&sbi->s_kobj_unregister);
+ err = kobject_init_and_add(&sbi->s_kobj, &ext4_ktype, NULL,
+ "%s", sb->s_id);
+ if (err)
+ goto failed_mount7;
+
+ EXT4_SB(sb)->s_mount_state |= EXT4_ORPHAN_FS;
+ ext4_orphan_cleanup(sb, es);
+ EXT4_SB(sb)->s_mount_state &= ~EXT4_ORPHAN_FS;
+ if (needs_recovery) {
+ ext4_msg(sb, KERN_INFO, "recovery complete");
+ ext4_mark_recovery_complete(sb, es);
+ }
+ if (EXT4_SB(sb)->s_journal) {
+ if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA)
+ descr = " journalled data mode";
+ else if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_ORDERED_DATA)
+ descr = " ordered data mode";
+ else
+ descr = " writeback data mode";
+ } else
+ descr = "out journal";
+
+ ext4_msg(sb, KERN_INFO, "mounted filesystem with%s. "
+ "Opts: %s%s%s", descr, sbi->s_es->s_mount_opts,
+ *sbi->s_es->s_mount_opts ? "; " : "", orig_data);
+
+ if (es->s_error_count)
+ mod_timer(&sbi->s_err_report, jiffies + 300*HZ); /* 5 minutes */
+
+ kfree(orig_data);
+ return 0;
+
+cantfind_ext4:
+ if (!silent)
+ ext4_msg(sb, KERN_ERR, "VFS: Can't find ext4 filesystem");
+ goto failed_mount;
+
+failed_mount7:
+ ext4_unregister_li_request(sb);
+failed_mount6:
+ ext4_mb_release(sb);
+failed_mount5:
+ ext4_ext_release(sb);
+ ext4_release_system_zone(sb);
+failed_mount4a:
+ dput(sb->s_root);
+ sb->s_root = NULL;
+failed_mount4:
+ ext4_msg(sb, KERN_ERR, "mount failed");
+ destroy_workqueue(EXT4_SB(sb)->dio_unwritten_wq);
+failed_mount_wq:
+ if (sbi->s_journal) {
+ jbd2_journal_destroy(sbi->s_journal);
+ sbi->s_journal = NULL;
+ }
+failed_mount3:
+ del_timer(&sbi->s_err_report);
+ if (sbi->s_flex_groups)
+ ext4_kvfree(sbi->s_flex_groups);
+ percpu_counter_destroy(&sbi->s_freeclusters_counter);
+ percpu_counter_destroy(&sbi->s_freeinodes_counter);
+ percpu_counter_destroy(&sbi->s_dirs_counter);
+ percpu_counter_destroy(&sbi->s_dirtyclusters_counter);
+ if (sbi->s_mmp_tsk)
+ kthread_stop(sbi->s_mmp_tsk);
+failed_mount2:
+ for (i = 0; i < db_count; i++)
+ brelse(sbi->s_group_desc[i]);
+ ext4_kvfree(sbi->s_group_desc);
+failed_mount:
+ if (sbi->s_proc) {
+ remove_proc_entry("options", sbi->s_proc);
+ remove_proc_entry(sb->s_id, ext4_proc_root);
+ }
+#ifdef CONFIG_QUOTA
+ for (i = 0; i < MAXQUOTAS; i++)
+ kfree(sbi->s_qf_names[i]);
+#endif
+ ext4_blkdev_remove(sbi);
+ brelse(bh);
+out_fail:
+ sb->s_fs_info = NULL;
+ kfree(sbi->s_blockgroup_lock);
+ kfree(sbi);
+out_free_orig:
+ kfree(orig_data);
+ return ret;
+}
+
+/*
+ * Copy the per-filesystem journal tunables kept in the ext4 sb info into
+ * the journal itself.  This runs on the initial mount, once the journal
+ * has been initialised but before any recovery has been done, and again
+ * on every subsequent remount.
+ *
+ * @sb:      superblock whose mount options are consulted
+ * @journal: journal that receives the settings
+ */
+static void ext4_init_journal_params(struct super_block *sb, journal_t *journal)
+{
+	struct ext4_sb_info *info = EXT4_SB(sb);
+
+	/* Commit timing values are taken over as they are. */
+	journal->j_commit_interval = info->s_commit_interval;
+	journal->j_min_batch_time = info->s_min_batch_time;
+	journal->j_max_batch_time = info->s_max_batch_time;
+
+	/* j_flags is only modified with j_state_lock held for writing. */
+	write_lock(&journal->j_state_lock);
+
+	if (!test_opt(sb, BARRIER))
+		journal->j_flags &= ~JBD2_BARRIER;
+	else
+		journal->j_flags |= JBD2_BARRIER;
+
+	if (!test_opt(sb, DATA_ERR_ABORT))
+		journal->j_flags &= ~JBD2_ABORT_ON_SYNCDATA_ERR;
+	else
+		journal->j_flags |= JBD2_ABORT_ON_SYNCDATA_ERR;
+
+	write_unlock(&journal->j_state_lock);
+}
+
+/*
+ * Build a jbd2 journal on top of the inode numbered @journal_inum.
+ *
+ * Returns the new journal with j_private pointing at @sb and the mount's
+ * journal parameters applied, or NULL (after logging why) when the inode
+ * cannot be read, has no links left, is not a regular file, or
+ * jbd2_journal_init_inode() returns NULL.
+ */
+static journal_t *ext4_get_journal(struct super_block *sb,
+				   unsigned int journal_inum)
+{
+	struct inode *inode;
+	journal_t *jnl;
+
+	BUG_ON(!EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_HAS_JOURNAL));
+
+	/*
+	 * Check that a valid inode really exists on disk before using it:
+	 * bad things happen if we iget() an unused inode, because the
+	 * iput() that follows will try to delete it.
+	 */
+	inode = ext4_iget(sb, journal_inum);
+	if (IS_ERR(inode)) {
+		ext4_msg(sb, KERN_ERR, "no journal found");
+		return NULL;
+	}
+	if (inode->i_nlink == 0) {
+		/* Mark it bad first so that dropping it is harmless. */
+		make_bad_inode(inode);
+		iput(inode);
+		ext4_msg(sb, KERN_ERR, "journal inode is deleted");
+		return NULL;
+	}
+
+	jbd_debug(2, "Journal inode found at %p: %lld bytes\n",
+		  inode, inode->i_size);
+	if (!S_ISREG(inode->i_mode)) {
+		ext4_msg(sb, KERN_ERR, "invalid journal inode");
+		goto drop_inode;
+	}
+
+	jnl = jbd2_journal_init_inode(inode);
+	if (jnl == NULL) {
+		ext4_msg(sb, KERN_ERR, "Could not load journal inode");
+		goto drop_inode;
+	}
+
+	jnl->j_private = sb;
+	ext4_init_journal_params(sb, jnl);
+	return jnl;
+
+drop_inode:
+	iput(inode);
+	return NULL;
+}
+
+/*
+ * Open the external journal device @j_dev and build a jbd2 journal on it.
+ *
+ * The ext4 superblock found on the device is checked first: magic number,
+ * the JOURNAL_DEV incompat flag, and a UUID equal to this filesystem's
+ * s_journal_uuid.  The journal area is taken to begin in the block right
+ * after that superblock and to span ext4_blocks_count() of it.
+ *
+ * Returns the journal on success.  On any failure the reason is logged,
+ * everything acquired so far is released, and NULL is returned.
+ */
+static journal_t *ext4_get_dev_journal(struct super_block *sb,
+				       dev_t j_dev)
+{
+	struct block_device *jbdev;
+	struct ext4_super_block *jsb;
+	struct buffer_head *sb_bh;
+	journal_t *jnl;
+	ext4_fsblk_t sb_blk;
+	ext4_fsblk_t first;
+	ext4_fsblk_t nblocks;
+	unsigned long sb_off;
+	int fs_bsize, dev_bsize;
+
+	BUG_ON(!EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_HAS_JOURNAL));
+
+	jbdev = ext4_blkdev_get(j_dev, sb);
+	if (!jbdev)
+		return NULL;
+
+	fs_bsize = sb->s_blocksize;
+	dev_bsize = bdev_logical_block_size(jbdev);
+	if (fs_bsize < dev_bsize) {
+		ext4_msg(sb, KERN_ERR,
+			"blocksize too small for journal device");
+		goto out_bdev;
+	}
+
+	/* Block and in-block offset of byte EXT4_MIN_BLOCK_SIZE. */
+	sb_blk = EXT4_MIN_BLOCK_SIZE / fs_bsize;
+	sb_off = EXT4_MIN_BLOCK_SIZE % fs_bsize;
+	set_blocksize(jbdev, fs_bsize);
+	sb_bh = __bread(jbdev, sb_blk, fs_bsize);
+	if (!sb_bh) {
+		ext4_msg(sb, KERN_ERR, "couldn't read superblock of "
+		       "external journal");
+		goto out_bdev;
+	}
+
+	jsb = (struct ext4_super_block *) (((char *)sb_bh->b_data) + sb_off);
+	if (le16_to_cpu(jsb->s_magic) != EXT4_SUPER_MAGIC ||
+	    !(le32_to_cpu(jsb->s_feature_incompat) &
+	      EXT4_FEATURE_INCOMPAT_JOURNAL_DEV)) {
+		ext4_msg(sb, KERN_ERR, "external journal has "
+					"bad superblock");
+		goto out_bh;
+	}
+
+	if (memcmp(EXT4_SB(sb)->s_es->s_journal_uuid, jsb->s_uuid, 16) != 0) {
+		ext4_msg(sb, KERN_ERR, "journal UUID does not match");
+		goto out_bh;
+	}
+
+	nblocks = ext4_blocks_count(jsb);
+	first = sb_blk + 1;
+	brelse(sb_bh);	/* the superblock is not needed any more */
+
+	jnl = jbd2_journal_init_dev(jbdev, sb->s_bdev,
+				    first, nblocks, fs_bsize);
+	if (!jnl) {
+		ext4_msg(sb, KERN_ERR, "failed to create device journal");
+		goto out_bdev;
+	}
+	jnl->j_private = sb;
+
+	/* Read the journal superblock and wait until the read is done. */
+	ll_rw_block(READ, 1, &jnl->j_sb_buffer);
+	wait_on_buffer(jnl->j_sb_buffer);
+	if (!buffer_uptodate(jnl->j_sb_buffer)) {
+		ext4_msg(sb, KERN_ERR, "I/O error on journal device");
+		goto out_journal;
+	}
+	if (be32_to_cpu(jnl->j_superblock->s_nr_users) != 1) {
+		ext4_msg(sb, KERN_ERR, "External journal has more than one "
+					"user (unsupported) - %d",
+			be32_to_cpu(jnl->j_superblock->s_nr_users));
+		goto out_journal;
+	}
+
+	EXT4_SB(sb)->journal_bdev = jbdev;
+	ext4_init_journal_params(sb, jnl);
+	return jnl;
+
+out_journal:
+	jbd2_journal_destroy(jnl);
+	goto out_bdev;
+out_bh:
+	brelse(sb_bh);
+out_bdev:
+	ext4_blkdev_put(jbdev);
+	return NULL;
+}
+
+/*
+ * Locate the journal named by the superblock (internal inode or external
+ * device), load it through jbd2 and attach it to @sb.
+ *
+ * @sb:             filesystem being mounted
+ * @es:             its on-disk superblock
+ * @journal_devnum: device number to use for an external journal instead
+ *                  of the one recorded in @es; 0 means "use @es"
+ *
+ * Returns 0 on success, -EROFS when recovery is needed but the device is
+ * read-only, -EINVAL when no usable journal can be set up, or the error
+ * from jbd2_journal_wipe()/jbd2_journal_load().
+ */
+static int ext4_load_journal(struct super_block *sb,
+			     struct ext4_super_block *es,
+			     unsigned long journal_devnum)
+{
+	unsigned int ino = le32_to_cpu(es->s_journal_inum);
+	journal_t *jnl;
+	dev_t jdev;
+	int hw_readonly;
+	int rc = 0;
+
+	BUG_ON(!EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_HAS_JOURNAL));
+
+	if (!journal_devnum ||
+	    journal_devnum == le32_to_cpu(es->s_journal_dev)) {
+		jdev = new_decode_dev(le32_to_cpu(es->s_journal_dev));
+	} else {
+		ext4_msg(sb, KERN_INFO, "external journal device major/minor "
+			"numbers have changed");
+		jdev = new_decode_dev(journal_devnum);
+	}
+
+	hw_readonly = bdev_read_only(sb->s_bdev);
+
+	/*
+	 * Are we loading a blank journal or performing recovery after a
+	 * crash?  Recovery writes to the device, so find out up front
+	 * whether read-write access can be had at all.
+	 */
+	if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER) &&
+	    (sb->s_flags & MS_RDONLY)) {
+		ext4_msg(sb, KERN_INFO, "INFO: recovery "
+				"required on readonly filesystem");
+		if (hw_readonly) {
+			ext4_msg(sb, KERN_ERR, "write access "
+				"unavailable, cannot proceed");
+			return -EROFS;
+		}
+		ext4_msg(sb, KERN_INFO, "write access will "
+		       "be enabled during recovery");
+	}
+
+	if (ino && jdev) {
+		ext4_msg(sb, KERN_ERR, "filesystem has both journal "
+		       "and inode journals!");
+		return -EINVAL;
+	}
+
+	if (ino)
+		jnl = ext4_get_journal(sb, ino);
+	else
+		jnl = ext4_get_dev_journal(sb, jdev);
+	if (!jnl)
+		return -EINVAL;
+
+	if (!(jnl->j_flags & JBD2_BARRIER))
+		ext4_msg(sb, KERN_INFO, "barriers disabled");
+
+	/* Without the RECOVER flag the journal is wiped before loading. */
+	if (!EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER))
+		rc = jbd2_journal_wipe(jnl, !hw_readonly);
+	if (rc == 0) {
+		/*
+		 * The error-info area of the superblock is copied aside
+		 * before the load and copied back afterwards (skipped if
+		 * the scratch buffer cannot be allocated) -- presumably so
+		 * that the load cannot overwrite it.
+		 */
+		char *err_area = ((char *) es) + EXT4_S_ERR_START;
+		char *saved = kmalloc(EXT4_S_ERR_LEN, GFP_KERNEL);
+
+		if (saved)
+			memcpy(saved, err_area, EXT4_S_ERR_LEN);
+		rc = jbd2_journal_load(jnl);
+		if (saved)
+			memcpy(err_area, saved, EXT4_S_ERR_LEN);
+		kfree(saved);
+	}
+
+	if (rc) {
+		ext4_msg(sb, KERN_ERR, "error loading journal");
+		jbd2_journal_destroy(jnl);
+		return rc;
+	}
+
+	EXT4_SB(sb)->s_journal = jnl;
+	ext4_clear_journal_err(sb, es);
+
+	/* s_journal_dev is deliberately re-read here, after the load. */
+	if (!hw_readonly && journal_devnum &&
+	    journal_devnum != le32_to_cpu(es->s_journal_dev)) {
+		es->s_journal_dev = cpu_to_le32(journal_devnum);
+
+		/* Make sure we flush the recovery flag to disk. */
+		ext4_commit_super(sb, 1);
+	}
+
+	return 0;
+}
+
+/*
+ * Refresh the volatile fields of the in-memory superblock (write time,
+ * kbytes written, free block and free inode counts) and mark its buffer
+ * dirty.  When @sync is non-zero the buffer is also written out
+ * synchronously.
+ *
+ * Returns 0 on success and also when there is nothing to do (no
+ * superblock buffer, or block_device_ejected() is true); otherwise the
+ * error from sync_dirty_buffer(), or -EIO when the buffer carries a write
+ * I/O error after the synchronous write.
+ */
+static int ext4_commit_super(struct super_block *sb, int sync)
+{
+	struct ext4_super_block *es = EXT4_SB(sb)->s_es;
+	struct buffer_head *sbh = EXT4_SB(sb)->s_sbh;
+	int error = 0;
+
+	if (!sbh || block_device_ejected(sb))
+		return error;
+	if (buffer_write_io_error(sbh)) {
+		/*
+		 * Oh, dear.  A previous attempt to write the
+		 * superblock failed.  This could happen because the
+		 * USB device was yanked out.  Or it could happen to
+		 * be a transient write error and maybe the block will
+		 * be remapped.  Nothing we can do but to retry the
+		 * write and hope for the best.
+		 */
+		ext4_msg(sb, KERN_ERR, "previous I/O error to "
+		       "superblock detected");
+		clear_buffer_write_io_error(sbh);
+		set_buffer_uptodate(sbh);
+	}
+	/*
+	 * If the file system is mounted read-only, don't update the
+	 * superblock write time.  This avoids updating the superblock
+	 * write time when we are mounting the root file system
+	 * read/only but we need to replay the journal; at that point,
+	 * for people who are east of GMT and who make their clock
+	 * tick in localtime for Windows bug-for-bug compatibility,
+	 * the clock is set in the future, and this will cause e2fsck
+	 * to complain and force a full file system check.
+	 */
+	if (!(sb->s_flags & MS_RDONLY))
+		es->s_wtime = cpu_to_le32(get_seconds());
+	/*
+	 * With partition statistics available, add the sectors written
+	 * since s_sectors_written_start (halved, i.e. in KiB for 512-byte
+	 * sectors) to the saved total; otherwise store the saved total.
+	 */
+	if (sb->s_bdev->bd_part)
+		es->s_kbytes_written =
+			cpu_to_le64(EXT4_SB(sb)->s_kbytes_written +
+			    ((part_stat_read(sb->s_bdev->bd_part, sectors[1]) -
+			      EXT4_SB(sb)->s_sectors_written_start) >> 1));
+	else
+		es->s_kbytes_written =
+			cpu_to_le64(EXT4_SB(sb)->s_kbytes_written);
+	ext4_free_blocks_count_set(es,
+			EXT4_C2B(EXT4_SB(sb), percpu_counter_sum_positive(
+				&EXT4_SB(sb)->s_freeclusters_counter)));
+	es->s_free_inodes_count =
+		cpu_to_le32(percpu_counter_sum_positive(
+				&EXT4_SB(sb)->s_freeinodes_counter));
+	sb->s_dirt = 0;
+	BUFFER_TRACE(sbh, "marking dirty");
+	mark_buffer_dirty(sbh);
+	if (sync) {
+		error = sync_dirty_buffer(sbh);
+		if (error)
+			return error;
+
+		/*
+		 * buffer_write_io_error() is a flag test, not an errno:
+		 * report the failure as -EIO instead of handing the raw
+		 * positive test result back to callers such as
+		 * ext4_freeze() that return it unchanged.
+		 */
+		if (buffer_write_io_error(sbh)) {
+			ext4_msg(sb, KERN_ERR, "I/O error while writing "
+			       "superblock");
+			clear_buffer_write_io_error(sbh);
+			set_buffer_uptodate(sbh);
+			error = -EIO;
+		}
+	}
+	return error;
+}
+
+/*
+ * Called once recovery has finished.  If the filesystem is mounted (or
+ * being remounted) read-only, the journal is flushed and the RECOVER
+ * flag dropped, so that what is on disk is a consistent filesystem.
+ *
+ * Nothing is changed when the journal cannot be flushed.  A filesystem
+ * without the HAS_JOURNAL feature must not have a journal attached.
+ */
+static void ext4_mark_recovery_complete(struct super_block *sb,
+					struct ext4_super_block *es)
+{
+	journal_t *jnl = EXT4_SB(sb)->s_journal;
+
+	if (!EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_HAS_JOURNAL)) {
+		BUG_ON(jnl != NULL);
+		return;
+	}
+
+	/* Hold off new updates while the journal is being flushed. */
+	jbd2_journal_lock_updates(jnl);
+
+	if (jbd2_journal_flush(jnl) >= 0 &&
+	    EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER) &&
+	    (sb->s_flags & MS_RDONLY)) {
+		EXT4_CLEAR_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER);
+		ext4_commit_super(sb, 1);
+	}
+
+	jbd2_journal_unlock_updates(jnl);
+}
+
+/*
+ * When mounting (or remounting read-write) a filesystem whose journal
+ * still holds an error recorded during a previous lifetime, transfer
+ * that error to the main filesystem: flag the fs as having errors, write
+ * the superblock, then clear the error in the journal.
+ */
+static void ext4_clear_journal_err(struct super_block *sb,
+				   struct ext4_super_block *es)
+{
+	journal_t *jnl;
+	const char *reason;
+	char nbuf[16];
+	int jerr;
+
+	BUG_ON(!EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_HAS_JOURNAL));
+
+	jnl = EXT4_SB(sb)->s_journal;
+
+	/*
+	 * Look for an error status that a prior ext4_error() or
+	 * ext4_abort() may have left in the journal.
+	 */
+	jerr = jbd2_journal_errno(jnl);
+	if (!jerr)
+		return;
+
+	reason = ext4_decode_error(sb, jerr, nbuf);
+	ext4_warning(sb, "Filesystem error recorded "
+		     "from previous mount: %s", reason);
+	ext4_warning(sb, "Marking fs in need of filesystem check.");
+
+	/* Record the error both in memory and in the on-disk state. */
+	EXT4_SB(sb)->s_mount_state |= EXT4_ERROR_FS;
+	es->s_state |= cpu_to_le16(EXT4_ERROR_FS);
+	ext4_commit_super(sb, 1);
+
+	jbd2_journal_clear_err(jnl);
+}
+
+/*
+ * Make the running and committing transactions commit, and wait for
+ * that commit to complete.
+ *
+ * Returns 0 straight away for a read-only mount or when no journal is
+ * attached; otherwise the result of ext4_journal_force_commit().
+ */
+int ext4_force_commit(struct super_block *sb)
+{
+	journal_t *jnl;
+
+	if (sb->s_flags & MS_RDONLY)
+		return 0;
+
+	jnl = EXT4_SB(sb)->s_journal;
+	if (!jnl)
+		return 0;
+
+	vfs_check_frozen(sb, SB_FREEZE_TRANS);
+	return ext4_journal_force_commit(jnl);
+}
+
+/*
+ * Write the superblock out synchronously while holding the superblock
+ * lock.  The result of ext4_commit_super() is not reported.
+ */
+static void ext4_write_super(struct super_block *sb)
+{
+	lock_super(sb);
+	ext4_commit_super(sb, 1);
+	unlock_super(sb);
+}
+
+/*
+ * Sync the filesystem: drain the DIO unwritten-extent workqueue and ask
+ * the journal to start committing.
+ *
+ * @sb:   filesystem to sync
+ * @wait: when non-zero, wait for the commit that was started
+ *
+ * Returns 0, or the result of jbd2_log_wait_commit() when a commit was
+ * started and @wait is set, so that a failed commit is no longer
+ * reported as success.
+ */
+static int ext4_sync_fs(struct super_block *sb, int wait)
+{
+	int ret = 0;
+	tid_t target;
+	struct ext4_sb_info *sbi = EXT4_SB(sb);
+
+	trace_ext4_sync_fs(sb, wait);
+	flush_workqueue(sbi->dio_unwritten_wq);
+	/* A non-zero return means a commit was started; its tid is in target. */
+	if (jbd2_journal_start_commit(sbi->s_journal, &target)) {
+		if (wait)
+			ret = jbd2_log_wait_commit(sbi->s_journal, target);
+	}
+	return ret;
+}
+
+/*
+ * LVM calls this before a (read-only) snapshot is taken, which gives us
+ * the chance to flush the journal completely and mark the fs clean.
+ *
+ * This function alone cannot bring the filesystem into a clean state:
+ * ext4 keeps new handles from starting by means of @sb->s_frozen, which
+ * is maintained by an upper layer, so that layer has to cooperate.
+ *
+ * Returns 0 for a read-only mount, a negative value when the journal
+ * flush fails, otherwise the result of writing the superblock.
+ */
+static int ext4_freeze(struct super_block *sb)
+{
+	journal_t *jnl;
+	int rc;
+
+	if (sb->s_flags & MS_RDONLY)
+		return 0;
+
+	jnl = EXT4_SB(sb)->s_journal;
+
+	/* Set up the journal barrier. */
+	jbd2_journal_lock_updates(jnl);
+
+	/*
+	 * The needs_recovery flag is cleared only if the journal could be
+	 * flushed; with the journal blocked and flushed it is safe to do.
+	 */
+	rc = jbd2_journal_flush(jnl);
+	if (rc >= 0) {
+		EXT4_CLEAR_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER);
+		rc = ext4_commit_super(sb, 1);
+	}
+
+	/* we rely on s_frozen to stop further updates */
+	jbd2_journal_unlock_updates(jnl);
+	return rc;
+}
+
+/*
+ * LVM calls this once the snapshot has been taken.  The RECOVER flag has
+ * to be set again at this point, even though the filesystem is not
+ * technically dirty yet.  Always returns 0.
+ */
+static int ext4_unfreeze(struct super_block *sb)
+{
+	if (!(sb->s_flags & MS_RDONLY)) {
+		lock_super(sb);
+		/* Restore needs_recovery before the fs is unlocked. */
+		EXT4_SET_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER);
+		ext4_commit_super(sb, 1);
+		unlock_super(sb);
+	}
+
+	return 0;
+}
+
+/*
+ * Snapshot of the mount options that ext4_remount() saves on entry and
+ * puts back if the remount fails.
+ */
+struct ext4_mount_options {
+	unsigned long s_mount_opt;	/* copy of sbi->s_mount_opt */
+	unsigned long s_mount_opt2;	/* copy of sbi->s_mount_opt2 */
+	uid_t s_resuid;
+	gid_t s_resgid;
+	unsigned long s_commit_interval;
+	u32 s_min_batch_time;
+	u32 s_max_batch_time;
+#ifdef CONFIG_QUOTA
+	int s_jquota_fmt;
+	char *s_qf_names[MAXQUOTAS];	/* saved pointers, not copies */
+#endif
+};
+
+static int ext4_remount(struct super_block *sb, int *flags, char *data)
+{ /* Apply the options in data and the flags in *flags to a mounted fs; returns 0 or a negative errno. */
+ struct ext4_super_block *es;
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
+ unsigned long old_sb_flags;
+ struct ext4_mount_options old_opts;
+ int enable_quota = 0;
+ ext4_group_t g;
+ unsigned int journal_ioprio = DEFAULT_JOURNAL_IOPRIO;
+ int err = 0;
+#ifdef CONFIG_QUOTA
+ int i;
+#endif
+ char *orig_data = kstrdup(data, GFP_KERNEL); /* copy of the option string, only used for the log message at the end */
+
+ /*
+ * Store the original options: sb->s_flags and the sbi option fields
+ * are saved here so that the restore_opts path at the bottom can put
+ * them back if the remount fails. The superblock lock is held from
+ * here until either exit path releases it.
+ */
+ lock_super(sb);
+ old_sb_flags = sb->s_flags;
+ old_opts.s_mount_opt = sbi->s_mount_opt;
+ old_opts.s_mount_opt2 = sbi->s_mount_opt2;
+ old_opts.s_resuid = sbi->s_resuid;
+ old_opts.s_resgid = sbi->s_resgid;
+ old_opts.s_commit_interval = sbi->s_commit_interval;
+ old_opts.s_min_batch_time = sbi->s_min_batch_time;
+ old_opts.s_max_batch_time = sbi->s_max_batch_time;
+#ifdef CONFIG_QUOTA
+ old_opts.s_jquota_fmt = sbi->s_jquota_fmt;
+ for (i = 0; i < MAXQUOTAS; i++)
+ old_opts.s_qf_names[i] = sbi->s_qf_names[i];
+#endif
+ if (sbi->s_journal && sbi->s_journal->j_task->io_context)
+ journal_ioprio = sbi->s_journal->j_task->io_context->ioprio; /* start from the journal task's current I/O priority */
+
+ /*
+ * Allow the "check" option to be passed as a remount option.
+ */
+ if (!parse_options(data, sb, NULL, &journal_ioprio, 1)) {
+ err = -EINVAL;
+ goto restore_opts;
+ }
+
+ if (sbi->s_mount_flags & EXT4_MF_FS_ABORTED)
+ ext4_abort(sb, "Abort forced by user");
+
+ sb->s_flags = (sb->s_flags & ~MS_POSIXACL) |
+ (test_opt(sb, POSIX_ACL) ? MS_POSIXACL : 0); /* keep MS_POSIXACL in step with the POSIX_ACL mount option */
+
+ es = sbi->s_es;
+
+ if (sbi->s_journal) {
+ ext4_init_journal_params(sb, sbi->s_journal);
+ set_task_ioprio(sbi->s_journal->j_task, journal_ioprio);
+ }
+
+ if ((*flags & MS_RDONLY) != (sb->s_flags & MS_RDONLY)) { /* the read-only state is being changed */
+ if (sbi->s_mount_flags & EXT4_MF_FS_ABORTED) {
+ err = -EROFS;
+ goto restore_opts;
+ }
+
+ if (*flags & MS_RDONLY) { /* read-write -> read-only */
+ err = dquot_suspend(sb, -1);
+ if (err < 0)
+ goto restore_opts;
+
+ /*
+ * First of all, the unconditional stuff we have to do
+ * to disable replay of the journal when we next remount
+ */
+ sb->s_flags |= MS_RDONLY;
+
+ /*
+ * OK, test if we are remounting a valid rw partition
+ * readonly, and if so set the rdonly flag and then
+ * mark the partition as valid again.
+ */
+ if (!(es->s_state & cpu_to_le16(EXT4_VALID_FS)) &&
+ (sbi->s_mount_state & EXT4_VALID_FS))
+ es->s_state = cpu_to_le16(sbi->s_mount_state);
+
+ if (sbi->s_journal)
+ ext4_mark_recovery_complete(sb, es);
+ } else { /* read-only -> read-write */
+ /* Make sure we can mount this feature set readwrite */
+ if (!ext4_feature_set_ok(sb, 0)) {
+ err = -EROFS;
+ goto restore_opts;
+ }
+ /*
+ * Make sure the group descriptor checksums
+ * are sane. If they aren't, refuse to remount r/w.
+ */
+ for (g = 0; g < sbi->s_groups_count; g++) {
+ struct ext4_group_desc *gdp =
+ ext4_get_group_desc(sb, g, NULL);
+
+ if (!ext4_group_desc_csum_verify(sbi, g, gdp)) {
+ ext4_msg(sb, KERN_ERR,
+ "ext4_remount: Checksum for group %u failed (%u!=%u)",
+ g, le16_to_cpu(ext4_group_desc_csum(sbi, g, gdp)),
+ le16_to_cpu(gdp->bg_checksum));
+ err = -EINVAL;
+ goto restore_opts;
+ }
+ }
+
+ /*
+ * If we have an unprocessed orphan list hanging
+ * around from a previously readonly bdev mount,
+ * require a full umount/remount for now.
+ */
+ if (es->s_last_orphan) {
+ ext4_msg(sb, KERN_WARNING, "Couldn't "
+ "remount RDWR because of unprocessed "
+ "orphan inode list. Please "
+ "umount/remount instead");
+ err = -EINVAL;
+ goto restore_opts;
+ }
+
+ /*
+ * Mounting a RDONLY partition read-write, so reread
+ * and store the current valid flag. (It may have
+ * been changed by e2fsck since we originally mounted
+ * the partition.)
+ */
+ if (sbi->s_journal)
+ ext4_clear_journal_err(sb, es);
+ sbi->s_mount_state = le16_to_cpu(es->s_state);
+ if (!ext4_setup_super(sb, es, 0))
+ sb->s_flags &= ~MS_RDONLY; /* MS_RDONLY is dropped only when ext4_setup_super() returns 0 */
+ if (EXT4_HAS_INCOMPAT_FEATURE(sb,
+ EXT4_FEATURE_INCOMPAT_MMP))
+ if (ext4_multi_mount_protect(sb,
+ le64_to_cpu(es->s_mmp_block))) {
+ err = -EROFS;
+ goto restore_opts;
+ }
+ enable_quota = 1; /* quotas are resumed after the lock is dropped */
+ }
+ }
+
+ /*
+ * Reinitialize lazy itable initialization thread based on
+ * current settings
+ */
+ if ((sb->s_flags & MS_RDONLY) || !test_opt(sb, INIT_INODE_TABLE))
+ ext4_unregister_li_request(sb);
+ else {
+ ext4_group_t first_not_zeroed;
+ first_not_zeroed = ext4_has_uninit_itable(sb);
+ ext4_register_li_request(sb, first_not_zeroed); /* NOTE(review): return value is not checked here */
+ }
+
+ ext4_setup_system_zone(sb); /* NOTE(review): return value is not checked here */
+ if (sbi->s_journal == NULL)
+ ext4_commit_super(sb, 1); /* no journal attached: write the superblock directly */
+
+#ifdef CONFIG_QUOTA
+ /* Release old quota file names */
+ for (i = 0; i < MAXQUOTAS; i++)
+ if (old_opts.s_qf_names[i] &&
+ old_opts.s_qf_names[i] != sbi->s_qf_names[i])
+ kfree(old_opts.s_qf_names[i]);
+#endif
+ unlock_super(sb);
+ if (enable_quota)
+ dquot_resume(sb, -1);
+
+ ext4_msg(sb, KERN_INFO, "re-mounted. Opts: %s", orig_data);
+ kfree(orig_data);
+ return 0;
+
+restore_opts: /* failure path: put back everything saved at entry and return err */
+ sb->s_flags = old_sb_flags;
+ sbi->s_mount_opt = old_opts.s_mount_opt;
+ sbi->s_mount_opt2 = old_opts.s_mount_opt2;
+ sbi->s_resuid = old_opts.s_resuid;
+ sbi->s_resgid = old_opts.s_resgid;
+ sbi->s_commit_interval = old_opts.s_commit_interval;
+ sbi->s_min_batch_time = old_opts.s_min_batch_time;
+ sbi->s_max_batch_time = old_opts.s_max_batch_time;
+#ifdef CONFIG_QUOTA
+ sbi->s_jquota_fmt = old_opts.s_jquota_fmt;
+ for (i = 0; i < MAXQUOTAS; i++) {
+ if (sbi->s_qf_names[i] &&
+ old_opts.s_qf_names[i] != sbi->s_qf_names[i])
+ kfree(sbi->s_qf_names[i]); /* free a name that differs from the saved one before restoring the saved pointer */
+ sbi->s_qf_names[i] = old_opts.s_qf_names[i];
+ }
+#endif
+ unlock_super(sb);
+ kfree(orig_data);
+ return err;
+}
+
+/*
+ * ext4_statfs() - fill in @buf with usage figures for the file system
+ * that holds @dentry.
+ *
+ * Reporting "overhead" the way historical BSD practice expects is hard
+ * with clusters/bigalloc, because metadata blocks that belong to
+ * different block groups can end up in the same allocation cluster.  An
+ * exact figure would need either O(all block bitmaps) of memory or
+ * O(number of block groups**2) of time.  The policy is therefore:
+ *
+ *  - with the MINIX_DF mount option no overhead is reported;
+ *  - a non-zero s_overhead_clusters in the superblock is used as-is
+ *    (even on non-bigalloc file systems);
+ *  - otherwise the overhead is summed over all block groups and cached
+ *    until the block count changes.  On a bigalloc file system this is
+ *    only an estimate, but it is close, especially for very large
+ *    cluster sizes.
+ *
+ * Newer file systems are better served by computing the figure once at
+ * mkfs time and storing it in the superblock.
+ *
+ * Always returns 0.
+ */
+static int ext4_statfs(struct dentry *dentry, struct kstatfs *buf)
+{
+	struct super_block *sb = dentry->d_sb;
+	struct ext4_sb_info *sbi = EXT4_SB(sb);
+	struct ext4_super_block *es = sbi->s_es;
+	ext4_fsblk_t reserved;
+	s64 free_clusters;
+	u64 fsid;
+
+	if (test_opt(sb, MINIX_DF)) {
+		sbi->s_overhead_last = 0;
+	} else if (es->s_overhead_clusters) {
+		sbi->s_overhead_last = le32_to_cpu(es->s_overhead_clusters);
+	} else if (sbi->s_blocks_last != ext4_blocks_count(es)) {
+		ext4_group_t grp, ngroups = ext4_get_groups_count(sb);
+		ext4_fsblk_t overhead;
+
+		/*
+		 * The overhead (FS structures) only changes when the
+		 * number of block groups does, so the result is cached
+		 * and keyed on the block count.
+		 */
+
+		/* Everything in front of first_data_block is overhead. */
+		overhead = EXT4_B2C(sbi, le32_to_cpu(es->s_first_data_block));
+
+		/* Plus whatever each block group contributes. */
+		for (grp = 0; grp < ngroups; grp++) {
+			overhead += ext4_num_overhead_clusters(sb, grp,
+					ext4_get_group_desc(sb, grp, NULL));
+			cond_resched();
+		}
+
+		/* Publish the overhead before the key that validates it. */
+		sbi->s_overhead_last = overhead;
+		smp_wmb();
+		sbi->s_blocks_last = ext4_blocks_count(es);
+	}
+
+	buf->f_type = EXT4_SUPER_MAGIC;
+	buf->f_bsize = sb->s_blocksize;
+	buf->f_blocks = ext4_blocks_count(es) -
+			EXT4_C2B(sbi, sbi->s_overhead_last);
+
+	free_clusters =
+		percpu_counter_sum_positive(&sbi->s_freeclusters_counter) -
+		percpu_counter_sum_positive(&sbi->s_dirtyclusters_counter);
+	/* Guard against underflow when very little free space is left. */
+	buf->f_bfree = EXT4_C2B(sbi, max_t(s64, free_clusters, 0));
+
+	/* Blocks available to ordinary users exclude the reserved pool. */
+	reserved = ext4_r_blocks_count(es);
+	if (buf->f_bfree < reserved)
+		buf->f_bavail = 0;
+	else
+		buf->f_bavail = buf->f_bfree - reserved;
+
+	buf->f_files = le32_to_cpu(es->s_inodes_count);
+	buf->f_ffree = percpu_counter_sum_positive(&sbi->s_freeinodes_counter);
+	buf->f_namelen = EXT4_NAME_LEN;
+
+	/* Fold the 128-bit UUID into a 64-bit file system id. */
+	fsid = le64_to_cpup((void *)es->s_uuid) ^
+	       le64_to_cpup((void *)es->s_uuid + sizeof(u64));
+	buf->f_fsid.val[0] = fsid & 0xFFFFFFFFUL;
+	buf->f_fsid.val[1] = (fsid >> 32) & 0xFFFFFFFFUL;
+
+	return 0;
+}
+
+/* Helper function for writing quotas on sync - we need to start transaction
+ * before quota file is locked for write. Otherwise there are possible deadlocks:
+ * Process 1 Process 2
+ * ext4_create() quota_sync()
+ * jbd2_journal_start() write_dquot()
+ * dquot_initialize() down(dqio_mutex)
+ * down(dqio_mutex) jbd2_journal_start()
+ *
+ */
+
+#ifdef CONFIG_QUOTA
+
+/*
+ * Return the quota-file inode recorded in the superblock's quota options
+ * for the quota type of @dquot.
+ */
+static inline struct inode *dquot_to_inode(struct dquot *dquot)
+{
+ return sb_dqopt(dquot->dq_sb)->files[dquot->dq_type];
+}
+
+/*
+ * Commit @dquot to its quota file inside a journal handle sized by
+ * EXT4_QUOTA_TRANS_BLOCKS().
+ *
+ * Returns the error from starting the handle, else the error from
+ * dquot_commit(), else the result of stopping the handle.
+ */
+static int ext4_write_dquot(struct dquot *dquot)
+{
+	struct inode *inode = dquot_to_inode(dquot);
+	handle_t *handle;
+	int rc, stop_err;
+
+	handle = ext4_journal_start(inode,
+				    EXT4_QUOTA_TRANS_BLOCKS(dquot->dq_sb));
+	if (IS_ERR(handle))
+		return PTR_ERR(handle);
+
+	rc = dquot_commit(dquot);
+	stop_err = ext4_journal_stop(handle);
+
+	/* A commit failure takes precedence over a journal-stop failure. */
+	return rc ? rc : stop_err;
+}
+
+/*
+ * Acquire @dquot inside a journal handle sized by
+ * EXT4_QUOTA_INIT_BLOCKS().
+ *
+ * Returns the error from starting the handle, else the error from
+ * dquot_acquire(), else the result of stopping the handle.
+ */
+static int ext4_acquire_dquot(struct dquot *dquot)
+{
+	handle_t *handle;
+	int rc, stop_err;
+
+	handle = ext4_journal_start(dquot_to_inode(dquot),
+				    EXT4_QUOTA_INIT_BLOCKS(dquot->dq_sb));
+	if (IS_ERR(handle))
+		return PTR_ERR(handle);
+
+	rc = dquot_acquire(dquot);
+	stop_err = ext4_journal_stop(handle);
+
+	return rc ? rc : stop_err;
+}
+
+/*
+ * Release @dquot inside a journal handle sized by
+ * EXT4_QUOTA_DEL_BLOCKS().
+ *
+ * If the handle cannot be started the dquot is released anyway and the
+ * handle error is returned.  Otherwise returns the error from
+ * dquot_release(), else the result of stopping the handle.
+ */
+static int ext4_release_dquot(struct dquot *dquot)
+{
+	handle_t *handle;
+	int rc, stop_err;
+
+	handle = ext4_journal_start(dquot_to_inode(dquot),
+				    EXT4_QUOTA_DEL_BLOCKS(dquot->dq_sb));
+	if (IS_ERR(handle)) {
+		/* Release dquot anyway to avoid endless cycle in dqput() */
+		dquot_release(dquot);
+		return PTR_ERR(handle);
+	}
+
+	rc = dquot_release(dquot);
+	stop_err = ext4_journal_stop(handle);
+
+	return rc ? rc : stop_err;
+}
+
+/*
+ * Mark @dquot dirty.  When a user or group quota file name is configured
+ * (journaled quota), the dquot is additionally written out right away
+ * and the result of that write is returned.
+ */
+static int ext4_mark_dquot_dirty(struct dquot *dquot)
+{
+	struct ext4_sb_info *sbi = EXT4_SB(dquot->dq_sb);
+
+	/* Not journaling quotas: the generic dirty marking is all we do. */
+	if (!sbi->s_qf_names[USRQUOTA] && !sbi->s_qf_names[GRPQUOTA])
+		return dquot_mark_dquot_dirty(dquot);
+
+	dquot_mark_dquot_dirty(dquot);
+	return ext4_write_dquot(dquot);
+}
+
+/*
+ * Write the quota info of @type for @sb inside a two-block journal
+ * handle (data block + inode block) started on the root inode.
+ *
+ * Returns the error from starting the handle, else the error from
+ * dquot_commit_info(), else the result of stopping the handle.
+ */
+static int ext4_write_info(struct super_block *sb, int type)
+{
+	handle_t *handle;
+	int rc, stop_err;
+
+	handle = ext4_journal_start(sb->s_root->d_inode, 2);
+	if (IS_ERR(handle))
+		return PTR_ERR(handle);
+
+	rc = dquot_commit_info(sb, type);
+	stop_err = ext4_journal_stop(handle);
+
+	return rc ? rc : stop_err;
+}
+
+/*
+ * Turn on quota of @type at mount time.  The quota file name and the
+ * journaled quota format are taken from the ext4 superblock info and
+ * handed to dquot_quota_on_mount(), whose result is returned.
+ */
+static int ext4_quota_on_mount(struct super_block *sb, int type)
+{
+	struct ext4_sb_info *sbi = EXT4_SB(sb);
+
+	return dquot_quota_on_mount(sb, sbi->s_qf_names[type],
+				    sbi->s_jquota_fmt, type);
+}
+
+/*
+ * Standard quota_on entry point.
+ *
+ * Returns -EINVAL when the QUOTA mount option is not set, -EXDEV when
+ * @path lives on another file system, a journal flush error if one
+ * occurs, and otherwise the result of dquot_quota_on().
+ */
+static int ext4_quota_on(struct super_block *sb, int type, int format_id,
+			 struct path *path)
+{
+	struct ext4_sb_info *sbi = EXT4_SB(sb);
+	struct dentry *qf_dentry = path->dentry;
+	int err;
+
+	if (!test_opt(sb, QUOTA))
+		return -EINVAL;
+
+	/* Quotafile not on the same filesystem? */
+	if (qf_dentry->d_sb != sb)
+		return -EXDEV;
+
+	/* Journaling quota with the quota file outside the fs root? */
+	if (sbi->s_qf_names[type] && qf_dentry->d_parent != sb->s_root)
+		ext4_msg(sb, KERN_WARNING,
+			"Quota file not on filesystem root. "
+			"Journaled quota will not work");
+
+	/*
+	 * When we journal data on quota file, we have to flush journal to
+	 * see all updates to the file when we bypass pagecache...
+	 */
+	if (sbi->s_journal && ext4_should_journal_data(qf_dentry->d_inode)) {
+		/*
+		 * We don't need to lock updates but journal_flush() could
+		 * otherwise be livelocked...
+		 */
+		jbd2_journal_lock_updates(sbi->s_journal);
+		err = jbd2_journal_flush(sbi->s_journal);
+		jbd2_journal_unlock_updates(sbi->s_journal);
+		if (err)
+			return err;
+	}
+
+	return dquot_quota_on(sb, type, format_id, path);
+}
+
+/*
+ * Turn quota of @type off.  Before handing over to dquot_quota_off()
+ * this flushes the file system when delayed allocation is in use and,
+ * on a best-effort basis, stamps the quota file with the current time.
+ */
+static int ext4_quota_off(struct super_block *sb, int type)
+{
+	struct inode *inode = sb_dqopt(sb)->files[type];
+	handle_t *handle;
+
+	/* Force all delayed allocation blocks to be allocated.
+	 * Caller already holds s_umount sem */
+	if (test_opt(sb, DELALLOC))
+		sync_filesystem(sb);
+
+	if (inode) {
+		/* Update modification times of quota files when userspace
+		 * can start looking at them */
+		handle = ext4_journal_start(inode, 1);
+		if (!IS_ERR(handle)) {
+			inode->i_mtime = inode->i_ctime = CURRENT_TIME;
+			ext4_mark_inode_dirty(handle, inode);
+			ext4_journal_stop(handle);
+		}
+		/* Failing to start the handle only skips the timestamp. */
+	}
+
+	return dquot_quota_off(sb, type);
+}
+
+/*
+ * Read @len bytes at offset @off of the quota file of @type into @data.
+ *
+ * The page cache is bypassed because we cannot afford acquiring the
+ * locks.  Quota files are never truncated and the quota code itself
+ * serializes the operations (and no one else should touch the files),
+ * so races are not a concern.
+ *
+ * The request is clipped to i_size; holes read back as zeroes.  Returns
+ * the number of bytes read (0 when @off is past i_size) or a negative
+ * error from ext4_bread().
+ */
+static ssize_t ext4_quota_read(struct super_block *sb, int type, char *data,
+			       size_t len, loff_t off)
+{
+	struct inode *inode = sb_dqopt(sb)->files[type];
+	ext4_lblk_t blk = off >> EXT4_BLOCK_SIZE_BITS(sb);
+	int offset = off & (sb->s_blocksize - 1);
+	loff_t i_size = i_size_read(inode);
+	struct buffer_head *bh;
+	size_t remaining;
+	int chunk;
+	int err = 0;
+
+	if (off > i_size)
+		return 0;
+	if (off+len > i_size)
+		len = i_size-off;
+
+	for (remaining = len; remaining > 0; remaining -= chunk) {
+		/* Never read past the end of the current block. */
+		if (sb->s_blocksize - offset < remaining)
+			chunk = sb->s_blocksize - offset;
+		else
+			chunk = remaining;
+
+		bh = ext4_bread(NULL, inode, blk, 0, &err);
+		if (err)
+			return err;
+		if (bh)
+			memcpy(data, bh->b_data+offset, chunk);
+		else	/* A hole? */
+			memset(data, 0, chunk);
+		brelse(bh);
+
+		/* Only the first block can start mid-block. */
+		offset = 0;
+		data += chunk;
+		blk++;
+	}
+	return len;
+}
+
+/*
+ * Write @len bytes from @data at offset @off of the quota file of @type.
+ *
+ * The caller is expected to have started a transaction with enough
+ * credits; on a journaled file system a missing handle is rejected with
+ * -EIO.  Because only one data block is accounted for in the credits, a
+ * write that would cross a block boundary is rejected with -EIO too.
+ *
+ * Returns @len on success (extending i_size/i_disksize if needed) or a
+ * negative error.
+ */
+static ssize_t ext4_quota_write(struct super_block *sb, int type,
+				const char *data, size_t len, loff_t off)
+{
+	struct inode *inode = sb_dqopt(sb)->files[type];
+	ext4_lblk_t blk = off >> EXT4_BLOCK_SIZE_BITS(sb);
+	int offset = off & (sb->s_blocksize - 1);
+	handle_t *handle = journal_current_handle();
+	struct buffer_head *bh;
+	ssize_t written;
+	int err = 0;
+
+	if (EXT4_SB(sb)->s_journal && !handle) {
+		ext4_msg(sb, KERN_WARNING, "Quota write (off=%llu, len=%llu)"
+			" cancelled because transaction is not started",
+			(unsigned long long)off, (unsigned long long)len);
+		return -EIO;
+	}
+	/*
+	 * Since we account only one data block in transaction credits,
+	 * then it is impossible to cross a block boundary.
+	 */
+	if (sb->s_blocksize - offset < len) {
+		ext4_msg(sb, KERN_WARNING, "Quota write (off=%llu, len=%llu)"
+			" cancelled because not block aligned",
+			(unsigned long long)off, (unsigned long long)len);
+		return -EIO;
+	}
+
+	mutex_lock_nested(&inode->i_mutex, I_MUTEX_QUOTA);
+
+	bh = ext4_bread(handle, inode, blk, 1, &err);
+	if (bh) {
+		err = ext4_journal_get_write_access(handle, bh);
+		if (!err) {
+			lock_buffer(bh);
+			memcpy(bh->b_data+offset, data, len);
+			flush_dcache_page(bh->b_page);
+			unlock_buffer(bh);
+			err = ext4_handle_dirty_metadata(handle, NULL, bh);
+		}
+		brelse(bh);
+	}
+
+	if (err) {
+		written = err;
+	} else {
+		/* Grow the file if the write went past the current end. */
+		if (inode->i_size < off + len) {
+			i_size_write(inode, off + len);
+			EXT4_I(inode)->i_disksize = inode->i_size;
+			ext4_mark_inode_dirty(handle, inode);
+		}
+		written = len;
+	}
+
+	mutex_unlock(&inode->i_mutex);
+	return written;
+}
+
+#endif
+
+/*
+ * .mount callback of the file system types: mounts a block-device backed
+ * file system through mount_bdev(), with ext4_fill_super() as the
+ * superblock fill routine.  Returns whatever mount_bdev() returns.
+ */
+static struct dentry *ext4_mount(struct file_system_type *fs_type, int flags,
+ const char *dev_name, void *data)
+{
+ return mount_bdev(fs_type, flags, dev_name, data, ext4_fill_super);
+}
+
+/*
+ * ext2 compatibility.  The real helpers are built only when the kernel
+ * has no ext2 driver of its own (neither built in nor modular) and
+ * CONFIG_EXT4_USE_FOR_EXT23 is set; otherwise they are empty stubs and
+ * ext2_feature_set_ok() always answers 0.
+ */
+#if !defined(CONFIG_EXT2_FS) && !defined(CONFIG_EXT2_FS_MODULE) && defined(CONFIG_EXT4_USE_FOR_EXT23)
+/* Register ext2_fs_type; a failure is only logged, not propagated. */
+static inline void register_as_ext2(void)
+{
+ int err = register_filesystem(&ext2_fs_type);
+ if (err)
+ printk(KERN_WARNING
+ "EXT4-fs: Unable to register as ext2 (%d)\n", err);
+}
+
+/* Undo register_as_ext2(). */
+static inline void unregister_as_ext2(void)
+{
+ unregister_filesystem(&ext2_fs_type);
+}
+
+/*
+ * Return 1 if the feature flags of @sb fit within the ext2 feature set,
+ * 0 otherwise.  Any incompat feature outside EXT2_FEATURE_INCOMPAT_SUPP
+ * is refused; ro_compat features outside EXT2_FEATURE_RO_COMPAT_SUPP are
+ * refused only when the superblock is not flagged MS_RDONLY.
+ */
+static inline int ext2_feature_set_ok(struct super_block *sb)
+{
+ if (EXT4_HAS_INCOMPAT_FEATURE(sb, ~EXT2_FEATURE_INCOMPAT_SUPP))
+ return 0;
+ if (sb->s_flags & MS_RDONLY)
+ return 1;
+ if (EXT4_HAS_RO_COMPAT_FEATURE(sb, ~EXT2_FEATURE_RO_COMPAT_SUPP))
+ return 0;
+ return 1;
+}
+MODULE_ALIAS("ext2");
+#else
+static inline void register_as_ext2(void) { }
+static inline void unregister_as_ext2(void) { }
+static inline int ext2_feature_set_ok(struct super_block *sb) { return 0; }
+#endif
+
+/*
+ * ext3 compatibility.  The real helpers are built only when the kernel
+ * has no ext3 driver of its own (neither built in nor modular) and
+ * CONFIG_EXT4_USE_FOR_EXT23 is set; otherwise they are empty stubs and
+ * ext3_feature_set_ok() always answers 0.
+ */
+#if !defined(CONFIG_EXT3_FS) && !defined(CONFIG_EXT3_FS_MODULE) && defined(CONFIG_EXT4_USE_FOR_EXT23)
+/* Register ext3_fs_type; a failure is only logged, not propagated. */
+static inline void register_as_ext3(void)
+{
+ int err = register_filesystem(&ext3_fs_type);
+ if (err)
+ printk(KERN_WARNING
+ "EXT4-fs: Unable to register as ext3 (%d)\n", err);
+}
+
+/* Undo register_as_ext3(). */
+static inline void unregister_as_ext3(void)
+{
+ unregister_filesystem(&ext3_fs_type);
+}
+
+/*
+ * Return 1 if the feature flags of @sb fit within the ext3 feature set,
+ * 0 otherwise.  Any incompat feature outside EXT3_FEATURE_INCOMPAT_SUPP
+ * is refused, and the HAS_JOURNAL compat feature must be present;
+ * ro_compat features outside EXT3_FEATURE_RO_COMPAT_SUPP are refused
+ * only when the superblock is not flagged MS_RDONLY.
+ */
+static inline int ext3_feature_set_ok(struct super_block *sb)
+{
+ if (EXT4_HAS_INCOMPAT_FEATURE(sb, ~EXT3_FEATURE_INCOMPAT_SUPP))
+ return 0;
+ if (!EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_HAS_JOURNAL))
+ return 0;
+ if (sb->s_flags & MS_RDONLY)
+ return 1;
+ if (EXT4_HAS_RO_COMPAT_FEATURE(sb, ~EXT3_FEATURE_RO_COMPAT_SUPP))
+ return 0;
+ return 1;
+}
+MODULE_ALIAS("ext3");
+#else
+static inline void register_as_ext3(void) { }
+static inline void unregister_as_ext3(void) { }
+static inline int ext3_feature_set_ok(struct super_block *sb) { return 0; }
+#endif
+
+/*
+ * File system type registered under the name "ext4".  It requires a
+ * backing device (FS_REQUIRES_DEV), mounts through ext4_mount() and is
+ * torn down with kill_block_super().
+ */
+static struct file_system_type ext4_fs_type = {
+ .owner = THIS_MODULE,
+ .name = "ext4",
+ .mount = ext4_mount,
+ .kill_sb = kill_block_super,
+ .fs_flags = FS_REQUIRES_DEV,
+};
+
+/*
+ * Allocate the ext4_features object and add its kobject, named
+ * "features", to ext4_kset.  On success the object is published through
+ * ext4_feat.
+ *
+ * Returns 0, -ENOMEM if the allocation fails, or the error from
+ * kobject_init_and_add().
+ *
+ * NOTE(review): the kobject documentation asks for kobject_put() after a
+ * failed kobject_init_and_add(); this path frees the object directly.
+ * Confirm against the release handler of ext4_feat_ktype before
+ * changing it.
+ */
+static int __init ext4_init_feat_adverts(void)
+{
+	struct ext4_features *ef;
+	int ret;
+
+	ef = kzalloc(sizeof(struct ext4_features), GFP_KERNEL);
+	if (!ef)
+		return -ENOMEM;
+
+	ef->f_kobj.kset = ext4_kset;
+	init_completion(&ef->f_kobj_unregister);
+
+	ret = kobject_init_and_add(&ef->f_kobj, &ext4_feat_ktype, NULL,
+				   "features");
+	if (ret) {
+		kfree(ef);
+		return ret;
+	}
+
+	ext4_feat = ef;
+	return 0;
+}
+
+/*
+ * Tear down what ext4_init_feat_adverts() set up: drop the kobject
+ * reference, wait on the f_kobj_unregister completion, then free the
+ * ext4_features object.
+ */
+static void ext4_exit_feat_adverts(void)
+{
+ kobject_put(&ext4_feat->f_kobj);
+ /* ext4_feat must not be freed until the completion has fired */
+ wait_for_completion(&ext4_feat->f_kobj_unregister);
+ kfree(ext4_feat);
+}
+
+/*
+ * Shared across all ext4 file systems.  Both arrays have
+ * EXT4_WQ_HASH_SZ slots, each of which is initialised once in
+ * ext4_init_fs().
+ */
+wait_queue_head_t ext4__ioend_wq[EXT4_WQ_HASH_SZ];
+struct mutex ext4__aio_mutex[EXT4_WQ_HASH_SZ];
+
+/*
+ * Module initialisation.
+ *
+ * Sets up, in order: the lazy-init bookkeeping, the hashed wait queues
+ * and aio mutexes, page-io, the system zone, the "ext4" kset under
+ * fs_kobj, the "fs/ext4" proc directory, the feature advertisement
+ * kobject, mballoc, xattr support and the inode cache.  Finally it
+ * registers the ext3/ext2 compatibility types and ext4 itself.
+ *
+ * Any failure unwinds everything set up so far (labels out .. out6, in
+ * reverse order of initialisation) and returns a negative errno.
+ * Returns 0 on success.
+ */
+static int __init ext4_init_fs(void)
+{
+	int i, err;
+
+	ext4_li_info = NULL;
+	mutex_init(&ext4_li_mtx);
+
+	ext4_check_flag_values();
+
+	for (i = 0; i < EXT4_WQ_HASH_SZ; i++) {
+		mutex_init(&ext4__aio_mutex[i]);
+		init_waitqueue_head(&ext4__ioend_wq[i]);
+	}
+
+	err = ext4_init_pageio();
+	if (err)
+		return err;
+	err = ext4_init_system_zone();
+	if (err)
+		goto out6;
+	ext4_kset = kset_create_and_add("ext4", NULL, fs_kobj);
+	if (!ext4_kset) {
+		/*
+		 * err is still 0 from the previous successful call; without
+		 * an explicit error code the module would report success
+		 * after tearing everything down again.
+		 */
+		err = -ENOMEM;
+		goto out5;
+	}
+	/* A missing proc directory is tolerated; see the check at out4. */
+	ext4_proc_root = proc_mkdir("fs/ext4", NULL);
+
+	err = ext4_init_feat_adverts();
+	if (err)
+		goto out4;
+
+	err = ext4_init_mballoc();
+	if (err)
+		goto out3;
+
+	err = ext4_init_xattr();
+	if (err)
+		goto out2;
+	err = init_inodecache();
+	if (err)
+		goto out1;
+	register_as_ext3();
+	register_as_ext2();
+	err = register_filesystem(&ext4_fs_type);
+	if (err)
+		goto out;
+
+	return 0;
+out:
+	unregister_as_ext2();
+	unregister_as_ext3();
+	destroy_inodecache();
+out1:
+	ext4_exit_xattr();
+out2:
+	ext4_exit_mballoc();
+out3:
+	ext4_exit_feat_adverts();
+out4:
+	if (ext4_proc_root)
+		remove_proc_entry("fs/ext4", NULL);
+	kset_unregister(ext4_kset);
+out5:
+	ext4_exit_system_zone();
+out6:
+	ext4_exit_pageio();
+	return err;
+}
+
+/*
+ * Module exit: stop the lazy-init thread, unregister the ext2/ext3
+ * compatibility types and ext4 itself, then release the inode cache,
+ * xattr, mballoc, feature advertisement, proc, kset, system zone and
+ * page-io state.
+ */
+static void __exit ext4_exit_fs(void)
+{
+ ext4_destroy_lazyinit_thread();
+ unregister_as_ext2();
+ unregister_as_ext3();
+ unregister_filesystem(&ext4_fs_type);
+ destroy_inodecache();
+ ext4_exit_xattr();
+ ext4_exit_mballoc();
+ ext4_exit_feat_adverts();
+ /*
+ * NOTE(review): unlike the error path of ext4_init_fs(), this removal
+ * is not guarded by a check of ext4_proc_root.
+ */
+ remove_proc_entry("fs/ext4", NULL);
+ kset_unregister(ext4_kset);
+ ext4_exit_system_zone();
+ ext4_exit_pageio();
+}
+
+/* Module metadata and entry points. */
+MODULE_AUTHOR("Remy Card, Stephen Tweedie, Andrew Morton, Andreas Dilger, Theodore Ts'o and others");
+MODULE_DESCRIPTION("Fourth Extended Filesystem");
+MODULE_LICENSE("GPL");
+module_init(ext4_init_fs)
+module_exit(ext4_exit_fs)
diff --git a/fs/ext4/symlink.c b/fs/ext4/symlink.c
new file mode 100644
index 00000000..ed9354af
--- /dev/null
+++ b/fs/ext4/symlink.c
@@ -0,0 +1,56 @@
+/*
+ * linux/fs/ext4/symlink.c
+ *
+ * Only fast symlinks left here - the rest is done by generic code. AV, 1999
+ *
+ * Copyright (C) 1992, 1993, 1994, 1995
+ * Remy Card (card@masi.ibp.fr)
+ * Laboratoire MASI - Institut Blaise Pascal
+ * Universite Pierre et Marie Curie (Paris VI)
+ *
+ * from
+ *
+ * linux/fs/minix/symlink.c
+ *
+ * Copyright (C) 1991, 1992 Linus Torvalds
+ *
+ * ext4 symlink handling code
+ */
+
+#include <linux/fs.h>
+#include <linux/jbd2.h>
+#include <linux/namei.h>
+#include "ext4.h"
+#include "xattr.h"
+
+/*
+ * follow_link for fast symlinks: the link target is taken straight from
+ * the i_data area of the in-memory inode and handed to the name lookup
+ * via nd_set_link().  Always returns NULL.
+ */
+static void *ext4_follow_link(struct dentry *dentry, struct nameidata *nd)
+{
+	char *target = (char *) EXT4_I(dentry->d_inode)->i_data;
+
+	nd_set_link(nd, target);
+	return NULL;
+}
+
+/*
+ * Inode operations for symlinks that are followed through the generic
+ * page-based helpers (page_follow_link_light / page_put_link).  The
+ * xattr hooks are present only with CONFIG_EXT4_FS_XATTR.
+ */
+const struct inode_operations ext4_symlink_inode_operations = {
+ .readlink = generic_readlink,
+ .follow_link = page_follow_link_light,
+ .put_link = page_put_link,
+ .setattr = ext4_setattr,
+#ifdef CONFIG_EXT4_FS_XATTR
+ .setxattr = generic_setxattr,
+ .getxattr = generic_getxattr,
+ .listxattr = ext4_listxattr,
+ .removexattr = generic_removexattr,
+#endif
+};
+
+/*
+ * Inode operations for fast symlinks, which are followed with
+ * ext4_follow_link() and therefore need no put_link.  The xattr hooks
+ * are present only with CONFIG_EXT4_FS_XATTR.
+ */
+const struct inode_operations ext4_fast_symlink_inode_operations = {
+ .readlink = generic_readlink,
+ .follow_link = ext4_follow_link,
+ .setattr = ext4_setattr,
+#ifdef CONFIG_EXT4_FS_XATTR
+ .setxattr = generic_setxattr,
+ .getxattr = generic_getxattr,
+ .listxattr = ext4_listxattr,
+ .removexattr = generic_removexattr,
+#endif
+};
diff --git a/fs/ext4/truncate.h b/fs/ext4/truncate.h
new file mode 100644
index 00000000..011ba667
--- /dev/null
+++ b/fs/ext4/truncate.h
@@ -0,0 +1,43 @@
+/*
+ * linux/fs/ext4/truncate.h
+ *
+ * Common inline functions needed for truncate support
+ */
+
+/*
+ * Truncate blocks that were not used by write. We have to truncate the
+ * pagecache as well so that corresponding buffers get properly unmapped.
+ *
+ * The page cache is trimmed to inode->i_size first, then ext4_truncate()
+ * is called on the inode.
+ */
+static inline void ext4_truncate_failed_write(struct inode *inode)
+{
+ truncate_inode_pages(inode->i_mapping, inode->i_size);
+ ext4_truncate(inode);
+}
+
+/*
+ * Compute the journal credits to request for the next chunk of a
+ * truncate transaction on @inode.
+ *
+ * The estimate starts from i_blocks converted from 512-byte units to
+ * file system blocks, is clamped to the range [2, EXT4_MAX_TRANS_DATA],
+ * and is then added to EXT4_DATA_TRANS_BLOCKS() for the superblock.
+ */
+static inline unsigned long ext4_blocks_for_truncate(struct inode *inode)
+{
+	struct super_block *sb = inode->i_sb;
+	ext4_lblk_t nblocks = inode->i_blocks >> (sb->s_blocksize_bits - 9);
+
+	/*
+	 * Lower bound: leave just enough room to cope with a corrupt
+	 * i_blocks.  Disk corruption has been seen to leave random data
+	 * in an inode that looked enough like a regular file for ext4 to
+	 * try to delete it; things will go a bit crazy if that happens,
+	 * but at least we should try not to panic the whole kernel.
+	 */
+	nblocks = nblocks < 2 ? 2 : nblocks;
+
+	/* Upper bound: keep the transaction from overflowing the journal. */
+	nblocks = nblocks > EXT4_MAX_TRANS_DATA ? EXT4_MAX_TRANS_DATA : nblocks;
+
+	return EXT4_DATA_TRANS_BLOCKS(sb) + nblocks;
+}
+
diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c
new file mode 100644
index 00000000..e88748e5
--- /dev/null
+++ b/fs/ext4/xattr.c
@@ -0,0 +1,1608 @@
+/*
+ * linux/fs/ext4/xattr.c
+ *
+ * Copyright (C) 2001-2003 Andreas Gruenbacher, <agruen@suse.de>
+ *
+ * Fix by Harrison Xing <harrison@mountainviewdata.com>.
+ * Ext4 code with a lot of help from Eric Jarman <ejarman@acm.org>.
+ * Extended attributes for symlinks and special files added per
+ * suggestion of Luka Renko <luka.renko@hermes.si>.
+ * xattr consolidation Copyright (c) 2004 James Morris <jmorris@redhat.com>,
+ * Red Hat Inc.
+ * ea-in-inode support by Alex Tomas <alex@clusterfs.com> aka bzzz
+ * and Andreas Gruenbacher <agruen@suse.de>.
+ */
+
+/*
+ * Extended attributes are stored directly in inodes (on file systems with
+ * inodes bigger than 128 bytes) and on additional disk blocks. The i_file_acl
+ * field contains the block number if an inode uses an additional block. All
+ * attributes must fit in the inode and one additional block. Blocks that
+ * contain the identical set of attributes may be shared among several inodes.
+ * Identical blocks are detected by keeping a cache of blocks that have
+ * recently been accessed.
+ *
+ * The attributes in inodes and on blocks have a different header; the entries
+ * are stored in the same format:
+ *
+ * +------------------+
+ * | header |
+ * | entry 1 | |
+ * | entry 2 | | growing downwards
+ * | entry 3 | v
+ * | four null bytes |
+ * | . . . |
+ * | value 1 | ^
+ * | value 3 | | growing upwards
+ * | value 2 | |
+ * +------------------+
+ *
+ * The header is followed by multiple entry descriptors. In disk blocks, the
+ * entry descriptors are kept sorted. In inodes, they are unsorted. The
+ * attribute values are aligned to the end of the block in no specific order.
+ *
+ * Locking strategy
+ * ----------------
+ * EXT4_I(inode)->i_file_acl is protected by EXT4_I(inode)->xattr_sem.
+ * EA blocks are only changed if they are exclusive to an inode, so
+ * holding xattr_sem also means that nothing but the EA block's reference
+ * count can change. Multiple writers to the same block are synchronized
+ * by the buffer lock.
+ */
+
+#include <linux/init.h>
+#include <linux/fs.h>
+#include <linux/slab.h>
+#include <linux/mbcache.h>
+#include <linux/quotaops.h>
+#include <linux/rwsem.h>
+#include "ext4_jbd2.h"
+#include "ext4.h"
+#include "xattr.h"
+#include "acl.h"
+
+/*
+ * Accessors for the on-disk xattr layout:
+ * BHDR(bh) - view the data of buffer head bh as an xattr block header
+ * ENTRY(ptr) - view ptr as an xattr entry
+ * BFIRST(bh) - the entry located immediately after the block header
+ * IS_LAST_ENTRY(e) - true when the first 32 bits of the entry are zero,
+ * which marks the end of the entry list
+ */
+#define BHDR(bh) ((struct ext4_xattr_header *)((bh)->b_data))
+#define ENTRY(ptr) ((struct ext4_xattr_entry *)(ptr))
+#define BFIRST(bh) ENTRY(BHDR(bh)+1)
+#define IS_LAST_ENTRY(entry) (*(__u32 *)(entry) == 0)
+
+/*
+ * Debug helpers. With EXT4_XATTR_DEBUG defined they print a KERN_DEBUG
+ * line prefixed with the inode (ea_idebug) or block (ea_bdebug) being
+ * worked on; otherwise they expand to no_printk().
+ */
+#ifdef EXT4_XATTR_DEBUG
+# define ea_idebug(inode, f...) do { \
+ printk(KERN_DEBUG "inode %s:%lu: ", \
+ inode->i_sb->s_id, inode->i_ino); \
+ printk(f); \
+ printk("\n"); \
+ } while (0)
+# define ea_bdebug(bh, f...) do { \
+ char b[BDEVNAME_SIZE]; \
+ printk(KERN_DEBUG "block %s:%lu: ", \
+ bdevname(bh->b_bdev, b), \
+ (unsigned long) bh->b_blocknr); \
+ printk(f); \
+ printk("\n"); \
+ } while (0)
+#else
+# define ea_idebug(inode, fmt, ...) no_printk(fmt, ##__VA_ARGS__)
+# define ea_bdebug(bh, fmt, ...) no_printk(fmt, ##__VA_ARGS__)
+#endif
+
+/* Forward declarations of helpers defined later in this file. */
+static void ext4_xattr_cache_insert(struct buffer_head *);
+static struct buffer_head *ext4_xattr_cache_find(struct inode *,
+ struct ext4_xattr_header *,
+ struct mb_cache_entry **);
+static void ext4_xattr_rehash(struct ext4_xattr_header *,
+ struct ext4_xattr_entry *);
+static int ext4_xattr_list(struct dentry *dentry, char *buffer,
+ size_t buffer_size);
+
+/* mbcache instance used to look up xattr blocks by device and block number. */
+static struct mb_cache *ext4_xattr_cache;
+
+/*
+ * Lookup table from an xattr name index (EXT4_XATTR_INDEX_*) to its
+ * handler. Slots without an initializer, including those compiled out
+ * by the #ifdefs below, are NULL.
+ */
+static const struct xattr_handler *ext4_xattr_handler_map[] = {
+ [EXT4_XATTR_INDEX_USER] = &ext4_xattr_user_handler,
+#ifdef CONFIG_EXT4_FS_POSIX_ACL
+ [EXT4_XATTR_INDEX_POSIX_ACL_ACCESS] = &ext4_xattr_acl_access_handler,
+ [EXT4_XATTR_INDEX_POSIX_ACL_DEFAULT] = &ext4_xattr_acl_default_handler,
+#endif
+ [EXT4_XATTR_INDEX_TRUSTED] = &ext4_xattr_trusted_handler,
+#ifdef CONFIG_EXT4_FS_SECURITY
+ [EXT4_XATTR_INDEX_SECURITY] = &ext4_xattr_security_handler,
+#endif
+};
+
+/*
+ * NULL-terminated list of the xattr handlers built into ext4. The ACL
+ * and security handlers are included only when the corresponding config
+ * options are enabled.
+ */
+const struct xattr_handler *ext4_xattr_handlers[] = {
+ &ext4_xattr_user_handler,
+ &ext4_xattr_trusted_handler,
+#ifdef CONFIG_EXT4_FS_POSIX_ACL
+ &ext4_xattr_acl_access_handler,
+ &ext4_xattr_acl_default_handler,
+#endif
+#ifdef CONFIG_EXT4_FS_SECURITY
+ &ext4_xattr_security_handler,
+#endif
+ NULL
+};
+
+/*
+ * Map an on-disk name index to its xattr handler.  Returns NULL for an
+ * index that is not positive, lies beyond the handler map, or has no
+ * handler installed.
+ */
+static inline const struct xattr_handler *
+ext4_xattr_handler(int name_index)
+{
+	if (name_index <= 0 ||
+	    name_index >= ARRAY_SIZE(ext4_xattr_handler_map))
+		return NULL;
+
+	return ext4_xattr_handler_map[name_index];
+}
+
+/*
+ * Inode operation listxattr()
+ *
+ * Thin wrapper that forwards to ext4_xattr_list() and returns its result.
+ *
+ * dentry->d_inode->i_mutex: don't care
+ */
+ssize_t
+ext4_listxattr(struct dentry *dentry, char *buffer, size_t size)
+{
+ return ext4_xattr_list(dentry, buffer, size);
+}
+
+/*
+ * Walk the entry list starting at @entry and make sure that the
+ * successor of every entry still lies before @end.
+ *
+ * Returns 0 when the terminating entry is reached, -EIO as soon as a
+ * successor would start at or beyond @end.
+ */
+static int
+ext4_xattr_check_names(struct ext4_xattr_entry *entry, void *end)
+{
+	struct ext4_xattr_entry *next;
+
+	for (; !IS_LAST_ENTRY(entry); entry = next) {
+		next = EXT4_XATTR_NEXT(entry);
+		if ((void *)next >= end)
+			return -EIO;
+	}
+	return 0;
+}
+
+/*
+ * Sanity-check an xattr block: the header must carry the xattr magic
+ * and claim exactly one block, and the entry list must stay inside the
+ * buffer.  Returns 0 if the block looks sane, -EIO otherwise.
+ */
+static inline int
+ext4_xattr_check_block(struct buffer_head *bh)
+{
+	struct ext4_xattr_header *hdr = BHDR(bh);
+
+	if (hdr->h_magic != cpu_to_le32(EXT4_XATTR_MAGIC))
+		return -EIO;
+	if (hdr->h_blocks != cpu_to_le32(1))
+		return -EIO;
+
+	return ext4_xattr_check_names(BFIRST(bh), bh->b_data + bh->b_size);
+}
+
+/*
+ * Validate the value description of @entry against a storage area of
+ * @size bytes.  The value must not live in a separate block
+ * (e_value_block == 0) and offset plus length must fit within @size.
+ * Returns 0 if valid, -EIO otherwise.
+ */
+static inline int
+ext4_xattr_check_entry(struct ext4_xattr_entry *entry, size_t size)
+{
+	size_t value_size = le32_to_cpu(entry->e_value_size);
+
+	if (entry->e_value_block != 0)
+		return -EIO;
+	if (value_size > size)
+		return -EIO;
+	if (le16_to_cpu(entry->e_value_offs) + value_size > size)
+		return -EIO;
+
+	return 0;
+}
+
+/*
+ * Search the entry list starting at *pentry for the attribute identified
+ * by @name_index and @name.
+ *
+ * Entries are compared by name index first, then by name length, then by
+ * the name bytes. With @sorted set, the walk stops at the first entry
+ * that does not compare below the key; otherwise it stops only on an
+ * exact match. On return *pentry points at the entry where the walk
+ * stopped (the terminating entry if it ran off the end).
+ *
+ * @size is the size of the area the entry's value must fit in; it is
+ * passed to ext4_xattr_check_entry() for a matching entry.
+ *
+ * Returns 0 if found, -ENODATA if not found, -EINVAL if @name is NULL,
+ * -EIO if the matching entry fails validation.
+ */
+static int
+ext4_xattr_find_entry(struct ext4_xattr_entry **pentry, int name_index,
+ const char *name, size_t size, int sorted)
+{
+ struct ext4_xattr_entry *entry;
+ size_t name_len;
+ /* non-zero so that an empty list is reported as "not found" */
+ int cmp = 1;
+
+ if (name == NULL)
+ return -EINVAL;
+ name_len = strlen(name);
+ entry = *pentry;
+ for (; !IS_LAST_ENTRY(entry); entry = EXT4_XATTR_NEXT(entry)) {
+ cmp = name_index - entry->e_name_index;
+ if (!cmp)
+ cmp = name_len - entry->e_name_len;
+ if (!cmp)
+ cmp = memcmp(name, entry->e_name, name_len);
+ /* exact match always stops; "key sorts before entry" only if sorted */
+ if (cmp <= 0 && (sorted || cmp == 0))
+ break;
+ }
+ *pentry = entry;
+ if (!cmp && ext4_xattr_check_entry(entry, size))
+ return -EIO;
+ return cmp ? -ENODATA : 0;
+}
+
+/*
+ * Look up attribute (@name_index, @name) in the external xattr block of
+ * @inode.
+ *
+ * If @buffer is non-NULL the value is copied into it; if it is NULL only
+ * the size is computed.
+ *
+ * Returns the value size on success, or:
+ *   -ENODATA  the inode has no xattr block, or the attribute is absent
+ *   -EIO      the block could not be read, or it (or the matching
+ *             entry) failed validation
+ *   -ERANGE   the value does not fit in @buffer_size
+ *   -EINVAL   @name is NULL
+ *
+ * A failed read is reported as -EIO, matching ext4_xattr_block_list(),
+ * so that an I/O error is never mistaken for a missing attribute.
+ */
+static int
+ext4_xattr_block_get(struct inode *inode, int name_index, const char *name,
+		     void *buffer, size_t buffer_size)
+{
+	struct buffer_head *bh = NULL;
+	struct ext4_xattr_entry *entry;
+	size_t size;
+	int error;
+
+	ea_idebug(inode, "name=%d.%s, buffer=%p, buffer_size=%ld",
+		  name_index, name, buffer, (long)buffer_size);
+
+	error = -ENODATA;
+	if (!EXT4_I(inode)->i_file_acl)
+		goto cleanup;
+	ea_idebug(inode, "reading block %llu",
+		  (unsigned long long)EXT4_I(inode)->i_file_acl);
+	bh = sb_bread(inode->i_sb, EXT4_I(inode)->i_file_acl);
+	if (!bh) {
+		/* Read failure: an I/O error, not an absent attribute. */
+		error = -EIO;
+		goto cleanup;
+	}
+	ea_bdebug(bh, "b_count=%d, refcount=%d",
+		atomic_read(&(bh->b_count)), le32_to_cpu(BHDR(bh)->h_refcount));
+	if (ext4_xattr_check_block(bh))
+		goto bad_block;
+	ext4_xattr_cache_insert(bh);
+	entry = BFIRST(bh);
+	/* Entries in xattr blocks are kept sorted, hence sorted == 1. */
+	error = ext4_xattr_find_entry(&entry, name_index, name, bh->b_size, 1);
+	if (error == -EIO)
+		goto bad_block;
+	if (error)
+		goto cleanup;
+	size = le32_to_cpu(entry->e_value_size);
+	if (buffer) {
+		error = -ERANGE;
+		if (size > buffer_size)
+			goto cleanup;
+		memcpy(buffer, bh->b_data + le16_to_cpu(entry->e_value_offs),
+		       size);
+	}
+	error = size;
+
+cleanup:
+	brelse(bh);
+	return error;
+
+bad_block:
+	EXT4_ERROR_INODE(inode, "bad block %llu",
+			 EXT4_I(inode)->i_file_acl);
+	error = -EIO;
+	goto cleanup;
+}
+
+/*
+ * Look up attribute (@name_index, @name) among the attributes stored in
+ * the inode body of @inode.
+ *
+ * If @buffer is non-NULL the value is copied into it; if it is NULL only
+ * the size is computed.
+ *
+ * Returns the value size on success, -ENODATA if the inode carries no
+ * in-body attributes or the name is not found, -ERANGE if the value
+ * does not fit in @buffer_size, or another negative error from locating
+ * or validating the inode's entries.
+ */
+static int
+ext4_xattr_ibody_get(struct inode *inode, int name_index, const char *name,
+		     void *buffer, size_t buffer_size)
+{
+	struct ext4_xattr_ibody_header *header;
+	struct ext4_xattr_entry *entry;
+	struct ext4_inode *raw_inode;
+	struct ext4_iloc iloc;
+	size_t size;
+	void *end;
+	int error;
+
+	if (!ext4_test_inode_state(inode, EXT4_STATE_XATTR))
+		return -ENODATA;
+
+	error = ext4_get_inode_loc(inode, &iloc);
+	if (error)
+		return error;
+
+	raw_inode = ext4_raw_inode(&iloc);
+	header = IHDR(inode, raw_inode);
+	entry = IFIRST(header);
+	end = (void *)raw_inode + EXT4_SB(inode->i_sb)->s_inode_size;
+
+	error = ext4_xattr_check_names(entry, end);
+	if (!error)
+		/* In-inode entries are unsorted, hence sorted == 0. */
+		error = ext4_xattr_find_entry(&entry, name_index, name,
+					      end - (void *)entry, 0);
+	if (!error) {
+		size = le32_to_cpu(entry->e_value_size);
+		if (buffer && size > buffer_size) {
+			error = -ERANGE;
+		} else {
+			/* Value offsets are relative to the first entry. */
+			if (buffer)
+				memcpy(buffer, (void *)IFIRST(header) +
+				       le16_to_cpu(entry->e_value_offs), size);
+			error = size;
+		}
+	}
+
+	brelse(iloc.bh);
+	return error;
+}
+
+/*
+ * ext4_xattr_get()
+ *
+ * Fetch an extended attribute into the supplied buffer, or, when the
+ * buffer is NULL, report how large a buffer would be needed.
+ *
+ * The inode body is searched first; the external xattr block is only
+ * consulted when the body lookup reports -ENODATA.  The whole lookup
+ * runs under the inode's xattr_sem held for reading.
+ *
+ * Returns a negative error number on failure, or the number of bytes
+ * used / required on success.
+ */
+int
+ext4_xattr_get(struct inode *inode, int name_index, const char *name,
+	       void *buffer, size_t buffer_size)
+{
+	struct ext4_inode_info *ei = EXT4_I(inode);
+	int ret;
+
+	down_read(&ei->xattr_sem);
+	ret = ext4_xattr_ibody_get(inode, name_index, name, buffer,
+				   buffer_size);
+	if (ret == -ENODATA)
+		ret = ext4_xattr_block_get(inode, name_index, name, buffer,
+					   buffer_size);
+	up_read(&ei->xattr_sem);
+
+	return ret;
+}
+
+/*
+ * Emit the names of all entries in the list starting at @entry through
+ * their handlers' ->list() callbacks.  Entries whose name index has no
+ * handler are skipped.
+ *
+ * With a non-NULL @buffer the names are written consecutively and
+ * -ERANGE is returned once a name no longer fits.  With a NULL @buffer
+ * only the sizes reported by the handlers are accumulated.
+ *
+ * Returns the total number of bytes used / required, or -ERANGE.
+ */
+static int
+ext4_xattr_list_entries(struct dentry *dentry, struct ext4_xattr_entry *entry,
+			char *buffer, size_t buffer_size)
+{
+	size_t rest = buffer_size;
+
+	for (; !IS_LAST_ENTRY(entry); entry = EXT4_XATTR_NEXT(entry)) {
+		const struct xattr_handler *handler;
+		size_t size;
+
+		handler = ext4_xattr_handler(entry->e_name_index);
+		if (!handler)
+			continue;
+
+		size = handler->list(dentry, buffer, rest,
+				     entry->e_name,
+				     entry->e_name_len,
+				     handler->flags);
+		if (buffer) {
+			if (size > rest)
+				return -ERANGE;
+			buffer += size;
+		}
+		/*
+		 * In size-only mode this may wrap around; the final
+		 * subtraction below still yields the accumulated total.
+		 */
+		rest -= size;
+	}
+	return buffer_size - rest;
+}
+
+/*
+ * List the attribute names stored in the external xattr block of the
+ * inode behind @dentry.
+ *
+ * Returns 0 if the inode has no xattr block, -EIO if the block cannot
+ * be read or fails validation, and otherwise the result of
+ * ext4_xattr_list_entries().
+ */
+static int
+ext4_xattr_block_list(struct dentry *dentry, char *buffer, size_t buffer_size)
+{
+	struct inode *inode = dentry->d_inode;
+	struct buffer_head *bh;
+	int error;
+
+	ea_idebug(inode, "buffer=%p, buffer_size=%ld",
+		  buffer, (long)buffer_size);
+
+	if (!EXT4_I(inode)->i_file_acl)
+		return 0;
+
+	ea_idebug(inode, "reading block %llu",
+		  (unsigned long long)EXT4_I(inode)->i_file_acl);
+	bh = sb_bread(inode->i_sb, EXT4_I(inode)->i_file_acl);
+	if (!bh)
+		return -EIO;
+
+	ea_bdebug(bh, "b_count=%d, refcount=%d",
+		atomic_read(&(bh->b_count)), le32_to_cpu(BHDR(bh)->h_refcount));
+	if (ext4_xattr_check_block(bh)) {
+		EXT4_ERROR_INODE(inode, "bad block %llu",
+				 EXT4_I(inode)->i_file_acl);
+		error = -EIO;
+	} else {
+		ext4_xattr_cache_insert(bh);
+		error = ext4_xattr_list_entries(dentry, BFIRST(bh),
+						buffer, buffer_size);
+	}
+
+	brelse(bh);
+	return error;
+}
+
+/*
+ * List the attribute names stored in the inode body of the inode behind
+ * @dentry.
+ *
+ * Returns 0 if the inode carries no in-body attributes, a negative
+ * error if the inode cannot be located or its entry list is invalid,
+ * and otherwise the result of ext4_xattr_list_entries().
+ */
+static int
+ext4_xattr_ibody_list(struct dentry *dentry, char *buffer, size_t buffer_size)
+{
+	struct inode *inode = dentry->d_inode;
+	struct ext4_xattr_ibody_header *header;
+	struct ext4_inode *raw_inode;
+	struct ext4_iloc iloc;
+	void *end;
+	int error;
+
+	if (!ext4_test_inode_state(inode, EXT4_STATE_XATTR))
+		return 0;
+
+	error = ext4_get_inode_loc(inode, &iloc);
+	if (error)
+		return error;
+
+	raw_inode = ext4_raw_inode(&iloc);
+	header = IHDR(inode, raw_inode);
+	end = (void *)raw_inode + EXT4_SB(inode->i_sb)->s_inode_size;
+
+	error = ext4_xattr_check_names(IFIRST(header), end);
+	if (!error)
+		error = ext4_xattr_list_entries(dentry, IFIRST(header),
+						buffer, buffer_size);
+
+	brelse(iloc.bh);
+	return error;
+}
+
+/*
+ * ext4_xattr_list()
+ *
+ * Produce the list of attribute names for the inode behind @dentry, or,
+ * when the buffer is NULL, report how large a buffer would be needed.
+ *
+ * Names from the inode body come first, followed by those from the
+ * external xattr block.  Both steps run under the inode's xattr_sem
+ * held for reading.
+ *
+ * Returns a negative error number on failure, or the number of bytes
+ * used / required on success.
+ */
+static int
+ext4_xattr_list(struct dentry *dentry, char *buffer, size_t buffer_size)
+{
+	struct ext4_inode_info *ei = EXT4_I(dentry->d_inode);
+	int ibody_len, block_len, ret;
+
+	down_read(&ei->xattr_sem);
+
+	ibody_len = ext4_xattr_ibody_list(dentry, buffer, buffer_size);
+	ret = ibody_len;
+	if (ibody_len >= 0) {
+		/* Continue right behind what the inode body produced. */
+		if (buffer) {
+			buffer += ibody_len;
+			buffer_size -= ibody_len;
+		}
+		block_len = ext4_xattr_block_list(dentry, buffer, buffer_size);
+		if (block_len < 0)
+			ret = block_len;
+		else
+			ret = block_len + ibody_len;
+	}
+
+	up_read(&ei->xattr_sem);
+	return ret;
+}
+
+/*
+ * Make sure the EXT4_FEATURE_COMPAT_EXT_ATTR feature flag is set on the
+ * file system.  Nothing happens if it is already set, or if journal
+ * write access to the superblock buffer cannot be obtained.
+ */
+static void ext4_xattr_update_super_block(handle_t *handle,
+					  struct super_block *sb)
+{
+	if (EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_EXT_ATTR))
+		return;
+
+	if (ext4_journal_get_write_access(handle, EXT4_SB(sb)->s_sbh) != 0)
+		return;
+
+	EXT4_SET_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_EXT_ATTR);
+	ext4_handle_dirty_super(handle, sb);
+}
+
+/*
+ * ext4_xattr_release_block()
+ *
+ * Release the xattr block BH: If the reference count is > 1, decrement
+ * it; otherwise free the block.
+ *
+ * @handle: journal handle the modification is made under
+ * @inode:  inode that is giving up its reference to the block
+ * @bh:     buffer of the attribute block
+ *
+ * Errors are reported through ext4_std_error(); nothing is returned.
+ * The mb_cache entry looked up for the block is always either freed or
+ * released before returning, including when journal write access to the
+ * buffer cannot be obtained.
+ */
+static void
+ext4_xattr_release_block(handle_t *handle, struct inode *inode,
+			 struct buffer_head *bh)
+{
+	struct mb_cache_entry *ce = NULL;
+	int error = 0;
+
+	ce = mb_cache_entry_get(ext4_xattr_cache, bh->b_bdev, bh->b_blocknr);
+	error = ext4_journal_get_write_access(handle, bh);
+	if (error)
+		goto out;
+
+	lock_buffer(bh);
+	if (BHDR(bh)->h_refcount == cpu_to_le32(1)) {
+		/* Last reference: drop the cache entry and free the block. */
+		ea_bdebug(bh, "refcount now=0; freeing");
+		if (ce) {
+			mb_cache_entry_free(ce);
+			ce = NULL;
+		}
+		get_bh(bh);
+		ext4_free_blocks(handle, inode, bh, 0, 1,
+				 EXT4_FREE_BLOCKS_METADATA |
+				 EXT4_FREE_BLOCKS_FORGET);
+		unlock_buffer(bh);
+	} else {
+		/* Block stays shared: only this inode's reference goes away. */
+		le32_add_cpu(&BHDR(bh)->h_refcount, -1);
+		if (ce) {
+			mb_cache_entry_release(ce);
+			ce = NULL;
+		}
+		unlock_buffer(bh);
+		error = ext4_handle_dirty_metadata(handle, inode, bh);
+		if (IS_SYNC(inode))
+			ext4_handle_sync(handle);
+		dquot_free_block(inode, 1);
+		ea_bdebug(bh, "refcount now=%d; releasing",
+			  le32_to_cpu(BHDR(bh)->h_refcount));
+	}
+out:
+	/* Only still set when we bailed out before touching the block. */
+	if (ce)
+		mb_cache_entry_release(ce);
+	ext4_std_error(inode->i_sb, error);
+	return;
+}
+
+/*
+ * ext4_xattr_free_space()
+ *
+ * Walk the entry table starting at @last and work out how much room is
+ * left between the end of the table and the lowest value offset.
+ *
+ * @last:     first entry to examine
+ * @min_offs: in: upper bound for value offsets; out: lowest value offset
+ *            seen (unchanged if no entry stores a value)
+ * @base:     address the entry positions are measured from
+ * @total:    incremented by the entry length of every entry visited
+ *
+ * Returns the number of free bytes.
+ */
+static size_t ext4_xattr_free_space(struct ext4_xattr_entry *last,
+				    size_t *min_offs, void *base, int *total)
+{
+	while (!IS_LAST_ENTRY(last)) {
+		*total += EXT4_XATTR_LEN(last->e_name_len);
+		if (!last->e_value_block && last->e_value_size) {
+			size_t value_offs = le16_to_cpu(last->e_value_offs);
+
+			if (value_offs < *min_offs)
+				*min_offs = value_offs;
+		}
+		last = EXT4_XATTR_NEXT(last);
+	}
+
+	/* Four bytes stay reserved behind the last entry. */
+	return (*min_offs - ((void *)last - base) - sizeof(__u32));
+}
+
+/* Describes the attribute an xattr set/remove operation acts on. */
+struct ext4_xattr_info {
+ int name_index; /* stored in the entry's e_name_index */
+ const char *name; /* NUL-terminated name; strlen() is taken of it */
+ const void *value; /* new value; NULL requests removal */
+ size_t value_len; /* number of bytes at value */
+};
+
+/*
+ * Describes one attribute area (inode body or attribute block) and the
+ * outcome of looking up a name in it.
+ */
+struct ext4_xattr_search {
+ struct ext4_xattr_entry *first; /* first entry of the entry table */
+ void *base; /* address that e_value_offs is added to */
+ void *end; /* end of the area */
+ struct ext4_xattr_entry *here; /* matching entry, or insert position */
+ int not_found; /* 0 if found, otherwise -ENODATA */
+};
+
+/*
+ * ext4_xattr_set_entry()
+ *
+ * Create, replace or remove the attribute described by @i inside the
+ * attribute area described by @s.  When s->not_found is 0, s->here is the
+ * existing entry; otherwise it is the position the new entry is inserted
+ * at.  A NULL i->value removes the entry.  Values are placed at
+ * s->base + e_value_offs, with new values going directly below the lowest
+ * existing value.
+ *
+ * NOTE(review): the removal path dereferences s->here unconditionally, so
+ * callers are presumably expected not to ask for removal of an attribute
+ * that was not found -- confirm against the callers.
+ *
+ * Returns 0 on success, or -ENOSPC if the new name and value do not fit.
+ */
+static int
+ext4_xattr_set_entry(struct ext4_xattr_info *i, struct ext4_xattr_search *s)
+{
+ struct ext4_xattr_entry *last;
+ size_t free, min_offs = s->end - s->base, name_len = strlen(i->name);
+
+ /* Compute min_offs and last. */
+ last = s->first;
+ for (; !IS_LAST_ENTRY(last); last = EXT4_XATTR_NEXT(last)) {
+ if (!last->e_value_block && last->e_value_size) {
+ size_t offs = le16_to_cpu(last->e_value_offs);
+ if (offs < min_offs)
+ min_offs = offs;
+ }
+ }
+ /* Gap between the end of the entry table and the lowest value. */
+ free = min_offs - ((void *)last - s->base) - sizeof(__u32);
+ if (!s->not_found) {
+ /* Space held by the existing entry and value gets reused. */
+ if (!s->here->e_value_block && s->here->e_value_size) {
+ size_t size = le32_to_cpu(s->here->e_value_size);
+ free += EXT4_XATTR_SIZE(size);
+ }
+ free += EXT4_XATTR_LEN(name_len);
+ }
+ if (i->value) {
+ if (free < EXT4_XATTR_SIZE(i->value_len) ||
+ free < EXT4_XATTR_LEN(name_len) +
+ EXT4_XATTR_SIZE(i->value_len))
+ return -ENOSPC;
+ }
+
+ if (i->value && s->not_found) {
+ /* Insert the new name. */
+ size_t size = EXT4_XATTR_LEN(name_len);
+ /* Everything from here to the end of the table, plus 4 bytes. */
+ size_t rest = (void *)last - (void *)s->here + sizeof(__u32);
+ memmove((void *)s->here + size, s->here, rest);
+ memset(s->here, 0, size);
+ s->here->e_name_index = i->name_index;
+ s->here->e_name_len = name_len;
+ memcpy(s->here->e_name, i->name, name_len);
+ } else {
+ if (!s->here->e_value_block && s->here->e_value_size) {
+ void *first_val = s->base + min_offs;
+ size_t offs = le16_to_cpu(s->here->e_value_offs);
+ void *val = s->base + offs;
+ size_t size = EXT4_XATTR_SIZE(
+ le32_to_cpu(s->here->e_value_size));
+
+ if (i->value && size == EXT4_XATTR_SIZE(i->value_len)) {
+ /* The old and the new value have the same
+ size. Just replace. */
+ s->here->e_value_size =
+ cpu_to_le32(i->value_len);
+ memset(val + size - EXT4_XATTR_PAD, 0,
+ EXT4_XATTR_PAD); /* Clear pad bytes. */
+ memcpy(val, i->value, i->value_len);
+ return 0;
+ }
+
+ /* Remove the old value. */
+ /* Values stored below it slide up by size bytes. */
+ memmove(first_val + size, first_val, val - first_val);
+ memset(first_val, 0, size);
+ s->here->e_value_size = 0;
+ s->here->e_value_offs = 0;
+ min_offs += size;
+
+ /* Adjust all value offsets. */
+ last = s->first;
+ while (!IS_LAST_ENTRY(last)) {
+ size_t o = le16_to_cpu(last->e_value_offs);
+ if (!last->e_value_block &&
+ last->e_value_size && o < offs)
+ last->e_value_offs =
+ cpu_to_le16(o + size);
+ last = EXT4_XATTR_NEXT(last);
+ }
+ }
+ if (!i->value) {
+ /* Remove the old name. */
+ size_t size = EXT4_XATTR_LEN(name_len);
+ /* last becomes where the table will end after the removal. */
+ last = ENTRY((void *)last - size);
+ memmove(s->here, (void *)s->here + size,
+ (void *)last - (void *)s->here + sizeof(__u32));
+ memset(last, 0, size);
+ }
+ }
+
+ if (i->value) {
+ /* Insert the new value. */
+ s->here->e_value_size = cpu_to_le32(i->value_len);
+ if (i->value_len) {
+ size_t size = EXT4_XATTR_SIZE(i->value_len);
+ /* The new value goes directly below the lowest one. */
+ void *val = s->base + min_offs - size;
+ s->here->e_value_offs = cpu_to_le16(min_offs - size);
+ memset(val + size - EXT4_XATTR_PAD, 0,
+ EXT4_XATTR_PAD); /* Clear the pad bytes. */
+ memcpy(val, i->value, i->value_len);
+ }
+ }
+ return 0;
+}
+
+/* Search state for the external attribute block of an inode. */
+struct ext4_xattr_block_find {
+ struct ext4_xattr_search s; /* lookup result within the block */
+ struct buffer_head *bh; /* buffer read from i_file_acl, or NULL */
+};
+
+/*
+ * ext4_xattr_block_find()
+ *
+ * Read the inode's attribute block, if it has one, and look up the
+ * attribute named by @i in it.  On success bs->s describes the block and
+ * bs->s.not_found is 0 or -ENODATA.  bs->bh is left set for the caller
+ * whenever the block could be read, including when -EIO is returned
+ * because the block failed validation.
+ *
+ * Returns 0 on success (including "no attribute block"), or a negative
+ * error number.
+ */
+static int
+ext4_xattr_block_find(struct inode *inode, struct ext4_xattr_info *i,
+		      struct ext4_xattr_block_find *bs)
+{
+	struct super_block *sb = inode->i_sb;
+	struct ext4_xattr_search *s = &bs->s;
+	int ret;
+
+	ea_idebug(inode, "name=%d.%s, value=%p, value_len=%ld",
+		  i->name_index, i->name, i->value, (long)i->value_len);
+
+	/* Without an attribute block there is nothing to search. */
+	if (!EXT4_I(inode)->i_file_acl)
+		return 0;
+
+	bs->bh = sb_bread(sb, EXT4_I(inode)->i_file_acl);
+	if (!bs->bh)
+		return -EIO;
+
+	ea_bdebug(bs->bh, "b_count=%d, refcount=%d",
+		  atomic_read(&(bs->bh->b_count)),
+		  le32_to_cpu(BHDR(bs->bh)->h_refcount));
+	if (ext4_xattr_check_block(bs->bh)) {
+		EXT4_ERROR_INODE(inode, "bad block %llu",
+				 EXT4_I(inode)->i_file_acl);
+		return -EIO;
+	}
+
+	/* Describe the block and look for the named attribute in it. */
+	s->base = BHDR(bs->bh);
+	s->first = BFIRST(bs->bh);
+	s->end = bs->bh->b_data + bs->bh->b_size;
+	s->here = s->first;
+	ret = ext4_xattr_find_entry(&s->here, i->name_index,
+				    i->name, bs->bh->b_size, 1);
+	if (ret && ret != -ENODATA)
+		return ret;
+
+	s->not_found = ret;
+	return 0;
+}
+
+/*
+ * ext4_xattr_block_set()
+ *
+ * Apply the change described by @i to the inode's external attribute
+ * block, using the search state in @bs (as filled in by
+ * ext4_xattr_block_find()).
+ *
+ * Depending on the state of the existing block this either modifies it
+ * in place (h_refcount == 1), works on a private kmalloc'ed copy of it
+ * (h_refcount != 1), or builds a fresh block image (no block yet).  The
+ * result is then either matched against an identical block found through
+ * ext4_xattr_cache_find(), kept where it is, or written to a newly
+ * allocated block.  Finally i_file_acl is updated and the reference to
+ * the previous block, if it changed, is released.
+ *
+ * Returns 0 on success or a negative error number.
+ */
+static int
+ext4_xattr_block_set(handle_t *handle, struct inode *inode,
+ struct ext4_xattr_info *i,
+ struct ext4_xattr_block_find *bs)
+{
+ struct super_block *sb = inode->i_sb;
+ struct buffer_head *new_bh = NULL;
+ struct ext4_xattr_search *s = &bs->s;
+ struct mb_cache_entry *ce = NULL;
+ int error = 0;
+
+#define header(x) ((struct ext4_xattr_header *)(x))
+
+ if (i->value && i->value_len > sb->s_blocksize)
+ return -ENOSPC;
+ if (s->base) {
+ /* The inode already has an attribute block. */
+ ce = mb_cache_entry_get(ext4_xattr_cache, bs->bh->b_bdev,
+ bs->bh->b_blocknr);
+ error = ext4_journal_get_write_access(handle, bs->bh);
+ if (error)
+ goto cleanup;
+ lock_buffer(bs->bh);
+
+ if (header(s->base)->h_refcount == cpu_to_le32(1)) {
+ /* Sole user of the block: drop the cache entry and edit in place. */
+ if (ce) {
+ mb_cache_entry_free(ce);
+ ce = NULL;
+ }
+ ea_bdebug(bs->bh, "modifying in-place");
+ error = ext4_xattr_set_entry(i, s);
+ if (!error) {
+ if (!IS_LAST_ENTRY(s->first))
+ ext4_xattr_rehash(header(s->base),
+ s->here);
+ ext4_xattr_cache_insert(bs->bh);
+ }
+ unlock_buffer(bs->bh);
+ if (error == -EIO)
+ goto bad_block;
+ if (!error)
+ error = ext4_handle_dirty_metadata(handle,
+ inode,
+ bs->bh);
+ if (error)
+ goto cleanup;
+ goto inserted;
+ } else {
+ /* Remember where s->here sits so it can be redirected into the copy. */
+ int offset = (char *)s->here - bs->bh->b_data;
+
+ unlock_buffer(bs->bh);
+ ext4_handle_release_buffer(handle, bs->bh);
+ if (ce) {
+ mb_cache_entry_release(ce);
+ ce = NULL;
+ }
+ ea_bdebug(bs->bh, "cloning");
+ /* Work on a private copy; the original block is left untouched. */
+ s->base = kmalloc(bs->bh->b_size, GFP_NOFS);
+ error = -ENOMEM;
+ if (s->base == NULL)
+ goto cleanup;
+ memcpy(s->base, BHDR(bs->bh), bs->bh->b_size);
+ s->first = ENTRY(header(s->base)+1);
+ header(s->base)->h_refcount = cpu_to_le32(1);
+ s->here = ENTRY(s->base + offset);
+ s->end = s->base + bs->bh->b_size;
+ }
+ } else {
+ /* Allocate a buffer where we construct the new block. */
+ s->base = kzalloc(sb->s_blocksize, GFP_NOFS);
+ /* assert(header == s->base) */
+ error = -ENOMEM;
+ if (s->base == NULL)
+ goto cleanup;
+ header(s->base)->h_magic = cpu_to_le32(EXT4_XATTR_MAGIC);
+ header(s->base)->h_blocks = cpu_to_le32(1);
+ header(s->base)->h_refcount = cpu_to_le32(1);
+ s->first = ENTRY(header(s->base)+1);
+ s->here = ENTRY(header(s->base)+1);
+ s->end = s->base + sb->s_blocksize;
+ }
+
+ /* Apply the change to the private copy / fresh block image. */
+ error = ext4_xattr_set_entry(i, s);
+ if (error == -EIO)
+ goto bad_block;
+ if (error)
+ goto cleanup;
+ if (!IS_LAST_ENTRY(s->first))
+ ext4_xattr_rehash(header(s->base), s->here);
+
+inserted:
+ /* An empty entry table means no block is needed: new_bh stays NULL. */
+ if (!IS_LAST_ENTRY(s->first)) {
+ new_bh = ext4_xattr_cache_find(inode, header(s->base), &ce);
+ if (new_bh) {
+ /* We found an identical block in the cache. */
+ if (new_bh == bs->bh)
+ ea_bdebug(new_bh, "keeping");
+ else {
+ /* The old block is released after updating
+ the inode. */
+ error = dquot_alloc_block(inode, 1);
+ if (error)
+ goto cleanup;
+ error = ext4_journal_get_write_access(handle,
+ new_bh);
+ if (error)
+ goto cleanup_dquot;
+ lock_buffer(new_bh);
+ le32_add_cpu(&BHDR(new_bh)->h_refcount, 1);
+ ea_bdebug(new_bh, "reusing; refcount now=%d",
+ le32_to_cpu(BHDR(new_bh)->h_refcount));
+ unlock_buffer(new_bh);
+ error = ext4_handle_dirty_metadata(handle,
+ inode,
+ new_bh);
+ if (error)
+ goto cleanup_dquot;
+ }
+ mb_cache_entry_release(ce);
+ ce = NULL;
+ } else if (bs->bh && s->base == bs->bh->b_data) {
+ /* We were modifying this block in-place. */
+ ea_bdebug(bs->bh, "keeping this block");
+ new_bh = bs->bh;
+ /* Extra reference so the brelse(new_bh) in cleanup is balanced. */
+ get_bh(new_bh);
+ } else {
+ /* We need to allocate a new block */
+ ext4_fsblk_t goal, block;
+
+ goal = ext4_group_first_block_no(sb,
+ EXT4_I(inode)->i_block_group);
+
+ /* non-extent files can't have physical blocks past 2^32 */
+ if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)))
+ goal = goal & EXT4_MAX_BLOCK_FILE_PHYS;
+
+ /*
+ * take i_data_sem because we will test
+ * i_delalloc_reserved_flag in ext4_mb_new_blocks
+ */
+ down_read((&EXT4_I(inode)->i_data_sem));
+ block = ext4_new_meta_blocks(handle, inode, goal, 0,
+ NULL, &error);
+ up_read((&EXT4_I(inode)->i_data_sem));
+ if (error)
+ goto cleanup;
+
+ if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)))
+ BUG_ON(block > EXT4_MAX_BLOCK_FILE_PHYS);
+
+ ea_idebug(inode, "creating block %llu",
+ (unsigned long long)block);
+
+ new_bh = sb_getblk(sb, block);
+ if (!new_bh) {
+ /* Also reached by goto when create access is refused below. */
+getblk_failed:
+ ext4_free_blocks(handle, inode, NULL, block, 1,
+ EXT4_FREE_BLOCKS_METADATA);
+ error = -EIO;
+ goto cleanup;
+ }
+ lock_buffer(new_bh);
+ error = ext4_journal_get_create_access(handle, new_bh);
+ if (error) {
+ unlock_buffer(new_bh);
+ goto getblk_failed;
+ }
+ memcpy(new_bh->b_data, s->base, new_bh->b_size);
+ set_buffer_uptodate(new_bh);
+ unlock_buffer(new_bh);
+ ext4_xattr_cache_insert(new_bh);
+ error = ext4_handle_dirty_metadata(handle,
+ inode, new_bh);
+ if (error)
+ goto cleanup;
+ }
+ }
+
+ /* Update the inode. */
+ EXT4_I(inode)->i_file_acl = new_bh ? new_bh->b_blocknr : 0;
+
+ /* Drop the previous xattr block. */
+ if (bs->bh && bs->bh != new_bh)
+ ext4_xattr_release_block(handle, inode, bs->bh);
+ error = 0;
+
+cleanup:
+ if (ce)
+ mb_cache_entry_release(ce);
+ brelse(new_bh);
+ /* s->base is heap memory unless it points into bs->bh itself. */
+ if (!(bs->bh && s->base == bs->bh->b_data))
+ kfree(s->base);
+
+ return error;
+
+cleanup_dquot:
+ /* Undo the dquot_alloc_block() done for the shared block. */
+ dquot_free_block(inode, 1);
+ goto cleanup;
+
+bad_block:
+ EXT4_ERROR_INODE(inode, "bad block %llu",
+ EXT4_I(inode)->i_file_acl);
+ goto cleanup;
+
+#undef header
+}
+
+/* Search state for the attributes stored in the inode body. */
+struct ext4_xattr_ibody_find {
+ struct ext4_xattr_search s; /* lookup result within the inode body */
+ struct ext4_iloc iloc; /* location of the raw on-disk inode */
+};
+
+/*
+ * ext4_xattr_ibody_find()
+ *
+ * Set up is->s to describe the in-inode attribute area of the raw inode
+ * referenced by is->iloc and, if the inode has in-inode attributes, look
+ * up the attribute named by @i.  is->s.not_found is only written when
+ * that lookup is performed.
+ *
+ * Returns 0 on success (also when the inode has no extra space at all),
+ * or a negative error number.
+ */
+static int
+ext4_xattr_ibody_find(struct inode *inode, struct ext4_xattr_info *i,
+		      struct ext4_xattr_ibody_find *is)
+{
+	struct ext4_xattr_search *s = &is->s;
+	struct ext4_xattr_ibody_header *header;
+	struct ext4_inode *raw_inode;
+	int ret;
+
+	if (EXT4_I(inode)->i_extra_isize == 0)
+		return 0;
+
+	raw_inode = ext4_raw_inode(&is->iloc);
+	header = IHDR(inode, raw_inode);
+
+	/* In the inode body the value offsets count from the first entry. */
+	s->base = s->first = IFIRST(header);
+	s->here = s->first;
+	s->end = (void *)raw_inode + EXT4_SB(inode->i_sb)->s_inode_size;
+
+	if (!ext4_test_inode_state(inode, EXT4_STATE_XATTR))
+		return 0;
+
+	ret = ext4_xattr_check_names(IFIRST(header), s->end);
+	if (ret)
+		return ret;
+
+	/* Find the named attribute. */
+	ret = ext4_xattr_find_entry(&s->here, i->name_index, i->name,
+				    s->end - (void *)s->base, 0);
+	if (ret && ret != -ENODATA)
+		return ret;
+
+	s->not_found = ret;
+	return 0;
+}
+
+/*
+ * ext4_xattr_ibody_set()
+ *
+ * Apply the change described by @i to the in-inode attribute area, then
+ * bring the in-inode header magic and the EXT4_STATE_XATTR inode state
+ * in line with whether any entry is left.
+ *
+ * Returns 0 on success, -ENOSPC if the inode has no extra space, or the
+ * error reported by ext4_xattr_set_entry().
+ */
+static int
+ext4_xattr_ibody_set(handle_t *handle, struct inode *inode,
+		     struct ext4_xattr_info *i,
+		     struct ext4_xattr_ibody_find *is)
+{
+	struct ext4_xattr_search *search = &is->s;
+	struct ext4_xattr_ibody_header *hdr;
+	int ret;
+
+	if (EXT4_I(inode)->i_extra_isize == 0)
+		return -ENOSPC;
+
+	ret = ext4_xattr_set_entry(i, search);
+	if (ret)
+		return ret;
+
+	hdr = IHDR(inode, ext4_raw_inode(&is->iloc));
+	if (IS_LAST_ENTRY(search->first)) {
+		/* The last in-inode attribute is gone. */
+		hdr->h_magic = cpu_to_le32(0);
+		ext4_clear_inode_state(inode, EXT4_STATE_XATTR);
+	} else {
+		hdr->h_magic = cpu_to_le32(EXT4_XATTR_MAGIC);
+		ext4_set_inode_state(inode, EXT4_STATE_XATTR);
+	}
+	return 0;
+}
+
+/*
+ * ext4_xattr_set_handle()
+ *
+ * Create, replace or remove an extended attribute for this inode. Value
+ * is NULL to remove an existing extended attribute, and non-NULL to
+ * either replace an existing extended attribute, or create a new extended
+ * attribute. The flags XATTR_REPLACE and XATTR_CREATE
+ * specify that an extended attribute must exist and must not exist
+ * previous to the call, respectively.
+ *
+ * The attribute is looked up in the inode body first and, only if it is
+ * not found there, in the external attribute block.  New values are
+ * placed in the inode body when they fit and in the block otherwise; a
+ * stale copy in the other location is removed.  The whole operation runs
+ * with xattr_sem held for writing.
+ *
+ * Returns 0, or a negative error number on failure:
+ *   -EINVAL  name is NULL
+ *   -ERANGE  name is longer than 255 bytes
+ *   -ENODATA XATTR_REPLACE was given but the attribute does not exist
+ *   -EEXIST  XATTR_CREATE was given but the attribute already exists
+ * Errors from the helpers called here are passed through as well.
+ */
+int
+ext4_xattr_set_handle(handle_t *handle, struct inode *inode, int name_index,
+ const char *name, const void *value, size_t value_len,
+ int flags)
+{
+ struct ext4_xattr_info i = {
+ .name_index = name_index,
+ .name = name,
+ .value = value,
+ .value_len = value_len,
+
+ };
+ struct ext4_xattr_ibody_find is = {
+ .s = { .not_found = -ENODATA, },
+ };
+ struct ext4_xattr_block_find bs = {
+ .s = { .not_found = -ENODATA, },
+ };
+ unsigned long no_expand;
+ int error;
+
+ if (!name)
+ return -EINVAL;
+ if (strlen(name) > 255)
+ return -ERANGE;
+ down_write(&EXT4_I(inode)->xattr_sem);
+ /* Remember the old NO_EXPAND state; cleanup clears the flag only if it was clear. */
+ no_expand = ext4_test_inode_state(inode, EXT4_STATE_NO_EXPAND);
+ ext4_set_inode_state(inode, EXT4_STATE_NO_EXPAND);
+
+ error = ext4_reserve_inode_write(handle, inode, &is.iloc);
+ if (error)
+ goto cleanup;
+
+ if (ext4_test_inode_state(inode, EXT4_STATE_NEW)) {
+ /* A new inode: start from an all-zero raw inode. */
+ struct ext4_inode *raw_inode = ext4_raw_inode(&is.iloc);
+ memset(raw_inode, 0, EXT4_SB(inode->i_sb)->s_inode_size);
+ ext4_clear_inode_state(inode, EXT4_STATE_NEW);
+ }
+
+ /* Look in the inode body first, then in the attribute block. */
+ error = ext4_xattr_ibody_find(inode, &i, &is);
+ if (error)
+ goto cleanup;
+ if (is.s.not_found)
+ error = ext4_xattr_block_find(inode, &i, &bs);
+ if (error)
+ goto cleanup;
+ if (is.s.not_found && bs.s.not_found) {
+ error = -ENODATA;
+ if (flags & XATTR_REPLACE)
+ goto cleanup;
+ /* Removing an attribute that does not exist is a successful no-op. */
+ error = 0;
+ if (!value)
+ goto cleanup;
+ } else {
+ error = -EEXIST;
+ if (flags & XATTR_CREATE)
+ goto cleanup;
+ }
+ if (!value) {
+ /* Remove the attribute from wherever it was found. */
+ if (!is.s.not_found)
+ error = ext4_xattr_ibody_set(handle, inode, &i, &is);
+ else if (!bs.s.not_found)
+ error = ext4_xattr_block_set(handle, inode, &i, &bs);
+ } else {
+ /* Prefer the inode body for the new value. */
+ error = ext4_xattr_ibody_set(handle, inode, &i, &is);
+ if (!error && !bs.s.not_found) {
+ /* It now lives in the inode; remove the old copy from the block. */
+ i.value = NULL;
+ error = ext4_xattr_block_set(handle, inode, &i, &bs);
+ } else if (error == -ENOSPC) {
+ /* No room in the inode body: store it in the block instead. */
+ if (EXT4_I(inode)->i_file_acl && !bs.s.base) {
+ /* The block was not searched above because the name was found in the inode. */
+ error = ext4_xattr_block_find(inode, &i, &bs);
+ if (error)
+ goto cleanup;
+ }
+ error = ext4_xattr_block_set(handle, inode, &i, &bs);
+ if (error)
+ goto cleanup;
+ if (!is.s.not_found) {
+ /* Remove the old copy from the inode body. */
+ i.value = NULL;
+ error = ext4_xattr_ibody_set(handle, inode, &i,
+ &is);
+ }
+ }
+ }
+ if (!error) {
+ ext4_xattr_update_super_block(handle, inode->i_sb);
+ inode->i_ctime = ext4_current_time(inode);
+ if (!value)
+ ext4_clear_inode_state(inode, EXT4_STATE_NO_EXPAND);
+ error = ext4_mark_iloc_dirty(handle, inode, &is.iloc);
+ /*
+ * The bh is consumed by ext4_mark_iloc_dirty, even with
+ * error != 0.
+ */
+ is.iloc.bh = NULL;
+ if (IS_SYNC(inode))
+ ext4_handle_sync(handle);
+ }
+
+cleanup:
+ brelse(is.iloc.bh);
+ brelse(bs.bh);
+ if (no_expand == 0)
+ ext4_clear_inode_state(inode, EXT4_STATE_NO_EXPAND);
+ up_write(&EXT4_I(inode)->xattr_sem);
+ return error;
+}
+
+/*
+ * ext4_xattr_set()
+ *
+ * Same as ext4_xattr_set_handle(), except that the journal handle is
+ * started and stopped here, so the attribute change forms a transaction
+ * of its own.  The attempt is repeated while it fails with -ENOSPC and
+ * ext4_should_retry_alloc() asks for another try.
+ *
+ * Returns 0, or a negative error number on failure.  An error from
+ * stopping the journal handle is only reported if the attribute change
+ * itself succeeded.
+ */
+int
+ext4_xattr_set(struct inode *inode, int name_index, const char *name,
+	       const void *value, size_t value_len, int flags)
+{
+	handle_t *handle;
+	int set_err, stop_err, retries = 0;
+
+	for (;;) {
+		handle = ext4_journal_start(inode,
+				EXT4_DATA_TRANS_BLOCKS(inode->i_sb));
+		if (IS_ERR(handle))
+			return PTR_ERR(handle);
+
+		set_err = ext4_xattr_set_handle(handle, inode, name_index,
+						name, value, value_len, flags);
+		stop_err = ext4_journal_stop(handle);
+
+		if (set_err != -ENOSPC ||
+		    !ext4_should_retry_alloc(inode->i_sb, &retries))
+			break;
+	}
+
+	if (set_err == 0)
+		return stop_err;
+	return set_err;
+}
+
+/*
+ * ext4_xattr_shift_entries()
+ *
+ * Add @value_offs_shift to the value offset of every entry in the table
+ * starting at @entry that stores a value, then move @n bytes from @from
+ * to @to.  Used to make room in the inode for an increased
+ * i_extra_isize.
+ *
+ * Triggers BUG_ON() if an adjusted value would end beyond @blocksize.
+ */
+static void ext4_xattr_shift_entries(struct ext4_xattr_entry *entry,
+				     int value_offs_shift, void *to,
+				     void *from, size_t n, int blocksize)
+{
+	struct ext4_xattr_entry *cur;
+	int new_offs;
+
+	/* Adjust the value offsets of the entries */
+	cur = entry;
+	while (!IS_LAST_ENTRY(cur)) {
+		if (!cur->e_value_block && cur->e_value_size) {
+			new_offs = le16_to_cpu(cur->e_value_offs) +
+				   value_offs_shift;
+			BUG_ON(new_offs + le32_to_cpu(cur->e_value_size)
+			       > blocksize);
+			cur->e_value_offs = cpu_to_le16(new_offs);
+		}
+		cur = EXT4_XATTR_NEXT(cur);
+	}
+
+	/* Shift the entries by n bytes */
+	memmove(to, from, n);
+}
+
+/*
+ * ext4_expand_extra_isize_ea()
+ *
+ * Expand an inode by new_extra_isize bytes when EAs are present.
+ *
+ * If the in-inode attribute area has enough free space, the entries are
+ * simply shifted.  Otherwise in-inode attributes are moved, one per loop
+ * iteration, into the external attribute block until enough room has
+ * been made.  If that is not possible either, the expansion is retried
+ * once with the superblock's s_min_extra_isize as the target.
+ *
+ * @inode:           inode being expanded
+ * @new_extra_isize: requested i_extra_isize
+ * @raw_inode:       raw on-disk inode of @inode
+ * @handle:          journal handle the modifications are made under
+ *
+ * Runs with xattr_sem held for writing.  Every buffer reference and
+ * allocation taken here is dropped again on all exit paths, including
+ * the retry paths.
+ *
+ * Returns 0 on success or negative error number on failure.
+ */
+int ext4_expand_extra_isize_ea(struct inode *inode, int new_extra_isize,
+			       struct ext4_inode *raw_inode, handle_t *handle)
+{
+	struct ext4_xattr_ibody_header *header;
+	struct ext4_xattr_entry *entry, *last, *first;
+	struct buffer_head *bh = NULL;
+	struct ext4_xattr_ibody_find *is = NULL;
+	struct ext4_xattr_block_find *bs = NULL;
+	char *buffer = NULL, *b_entry_name = NULL;
+	size_t min_offs, free;
+	int total_ino, total_blk;
+	void *base, *end;
+	int extra_isize = 0, error = 0, tried_min_extra_isize = 0;
+	int s_min_extra_isize = le16_to_cpu(EXT4_SB(inode->i_sb)->s_es->s_min_extra_isize);
+
+	down_write(&EXT4_I(inode)->xattr_sem);
+retry:
+	/* Every jump to retry has released bh, is and bs beforehand. */
+	if (EXT4_I(inode)->i_extra_isize >= new_extra_isize) {
+		error = 0;
+		goto cleanup;
+	}
+
+	header = IHDR(inode, raw_inode);
+	entry = IFIRST(header);
+
+	/*
+	 * Check if enough free space is available in the inode to shift the
+	 * entries ahead by new_extra_isize.
+	 */
+
+	base = entry;
+	end = (void *)raw_inode + EXT4_SB(inode->i_sb)->s_inode_size;
+	min_offs = end - base;
+	last = entry;
+	total_ino = sizeof(struct ext4_xattr_ibody_header);
+
+	free = ext4_xattr_free_space(last, &min_offs, base, &total_ino);
+	if (free >= new_extra_isize) {
+		entry = IFIRST(header);
+		ext4_xattr_shift_entries(entry,	EXT4_I(inode)->i_extra_isize
+				- new_extra_isize, (void *)raw_inode +
+				EXT4_GOOD_OLD_INODE_SIZE + new_extra_isize,
+				(void *)header, total_ino,
+				inode->i_sb->s_blocksize);
+		EXT4_I(inode)->i_extra_isize = new_extra_isize;
+		error = 0;
+		goto cleanup;
+	}
+
+	/*
+	 * Enough free space isn't available in the inode, check if
+	 * EA block can hold new_extra_isize bytes.
+	 */
+	if (EXT4_I(inode)->i_file_acl) {
+		bh = sb_bread(inode->i_sb, EXT4_I(inode)->i_file_acl);
+		error = -EIO;
+		if (!bh)
+			goto cleanup;
+		if (ext4_xattr_check_block(bh)) {
+			EXT4_ERROR_INODE(inode, "bad block %llu",
+					 EXT4_I(inode)->i_file_acl);
+			error = -EIO;
+			goto cleanup;
+		}
+		base = BHDR(bh);
+		first = BFIRST(bh);
+		end = bh->b_data + bh->b_size;
+		min_offs = end - base;
+		/*
+		 * ext4_xattr_free_space() accumulates into *total, so the
+		 * counter must start out initialized even though the sum
+		 * is not used for the block.
+		 */
+		total_blk = 0;
+		free = ext4_xattr_free_space(first, &min_offs, base,
+					     &total_blk);
+		if (free < new_extra_isize) {
+			if (!tried_min_extra_isize && s_min_extra_isize) {
+				tried_min_extra_isize++;
+				new_extra_isize = s_min_extra_isize;
+				/* Clear bh so cleanup cannot release it twice. */
+				brelse(bh);
+				bh = NULL;
+				goto retry;
+			}
+			error = -1;
+			goto cleanup;
+		}
+	} else {
+		free = inode->i_sb->s_blocksize;
+	}
+
+	while (new_extra_isize > 0) {
+		size_t offs, size, entry_size;
+		struct ext4_xattr_entry *small_entry = NULL;
+		struct ext4_xattr_info i = {
+			.value = NULL,
+			.value_len = 0,
+		};
+		unsigned int total_size;  /* EA entry size + value size */
+		unsigned int shift_bytes; /* No. of bytes to shift EAs by? */
+		unsigned int min_total_size = ~0U;
+
+		is = kzalloc(sizeof(struct ext4_xattr_ibody_find), GFP_NOFS);
+		bs = kzalloc(sizeof(struct ext4_xattr_block_find), GFP_NOFS);
+		if (!is || !bs) {
+			error = -ENOMEM;
+			goto cleanup;
+		}
+
+		is->s.not_found = -ENODATA;
+		bs->s.not_found = -ENODATA;
+		is->iloc.bh = NULL;
+		bs->bh = NULL;
+
+		last = IFIRST(header);
+		/* Find the entry best suited to be pushed into EA block */
+		entry = NULL;
+		for (; !IS_LAST_ENTRY(last); last = EXT4_XATTR_NEXT(last)) {
+			total_size =
+			EXT4_XATTR_SIZE(le32_to_cpu(last->e_value_size)) +
+					EXT4_XATTR_LEN(last->e_name_len);
+			if (total_size <= free && total_size < min_total_size) {
+				if (total_size < new_extra_isize) {
+					small_entry = last;
+				} else {
+					entry = last;
+					min_total_size = total_size;
+				}
+			}
+		}
+
+		if (entry == NULL) {
+			if (small_entry) {
+				entry = small_entry;
+			} else {
+				if (!tried_min_extra_isize &&
+				    s_min_extra_isize) {
+					tried_min_extra_isize++;
+					new_extra_isize = s_min_extra_isize;
+					/*
+					 * Start over from a clean slate; the
+					 * retry path takes all of these again.
+					 */
+					kfree(is);
+					is = NULL;
+					kfree(bs);
+					bs = NULL;
+					brelse(bh);
+					bh = NULL;
+					goto retry;
+				}
+				error = -1;
+				goto cleanup;
+			}
+		}
+		offs = le16_to_cpu(entry->e_value_offs);
+		size = le32_to_cpu(entry->e_value_size);
+		entry_size = EXT4_XATTR_LEN(entry->e_name_len);
+		i.name_index = entry->e_name_index;
+		buffer = kmalloc(EXT4_XATTR_SIZE(size), GFP_NOFS);
+		b_entry_name = kmalloc(entry->e_name_len + 1, GFP_NOFS);
+		if (!buffer || !b_entry_name) {
+			error = -ENOMEM;
+			goto cleanup;
+		}
+		/* Save the entry name and the entry value */
+		memcpy(buffer, (void *)IFIRST(header) + offs,
+		       EXT4_XATTR_SIZE(size));
+		memcpy(b_entry_name, entry->e_name, entry->e_name_len);
+		b_entry_name[entry->e_name_len] = '\0';
+		i.name = b_entry_name;
+
+		error = ext4_get_inode_loc(inode, &is->iloc);
+		if (error)
+			goto cleanup;
+
+		error = ext4_xattr_ibody_find(inode, &i, is);
+		if (error)
+			goto cleanup;
+
+		/* Remove the chosen entry from the inode */
+		error = ext4_xattr_ibody_set(handle, inode, &i, is);
+		if (error)
+			goto cleanup;
+
+		entry = IFIRST(header);
+		if (entry_size + EXT4_XATTR_SIZE(size) >= new_extra_isize)
+			shift_bytes = new_extra_isize;
+		else
+			shift_bytes = entry_size + size;
+		/* Adjust the offsets and shift the remaining entries ahead */
+		ext4_xattr_shift_entries(entry, EXT4_I(inode)->i_extra_isize -
+			shift_bytes, (void *)raw_inode +
+			EXT4_GOOD_OLD_INODE_SIZE + extra_isize + shift_bytes,
+			(void *)header, total_ino - entry_size,
+			inode->i_sb->s_blocksize);
+
+		extra_isize += shift_bytes;
+		new_extra_isize -= shift_bytes;
+		EXT4_I(inode)->i_extra_isize = extra_isize;
+
+		i.name = b_entry_name;
+		i.value = buffer;
+		i.value_len = size;
+		error = ext4_xattr_block_find(inode, &i, bs);
+		if (error)
+			goto cleanup;
+
+		/* Add entry which was removed from the inode into the block */
+		error = ext4_xattr_block_set(handle, inode, &i, bs);
+		if (error)
+			goto cleanup;
+
+		/*
+		 * Release this iteration's resources and clear the pointers
+		 * so that neither the next iteration nor cleanup sees them.
+		 */
+		kfree(b_entry_name);
+		kfree(buffer);
+		b_entry_name = NULL;
+		buffer = NULL;
+		brelse(is->iloc.bh);
+		brelse(bs->bh);
+		kfree(is);
+		kfree(bs);
+		is = NULL;
+		bs = NULL;
+	}
+	error = 0;
+
+cleanup:
+	kfree(b_entry_name);
+	kfree(buffer);
+	if (is)
+		brelse(is->iloc.bh);
+	/* The buffer read by ext4_xattr_block_find() belongs to us. */
+	if (bs)
+		brelse(bs->bh);
+	kfree(is);
+	kfree(bs);
+	brelse(bh);
+	up_write(&EXT4_I(inode)->xattr_sem);
+	return error;
+}
+
+
+
+/*
+ * ext4_xattr_delete_inode()
+ *
+ * Give up the inode's reference to its external attribute block, if it
+ * has one, and clear i_file_acl.  Called immediately before an inode is
+ * freed, when we have exclusive access to the inode.
+ *
+ * A block that cannot be read, or whose header does not look like a
+ * single-block attribute header, is reported with EXT4_ERROR_INODE() and
+ * otherwise left alone.
+ */
+void
+ext4_xattr_delete_inode(handle_t *handle, struct inode *inode)
+{
+	struct buffer_head *bh;
+
+	if (!EXT4_I(inode)->i_file_acl)
+		return;
+
+	bh = sb_bread(inode->i_sb, EXT4_I(inode)->i_file_acl);
+	if (!bh) {
+		EXT4_ERROR_INODE(inode, "block %llu read error",
+				 EXT4_I(inode)->i_file_acl);
+		return;
+	}
+
+	if (BHDR(bh)->h_magic == cpu_to_le32(EXT4_XATTR_MAGIC) &&
+	    BHDR(bh)->h_blocks == cpu_to_le32(1)) {
+		ext4_xattr_release_block(handle, inode, bh);
+		EXT4_I(inode)->i_file_acl = 0;
+	} else {
+		EXT4_ERROR_INODE(inode, "bad block %llu",
+				 EXT4_I(inode)->i_file_acl);
+	}
+
+	brelse(bh);
+}
+
+/*
+ * ext4_xattr_put_super()
+ *
+ * Called when a file system is unmounted; hands the block device of
+ * the superblock to mb_cache_shrink().
+ */
+void
+ext4_xattr_put_super(struct super_block *sb)
+{
+	struct block_device *bdev = sb->s_bdev;
+
+	mb_cache_shrink(bdev);
+}
+
+/*
+ * ext4_xattr_cache_insert()
+ *
+ * Create a new entry in the extended attribute cache, and insert
+ * it unless such an entry is already in the cache.
+ *
+ * The entry is keyed by the block's device and block number and carries
+ * the block's h_hash.  This is best effort: nothing is returned, and
+ * failures (out of memory, entry already present) are only visible
+ * through ea_bdebug().
+ */
+static void
+ext4_xattr_cache_insert(struct buffer_head *bh)
+{
+	__u32 hash = le32_to_cpu(BHDR(bh)->h_hash);
+	struct mb_cache_entry *ce;
+	int error;
+
+	ce = mb_cache_entry_alloc(ext4_xattr_cache, GFP_NOFS);
+	if (!ce) {
+		ea_bdebug(bh, "out of memory");
+		return;
+	}
+	error = mb_cache_entry_insert(ce, bh->b_bdev, bh->b_blocknr, hash);
+	if (error) {
+		/* The entry did not make it into the cache; discard it. */
+		mb_cache_entry_free(ce);
+		if (error == -EBUSY) {
+			ea_bdebug(bh, "already in cache");
+		}
+	} else {
+		ea_bdebug(bh, "inserting [%x]", (int)hash);
+		mb_cache_entry_release(ce);
+	}
+}
+
+/*
+ * ext4_xattr_cmp()
+ *
+ * Walk the entry tables of two extended attribute blocks in lockstep
+ * and compare hash, name index, name, value size and value bytes of
+ * each pair of entries.
+ *
+ * Returns 0 if the blocks are equal, 1 if they differ, and -EIO if an
+ * otherwise matching pair of entries has a non-zero e_value_block.
+ */
+static int
+ext4_xattr_cmp(struct ext4_xattr_header *header1,
+	       struct ext4_xattr_header *header2)
+{
+	struct ext4_xattr_entry *a, *b;
+
+	for (a = ENTRY(header1+1), b = ENTRY(header2+1);
+	     !IS_LAST_ENTRY(a);
+	     a = EXT4_XATTR_NEXT(a), b = EXT4_XATTR_NEXT(b)) {
+		/* The second block has fewer entries than the first. */
+		if (IS_LAST_ENTRY(b))
+			return 1;
+
+		if (a->e_hash != b->e_hash)
+			return 1;
+		if (a->e_name_index != b->e_name_index)
+			return 1;
+		if (a->e_name_len != b->e_name_len)
+			return 1;
+		if (a->e_value_size != b->e_value_size)
+			return 1;
+		if (memcmp(a->e_name, b->e_name, a->e_name_len))
+			return 1;
+
+		if (a->e_value_block != 0 || b->e_value_block != 0)
+			return -EIO;
+
+		if (memcmp((char *)header1 + le16_to_cpu(a->e_value_offs),
+			   (char *)header2 + le16_to_cpu(b->e_value_offs),
+			   le32_to_cpu(a->e_value_size)))
+			return 1;
+	}
+
+	/* Equal only if the second table ends here as well. */
+	return IS_LAST_ENTRY(b) ? 0 : 1;
+}
+
+/*
+ * ext4_xattr_cache_find()
+ *
+ * Look through the cache entries filed under the block hash of @header
+ * for an on-disk attribute block with identical contents whose reference
+ * count is still below EXT4_XATTR_REFCOUNT_MAX.
+ *
+ * On a match the buffer of that block is returned and *pce is set to
+ * the matching cache entry.  Returns NULL if no such block was found,
+ * if an error occurred, or if the header's hash is 0.
+ */
+static struct buffer_head *
+ext4_xattr_cache_find(struct inode *inode, struct ext4_xattr_header *header,
+		      struct mb_cache_entry **pce)
+{
+	struct super_block *sb = inode->i_sb;
+	__u32 hash = le32_to_cpu(header->h_hash);
+	struct mb_cache_entry *ce;
+
+	if (!header->h_hash)
+		return NULL;  /* never share */
+	ea_idebug(inode, "looking for cached blocks [%x]", (int)hash);
+again:
+	for (ce = mb_cache_entry_find_first(ext4_xattr_cache, sb->s_bdev,
+					    hash);
+	     ce;
+	     ce = mb_cache_entry_find_next(ce, sb->s_bdev, hash)) {
+		struct buffer_head *bh;
+
+		if (IS_ERR(ce)) {
+			/* -EAGAIN restarts the search from the beginning. */
+			if (PTR_ERR(ce) == -EAGAIN)
+				goto again;
+			break;
+		}
+
+		bh = sb_bread(sb, ce->e_block);
+		if (!bh) {
+			EXT4_ERROR_INODE(inode, "block %lu read error",
+					 (unsigned long) ce->e_block);
+		} else if (le32_to_cpu(BHDR(bh)->h_refcount) >=
+				EXT4_XATTR_REFCOUNT_MAX) {
+			ea_idebug(inode, "block %lu refcount %d>=%d",
+				  (unsigned long) ce->e_block,
+				  le32_to_cpu(BHDR(bh)->h_refcount),
+					  EXT4_XATTR_REFCOUNT_MAX);
+		} else if (ext4_xattr_cmp(header, BHDR(bh)) == 0) {
+			*pce = ce;
+			return bh;
+		}
+
+		/* Not usable; drop the buffer and try the next candidate. */
+		brelse(bh);
+	}
+	return NULL;
+}
+
+#define NAME_HASH_SHIFT 5
+#define VALUE_HASH_SHIFT 16
+
+/*
+ * ext4_xattr_hash_entry()
+ *
+ * Compute the hash of an extended attribute and store it in e_hash.
+ * Each name byte is mixed in with a rotate by NAME_HASH_SHIFT; if the
+ * entry has a value stored at @header + e_value_offs, each of its 32-bit
+ * little-endian words (value size rounded up to the pad size) is mixed
+ * in with a rotate by VALUE_HASH_SHIFT.
+ */
+static inline void ext4_xattr_hash_entry(struct ext4_xattr_header *header,
+					 struct ext4_xattr_entry *entry)
+{
+	__u32 hash = 0;
+	char *name = entry->e_name;
+	int n;
+
+	for (n = 0; n < entry->e_name_len; n++) {
+		hash = (hash << NAME_HASH_SHIFT) ^
+		       (hash >> (8*sizeof(hash) - NAME_HASH_SHIFT)) ^
+		       name[n];
+	}
+
+	if (entry->e_value_block == 0 && entry->e_value_size != 0) {
+		__le32 *value = (__le32 *)((char *)header +
+			le16_to_cpu(entry->e_value_offs));
+		int words = (le32_to_cpu(entry->e_value_size) +
+			     EXT4_XATTR_ROUND) >> EXT4_XATTR_PAD_BITS;
+
+		for (n = 0; n < words; n++) {
+			hash = (hash << VALUE_HASH_SHIFT) ^
+			       (hash >> (8*sizeof(hash) - VALUE_HASH_SHIFT)) ^
+			       le32_to_cpu(value[n]);
+		}
+	}
+	entry->e_hash = cpu_to_le32(hash);
+}
+
+#undef NAME_HASH_SHIFT
+#undef VALUE_HASH_SHIFT
+
+#define BLOCK_HASH_SHIFT 16
+
+/*
+ * ext4_xattr_rehash()
+ *
+ * Recompute the hash of @entry after it has changed, then fold the
+ * hashes of all entries of the block into the header's h_hash.  If any
+ * entry has a hash of 0 the block hash is set to 0 as well.
+ */
+static void ext4_xattr_rehash(struct ext4_xattr_header *header,
+			      struct ext4_xattr_entry *entry)
+{
+	struct ext4_xattr_entry *cur;
+	__u32 block_hash = 0;
+
+	ext4_xattr_hash_entry(header, entry);
+
+	for (cur = ENTRY(header+1); !IS_LAST_ENTRY(cur);
+	     cur = EXT4_XATTR_NEXT(cur)) {
+		if (!cur->e_hash) {
+			/* Block is not shared if an entry's hash value == 0 */
+			block_hash = 0;
+			break;
+		}
+		block_hash = (block_hash << BLOCK_HASH_SHIFT) ^
+			     (block_hash >> (8*sizeof(block_hash) - BLOCK_HASH_SHIFT)) ^
+			     le32_to_cpu(cur->e_hash);
+	}
+	header->h_hash = cpu_to_le32(block_hash);
+}
+
+#undef BLOCK_HASH_SHIFT
+
+/*
+ * Create the "ext4_xattr" mb_cache used for attribute blocks.
+ * Returns 0 on success or -ENOMEM if the cache could not be created.
+ */
+int __init
+ext4_init_xattr(void)
+{
+	ext4_xattr_cache = mb_cache_create("ext4_xattr", 6);
+
+	return ext4_xattr_cache ? 0 : -ENOMEM;
+}
+
+/*
+ * Destroy the attribute block cache, if there is one, and forget it.
+ */
+void
+ext4_exit_xattr(void)
+{
+	if (!ext4_xattr_cache)
+		return;
+
+	mb_cache_destroy(ext4_xattr_cache);
+	ext4_xattr_cache = NULL;
+}
diff --git a/fs/ext4/xattr.h b/fs/ext4/xattr.h
new file mode 100644
index 00000000..25b7387f
--- /dev/null
+++ b/fs/ext4/xattr.h
@@ -0,0 +1,155 @@
+/*
+ File: fs/ext4/xattr.h
+
+ On-disk format of extended attributes for the ext4 filesystem.
+
+ (C) 2001 Andreas Gruenbacher, <a.gruenbacher@computer.org>
+*/
+
+#include <linux/xattr.h>
+
+/* Magic value in attribute blocks */
+#define EXT4_XATTR_MAGIC 0xEA020000
+
+/* Maximum number of references to one attribute block */
+#define EXT4_XATTR_REFCOUNT_MAX 1024
+
+/*
+ * Name indexes: small integers identifying the attribute namespace.  They
+ * are what the per-prefix handlers pass as the name_index argument of
+ * ext4_xattr_get()/ext4_xattr_set(), and what struct ext4_xattr_entry
+ * records in e_name_index.
+ */
+#define EXT4_XATTR_INDEX_USER 1
+#define EXT4_XATTR_INDEX_POSIX_ACL_ACCESS 2
+#define EXT4_XATTR_INDEX_POSIX_ACL_DEFAULT 3
+#define EXT4_XATTR_INDEX_TRUSTED 4
+#define EXT4_XATTR_INDEX_LUSTRE 5
+#define EXT4_XATTR_INDEX_SECURITY 6
+
+/*
+ * Header of an extended attribute block.  The __le32 fields are converted
+ * with le32_to_cpu()/cpu_to_le32() by the code that uses them.  The first
+ * attribute entry is expected directly after this structure (the rehash
+ * code in xattr.c starts walking at header + 1).
+ */
+struct ext4_xattr_header {
+ __le32 h_magic; /* magic number for identification */
+ __le32 h_refcount; /* reference count */
+ __le32 h_blocks; /* number of disk blocks used */
+ __le32 h_hash; /* hash value of all attributes */
+ __u32 h_reserved[4]; /* zero right now */
+};
+
+/*
+ * Header of the attribute area located inside the inode body.  IHDR()
+ * computes its address from a raw inode and IFIRST() yields the entry
+ * that immediately follows it.
+ */
+struct ext4_xattr_ibody_header {
+ __le32 h_magic; /* magic number for identification */
+};
+
+/*
+ * One attribute entry: the fixed fields below followed by e_name_len bytes
+ * of name.  The size of a whole entry is EXT4_XATTR_LEN(e_name_len), and
+ * EXT4_XATTR_NEXT() steps to the following entry on that basis.
+ */
+struct ext4_xattr_entry {
+ __u8 e_name_len; /* length of name */
+ __u8 e_name_index; /* attribute name index */
+ __le16 e_value_offs; /* offset in disk block of value */
+ __le32 e_value_block; /* disk block attribute is stored on (n/i) */
+ __le32 e_value_size; /* size of attribute value */
+ __le32 e_hash; /* hash value of name and value */
+ char e_name[0]; /* attribute name */
+};
+
+/*
+ * Alignment of entries and values: EXT4_XATTR_PAD is 1 << 2 = 4 bytes and
+ * EXT4_XATTR_ROUND (PAD - 1) is the mask used to round sizes up to it.
+ */
+#define EXT4_XATTR_PAD_BITS 2
+#define EXT4_XATTR_PAD (1<<EXT4_XATTR_PAD_BITS)
+#define EXT4_XATTR_ROUND (EXT4_XATTR_PAD-1)
+/*
+ * EXT4_XATTR_LEN(name_len): size in bytes of an entry whose name is
+ * name_len bytes long, i.e. sizeof(struct ext4_xattr_entry) plus the
+ * name, rounded up to a multiple of EXT4_XATTR_PAD.
+ */
+#define EXT4_XATTR_LEN(name_len) \
+ (((name_len) + EXT4_XATTR_ROUND + \
+ sizeof(struct ext4_xattr_entry)) & ~EXT4_XATTR_ROUND)
+/*
+ * EXT4_XATTR_NEXT(entry): pointer to the entry located
+ * EXT4_XATTR_LEN(entry->e_name_len) bytes after entry.
+ */
+#define EXT4_XATTR_NEXT(entry) \
+ ((struct ext4_xattr_entry *)( \
+ (char *)(entry) + EXT4_XATTR_LEN((entry)->e_name_len)))
+/* EXT4_XATTR_SIZE(size): size rounded up to a multiple of EXT4_XATTR_PAD. */
+#define EXT4_XATTR_SIZE(size) \
+ (((size) + EXT4_XATTR_ROUND) & ~EXT4_XATTR_ROUND)
+
+/*
+ * IHDR(inode, raw_inode): address of the in-inode attribute header, found
+ * EXT4_GOOD_OLD_INODE_SIZE + EXT4_I(inode)->i_extra_isize bytes past the
+ * start of raw_inode.  raw_inode is parenthesized so that an expression
+ * passed as the argument is cast as a whole rather than only its first
+ * operand.
+ */
+#define IHDR(inode, raw_inode) \
+	((struct ext4_xattr_ibody_header *) \
+		((void *)(raw_inode) + \
+		EXT4_GOOD_OLD_INODE_SIZE + \
+		EXT4_I(inode)->i_extra_isize))
+/* IFIRST(hdr): the first attribute entry, located directly after hdr. */
+#define IFIRST(hdr) ((struct ext4_xattr_entry *)((hdr)+1))
+
+# ifdef CONFIG_EXT4_FS_XATTR
+
+/* Per-prefix xattr handlers. */
+extern const struct xattr_handler ext4_xattr_user_handler;
+extern const struct xattr_handler ext4_xattr_trusted_handler;
+extern const struct xattr_handler ext4_xattr_acl_access_handler;
+extern const struct xattr_handler ext4_xattr_acl_default_handler;
+extern const struct xattr_handler ext4_xattr_security_handler;
+
+extern ssize_t ext4_listxattr(struct dentry *, char *, size_t);
+
+/*
+ * Attribute accessors.  The int following the inode is the name index
+ * (EXT4_XATTR_INDEX_*), followed by the attribute name and the value
+ * buffer with its size; the trailing int of the set variants is flags.
+ */
+extern int ext4_xattr_get(struct inode *, int, const char *, void *, size_t);
+extern int ext4_xattr_set(struct inode *, int, const char *, const void *, size_t, int);
+extern int ext4_xattr_set_handle(handle_t *, struct inode *, int, const char *, const void *, size_t, int);
+
+extern void ext4_xattr_delete_inode(handle_t *, struct inode *);
+extern void ext4_xattr_put_super(struct super_block *);
+
+extern int ext4_expand_extra_isize_ea(struct inode *inode, int new_extra_isize,
+ struct ext4_inode *raw_inode, handle_t *handle);
+
+/* Module-level setup and teardown of the xattr code. */
+extern int __init ext4_init_xattr(void);
+extern void ext4_exit_xattr(void);
+
+extern const struct xattr_handler *ext4_xattr_handlers[];
+
+# else /* CONFIG_EXT4_FS_XATTR */
+
+/*
+ * Stub used when CONFIG_EXT4_FS_XATTR is not set: extended attributes are
+ * unavailable, so every lookup fails with -EOPNOTSUPP.  The parameter list
+ * mirrors the five-argument prototype of the real ext4_xattr_get() so that
+ * callers build identically in both configurations.
+ */
+static inline int
+ext4_xattr_get(struct inode *inode, int name_index, const char *name,
+	       void *buffer, size_t size)
+{
+	return -EOPNOTSUPP;
+}
+
+/* Without CONFIG_EXT4_FS_XATTR: setting an attribute always fails with -EOPNOTSUPP. */
+static inline int
+ext4_xattr_set(struct inode *inode, int name_index, const char *name,
+ const void *value, size_t size, int flags)
+{
+ return -EOPNOTSUPP;
+}
+
+/*
+ * Without CONFIG_EXT4_FS_XATTR: same as the ext4_xattr_set() stub, for
+ * callers that supply their own handle; always fails with -EOPNOTSUPP.
+ */
+static inline int
+ext4_xattr_set_handle(handle_t *handle, struct inode *inode, int name_index,
+ const char *name, const void *value, size_t size, int flags)
+{
+ return -EOPNOTSUPP;
+}
+
+/* Without CONFIG_EXT4_FS_XATTR: nothing to do, intentionally empty. */
+static inline void
+ext4_xattr_delete_inode(handle_t *handle, struct inode *inode)
+{
+}
+
+/* Without CONFIG_EXT4_FS_XATTR: nothing to do, intentionally empty. */
+static inline void
+ext4_xattr_put_super(struct super_block *sb)
+{
+}
+
+/* Without CONFIG_EXT4_FS_XATTR: no setup is needed, so report success (0). */
+static __init inline int
+ext4_init_xattr(void)
+{
+ return 0;
+}
+
+/* Without CONFIG_EXT4_FS_XATTR: nothing was set up, so nothing to tear down. */
+static inline void
+ext4_exit_xattr(void)
+{
+}
+
+/* Without CONFIG_EXT4_FS_XATTR: always fails with -EOPNOTSUPP. */
+static inline int
+ext4_expand_extra_isize_ea(struct inode *inode, int new_extra_isize,
+ struct ext4_inode *raw_inode, handle_t *handle)
+{
+ return -EOPNOTSUPP;
+}
+
+/* Without CONFIG_EXT4_FS_XATTR there is no handler table; users of the name get NULL. */
+#define ext4_xattr_handlers NULL
+
+# endif /* CONFIG_EXT4_FS_XATTR */
+
+/*
+ * ext4_init_security(): implemented out of line when
+ * CONFIG_EXT4_FS_SECURITY is set; otherwise an inline stub that does
+ * nothing and returns 0.
+ */
+#ifdef CONFIG_EXT4_FS_SECURITY
+extern int ext4_init_security(handle_t *handle, struct inode *inode,
+ struct inode *dir, const struct qstr *qstr);
+#else
+static inline int ext4_init_security(handle_t *handle, struct inode *inode,
+ struct inode *dir, const struct qstr *qstr)
+{
+ return 0;
+}
+#endif
diff --git a/fs/ext4/xattr_security.c b/fs/ext4/xattr_security.c
new file mode 100644
index 00000000..d2a20062
--- /dev/null
+++ b/fs/ext4/xattr_security.c
@@ -0,0 +1,82 @@
+/*
+ * linux/fs/ext4/xattr_security.c
+ * Handler for storing security labels as extended attributes.
+ */
+
+#include <linux/string.h>
+#include <linux/fs.h>
+#include <linux/security.h>
+#include <linux/slab.h>
+#include "ext4_jbd2.h"
+#include "ext4.h"
+#include "xattr.h"
+
+/*
+ * List callback for the security namespace.
+ *
+ * Computes the space needed for XATTR_SECURITY_PREFIX + name + a
+ * terminating NUL.  If list is non-NULL and list_size is large enough,
+ * that string is written to list.  The required size is returned in
+ * either case.
+ */
+static size_t
+ext4_xattr_security_list(struct dentry *dentry, char *list, size_t list_size,
+			 const char *name, size_t name_len, int type)
+{
+	const size_t prefix_len = sizeof(XATTR_SECURITY_PREFIX)-1;
+	const size_t needed = prefix_len + name_len + 1;
+
+	/* No buffer, or not enough room: only report the size. */
+	if (!list || needed > list_size)
+		return needed;
+
+	memcpy(list, XATTR_SECURITY_PREFIX, prefix_len);
+	memcpy(list + prefix_len, name, name_len);
+	list[needed - 1] = '\0';
+	return needed;
+}
+
+/*
+ * Get callback for the security namespace: rejects an empty name with
+ * -EINVAL, otherwise forwards to ext4_xattr_get() with
+ * EXT4_XATTR_INDEX_SECURITY and returns its result.
+ */
+static int
+ext4_xattr_security_get(struct dentry *dentry, const char *name,
+			void *buffer, size_t size, int type)
+{
+	if (name[0] == '\0')
+		return -EINVAL;
+
+	return ext4_xattr_get(dentry->d_inode, EXT4_XATTR_INDEX_SECURITY,
+			      name, buffer, size);
+}
+
+/*
+ * Set callback for the security namespace: rejects an empty name with
+ * -EINVAL, otherwise forwards to ext4_xattr_set() with
+ * EXT4_XATTR_INDEX_SECURITY and returns its result.
+ */
+static int
+ext4_xattr_security_set(struct dentry *dentry, const char *name,
+			const void *value, size_t size, int flags, int type)
+{
+	if (name[0] == '\0')
+		return -EINVAL;
+
+	return ext4_xattr_set(dentry->d_inode, EXT4_XATTR_INDEX_SECURITY,
+			      name, value, size, flags);
+}
+
+/*
+ * Callback handed to security_inode_init_security().
+ *
+ * Walks xattr_array up to the element whose name is NULL and stores each
+ * attribute on inode under EXT4_XATTR_INDEX_SECURITY, using the handle
+ * received through fs_info.  Stops at the first negative result and
+ * returns it; otherwise returns the result of the last store, or 0 when
+ * the array is empty.
+ */
+static int
+ext4_initxattrs(struct inode *inode, const struct xattr *xattr_array,
+		void *fs_info)
+{
+	handle_t *handle = fs_info;
+	const struct xattr *cur = xattr_array;
+	int err = 0;
+
+	while (cur->name != NULL) {
+		err = ext4_xattr_set_handle(handle, inode,
+					    EXT4_XATTR_INDEX_SECURITY,
+					    cur->name, cur->value,
+					    cur->value_len, 0);
+		if (err < 0)
+			return err;
+		cur++;
+	}
+	return err;
+}
+
+/*
+ * ext4_init_security()
+ *
+ * Calls security_inode_init_security() for inode/dir/qstr with
+ * ext4_initxattrs() as the callback and handle as the opaque argument
+ * that callback receives as fs_info.  Returns whatever that call returns.
+ */
+int
+ext4_init_security(handle_t *handle, struct inode *inode, struct inode *dir,
+ const struct qstr *qstr)
+{
+ return security_inode_init_security(inode, dir, qstr,
+ &ext4_initxattrs, handle);
+}
+
+/* Binds XATTR_SECURITY_PREFIX to the list/get/set callbacks defined in this file. */
+const struct xattr_handler ext4_xattr_security_handler = {
+ .prefix = XATTR_SECURITY_PREFIX,
+ .list = ext4_xattr_security_list,
+ .get = ext4_xattr_security_get,
+ .set = ext4_xattr_security_set,
+};
diff --git a/fs/ext4/xattr_trusted.c b/fs/ext4/xattr_trusted.c
new file mode 100644
index 00000000..95f1f4ab
--- /dev/null
+++ b/fs/ext4/xattr_trusted.c
@@ -0,0 +1,58 @@
+/*
+ * linux/fs/ext4/xattr_trusted.c
+ * Handler for trusted extended attributes.
+ *
+ * Copyright (C) 2003 by Andreas Gruenbacher, <a.gruenbacher@computer.org>
+ */
+
+#include <linux/string.h>
+#include <linux/capability.h>
+#include <linux/fs.h>
+#include "ext4_jbd2.h"
+#include "ext4.h"
+#include "xattr.h"
+
+/*
+ * List callback for the trusted namespace.
+ *
+ * Callers without CAP_SYS_ADMIN get 0 and nothing is written.  Otherwise
+ * the space needed for XATTR_TRUSTED_PREFIX + name + a terminating NUL is
+ * returned, and that string is written to list when list is non-NULL and
+ * list_size is large enough.
+ */
+static size_t
+ext4_xattr_trusted_list(struct dentry *dentry, char *list, size_t list_size,
+			const char *name, size_t name_len, int type)
+{
+	const size_t prefix_len = XATTR_TRUSTED_PREFIX_LEN;
+	const size_t needed = prefix_len + name_len + 1;
+
+	if (!capable(CAP_SYS_ADMIN))
+		return 0;
+
+	/* No buffer, or not enough room: only report the size. */
+	if (!list || needed > list_size)
+		return needed;
+
+	memcpy(list, XATTR_TRUSTED_PREFIX, prefix_len);
+	memcpy(list + prefix_len, name, name_len);
+	list[needed - 1] = '\0';
+	return needed;
+}
+
+/*
+ * Get callback for the trusted namespace: rejects an empty name with
+ * -EINVAL, otherwise forwards to ext4_xattr_get() with
+ * EXT4_XATTR_INDEX_TRUSTED and returns its result.
+ */
+static int
+ext4_xattr_trusted_get(struct dentry *dentry, const char *name, void *buffer,
+		       size_t size, int type)
+{
+	if (name[0] == '\0')
+		return -EINVAL;
+
+	return ext4_xattr_get(dentry->d_inode, EXT4_XATTR_INDEX_TRUSTED,
+			      name, buffer, size);
+}
+
+/*
+ * Set callback for the trusted namespace: rejects an empty name with
+ * -EINVAL, otherwise forwards to ext4_xattr_set() with
+ * EXT4_XATTR_INDEX_TRUSTED and returns its result.
+ */
+static int
+ext4_xattr_trusted_set(struct dentry *dentry, const char *name,
+		       const void *value, size_t size, int flags, int type)
+{
+	if (name[0] == '\0')
+		return -EINVAL;
+
+	return ext4_xattr_set(dentry->d_inode, EXT4_XATTR_INDEX_TRUSTED,
+			      name, value, size, flags);
+}
+
+/* Binds XATTR_TRUSTED_PREFIX to the list/get/set callbacks defined in this file. */
+const struct xattr_handler ext4_xattr_trusted_handler = {
+ .prefix = XATTR_TRUSTED_PREFIX,
+ .list = ext4_xattr_trusted_list,
+ .get = ext4_xattr_trusted_get,
+ .set = ext4_xattr_trusted_set,
+};
diff --git a/fs/ext4/xattr_user.c b/fs/ext4/xattr_user.c
new file mode 100644
index 00000000..0edb7611
--- /dev/null
+++ b/fs/ext4/xattr_user.c
@@ -0,0 +1,61 @@
+/*
+ * linux/fs/ext4/xattr_user.c
+ * Handler for extended user attributes.
+ *
+ * Copyright (C) 2001 by Andreas Gruenbacher, <a.gruenbacher@computer.org>
+ */
+
+#include <linux/string.h>
+#include <linux/fs.h>
+#include "ext4_jbd2.h"
+#include "ext4.h"
+#include "xattr.h"
+
+/*
+ * List callback for the user namespace.
+ *
+ * Returns 0 without writing anything when test_opt(sb, XATTR_USER) is
+ * false for the dentry's superblock.  Otherwise the space needed for
+ * XATTR_USER_PREFIX + name + a terminating NUL is returned, and that
+ * string is written to list when list is non-NULL and list_size is large
+ * enough.
+ */
+static size_t
+ext4_xattr_user_list(struct dentry *dentry, char *list, size_t list_size,
+		     const char *name, size_t name_len, int type)
+{
+	const size_t prefix_len = XATTR_USER_PREFIX_LEN;
+	const size_t needed = prefix_len + name_len + 1;
+
+	if (!test_opt(dentry->d_sb, XATTR_USER))
+		return 0;
+
+	/* No buffer, or not enough room: only report the size. */
+	if (!list || needed > list_size)
+		return needed;
+
+	memcpy(list, XATTR_USER_PREFIX, prefix_len);
+	memcpy(list + prefix_len, name, name_len);
+	list[needed - 1] = '\0';
+	return needed;
+}
+
+/*
+ * Get callback for the user namespace.  An empty name yields -EINVAL; if
+ * test_opt(sb, XATTR_USER) is false the result is -EOPNOTSUPP; otherwise
+ * the request is forwarded to ext4_xattr_get() with
+ * EXT4_XATTR_INDEX_USER.  The checks are made in that order.
+ */
+static int
+ext4_xattr_user_get(struct dentry *dentry, const char *name,
+		    void *buffer, size_t size, int type)
+{
+	if (name[0] == '\0')
+		return -EINVAL;
+
+	if (!test_opt(dentry->d_sb, XATTR_USER))
+		return -EOPNOTSUPP;
+
+	return ext4_xattr_get(dentry->d_inode, EXT4_XATTR_INDEX_USER,
+			      name, buffer, size);
+}
+
+/*
+ * Set callback for the user namespace.  An empty name yields -EINVAL; if
+ * test_opt(sb, XATTR_USER) is false the result is -EOPNOTSUPP; otherwise
+ * the request is forwarded to ext4_xattr_set() with
+ * EXT4_XATTR_INDEX_USER.  The checks are made in that order.
+ */
+static int
+ext4_xattr_user_set(struct dentry *dentry, const char *name,
+		    const void *value, size_t size, int flags, int type)
+{
+	if (name[0] == '\0')
+		return -EINVAL;
+
+	if (!test_opt(dentry->d_sb, XATTR_USER))
+		return -EOPNOTSUPP;
+
+	return ext4_xattr_set(dentry->d_inode, EXT4_XATTR_INDEX_USER,
+			      name, value, size, flags);
+}
+
+/* Binds XATTR_USER_PREFIX to the list/get/set callbacks defined in this file. */
+const struct xattr_handler ext4_xattr_user_handler = {
+ .prefix = XATTR_USER_PREFIX,
+ .list = ext4_xattr_user_list,
+ .get = ext4_xattr_user_get,
+ .set = ext4_xattr_user_set,
+};